// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 */
#include <linux/energy_model.h>
#include <linux/mmap_lock.h>
#include <linux/hugetlb_inline.h>
#include <linux/jiffies.h>
#include <linux/mm_api.h>
#include <linux/highmem.h>
#include <linux/spinlock_api.h>
#include <linux/cpumask_api.h>
#include <linux/lockdep_api.h>
#include <linux/softirq.h>
#include <linux/refcount_api.h>
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
#include <linux/sched/nohz.h>

#include <linux/cpuidle.h>
#include <linux/interrupt.h>
#include <linux/memory-tiers.h>
#include <linux/mempolicy.h>
#include <linux/mutex_api.h>
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
#include <linux/rbtree_augmented.h>

#include <asm/switch_to.h>

#include "sched.h"
#include "stats.h"
#include "autogroup.h"

/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 */
unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_base_slice = 750000ULL;
static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
	int _shift = 0;

	if (kstrtoint(str, 0, &_shift))
		pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");

	sched_thermal_decay_shift = clamp(_shift, 0, 10);
	return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
93
94#ifdef CONFIG_SMP
95/*
96 * For asym packing, by default the lower numbered CPU has higher priority.
97 */
98int __weak arch_asym_cpu_priority(int cpu)
99{
100 return -cpu;
101}
102
103/*
104 * The margin used when comparing utilization with CPU capacity.
105 *
106 * (default: ~20%)
107 */
108#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
109
110/*
111 * The margin used when comparing CPU capacities.
112 * is 'cap1' noticeably greater than 'cap2'
113 *
114 * (default: ~5%)
115 */
116#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
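/*
 * Illustrative numbers only (derived from the constants above): with
 * SCHED_CAPACITY_SCALE == 1024, fits_capacity() accepts a utilization of up
 * to 1024 * 1024 / 1280 ~= 819, i.e. ~80% of capacity, which is where the
 * ~20% margin comes from. Likewise capacity_greater() only reports cap1 as
 * noticeably bigger once cap1 > cap2 * 1078 / 1024, i.e. a ~5.3% difference.
 */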
#endif

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */
static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

#ifdef CONFIG_NUMA_BALANCING
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif

#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.procname	= "sched_cfs_bandwidth_slice_us",
		.data		= &sysctl_sched_cfs_bandwidth_slice,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
#endif
#ifdef CONFIG_NUMA_BALANCING
	{
		.procname	= "numa_balancing_promote_rate_limit_MBps",
		.data		= &sysctl_numa_balancing_promote_rate_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
#endif /* CONFIG_NUMA_BALANCING */
	{}
};

static int __init sched_fair_sysctl_init(void)
{
	register_sysctl_init("kernel", sched_fair_sysctls);
	return 0;
}
late_initcall(sched_fair_sysctl_init);
#endif

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}

void __init sched_init_granularity(void)
{
	update_sysctl();
}

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	u32 fact_hi = (u32)(fact >> 32);
	int shift = WMULT_SHIFT;
	int fs;

	__update_inv_weight(lw);

	if (unlikely(fact_hi)) {
		fs = fls(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	fact = mul_u32_u32(fact, lw->inv_weight);

	fact_hi = (u32)(fact >> 32);
	if (fact_hi) {
		fs = fls(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}
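/*
 * Worked example (illustrative only): a nice-0 entity takes the early
 * return above, so its vruntime advances at wall-clock rate. An entity with
 * half of NICE_0_LOAD gets delta * NICE_0_LOAD / weight == 2 * delta, i.e.
 * its virtual time runs twice as fast and it accumulates service half as
 * quickly, which is exactly the weighting the fair class relies on.
 */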

const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	int cpu = cpu_of(rq);

	if (cfs_rq->on_list)
		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;

	cfs_rq->on_list = 1;

	/*
	 * Ensure we either appear before our parent (if already
	 * enqueued) or force our parent to appear after us when it is
	 * enqueued. The fact that we always enqueue bottom-up
	 * reduces this to two cases and a special case for the root
	 * cfs_rq. Furthermore, it also means that we will always reset
	 * tmp_alone_branch either when the branch is connected
	 * to a tree or when we reach the top of the tree.
	 */
	if (cfs_rq->tg->parent &&
	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
		/*
		 * If the parent is already on the list, we add the child
		 * just before. Thanks to the circular linked property of
		 * the list, this means putting the child at the tail
		 * of the list that starts with the parent.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
		/*
		 * The branch is now connected to its tree so we can
		 * reset tmp_alone_branch to the beginning of the
		 * list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	if (!cfs_rq->tg->parent) {
		/*
		 * A cfs_rq without a parent should be put
		 * at the tail of the list.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&rq->leaf_cfs_rq_list);
		/*
		 * We have reached the top of a tree so we can reset
		 * tmp_alone_branch to the beginning of the list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	/*
	 * The parent has not already been added so we want to
	 * make sure that it will be put after us.
	 * tmp_alone_branch points to the beginning of the branch
	 * where we will add the parent.
	 */
	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
	/*
	 * Update tmp_alone_branch to point to the new beginning
	 * of the branch.
	 */
	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
	return false;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		struct rq *rq = rq_of(cfs_rq);

		/*
		 * With cfs_rq being unthrottled/throttled during an enqueue,
		 * it can happen that tmp_alone_branch points to a leaf that
		 * we finally want to delete. In this case, tmp_alone_branch
		 * moves to the prev element but it will point to
		 * rq->leaf_cfs_rq_list at the end of the enqueue.
		 */
		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;

		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
	return se->parent;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * The preemption test can be made between sibling entities that are in
	 * the same cfs_rq, i.e. that have a common parent. Walk up the
	 * hierarchy of both tasks until we find their ancestors that are
	 * siblings of a common parent.
	 */

	/* First walk up until both entities are at the same depth */
	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

static int tg_is_idle(struct task_group *tg)
{
	return tg->idle > 0;
}

static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return cfs_rq->idle > 0;
}

static int se_is_idle(struct sched_entity *se)
{
	if (entity_is_task(se))
		return task_has_idle_policy(task_of(se));
	return cfs_rq_is_idle(group_cfs_rq(se));
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	return true;
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
}

#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

static inline int tg_is_idle(struct task_group *tg)
{
	return 0;
}

static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return 0;
}

static int se_is_idle(struct sched_entity *se)
{
	return 0;
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline bool entity_before(const struct sched_entity *a,
				 const struct sched_entity *b)
{
	/*
	 * Tiebreak on vruntime seems unnecessary since it can
	 * hardly happen.
	 */
	return (s64)(a->deadline - b->deadline) < 0;
}

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return (s64)(se->vruntime - cfs_rq->min_vruntime);
}

#define __node_2_se(node) \
	rb_entry((node), struct sched_entity, run_node)

/*
 * Compute virtual time from the per-task service numbers:
 *
 * Fair schedulers conserve lag:
 *
 *   \Sum lag_i = 0
 *
 * Where lag_i is given by:
 *
 *   lag_i = S - s_i = w_i * (V - v_i)
 *
 * Where S is the ideal service time and V is its virtual time counterpart.
 * Therefore:
 *
 *   \Sum lag_i = 0
 *   \Sum w_i * (V - v_i) = 0
 *   \Sum w_i * V - w_i * v_i = 0
 *
 * From which we can solve an expression for V in v_i (which we have in
 * se->vruntime):
 *
 *       \Sum v_i * w_i   \Sum v_i * w_i
 *   V = -------------- = --------------
 *          \Sum w_i            W
 *
 * Specifically, this is the weighted average of all entity virtual runtimes.
 *
 * [[ NOTE: this is only equal to the ideal scheduler under the condition
 *          that join/leave operations happen at lag_i = 0, otherwise the
 *          virtual time has non-contiguous motion equivalent to:
 *
 *            V +-= lag_i / W
 *
 *          Also see the comment in place_entity() that deals with this. ]]
 *
 * However, since v_i is u64, and the multiplication could easily overflow
 * transform it into a relative form that uses smaller quantities:
 *
 * Substitute: v_i == (v_i - v0) + v0
 *
 *     \Sum ((v_i - v0) + v0) * w_i   \Sum (v_i - v0) * w_i
 * V = ---------------------------- = --------------------- + v0
 *                  W                            W
 *
 * Which we track using:
 *
 *                    v0 := cfs_rq->min_vruntime
 * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
 *              \Sum w_i := cfs_rq->avg_load
 *
 * Since min_vruntime is a monotonically increasing variable that closely
 * tracks the per-task service, these deltas: (v_i - v), will be in the order
 * of the maximal (virtual) lag induced in the system due to quantisation.
 *
 * Also, we use scale_load_down() to reduce the size.
 *
 * As measured, the max (key * weight) value was ~44 bits for a kernel build.
 */
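/*
 * A small, purely illustrative example of the relative form above: with
 * v0 = min_vruntime = 100 and two entities (v1 = 110, w1 = 1024) and
 * (v2 = 130, w2 = 2048), avg_vruntime = 10*1024 + 30*2048 = 71680 and
 * avg_load = 3072, hence V = 100 + 71680/3072 ~= 123.3; the weighted
 * average leans towards the heavier entity.
 */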
static void
avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long weight = scale_load_down(se->load.weight);
	s64 key = entity_key(cfs_rq, se);

	cfs_rq->avg_vruntime += key * weight;
	cfs_rq->avg_load += weight;
}

static void
avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long weight = scale_load_down(se->load.weight);
	s64 key = entity_key(cfs_rq, se);

	cfs_rq->avg_vruntime -= key * weight;
	cfs_rq->avg_load -= weight;
}

static inline
void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
	/*
	 * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
	 */
	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
}

/*
 * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
 * For this to be so, the result of this function must have a left bias.
 */
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 avg = cfs_rq->avg_vruntime;
	long load = cfs_rq->avg_load;

	if (curr && curr->on_rq) {
		unsigned long weight = scale_load_down(curr->load.weight);

		avg += entity_key(cfs_rq, curr) * weight;
		load += weight;
	}

	if (load) {
		/* sign flips effective floor / ceil */
		if (avg < 0)
			avg -= (load - 1);
		avg = div_s64(avg, load);
	}

	return cfs_rq->min_vruntime + avg;
}

/*
 * lag_i = S - s_i = w_i * (V - v_i)
 *
 * However, since V is approximated by the weighted average of all entities it
 * is possible -- by addition/removal/reweight to the tree -- to move V around
 * and end up with a larger lag than we started with.
 *
 * Limit this to double the slice length, with a minimum of TICK_NSEC since
 * that is the timing granularity.
 *
 * EEVDF gives the following limit for a steady state system:
 *
 *   -r_max < lag < max(r_max, q)
 *
 * XXX could add max_slice to the augmented data to track this.
 */
static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	s64 lag, limit;

	SCHED_WARN_ON(!se->on_rq);
	lag = avg_vruntime(cfs_rq) - se->vruntime;

	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
	se->vlag = clamp(lag, -limit, limit);
}

/*
 * Entity is eligible once it received less service than it ought to have,
 * i.e. lag >= 0.
 *
 * lag_i = S - s_i = w_i*(V - v_i)
 *
 * lag_i >= 0 -> V >= v_i
 *
 *     \Sum (v_i - v)*w_i
 * V = ------------------ + v
 *         \Sum w_i
 *
 * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
 *
 * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
 * to the loss in precision caused by the division.
 */
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 avg = cfs_rq->avg_vruntime;
	long load = cfs_rq->avg_load;

	if (curr && curr->on_rq) {
		unsigned long weight = scale_load_down(curr->load.weight);

		avg += entity_key(cfs_rq, curr) * weight;
		load += weight;
	}

	return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
}

int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return vruntime_eligible(cfs_rq, se->vruntime);
}

static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
{
	u64 min_vruntime = cfs_rq->min_vruntime;
	/*
	 * open coded max_vruntime() to allow updating avg_vruntime
	 */
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0) {
		avg_vruntime_update(cfs_rq, delta);
		min_vruntime = vruntime;
	}
	return min_vruntime;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_root_entity(cfs_rq);
	struct sched_entity *curr = cfs_rq->curr;
	u64 vruntime = cfs_rq->min_vruntime;

	if (curr) {
		if (curr->on_rq)
			vruntime = curr->vruntime;
		else
			curr = NULL;
	}

	if (se) {
		if (!curr)
			vruntime = se->min_vruntime;
		else
			vruntime = min_vruntime(vruntime, se->min_vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	u64_u32_store(cfs_rq->min_vruntime,
		      __update_min_vruntime(cfs_rq, vruntime));
}

static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
	return entity_before(__node_2_se(a), __node_2_se(b));
}

#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })

static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
	if (node) {
		struct sched_entity *rse = __node_2_se(node);
		if (vruntime_gt(min_vruntime, se, rse))
			se->min_vruntime = rse->min_vruntime;
	}
}

/*
 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
 */
static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
	u64 old_min_vruntime = se->min_vruntime;
	struct rb_node *node = &se->run_node;

	se->min_vruntime = se->vruntime;
	__min_vruntime_update(se, node->rb_right);
	__min_vruntime_update(se, node->rb_left);

	return se->min_vruntime == old_min_vruntime;
}

RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
		     run_node, min_vruntime, min_vruntime_update);

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	avg_vruntime_add(cfs_rq, se);
	se->min_vruntime = se->vruntime;
	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
				__entity_less, &min_vruntime_cb);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
				  &min_vruntime_cb);
	avg_vruntime_sub(cfs_rq, se);
}

struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;

	if (!root)
		return NULL;

	return __node_2_se(root);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

	if (!left)
		return NULL;

	return __node_2_se(left);
}

/*
 * Earliest Eligible Virtual Deadline First
 *
 * In order to provide latency guarantees for different request sizes
 * EEVDF selects the best runnable task from two criteria:
 *
 *  1) the task must be eligible (must be owed service)
 *
 *  2) from those tasks that meet 1), we select the one
 *     with the earliest virtual deadline.
 *
 * We can do this in O(log n) time due to an augmented RB-tree. The
 * tree keeps the entries sorted on deadline, but also functions as a
 * heap based on the vruntime by keeping:
 *
 *  se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
 *
 * Which allows tree pruning through eligibility.
 */
static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *curr = cfs_rq->curr;
	struct sched_entity *best = NULL;

	/*
	 * We can safely skip eligibility check if there is only one entity
	 * in this cfs_rq, saving some cycles.
	 */
	if (cfs_rq->nr_running == 1)
		return curr && curr->on_rq ? curr : se;

	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
		curr = NULL;

	/*
	 * Once selected, run a task until it either becomes non-eligible or
	 * until it gets a new slice. See the HACK in set_next_entity().
	 */
	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
		return curr;

	/* Pick the leftmost entity if it's eligible */
	if (se && entity_eligible(cfs_rq, se)) {
		best = se;
		goto found;
	}

	/* Heap search for the EEVD entity */
	while (node) {
		struct rb_node *left = node->rb_left;

		/*
		 * Eligible entities in left subtree are always better
		 * choices, since they have earlier deadlines.
		 */
		if (left && vruntime_eligible(cfs_rq,
					__node_2_se(left)->min_vruntime)) {
			node = left;
			continue;
		}

		se = __node_2_se(node);

		/*
		 * The left subtree either is empty or has no eligible
		 * entity, so check the current node since it is the one
		 * with earliest deadline that might be eligible.
		 */
		if (entity_eligible(cfs_rq, se)) {
			best = se;
			break;
		}

		node = node->rb_right;
	}
found:
	if (!best || (curr && entity_before(curr, best)))
		best = curr;

	return best;
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);

	if (!last)
		return NULL;

	return __node_2_se(last);
}

/**************************************************************
 * Scheduling class statistics methods:
 */
#ifdef CONFIG_SMP
int sched_update_scaling(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL

	return 0;
}
#endif
#endif

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);

/*
 * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
 * this is probably good enough.
 */
static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if ((s64)(se->vruntime - se->deadline) < 0)
		return;

	/*
	 * For EEVDF the virtual time slope is determined by w_i (iow.
	 * nice) while the request time r_i is determined by
	 * sysctl_sched_base_slice.
	 */
	se->slice = sysctl_sched_base_slice;

	/*
	 * EEVDF: vd_i = ve_i + r_i / w_i
	 */
	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);

	/*
	 * The task has consumed its request, reschedule.
	 */
	if (cfs_rq->nr_running > 1) {
		resched_curr(rq_of(cfs_rq));
		clear_buddies(cfs_rq, se);
	}
}
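/*
 * Illustrative deadline numbers: for a nice-0 entity calc_delta_fair() leaves
 * the request untouched, so vd_i = ve_i + sysctl_sched_base_slice (0.75ms by
 * default, before CPU-count scaling). A heavier entity gets a smaller virtual
 * request for the same wall-clock slice and therefore a relatively earlier
 * virtual deadline.
 */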

#include "pelt.h"
#ifdef CONFIG_SMP

static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);

/* Give a new sched_entity initial runnable values so it is seen as heavy in its infancy */
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks are initialized with full load to be seen as heavy tasks until
	 * they get a chance to stabilize to their real load level.
	 * Group entities are initialized with zero load to reflect the fact that
	 * nothing has been attached to the task group yet.
	 */
	if (entity_is_task(se))
		sa->load_avg = scale_load_down(se->load.weight);

	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}

/*
 * With new tasks being created, their initial util_avgs are extrapolated
 * based on the cfs_rq's current util_avg:
 *
 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 *
 * However, in many cases, the above util_avg does not give a desired
 * value. Moreover, the sum of the util_avgs may be divergent, such
 * as when the series is a harmonic series.
 *
 * To solve this problem, we also cap the util_avg of successive tasks to
 * only 1/2 of the left utilization budget:
 *
 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 *
 * where n denotes the nth task and cpu_scale the CPU capacity.
 *
 * For example, for a CPU with 1024 of capacity, a simplest series from
 * the beginning would be like:
 *
 *   task util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 *
 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 * if util_avg > util_avg_cap.
 */
void post_init_entity_util_avg(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_avg *sa = &se->avg;
	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;

	if (p->sched_class != &fair_sched_class) {
		/*
		 * For !fair tasks do:
		 *
		update_cfs_rq_load_avg(now, cfs_rq);
		attach_entity_load_avg(cfs_rq, se);
		switched_from_fair(rq, p);
		 *
		 * such that the next switched_to_fair() has the
		 * expected state.
		 */
		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
		return;
	}

	if (cap > 0) {
		if (cfs_rq->avg.util_avg != 0) {
			sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
			sa->util_avg /= (cfs_rq->avg.load_avg + 1);

			if (sa->util_avg > cap)
				sa->util_avg = cap;
		} else {
			sa->util_avg = cap;
		}
	}

	sa->runnable_avg = sa->util_avg;
}

#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */

static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
{
	u64 now = rq_clock_task(rq);
	s64 delta_exec;

	delta_exec = now - curr->exec_start;
	if (unlikely(delta_exec <= 0))
		return delta_exec;

	curr->exec_start = now;
	curr->sum_exec_runtime += delta_exec;

	if (schedstat_enabled()) {
		struct sched_statistics *stats;

		stats = __schedstats_from_se(curr);
		__schedstat_set(stats->exec_max,
				max(delta_exec, stats->exec_max));
	}

	return delta_exec;
}

static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
{
	trace_sched_stat_runtime(p, delta_exec);
	account_group_exec_runtime(p, delta_exec);
	cgroup_account_cputime(p, delta_exec);
	if (p->dl_server)
		dl_server_update(p->dl_server, delta_exec);
}

/*
 * Used by other classes to account runtime.
 */
s64 update_curr_common(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	s64 delta_exec;

	delta_exec = update_curr_se(rq, &curr->se);
	if (likely(delta_exec > 0))
		update_curr_task(curr, delta_exec);

	return delta_exec;
}

/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = update_curr_se(rq_of(cfs_rq), curr);
	if (unlikely(delta_exec <= 0))
		return;

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_deadline(cfs_rq, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr))
		update_curr_task(task_of(curr), delta_exec);

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static void update_curr_fair(struct rq *rq)
{
	update_curr(cfs_rq_of(&rq->curr->se));
}

static inline void
update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	stats = __schedstats_from_se(se);

	if (entity_is_task(se))
		p = task_of(se);

	__update_stats_wait_start(rq_of(cfs_rq), p, stats);
}

static inline void
update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	stats = __schedstats_from_se(se);

	/*
	 * When sched_schedstat changes from 0 to 1, some sched entities
	 * may already be in the runqueue and their se->statistics.wait_start
	 * will be 0, which would make the delta wrong. We need to avoid
	 * this scenario.
	 */
	if (unlikely(!schedstat_val(stats->wait_start)))
		return;

	if (entity_is_task(se))
		p = task_of(se);

	__update_stats_wait_end(rq_of(cfs_rq), p, stats);
}

static inline void
update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct sched_statistics *stats;
	struct task_struct *tsk = NULL;

	if (!schedstat_enabled())
		return;

	stats = __schedstats_from_se(se);

	if (entity_is_task(se))
		tsk = task_of(se);

	__update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
}

/*
 * Task is being enqueued - update stats:
 */
static inline void
update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start_fair(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper_fair(cfs_rq, se);
}

static inline void
update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{

	if (!schedstat_enabled())
		return;

	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end_fair(cfs_rq, se);

	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
		struct task_struct *tsk = task_of(se);
		unsigned int state;

		/* XXX racy against TTWU */
		state = READ_ONCE(tsk->__state);
		if (state & TASK_INTERRUPTIBLE)
			__schedstat_set(tsk->stats.sleep_start,
					rq_clock(rq_of(cfs_rq)));
		if (state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(tsk->stats.block_start,
					rq_clock(rq_of(cfs_rq)));
	}
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}

/**************************************************
 * Scheduling class queueing methods:
 */

static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		if (cpu == sibling)
			continue;

		if (!idle_cpu(sibling))
			return false;
	}
#endif

	return true;
}

#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2

static inline long
adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{
	/*
	 * Allow a NUMA imbalance if the number of busy CPUs is below the
	 * maximum threshold. Above this threshold, individual tasks may be
	 * contending for both memory bandwidth and any shared HT resources.
	 * This is an approximation as the number of running tasks may not be
	 * related to the number of busy CPUs due to sched_setaffinity.
	 */
	if (dst_running > imb_numa_nr)
		return imbalance;

	/*
	 * Allow a small imbalance based on a simple pair of communicating
	 * tasks that remain local when the destination is lightly loaded.
	 */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}
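/*
 * Example (illustrative only): with imb_numa_nr == 4, a destination node
 * already running 5+ tasks reports the raw imbalance, while a lightly loaded
 * node lets an imbalance of up to NUMA_IMBALANCE_MIN (2) tasks stand, so a
 * communicating pair is not bounced apart purely to equalise node load.
 */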
#endif /* CONFIG_NUMA */

#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;

struct numa_group {
	refcount_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;
	int active_nodes;

	struct rcu_head rcu;
	unsigned long total_faults;
	unsigned long max_faults_cpu;
	/*
	 * faults[] array is split into two regions: faults_mem and faults_cpu.
	 *
	 * faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 */
	unsigned long faults[];
};

/*
 * For functions that can be called in multiple contexts that permit reading
 * ->numa_group (see struct task_struct for locking rules).
 */
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
	return rcu_dereference_check(p->numa_group, p == current ||
		(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}

static struct numa_group *deref_curr_numa_group(struct task_struct *p)
{
	return rcu_dereference_protected(p->numa_group, p == current);
}

static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);

static unsigned int task_nr_scan_windows(struct task_struct *p)
{
	unsigned long rss = 0;
	unsigned long nr_scan_pages;

	/*
	 * Calculations based on RSS as non-present and empty pages are skipped
	 * by the PTE scanner and NUMA hinting faults should be trapped based
	 * on resident pages
	 */
	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
	rss = get_mm_rss(p->mm);
	if (!rss)
		rss = nr_scan_pages;

	rss = round_up(rss, nr_scan_pages);
	return rss / nr_scan_pages;
}

/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560

static unsigned int task_scan_min(struct task_struct *p)
{
	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
	unsigned int scan, floor;
	unsigned int windows = 1;

	if (scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / scan_size;
	floor = 1000 / windows;

	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
	return max_t(unsigned int, floor, scan);
}
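/*
 * Illustrative defaults: scan_size == 256MB gives windows == 2560/256 == 10
 * and floor == 100ms. A task whose RSS spans 10 scan windows (~2.5GB) gets
 * scan == 1000ms/10 == 100ms; an even larger task is clamped to the 100ms
 * floor so that no more than MAX_SCAN_WINDOW MB/sec worth of PTEs are
 * scanned.
 */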

static unsigned int task_scan_start(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long period = smin;
	struct numa_group *ng;

	/* Scale the maximum scan period with the amount of shared memory. */
	rcu_read_lock();
	ng = rcu_dereference(p->numa_group);
	if (ng) {
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);

		period *= refcount_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;
	}
	rcu_read_unlock();

	return max(smin, period);
}

static unsigned int task_scan_max(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long smax;
	struct numa_group *ng;

	/* Watch for min being lower than max due to floor calculations */
	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

	/* Scale the maximum scan period with the amount of shared memory. */
	ng = deref_curr_numa_group(p);
	if (ng) {
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);
		unsigned long period = smax;

		period *= refcount_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;

		smax = max(smax, period);
	}

	return max(smin, smax);
}

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}

/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

pid_t task_numa_group_id(struct task_struct *p)
{
	struct numa_group *ng;
	pid_t gid = 0;

	rcu_read_lock();
	ng = rcu_dereference(p->numa_group);
	if (ng)
		gid = ng->gid;
	rcu_read_unlock();

	return gid;
}

/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

static inline unsigned long task_faults(struct task_struct *p, int nid)
{
	if (!p->numa_faults)
		return 0;

	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults(struct task_struct *p, int nid)
{
	struct numa_group *ng = deref_task_numa_group(p);

	if (!ng)
		return 0;

	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
	return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
		group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
}

static inline unsigned long group_faults_priv(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
	}

	return faults;
}

static inline unsigned long group_faults_shared(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
	}

	return faults;
}

/*
 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 * considered part of a numa group's pseudo-interleaving set. Migrations
 * between these nodes are slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3

static bool numa_is_active_node(int nid, struct numa_group *ng)
{
	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}

/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
					int lim_dist, bool task)
{
	unsigned long score = 0;
	int node, max_dist;

	/*
	 * All nodes are directly connected, and the same distance
	 * from each other. No need for fancy placement algorithms.
	 */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return 0;

	/* sched_max_numa_distance may be changed in parallel. */
	max_dist = READ_ONCE(sched_max_numa_distance);
	/*
	 * This code is called for each node, introducing N^2 complexity,
	 * which should be ok given the number of nodes rarely exceeds 8.
	 */
	for_each_online_node(node) {
		unsigned long faults;
		int dist = node_distance(nid, node);

		/*
		 * The furthest away nodes in the system are not interesting
		 * for placement; nid was already counted.
		 */
		if (dist >= max_dist || node == nid)
			continue;

		/*
		 * On systems with a backplane NUMA topology, compare groups
		 * of nodes, and move tasks towards the group with the most
		 * memory accesses. When comparing two nodes at distance
		 * "hoplimit", only nodes closer by than "hoplimit" are part
		 * of each group. Skip other nodes.
		 */
		if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
			continue;

		/* Add up the faults from nearby nodes. */
		if (task)
			faults = task_faults(p, node);
		else
			faults = group_faults(p, node);

		/*
		 * On systems with a glueless mesh NUMA topology, there are
		 * no fixed "groups of nodes". Instead, nodes that are not
		 * directly connected bounce traffic through intermediate
		 * nodes; a numa_group can occupy any set of nodes.
		 * The further away a node is, the less the faults count.
		 * This seems to result in good task placement.
		 */
		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
			faults *= (max_dist - dist);
			faults /= (max_dist - LOCAL_DISTANCE);
		}

		score += faults;
	}

	return score;
}

/*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node. The group weight is given a
 * larger multiplier, in order to group tasks together that are almost
 * evenly spread out between numa nodes.
 */
static inline unsigned long task_weight(struct task_struct *p, int nid,
					int dist)
{
	unsigned long faults, total_faults;

	if (!p->numa_faults)
		return 0;

	total_faults = p->total_numa_faults;

	if (!total_faults)
		return 0;

	faults = task_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, true);

	return 1000 * faults / total_faults;
}

static inline unsigned long group_weight(struct task_struct *p, int nid,
					 int dist)
{
	struct numa_group *ng = deref_task_numa_group(p);
	unsigned long faults, total_faults;

	if (!ng)
		return 0;

	total_faults = ng->total_faults;

	if (!total_faults)
		return 0;

	faults = group_faults(p, nid);
	faults += score_nearby_nodes(p, nid, dist, false);

	return 1000 * faults / total_faults;
}

/*
 * If memory tiering mode is enabled, the cpupid of a slow memory page is
 * used to record the scan time instead of a CPU and PID. When tiering mode
 * is disabled at run time, the scan time (in cpupid) will be interpreted
 * as a CPU and PID. So the CPU needs to be checked to avoid out-of-bounds
 * array access.
 */
static inline bool cpupid_valid(int cpupid)
{
	return cpupid_to_cpu(cpupid) < nr_cpu_ids;
}

/*
 * For memory tiering mode, if there are enough free pages (more than
 * enough watermark defined here) in the fast memory node, then to take full
 * advantage of fast memory capacity, all recently accessed slow
 * memory pages will be migrated to the fast memory node without
 * considering the hot threshold.
 */
static bool pgdat_free_space_enough(struct pglist_data *pgdat)
{
	int z;
	unsigned long enough_wmark;

	enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
			   pgdat->node_present_pages >> 4);
	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
		struct zone *zone = pgdat->node_zones + z;

		if (!populated_zone(zone))
			continue;

		if (zone_watermark_ok(zone, 0,
				      wmark_pages(zone, WMARK_PROMO) + enough_wmark,
				      ZONE_MOVABLE, 0))
			return true;
	}
	return false;
}

/*
 * For memory tiering mode, when page tables are scanned, the scan
 * time will be recorded in struct page, in addition to making the slow
 * memory page PROT_NONE. So when the page is accessed, in the
 * hint page fault handler, the hint page fault latency is calculated
 * via,
 *
 *	hint page fault latency = hint page fault time - scan time
 *
 * The smaller the hint page fault latency, the higher the possibility
 * for the page to be hot.
 */
static int numa_hint_fault_latency(struct folio *folio)
{
	int last_time, time;

	time = jiffies_to_msecs(jiffies);
	last_time = folio_xchg_access_time(folio, time);

	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}

/*
 * For memory tiering mode, too high promotion/demotion throughput may
 * hurt application latency. So we provide a mechanism to rate limit
 * the number of pages that we try to promote.
 */
static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
				      unsigned long rate_limit, int nr)
{
	unsigned long nr_cand;
	unsigned int now, start;

	now = jiffies_to_msecs(jiffies);
	mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
	nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
	start = pgdat->nbp_rl_start;
	if (now - start > MSEC_PER_SEC &&
	    cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
		pgdat->nbp_rl_nr_cand = nr_cand;
	if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
		return true;
	return false;
}

#define NUMA_MIGRATION_ADJUST_STEPS	16

static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
					    unsigned long rate_limit,
					    unsigned int ref_th)
{
	unsigned int now, start, th_period, unit_th, th;
	unsigned long nr_cand, ref_cand, diff_cand;

	now = jiffies_to_msecs(jiffies);
	th_period = sysctl_numa_balancing_scan_period_max;
	start = pgdat->nbp_th_start;
	if (now - start > th_period &&
	    cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
		ref_cand = rate_limit *
			sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
		nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
		diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
		unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
		th = pgdat->nbp_threshold ? : ref_th;
		if (diff_cand > ref_cand * 11 / 10)
			th = max(th - unit_th, unit_th);
		else if (diff_cand < ref_cand * 9 / 10)
			th = min(th + unit_th, ref_th * 2);
		pgdat->nbp_th_nr_cand = nr_cand;
		pgdat->nbp_threshold = th;
	}
}
1825
1826bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
1827 int src_nid, int dst_cpu)
1828{
1829 struct numa_group *ng = deref_curr_numa_group(p);
1830 int dst_nid = cpu_to_node(cpu: dst_cpu);
1831 int last_cpupid, this_cpupid;
1832
1833 /*
1834 * Cannot migrate to memoryless nodes.
1835 */
1836 if (!node_state(node: dst_nid, state: N_MEMORY))
1837 return false;
1838
1839 /*
1840 * The pages in slow memory node should be migrated according
1841 * to hot/cold instead of private/shared.
1842 */
1843 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
1844 !node_is_toptier(node: src_nid)) {
1845 struct pglist_data *pgdat;
1846 unsigned long rate_limit;
1847 unsigned int latency, th, def_th;
1848
1849 pgdat = NODE_DATA(dst_nid);
1850 if (pgdat_free_space_enough(pgdat)) {
1851 /* workload changed, reset hot threshold */
1852 pgdat->nbp_threshold = 0;
1853 return true;
1854 }
1855
1856 def_th = sysctl_numa_balancing_hot_threshold;
1857 rate_limit = sysctl_numa_balancing_promote_rate_limit << \
1858 (20 - PAGE_SHIFT);
1859 numa_promotion_adjust_threshold(pgdat, rate_limit, ref_th: def_th);
1860
1861 th = pgdat->nbp_threshold ? : def_th;
1862 latency = numa_hint_fault_latency(folio);
1863 if (latency >= th)
1864 return false;
1865
1866 return !numa_promotion_rate_limit(pgdat, rate_limit,
1867 nr: folio_nr_pages(folio));
1868 }
1869
1870 this_cpupid = cpu_pid_to_cpupid(cpu: dst_cpu, current->pid);
1871 last_cpupid = folio_xchg_last_cpupid(folio, cpupid: this_cpupid);
1872
1873 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
1874 !node_is_toptier(node: src_nid) && !cpupid_valid(cpupid: last_cpupid))
1875 return false;
1876
1877 /*
1878 * Allow first faults or private faults to migrate immediately early in
1879 * the lifetime of a task. The magic number 4 is based on waiting for
1880 * two full passes of the "multi-stage node selection" test that is
1881 * executed below.
1882 */
1883 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1884 (cpupid_pid_unset(cpupid: last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1885 return true;
1886
1887 /*
1888 * Multi-stage node selection is used in conjunction with a periodic
1889 * migration fault to build a temporal task<->page relation. By using
1890 * a two-stage filter we remove short/unlikely relations.
1891 *
1892 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1893 * a task's usage of a particular page (n_p) per total usage of this
1894 * page (n_t) (in a given time-span) to a probability.
1895 *
1896 * Our periodic faults will sample this probability and getting the
1897 * same result twice in a row, given these samples are fully
1898 * independent, is then given by P(n)^2, provided our sample period
1899 * is sufficiently short compared to the usage pattern.
1900 *
1901 * This quadric squishes small probabilities, making it less likely we
1902 * act on an unlikely task<->page relation.
1903 */
1904 if (!cpupid_pid_unset(cpupid: last_cpupid) &&
1905 cpupid_to_nid(cpupid: last_cpupid) != dst_nid)
1906 return false;
1907
1908 /* Always allow migrate on private faults */
1909 if (cpupid_match_pid(p, last_cpupid))
1910 return true;
1911
1912 /* A shared fault, but p->numa_group has not been set up yet. */
1913 if (!ng)
1914 return true;
1915
1916 /*
1917 * Destination node is much more heavily used than the source
1918 * node? Allow migration.
1919 */
1920	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1921 ACTIVE_NODE_FRACTION)
1922 return true;
1923
1924 /*
1925 * Distribute memory according to CPU & memory use on each node,
1926 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1927 *
1928 * faults_cpu(dst) 3 faults_cpu(src)
1929 * --------------- * - > ---------------
1930 * faults_mem(dst) 4 faults_mem(src)
1931 */
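	/*
	 * For example, with hypothetical fault counts of faults_cpu/faults_mem
	 * = 40/40 on the destination and 20/40 on the source:
	 * 1.0 * 3/4 = 0.75 > 0.5, so the page is migrated.
	 */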
1932	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1933	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1934}
1935
1936/*
1937 * 'numa_type' describes the node at the moment of load balancing.
1938 */
1939enum numa_type {
1940 /* The node has spare capacity that can be used to run more tasks. */
1941 node_has_spare = 0,
1942 /*
1943 * The node is fully used and the tasks don't compete for more CPU
1944 * cycles. Nevertheless, some tasks might wait before running.
1945 */
1946 node_fully_busy,
1947 /*
1948 * The node is overloaded and can't provide expected CPU cycles to all
1949 * tasks.
1950 */
1951 node_overloaded
1952};
1953
1954/* Cached statistics for all CPUs within a node */
1955struct numa_stats {
1956 unsigned long load;
1957 unsigned long runnable;
1958 unsigned long util;
1959 /* Total compute capacity of CPUs on a node */
1960 unsigned long compute_capacity;
1961 unsigned int nr_running;
1962 unsigned int weight;
1963 enum numa_type node_type;
1964 int idle_cpu;
1965};
1966
1967struct task_numa_env {
1968 struct task_struct *p;
1969
1970 int src_cpu, src_nid;
1971 int dst_cpu, dst_nid;
1972 int imb_numa_nr;
1973
1974 struct numa_stats src_stats, dst_stats;
1975
1976 int imbalance_pct;
1977 int dist;
1978
1979 struct task_struct *best_task;
1980 long best_imp;
1981 int best_cpu;
1982};
1983
1984static unsigned long cpu_load(struct rq *rq);
1985static unsigned long cpu_runnable(struct rq *rq);
1986
1987static inline enum
1988numa_type numa_classify(unsigned int imbalance_pct,
1989 struct numa_stats *ns)
1990{
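	/*
	 * For example, with the typical NUMA imbalance_pct of 112 (see
	 * task_numa_migrate()), a node with more runnable tasks than CPUs is
	 * classified as overloaded once utilization exceeds roughly 89% of its
	 * compute capacity (util * 112 > capacity * 100) or runnable load
	 * exceeds roughly 112% of it (runnable * 100 > capacity * 112).
	 */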
1991 if ((ns->nr_running > ns->weight) &&
1992 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1993 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1994 return node_overloaded;
1995
1996 if ((ns->nr_running < ns->weight) ||
1997 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1998 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1999 return node_has_spare;
2000
2001 return node_fully_busy;
2002}
2003
2004#ifdef CONFIG_SCHED_SMT
2005/* Forward declarations of select_idle_sibling helpers */
2006static inline bool test_idle_cores(int cpu);
2007static inline int numa_idle_core(int idle_core, int cpu)
2008{
2009 if (!static_branch_likely(&sched_smt_present) ||
2010 idle_core >= 0 || !test_idle_cores(cpu))
2011 return idle_core;
2012
2013 /*
2014 * Prefer cores instead of packing HT siblings
2015 * and triggering future load balancing.
2016 */
2017 if (is_core_idle(cpu))
2018 idle_core = cpu;
2019
2020 return idle_core;
2021}
2022#else
2023static inline int numa_idle_core(int idle_core, int cpu)
2024{
2025 return idle_core;
2026}
2027#endif
2028
2029/*
2030 * Gather all necessary information to make NUMA balancing placement
2031 * decisions that are compatible with standard load balancer. This
2032 * borrows code and logic from update_sg_lb_stats but sharing a
2033 * common implementation is impractical.
2034 */
2035static void update_numa_stats(struct task_numa_env *env,
2036 struct numa_stats *ns, int nid,
2037 bool find_idle)
2038{
2039 int cpu, idle_core = -1;
2040
2041 memset(ns, 0, sizeof(*ns));
2042 ns->idle_cpu = -1;
2043
2044 rcu_read_lock();
2045 for_each_cpu(cpu, cpumask_of_node(nid)) {
2046 struct rq *rq = cpu_rq(cpu);
2047
2048 ns->load += cpu_load(rq);
2049 ns->runnable += cpu_runnable(rq);
2050 ns->util += cpu_util_cfs(cpu);
2051 ns->nr_running += rq->cfs.h_nr_running;
2052 ns->compute_capacity += capacity_of(cpu);
2053
2054 if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
2055 if (READ_ONCE(rq->numa_migrate_on) ||
2056			    !cpumask_test_cpu(cpu, env->p->cpus_ptr))
2057 continue;
2058
2059 if (ns->idle_cpu == -1)
2060 ns->idle_cpu = cpu;
2061
2062 idle_core = numa_idle_core(idle_core, cpu);
2063 }
2064 }
2065 rcu_read_unlock();
2066
2067	ns->weight = cpumask_weight(cpumask_of_node(nid));
2068
2069	ns->node_type = numa_classify(env->imbalance_pct, ns);
2070
2071 if (idle_core >= 0)
2072 ns->idle_cpu = idle_core;
2073}
2074
2075static void task_numa_assign(struct task_numa_env *env,
2076 struct task_struct *p, long imp)
2077{
2078 struct rq *rq = cpu_rq(env->dst_cpu);
2079
2080 /* Check if run-queue part of active NUMA balance. */
2081 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
2082 int cpu;
2083 int start = env->dst_cpu;
2084
2085 /* Find alternative idle CPU. */
2086 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
2087 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
2088			    !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
2089 continue;
2090 }
2091
2092 env->dst_cpu = cpu;
2093 rq = cpu_rq(env->dst_cpu);
2094 if (!xchg(&rq->numa_migrate_on, 1))
2095 goto assign;
2096 }
2097
2098 /* Failed to find an alternative idle CPU */
2099 return;
2100 }
2101
2102assign:
2103 /*
2104 * Clear previous best_cpu/rq numa-migrate flag, since task now
2105 * found a better CPU to move/swap.
2106 */
2107 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
2108 rq = cpu_rq(env->best_cpu);
2109 WRITE_ONCE(rq->numa_migrate_on, 0);
2110 }
2111
2112 if (env->best_task)
2113		put_task_struct(env->best_task);
2114	if (p)
2115		get_task_struct(p);
2116
2117 env->best_task = p;
2118 env->best_imp = imp;
2119 env->best_cpu = env->dst_cpu;
2120}
2121
2122static bool load_too_imbalanced(long src_load, long dst_load,
2123 struct task_numa_env *env)
2124{
2125 long imb, old_imb;
2126 long orig_src_load, orig_dst_load;
2127 long src_capacity, dst_capacity;
2128
2129 /*
2130 * The load is corrected for the CPU capacity available on each node.
2131 *
2132 * src_load dst_load
2133 * ------------ vs ---------
2134 * src_capacity dst_capacity
2135 */
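	/*
	 * Comparing the cross products dst_load * src_capacity and
	 * src_load * dst_capacity is equivalent to comparing the two
	 * load/capacity ratios while avoiding divisions; the change counts
	 * as worse when it increases their absolute difference.
	 */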
2136 src_capacity = env->src_stats.compute_capacity;
2137 dst_capacity = env->dst_stats.compute_capacity;
2138
2139 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
2140
2141 orig_src_load = env->src_stats.load;
2142 orig_dst_load = env->dst_stats.load;
2143
2144 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
2145
2146 /* Would this change make things worse? */
2147 return (imb > old_imb);
2148}
2149
2150/*
2151 * Maximum NUMA importance can be 1998 (2*999);
2152 * SMALLIMP @ 30 would be close to 1998/64.
2153 * Used to deter task migration.
2154 */
2155#define SMALLIMP 30
2156
2157/*
2158 * This checks if the overall compute and NUMA accesses of the system would
2159 * be improved if the source task was migrated to the target dst_cpu, taking
2160 * into account that it might be best to exchange the task running on the
2161 * dst_cpu with the source task.
2162 */
2163static bool task_numa_compare(struct task_numa_env *env,
2164 long taskimp, long groupimp, bool maymove)
2165{
2166	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
2167 struct rq *dst_rq = cpu_rq(env->dst_cpu);
2168 long imp = p_ng ? groupimp : taskimp;
2169 struct task_struct *cur;
2170 long src_load, dst_load;
2171 int dist = env->dist;
2172 long moveimp = imp;
2173 long load;
2174 bool stopsearch = false;
2175
2176 if (READ_ONCE(dst_rq->numa_migrate_on))
2177 return false;
2178
2179 rcu_read_lock();
2180 cur = rcu_dereference(dst_rq->curr);
2181	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
2182 cur = NULL;
2183
2184 /*
2185 * Because we have preemption enabled we can get migrated around and
2186	 * end up trying to select ourselves (current == env->p) as a swap candidate.
2187 */
2188 if (cur == env->p) {
2189 stopsearch = true;
2190 goto unlock;
2191 }
2192
2193 if (!cur) {
2194 if (maymove && moveimp >= env->best_imp)
2195 goto assign;
2196 else
2197 goto unlock;
2198 }
2199
2200 /* Skip this swap candidate if cannot move to the source cpu. */
2201	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
2202 goto unlock;
2203
2204 /*
2205 * Skip this swap candidate if it is not moving to its preferred
2206 * node and the best task is.
2207 */
2208 if (env->best_task &&
2209 env->best_task->numa_preferred_nid == env->src_nid &&
2210 cur->numa_preferred_nid != env->src_nid) {
2211 goto unlock;
2212 }
2213
2214 /*
2215 * "imp" is the fault differential for the source task between the
2216 * source and destination node. Calculate the total differential for
2217 * the source task and potential destination task. The more negative
2218 * the value is, the more remote accesses that would be expected to
2219 * be incurred if the tasks were swapped.
2220 *
2221 * If dst and source tasks are in the same NUMA group, or not
2222 * in any group then look only at task weights.
2223 */
2224 cur_ng = rcu_dereference(cur->numa_group);
2225 if (cur_ng == p_ng) {
2226 /*
2227 * Do not swap within a group or between tasks that have
2228 * no group if there is spare capacity. Swapping does
2229 * not address the load imbalance and helps one task at
2230 * the cost of punishing another.
2231 */
2232 if (env->dst_stats.node_type == node_has_spare)
2233 goto unlock;
2234
2235		imp = taskimp + task_weight(cur, env->src_nid, dist) -
2236		      task_weight(cur, env->dst_nid, dist);
2237 /*
2238 * Add some hysteresis to prevent swapping the
2239 * tasks within a group over tiny differences.
2240 */
2241 if (cur_ng)
2242 imp -= imp / 16;
2243 } else {
2244 /*
2245 * Compare the group weights. If a task is all by itself
2246 * (not part of a group), use the task weight instead.
2247 */
2248 if (cur_ng && p_ng)
2249			imp += group_weight(cur, env->src_nid, dist) -
2250			       group_weight(cur, env->dst_nid, dist);
2251		else
2252			imp += task_weight(cur, env->src_nid, dist) -
2253			       task_weight(cur, env->dst_nid, dist);
2254 }
2255
2256 /* Discourage picking a task already on its preferred node */
2257 if (cur->numa_preferred_nid == env->dst_nid)
2258 imp -= imp / 16;
2259
2260 /*
2261 * Encourage picking a task that moves to its preferred node.
2262	 * This potentially makes imp larger than its maximum of
2263 * 1998 (see SMALLIMP and task_weight for why) but in this
2264 * case, it does not matter.
2265 */
2266 if (cur->numa_preferred_nid == env->src_nid)
2267 imp += imp / 8;
2268
2269 if (maymove && moveimp > imp && moveimp > env->best_imp) {
2270 imp = moveimp;
2271 cur = NULL;
2272 goto assign;
2273 }
2274
2275 /*
2276 * Prefer swapping with a task moving to its preferred node over a
2277 * task that is not.
2278 */
2279 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
2280 env->best_task->numa_preferred_nid != env->src_nid) {
2281 goto assign;
2282 }
2283
2284 /*
2285 * If the NUMA importance is less than SMALLIMP,
2286 * task migration might only result in ping pong
2287 * of tasks and also hurt performance due to cache
2288 * misses.
2289 */
2290 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
2291 goto unlock;
2292
2293 /*
2294 * In the overloaded case, try and keep the load balanced.
2295 */
2296	load = task_h_load(env->p) - task_h_load(cur);
2297 if (!load)
2298 goto assign;
2299
2300 dst_load = env->dst_stats.load + load;
2301 src_load = env->src_stats.load - load;
2302
2303 if (load_too_imbalanced(src_load, dst_load, env))
2304 goto unlock;
2305
2306assign:
2307 /* Evaluate an idle CPU for a task numa move. */
2308 if (!cur) {
2309 int cpu = env->dst_stats.idle_cpu;
2310
2311 /* Nothing cached so current CPU went idle since the search. */
2312 if (cpu < 0)
2313 cpu = env->dst_cpu;
2314
2315 /*
2316 * If the CPU is no longer truly idle and the previous best CPU
2317 * is, keep using it.
2318 */
2319 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
2320		    idle_cpu(env->best_cpu)) {
2321 cpu = env->best_cpu;
2322 }
2323
2324 env->dst_cpu = cpu;
2325 }
2326
2327	task_numa_assign(env, cur, imp);
2328
2329 /*
2330 * If a move to idle is allowed because there is capacity or load
2331 * balance improves then stop the search. While a better swap
2332 * candidate may exist, a search is not free.
2333 */
2334	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
2335 stopsearch = true;
2336
2337 /*
2338 * If a swap candidate must be identified and the current best task
2339 * moves its preferred node then stop the search.
2340 */
2341 if (!maymove && env->best_task &&
2342 env->best_task->numa_preferred_nid == env->src_nid) {
2343 stopsearch = true;
2344 }
2345unlock:
2346 rcu_read_unlock();
2347
2348 return stopsearch;
2349}
2350
2351static void task_numa_find_cpu(struct task_numa_env *env,
2352 long taskimp, long groupimp)
2353{
2354 bool maymove = false;
2355 int cpu;
2356
2357 /*
2358 * If dst node has spare capacity, then check if there is an
2359 * imbalance that would be overruled by the load balancer.
2360 */
2361 if (env->dst_stats.node_type == node_has_spare) {
2362 unsigned int imbalance;
2363 int src_running, dst_running;
2364
2365 /*
2366		 * Would movement cause an imbalance? Note that if src has
2367		 * more running tasks, the imbalance is ignored as the
2368		 * move improves the imbalance from the perspective of the
2369		 * CPU load balancer.
2370		 */
2371 src_running = env->src_stats.nr_running - 1;
2372 dst_running = env->dst_stats.nr_running + 1;
2373 imbalance = max(0, dst_running - src_running);
2374 imbalance = adjust_numa_imbalance(imbalance, dst_running,
2375						  env->imb_numa_nr);
2376
2377 /* Use idle CPU if there is no imbalance */
2378 if (!imbalance) {
2379 maymove = true;
2380 if (env->dst_stats.idle_cpu >= 0) {
2381 env->dst_cpu = env->dst_stats.idle_cpu;
2382				task_numa_assign(env, NULL, 0);
2383 return;
2384 }
2385 }
2386 } else {
2387 long src_load, dst_load, load;
2388 /*
2389		 * If the improvement from just moving env->p is better
2390 * than swapping tasks around, check if a move is possible.
2391 */
2392		load = task_h_load(env->p);
2393 dst_load = env->dst_stats.load + load;
2394 src_load = env->src_stats.load - load;
2395 maymove = !load_too_imbalanced(src_load, dst_load, env);
2396 }
2397
2398 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
2399 /* Skip this CPU if the source task cannot migrate */
2400		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
2401 continue;
2402
2403 env->dst_cpu = cpu;
2404 if (task_numa_compare(env, taskimp, groupimp, maymove))
2405 break;
2406 }
2407}
2408
2409static int task_numa_migrate(struct task_struct *p)
2410{
2411 struct task_numa_env env = {
2412 .p = p,
2413
2414 .src_cpu = task_cpu(p),
2415 .src_nid = task_node(p),
2416
2417 .imbalance_pct = 112,
2418
2419 .best_task = NULL,
2420 .best_imp = 0,
2421 .best_cpu = -1,
2422 };
2423 unsigned long taskweight, groupweight;
2424 struct sched_domain *sd;
2425 long taskimp, groupimp;
2426 struct numa_group *ng;
2427 struct rq *best_rq;
2428 int nid, ret, dist;
2429
2430 /*
2431 * Pick the lowest SD_NUMA domain, as that would have the smallest
2432 * imbalance and would be the first to start moving tasks about.
2433 *
2434 * And we want to avoid any moving of tasks about, as that would create
2435 * random movement of tasks -- counter the numa conditions we're trying
2436 * to satisfy here.
2437 */
2438 rcu_read_lock();
2439 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
2440 if (sd) {
2441 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
2442 env.imb_numa_nr = sd->imb_numa_nr;
2443 }
2444 rcu_read_unlock();
2445
2446 /*
2447 * Cpusets can break the scheduler domain tree into smaller
2448 * balance domains, some of which do not cross NUMA boundaries.
2449 * Tasks that are "trapped" in such domains cannot be migrated
2450 * elsewhere, so there is no point in (re)trying.
2451 */
2452 if (unlikely(!sd)) {
2453		sched_setnuma(p, task_node(p));
2454 return -EINVAL;
2455 }
2456
2457 env.dst_nid = p->numa_preferred_nid;
2458 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2459	taskweight = task_weight(p, env.src_nid, dist);
2460	groupweight = group_weight(p, env.src_nid, dist);
2461	update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2462	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2463	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2464	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2465
2466 /* Try to find a spot on the preferred nid. */
2467	task_numa_find_cpu(&env, taskimp, groupimp);
2468
2469 /*
2470 * Look at other nodes in these cases:
2471 * - there is no space available on the preferred_nid
2472 * - the task is part of a numa_group that is interleaved across
2473 * multiple NUMA nodes; in order to better consolidate the group,
2474 * we need to check other locations.
2475 */
2476 ng = deref_curr_numa_group(p);
2477 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
2478 for_each_node_state(nid, N_CPU) {
2479 if (nid == env.src_nid || nid == p->numa_preferred_nid)
2480 continue;
2481
2482 dist = node_distance(env.src_nid, env.dst_nid);
2483 if (sched_numa_topology_type == NUMA_BACKPLANE &&
2484 dist != env.dist) {
2485				taskweight = task_weight(p, env.src_nid, dist);
2486				groupweight = group_weight(p, env.src_nid, dist);
2487 }
2488
2489 /* Only consider nodes where both task and groups benefit */
2490 taskimp = task_weight(p, nid, dist) - taskweight;
2491 groupimp = group_weight(p, nid, dist) - groupweight;
2492 if (taskimp < 0 && groupimp < 0)
2493 continue;
2494
2495 env.dist = dist;
2496 env.dst_nid = nid;
2497			update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2498			task_numa_find_cpu(&env, taskimp, groupimp);
2499 }
2500 }
2501
2502 /*
2503 * If the task is part of a workload that spans multiple NUMA nodes,
2504 * and is migrating into one of the workload's active nodes, remember
2505 * this node as the task's preferred numa node, so the workload can
2506 * settle down.
2507 * A task that migrated to a second choice node will be better off
2508 * trying for a better one later. Do not set the preferred node here.
2509 */
2510 if (ng) {
2511 if (env.best_cpu == -1)
2512 nid = env.src_nid;
2513 else
2514			nid = cpu_to_node(env.best_cpu);
2515
2516 if (nid != p->numa_preferred_nid)
2517			sched_setnuma(p, nid);
2518 }
2519
2520 /* No better CPU than the current one was found. */
2521 if (env.best_cpu == -1) {
2522		trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2523 return -EAGAIN;
2524 }
2525
2526 best_rq = cpu_rq(env.best_cpu);
2527 if (env.best_task == NULL) {
2528		ret = migrate_task_to(p, env.best_cpu);
2529 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2530 if (ret != 0)
2531			trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2532 return ret;
2533 }
2534
2535	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2536 WRITE_ONCE(best_rq->numa_migrate_on, 0);
2537
2538 if (ret != 0)
2539		trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2540	put_task_struct(env.best_task);
2541 return ret;
2542}
2543
2544/* Attempt to migrate a task to a CPU on the preferred node. */
2545static void numa_migrate_preferred(struct task_struct *p)
2546{
2547 unsigned long interval = HZ;
2548
2549 /* This task has no NUMA fault statistics yet */
2550 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2551 return;
2552
2553 /* Periodically retry migrating the task to the preferred node */
2554 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
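	/*
	 * e.g. with a 1000ms scan period (a common minimum) this allows a
	 * retry roughly every 62ms; for very long scan periods the retry
	 * interval is capped at one second (HZ).
	 */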
2555 p->numa_migrate_retry = jiffies + interval;
2556
2557 /* Success if task is already running on preferred CPU */
2558 if (task_node(p) == p->numa_preferred_nid)
2559 return;
2560
2561 /* Otherwise, try migrate to a CPU on the preferred node */
2562 task_numa_migrate(p);
2563}
2564
2565/*
2566 * Find out how many nodes the workload is actively running on. Do this by
2567 * tracking the nodes from which NUMA hinting faults are triggered. This can
2568 * be different from the set of nodes where the workload's memory is currently
2569 * located.
2570 */
2571static void numa_group_count_active_nodes(struct numa_group *numa_group)
2572{
2573 unsigned long faults, max_faults = 0;
2574 int nid, active_nodes = 0;
2575
2576 for_each_node_state(nid, N_CPU) {
2577		faults = group_faults_cpu(numa_group, nid);
2578 if (faults > max_faults)
2579 max_faults = faults;
2580 }
2581
2582 for_each_node_state(nid, N_CPU) {
2583		faults = group_faults_cpu(numa_group, nid);
2584 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2585 active_nodes++;
2586 }
2587
2588 numa_group->max_faults_cpu = max_faults;
2589 numa_group->active_nodes = active_nodes;
2590}
2591
2592/*
2593 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2594 * increments. The more local the fault statistics are, the higher the scan
2595 * period will be for the next scan window. If local/(local+remote) ratio is
2596 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
2597 * the scan period will decrease. Aim for 70% local accesses.
2598 */
2599#define NUMA_PERIOD_SLOTS 10
2600#define NUMA_PERIOD_THRESHOLD 7
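/*
 * For example, with a 1000 ms scan period each slot is 100 ms: a task whose
 * faults are ~90% local (or ~90% private) sits 2 slots above the threshold
 * and its next period grows by 200 ms, while a task at ~40% on both counts
 * sees it shrink by 300 ms (subject to the task_scan_min/max clamp).
 */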
2601
2602/*
2603 * Increase the scan period (slow down scanning) if the majority of
2604 * our memory is already on our local node, or if the majority of
2605 * the page accesses are shared with other processes.
2606 * Otherwise, decrease the scan period.
2607 */
2608static void update_task_scan_period(struct task_struct *p,
2609 unsigned long shared, unsigned long private)
2610{
2611 unsigned int period_slot;
2612 int lr_ratio, ps_ratio;
2613 int diff;
2614
2615 unsigned long remote = p->numa_faults_locality[0];
2616 unsigned long local = p->numa_faults_locality[1];
2617
2618 /*
2619	 * If there were no recorded hinting faults then either the task is
2620	 * completely idle or all activity is in areas that are not of interest
2621	 * to automatic numa balancing. Related to that, if there were failed
2622	 * migrations then it implies we are migrating too quickly or the local
2623	 * node is overloaded. In either case, scan slower.
2624 */
2625 if (local + shared == 0 || p->numa_faults_locality[2]) {
2626 p->numa_scan_period = min(p->numa_scan_period_max,
2627 p->numa_scan_period << 1);
2628
2629 p->mm->numa_next_scan = jiffies +
2630			msecs_to_jiffies(p->numa_scan_period);
2631
2632 return;
2633 }
2634
2635 /*
2636 * Prepare to scale scan period relative to the current period.
2637 * == NUMA_PERIOD_THRESHOLD scan period stays the same
2638 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2639 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2640 */
2641 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2642 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2643 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2644
2645 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2646 /*
2647 * Most memory accesses are local. There is no need to
2648 * do fast NUMA scanning, since memory is already local.
2649 */
2650 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2651 if (!slot)
2652 slot = 1;
2653 diff = slot * period_slot;
2654 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2655 /*
2656 * Most memory accesses are shared with other tasks.
2657 * There is no point in continuing fast NUMA scanning,
2658 * since other tasks may just move the memory elsewhere.
2659 */
2660 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2661 if (!slot)
2662 slot = 1;
2663 diff = slot * period_slot;
2664 } else {
2665 /*
2666 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2667 * yet they are not on the local NUMA node. Speed up
2668 * NUMA scanning to get the memory moved over.
2669 */
2670 int ratio = max(lr_ratio, ps_ratio);
2671 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2672 }
2673
2674 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2675 task_scan_min(p), task_scan_max(p));
2676 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2677}
2678
2679/*
2680 * Get the fraction of time the task has been running since the last
2681 * NUMA placement cycle. The scheduler keeps similar statistics, but
2682 * decays those on a 32ms period, which is orders of magnitude off
2683 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2684 * stats only if the task is so new there are no NUMA statistics yet.
2685 */
2686static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2687{
2688 u64 runtime, delta, now;
2689 /* Use the start of this time slice to avoid calculations. */
2690 now = p->se.exec_start;
2691 runtime = p->se.sum_exec_runtime;
2692
2693 if (p->last_task_numa_placement) {
2694 delta = runtime - p->last_sum_exec_runtime;
2695 *period = now - p->last_task_numa_placement;
2696
2697 /* Avoid time going backwards, prevent potential divide error: */
2698 if (unlikely((s64)*period < 0))
2699 *period = 0;
2700 } else {
2701 delta = p->se.avg.load_sum;
2702 *period = LOAD_AVG_MAX;
2703 }
2704
2705 p->last_sum_exec_runtime = runtime;
2706 p->last_task_numa_placement = now;
2707
2708 return delta;
2709}
2710
2711/*
2712 * Determine the preferred nid for a task in a numa_group. This needs to
2713 * be done in a way that produces consistent results with group_weight,
2714 * otherwise workloads might not converge.
2715 */
2716static int preferred_group_nid(struct task_struct *p, int nid)
2717{
2718 nodemask_t nodes;
2719 int dist;
2720
2721 /* Direct connections between all NUMA nodes. */
2722 if (sched_numa_topology_type == NUMA_DIRECT)
2723 return nid;
2724
2725 /*
2726 * On a system with glueless mesh NUMA topology, group_weight
2727 * scores nodes according to the number of NUMA hinting faults on
2728 * both the node itself, and on nearby nodes.
2729 */
2730 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2731 unsigned long score, max_score = 0;
2732 int node, max_node = nid;
2733
2734 dist = sched_max_numa_distance;
2735
2736 for_each_node_state(node, N_CPU) {
2737			score = group_weight(p, node, dist);
2738 if (score > max_score) {
2739 max_score = score;
2740 max_node = node;
2741 }
2742 }
2743 return max_node;
2744 }
2745
2746 /*
2747 * Finding the preferred nid in a system with NUMA backplane
2748 * interconnect topology is more involved. The goal is to locate
2749 * tasks from numa_groups near each other in the system, and
2750 * untangle workloads from different sides of the system. This requires
2751 * searching down the hierarchy of node groups, recursively searching
2752 * inside the highest scoring group of nodes. The nodemask tricks
2753 * keep the complexity of the search down.
2754 */
2755 nodes = node_states[N_CPU];
2756 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2757 unsigned long max_faults = 0;
2758 nodemask_t max_group = NODE_MASK_NONE;
2759 int a, b;
2760
2761 /* Are there nodes at this distance from each other? */
2762		if (!find_numa_distance(dist))
2763 continue;
2764
2765 for_each_node_mask(a, nodes) {
2766 unsigned long faults = 0;
2767 nodemask_t this_group;
2768 nodes_clear(this_group);
2769
2770 /* Sum group's NUMA faults; includes a==b case. */
2771 for_each_node_mask(b, nodes) {
2772 if (node_distance(a, b) < dist) {
2773					faults += group_faults(p, b);
2774 node_set(b, this_group);
2775 node_clear(b, nodes);
2776 }
2777 }
2778
2779 /* Remember the top group. */
2780 if (faults > max_faults) {
2781 max_faults = faults;
2782 max_group = this_group;
2783 /*
2784 * subtle: at the smallest distance there is
2785 * just one node left in each "group", the
2786 * winner is the preferred nid.
2787 */
2788 nid = a;
2789 }
2790 }
2791 /* Next round, evaluate the nodes within max_group. */
2792 if (!max_faults)
2793 break;
2794 nodes = max_group;
2795 }
2796 return nid;
2797}
2798
2799static void task_numa_placement(struct task_struct *p)
2800{
2801 int seq, nid, max_nid = NUMA_NO_NODE;
2802 unsigned long max_faults = 0;
2803 unsigned long fault_types[2] = { 0, 0 };
2804 unsigned long total_faults;
2805 u64 runtime, period;
2806 spinlock_t *group_lock = NULL;
2807 struct numa_group *ng;
2808
2809 /*
2810 * The p->mm->numa_scan_seq field gets updated without
2811 * exclusive access. Use READ_ONCE() here to ensure
2812 * that the field is read in a single access:
2813 */
2814 seq = READ_ONCE(p->mm->numa_scan_seq);
2815 if (p->numa_scan_seq == seq)
2816 return;
2817 p->numa_scan_seq = seq;
2818 p->numa_scan_period_max = task_scan_max(p);
2819
2820 total_faults = p->numa_faults_locality[0] +
2821 p->numa_faults_locality[1];
2822	runtime = numa_get_avg_runtime(p, &period);
2823
2824 /* If the task is part of a group prevent parallel updates to group stats */
2825 ng = deref_curr_numa_group(p);
2826 if (ng) {
2827 group_lock = &ng->lock;
2828		spin_lock_irq(group_lock);
2829 }
2830
2831 /* Find the node with the highest number of faults */
2832 for_each_online_node(nid) {
2833 /* Keep track of the offsets in numa_faults array */
2834 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2835 unsigned long faults = 0, group_faults = 0;
2836 int priv;
2837
2838 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2839 long diff, f_diff, f_weight;
2840
2841			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2842			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2843			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2844			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2845
2846 /* Decay existing window, copy faults since last scan */
2847 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2848 fault_types[priv] += p->numa_faults[membuf_idx];
2849 p->numa_faults[membuf_idx] = 0;
2850
2851 /*
2852 * Normalize the faults_from, so all tasks in a group
2853 * count according to CPU use, instead of by the raw
2854 * number of faults. Tasks with little runtime have
2855 * little over-all impact on throughput, and thus their
2856 * faults are less important.
2857 */
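			/*
			 * runtime/period is kept in fixed point (scaled by
			 * 2^16) and then weighted by this node's share of the
			 * buffered CPU faults relative to the task's total
			 * hinting faults.
			 */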
2858			f_weight = div64_u64(runtime << 16, period + 1);
2859 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2860 (total_faults + 1);
2861 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2862 p->numa_faults[cpubuf_idx] = 0;
2863
2864 p->numa_faults[mem_idx] += diff;
2865 p->numa_faults[cpu_idx] += f_diff;
2866 faults += p->numa_faults[mem_idx];
2867 p->total_numa_faults += diff;
2868 if (ng) {
2869 /*
2870 * safe because we can only change our own group
2871 *
2872 * mem_idx represents the offset for a given
2873 * nid and priv in a specific region because it
2874 * is at the beginning of the numa_faults array.
2875 */
2876 ng->faults[mem_idx] += diff;
2877 ng->faults[cpu_idx] += f_diff;
2878 ng->total_faults += diff;
2879 group_faults += ng->faults[mem_idx];
2880 }
2881 }
2882
2883 if (!ng) {
2884 if (faults > max_faults) {
2885 max_faults = faults;
2886 max_nid = nid;
2887 }
2888 } else if (group_faults > max_faults) {
2889 max_faults = group_faults;
2890 max_nid = nid;
2891 }
2892 }
2893
2894 /* Cannot migrate task to CPU-less node */
2895	max_nid = numa_nearest_node(max_nid, N_CPU);
2896
2897 if (ng) {
2898		numa_group_count_active_nodes(ng);
2899		spin_unlock_irq(group_lock);
2900		max_nid = preferred_group_nid(p, max_nid);
2901 }
2902
2903 if (max_faults) {
2904 /* Set the new preferred node */
2905 if (max_nid != p->numa_preferred_nid)
2906			sched_setnuma(p, max_nid);
2907 }
2908
2909	update_task_scan_period(p, fault_types[0], fault_types[1]);
2910}
2911
2912static inline int get_numa_group(struct numa_group *grp)
2913{
2914	return refcount_inc_not_zero(&grp->refcount);
2915}
2916
2917static inline void put_numa_group(struct numa_group *grp)
2918{
2919	if (refcount_dec_and_test(&grp->refcount))
2920 kfree_rcu(grp, rcu);
2921}
2922
2923static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2924 int *priv)
2925{
2926 struct numa_group *grp, *my_grp;
2927 struct task_struct *tsk;
2928 bool join = false;
2929 int cpu = cpupid_to_cpu(cpupid);
2930 int i;
2931
2932 if (unlikely(!deref_curr_numa_group(p))) {
2933 unsigned int size = sizeof(struct numa_group) +
2934 NR_NUMA_HINT_FAULT_STATS *
2935 nr_node_ids * sizeof(unsigned long);
2936
2937 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2938 if (!grp)
2939 return;
2940
2941		refcount_set(&grp->refcount, 1);
2942 grp->active_nodes = 1;
2943 grp->max_faults_cpu = 0;
2944 spin_lock_init(&grp->lock);
2945 grp->gid = p->pid;
2946
2947 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2948 grp->faults[i] = p->numa_faults[i];
2949
2950 grp->total_faults = p->total_numa_faults;
2951
2952 grp->nr_tasks++;
2953 rcu_assign_pointer(p->numa_group, grp);
2954 }
2955
2956 rcu_read_lock();
2957 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2958
2959 if (!cpupid_match_pid(tsk, cpupid))
2960 goto no_join;
2961
2962 grp = rcu_dereference(tsk->numa_group);
2963 if (!grp)
2964 goto no_join;
2965
2966 my_grp = deref_curr_numa_group(p);
2967 if (grp == my_grp)
2968 goto no_join;
2969
2970 /*
2971	 * Only join the other group if it's bigger; if we're the bigger group,
2972 * the other task will join us.
2973 */
2974 if (my_grp->nr_tasks > grp->nr_tasks)
2975 goto no_join;
2976
2977 /*
2978 * Tie-break on the grp address.
2979 */
2980 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2981 goto no_join;
2982
2983 /* Always join threads in the same process. */
2984 if (tsk->mm == current->mm)
2985 join = true;
2986
2987 /* Simple filter to avoid false positives due to PID collisions */
2988 if (flags & TNF_SHARED)
2989 join = true;
2990
2991 /* Update priv based on whether false sharing was detected */
2992 *priv = !join;
2993
2994 if (join && !get_numa_group(grp))
2995 goto no_join;
2996
2997 rcu_read_unlock();
2998
2999 if (!join)
3000 return;
3001
3002 WARN_ON_ONCE(irqs_disabled());
3003	double_lock_irq(&my_grp->lock, &grp->lock);
3004
3005 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
3006 my_grp->faults[i] -= p->numa_faults[i];
3007 grp->faults[i] += p->numa_faults[i];
3008 }
3009 my_grp->total_faults -= p->total_numa_faults;
3010 grp->total_faults += p->total_numa_faults;
3011
3012 my_grp->nr_tasks--;
3013 grp->nr_tasks++;
3014
3015	spin_unlock(&my_grp->lock);
3016	spin_unlock_irq(&grp->lock);
3017
3018 rcu_assign_pointer(p->numa_group, grp);
3019
3020	put_numa_group(my_grp);
3021 return;
3022
3023no_join:
3024 rcu_read_unlock();
3025 return;
3026}
3027
3028/*
3029 * Get rid of NUMA statistics associated with a task (either current or dead).
3030 * If @final is set, the task is dead and has reached refcount zero, so we can
3031 * safely free all relevant data structures. Otherwise, there might be
3032 * concurrent reads from places like load balancing and procfs, and we should
3033 * reset the data back to default state without freeing ->numa_faults.
3034 */
3035void task_numa_free(struct task_struct *p, bool final)
3036{
3037 /* safe: p either is current or is being freed by current */
3038 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
3039 unsigned long *numa_faults = p->numa_faults;
3040 unsigned long flags;
3041 int i;
3042
3043 if (!numa_faults)
3044 return;
3045
3046 if (grp) {
3047 spin_lock_irqsave(&grp->lock, flags);
3048 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3049 grp->faults[i] -= p->numa_faults[i];
3050 grp->total_faults -= p->total_numa_faults;
3051
3052 grp->nr_tasks--;
3053		spin_unlock_irqrestore(&grp->lock, flags);
3054 RCU_INIT_POINTER(p->numa_group, NULL);
3055 put_numa_group(grp);
3056 }
3057
3058 if (final) {
3059 p->numa_faults = NULL;
3060		kfree(numa_faults);
3061 } else {
3062 p->total_numa_faults = 0;
3063 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3064 numa_faults[i] = 0;
3065 }
3066}
3067
3068/*
3069 * Got a PROT_NONE fault for a page on @node.
3070 */
3071void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
3072{
3073 struct task_struct *p = current;
3074 bool migrated = flags & TNF_MIGRATED;
3075 int cpu_node = task_node(current);
3076 int local = !!(flags & TNF_FAULT_LOCAL);
3077 struct numa_group *ng;
3078 int priv;
3079
3080 if (!static_branch_likely(&sched_numa_balancing))
3081 return;
3082
3083 /* for example, ksmd faulting in a user's mm */
3084 if (!p->mm)
3085 return;
3086
3087 /*
3088 * NUMA faults statistics are unnecessary for the slow memory
3089 * node for memory tiering mode.
3090 */
3091	if (!node_is_toptier(mem_node) &&
3092	    (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
3093	     !cpupid_valid(last_cpupid)))
3094 return;
3095
3096 /* Allocate buffer to track faults on a per-node basis */
3097 if (unlikely(!p->numa_faults)) {
3098 int size = sizeof(*p->numa_faults) *
3099 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
3100
3101 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
3102 if (!p->numa_faults)
3103 return;
3104
3105 p->total_numa_faults = 0;
3106 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
3107 }
3108
3109 /*
3110 * First accesses are treated as private, otherwise consider accesses
3111 * to be private if the accessing pid has not changed
3112 */
3113 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
3114 priv = 1;
3115 } else {
3116 priv = cpupid_match_pid(p, last_cpupid);
3117 if (!priv && !(flags & TNF_NO_GROUP))
3118			task_numa_group(p, last_cpupid, flags, &priv);
3119 }
3120
3121 /*
3122 * If a workload spans multiple NUMA nodes, a shared fault that
3123 * occurs wholly within the set of nodes that the workload is
3124 * actively using should be counted as local. This allows the
3125 * scan rate to slow down when a workload has settled down.
3126 */
3127 ng = deref_curr_numa_group(p);
3128 if (!priv && !local && ng && ng->active_nodes > 1 &&
3129				numa_is_active_node(cpu_node, ng) &&
3130				numa_is_active_node(mem_node, ng))
3131 local = 1;
3132
3133 /*
3134 * Retry to migrate task to preferred node periodically, in case it
3135 * previously failed, or the scheduler moved us.
3136 */
3137 if (time_after(jiffies, p->numa_migrate_retry)) {
3138 task_numa_placement(p);
3139 numa_migrate_preferred(p);
3140 }
3141
3142 if (migrated)
3143 p->numa_pages_migrated += pages;
3144 if (flags & TNF_MIGRATE_FAIL)
3145 p->numa_faults_locality[2] += pages;
3146
3147	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
3148	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
3149 p->numa_faults_locality[local] += pages;
3150}
3151
3152static void reset_ptenuma_scan(struct task_struct *p)
3153{
3154 /*
3155 * We only did a read acquisition of the mmap sem, so
3156 * p->mm->numa_scan_seq is written to without exclusive access
3157 * and the update is not guaranteed to be atomic. That's not
3158 * much of an issue though, since this is just used for
3159 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
3160 * expensive, to avoid any form of compiler optimizations:
3161 */
3162 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
3163 p->mm->numa_scan_offset = 0;
3164}
3165
3166static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
3167{
3168 unsigned long pids;
3169 /*
3170	 * Allow unconditional access for the first two scans, so that all the
3171	 * pages of the VMA get a prot_none fault introduced irrespective of
3172	 * accesses. This is also done to avoid any side effect of task scanning
3173	 * amplifying the unfairness of a disjoint set of VMAs' access.
3174 */
3175 if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
3176 return true;
3177
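	/*
	 * Each faulting task hashes its PID to one bit in the pids_active[]
	 * windows; if this task's bit is set in either the current or the
	 * previous window, the VMA counts as recently accessed by it (hash
	 * collisions can yield false positives).
	 */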
3178 pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
3179 if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
3180 return true;
3181
3182 /*
3183 * Complete a scan that has already started regardless of PID access, or
3184 * some VMAs may never be scanned in multi-threaded applications:
3185 */
3186 if (mm->numa_scan_offset > vma->vm_start) {
3187		trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
3188 return true;
3189 }
3190
3191 return false;
3192}
3193
3194#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
3195
3196/*
3197 * The expensive part of numa migration is done from task_work context.
3198 * Triggered from task_tick_numa().
3199 */
3200static void task_numa_work(struct callback_head *work)
3201{
3202 unsigned long migrate, next_scan, now = jiffies;
3203 struct task_struct *p = current;
3204 struct mm_struct *mm = p->mm;
3205 u64 runtime = p->se.sum_exec_runtime;
3206 struct vm_area_struct *vma;
3207 unsigned long start, end;
3208 unsigned long nr_pte_updates = 0;
3209 long pages, virtpages;
3210 struct vma_iterator vmi;
3211 bool vma_pids_skipped;
3212 bool vma_pids_forced = false;
3213
3214 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
3215
3216 work->next = work;
3217 /*
3218 * Who cares about NUMA placement when they're dying.
3219 *
3220 * NOTE: make sure not to dereference p->mm before this check,
3221 * exit_task_work() happens _after_ exit_mm() so we could be called
3222 * without p->mm even though we still had it when we enqueued this
3223 * work.
3224 */
3225 if (p->flags & PF_EXITING)
3226 return;
3227
3228 if (!mm->numa_next_scan) {
3229 mm->numa_next_scan = now +
3230			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3231 }
3232
3233 /*
3234 * Enforce maximal scan/migration frequency..
3235 */
3236 migrate = mm->numa_next_scan;
3237 if (time_before(now, migrate))
3238 return;
3239
3240 if (p->numa_scan_period == 0) {
3241 p->numa_scan_period_max = task_scan_max(p);
3242 p->numa_scan_period = task_scan_start(p);
3243 }
3244
3245	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
3246 if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
3247 return;
3248
3249 /*
3250 * Delay this task enough that another task of this mm will likely win
3251 * the next time around.
3252 */
3253 p->node_stamp += 2 * TICK_NSEC;
3254
3255 pages = sysctl_numa_balancing_scan_size;
3256 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
3257 virtpages = pages * 8; /* Scan up to this much virtual space */
3258 if (!pages)
3259 return;
3260
3261
3262 if (!mmap_read_trylock(mm))
3263 return;
3264
3265 /*
3266 * VMAs are skipped if the current PID has not trapped a fault within
3267 * the VMA recently. Allow scanning to be forced if there is no
3268 * suitable VMA remaining.
3269 */
3270 vma_pids_skipped = false;
3271
3272retry_pids:
3273 start = mm->numa_scan_offset;
3274	vma_iter_init(&vmi, mm, start);
3275	vma = vma_next(&vmi);
3276 if (!vma) {
3277 reset_ptenuma_scan(p);
3278 start = 0;
3279		vma_iter_set(&vmi, start);
3280		vma = vma_next(&vmi);
3281 }
3282
3283 do {
3284 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
3285 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
3286			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
3287 continue;
3288 }
3289
3290 /*
3291 * Shared library pages mapped by multiple processes are not
3292 * migrated as it is expected they are cache replicated. Avoid
3293 * hinting faults in read-only file-backed mappings or the vdso
3294 * as migrating the pages will be of marginal benefit.
3295 */
3296 if (!vma->vm_mm ||
3297 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
3298			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
3299 continue;
3300 }
3301
3302 /*
3303 * Skip inaccessible VMAs to avoid any confusion between
3304 * PROT_NONE and NUMA hinting ptes
3305 */
3306 if (!vma_is_accessible(vma)) {
3307			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
3308 continue;
3309 }
3310
3311 /* Initialise new per-VMA NUMAB state. */
3312 if (!vma->numab_state) {
3313			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
3314 GFP_KERNEL);
3315 if (!vma->numab_state)
3316 continue;
3317
3318 vma->numab_state->start_scan_seq = mm->numa_scan_seq;
3319
3320 vma->numab_state->next_scan = now +
3321				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3322
3323 /* Reset happens after 4 times scan delay of scan start */
3324 vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
3325 msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3326
3327 /*
3328 * Ensure prev_scan_seq does not match numa_scan_seq,
3329 * to prevent VMAs being skipped prematurely on the
3330 * first scan:
3331 */
3332 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
3333 }
3334
3335 /*
3336		 * Scanning the VMAs of short-lived tasks adds more overhead, so
3337		 * delay the scan for new VMAs.
3338 */
3339 if (mm->numa_scan_seq && time_before(jiffies,
3340 vma->numab_state->next_scan)) {
3341			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
3342 continue;
3343 }
3344
3345 /* RESET access PIDs regularly for old VMAs. */
3346 if (mm->numa_scan_seq &&
3347 time_after(jiffies, vma->numab_state->pids_active_reset)) {
3348 vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
3349 msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3350 vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
3351 vma->numab_state->pids_active[1] = 0;
3352 }
3353
3354 /* Do not rescan VMAs twice within the same sequence. */
3355 if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
3356 mm->numa_scan_offset = vma->vm_end;
3357			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
3358 continue;
3359 }
3360
3361 /*
3362 * Do not scan the VMA if task has not accessed it, unless no other
3363 * VMA candidate exists.
3364 */
3365 if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
3366 vma_pids_skipped = true;
3367			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
3368 continue;
3369 }
3370
3371 do {
3372 start = max(start, vma->vm_start);
3373 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
3374 end = min(end, vma->vm_end);
3375 nr_pte_updates = change_prot_numa(vma, start, end);
3376
3377 /*
3378			 * Try to scan sysctl_numa_balancing_scan_size worth of
3379 * hpages that have at least one present PTE that
3380 * is not already pte-numa. If the VMA contains
3381 * areas that are unused or already full of prot_numa
3382 * PTEs, scan up to virtpages, to skip through those
3383 * areas faster.
3384 */
3385 if (nr_pte_updates)
3386 pages -= (end - start) >> PAGE_SHIFT;
3387 virtpages -= (end - start) >> PAGE_SHIFT;
3388
3389 start = end;
3390 if (pages <= 0 || virtpages <= 0)
3391 goto out;
3392
3393 cond_resched();
3394 } while (end != vma->vm_end);
3395
3396 /* VMA scan is complete, do not scan until next sequence. */
3397 vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
3398
3399 /*
3400 * Only force scan within one VMA at a time, to limit the
3401 * cost of scanning a potentially uninteresting VMA.
3402 */
3403 if (vma_pids_forced)
3404 break;
3405 } for_each_vma(vmi, vma);
3406
3407 /*
3408 * If no VMAs are remaining and VMAs were skipped due to the PID
3409 * not accessing the VMA previously, then force a scan to ensure
3410 * forward progress:
3411 */
3412 if (!vma && !vma_pids_forced && vma_pids_skipped) {
3413 vma_pids_forced = true;
3414 goto retry_pids;
3415 }
3416
3417out:
3418 /*
3419 * It is possible to reach the end of the VMA list but the last few
3420	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
3421 * would find the !migratable VMA on the next scan but not reset the
3422 * scanner to the start so check it now.
3423 */
3424 if (vma)
3425 mm->numa_scan_offset = start;
3426 else
3427 reset_ptenuma_scan(p);
3428 mmap_read_unlock(mm);
3429
3430 /*
3431 * Make sure tasks use at least 32x as much time to run other code
3432 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
3433 * Usually update_task_scan_period slows down scanning enough; on an
3434 * overloaded system we need to limit overhead on a per task basis.
3435 */
3436 if (unlikely(p->se.sum_exec_runtime != runtime)) {
3437 u64 diff = p->se.sum_exec_runtime - runtime;
3438 p->node_stamp += 32 * diff;
3439 }
3440}
3441
3442void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
3443{
3444 int mm_users = 0;
3445 struct mm_struct *mm = p->mm;
3446
3447 if (mm) {
3448		mm_users = atomic_read(&mm->mm_users);
3449 if (mm_users == 1) {
3450			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3451 mm->numa_scan_seq = 0;
3452 }
3453 }
3454 p->node_stamp = 0;
3455 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
3456 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
3457 p->numa_migrate_retry = 0;
3458 /* Protect against double add, see task_tick_numa and task_numa_work */
3459 p->numa_work.next = &p->numa_work;
3460 p->numa_faults = NULL;
3461 p->numa_pages_migrated = 0;
3462 p->total_numa_faults = 0;
3463 RCU_INIT_POINTER(p->numa_group, NULL);
3464 p->last_task_numa_placement = 0;
3465 p->last_sum_exec_runtime = 0;
3466
3467	init_task_work(&p->numa_work, task_numa_work);
3468
3469 /* New address space, reset the preferred nid */
3470 if (!(clone_flags & CLONE_VM)) {
3471 p->numa_preferred_nid = NUMA_NO_NODE;
3472 return;
3473 }
3474
3475 /*
3476 * New thread, keep existing numa_preferred_nid which should be copied
3477 * already by arch_dup_task_struct but stagger when scans start.
3478 */
3479 if (mm) {
3480 unsigned int delay;
3481
3482 delay = min_t(unsigned int, task_scan_max(current),
3483 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
3484 delay += 2 * TICK_NSEC;
3485 p->node_stamp = delay;
3486 }
3487}
3488
3489/*
3490 * Drive the periodic memory faults..
3491 */
3492static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3493{
3494 struct callback_head *work = &curr->numa_work;
3495 u64 period, now;
3496
3497 /*
3498 * We don't care about NUMA placement if we don't have memory.
3499 */
3500 if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
3501 return;
3502
3503 /*
3504 * Using runtime rather than walltime has the dual advantage that
3505 * we (mostly) drive the selection from busy threads and that the
3506 * task needs to have done some actual work before we bother with
3507 * NUMA placement.
3508 */
3509 now = curr->se.sum_exec_runtime;
3510 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
3511
3512 if (now > curr->node_stamp + period) {
3513 if (!curr->node_stamp)
3514			curr->numa_scan_period = task_scan_start(curr);
3515 curr->node_stamp += period;
3516
3517 if (!time_before(jiffies, curr->mm->numa_next_scan))
3518			task_work_add(curr, work, TWA_RESUME);
3519 }
3520}
3521
3522static void update_scan_period(struct task_struct *p, int new_cpu)
3523{
3524	int src_nid = cpu_to_node(task_cpu(p));
3525	int dst_nid = cpu_to_node(new_cpu);
3526
3527 if (!static_branch_likely(&sched_numa_balancing))
3528 return;
3529
3530 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
3531 return;
3532
3533 if (src_nid == dst_nid)
3534 return;
3535
3536 /*
3537 * Allow resets if faults have been trapped before one scan
3538 * has completed. This is most likely due to a new task that
3539 * is pulled cross-node due to wakeups or load balancing.
3540 */
3541 if (p->numa_scan_seq) {
3542 /*
3543 * Avoid scan adjustments if moving to the preferred
3544 * node or if the task was not previously running on
3545 * the preferred node.
3546 */
3547 if (dst_nid == p->numa_preferred_nid ||
3548 (p->numa_preferred_nid != NUMA_NO_NODE &&
3549 src_nid != p->numa_preferred_nid))
3550 return;
3551 }
3552
3553 p->numa_scan_period = task_scan_start(p);
3554}
3555
3556#else
3557static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3558{
3559}
3560
3561static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
3562{
3563}
3564
3565static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
3566{
3567}
3568
3569static inline void update_scan_period(struct task_struct *p, int new_cpu)
3570{
3571}
3572
3573#endif /* CONFIG_NUMA_BALANCING */
3574
3575static void
3576account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3577{
3578	update_load_add(&cfs_rq->load, se->load.weight);
3579#ifdef CONFIG_SMP
3580 if (entity_is_task(se)) {
3581 struct rq *rq = rq_of(cfs_rq);
3582
3583		account_numa_enqueue(rq, task_of(se));
3584		list_add(&se->group_node, &rq->cfs_tasks);
3585 }
3586#endif
3587 cfs_rq->nr_running++;
3588 if (se_is_idle(se))
3589 cfs_rq->idle_nr_running++;
3590}
3591
3592static void
3593account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3594{
3595	update_load_sub(&cfs_rq->load, se->load.weight);
3596#ifdef CONFIG_SMP
3597 if (entity_is_task(se)) {
3598		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3599		list_del_init(&se->group_node);
3600 }
3601#endif
3602 cfs_rq->nr_running--;
3603 if (se_is_idle(se))
3604 cfs_rq->idle_nr_running--;
3605}
3606
3607/*
3608 * Signed add and clamp on underflow.
3609 *
3610 * Explicitly do a load-store to ensure the intermediate value never hits
3611 * memory. This allows lockless observations without ever seeing the negative
3612 * values.
3613 */
3614#define add_positive(_ptr, _val) do { \
3615 typeof(_ptr) ptr = (_ptr); \
3616 typeof(_val) val = (_val); \
3617 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3618 \
3619 res = var + val; \
3620 \
3621 if (val < 0 && res > var) \
3622 res = 0; \
3623 \
3624 WRITE_ONCE(*ptr, res); \
3625} while (0)
3626
3627/*
3628 * Unsigned subtract and clamp on underflow.
3629 *
3630 * Explicitly do a load-store to ensure the intermediate value never hits
3631 * memory. This allows lockless observations without ever seeing the negative
3632 * values.
3633 */
3634#define sub_positive(_ptr, _val) do { \
3635 typeof(_ptr) ptr = (_ptr); \
3636 typeof(*ptr) val = (_val); \
3637 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3638 res = var - val; \
3639 if (res > var) \
3640 res = 0; \
3641 WRITE_ONCE(*ptr, res); \
3642} while (0)
3643
3644/*
3645 * Remove and clamp on negative, from a local variable.
3646 *
3647 * A variant of sub_positive(), which does not use explicit load-store
3648 * and is thus optimized for local variable updates.
3649 */
3650#define lsub_positive(_ptr, _val) do { \
3651 typeof(_ptr) ptr = (_ptr); \
3652 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3653} while (0)
3654
3655#ifdef CONFIG_SMP
3656static inline void
3657enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3658{
3659 cfs_rq->avg.load_avg += se->avg.load_avg;
3660 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3661}
3662
3663static inline void
3664dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3665{
3666 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3667 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3668 /* See update_cfs_rq_load_avg() */
3669 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3670 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3671}
3672#else
3673static inline void
3674enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3675static inline void
3676dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3677#endif
3678
3679static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
3680 unsigned long weight)
3681{
3682 unsigned long old_weight = se->load.weight;
3683 u64 avruntime = avg_vruntime(cfs_rq);
3684 s64 vlag, vslice;
3685
3686 /*
3687 * VRUNTIME
3688 * ========
3689 *
3690 * COROLLARY #1: The virtual runtime of the entity needs to be
3691 * adjusted if re-weight at !0-lag point.
3692 *
3693 * Proof: For contradiction assume this is not true, so we can
3694 * re-weight without changing vruntime at !0-lag point.
3695 *
3696 * Weight VRuntime Avg-VRuntime
3697 * before w v V
3698 * after w' v' V'
3699 *
3700 * Since lag needs to be preserved through re-weight:
3701 *
3702 * lag = (V - v)*w = (V'- v')*w', where v = v'
3703 * ==> V' = (V - v)*w/w' + v (1)
3704 *
3705 * Let W be the total weight of the entities before reweight,
3706 * since V' is the new weighted average of entities:
3707 *
3708 * V' = (WV + w'v - wv) / (W + w' - w) (2)
3709 *
3710 * by using (1) & (2) we obtain:
3711 *
3712 * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
3713 * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
3714 * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
3715 * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
3716 *
3717 * Since we are doing this at a !0-lag point, which means V != v,
3718 * we can simplify (3):
3719 *
3720 * ==> W / (W + w' - w) = w / w'
3721 * ==> Ww' = Ww + ww' - ww
3722 * ==> W * (w' - w) = w * (w' - w)
3723 * ==> W = w (re-weight indicates w' != w)
3724 *
3725 * So the cfs_rq contains only one entity, hence the vruntime of
3726 * the entity @v should always equal the cfs_rq's weighted
3727 * average vruntime @V, which means we would always re-weight
3728 * at the 0-lag point, contradicting the assumption. Proof completed.
3729 *
3730 *
3731 * COROLLARY #2: Re-weight does NOT affect weighted average
3732 * vruntime of all the entities.
3733 *
3734 * Proof: According to corollary #1, Eq. (1) should be:
3735 *
3736 * (V - v)*w = (V' - v')*w'
3737 * ==> v' = V' - (V - v)*w/w' (4)
3738 *
3739 * According to the weighted average formula, we have:
3740 *
3741 * V' = (WV - wv + w'v') / (W - w + w')
3742 * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
3743 * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
3744 * = (WV + w'V' - Vw) / (W - w + w')
3745 *
3746 * ==> V'*(W - w + w') = WV + w'V' - Vw
3747 * ==> V' * (W - w) = (W - w) * V (5)
3748 *
3749 * If the entity is the only one in the cfs_rq, then reweight
3750 * always occurs at 0-lag point, so V won't change. Or else
3751 * there are other entities, hence W != w, then Eq. (5) turns
3752 * into V' = V. So V won't change in either case, proof done.
3753 *
3754 *
3755 * So according to corollary #1 & #2, the effect of re-weight
3756 * on vruntime should be:
3757 *
3758 * v' = V' - (V - v) * w / w' (4)
3759 * = V - (V - v) * w / w'
3760 * = V - vl * w / w'
3761 * = V - vl'
3762 */
3763 if (avruntime != se->vruntime) {
3764 vlag = (s64)(avruntime - se->vruntime);
3765 vlag = div_s64(vlag * old_weight, weight);
3766 se->vruntime = avruntime - vlag;
3767 }
3768
3769 /*
3770 * DEADLINE
3771 * ========
3772 *
3773 * When the weight changes, the virtual time slope changes and
3774 * we should adjust the relative virtual deadline accordingly.
3775 *
3776 * d' = v' + (d - v)*w/w'
3777 * = V' - (V - v)*w/w' + (d - v)*w/w'
3778 * = V - (V - v)*w/w' + (d - v)*w/w'
3779 * = V + (d - V)*w/w'
3780 */
3781 vslice = (s64)(se->deadline - avruntime);
3782 vslice = div_s64(vslice * old_weight, weight);
3783 se->deadline = avruntime + vslice;
3784}
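
/*
 * Worked example of the above with made-up numbers: assume V = 1000,
 * v = 900, d = 1100, old weight w = 2048 and new weight w' = 1024.
 *
 *   vlag   = (V - v) * w / w' = 100 * 2048 / 1024 = 200   ==>  v' = 800
 *   vslice = (d - V) * w / w' = 100 * 2048 / 1024 = 200   ==>  d' = 1200
 *
 * Lag is preserved, (V - v)*w = 100*2048 == (V - v')*w' = 200*1024, and
 * the deadline satisfies d' = v' + (d - v)*w/w' = 800 + 400 = 1200.
 */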
3785
3786static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3787 unsigned long weight)
3788{
3789 bool curr = cfs_rq->curr == se;
3790
3791 if (se->on_rq) {
3792 /* commit outstanding execution time */
3793 if (curr)
3794 update_curr(cfs_rq);
3795 else
3796 __dequeue_entity(cfs_rq, se);
3797 update_load_sub(&cfs_rq->load, se->load.weight);
3798 }
3799 dequeue_load_avg(cfs_rq, se);
3800
3801 if (!se->on_rq) {
3802 /*
3803 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
3804 * we need to scale se->vlag when w_i changes.
3805 */
3806 se->vlag = div_s64(se->vlag * se->load.weight, weight);
3807 } else {
3808 reweight_eevdf(cfs_rq, se, weight);
3809 }
3810
3811 update_load_set(&se->load, weight);
3812
3813#ifdef CONFIG_SMP
3814 do {
3815 u32 divider = get_pelt_divider(&se->avg);
3816
3817 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3818 } while (0);
3819#endif
3820
3821 enqueue_load_avg(cfs_rq, se);
3822 if (se->on_rq) {
3823 update_load_add(&cfs_rq->load, se->load.weight);
3824 if (!curr)
3825 __enqueue_entity(cfs_rq, se);
3826
3827 /*
3828 * The entity's vruntime has been adjusted, so let's check
3829 * whether the rq-wide min_vruntime needs to be updated too. Since
3830 * the calculations above require a stable min_vruntime rather
3831 * than an up-to-date one, we do the update at the end of the
3832 * reweight process.
3833 */
3834 update_min_vruntime(cfs_rq);
3835 }
3836}
3837
3838void reweight_task(struct task_struct *p, int prio)
3839{
3840 struct sched_entity *se = &p->se;
3841 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3842 struct load_weight *load = &se->load;
3843 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3844
3845 reweight_entity(cfs_rq, se, weight);
3846 load->inv_weight = sched_prio_to_wmult[prio];
3847}
3848
3849static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3850
3851#ifdef CONFIG_FAIR_GROUP_SCHED
3852#ifdef CONFIG_SMP
3853/*
3854 * All this does is approximate the hierarchical proportion which includes that
3855 * global sum we all love to hate.
3856 *
3857 * That is, the weight of a group entity, is the proportional share of the
3858 * group weight based on the group runqueue weights. That is:
3859 *
3860 * tg->weight * grq->load.weight
3861 * ge->load.weight = ----------------------------- (1)
3862 * \Sum grq->load.weight
3863 *
3864 * Now, because that sum is prohibitively expensive to compute (been
3865 * there, done that), we approximate it with this average stuff. The average
3866 * moves slower and therefore the approximation is cheaper and more stable.
3867 *
3868 * So instead of the above, we substitute:
3869 *
3870 * grq->load.weight -> grq->avg.load_avg (2)
3871 *
3872 * which yields the following:
3873 *
3874 * tg->weight * grq->avg.load_avg
3875 * ge->load.weight = ------------------------------ (3)
3876 * tg->load_avg
3877 *
3878 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3879 *
3880 * That is shares_avg, and it is right (given the approximation (2)).
3881 *
3882 * The problem with it is that because the average is slow -- it was designed
3883 * to be exactly that of course -- this leads to transients in boundary
3884 * conditions. Specifically, the case where the group was idle and we start
3885 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3886 * yielding bad latency, etc.
3887 *
3888 * Now, in that special case (1) reduces to:
3889 *
3890 * tg->weight * grq->load.weight
3891 * ge->load.weight = ----------------------------- = tg->weight (4)
3892 * grq->load.weight
3893 *
3894 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3895 *
3896 * So what we do is modify our approximation (3) to approach (4) in the (near)
3897 * UP case, like:
3898 *
3899 * ge->load.weight =
3900 *
3901 * tg->weight * grq->load.weight
3902 * --------------------------------------------------- (5)
3903 * tg->load_avg - grq->avg.load_avg + grq->load.weight
3904 *
3905 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3906 * we need to use grq->avg.load_avg as its lower bound, which then gives:
3907 *
3908 *
3909 * tg->weight * grq->load.weight
3910 * ge->load.weight = ----------------------------- (6)
3911 * tg_load_avg'
3912 *
3913 * Where:
3914 *
3915 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3916 * max(grq->load.weight, grq->avg.load_avg)
3917 *
3918 * And that is shares_weight and is icky. In the (near) UP case it approaches
3919 * (4) while in the normal case it approaches (3). It consistently
3920 * overestimates the ge->load.weight and therefore:
3921 *
3922 * \Sum ge->load.weight >= tg->weight
3923 *
3924 * hence icky!
3925 */
3926static long calc_group_shares(struct cfs_rq *cfs_rq)
3927{
3928 long tg_weight, tg_shares, load, shares;
3929 struct task_group *tg = cfs_rq->tg;
3930
3931 tg_shares = READ_ONCE(tg->shares);
3932
3933 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3934
3935 tg_weight = atomic_long_read(&tg->load_avg);
3936
3937 /* Ensure tg_weight >= load */
3938 tg_weight -= cfs_rq->tg_load_avg_contrib;
3939 tg_weight += load;
3940
3941 shares = (tg_shares * load);
3942 if (tg_weight)
3943 shares /= tg_weight;
3944
3945 /*
3946 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3947 * of a group with small tg->shares value. It is a floor value which is
3948 * assigned as a minimum load.weight to the sched_entity representing
3949 * the group on a CPU.
3950 *
3951 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3952 * on an 8-core system with 8 tasks each runnable on one CPU shares has
3953 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3954 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3955 * instead of 0.
3956 */
3957 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3958}
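
/*
 * Illustration of (6) with made-up numbers: tg->shares = 1024, two CPUs,
 * this grq holding one freshly woken nice-0 task (load.weight = 1024)
 * whose load_avg has only built up to 512, a last propagated contribution
 * of 512 and tg->load_avg = 1024:
 *
 *   load      = max(1024, 512)     = 1024
 *   tg_weight = 1024 - 512 + 1024  = 1536
 *   shares    = 1024 * 1024 / 1536 ~= 682
 *
 * i.e. boosting by grq->load.weight hands this CPU roughly 2/3 of the
 * group weight, where the pure shares_avg form (3) would only give 1/2.
 */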
3959#endif /* CONFIG_SMP */
3960
3961/*
3962 * Recomputes the group entity based on the current state of its group
3963 * runqueue.
3964 */
3965static void update_cfs_group(struct sched_entity *se)
3966{
3967 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3968 long shares;
3969
3970 if (!gcfs_rq)
3971 return;
3972
3973 if (throttled_hierarchy(gcfs_rq))
3974 return;
3975
3976#ifndef CONFIG_SMP
3977 shares = READ_ONCE(gcfs_rq->tg->shares);
3978#else
3979 shares = calc_group_shares(gcfs_rq);
3980#endif
3981 if (unlikely(se->load.weight != shares))
3982 reweight_entity(cfs_rq_of(se), se, shares);
3983}
3984
3985#else /* CONFIG_FAIR_GROUP_SCHED */
3986static inline void update_cfs_group(struct sched_entity *se)
3987{
3988}
3989#endif /* CONFIG_FAIR_GROUP_SCHED */
3990
3991static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3992{
3993 struct rq *rq = rq_of(cfs_rq);
3994
3995 if (&rq->cfs == cfs_rq) {
3996 /*
3997 * There are a few boundary cases this might miss but it should
3998 * get called often enough that that should (hopefully) not be
3999 * a real problem.
4000 *
4001 * It will not get called when we go idle, because the idle
4002 * thread is a different class (!fair), nor will the utilization
4003 * number include things like RT tasks.
4004 *
4005 * As is, the util number is not freq-invariant (we'd have to
4006 * implement arch_scale_freq_capacity() for that).
4007 *
4008 * See cpu_util_cfs().
4009 */
4010 cpufreq_update_util(rq, flags);
4011 }
4012}
4013
4014#ifdef CONFIG_SMP
4015static inline bool load_avg_is_decayed(struct sched_avg *sa)
4016{
4017 if (sa->load_sum)
4018 return false;
4019
4020 if (sa->util_sum)
4021 return false;
4022
4023 if (sa->runnable_sum)
4024 return false;
4025
4026 /*
4027 * _avg must be zero when _sum is zero because _avg = _sum / divider
4028 * Make sure that rounding and/or propagation of PELT values never
4029 * break this.
4030 */
4031 SCHED_WARN_ON(sa->load_avg ||
4032 sa->util_avg ||
4033 sa->runnable_avg);
4034
4035 return true;
4036}
4037
4038static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4039{
4040 return u64_u32_load_copy(cfs_rq->avg.last_update_time,
4041 cfs_rq->last_update_time_copy);
4042}
4043#ifdef CONFIG_FAIR_GROUP_SCHED
4044/*
4045 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
4046 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
4047 * bottom-up, we only have to test whether the cfs_rq before us on the list
4048 * is our child.
4049 * If cfs_rq is not on the list, test whether a child needs to be added to
4050 * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
4051 */
4052static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
4053{
4054 struct cfs_rq *prev_cfs_rq;
4055 struct list_head *prev;
4056
4057 if (cfs_rq->on_list) {
4058 prev = cfs_rq->leaf_cfs_rq_list.prev;
4059 } else {
4060 struct rq *rq = rq_of(cfs_rq);
4061
4062 prev = rq->tmp_alone_branch;
4063 }
4064
4065 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
4066
4067 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
4068}
4069
4070static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4071{
4072 if (cfs_rq->load.weight)
4073 return false;
4074
4075 if (!load_avg_is_decayed(&cfs_rq->avg))
4076 return false;
4077
4078 if (child_cfs_rq_on_list(cfs_rq))
4079 return false;
4080
4081 return true;
4082}
4083
4084/**
4085 * update_tg_load_avg - update the tg's load avg
4086 * @cfs_rq: the cfs_rq whose avg changed
4087 *
4088 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
4089 * However, because tg->load_avg is a global value there are performance
4090 * considerations.
4091 *
4092 * In order to avoid having to look at the other cfs_rq's, we use a
4093 * differential update where we store the last value we propagated. This in
4094 * turn allows skipping updates if the differential is 'small'.
4095 *
4096 * Updating tg's load_avg is necessary before update_cfs_share().
4097 */
4098static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
4099{
4100 long delta;
4101 u64 now;
4102
4103 /*
4104 * No need to update load_avg for root_task_group as it is not used.
4105 */
4106 if (cfs_rq->tg == &root_task_group)
4107 return;
4108
4109 /* rq has been offline and doesn't contribute to the share anymore: */
4110 if (!cpu_active(cpu_of(rq_of(cfs_rq))))
4111 return;
4112
4113 /*
4114 * For migration heavy workloads, access to tg->load_avg can be
4115 * unbounded. Limit the update rate to at most once per ms.
4116 */
4117 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4118 if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
4119 return;
4120
4121 delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4122 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
4123 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4124 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4125 cfs_rq->last_update_tg_load_avg = now;
4126 }
4127}
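
/*
 * For example (hypothetical numbers): with tg_load_avg_contrib = 6400 the
 * global tg->load_avg is only touched once the local load_avg has drifted
 * by more than 6400/64 = 100, and even then at most once per millisecond,
 * so migration-heavy workloads don't hammer the shared atomic on every
 * PELT update.
 */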
4128
4129static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
4130{
4131 long delta;
4132 u64 now;
4133
4134 /*
4135 * No need to update load_avg for root_task_group, as it is not used.
4136 */
4137 if (cfs_rq->tg == &root_task_group)
4138 return;
4139
4140 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4141 delta = 0 - cfs_rq->tg_load_avg_contrib;
4142 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4143 cfs_rq->tg_load_avg_contrib = 0;
4144 cfs_rq->last_update_tg_load_avg = now;
4145}
4146
4147/* CPU offline callback: */
4148static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
4149{
4150 struct task_group *tg;
4151
4152 lockdep_assert_rq_held(rq);
4153
4154 /*
4155 * The rq clock has already been updated in
4156 * set_rq_offline(), so we should skip updating
4157 * the rq clock again in unthrottle_cfs_rq().
4158 */
4159 rq_clock_start_loop_update(rq);
4160
4161 rcu_read_lock();
4162 list_for_each_entry_rcu(tg, &task_groups, list) {
4163 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4164
4165 clear_tg_load_avg(cfs_rq);
4166 }
4167 rcu_read_unlock();
4168
4169 rq_clock_stop_loop_update(rq);
4170}
4171
4172/*
4173 * Called within set_task_rq() right before setting a task's CPU. The
4174 * caller only guarantees p->pi_lock is held; no other assumptions,
4175 * including the state of rq->lock, should be made.
4176 */
4177void set_task_rq_fair(struct sched_entity *se,
4178 struct cfs_rq *prev, struct cfs_rq *next)
4179{
4180 u64 p_last_update_time;
4181 u64 n_last_update_time;
4182
4183 if (!sched_feat(ATTACH_AGE_LOAD))
4184 return;
4185
4186 /*
4187 * We are supposed to update the task to "current" time, so that it is up to
4188 * date and ready to go to the new CPU/cfs_rq. But we have difficulty
4189 * getting what the current time is, so simply throw away the out-of-date
4190 * time. This will result in the wakee task being less decayed, but giving
4191 * the wakee more load does not sound like a bad thing.
4192 */
4193 if (!(se->avg.last_update_time && prev))
4194 return;
4195
4196 p_last_update_time = cfs_rq_last_update_time(prev);
4197 n_last_update_time = cfs_rq_last_update_time(next);
4198
4199 __update_load_avg_blocked_se(p_last_update_time, se);
4200 se->avg.last_update_time = n_last_update_time;
4201}
4202
4203/*
4204 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
4205 * propagate its contribution. The key to this propagation is the invariant
4206 * that for each group:
4207 *
4208 * ge->avg == grq->avg (1)
4209 *
4210 * _IFF_ we look at the pure running and runnable sums. Because they
4211 * represent the very same entity, just at different points in the hierarchy.
4212 *
4213 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
4214 * and simply copies the running/runnable sum over (but still wrong, because
4215 * the group entity and group rq do not have their PELT windows aligned).
4216 *
4217 * However, update_tg_cfs_load() is more complex. So we have:
4218 *
4219 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
4220 *
4221 * And since, like util, the runnable part should be directly transferable,
4222 * the following would _appear_ to be the straightforward approach:
4223 *
4224 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
4225 *
4226 * And per (1) we have:
4227 *
4228 * ge->avg.runnable_avg == grq->avg.runnable_avg
4229 *
4230 * Which gives:
4231 *
4232 * ge->load.weight * grq->avg.load_avg
4233 * ge->avg.load_avg = ----------------------------------- (4)
4234 * grq->load.weight
4235 *
4236 * Except that is wrong!
4237 *
4238 * Because while for entities historical weight is not important and we
4239 * really only care about our future and therefore can consider a pure
4240 * runnable sum, runqueues can NOT do this.
4241 *
4242 * We specifically want runqueues to have a load_avg that includes
4243 * historical weights. Those represent the blocked load, the load we expect
4244 * to (shortly) return to us. This only works by keeping the weights as
4245 * integral part of the sum. We therefore cannot decompose as per (3).
4246 *
4247 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
4248 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
4249 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
4250 * runnable section of these tasks overlap (or not). If they were to perfectly
4251 * align the rq as a whole would be runnable 2/3 of the time. If however we
4252 * always have at least 1 runnable task, the rq as a whole is always runnable.
4253 *
4254 * So we'll have to approximate.. :/
4255 *
4256 * Given the constraint:
4257 *
4258 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
4259 *
4260 * We can construct a rule that adds runnable to a rq by assuming minimal
4261 * overlap.
4262 *
4263 * On removal, we'll assume each task is equally runnable; which yields:
4264 *
4265 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
4266 *
4267 * XXX: only do this for the part of runnable > running ?
4268 *
4269 */
4270static inline void
4271update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4272{
4273 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
4274 u32 new_sum, divider;
4275
4276 /* Nothing to update */
4277 if (!delta_avg)
4278 return;
4279
4280 /*
4281 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4282 * See ___update_load_avg() for details.
4283 */
4284 divider = get_pelt_divider(&cfs_rq->avg);
4285
4286
4287 /* Set new sched_entity's utilization */
4288 se->avg.util_avg = gcfs_rq->avg.util_avg;
4289 new_sum = se->avg.util_avg * divider;
4290 delta_sum = (long)new_sum - (long)se->avg.util_sum;
4291 se->avg.util_sum = new_sum;
4292
4293 /* Update parent cfs_rq utilization */
4294 add_positive(&cfs_rq->avg.util_avg, delta_avg);
4295 add_positive(&cfs_rq->avg.util_sum, delta_sum);
4296
4297 /* See update_cfs_rq_load_avg() */
4298 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4299 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4300}
4301
4302static inline void
4303update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4304{
4305 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
4306 u32 new_sum, divider;
4307
4308 /* Nothing to update */
4309 if (!delta_avg)
4310 return;
4311
4312 /*
4313 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4314 * See ___update_load_avg() for details.
4315 */
4316 divider = get_pelt_divider(&cfs_rq->avg);
4317
4318 /* Set new sched_entity's runnable */
4319 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
4320 new_sum = se->avg.runnable_avg * divider;
4321 delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
4322 se->avg.runnable_sum = new_sum;
4323
4324 /* Update parent cfs_rq runnable */
4325 add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
4326 add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
4327 /* See update_cfs_rq_load_avg() */
4328 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4329 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4330}
4331
4332static inline void
4333update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4334{
4335 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
4336 unsigned long load_avg;
4337 u64 load_sum = 0;
4338 s64 delta_sum;
4339 u32 divider;
4340
4341 if (!runnable_sum)
4342 return;
4343
4344 gcfs_rq->prop_runnable_sum = 0;
4345
4346 /*
4347 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4348 * See ___update_load_avg() for details.
4349 */
4350 divider = get_pelt_divider(&cfs_rq->avg);
4351
4352 if (runnable_sum >= 0) {
4353 /*
4354 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
4355 * the CPU is saturated running == runnable.
4356 */
4357 runnable_sum += se->avg.load_sum;
4358 runnable_sum = min_t(long, runnable_sum, divider);
4359 } else {
4360 /*
4361 * Estimate the new unweighted runnable_sum of the gcfs_rq by
4362 * assuming all tasks are equally runnable.
4363 */
4364 if (scale_load_down(gcfs_rq->load.weight)) {
4365 load_sum = div_u64(gcfs_rq->avg.load_sum,
4366 scale_load_down(gcfs_rq->load.weight));
4367 }
4368
4369 /* But make sure to not inflate se's runnable */
4370 runnable_sum = min(se->avg.load_sum, load_sum);
4371 }
4372
4373 /*
4374 * runnable_sum can't be lower than running_sum
4375 * Rescale running sum to be in the same range as runnable sum
4376 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
4377 * runnable_sum is in [0 : LOAD_AVG_MAX]
4378 */
4379 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
4380 runnable_sum = max(runnable_sum, running_sum);
4381
4382 load_sum = se_weight(se) * runnable_sum;
4383 load_avg = div_u64(load_sum, divider);
4384
4385 delta_avg = load_avg - se->avg.load_avg;
4386 if (!delta_avg)
4387 return;
4388
4389 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
4390
4391 se->avg.load_sum = runnable_sum;
4392 se->avg.load_avg = load_avg;
4393 add_positive(&cfs_rq->avg.load_avg, delta_avg);
4394 add_positive(&cfs_rq->avg.load_sum, delta_sum);
4395 /* See update_cfs_rq_load_avg() */
4396 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
4397 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
4398}
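
/*
 * Removal-side illustration with made-up numbers: if the gcfs_rq holds two
 * nice-0 tasks (scale_load_down(load.weight) = 2048) and has accumulated a
 * weighted load_sum of 40960000, the "equally runnable" estimate above is
 *
 *   load_sum / weight = 40960000 / 2048 = 20000
 *
 * and runnable_sum is further clamped to se->avg.load_sum, so the group
 * entity never ends up looking more runnable than it did before.
 */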
4399
4400static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
4401{
4402 cfs_rq->propagate = 1;
4403 cfs_rq->prop_runnable_sum += runnable_sum;
4404}
4405
4406/* Update task and its cfs_rq load average */
4407static inline int propagate_entity_load_avg(struct sched_entity *se)
4408{
4409 struct cfs_rq *cfs_rq, *gcfs_rq;
4410
4411 if (entity_is_task(se))
4412 return 0;
4413
4414 gcfs_rq = group_cfs_rq(se);
4415 if (!gcfs_rq->propagate)
4416 return 0;
4417
4418 gcfs_rq->propagate = 0;
4419
4420 cfs_rq = cfs_rq_of(se);
4421
4422 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
4423
4424 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
4425 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
4426 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
4427
4428 trace_pelt_cfs_tp(cfs_rq);
4429 trace_pelt_se_tp(se);
4430
4431 return 1;
4432}
4433
4434/*
4435 * Check if we need to update the load and the utilization of a blocked
4436 * group_entity:
4437 */
4438static inline bool skip_blocked_update(struct sched_entity *se)
4439{
4440 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4441
4442 /*
4443 * If the sched_entity still has non-zero load or utilization, we have to
4444 * decay it:
4445 */
4446 if (se->avg.load_avg || se->avg.util_avg)
4447 return false;
4448
4449 /*
4450 * If there is a pending propagation, we have to update the load and
4451 * the utilization of the sched_entity:
4452 */
4453 if (gcfs_rq->propagate)
4454 return false;
4455
4456 /*
4457 * Otherwise, the load and the utilization of the sched_entity are
4458 * already zero and there is no pending propagation, so it will be a
4459 * waste of time to try to decay it:
4460 */
4461 return true;
4462}
4463
4464#else /* CONFIG_FAIR_GROUP_SCHED */
4465
4466static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
4467
4468static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
4469
4470static inline int propagate_entity_load_avg(struct sched_entity *se)
4471{
4472 return 0;
4473}
4474
4475static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
4476
4477#endif /* CONFIG_FAIR_GROUP_SCHED */
4478
4479#ifdef CONFIG_NO_HZ_COMMON
4480static inline void migrate_se_pelt_lag(struct sched_entity *se)
4481{
4482 u64 throttled = 0, now, lut;
4483 struct cfs_rq *cfs_rq;
4484 struct rq *rq;
4485 bool is_idle;
4486
4487 if (load_avg_is_decayed(&se->avg))
4488 return;
4489
4490 cfs_rq = cfs_rq_of(se);
4491 rq = rq_of(cfs_rq);
4492
4493 rcu_read_lock();
4494 is_idle = is_idle_task(rcu_dereference(rq->curr));
4495 rcu_read_unlock();
4496
4497 /*
4498 * The lag estimation comes with a cost we don't want to pay all the
4499 * time. Hence, limiting to the case where the source CPU is idle and
4500 * we know we are at the greatest risk of having an outdated clock.
4501 */
4502 if (!is_idle)
4503 return;
4504
4505 /*
4506 * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
4507 *
4508 * last_update_time (the cfs_rq's last_update_time)
4509 * = cfs_rq_clock_pelt()@cfs_rq_idle
4510 * = rq_clock_pelt()@cfs_rq_idle
4511 * - cfs->throttled_clock_pelt_time@cfs_rq_idle
4512 *
4513 * cfs_idle_lag (delta between rq's update and cfs_rq's update)
4514 * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
4515 *
4516 * rq_idle_lag (delta between now and rq's update)
4517 * = sched_clock_cpu() - rq_clock()@rq_idle
4518 *
4519 * We can then write:
4520 *
4521 * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
4522 * sched_clock_cpu() - rq_clock()@rq_idle
4523 * Where:
4524 * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
4525 * rq_clock()@rq_idle is rq->clock_idle
4526 * cfs->throttled_clock_pelt_time@cfs_rq_idle
4527 * is cfs_rq->throttled_pelt_idle
4528 */
4529
4530#ifdef CONFIG_CFS_BANDWIDTH
4531 throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
4532 /* The clock has been stopped for throttling */
4533 if (throttled == U64_MAX)
4534 return;
4535#endif
4536 now = u64_u32_load(rq->clock_pelt_idle);
4537 /*
4538 * Paired with _update_idle_rq_clock_pelt(). It ensures that, in the worst
4539 * case, we observe the old clock_pelt_idle value together with the new
4540 * clock_idle, which leads to an underestimation. The opposite ordering
4541 * would lead to an overestimation.
4542 */
4543 smp_rmb();
4544 lut = cfs_rq_last_update_time(cfs_rq);
4545
4546 now -= throttled;
4547 if (now < lut)
4548 /*
4549 * cfs_rq->avg.last_update_time is more recent than our
4550 * estimation, let's use it.
4551 */
4552 now = lut;
4553 else
4554 now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
4555
4556 __update_load_avg_blocked_se(now, se);
4557}
4558#else
4559static void migrate_se_pelt_lag(struct sched_entity *se) {}
4560#endif
4561
4562/**
4563 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4564 * @now: current time, as per cfs_rq_clock_pelt()
4565 * @cfs_rq: cfs_rq to update
4566 *
4567 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4568 * avg. The immediate corollary is that all (fair) tasks must be attached.
4569 *
4570 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4571 *
4572 * Return: true if the load decayed or we removed load.
4573 *
4574 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4575 * call update_tg_load_avg() when this function returns true.
4576 */
4577static inline int
4578update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4579{
4580 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
4581 struct sched_avg *sa = &cfs_rq->avg;
4582 int decayed = 0;
4583
4584 if (cfs_rq->removed.nr) {
4585 unsigned long r;
4586 u32 divider = get_pelt_divider(&cfs_rq->avg);
4587
4588 raw_spin_lock(&cfs_rq->removed.lock);
4589 swap(cfs_rq->removed.util_avg, removed_util);
4590 swap(cfs_rq->removed.load_avg, removed_load);
4591 swap(cfs_rq->removed.runnable_avg, removed_runnable);
4592 cfs_rq->removed.nr = 0;
4593 raw_spin_unlock(&cfs_rq->removed.lock);
4594
4595 r = removed_load;
4596 sub_positive(&sa->load_avg, r);
4597 sub_positive(&sa->load_sum, r * divider);
4598 /* See sa->util_sum below */
4599 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
4600
4601 r = removed_util;
4602 sub_positive(&sa->util_avg, r);
4603 sub_positive(&sa->util_sum, r * divider);
4604 /*
4605 * Because of rounding, se->util_sum might end up being +1 more than
4606 * cfs->util_sum. Although this is not a problem by itself, detaching
4607 * a lot of tasks with this rounding problem between 2 updates of
4608 * util_avg (~1ms) can make cfs->util_sum become zero while
4609 * cfs->util_avg is not.
4610 * Check that util_sum is still above its lower bound for the new
4611 * util_avg. Given that period_contrib might have moved since the last
4612 * sync, we are only sure that util_sum must be above or equal to
4613 * util_avg * minimum possible divider
4614 */
4615 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
4616
4617 r = removed_runnable;
4618 sub_positive(&sa->runnable_avg, r);
4619 sub_positive(&sa->runnable_sum, r * divider);
4620 /* See sa->util_sum above */
4621 sa->runnable_sum = max_t(u32, sa->runnable_sum,
4622 sa->runnable_avg * PELT_MIN_DIVIDER);
4623
4624 /*
4625 * removed_runnable is the unweighted version of removed_load so we
4626 * can use it to estimate removed_load_sum.
4627 */
4628 add_tg_cfs_propagate(cfs_rq,
4629 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
4630
4631 decayed = 1;
4632 }
4633
4634 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
4635 u64_u32_store_copy(sa->last_update_time,
4636 cfs_rq->last_update_time_copy,
4637 sa->last_update_time);
4638 return decayed;
4639}
4640
4641/**
4642 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4643 * @cfs_rq: cfs_rq to attach to
4644 * @se: sched_entity to attach
4645 *
4646 * Must call update_cfs_rq_load_avg() before this, since we rely on
4647 * cfs_rq->avg.last_update_time being current.
4648 */
4649static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4650{
4651 /*
4652 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4653 * See ___update_load_avg() for details.
4654 */
4655 u32 divider = get_pelt_divider(&cfs_rq->avg);
4656
4657 /*
4658 * When we attach the @se to the @cfs_rq, we must align the decay
4659 * window because without that, really weird and wonderful things can
4660 * happen.
4661 *
4662 * XXX illustrate
4663 */
4664 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4665 se->avg.period_contrib = cfs_rq->avg.period_contrib;
4666
4667 /*
4668 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
4669 * period_contrib. This isn't strictly correct, but since we're
4670 * entirely outside of the PELT hierarchy, nobody cares if we truncate
4671 * _sum a little.
4672 */
4673 se->avg.util_sum = se->avg.util_avg * divider;
4674
4675 se->avg.runnable_sum = se->avg.runnable_avg * divider;
4676
4677 se->avg.load_sum = se->avg.load_avg * divider;
4678 if (se_weight(se) < se->avg.load_sum)
4679 se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
4680 else
4681 se->avg.load_sum = 1;
4682
4683 enqueue_load_avg(cfs_rq, se);
4684 cfs_rq->avg.util_avg += se->avg.util_avg;
4685 cfs_rq->avg.util_sum += se->avg.util_sum;
4686 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
4687 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
4688
4689 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
4690
4691 cfs_rq_util_change(cfs_rq, 0);
4692
4693 trace_pelt_cfs_tp(cfs_rq);
4694}
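
/*
 * Example of the _sum fixup above, with illustrative numbers only: for
 * util_avg = 100 and a divider of 47742, util_sum becomes 4774200. For
 * load, the same product 4774200 exceeds se_weight(se) = 1024, so it is
 * scaled back to 4774200 / 1024 ~= 4662, keeping se->avg.load_sum an
 * unweighted quantity as expected by the rest of PELT.
 */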
4695
4696/**
4697 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4698 * @cfs_rq: cfs_rq to detach from
4699 * @se: sched_entity to detach
4700 *
4701 * Must call update_cfs_rq_load_avg() before this, since we rely on
4702 * cfs_rq->avg.last_update_time being current.
4703 */
4704static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4705{
4706 dequeue_load_avg(cfs_rq, se);
4707 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4708 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4709 /* See update_cfs_rq_load_avg() */
4710 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4711 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4712
4713 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
4714 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
4715 /* See update_cfs_rq_load_avg() */
4716 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4717 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4718
4719 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
4720
4721 cfs_rq_util_change(cfs_rq, 0);
4722
4723 trace_pelt_cfs_tp(cfs_rq);
4724}
4725
4726/*
4727 * Optional action to be done while updating the load average
4728 */
4729#define UPDATE_TG 0x1
4730#define SKIP_AGE_LOAD 0x2
4731#define DO_ATTACH 0x4
4732#define DO_DETACH 0x8
4733
4734/* Update task and its cfs_rq load average */
4735static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4736{
4737 u64 now = cfs_rq_clock_pelt(cfs_rq);
4738 int decayed;
4739
4740 /*
4741 * Track task load average for carrying it to new CPU after migrated, and
4742 * track group sched_entity load average for task_h_load calc in migration
4743 */
4744 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
4745 __update_load_avg_se(now, cfs_rq, se);
4746
4747 decayed = update_cfs_rq_load_avg(now, cfs_rq);
4748 decayed |= propagate_entity_load_avg(se);
4749
4750 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
4751
4752 /*
4753 * DO_ATTACH means we're here from enqueue_entity().
4754 * !last_update_time means we've passed through
4755 * migrate_task_rq_fair() indicating we migrated.
4756 *
4757 * IOW we're enqueueing a task on a new CPU.
4758 */
4759 attach_entity_load_avg(cfs_rq, se);
4760 update_tg_load_avg(cfs_rq);
4761
4762 } else if (flags & DO_DETACH) {
4763 /*
4764 * DO_DETACH means we're here from dequeue_entity()
4765 * and we are migrating task out of the CPU.
4766 */
4767 detach_entity_load_avg(cfs_rq, se);
4768 update_tg_load_avg(cfs_rq);
4769 } else if (decayed) {
4770 cfs_rq_util_change(cfs_rq, 0);
4771
4772 if (flags & UPDATE_TG)
4773 update_tg_load_avg(cfs_rq);
4774 }
4775}
4776
4777/*
4778 * Synchronize entity load avg of dequeued entity without locking
4779 * the previous rq.
4780 */
4781static void sync_entity_load_avg(struct sched_entity *se)
4782{
4783 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4784 u64 last_update_time;
4785
4786 last_update_time = cfs_rq_last_update_time(cfs_rq);
4787 __update_load_avg_blocked_se(last_update_time, se);
4788}
4789
4790/*
4791 * Task first catches up with cfs_rq, and then subtract
4792 * itself from the cfs_rq (task must be off the queue now).
4793 */
4794static void remove_entity_load_avg(struct sched_entity *se)
4795{
4796 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4797 unsigned long flags;
4798
4799 /*
4800 * tasks cannot exit without having gone through wake_up_new_task() ->
4801 * enqueue_task_fair() which will have added things to the cfs_rq,
4802 * so we can remove unconditionally.
4803 */
4804
4805 sync_entity_load_avg(se);
4806
4807 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4808 ++cfs_rq->removed.nr;
4809 cfs_rq->removed.util_avg += se->avg.util_avg;
4810 cfs_rq->removed.load_avg += se->avg.load_avg;
4811 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
4812 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
4813}
4814
4815static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4816{
4817 return cfs_rq->avg.runnable_avg;
4818}
4819
4820static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4821{
4822 return cfs_rq->avg.load_avg;
4823}
4824
4825static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
4826
4827static inline unsigned long task_util(struct task_struct *p)
4828{
4829 return READ_ONCE(p->se.avg.util_avg);
4830}
4831
4832static inline unsigned long task_runnable(struct task_struct *p)
4833{
4834 return READ_ONCE(p->se.avg.runnable_avg);
4835}
4836
4837static inline unsigned long _task_util_est(struct task_struct *p)
4838{
4839 return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
4840}
4841
4842static inline unsigned long task_util_est(struct task_struct *p)
4843{
4844 return max(task_util(p), _task_util_est(p));
4845}
4846
4847static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4848 struct task_struct *p)
4849{
4850 unsigned int enqueued;
4851
4852 if (!sched_feat(UTIL_EST))
4853 return;
4854
4855 /* Update root cfs_rq's estimated utilization */
4856 enqueued = cfs_rq->avg.util_est;
4857 enqueued += _task_util_est(p);
4858 WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4859
4860 trace_sched_util_est_cfs_tp(cfs_rq);
4861}
4862
4863static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4864 struct task_struct *p)
4865{
4866 unsigned int enqueued;
4867
4868 if (!sched_feat(UTIL_EST))
4869 return;
4870
4871 /* Update root cfs_rq's estimated utilization */
4872 enqueued = cfs_rq->avg.util_est;
4873 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4874 WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4875
4876 trace_sched_util_est_cfs_tp(cfs_rq);
4877}
4878
4879#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4880
4881static inline void util_est_update(struct cfs_rq *cfs_rq,
4882 struct task_struct *p,
4883 bool task_sleep)
4884{
4885 unsigned int ewma, dequeued, last_ewma_diff;
4886
4887 if (!sched_feat(UTIL_EST))
4888 return;
4889
4890 /*
4891 * Skip update of task's estimated utilization when the task has not
4892 * yet completed an activation, e.g. being migrated.
4893 */
4894 if (!task_sleep)
4895 return;
4896
4897 /* Get current estimate of utilization */
4898 ewma = READ_ONCE(p->se.avg.util_est);
4899
4900 /*
4901 * If the PELT values haven't changed since enqueue time,
4902 * skip the util_est update.
4903 */
4904 if (ewma & UTIL_AVG_UNCHANGED)
4905 return;
4906
4907 /* Get utilization at dequeue */
4908 dequeued = task_util(p);
4909
4910 /*
4911 * Reset EWMA on utilization increases, the moving average is used only
4912 * to smooth utilization decreases.
4913 */
4914 if (ewma <= dequeued) {
4915 ewma = dequeued;
4916 goto done;
4917 }
4918
4919 /*
4920 * Skip the update of the task's estimated utilization when it is
4921 * already within ~1% of its last activation value.
4922 */
4923 last_ewma_diff = ewma - dequeued;
4924 if (last_ewma_diff < UTIL_EST_MARGIN)
4925 goto done;
4926
4927 /*
4928 * To avoid overestimation of actual task utilization, skip updates if
4929 * we cannot guarantee there is idle time on this CPU.
4930 */
4931 if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
4932 return;
4933
4934 /*
4935 * To avoid underestimating task utilization, skip updates of the EWMA if
4936 * we cannot guarantee that the thread got all the CPU time it wanted.
4937 */
4938 if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
4939 goto done;
4940
4941
4942 /*
4943 * Update Task's estimated utilization
4944 *
4945 * When *p completes an activation we can consolidate another sample
4946 * of the task size. This is done by using this value to update the
4947 * Exponential Weighted Moving Average (EWMA):
4948 *
4949 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4950 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4951 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4952 * = w * ( -last_ewma_diff ) + ewma(t-1)
4953 * = w * (-last_ewma_diff + ewma(t-1) / w)
4954 *
4955 * Where 'w' is the weight of new samples, which is configured to be
4956 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4957 */
4958 ewma <<= UTIL_EST_WEIGHT_SHIFT;
4959 ewma -= last_ewma_diff;
4960 ewma >>= UTIL_EST_WEIGHT_SHIFT;
4961done:
4962 ewma |= UTIL_AVG_UNCHANGED;
4963 WRITE_ONCE(p->se.avg.util_est, ewma);
4964
4965 trace_sched_util_est_se_tp(&p->se);
4966}
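
/*
 * Numerical example of the EWMA above (made-up values): with a stored
 * ewma of 400 and a dequeue-time utilization of 300, last_ewma_diff = 100
 * and, with w = 1/4:
 *
 *   ewma' = ((400 << 2) - 100) >> 2 = (1600 - 100) / 4 = 375
 *
 * i.e. the estimate decays a quarter of the way towards the lower sample,
 * while any increase (dequeued >= ewma) is adopted immediately by the
 * reset above.
 */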
4967
4968static inline int util_fits_cpu(unsigned long util,
4969 unsigned long uclamp_min,
4970 unsigned long uclamp_max,
4971 int cpu)
4972{
4973 unsigned long capacity_orig, capacity_orig_thermal;
4974 unsigned long capacity = capacity_of(cpu);
4975 bool fits, uclamp_max_fits;
4976
4977 /*
4978 * Check if the real util fits without any uclamp boost/cap applied.
4979 */
4980 fits = fits_capacity(util, capacity);
4981
4982 if (!uclamp_is_used())
4983 return fits;
4984
4985 /*
4986 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
4987 * uclamp_max. We only care about capacity pressure (by using
4988 * capacity_of()) for comparing against the real util.
4989 *
4990 * If a task is boosted to 1024 for example, we don't want a tiny
4991 * pressure to skew the check whether it fits a CPU or not.
4992 *
4993 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
4994 * should fit a little cpu even if there's some pressure.
4995 *
4996 * Only exception is for thermal pressure since it has a direct impact
4997 * on available OPP of the system.
4998 *
4999 * We honour it for uclamp_min only as a drop in performance level
5000 * could result in not getting the requested minimum performance level.
5001 *
5002 * For uclamp_max, we can tolerate a drop in performance level as the
5003 * goal is to cap the task. So it's okay if it's getting less.
5004 */
5005 capacity_orig = arch_scale_cpu_capacity(cpu);
5006 capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
5007
5008 /*
5009 * We want to force a task to fit a cpu as implied by uclamp_max.
5010 * But we do have some corner cases to cater for..
5011 *
5012 *
5013 * C=z
5014 * | ___
5015 * | C=y | |
5016 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5017 * | C=x | | | |
5018 * | ___ | | | |
5019 * | | | | | | | (util somewhere in this region)
5020 * | | | | | | |
5021 * | | | | | | |
5022 * +----------------------------------------
5023 * cpu0 cpu1 cpu2
5024 *
5025 * In the above example if a task is capped to a specific performance
5026 * point, y, then when:
5027 *
5028 * * util = 80% of x then it does not fit on cpu0 and should migrate
5029 * to cpu1
5030 * * util = 80% of y then it is forced to fit on cpu1 to honour
5031 * uclamp_max request.
5032 *
5033 * which is what we're enforcing here. A task always fits if
5034 * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
5035 * the normal upmigration rules still apply.
5036 *
5037 * Only exception is when we are on max capacity, then we need to be
5038 * careful not to block overutilized state. This is so because:
5039 *
5040 * 1. There's no concept of capping at max_capacity! We can't go
5041 * beyond this performance level anyway.
5042 * 2. The system is being saturated when we're operating near
5043 * max capacity, it doesn't make sense to block overutilized.
5044 */
5045 uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
5046 uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
5047 fits = fits || uclamp_max_fits;
5048
5049 /*
5050 *
5051 * C=z
5052 * | ___ (region a, capped, util >= uclamp_max)
5053 * | C=y | |
5054 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5055 * | C=x | | | |
5056 * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
5057 * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
5058 * | | | | | | |
5059 * | | | | | | | (region c, boosted, util < uclamp_min)
5060 * +----------------------------------------
5061 * cpu0 cpu1 cpu2
5062 *
5063 * a) If util > uclamp_max, then we're capped, we don't care about
5064 * actual fitness value here. We only care if uclamp_max fits
5065 * capacity without taking margin/pressure into account.
5066 * See comment above.
5067 *
5068 * b) If uclamp_min <= util <= uclamp_max, then the normal
5069 * fits_capacity() rules apply. Except we need to ensure that we
5070 * enforce we remain within uclamp_max, see comment above.
5071 *
5072 * c) If util < uclamp_min, then we are boosted. Same as (b) but we
5073 * need to take into account the boosted value fits the CPU without
5074 * taking margin/pressure into account.
5075 *
5076 * Cases (a) and (b) are handled in the 'fits' variable already. We
5077 * just need to consider an extra check for case (c) after ensuring we
5078 * handle the case uclamp_min > uclamp_max.
5079 */
5080 uclamp_min = min(uclamp_min, uclamp_max);
5081 if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
5082 return -1;
5083
5084 return fits;
5085}
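
/*
 * Example with hypothetical capacities: a task with util = 300 and
 * uclamp_max = 400 on a CPU with capacity_orig = 512. The raw
 * fits_capacity() check can fail once capacity_of() drops under pressure,
 * but since uclamp_max (400) <= capacity_orig (512) the uclamp_max_fits
 * term still reports a fit: the cap says we never want more than 400
 * worth of performance, which this CPU can deliver.
 */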
5086
5087static inline int task_fits_cpu(struct task_struct *p, int cpu)
5088{
5089 unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
5090 unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
5091 unsigned long util = task_util_est(p);
5092 /*
5093 * Return true only if the cpu fully fits the task requirements, which
5094 * include the utilization but also the performance hints.
5095 */
5096 return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
5097}
5098
5099static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
5100{
5101 if (!sched_asym_cpucap_active())
5102 return;
5103
5104 if (!p || p->nr_cpus_allowed == 1) {
5105 rq->misfit_task_load = 0;
5106 return;
5107 }
5108
5109 if (task_fits_cpu(p, cpu_of(rq))) {
5110 rq->misfit_task_load = 0;
5111 return;
5112 }
5113
5114 /*
5115 * Make sure that misfit_task_load will not be zero even if
5116 * task_h_load() returns 0.
5117 */
5118 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
5119}
5120
5121#else /* CONFIG_SMP */
5122
5123static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
5124{
5125 return !cfs_rq->nr_running;
5126}
5127
5128#define UPDATE_TG 0x0
5129#define SKIP_AGE_LOAD 0x0
5130#define DO_ATTACH 0x0
5131#define DO_DETACH 0x0
5132
5133static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
5134{
5135 cfs_rq_util_change(cfs_rq, 0);
5136}
5137
5138static inline void remove_entity_load_avg(struct sched_entity *se) {}
5139
5140static inline void
5141attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
5142static inline void
5143detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
5144
5145static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
5146{
5147 return 0;
5148}
5149
5150static inline void
5151util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
5152
5153static inline void
5154util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
5155
5156static inline void
5157util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
5158 bool task_sleep) {}
5159static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
5160
5161#endif /* CONFIG_SMP */
5162
5163static void
5164place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5165{
5166 u64 vslice, vruntime = avg_vruntime(cfs_rq);
5167 s64 lag = 0;
5168
5169 se->slice = sysctl_sched_base_slice;
5170 vslice = calc_delta_fair(se->slice, se);
5171
5172 /*
5173 * Due to how V is constructed as the weighted average of entities,
5174 * adding tasks with positive lag, or removing tasks with negative lag
5175 * will move 'time' backwards, this can screw around with the lag of
5176 * other tasks.
5177 *
5178 * EEVDF: placement strategy #1 / #2
5179 */
5180 if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
5181 struct sched_entity *curr = cfs_rq->curr;
5182 unsigned long load;
5183
5184 lag = se->vlag;
5185
5186 /*
5187 * If we want to place a task and preserve lag, we have to
5188 * consider the effect of the new entity on the weighted
5189 * average and compensate for this, otherwise lag can quickly
5190 * evaporate.
5191 *
5192 * Lag is defined as:
5193 *
5194 * lag_i = S - s_i = w_i * (V - v_i)
5195 *
5196 * To avoid the 'w_i' term all over the place, we only track
5197 * the virtual lag:
5198 *
5199 * vl_i = V - v_i <=> v_i = V - vl_i
5200 *
5201 * And we take V to be the weighted average of all v:
5202 *
5203 * V = (\Sum w_j*v_j) / W
5204 *
5205 * Where W is: \Sum w_j
5206 *
5207 * Then, the weighted average after adding an entity with lag
5208 * vl_i is given by:
5209 *
5210 * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
5211 * = (W*V + w_i*(V - vl_i)) / (W + w_i)
5212 * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
5213 * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
5214 * = V - w_i*vl_i / (W + w_i)
5215 *
5216 * And the actual lag after adding an entity with vl_i is:
5217 *
5218 * vl'_i = V' - v_i
5219 * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
5220 * = vl_i - w_i*vl_i / (W + w_i)
5221 *
5222 * Which is strictly less than vl_i. So in order to preserve lag
5223 * we should inflate the lag before placement such that the
5224 * effective lag after placement comes out right.
5225 *
5226 * As such, invert the above relation for vl'_i to get the vl_i
5227 * we need to use such that the lag after placement is the lag
5228 * we computed before dequeue.
5229 *
5230 * vl'_i = vl_i - w_i*vl_i / (W + w_i)
5231 * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
5232 *
5233 * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
5234 * = W*vl_i
5235 *
5236 * vl_i = (W + w_i)*vl'_i / W
5237 */
5238 load = cfs_rq->avg_load;
5239 if (curr && curr->on_rq)
5240 load += scale_load_down(curr->load.weight);
5241
5242 lag *= load + scale_load_down(se->load.weight);
5243 if (WARN_ON_ONCE(!load))
5244 load = 1;
5245 lag = div_s64(lag, load);
5246 }
5247
5248 se->vruntime = vruntime - lag;
5249
5250 /*
5251 * When joining the competition, the existing tasks will be,
5252 * on average, halfway through their slice, as such start tasks
5253 * off with half a slice to ease into the competition.
5254 */
5255 if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
5256 vslice /= 2;
5257
5258 /*
5259 * EEVDF: vd_i = ve_i + r_i/w_i
5260 */
5261 se->deadline = se->vruntime + vslice;
5262}
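
/*
 * Worked example of the lag scaling above (made-up numbers): the rq already
 * holds W = 2048 worth of weight and we enqueue a w_i = 1024 entity whose
 * stored (post-dequeue) lag vl'_i is 6:
 *
 *   vl_i = (W + w_i) * vl'_i / W = 3072 * 6 / 2048 = 9
 *
 * After placement the weighted average moves by w_i*vl_i/(W + w_i) =
 * 1024 * 9 / 3072 = 3, leaving an effective lag of 9 - 3 = 6, exactly the
 * lag the entity left with.
 */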
5263
5264static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
5265static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
5266
5267static inline bool cfs_bandwidth_used(void);
5268
5269static void
5270enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5271{
5272 bool curr = cfs_rq->curr == se;
5273
5274 /*
5275 * If we're the current task, we must renormalise before calling
5276 * update_curr().
5277 */
5278 if (curr)
5279 place_entity(cfs_rq, se, flags);
5280
5281 update_curr(cfs_rq);
5282
5283 /*
5284 * When enqueuing a sched_entity, we must:
5285 * - Update loads to have both entity and cfs_rq synced with now.
5286 * - For group_entity, update its runnable_weight to reflect the new
5287 * h_nr_running of its group cfs_rq.
5288 * - For group_entity, update its weight to reflect the new share of
5289 * its group cfs_rq
5290 * - Add its new weight to cfs_rq->load.weight
5291 */
5292 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
5293 se_update_runnable(se);
5294 /*
5295 * XXX update_load_avg() above will have attached us to the pelt sum;
5296 * but update_cfs_group() here will re-adjust the weight and have to
5297 * undo/redo all that. Seems wasteful.
5298 */
5299 update_cfs_group(se);
5300
5301 /*
5302 * XXX now that the entity has been re-weighted, and its lag adjusted,
5303 * we can place the entity.
5304 */
5305 if (!curr)
5306 place_entity(cfs_rq, se, flags);
5307
5308 account_entity_enqueue(cfs_rq, se);
5309
5310 /* Entity has migrated, no longer consider this task hot */
5311 if (flags & ENQUEUE_MIGRATED)
5312 se->exec_start = 0;
5313
5314 check_schedstat_required();
5315 update_stats_enqueue_fair(cfs_rq, se, flags);
5316 if (!curr)
5317 __enqueue_entity(cfs_rq, se);
5318 se->on_rq = 1;
5319
5320 if (cfs_rq->nr_running == 1) {
5321 check_enqueue_throttle(cfs_rq);
5322 if (!throttled_hierarchy(cfs_rq)) {
5323 list_add_leaf_cfs_rq(cfs_rq);
5324 } else {
5325#ifdef CONFIG_CFS_BANDWIDTH
5326 struct rq *rq = rq_of(cfs_rq);
5327
5328 if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
5329 cfs_rq->throttled_clock = rq_clock(rq);
5330 if (!cfs_rq->throttled_clock_self)
5331 cfs_rq->throttled_clock_self = rq_clock(rq);
5332#endif
5333 }
5334 }
5335}
5336
5337static void __clear_buddies_next(struct sched_entity *se)
5338{
5339 for_each_sched_entity(se) {
5340 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5341 if (cfs_rq->next != se)
5342 break;
5343
5344 cfs_rq->next = NULL;
5345 }
5346}
5347
5348static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
5349{
5350 if (cfs_rq->next == se)
5351 __clear_buddies_next(se);
5352}
5353
5354static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5355
5356static void
5357dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5358{
5359 int action = UPDATE_TG;
5360
5361 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
5362 action |= DO_DETACH;
5363
5364 /*
5365 * Update run-time statistics of the 'current'.
5366 */
5367 update_curr(cfs_rq);
5368
5369 /*
5370 * When dequeuing a sched_entity, we must:
5371 * - Update loads to have both entity and cfs_rq synced with now.
5372 * - For group_entity, update its runnable_weight to reflect the new
5373 * h_nr_running of its group cfs_rq.
5374 * - Subtract its previous weight from cfs_rq->load.weight.
5375 * - For group entity, update its weight to reflect the new share
5376 * of its group cfs_rq.
5377 */
5378 update_load_avg(cfs_rq, se, action);
5379 se_update_runnable(se);
5380
5381 update_stats_dequeue_fair(cfs_rq, se, flags);
5382
5383 clear_buddies(cfs_rq, se);
5384
5385 update_entity_lag(cfs_rq, se);
5386 if (se != cfs_rq->curr)
5387 __dequeue_entity(cfs_rq, se);
5388 se->on_rq = 0;
5389 account_entity_dequeue(cfs_rq, se);
5390
5391 /* return excess runtime on last dequeue */
5392 return_cfs_rq_runtime(cfs_rq);
5393
5394 update_cfs_group(se);
5395
5396 /*
5397 * Now advance min_vruntime if @se was the entity holding it back,
5398 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
5399 * put back on, and if we advance min_vruntime, we'll be placed back
5400 * further than we started -- ie. we'll be penalized.
5401 */
5402 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
5403 update_min_vruntime(cfs_rq);
5404
5405 if (cfs_rq->nr_running == 0)
5406 update_idle_cfs_rq_clock_pelt(cfs_rq);
5407}
5408
5409static void
5410set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
5411{
5412 clear_buddies(cfs_rq, se);
5413
5414 /* 'current' is not kept within the tree. */
5415 if (se->on_rq) {
5416 /*
5417 * Any task has to be enqueued before it get to execute on
5418 * a CPU. So account for the time it spent waiting on the
5419 * runqueue.
5420 */
5421 update_stats_wait_end_fair(cfs_rq, se);
5422 __dequeue_entity(cfs_rq, se);
5423 update_load_avg(cfs_rq, se, UPDATE_TG);
5424 /*
5425 * HACK, stash a copy of deadline at the point of pick in vlag,
5426 * which isn't used until dequeue.
5427 */
5428 se->vlag = se->deadline;
5429 }
5430
5431 update_stats_curr_start(cfs_rq, se);
5432 cfs_rq->curr = se;
5433
5434 /*
5435 * Track our maximum slice length, if the CPU's load is at
5436 * least twice that of our own weight (i.e. don't track it
5437 * when there are only lesser-weight tasks around):
5438 */
5439 if (schedstat_enabled() &&
5440 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
5441 struct sched_statistics *stats;
5442
5443 stats = __schedstats_from_se(se);
5444 __schedstat_set(stats->slice_max,
5445 max((u64)stats->slice_max,
5446 se->sum_exec_runtime - se->prev_sum_exec_runtime));
5447 }
5448
5449 se->prev_sum_exec_runtime = se->sum_exec_runtime;
5450}
5451
5452/*
 * Pick the next entity to run, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" buddy, since someone really wants that to run
 * 3) otherwise let EEVDF pick the eligible entity with the earliest
 *    virtual deadline
5458 */
5459static struct sched_entity *
5460pick_next_entity(struct cfs_rq *cfs_rq)
5461{
5462 /*
5463 * Enabling NEXT_BUDDY will affect latency but not fairness.
5464 */
5465 if (sched_feat(NEXT_BUDDY) &&
5466 cfs_rq->next && entity_eligible(cfs_rq, se: cfs_rq->next))
5467 return cfs_rq->next;
5468
5469 return pick_eevdf(cfs_rq);
5470}
5471
5472static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5473
5474static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
5475{
5476 /*
5477 * If still on the runqueue then deactivate_task()
5478 * was not called and update_curr() has to be done:
5479 */
5480 if (prev->on_rq)
5481 update_curr(cfs_rq);
5482
5483 /* throttle cfs_rqs exceeding runtime */
5484 check_cfs_rq_runtime(cfs_rq);
5485
5486 if (prev->on_rq) {
5487 update_stats_wait_start_fair(cfs_rq, se: prev);
5488 /* Put 'current' back into the tree. */
5489 __enqueue_entity(cfs_rq, se: prev);
5490 /* in !on_rq case, update occurred at dequeue */
5491 update_load_avg(cfs_rq, se: prev, flags: 0);
5492 }
5493 cfs_rq->curr = NULL;
5494}
5495
5496static void
5497entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
5498{
5499 /*
5500 * Update run-time statistics of the 'current'.
5501 */
5502 update_curr(cfs_rq);
5503
5504 /*
5505 * Ensure that runnable average is periodically updated.
5506 */
5507 update_load_avg(cfs_rq, se: curr, UPDATE_TG);
5508 update_cfs_group(se: curr);
5509
5510#ifdef CONFIG_SCHED_HRTICK
5511 /*
5512 * queued ticks are scheduled to match the slice, so don't bother
5513 * validating it and just reschedule.
5514 */
5515 if (queued) {
5516 resched_curr(rq: rq_of(cfs_rq));
5517 return;
5518 }
5519 /*
5520 * don't let the period tick interfere with the hrtick preemption
5521 */
5522 if (!sched_feat(DOUBLE_TICK) &&
5523 hrtimer_active(timer: &rq_of(cfs_rq)->hrtick_timer))
5524 return;
5525#endif
5526}
5527
5528
5529/**************************************************
5530 * CFS bandwidth control machinery
5531 */
5532
5533#ifdef CONFIG_CFS_BANDWIDTH
5534
5535#ifdef CONFIG_JUMP_LABEL
5536static struct static_key __cfs_bandwidth_used;
5537
5538static inline bool cfs_bandwidth_used(void)
5539{
5540 return static_key_false(key: &__cfs_bandwidth_used);
5541}
5542
5543void cfs_bandwidth_usage_inc(void)
5544{
5545 static_key_slow_inc_cpuslocked(key: &__cfs_bandwidth_used);
5546}
5547
5548void cfs_bandwidth_usage_dec(void)
5549{
5550 static_key_slow_dec_cpuslocked(key: &__cfs_bandwidth_used);
5551}
5552#else /* CONFIG_JUMP_LABEL */
5553static bool cfs_bandwidth_used(void)
5554{
5555 return true;
5556}
5557
5558void cfs_bandwidth_usage_inc(void) {}
5559void cfs_bandwidth_usage_dec(void) {}
5560#endif /* CONFIG_JUMP_LABEL */
5561
5562/*
5563 * default period for cfs group bandwidth.
5564 * default: 0.1s, units: nanoseconds
5565 */
5566static inline u64 default_cfs_period(void)
5567{
5568 return 100000000ULL;
5569}
5570
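/*
 * Local cfs_rqs pull runtime from the global pool in slices of
 * sysctl_sched_cfs_bandwidth_slice (5ms by default) to limit contention
 * on cfs_b->lock.
 */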
5571static inline u64 sched_cfs_bandwidth_slice(void)
5572{
5573 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
5574}
5575
5576/*
5577 * Replenish runtime according to assigned quota. We use sched_clock_cpu
5578 * directly instead of rq->clock to avoid adding additional synchronization
5579 * around rq->lock.
5580 *
5581 * requires cfs_b->lock
5582 */
5583void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5584{
5585 s64 runtime;
5586
5587 if (unlikely(cfs_b->quota == RUNTIME_INF))
5588 return;
5589
5590 cfs_b->runtime += cfs_b->quota;
5591 runtime = cfs_b->runtime_snap - cfs_b->runtime;
5592 if (runtime > 0) {
5593 cfs_b->burst_time += runtime;
5594 cfs_b->nr_burst++;
5595 }
5596
5597 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
5598 cfs_b->runtime_snap = cfs_b->runtime;
5599}
5600
5601static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5602{
5603 return &tg->cfs_bandwidth;
5604}
5605
5606/* returns 0 on failure to allocate runtime */
5607static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
5608 struct cfs_rq *cfs_rq, u64 target_runtime)
5609{
5610 u64 min_amount, amount = 0;
5611
5612 lockdep_assert_held(&cfs_b->lock);
5613
5614 /* note: this is a positive sum as runtime_remaining <= 0 */
5615 min_amount = target_runtime - cfs_rq->runtime_remaining;
5616
5617 if (cfs_b->quota == RUNTIME_INF)
5618 amount = min_amount;
5619 else {
5620 start_cfs_bandwidth(cfs_b);
5621
5622 if (cfs_b->runtime > 0) {
5623 amount = min(cfs_b->runtime, min_amount);
5624 cfs_b->runtime -= amount;
5625 cfs_b->idle = 0;
5626 }
5627 }
5628
5629 cfs_rq->runtime_remaining += amount;
5630
5631 return cfs_rq->runtime_remaining > 0;
5632}
5633
5634/* returns 0 on failure to allocate runtime */
5635static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5636{
5637 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg: cfs_rq->tg);
5638 int ret;
5639
5640 raw_spin_lock(&cfs_b->lock);
5641 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, target_runtime: sched_cfs_bandwidth_slice());
5642 raw_spin_unlock(&cfs_b->lock);
5643
5644 return ret;
5645}
5646
5647static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5648{
5649 /* dock delta_exec before expiring quota (as it could span periods) */
5650 cfs_rq->runtime_remaining -= delta_exec;
5651
5652 if (likely(cfs_rq->runtime_remaining > 0))
5653 return;
5654
5655 if (cfs_rq->throttled)
5656 return;
5657 /*
5658 * if we're unable to extend our runtime we resched so that the active
5659 * hierarchy can be throttled
5660 */
5661 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5662 resched_curr(rq: rq_of(cfs_rq));
5663}
5664
5665static __always_inline
5666void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5667{
5668 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5669 return;
5670
5671 __account_cfs_rq_runtime(cfs_rq, delta_exec);
5672}
5673
5674static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5675{
5676 return cfs_bandwidth_used() && cfs_rq->throttled;
5677}
5678
5679/* check whether cfs_rq, or any parent, is throttled */
5680static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5681{
5682 return cfs_bandwidth_used() && cfs_rq->throttle_count;
5683}
5684
5685/*
5686 * Ensure that neither of the group entities corresponding to src_cpu or
5687 * dest_cpu are members of a throttled hierarchy when performing group
5688 * load-balance operations.
5689 */
5690static inline int throttled_lb_pair(struct task_group *tg,
5691 int src_cpu, int dest_cpu)
5692{
5693 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5694
5695 src_cfs_rq = tg->cfs_rq[src_cpu];
5696 dest_cfs_rq = tg->cfs_rq[dest_cpu];
5697
5698 return throttled_hierarchy(cfs_rq: src_cfs_rq) ||
5699 throttled_hierarchy(cfs_rq: dest_cfs_rq);
5700}
5701
5702static int tg_unthrottle_up(struct task_group *tg, void *data)
5703{
5704 struct rq *rq = data;
5705 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5706
5707 cfs_rq->throttle_count--;
5708 if (!cfs_rq->throttle_count) {
5709 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5710 cfs_rq->throttled_clock_pelt;
5711
5712 /* Add cfs_rq with load or one or more already running entities to the list */
5713 if (!cfs_rq_is_decayed(cfs_rq))
5714 list_add_leaf_cfs_rq(cfs_rq);
5715
5716 if (cfs_rq->throttled_clock_self) {
5717 u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
5718
5719 cfs_rq->throttled_clock_self = 0;
5720
5721 if (SCHED_WARN_ON((s64)delta < 0))
5722 delta = 0;
5723
5724 cfs_rq->throttled_clock_self_time += delta;
5725 }
5726 }
5727
5728 return 0;
5729}
5730
5731static int tg_throttle_down(struct task_group *tg, void *data)
5732{
5733 struct rq *rq = data;
5734 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5735
5736 /* group is entering throttled state, stop time */
5737 if (!cfs_rq->throttle_count) {
5738 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5739 list_del_leaf_cfs_rq(cfs_rq);
5740
5741 SCHED_WARN_ON(cfs_rq->throttled_clock_self);
5742 if (cfs_rq->nr_running)
5743 cfs_rq->throttled_clock_self = rq_clock(rq);
5744 }
5745 cfs_rq->throttle_count++;
5746
5747 return 0;
5748}
5749
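/*
 * Throttle @cfs_rq: dequeue its hierarchy of entities from the runqueue and
 * mark it throttled so that bandwidth distribution can find it again.
 * Returns true if the cfs_rq was throttled, false if it raced with
 * bandwidth becoming available.
 */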
5750static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
5751{
5752 struct rq *rq = rq_of(cfs_rq);
5753 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg: cfs_rq->tg);
5754 struct sched_entity *se;
5755 long task_delta, idle_task_delta, dequeue = 1;
5756
5757 raw_spin_lock(&cfs_b->lock);
5758 /* This will start the period timer if necessary */
5759 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, target_runtime: 1)) {
5760 /*
5761 * We have raced with bandwidth becoming available, and if we
5762 * actually throttled the timer might not unthrottle us for an
		 * entire period. We additionally need to make sure that any
		 * subsequent check_cfs_rq_runtime() calls agree not to throttle
		 * us, as we may have committed to a CFS put_prev+pick_next, so
		 * we ask for 1ns of runtime rather than just checking cfs_b.
5767 */
5768 dequeue = 0;
5769 } else {
5770 list_add_tail_rcu(new: &cfs_rq->throttled_list,
5771 head: &cfs_b->throttled_cfs_rq);
5772 }
5773 raw_spin_unlock(&cfs_b->lock);
5774
5775 if (!dequeue)
5776 return false; /* Throttle no longer required. */
5777
5778 se = cfs_rq->tg->se[cpu_of(rq: rq_of(cfs_rq))];
5779
5780 /* freeze hierarchy runnable averages while throttled */
5781 rcu_read_lock();
5782 walk_tg_tree_from(from: cfs_rq->tg, down: tg_throttle_down, up: tg_nop, data: (void *)rq);
5783 rcu_read_unlock();
5784
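	/*
	 * Dequeue our entities walking up the hierarchy until we reach a
	 * cfs_rq that still has other runnable load; the ancestors above
	 * that point only need their load averages and counts adjusted.
	 */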
5785 task_delta = cfs_rq->h_nr_running;
5786 idle_task_delta = cfs_rq->idle_h_nr_running;
5787 for_each_sched_entity(se) {
5788 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5789 /* throttled entity or throttle-on-deactivate */
5790 if (!se->on_rq)
5791 goto done;
5792
5793 dequeue_entity(cfs_rq: qcfs_rq, se, DEQUEUE_SLEEP);
5794
5795 if (cfs_rq_is_idle(cfs_rq: group_cfs_rq(grp: se)))
5796 idle_task_delta = cfs_rq->h_nr_running;
5797
5798 qcfs_rq->h_nr_running -= task_delta;
5799 qcfs_rq->idle_h_nr_running -= idle_task_delta;
5800
5801 if (qcfs_rq->load.weight) {
5802 /* Avoid re-evaluating load for this entity: */
5803 se = parent_entity(se);
5804 break;
5805 }
5806 }
5807
5808 for_each_sched_entity(se) {
5809 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5810 /* throttled entity or throttle-on-deactivate */
5811 if (!se->on_rq)
5812 goto done;
5813
5814 update_load_avg(cfs_rq: qcfs_rq, se, flags: 0);
5815 se_update_runnable(se);
5816
5817 if (cfs_rq_is_idle(cfs_rq: group_cfs_rq(grp: se)))
5818 idle_task_delta = cfs_rq->h_nr_running;
5819
5820 qcfs_rq->h_nr_running -= task_delta;
5821 qcfs_rq->idle_h_nr_running -= idle_task_delta;
5822 }
5823
	/* At this point se is NULL and we are at root level */
5825 sub_nr_running(rq, count: task_delta);
5826
5827done:
5828 /*
5829 * Note: distribution will already see us throttled via the
5830 * throttled-list. rq->lock protects completion.
5831 */
5832 cfs_rq->throttled = 1;
5833 SCHED_WARN_ON(cfs_rq->throttled_clock);
5834 if (cfs_rq->nr_running)
5835 cfs_rq->throttled_clock = rq_clock(rq);
5836 return true;
5837}
5838
5839void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5840{
5841 struct rq *rq = rq_of(cfs_rq);
5842 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg: cfs_rq->tg);
5843 struct sched_entity *se;
5844 long task_delta, idle_task_delta;
5845
5846 se = cfs_rq->tg->se[cpu_of(rq)];
5847
5848 cfs_rq->throttled = 0;
5849
5850 update_rq_clock(rq);
5851
5852 raw_spin_lock(&cfs_b->lock);
5853 if (cfs_rq->throttled_clock) {
5854 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5855 cfs_rq->throttled_clock = 0;
5856 }
5857 list_del_rcu(entry: &cfs_rq->throttled_list);
5858 raw_spin_unlock(&cfs_b->lock);
5859
5860 /* update hierarchical throttle state */
5861 walk_tg_tree_from(from: cfs_rq->tg, down: tg_nop, up: tg_unthrottle_up, data: (void *)rq);
5862
5863 if (!cfs_rq->load.weight) {
5864 if (!cfs_rq->on_list)
5865 return;
5866 /*
5867 * Nothing to run but something to decay (on_list)?
5868 * Complete the branch.
5869 */
5870 for_each_sched_entity(se) {
5871 if (list_add_leaf_cfs_rq(cfs_rq: cfs_rq_of(se)))
5872 break;
5873 }
5874 goto unthrottle_throttle;
5875 }
5876
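	/*
	 * Mirror of throttle_cfs_rq(): re-enqueue entities up to the first
	 * ancestor that is already queued, then refresh the load averages
	 * and hierarchical counts of the remaining ancestors.
	 */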
5877 task_delta = cfs_rq->h_nr_running;
5878 idle_task_delta = cfs_rq->idle_h_nr_running;
5879 for_each_sched_entity(se) {
5880 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5881
5882 if (se->on_rq)
5883 break;
5884 enqueue_entity(cfs_rq: qcfs_rq, se, ENQUEUE_WAKEUP);
5885
5886 if (cfs_rq_is_idle(cfs_rq: group_cfs_rq(grp: se)))
5887 idle_task_delta = cfs_rq->h_nr_running;
5888
5889 qcfs_rq->h_nr_running += task_delta;
5890 qcfs_rq->idle_h_nr_running += idle_task_delta;
5891
5892 /* end evaluation on encountering a throttled cfs_rq */
5893 if (cfs_rq_throttled(cfs_rq: qcfs_rq))
5894 goto unthrottle_throttle;
5895 }
5896
5897 for_each_sched_entity(se) {
5898 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5899
5900 update_load_avg(cfs_rq: qcfs_rq, se, UPDATE_TG);
5901 se_update_runnable(se);
5902
5903 if (cfs_rq_is_idle(cfs_rq: group_cfs_rq(grp: se)))
5904 idle_task_delta = cfs_rq->h_nr_running;
5905
5906 qcfs_rq->h_nr_running += task_delta;
5907 qcfs_rq->idle_h_nr_running += idle_task_delta;
5908
5909 /* end evaluation on encountering a throttled cfs_rq */
5910 if (cfs_rq_throttled(cfs_rq: qcfs_rq))
5911 goto unthrottle_throttle;
5912 }
5913
	/* At this point se is NULL and we are at root level */
5915 add_nr_running(rq, count: task_delta);
5916
5917unthrottle_throttle:
5918 assert_list_leaf_cfs_rq(rq);
5919
5920 /* Determine whether we need to wake up potentially idle CPU: */
5921 if (rq->curr == rq->idle && rq->cfs.nr_running)
5922 resched_curr(rq);
5923}
5924
5925#ifdef CONFIG_SMP
5926static void __cfsb_csd_unthrottle(void *arg)
5927{
5928 struct cfs_rq *cursor, *tmp;
5929 struct rq *rq = arg;
5930 struct rq_flags rf;
5931
5932 rq_lock(rq, rf: &rf);
5933
5934 /*
	 * Iterating over the list can trigger several calls to
	 * update_rq_clock() in unthrottle_cfs_rq().
	 * Do it once here and skip the subsequent ones.
5938 */
5939 update_rq_clock(rq);
5940 rq_clock_start_loop_update(rq);
5941
5942 /*
5943 * Since we hold rq lock we're safe from concurrent manipulation of
	 * the CSD list. However, this RCU critical section pairs with
	 * sched_free_group_rcu(), so that we cannot race with a group being
	 * freed in the window between removing it from the list and advancing
	 * to the next entry in the list.
5948 */
5949 rcu_read_lock();
5950
5951 list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
5952 throttled_csd_list) {
5953 list_del_init(entry: &cursor->throttled_csd_list);
5954
5955 if (cfs_rq_throttled(cfs_rq: cursor))
5956 unthrottle_cfs_rq(cfs_rq: cursor);
5957 }
5958
5959 rcu_read_unlock();
5960
5961 rq_clock_stop_loop_update(rq);
5962 rq_unlock(rq, rf: &rf);
5963}
5964
5965static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
5966{
5967 struct rq *rq = rq_of(cfs_rq);
5968 bool first;
5969
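	/* The caller already holds this rq's lock: unthrottle in place. */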
5970 if (rq == this_rq()) {
5971 unthrottle_cfs_rq(cfs_rq);
5972 return;
5973 }
5974
5975 /* Already enqueued */
5976 if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
5977 return;
5978
5979 first = list_empty(head: &rq->cfsb_csd_list);
5980 list_add_tail(new: &cfs_rq->throttled_csd_list, head: &rq->cfsb_csd_list);
5981 if (first)
5982 smp_call_function_single_async(cpu: cpu_of(rq), csd: &rq->cfsb_csd);
5983}
5984#else
5985static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
5986{
5987 unthrottle_cfs_rq(cfs_rq);
5988}
5989#endif
5990
5991static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
5992{
5993 lockdep_assert_rq_held(rq: rq_of(cfs_rq));
5994
5995 if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
5996 cfs_rq->runtime_remaining <= 0))
5997 return;
5998
5999 __unthrottle_cfs_rq_async(cfs_rq);
6000}
6001
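/*
 * Hand freshly refilled bandwidth out to throttled cfs_rqs: each one is given
 * just enough runtime to cover its deficit (plus 1ns); remote CPUs are
 * unthrottled asynchronously via CSD, the local CPU inline. Returns true if
 * runtime ran out while some cfs_rqs were still throttled.
 */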
6002static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
6003{
6004 int this_cpu = smp_processor_id();
6005 u64 runtime, remaining = 1;
6006 bool throttled = false;
6007 struct cfs_rq *cfs_rq, *tmp;
6008 struct rq_flags rf;
6009 struct rq *rq;
6010 LIST_HEAD(local_unthrottle);
6011
6012 rcu_read_lock();
6013 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
6014 throttled_list) {
6015 rq = rq_of(cfs_rq);
6016
6017 if (!remaining) {
6018 throttled = true;
6019 break;
6020 }
6021
6022 rq_lock_irqsave(rq, rf: &rf);
6023 if (!cfs_rq_throttled(cfs_rq))
6024 goto next;
6025
6026 /* Already queued for async unthrottle */
6027 if (!list_empty(head: &cfs_rq->throttled_csd_list))
6028 goto next;
6029
6030 /* By the above checks, this should never be true */
6031 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
6032
6033 raw_spin_lock(&cfs_b->lock);
6034 runtime = -cfs_rq->runtime_remaining + 1;
6035 if (runtime > cfs_b->runtime)
6036 runtime = cfs_b->runtime;
6037 cfs_b->runtime -= runtime;
6038 remaining = cfs_b->runtime;
6039 raw_spin_unlock(&cfs_b->lock);
6040
6041 cfs_rq->runtime_remaining += runtime;
6042
6043 /* we check whether we're throttled above */
6044 if (cfs_rq->runtime_remaining > 0) {
6045 if (cpu_of(rq) != this_cpu) {
6046 unthrottle_cfs_rq_async(cfs_rq);
6047 } else {
6048 /*
6049 * We currently only expect to be unthrottling
6050 * a single cfs_rq locally.
6051 */
6052 SCHED_WARN_ON(!list_empty(&local_unthrottle));
6053 list_add_tail(new: &cfs_rq->throttled_csd_list,
6054 head: &local_unthrottle);
6055 }
6056 } else {
6057 throttled = true;
6058 }
6059
6060next:
6061 rq_unlock_irqrestore(rq, rf: &rf);
6062 }
6063
6064 list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
6065 throttled_csd_list) {
6066 struct rq *rq = rq_of(cfs_rq);
6067
6068 rq_lock_irqsave(rq, rf: &rf);
6069
6070 list_del_init(entry: &cfs_rq->throttled_csd_list);
6071
6072 if (cfs_rq_throttled(cfs_rq))
6073 unthrottle_cfs_rq(cfs_rq);
6074
6075 rq_unlock_irqrestore(rq, rf: &rf);
6076 }
6077 SCHED_WARN_ON(!list_empty(&local_unthrottle));
6078
6079 rcu_read_unlock();
6080
6081 return throttled;
6082}
6083
6084/*
6085 * Responsible for refilling a task_group's bandwidth and unthrottling its
6086 * cfs_rqs as appropriate. If there has been no activity within the last
6087 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
6088 * used to track this state.
6089 */
6090static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
6091{
6092 int throttled;
6093
6094 /* no need to continue the timer with no bandwidth constraint */
6095 if (cfs_b->quota == RUNTIME_INF)
6096 goto out_deactivate;
6097
6098 throttled = !list_empty(head: &cfs_b->throttled_cfs_rq);
6099 cfs_b->nr_periods += overrun;
6100
6101 /* Refill extra burst quota even if cfs_b->idle */
6102 __refill_cfs_bandwidth_runtime(cfs_b);
6103
6104 /*
6105 * idle depends on !throttled (for the case of a large deficit), and if
6106 * we're going inactive then everything else can be deferred
6107 */
6108 if (cfs_b->idle && !throttled)
6109 goto out_deactivate;
6110
6111 if (!throttled) {
6112 /* mark as potentially idle for the upcoming period */
6113 cfs_b->idle = 1;
6114 return 0;
6115 }
6116
6117 /* account preceding periods in which throttling occurred */
6118 cfs_b->nr_throttled += overrun;
6119
6120 /*
6121 * This check is repeated as we release cfs_b->lock while we unthrottle.
6122 */
6123 while (throttled && cfs_b->runtime > 0) {
6124 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6125 /* we can't nest cfs_b->lock while distributing bandwidth */
6126 throttled = distribute_cfs_runtime(cfs_b);
6127 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6128 }
6129
6130 /*
6131 * While we are ensured activity in the period following an
6132 * unthrottle, this also covers the case in which the new bandwidth is
6133 * insufficient to cover the existing bandwidth deficit. (Forcing the
6134 * timer to remain active while there are any throttled entities.)
6135 */
6136 cfs_b->idle = 0;
6137
6138 return 0;
6139
6140out_deactivate:
6141 return 1;
6142}
6143
6144/* a cfs_rq won't donate quota below this amount */
6145static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
6146/* minimum remaining period time to redistribute slack quota */
6147static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
6148/* how long we wait to gather additional slack before distributing */
6149static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
6150
6151/*
6152 * Are we near the end of the current quota period?
6153 *
6154 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
6155 * hrtimer base being cleared by hrtimer_start. In the case of
6156 * migrate_hrtimers, base is never cleared, so we are fine.
6157 */
6158static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
6159{
6160 struct hrtimer *refresh_timer = &cfs_b->period_timer;
6161 s64 remaining;
6162
6163 /* if the call-back is running a quota refresh is already occurring */
6164 if (hrtimer_callback_running(timer: refresh_timer))
6165 return 1;
6166
6167 /* is a quota refresh about to occur? */
6168 remaining = ktime_to_ns(kt: hrtimer_expires_remaining(timer: refresh_timer));
6169 if (remaining < (s64)min_expire)
6170 return 1;
6171
6172 return 0;
6173}
6174
6175static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
6176{
6177 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
6178
6179 /* if there's a quota refresh soon don't bother with slack */
6180 if (runtime_refresh_within(cfs_b, min_expire: min_left))
6181 return;
6182
	/* don't push forward an existing deferred unthrottle */
6184 if (cfs_b->slack_started)
6185 return;
6186 cfs_b->slack_started = true;
6187
6188 hrtimer_start(timer: &cfs_b->slack_timer,
6189 tim: ns_to_ktime(ns: cfs_bandwidth_slack_period),
6190 mode: HRTIMER_MODE_REL);
6191}
6192
6193/* we know any runtime found here is valid as update_curr() precedes return */
6194static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6195{
6196 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg: cfs_rq->tg);
6197 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
6198
6199 if (slack_runtime <= 0)
6200 return;
6201
6202 raw_spin_lock(&cfs_b->lock);
6203 if (cfs_b->quota != RUNTIME_INF) {
6204 cfs_b->runtime += slack_runtime;
6205
6206 /* we are under rq->lock, defer unthrottling using a timer */
6207 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
6208 !list_empty(head: &cfs_b->throttled_cfs_rq))
6209 start_cfs_slack_bandwidth(cfs_b);
6210 }
6211 raw_spin_unlock(&cfs_b->lock);
6212
6213 /* even if it's not valid for return we don't want to try again */
6214 cfs_rq->runtime_remaining -= slack_runtime;
6215}
6216
6217static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6218{
6219 if (!cfs_bandwidth_used())
6220 return;
6221
6222 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
6223 return;
6224
6225 __return_cfs_rq_runtime(cfs_rq);
6226}
6227
6228/*
6229 * This is done with a timer (instead of inline with bandwidth return) since
6230 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
6231 */
6232static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
6233{
6234 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
6235 unsigned long flags;
6236
6237 /* confirm we're still not at a refresh boundary */
6238 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6239 cfs_b->slack_started = false;
6240
6241 if (runtime_refresh_within(cfs_b, min_expire: min_bandwidth_expiration)) {
6242 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6243 return;
6244 }
6245
6246 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
6247 runtime = cfs_b->runtime;
6248
6249 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6250
6251 if (!runtime)
6252 return;
6253
6254 distribute_cfs_runtime(cfs_b);
6255}
6256
6257/*
6258 * When a group wakes up we want to make sure that its quota is not already
 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 * runtime, as update_curr() throttling cannot trigger until it is on-rq.
6261 */
6262static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
6263{
6264 if (!cfs_bandwidth_used())
6265 return;
6266
6267 /* an active group must be handled by the update_curr()->put() path */
6268 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
6269 return;
6270
6271 /* ensure the group is not already throttled */
6272 if (cfs_rq_throttled(cfs_rq))
6273 return;
6274
6275 /* update runtime allocation */
6276 account_cfs_rq_runtime(cfs_rq, delta_exec: 0);
6277 if (cfs_rq->runtime_remaining <= 0)
6278 throttle_cfs_rq(cfs_rq);
6279}
6280
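/*
 * Sync a child cfs_rq's throttle state with its parent's (e.g. when a new
 * group comes online), so that throttled_hierarchy() and the PELT throttle
 * clock stay consistent across the hierarchy.
 */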
6281static void sync_throttle(struct task_group *tg, int cpu)
6282{
6283 struct cfs_rq *pcfs_rq, *cfs_rq;
6284
6285 if (!cfs_bandwidth_used())
6286 return;
6287
6288 if (!tg->parent)
6289 return;
6290
6291 cfs_rq = tg->cfs_rq[cpu];
6292 pcfs_rq = tg->parent->cfs_rq[cpu];
6293
6294 cfs_rq->throttle_count = pcfs_rq->throttle_count;
6295 cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
6296}
6297
6298/* conditionally throttle active cfs_rq's from put_prev_entity() */
6299static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6300{
6301 if (!cfs_bandwidth_used())
6302 return false;
6303
6304 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
6305 return false;
6306
6307 /*
	 * It's possible for a throttled entity to be forced into a running
	 * state (e.g. set_curr_task); in this case we're finished.
6310 */
6311 if (cfs_rq_throttled(cfs_rq))
6312 return true;
6313
6314 return throttle_cfs_rq(cfs_rq);
6315}
6316
6317static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
6318{
6319 struct cfs_bandwidth *cfs_b =
6320 container_of(timer, struct cfs_bandwidth, slack_timer);
6321
6322 do_sched_cfs_slack_timer(cfs_b);
6323
6324 return HRTIMER_NORESTART;
6325}
6326
6327extern const u64 max_cfs_quota_period;
6328
6329static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
6330{
6331 struct cfs_bandwidth *cfs_b =
6332 container_of(timer, struct cfs_bandwidth, period_timer);
6333 unsigned long flags;
6334 int overrun;
6335 int idle = 0;
6336 int count = 0;
6337
6338 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6339 for (;;) {
6340 overrun = hrtimer_forward_now(timer, interval: cfs_b->period);
6341 if (!overrun)
6342 break;
6343
6344 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
6345
6346 if (++count > 3) {
6347 u64 new, old = ktime_to_ns(kt: cfs_b->period);
6348
6349 /*
6350 * Grow period by a factor of 2 to avoid losing precision.
6351 * Precision loss in the quota/period ratio can cause __cfs_schedulable
6352 * to fail.
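			 *
			 * Doubling quota and burst along with the period keeps
			 * the effective bandwidth unchanged, e.g. 5ms/10ms and
			 * 10ms/20ms both allow 50% of one CPU.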
6353 */
6354 new = old * 2;
6355 if (new < max_cfs_quota_period) {
6356 cfs_b->period = ns_to_ktime(ns: new);
6357 cfs_b->quota *= 2;
6358 cfs_b->burst *= 2;
6359
6360 pr_warn_ratelimited(
6361 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6362 smp_processor_id(),
6363 div_u64(new, NSEC_PER_USEC),
6364 div_u64(cfs_b->quota, NSEC_PER_USEC));
6365 } else {
6366 pr_warn_ratelimited(
6367 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6368 smp_processor_id(),
6369 div_u64(old, NSEC_PER_USEC),
6370 div_u64(cfs_b->quota, NSEC_PER_USEC));
6371 }
6372
6373 /* reset count so we don't come right back in here */
6374 count = 0;
6375 }
6376 }
6377 if (idle)
6378 cfs_b->period_active = 0;
6379 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6380
6381 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
6382}
6383
6384void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
6385{
6386 raw_spin_lock_init(&cfs_b->lock);
6387 cfs_b->runtime = 0;
6388 cfs_b->quota = RUNTIME_INF;
6389 cfs_b->period = ns_to_ktime(ns: default_cfs_period());
6390 cfs_b->burst = 0;
6391 cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
6392
6393 INIT_LIST_HEAD(list: &cfs_b->throttled_cfs_rq);
6394 hrtimer_init(timer: &cfs_b->period_timer, CLOCK_MONOTONIC, mode: HRTIMER_MODE_ABS_PINNED);
6395 cfs_b->period_timer.function = sched_cfs_period_timer;
6396
6397 /* Add a random offset so that timers interleave */
6398 hrtimer_set_expires(timer: &cfs_b->period_timer,
6399 time: get_random_u32_below(ceil: cfs_b->period));
6400 hrtimer_init(timer: &cfs_b->slack_timer, CLOCK_MONOTONIC, mode: HRTIMER_MODE_REL);
6401 cfs_b->slack_timer.function = sched_cfs_slack_timer;
6402 cfs_b->slack_started = false;
6403}
6404
6405static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6406{
6407 cfs_rq->runtime_enabled = 0;
6408 INIT_LIST_HEAD(list: &cfs_rq->throttled_list);
6409 INIT_LIST_HEAD(list: &cfs_rq->throttled_csd_list);
6410}
6411
6412void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6413{
6414 lockdep_assert_held(&cfs_b->lock);
6415
6416 if (cfs_b->period_active)
6417 return;
6418
6419 cfs_b->period_active = 1;
6420 hrtimer_forward_now(timer: &cfs_b->period_timer, interval: cfs_b->period);
6421 hrtimer_start_expires(timer: &cfs_b->period_timer, mode: HRTIMER_MODE_ABS_PINNED);
6422}
6423
6424static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6425{
6426 int __maybe_unused i;
6427
6428 /* init_cfs_bandwidth() was not called */
6429 if (!cfs_b->throttled_cfs_rq.next)
6430 return;
6431
6432 hrtimer_cancel(timer: &cfs_b->period_timer);
6433 hrtimer_cancel(timer: &cfs_b->slack_timer);
6434
6435 /*
6436 * It is possible that we still have some cfs_rq's pending on a CSD
6437 * list, though this race is very rare. In order for this to occur, we
6438 * must have raced with the last task leaving the group while there
6439 * exist throttled cfs_rq(s), and the period_timer must have queued the
	 * CSD item but the remote CPU has not yet processed it. To handle this,
6441 * we can simply flush all pending CSD work inline here. We're
6442 * guaranteed at this point that no additional cfs_rq of this group can
6443 * join a CSD list.
6444 */
6445#ifdef CONFIG_SMP
6446 for_each_possible_cpu(i) {
6447 struct rq *rq = cpu_rq(i);
6448 unsigned long flags;
6449
6450 if (list_empty(head: &rq->cfsb_csd_list))
6451 continue;
6452
6453 local_irq_save(flags);
6454 __cfsb_csd_unthrottle(arg: rq);
6455 local_irq_restore(flags);
6456 }
6457#endif
6458}
6459
6460/*
6461 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
6462 *
6463 * The race is harmless, since modifying bandwidth settings of unhooked group
6464 * bits doesn't do much.
6465 */
6466
6467/* cpu online callback */
6468static void __maybe_unused update_runtime_enabled(struct rq *rq)
6469{
6470 struct task_group *tg;
6471
6472 lockdep_assert_rq_held(rq);
6473
6474 rcu_read_lock();
6475 list_for_each_entry_rcu(tg, &task_groups, list) {
6476 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6477 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6478
6479 raw_spin_lock(&cfs_b->lock);
6480 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
6481 raw_spin_unlock(&cfs_b->lock);
6482 }
6483 rcu_read_unlock();
6484}
6485
6486/* cpu offline callback */
6487static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
6488{
6489 struct task_group *tg;
6490
6491 lockdep_assert_rq_held(rq);
6492
6493 /*
	 * The rq clock has already been updated in set_rq_offline(),
	 * so we should skip updating the rq clock again in
	 * unthrottle_cfs_rq().
6497 */
6498 rq_clock_start_loop_update(rq);
6499
6500 rcu_read_lock();
6501 list_for_each_entry_rcu(tg, &task_groups, list) {
6502 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6503
6504 if (!cfs_rq->runtime_enabled)
6505 continue;
6506
6507 /*
6508 * clock_task is not advancing so we just need to make sure
6509 * there's some valid quota amount
6510 */
6511 cfs_rq->runtime_remaining = 1;
6512 /*
6513 * Offline rq is schedulable till CPU is completely disabled
6514 * in take_cpu_down(), so we prevent new cfs throttling here.
6515 */
6516 cfs_rq->runtime_enabled = 0;
6517
6518 if (cfs_rq_throttled(cfs_rq))
6519 unthrottle_cfs_rq(cfs_rq);
6520 }
6521 rcu_read_unlock();
6522
6523 rq_clock_stop_loop_update(rq);
6524}
6525
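/*
 * Return true if @p runs in a task group that has a bandwidth limit anywhere
 * in its hierarchy.
 */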
6526bool cfs_task_bw_constrained(struct task_struct *p)
6527{
6528 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6529
6530 if (!cfs_bandwidth_used())
6531 return false;
6532
6533 if (cfs_rq->runtime_enabled ||
6534 tg_cfs_bandwidth(tg: cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
6535 return true;
6536
6537 return false;
6538}
6539
6540#ifdef CONFIG_NO_HZ_FULL
6541/* called from pick_next_task_fair() */
6542static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
6543{
6544 int cpu = cpu_of(rq);
6545
6546 if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
6547 return;
6548
6549 if (!tick_nohz_full_cpu(cpu))
6550 return;
6551
6552 if (rq->nr_running != 1)
6553 return;
6554
6555 /*
	 * We know there is only one runnable task and we've just picked it. The
	 * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we would
	 * otherwise be able to stop the tick. We just need to check whether we
	 * are using bandwidth control.
6560 */
6561 if (cfs_task_bw_constrained(p))
6562 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
6563}
6564#endif
6565
6566#else /* CONFIG_CFS_BANDWIDTH */
6567
6568static inline bool cfs_bandwidth_used(void)
6569{
6570 return false;
6571}
6572
6573static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
6574static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
6575static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6576static inline void sync_throttle(struct task_group *tg, int cpu) {}
6577static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6578
6579static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
6580{
6581 return 0;
6582}
6583
6584static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
6585{
6586 return 0;
6587}
6588
6589static inline int throttled_lb_pair(struct task_group *tg,
6590 int src_cpu, int dest_cpu)
6591{
6592 return 0;
6593}
6594
6595#ifdef CONFIG_FAIR_GROUP_SCHED
6596void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
6597static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6598#endif
6599
6600static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
6601{
6602 return NULL;
6603}
6604static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
6605static inline void update_runtime_enabled(struct rq *rq) {}
6606static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6607#ifdef CONFIG_CGROUP_SCHED
6608bool cfs_task_bw_constrained(struct task_struct *p)
6609{
6610 return false;
6611}
6612#endif
6613#endif /* CONFIG_CFS_BANDWIDTH */
6614
6615#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
6616static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
6617#endif
6618
6619/**************************************************
6620 * CFS operations on tasks:
6621 */
6622
6623#ifdef CONFIG_SCHED_HRTICK
6624static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
6625{
6626 struct sched_entity *se = &p->se;
6627
6628 SCHED_WARN_ON(task_rq(p) != rq);
6629
6630 if (rq->cfs.h_nr_running > 1) {
6631 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
6632 u64 slice = se->slice;
6633 s64 delta = slice - ran;
6634
6635 if (delta < 0) {
6636 if (task_current(rq, p))
6637 resched_curr(rq);
6638 return;
6639 }
6640 hrtick_start(rq, delay: delta);
6641 }
6642}
6643
6644/*
6645 * called from enqueue/dequeue and updates the hrtick when the
6646 * current task is from our class and nr_running is low enough
6647 * to matter.
6648 */
6649static void hrtick_update(struct rq *rq)
6650{
6651 struct task_struct *curr = rq->curr;
6652
6653 if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
6654 return;
6655
6656 hrtick_start_fair(rq, p: curr);
6657}
6658#else /* !CONFIG_SCHED_HRTICK */
6659static inline void
6660hrtick_start_fair(struct rq *rq, struct task_struct *p)
6661{
6662}
6663
6664static inline void hrtick_update(struct rq *rq)
6665{
6666}
6667#endif
6668
6669#ifdef CONFIG_SMP
6670static inline bool cpu_overutilized(int cpu)
6671{
6672 unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), clamp_id: UCLAMP_MIN);
6673 unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), clamp_id: UCLAMP_MAX);
6674
6675 /* Return true only if the utilization doesn't fit CPU's capacity */
6676 return !util_fits_cpu(util: cpu_util_cfs(cpu), uclamp_min: rq_util_min, uclamp_max: rq_util_max, cpu);
6677}
6678
6679static inline void update_overutilized_status(struct rq *rq)
6680{
6681 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(cpu: rq->cpu)) {
6682 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
6683 trace_sched_overutilized_tp(rd: rq->rd, SG_OVERUTILIZED);
6684 }
6685}
6686#else
6687static inline void update_overutilized_status(struct rq *rq) { }
6688#endif
6689
6690/* Runqueue only has SCHED_IDLE tasks enqueued */
6691static int sched_idle_rq(struct rq *rq)
6692{
6693 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
6694 rq->nr_running);
6695}
6696
6697#ifdef CONFIG_SMP
6698static int sched_idle_cpu(int cpu)
6699{
6700 return sched_idle_rq(cpu_rq(cpu));
6701}
6702#endif
6703
6704/*
6705 * The enqueue_task method is called before nr_running is
6706 * increased. Here we update the fair scheduling stats and
6707 * then put the task into the rbtree:
6708 */
6709static void
6710enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
6711{
6712 struct cfs_rq *cfs_rq;
6713 struct sched_entity *se = &p->se;
6714 int idle_h_nr_running = task_has_idle_policy(p);
6715 int task_new = !(flags & ENQUEUE_WAKEUP);
6716
6717 /*
6718 * The code below (indirectly) updates schedutil which looks at
6719 * the cfs_rq utilization to select a frequency.
6720 * Let's add the task's estimated utilization to the cfs_rq's
6721 * estimated utilization, before we update schedutil.
6722 */
6723 util_est_enqueue(cfs_rq: &rq->cfs, p);
6724
6725 /*
6726 * If in_iowait is set, the code below may not trigger any cpufreq
6727 * utilization updates, so do it here explicitly with the IOWAIT flag
6728 * passed.
6729 */
6730 if (p->in_iowait)
6731 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
6732
6733 for_each_sched_entity(se) {
6734 if (se->on_rq)
6735 break;
6736 cfs_rq = cfs_rq_of(se);
6737 enqueue_entity(cfs_rq, se, flags);
6738
6739 cfs_rq->h_nr_running++;
6740 cfs_rq->idle_h_nr_running += idle_h_nr_running;
6741
6742 if (cfs_rq_is_idle(cfs_rq))
6743 idle_h_nr_running = 1;
6744
6745 /* end evaluation on encountering a throttled cfs_rq */
6746 if (cfs_rq_throttled(cfs_rq))
6747 goto enqueue_throttle;
6748
6749 flags = ENQUEUE_WAKEUP;
6750 }
6751
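	/*
	 * The remaining ancestors were already enqueued; just refresh their
	 * load averages, runnable counts and group weights.
	 */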
6752 for_each_sched_entity(se) {
6753 cfs_rq = cfs_rq_of(se);
6754
6755 update_load_avg(cfs_rq, se, UPDATE_TG);
6756 se_update_runnable(se);
6757 update_cfs_group(se);
6758
6759 cfs_rq->h_nr_running++;
6760 cfs_rq->idle_h_nr_running += idle_h_nr_running;
6761
6762 if (cfs_rq_is_idle(cfs_rq))
6763 idle_h_nr_running = 1;
6764
6765 /* end evaluation on encountering a throttled cfs_rq */
6766 if (cfs_rq_throttled(cfs_rq))
6767 goto enqueue_throttle;
6768 }
6769
	/* At this point se is NULL and we are at root level */
6771 add_nr_running(rq, count: 1);
6772
6773 /*
6774 * Since new tasks are assigned an initial util_avg equal to
6775 * half of the spare capacity of their CPU, tiny tasks have the
6776 * ability to cross the overutilized threshold, which will
6777 * result in the load balancer ruining all the task placement
6778 * done by EAS. As a way to mitigate that effect, do not account
6779 * for the first enqueue operation of new tasks during the
6780 * overutilized flag detection.
6781 *
6782 * A better way of solving this problem would be to wait for
6783 * the PELT signals of tasks to converge before taking them
6784 * into account, but that is not straightforward to implement,
6785 * and the following generally works well enough in practice.
6786 */
6787 if (!task_new)
6788 update_overutilized_status(rq);
6789
6790enqueue_throttle:
6791 assert_list_leaf_cfs_rq(rq);
6792
6793 hrtick_update(rq);
6794}
6795
6796static void set_next_buddy(struct sched_entity *se);
6797
6798/*
6799 * The dequeue_task method is called before nr_running is
6800 * decreased. We remove the task from the rbtree and
6801 * update the fair scheduling stats:
6802 */
6803static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
6804{
6805 struct cfs_rq *cfs_rq;
6806 struct sched_entity *se = &p->se;
6807 int task_sleep = flags & DEQUEUE_SLEEP;
6808 int idle_h_nr_running = task_has_idle_policy(p);
6809 bool was_sched_idle = sched_idle_rq(rq);
6810
6811 util_est_dequeue(cfs_rq: &rq->cfs, p);
6812
6813 for_each_sched_entity(se) {
6814 cfs_rq = cfs_rq_of(se);
6815 dequeue_entity(cfs_rq, se, flags);
6816
6817 cfs_rq->h_nr_running--;
6818 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
6819
6820 if (cfs_rq_is_idle(cfs_rq))
6821 idle_h_nr_running = 1;
6822
6823 /* end evaluation on encountering a throttled cfs_rq */
6824 if (cfs_rq_throttled(cfs_rq))
6825 goto dequeue_throttle;
6826
6827 /* Don't dequeue parent if it has other entities besides us */
6828 if (cfs_rq->load.weight) {
6829 /* Avoid re-evaluating load for this entity: */
6830 se = parent_entity(se);
6831 /*
6832 * Bias pick_next to pick a task from this cfs_rq, as
6833 * p is sleeping when it is within its sched_slice.
6834 */
6835 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
6836 set_next_buddy(se);
6837 break;
6838 }
6839 flags |= DEQUEUE_SLEEP;
6840 }
6841
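	/*
	 * Ancestors that keep other runnable entities stay enqueued; just
	 * refresh their load averages, runnable counts and group weights.
	 */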
6842 for_each_sched_entity(se) {
6843 cfs_rq = cfs_rq_of(se);
6844
6845 update_load_avg(cfs_rq, se, UPDATE_TG);
6846 se_update_runnable(se);
6847 update_cfs_group(se);
6848
6849 cfs_rq->h_nr_running--;
6850 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
6851
6852 if (cfs_rq_is_idle(cfs_rq))
6853 idle_h_nr_running = 1;
6854
6855 /* end evaluation on encountering a throttled cfs_rq */
6856 if (cfs_rq_throttled(cfs_rq))
6857 goto dequeue_throttle;
6858
6859 }
6860
	/* At this point se is NULL and we are at root level */
6862 sub_nr_running(rq, count: 1);
6863
6864 /* balance early to pull high priority tasks */
6865 if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
6866 rq->next_balance = jiffies;
6867
6868dequeue_throttle:
6869 util_est_update(cfs_rq: &rq->cfs, p, task_sleep);
6870 hrtick_update(rq);
6871}
6872
6873#ifdef CONFIG_SMP
6874
6875/* Working cpumask for: load_balance, load_balance_newidle. */
6876static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6877static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
6878static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
6879
6880#ifdef CONFIG_NO_HZ_COMMON
6881
6882static struct {
6883 cpumask_var_t idle_cpus_mask;
6884 atomic_t nr_cpus;
	int			has_blocked;	/* Idle CPUs have blocked load */
6886 int needs_update; /* Newly idle CPUs need their next_balance collated */
6887 unsigned long next_balance; /* in jiffy units */
6888 unsigned long next_blocked; /* Next update of blocked load in jiffies */
6889} nohz ____cacheline_aligned;
6890
6891#endif /* CONFIG_NO_HZ_COMMON */
6892
6893static unsigned long cpu_load(struct rq *rq)
6894{
6895 return cfs_rq_load_avg(cfs_rq: &rq->cfs);
6896}
6897
6898/*
6899 * cpu_load_without - compute CPU load without any contributions from *p
6900 * @cpu: the CPU which load is requested
6901 * @p: the task which load should be discounted
6902 *
6903 * The load of a CPU is defined by the load of tasks currently enqueued on that
6904 * CPU as well as tasks which are currently sleeping after an execution on that
6905 * CPU.
6906 *
6907 * This method returns the load of the specified CPU by discounting the load of
6908 * the specified task, whenever the task is currently contributing to the CPU
6909 * load.
6910 */
6911static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
6912{
6913 struct cfs_rq *cfs_rq;
6914 unsigned int load;
6915
6916 /* Task has no contribution or is new */
6917 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6918 return cpu_load(rq);
6919
6920 cfs_rq = &rq->cfs;
6921 load = READ_ONCE(cfs_rq->avg.load_avg);
6922
	/* Discount task's load from CPU's load */
6924 lsub_positive(&load, task_h_load(p));
6925
6926 return load;
6927}
6928
6929static unsigned long cpu_runnable(struct rq *rq)
6930{
6931 return cfs_rq_runnable_avg(cfs_rq: &rq->cfs);
6932}
6933
6934static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
6935{
6936 struct cfs_rq *cfs_rq;
6937 unsigned int runnable;
6938
6939 /* Task has no contribution or is new */
6940 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6941 return cpu_runnable(rq);
6942
6943 cfs_rq = &rq->cfs;
6944 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
6945
6946 /* Discount task's runnable from CPU's runnable */
6947 lsub_positive(&runnable, p->se.avg.runnable_avg);
6948
6949 return runnable;
6950}
6951
6952static unsigned long capacity_of(int cpu)
6953{
6954 return cpu_rq(cpu)->cpu_capacity;
6955}
6956
6957static void record_wakee(struct task_struct *p)
6958{
6959 /*
	 * Only decay a single time; tasks that have less than 1 wakeup per
6961 * jiffy will not have built up many flips.
6962 */
6963 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
6964 current->wakee_flips >>= 1;
6965 current->wakee_flip_decay_ts = jiffies;
6966 }
6967
6968 if (current->last_wakee != p) {
6969 current->last_wakee = p;
6970 current->wakee_flips++;
6971 }
6972}
6973
6974/*
6975 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
6976 *
6977 * A waker of many should wake a different task than the one last awakened
6978 * at a frequency roughly N times higher than one of its wakees.
6979 *
 * In order to determine whether we should let the load spread vs consolidate
 * onto shared cache, we look for a minimum 'flip' frequency of llc_size in
 * one partner, and a factor of llc_size higher frequency in the other.
6983 *
6984 * With both conditions met, we can be relatively sure that the relationship is
6985 * non-monogamous, with partner count exceeding socket size.
6986 *
 * Whether waker/wakee are client/server, worker/dispatcher, interrupt source
 * or whatever is irrelevant; the spread criterion is simply that the apparent
 * partner count exceeds the socket size.
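 *
 * E.g. with llc_size == 8, we only spread when the partner with fewer flips
 * has at least 8 of them and the other at least 8 times as many.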
6990 */
6991static int wake_wide(struct task_struct *p)
6992{
6993 unsigned int master = current->wakee_flips;
6994 unsigned int slave = p->wakee_flips;
6995 int factor = __this_cpu_read(sd_llc_size);
6996
6997 if (master < slave)
6998 swap(master, slave);
6999 if (slave < factor || master < slave * factor)
7000 return 0;
7001 return 1;
7002}
7003
7004/*
7005 * The purpose of wake_affine() is to quickly determine on which CPU we can run
7006 * soonest. For the purpose of speed we only consider the waking and previous
7007 * CPU.
7008 *
 * wake_affine_idle() - only considers 'now', it checks if the waking CPU is
7010 * cache-affine and is (or will be) idle.
7011 *
7012 * wake_affine_weight() - considers the weight to reflect the average
7013 * scheduling latency of the CPUs. This seems to work
7014 * for the overloaded case.
7015 */
7016static int
7017wake_affine_idle(int this_cpu, int prev_cpu, int sync)
7018{
7019 /*
7020 * If this_cpu is idle, it implies the wakeup is from interrupt
7021 * context. Only allow the move if cache is shared. Otherwise an
7022 * interrupt intensive workload could force all tasks onto one
7023 * node depending on the IO topology or IRQ affinity settings.
7024 *
7025 * If the prev_cpu is idle and cache affine then avoid a migration.
7026 * There is no guarantee that the cache hot data from an interrupt
7027 * is more important than cache hot data on the prev_cpu and from
7028 * a cpufreq perspective, it's better to have higher utilisation
7029 * on one CPU.
7030 */
7031 if (available_idle_cpu(cpu: this_cpu) && cpus_share_cache(this_cpu, that_cpu: prev_cpu))
7032 return available_idle_cpu(cpu: prev_cpu) ? prev_cpu : this_cpu;
7033
7034 if (sync && cpu_rq(this_cpu)->nr_running == 1)
7035 return this_cpu;
7036
7037 if (available_idle_cpu(cpu: prev_cpu))
7038 return prev_cpu;
7039
7040 return nr_cpumask_bits;
7041}
7042
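/*
 * Compare the load this_cpu would carry with the wakee against the load
 * prev_cpu would keep without it, each side scaled by the other CPU's
 * capacity so the comparison holds on asymmetric systems:
 *
 *   (this_load + task_load) * capacity(prev_cpu)  vs
 *   (prev_load - task_load) * capacity(this_cpu)
 *
 * With WA_BIAS the prev_cpu side is additionally inflated by half of the
 * domain's imbalance_pct, biasing the result towards an affine wakeup.
 */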
7043static int
7044wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
7045 int this_cpu, int prev_cpu, int sync)
7046{
7047 s64 this_eff_load, prev_eff_load;
7048 unsigned long task_load;
7049
7050 this_eff_load = cpu_load(cpu_rq(this_cpu));
7051
7052 if (sync) {
7053 unsigned long current_load = task_h_load(current);
7054
7055 if (current_load > this_eff_load)
7056 return this_cpu;
7057
7058 this_eff_load -= current_load;
7059 }
7060
7061 task_load = task_h_load(p);
7062
7063 this_eff_load += task_load;
7064 if (sched_feat(WA_BIAS))
7065 this_eff_load *= 100;
7066 this_eff_load *= capacity_of(cpu: prev_cpu);
7067
7068 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
7069 prev_eff_load -= task_load;
7070 if (sched_feat(WA_BIAS))
7071 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
7072 prev_eff_load *= capacity_of(cpu: this_cpu);
7073
7074 /*
7075 * If sync, adjust the weight of prev_eff_load such that if
7076 * prev_eff == this_eff that select_idle_sibling() will consider
7077 * stacking the wakee on top of the waker if no other CPU is
7078 * idle.
7079 */
7080 if (sync)
7081 prev_eff_load += 1;
7082
7083 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
7084}
7085
7086static int wake_affine(struct sched_domain *sd, struct task_struct *p,
7087 int this_cpu, int prev_cpu, int sync)
7088{
7089 int target = nr_cpumask_bits;
7090
7091 if (sched_feat(WA_IDLE))
7092 target = wake_affine_idle(this_cpu, prev_cpu, sync);
7093
7094 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
7095 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
7096
7097 schedstat_inc(p->stats.nr_wakeups_affine_attempts);
7098 if (target != this_cpu)
7099 return prev_cpu;
7100
7101 schedstat_inc(sd->ttwu_move_affine);
7102 schedstat_inc(p->stats.nr_wakeups_affine);
7103 return target;
7104}
7105
7106static struct sched_group *
7107find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
7108
7109/*
7110 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
7111 */
7112static int
7113find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7114{
7115 unsigned long load, min_load = ULONG_MAX;
7116 unsigned int min_exit_latency = UINT_MAX;
7117 u64 latest_idle_timestamp = 0;
7118 int least_loaded_cpu = this_cpu;
7119 int shallowest_idle_cpu = -1;
7120 int i;
7121
7122 /* Check if we have any choice: */
7123 if (group->group_weight == 1)
7124 return cpumask_first(srcp: sched_group_span(sg: group));
7125
7126 /* Traverse only the allowed CPUs */
7127 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
7128 struct rq *rq = cpu_rq(i);
7129
7130 if (!sched_core_cookie_match(rq, p))
7131 continue;
7132
7133 if (sched_idle_cpu(cpu: i))
7134 return i;
7135
7136 if (available_idle_cpu(cpu: i)) {
7137 struct cpuidle_state *idle = idle_get_state(rq);
7138 if (idle && idle->exit_latency < min_exit_latency) {
7139 /*
7140 * We give priority to a CPU whose idle state
7141 * has the smallest exit latency irrespective
7142 * of any idle timestamp.
7143 */
7144 min_exit_latency = idle->exit_latency;
7145 latest_idle_timestamp = rq->idle_stamp;
7146 shallowest_idle_cpu = i;
7147 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7148 rq->idle_stamp > latest_idle_timestamp) {
7149 /*
7150 * If equal or no active idle state, then
7151 * the most recently idled CPU might have
7152 * a warmer cache.
7153 */
7154 latest_idle_timestamp = rq->idle_stamp;
7155 shallowest_idle_cpu = i;
7156 }
7157 } else if (shallowest_idle_cpu == -1) {
7158 load = cpu_load(cpu_rq(i));
7159 if (load < min_load) {
7160 min_load = load;
7161 least_loaded_cpu = i;
7162 }
7163 }
7164 }
7165
7166 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7167}
7168
7169static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
7170 int cpu, int prev_cpu, int sd_flag)
7171{
7172 int new_cpu = cpu;
7173
7174 if (!cpumask_intersects(src1p: sched_domain_span(sd), src2p: p->cpus_ptr))
7175 return prev_cpu;
7176
7177 /*
7178 * We need task's util for cpu_util_without, sync it up to
7179 * prev_cpu's last_update_time.
7180 */
7181 if (!(sd_flag & SD_BALANCE_FORK))
7182 sync_entity_load_avg(se: &p->se);
7183
7184 while (sd) {
7185 struct sched_group *group;
7186 struct sched_domain *tmp;
7187 int weight;
7188
7189 if (!(sd->flags & sd_flag)) {
7190 sd = sd->child;
7191 continue;
7192 }
7193
7194 group = find_idlest_group(sd, p, this_cpu: cpu);
7195 if (!group) {
7196 sd = sd->child;
7197 continue;
7198 }
7199
7200 new_cpu = find_idlest_group_cpu(group, p, this_cpu: cpu);
7201 if (new_cpu == cpu) {
7202 /* Now try balancing at a lower domain level of 'cpu': */
7203 sd = sd->child;
7204 continue;
7205 }
7206
7207 /* Now try balancing at a lower domain level of 'new_cpu': */
7208 cpu = new_cpu;
7209 weight = sd->span_weight;
7210 sd = NULL;
7211 for_each_domain(cpu, tmp) {
7212 if (weight <= tmp->span_weight)
7213 break;
7214 if (tmp->flags & sd_flag)
7215 sd = tmp;
7216 }
7217 }
7218
7219 return new_cpu;
7220}
7221
7222static inline int __select_idle_cpu(int cpu, struct task_struct *p)
7223{
7224 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
7225 sched_cpu_cookie_match(cpu_rq(cpu), p))
7226 return cpu;
7227
7228 return -1;
7229}
7230
7231#ifdef CONFIG_SCHED_SMT
7232DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7233EXPORT_SYMBOL_GPL(sched_smt_present);
7234
7235static inline void set_idle_cores(int cpu, int val)
7236{
7237 struct sched_domain_shared *sds;
7238
7239 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7240 if (sds)
7241 WRITE_ONCE(sds->has_idle_cores, val);
7242}
7243
7244static inline bool test_idle_cores(int cpu)
7245{
7246 struct sched_domain_shared *sds;
7247
7248 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7249 if (sds)
7250 return READ_ONCE(sds->has_idle_cores);
7251
7252 return false;
7253}
7254
7255/*
7256 * Scans the local SMT mask to see if the entire core is idle, and records this
7257 * information in sd_llc_shared->has_idle_cores.
7258 *
7259 * Since SMT siblings share all cache levels, inspecting this limited remote
7260 * state should be fairly cheap.
7261 */
7262void __update_idle_core(struct rq *rq)
7263{
7264 int core = cpu_of(rq);
7265 int cpu;
7266
7267 rcu_read_lock();
7268 if (test_idle_cores(cpu: core))
7269 goto unlock;
7270
7271 for_each_cpu(cpu, cpu_smt_mask(core)) {
7272 if (cpu == core)
7273 continue;
7274
7275 if (!available_idle_cpu(cpu))
7276 goto unlock;
7277 }
7278
7279 set_idle_cores(cpu: core, val: 1);
7280unlock:
7281 rcu_read_unlock();
7282}
7283
7284/*
7285 * Scan the entire LLC domain for idle cores; this dynamically switches off if
7286 * there are no idle cores left in the system; tracked through
7287 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
7288 */
7289static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7290{
7291 bool idle = true;
7292 int cpu;
7293
7294 for_each_cpu(cpu, cpu_smt_mask(core)) {
7295 if (!available_idle_cpu(cpu)) {
7296 idle = false;
7297 if (*idle_cpu == -1) {
7298 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpumask: cpus)) {
7299 *idle_cpu = cpu;
7300 break;
7301 }
7302 continue;
7303 }
7304 break;
7305 }
7306 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpumask: cpus))
7307 *idle_cpu = cpu;
7308 }
7309
7310 if (idle)
7311 return core;
7312
7313 cpumask_andnot(dstp: cpus, src1p: cpus, src2p: cpu_smt_mask(cpu: core));
7314 return -1;
7315}
7316
7317/*
7318 * Scan the local SMT mask for idle CPUs.
7319 */
7320static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7321{
7322 int cpu;
7323
7324 for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
7325 if (cpu == target)
7326 continue;
7327 /*
7328 * Check if the CPU is in the LLC scheduling domain of @target.
7329 * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
7330 */
7331 if (!cpumask_test_cpu(cpu, cpumask: sched_domain_span(sd)))
7332 continue;
7333 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
7334 return cpu;
7335 }
7336
7337 return -1;
7338}
7339
7340#else /* CONFIG_SCHED_SMT */
7341
7342static inline void set_idle_cores(int cpu, int val)
7343{
7344}
7345
7346static inline bool test_idle_cores(int cpu)
7347{
7348 return false;
7349}
7350
7351static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7352{
7353 return __select_idle_cpu(core, p);
7354}
7355
7356static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7357{
7358 return -1;
7359}
7360
7361#endif /* CONFIG_SCHED_SMT */
7362
7363/*
7364 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
7365 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
7366 * average idle time for this rq (as found in rq->avg_idle).
7367 */
7368static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
7369{
7370 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7371 int i, cpu, idle_cpu = -1, nr = INT_MAX;
7372 struct sched_domain_shared *sd_share;
7373
7374 cpumask_and(dstp: cpus, src1p: sched_domain_span(sd), src2p: p->cpus_ptr);
7375
7376 if (sched_feat(SIS_UTIL)) {
7377 sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
7378 if (sd_share) {
7379			/* because --nr <= 0 below is the condition that stops the scan */
7380 nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
7381 /* overloaded LLC is unlikely to have idle cpu/core */
7382 if (nr == 1)
7383 return -1;
7384 }
7385 }
7386
7387 if (static_branch_unlikely(&sched_cluster_active)) {
7388 struct sched_group *sg = sd->groups;
7389
7390 if (sg->flags & SD_CLUSTER) {
7391 for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
7392 if (!cpumask_test_cpu(cpu, cpumask: cpus))
7393 continue;
7394
7395 if (has_idle_core) {
7396 i = select_idle_core(p, core: cpu, cpus, idle_cpu: &idle_cpu);
7397 if ((unsigned int)i < nr_cpumask_bits)
7398 return i;
7399 } else {
7400 if (--nr <= 0)
7401 return -1;
7402 idle_cpu = __select_idle_cpu(cpu, p);
7403 if ((unsigned int)idle_cpu < nr_cpumask_bits)
7404 return idle_cpu;
7405 }
7406 }
7407 cpumask_andnot(dstp: cpus, src1p: cpus, src2p: sched_group_span(sg));
7408 }
7409 }
7410
7411 for_each_cpu_wrap(cpu, cpus, target + 1) {
7412 if (has_idle_core) {
7413 i = select_idle_core(p, core: cpu, cpus, idle_cpu: &idle_cpu);
7414 if ((unsigned int)i < nr_cpumask_bits)
7415 return i;
7416
7417 } else {
7418 if (--nr <= 0)
7419 return -1;
7420 idle_cpu = __select_idle_cpu(cpu, p);
7421 if ((unsigned int)idle_cpu < nr_cpumask_bits)
7422 break;
7423 }
7424 }
7425
7426 if (has_idle_core)
7427 set_idle_cores(cpu: target, val: false);
7428
7429 return idle_cpu;
7430}
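/*
 * Rough example of the SIS_UTIL limit above: if sd_share->nr_idle_scan is 4,
 * @nr starts at 5 and the "--nr <= 0" checks let at most four CPUs be probed
 * before the scan gives up and returns -1. An nr_idle_scan of 0 (nr == 1)
 * means the LLC is considered too busy to be worth scanning at all.
 */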
7431
7432/*
7433 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
7434 * the task fits. If no CPU is big enough, but there are idle ones, try to
7435 * maximize capacity.
7436 */
7437static int
7438select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
7439{
7440 unsigned long task_util, util_min, util_max, best_cap = 0;
7441 int fits, best_fits = 0;
7442 int cpu, best_cpu = -1;
7443 struct cpumask *cpus;
7444
7445 cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7446 cpumask_and(dstp: cpus, src1p: sched_domain_span(sd), src2p: p->cpus_ptr);
7447
7448 task_util = task_util_est(p);
7449 util_min = uclamp_eff_value(p, clamp_id: UCLAMP_MIN);
7450 util_max = uclamp_eff_value(p, clamp_id: UCLAMP_MAX);
7451
7452 for_each_cpu_wrap(cpu, cpus, target) {
7453 unsigned long cpu_cap = capacity_of(cpu);
7454
7455 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
7456 continue;
7457
7458 fits = util_fits_cpu(util: task_util, uclamp_min: util_min, uclamp_max: util_max, cpu);
7459
7460 /* This CPU fits with all requirements */
7461 if (fits > 0)
7462 return cpu;
7463 /*
7464 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
7465 * Look for the CPU with best capacity.
7466 */
7467 else if (fits < 0)
7468 cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
7469
7470 /*
7471 * First, select CPU which fits better (-1 being better than 0).
7472 * Then, select the one with best capacity at same level.
7473 */
7474 if ((fits < best_fits) ||
7475 ((fits == best_fits) && (cpu_cap > best_cap))) {
7476 best_cap = cpu_cap;
7477 best_cpu = cpu;
7478 best_fits = fits;
7479 }
7480 }
7481
7482 return best_cpu;
7483}
7484
7485static inline bool asym_fits_cpu(unsigned long util,
7486 unsigned long util_min,
7487 unsigned long util_max,
7488 int cpu)
7489{
7490 if (sched_asym_cpucap_active())
7491 /*
7492 * Return true only if the cpu fully fits the task requirements
7493 * which include the utilization and the performance hints.
7494 */
7495 return (util_fits_cpu(util, uclamp_min: util_min, uclamp_max: util_max, cpu) > 0);
7496
7497 return true;
7498}
7499
7500/*
7501 * Try and locate an idle core/thread in the LLC cache domain.
7502 */
7503static int select_idle_sibling(struct task_struct *p, int prev, int target)
7504{
7505 bool has_idle_core = false;
7506 struct sched_domain *sd;
7507 unsigned long task_util, util_min, util_max;
7508 int i, recent_used_cpu, prev_aff = -1;
7509
7510 /*
7511	 * On asymmetric systems, update the task utilization because we will check
7512	 * that the task fits the CPU's capacity.
7513 */
7514 if (sched_asym_cpucap_active()) {
7515 sync_entity_load_avg(se: &p->se);
7516 task_util = task_util_est(p);
7517 util_min = uclamp_eff_value(p, clamp_id: UCLAMP_MIN);
7518 util_max = uclamp_eff_value(p, clamp_id: UCLAMP_MAX);
7519 }
7520
7521 /*
7522 * per-cpu select_rq_mask usage
7523 */
7524 lockdep_assert_irqs_disabled();
7525
7526 if ((available_idle_cpu(cpu: target) || sched_idle_cpu(cpu: target)) &&
7527 asym_fits_cpu(util: task_util, util_min, util_max, cpu: target))
7528 return target;
7529
7530 /*
7531 * If the previous CPU is cache affine and idle, don't be stupid:
7532 */
7533 if (prev != target && cpus_share_cache(this_cpu: prev, that_cpu: target) &&
7534 (available_idle_cpu(cpu: prev) || sched_idle_cpu(cpu: prev)) &&
7535 asym_fits_cpu(util: task_util, util_min, util_max, cpu: prev)) {
7536
7537 if (!static_branch_unlikely(&sched_cluster_active) ||
7538 cpus_share_resources(this_cpu: prev, that_cpu: target))
7539 return prev;
7540
7541 prev_aff = prev;
7542 }
7543
7544 /*
7545 * Allow a per-cpu kthread to stack with the wakee if the
7546 * kworker thread and the tasks previous CPUs are the same.
7547 * The assumption is that the wakee queued work for the
7548 * per-cpu kthread that is now complete and the wakeup is
7549 * essentially a sync wakeup. An obvious example of this
7550 * pattern is IO completions.
7551 */
7552 if (is_per_cpu_kthread(current) &&
7553 in_task() &&
7554 prev == smp_processor_id() &&
7555 this_rq()->nr_running <= 1 &&
7556 asym_fits_cpu(util: task_util, util_min, util_max, cpu: prev)) {
7557 return prev;
7558 }
7559
7560 /* Check a recently used CPU as a potential idle candidate: */
7561 recent_used_cpu = p->recent_used_cpu;
7562 p->recent_used_cpu = prev;
7563 if (recent_used_cpu != prev &&
7564 recent_used_cpu != target &&
7565 cpus_share_cache(this_cpu: recent_used_cpu, that_cpu: target) &&
7566 (available_idle_cpu(cpu: recent_used_cpu) || sched_idle_cpu(cpu: recent_used_cpu)) &&
7567 cpumask_test_cpu(cpu: recent_used_cpu, cpumask: p->cpus_ptr) &&
7568 asym_fits_cpu(util: task_util, util_min, util_max, cpu: recent_used_cpu)) {
7569
7570 if (!static_branch_unlikely(&sched_cluster_active) ||
7571 cpus_share_resources(this_cpu: recent_used_cpu, that_cpu: target))
7572 return recent_used_cpu;
7573
7574 } else {
7575 recent_used_cpu = -1;
7576 }
7577
7578 /*
7579 * For asymmetric CPU capacity systems, our domain of interest is
7580 * sd_asym_cpucapacity rather than sd_llc.
7581 */
7582 if (sched_asym_cpucap_active()) {
7583 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
7584 /*
7585 * On an asymmetric CPU capacity system where an exclusive
7586 * cpuset defines a symmetric island (i.e. one unique
7587 * capacity_orig value through the cpuset), the key will be set
7588 * but the CPUs within that cpuset will not have a domain with
7589 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
7590 * capacity path.
7591 */
7592 if (sd) {
7593 i = select_idle_capacity(p, sd, target);
7594 return ((unsigned)i < nr_cpumask_bits) ? i : target;
7595 }
7596 }
7597
7598 sd = rcu_dereference(per_cpu(sd_llc, target));
7599 if (!sd)
7600 return target;
7601
7602 if (sched_smt_active()) {
7603 has_idle_core = test_idle_cores(cpu: target);
7604
7605 if (!has_idle_core && cpus_share_cache(this_cpu: prev, that_cpu: target)) {
7606 i = select_idle_smt(p, sd, target: prev);
7607 if ((unsigned int)i < nr_cpumask_bits)
7608 return i;
7609 }
7610 }
7611
7612 i = select_idle_cpu(p, sd, has_idle_core, target);
7613 if ((unsigned)i < nr_cpumask_bits)
7614 return i;
7615
7616 /*
7617	 * For cluster machines which have a lower-level shared cache (such as L2 or
7618	 * the LLC tag), we tend to find an idle CPU in the target's cluster
7619	 * first. But prev_cpu or recent_used_cpu may also be good candidates;
7620	 * use them if select_idle_cpu() found no idle CPU.
7621 */
7622 if ((unsigned int)prev_aff < nr_cpumask_bits)
7623 return prev_aff;
7624 if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
7625 return recent_used_cpu;
7626
7627 return target;
7628}
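/*
 * Informal summary of the fallback order in select_idle_sibling(): the wakeup
 * target itself, then a cache-affine and idle prev CPU, then the special
 * per-cpu kthread stacking case, then recent_used_cpu, then (on asymmetric
 * capacity systems) select_idle_capacity(), then the SMT/core/CPU scans, and
 * finally the cluster-deferred prev/recent candidates before falling back to
 * @target.
 */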
7629
7630/**
7631 * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
7632 * @cpu: the CPU to get the utilization for
7633 * @p: task for which the CPU utilization should be predicted or NULL
7634 * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
7635 * @boost: 1 to enable boosting, otherwise 0
7636 *
7637 * The unit of the return value must be the same as the one of CPU capacity
7638 * so that CPU utilization can be compared with CPU capacity.
7639 *
7640 * CPU utilization is the sum of running time of runnable tasks plus the
7641 * recent utilization of currently non-runnable tasks on that CPU.
7642 * It represents the amount of CPU capacity currently used by CFS tasks in
7643 * the range [0..max CPU capacity] with max CPU capacity being the CPU
7644 * capacity at f_max.
7645 *
7646 * The estimated CPU utilization is defined as the maximum between CPU
7647 * utilization and sum of the estimated utilization of the currently
7648 * runnable tasks on that CPU. It preserves a utilization "snapshot" of
7649 * previously-executed tasks, which helps better deduce how busy a CPU will
7650 * be when a long-sleeping task wakes up. The contribution to CPU utilization
7651 * of such a task would be significantly decayed at this point of time.
7652 *
7653 * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
7654 * CPU contention for CFS tasks can be detected by CPU runnable > CPU
7655 * utilization. Boosting is implemented in cpu_util() so that internal
7656 * users (e.g. EAS) can use it next to external users (e.g. schedutil),
7657 * the latter via cpu_util_cfs_boost().
7658 *
7659 * CPU utilization can be higher than the current CPU capacity
7660 * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
7661 * of rounding errors as well as task migrations or wakeups of new tasks.
7662 * CPU utilization has to be capped to fit into the [0..max CPU capacity]
7663 * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
7664 * could be seen as over-utilized even though CPU1 has 20% of spare CPU
7665 * capacity. CPU utilization is allowed to overshoot current CPU capacity
7666 * though since this is useful for predicting the CPU capacity required
7667 * after task migrations (scheduler-driven DVFS).
7668 *
7669 * Return: (Boosted) (estimated) utilization for the specified CPU.
7670 */
7671static unsigned long
7672cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
7673{
7674 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
7675 unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
7676 unsigned long runnable;
7677
7678 if (boost) {
7679 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7680 util = max(util, runnable);
7681 }
7682
7683 /*
7684 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
7685 * contribution. If @p migrates from another CPU to @cpu add its
7686 * contribution. In all the other cases @cpu is not impacted by the
7687 * migration so its util_avg is already correct.
7688 */
7689 if (p && task_cpu(p) == cpu && dst_cpu != cpu)
7690 lsub_positive(&util, task_util(p));
7691 else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
7692 util += task_util(p);
7693
7694 if (sched_feat(UTIL_EST)) {
7695 unsigned long util_est;
7696
7697 util_est = READ_ONCE(cfs_rq->avg.util_est);
7698
7699 /*
7700 * During wake-up @p isn't enqueued yet and doesn't contribute
7701 * to any cpu_rq(cpu)->cfs.avg.util_est.
7702 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
7703 * has been enqueued.
7704 *
7705 * During exec (@dst_cpu = -1) @p is enqueued and does
7706 * contribute to cpu_rq(cpu)->cfs.util_est.
7707 * Remove it to "simulate" cpu_util without @p's contribution.
7708 *
7709 * Despite the task_on_rq_queued(@p) check there is still a
7710 * small window for a possible race when an exec
7711 * select_task_rq_fair() races with LB's detach_task().
7712 *
7713 * detach_task()
7714 * deactivate_task()
7715 * p->on_rq = TASK_ON_RQ_MIGRATING;
7716 * -------------------------------- A
7717 * dequeue_task() \
7718 * dequeue_task_fair() + Race Time
7719 * util_est_dequeue() /
7720 * -------------------------------- B
7721 *
7722 * The additional check "current == p" is required to further
7723 * reduce the race window.
7724 */
7725 if (dst_cpu == cpu)
7726 util_est += _task_util_est(p);
7727 else if (p && unlikely(task_on_rq_queued(p) || current == p))
7728 lsub_positive(&util_est, _task_util_est(p));
7729
7730 util = max(util, util_est);
7731 }
7732
7733 return min(util, arch_scale_cpu_capacity(cpu));
7734}
7735
7736unsigned long cpu_util_cfs(int cpu)
7737{
7738 return cpu_util(cpu, NULL, dst_cpu: -1, boost: 0);
7739}
7740
7741unsigned long cpu_util_cfs_boost(int cpu)
7742{
7743 return cpu_util(cpu, NULL, dst_cpu: -1, boost: 1);
7744}
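/*
 * Worked example with made-up numbers, assuming the UTIL_EST scheduler
 * feature is enabled: a CPU with util_avg = 300, runnable_avg = 450 and
 * util_est = 350 (and ample capacity) yields cpu_util_cfs() == max(300, 350)
 * == 350, while cpu_util_cfs_boost() yields max(max(300, 450), 350) == 450;
 * both results are finally clamped to arch_scale_cpu_capacity().
 */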
7745
7746/*
7747 * cpu_util_without: compute cpu utilization without any contributions from *p
7748 * @cpu: the CPU which utilization is requested
7749 * @p: the task which utilization should be discounted
7750 *
7751 * The utilization of a CPU is defined by the utilization of tasks currently
7752 * enqueued on that CPU as well as tasks which are currently sleeping after an
7753 * execution on that CPU.
7754 *
7755 * This method returns the utilization of the specified CPU by discounting the
7756 * utilization of the specified task, whenever the task is currently
7757 * contributing to the CPU utilization.
7758 */
7759static unsigned long cpu_util_without(int cpu, struct task_struct *p)
7760{
7761 /* Task has no contribution or is new */
7762 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7763 p = NULL;
7764
7765 return cpu_util(cpu, p, dst_cpu: -1, boost: 0);
7766}
7767
7768/*
7769 * energy_env - Utilization landscape for energy estimation.
7770 * @task_busy_time: Utilization contribution by the task for which we test the
7771 * placement. Given by eenv_task_busy_time().
7772 * @pd_busy_time: Utilization of the whole perf domain without the task
7773 * contribution. Given by eenv_pd_busy_time().
7774 * @cpu_cap: Maximum CPU capacity for the perf domain.
7775 * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
7776 */
7777struct energy_env {
7778 unsigned long task_busy_time;
7779 unsigned long pd_busy_time;
7780 unsigned long cpu_cap;
7781 unsigned long pd_cap;
7782};
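/*
 * For illustration: a performance domain of 4 CPUs whose thermally capped
 * capacity (cpu_cap) is 800 ends up with pd_cap = 4 * 800 = 3200, which is
 * the ceiling applied to pd_busy_time in eenv_pd_busy_time() below.
 */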
7783
7784/*
7785 * Compute the task busy time for compute_energy(). This time cannot be
7786 * injected directly into effective_cpu_util() because of the IRQ scaling.
7787 * The latter only makes sense with the most recent CPUs where the task has
7788 * run.
7789 */
7790static inline void eenv_task_busy_time(struct energy_env *eenv,
7791 struct task_struct *p, int prev_cpu)
7792{
7793 unsigned long busy_time, max_cap = arch_scale_cpu_capacity(cpu: prev_cpu);
7794 unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
7795
7796 if (unlikely(irq >= max_cap))
7797 busy_time = max_cap;
7798 else
7799 busy_time = scale_irq_capacity(util: task_util_est(p), irq, max: max_cap);
7800
7801 eenv->task_busy_time = busy_time;
7802}
7803
7804/*
7805 * Compute the perf_domain (PD) busy time for compute_energy(). It is based on
7806 * the utilization of each CPU in @pd_cpus, but it doesn't take clamping into
7807 * account since the ratio (utilization / cpu_capacity) is already enough to
7808 * scale the EM-reported power consumption at the (eventually clamped)
7809 * cpu_capacity.
7810 *
7811 * The contribution of the task @p for which we want to estimate the
7812 * energy cost is removed (by cpu_util()) and must be calculated
7813 * separately (see eenv_task_busy_time). This ensures:
7814 *
7815 * - A stable PD utilization, no matter which CPU of that PD we want to place
7816 * the task on.
7817 *
7818 * - A fair comparison between CPUs as the task contribution (task_util())
7819 * will always be the same no matter which CPU utilization we rely on
7820 * (util_avg or util_est).
7821 *
7822 * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
7823 * exceed @eenv->pd_cap.
7824 */
7825static inline void eenv_pd_busy_time(struct energy_env *eenv,
7826 struct cpumask *pd_cpus,
7827 struct task_struct *p)
7828{
7829 unsigned long busy_time = 0;
7830 int cpu;
7831
7832 for_each_cpu(cpu, pd_cpus) {
7833 unsigned long util = cpu_util(cpu, p, dst_cpu: -1, boost: 0);
7834
7835 busy_time += effective_cpu_util(cpu, util_cfs: util, NULL, NULL);
7836 }
7837
7838 eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
7839}
7840
7841/*
7842 * Compute the maximum utilization for compute_energy() when the task @p
7843 * is placed on the cpu @dst_cpu.
7844 *
7845 * Returns the maximum utilization among @eenv->cpus. This utilization can't
7846 * exceed @eenv->cpu_cap.
7847 */
7848static inline unsigned long
7849eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
7850 struct task_struct *p, int dst_cpu)
7851{
7852 unsigned long max_util = 0;
7853 int cpu;
7854
7855 for_each_cpu(cpu, pd_cpus) {
7856 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
7857 unsigned long util = cpu_util(cpu, p, dst_cpu, boost: 1);
7858 unsigned long eff_util, min, max;
7859
7860 /*
7861 * Performance domain frequency: utilization clamping
7862 * must be considered since it affects the selection
7863 * of the performance domain frequency.
7864 * NOTE: in case RT tasks are running, by default the
7865 * FREQUENCY_UTIL's utilization can be max OPP.
7866 */
7867 eff_util = effective_cpu_util(cpu, util_cfs: util, min: &min, max: &max);
7868
7869 /* Task's uclamp can modify min and max value */
7870 if (tsk && uclamp_is_used()) {
7871 min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
7872
7873 /*
7874 * If there is no active max uclamp constraint,
7875 * directly use task's one, otherwise keep max.
7876 */
7877 if (uclamp_rq_is_idle(cpu_rq(cpu)))
7878 max = uclamp_eff_value(p, clamp_id: UCLAMP_MAX);
7879 else
7880 max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
7881 }
7882
7883 eff_util = sugov_effective_cpu_perf(cpu, actual: eff_util, min, max);
7884 max_util = max(max_util, eff_util);
7885 }
7886
7887 return min(max_util, eenv->cpu_cap);
7888}
7889
7890/*
7891 * compute_energy(): Use the Energy Model to estimate the energy that @pd would
7892 * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
7893 * contribution is ignored.
7894 */
7895static inline unsigned long
7896compute_energy(struct energy_env *eenv, struct perf_domain *pd,
7897 struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
7898{
7899 unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
7900 unsigned long busy_time = eenv->pd_busy_time;
7901 unsigned long energy;
7902
7903 if (dst_cpu >= 0)
7904 busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
7905
7906 energy = em_cpu_energy(pd: pd->em_pd, max_util, sum_util: busy_time, allowed_cpu_cap: eenv->cpu_cap);
7907
7908 trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
7909
7910 return energy;
7911}
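/*
 * Typical usage, as in find_energy_efficient_cpu() below: compute_energy() is
 * called once with dst_cpu == -1 to get the 'base' energy of the PD without
 * @p, and once per candidate CPU; the difference between the two is the
 * energy delta attributed to placing @p on that candidate.
 */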
7912
7913/*
7914 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
7915 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
7916 * spare capacity in each performance domain and uses it as a potential
7917 * candidate to execute the task. Then, it uses the Energy Model to figure
7918 * out which of the CPU candidates is the most energy-efficient.
7919 *
7920 * The rationale for this heuristic is as follows. In a performance domain,
7921 * all the most energy efficient CPU candidates (according to the Energy
7922 * Model) are those for which we'll request a low frequency. When there are
7923 * several CPUs for which the frequency request will be the same, we don't
7924 * have enough data to break the tie between them, because the Energy Model
7925 * only includes active power costs. With this model, if we assume that
7926 * frequency requests follow utilization (e.g. using schedutil), the CPU with
7927 * the maximum spare capacity in a performance domain is guaranteed to be among
7928 * the best candidates of the performance domain.
7929 *
7930 * In practice, it could be preferable from an energy standpoint to pack
7931 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
7932 * but that could also hurt our chances to go cluster idle, and we have no
7933 * way to tell with the current Energy Model if this is actually a good
7934 * idea or not. So, find_energy_efficient_cpu() basically favors
7935 * cluster-packing, and spreading inside a cluster. That should at least be
7936 * a good thing for latency, and this is consistent with the idea that most
7937 * of the energy savings of EAS come from the asymmetry of the system, and
7938 * not so much from breaking the tie between identical CPUs. That's also the
7939 * reason why EAS is enabled in the topology code only for systems where
7940 * SD_ASYM_CPUCAPACITY is set.
7941 *
7942 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
7943 * they don't have any useful utilization data yet and it's not possible to
7944 * forecast their impact on energy consumption. Consequently, they will be
7945 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
7946 * to be energy-inefficient in some use-cases. The alternative would be to
7947 * bias new tasks towards specific types of CPUs first, or to try to infer
7948 * their util_avg from the parent task, but those heuristics could hurt
7949 * other use-cases too. So, until someone finds a better way to solve this,
7950 * let's keep things simple by re-using the existing slow path.
7951 */
7952static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
7953{
7954 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7955 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
7956 unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, clamp_id: UCLAMP_MIN) : 0;
7957 unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, clamp_id: UCLAMP_MAX) : 1024;
7958 struct root_domain *rd = this_rq()->rd;
7959 int cpu, best_energy_cpu, target = -1;
7960 int prev_fits = -1, best_fits = -1;
7961 unsigned long best_thermal_cap = 0;
7962 unsigned long prev_thermal_cap = 0;
7963 struct sched_domain *sd;
7964 struct perf_domain *pd;
7965 struct energy_env eenv;
7966
7967 rcu_read_lock();
7968 pd = rcu_dereference(rd->pd);
7969 if (!pd || READ_ONCE(rd->overutilized))
7970 goto unlock;
7971
7972 /*
7973 * Energy-aware wake-up happens on the lowest sched_domain starting
7974 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
7975 */
7976 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
7977 while (sd && !cpumask_test_cpu(cpu: prev_cpu, cpumask: sched_domain_span(sd)))
7978 sd = sd->parent;
7979 if (!sd)
7980 goto unlock;
7981
7982 target = prev_cpu;
7983
7984 sync_entity_load_avg(se: &p->se);
7985 if (!task_util_est(p) && p_util_min == 0)
7986 goto unlock;
7987
7988 eenv_task_busy_time(eenv: &eenv, p, prev_cpu);
7989
7990 for (; pd; pd = pd->next) {
7991 unsigned long util_min = p_util_min, util_max = p_util_max;
7992 unsigned long cpu_cap, cpu_thermal_cap, util;
7993 long prev_spare_cap = -1, max_spare_cap = -1;
7994 unsigned long rq_util_min, rq_util_max;
7995 unsigned long cur_delta, base_energy;
7996 int max_spare_cap_cpu = -1;
7997 int fits, max_fits = -1;
7998
7999 cpumask_and(dstp: cpus, perf_domain_span(pd), cpu_online_mask);
8000
8001 if (cpumask_empty(srcp: cpus))
8002 continue;
8003
8004 /* Account thermal pressure for the energy estimation */
8005 cpu = cpumask_first(srcp: cpus);
8006 cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
8007 cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
8008
8009 eenv.cpu_cap = cpu_thermal_cap;
8010 eenv.pd_cap = 0;
8011
8012 for_each_cpu(cpu, cpus) {
8013 struct rq *rq = cpu_rq(cpu);
8014
8015 eenv.pd_cap += cpu_thermal_cap;
8016
8017 if (!cpumask_test_cpu(cpu, cpumask: sched_domain_span(sd)))
8018 continue;
8019
8020 if (!cpumask_test_cpu(cpu, cpumask: p->cpus_ptr))
8021 continue;
8022
8023 util = cpu_util(cpu, p, dst_cpu: cpu, boost: 0);
8024 cpu_cap = capacity_of(cpu);
8025
8026 /*
8027 * Skip CPUs that cannot satisfy the capacity request.
8028 * IOW, placing the task there would make the CPU
8029 * overutilized. Take uclamp into account to see how
8030 * much capacity we can get out of the CPU; this is
8031 * aligned with sched_cpu_util().
8032 */
8033 if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
8034 /*
8035 * Open code uclamp_rq_util_with() except for
8036 * the clamp() part. Ie: apply max aggregation
8037 * only. util_fits_cpu() logic requires to
8038 * operate on non clamped util but must use the
8039 * max-aggregated uclamp_{min, max}.
8040 */
8041 rq_util_min = uclamp_rq_get(rq, clamp_id: UCLAMP_MIN);
8042 rq_util_max = uclamp_rq_get(rq, clamp_id: UCLAMP_MAX);
8043
8044 util_min = max(rq_util_min, p_util_min);
8045 util_max = max(rq_util_max, p_util_max);
8046 }
8047
8048 fits = util_fits_cpu(util, uclamp_min: util_min, uclamp_max: util_max, cpu);
8049 if (!fits)
8050 continue;
8051
8052 lsub_positive(&cpu_cap, util);
8053
8054 if (cpu == prev_cpu) {
8055 /* Always use prev_cpu as a candidate. */
8056 prev_spare_cap = cpu_cap;
8057 prev_fits = fits;
8058 } else if ((fits > max_fits) ||
8059 ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
8060 /*
8061 * Find the CPU with the maximum spare capacity
8062 * among the remaining CPUs in the performance
8063 * domain.
8064 */
8065 max_spare_cap = cpu_cap;
8066 max_spare_cap_cpu = cpu;
8067 max_fits = fits;
8068 }
8069 }
8070
8071 if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
8072 continue;
8073
8074 eenv_pd_busy_time(eenv: &eenv, pd_cpus: cpus, p);
8075 /* Compute the 'base' energy of the pd, without @p */
8076 base_energy = compute_energy(eenv: &eenv, pd, pd_cpus: cpus, p, dst_cpu: -1);
8077
8078 /* Evaluate the energy impact of using prev_cpu. */
8079 if (prev_spare_cap > -1) {
8080 prev_delta = compute_energy(eenv: &eenv, pd, pd_cpus: cpus, p,
8081 dst_cpu: prev_cpu);
8082 /* CPU utilization has changed */
8083 if (prev_delta < base_energy)
8084 goto unlock;
8085 prev_delta -= base_energy;
8086 prev_thermal_cap = cpu_thermal_cap;
8087 best_delta = min(best_delta, prev_delta);
8088 }
8089
8090 /* Evaluate the energy impact of using max_spare_cap_cpu. */
8091 if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
8092 /* Current best energy cpu fits better */
8093 if (max_fits < best_fits)
8094 continue;
8095
8096 /*
8097 * Both don't fit performance hint (i.e. uclamp_min)
8098 * but best energy cpu has better capacity.
8099 */
8100 if ((max_fits < 0) &&
8101 (cpu_thermal_cap <= best_thermal_cap))
8102 continue;
8103
8104 cur_delta = compute_energy(eenv: &eenv, pd, pd_cpus: cpus, p,
8105 dst_cpu: max_spare_cap_cpu);
8106 /* CPU utilization has changed */
8107 if (cur_delta < base_energy)
8108 goto unlock;
8109 cur_delta -= base_energy;
8110
8111 /*
8112 * Both fit for the task but best energy cpu has lower
8113 * energy impact.
8114 */
8115 if ((max_fits > 0) && (best_fits > 0) &&
8116 (cur_delta >= best_delta))
8117 continue;
8118
8119 best_delta = cur_delta;
8120 best_energy_cpu = max_spare_cap_cpu;
8121 best_fits = max_fits;
8122 best_thermal_cap = cpu_thermal_cap;
8123 }
8124 }
8125 rcu_read_unlock();
8126
8127 if ((best_fits > prev_fits) ||
8128 ((best_fits > 0) && (best_delta < prev_delta)) ||
8129 ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
8130 target = best_energy_cpu;
8131
8132 return target;
8133
8134unlock:
8135 rcu_read_unlock();
8136
8137 return target;
8138}
8139
8140/*
8141 * select_task_rq_fair: Select target runqueue for the waking task in domains
8142 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
8143 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
8144 *
8145 * Balances load by selecting the idlest CPU in the idlest group, or under
8146 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
8147 *
8148 * Returns the target CPU number.
8149 */
8150static int
8151select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
8152{
8153 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
8154 struct sched_domain *tmp, *sd = NULL;
8155 int cpu = smp_processor_id();
8156 int new_cpu = prev_cpu;
8157 int want_affine = 0;
8158 /* SD_flags and WF_flags share the first nibble */
8159 int sd_flag = wake_flags & 0xF;
8160
8161 /*
8162 * required for stable ->cpus_allowed
8163 */
8164 lockdep_assert_held(&p->pi_lock);
8165 if (wake_flags & WF_TTWU) {
8166 record_wakee(p);
8167
8168 if ((wake_flags & WF_CURRENT_CPU) &&
8169 cpumask_test_cpu(cpu, cpumask: p->cpus_ptr))
8170 return cpu;
8171
8172 if (sched_energy_enabled()) {
8173 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
8174 if (new_cpu >= 0)
8175 return new_cpu;
8176 new_cpu = prev_cpu;
8177 }
8178
8179 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, cpumask: p->cpus_ptr);
8180 }
8181
8182 rcu_read_lock();
8183 for_each_domain(cpu, tmp) {
8184 /*
8185 * If both 'cpu' and 'prev_cpu' are part of this domain,
8186 * cpu is a valid SD_WAKE_AFFINE target.
8187 */
8188 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
8189 cpumask_test_cpu(cpu: prev_cpu, cpumask: sched_domain_span(sd: tmp))) {
8190 if (cpu != prev_cpu)
8191 new_cpu = wake_affine(sd: tmp, p, this_cpu: cpu, prev_cpu, sync);
8192
8193 sd = NULL; /* Prefer wake_affine over balance flags */
8194 break;
8195 }
8196
8197 /*
8198 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
8199 * usually do not have SD_BALANCE_WAKE set. That means wakeup
8200 * will usually go to the fast path.
8201 */
8202 if (tmp->flags & sd_flag)
8203 sd = tmp;
8204 else if (!want_affine)
8205 break;
8206 }
8207
8208 if (unlikely(sd)) {
8209 /* Slow path */
8210 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
8211 } else if (wake_flags & WF_TTWU) { /* XXX always ? */
8212 /* Fast path */
8213 new_cpu = select_idle_sibling(p, prev: prev_cpu, target: new_cpu);
8214 }
8215 rcu_read_unlock();
8216
8217 return new_cpu;
8218}
8219
8220/*
8221 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
8222 * cfs_rq_of(p) references at time of call are still valid and identify the
8223 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
8224 */
8225static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
8226{
8227 struct sched_entity *se = &p->se;
8228
8229 if (!task_on_rq_migrating(p)) {
8230 remove_entity_load_avg(se);
8231
8232 /*
8233 * Here, the task's PELT values have been updated according to
8234 * the current rq's clock. But if that clock hasn't been
8235 * updated in a while, a substantial idle time will be missed,
8236 * leading to an inflation after wake-up on the new rq.
8237 *
8238 * Estimate the missing time from the cfs_rq last_update_time
8239 * and update sched_avg to improve the PELT continuity after
8240 * migration.
8241 */
8242 migrate_se_pelt_lag(se);
8243 }
8244
8245 /* Tell new CPU we are migrated */
8246 se->avg.last_update_time = 0;
8247
8248 update_scan_period(p, new_cpu);
8249}
8250
8251static void task_dead_fair(struct task_struct *p)
8252{
8253 remove_entity_load_avg(se: &p->se);
8254}
8255
8256static int
8257balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8258{
8259 if (rq->nr_running)
8260 return 1;
8261
8262 return newidle_balance(this_rq: rq, rf) != 0;
8263}
8264#endif /* CONFIG_SMP */
8265
8266static void set_next_buddy(struct sched_entity *se)
8267{
8268 for_each_sched_entity(se) {
8269 if (SCHED_WARN_ON(!se->on_rq))
8270 return;
8271 if (se_is_idle(se))
8272 return;
8273 cfs_rq_of(se)->next = se;
8274 }
8275}
8276
8277/*
8278 * Preempt the current task with a newly woken task if needed:
8279 */
8280static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
8281{
8282 struct task_struct *curr = rq->curr;
8283 struct sched_entity *se = &curr->se, *pse = &p->se;
8284 struct cfs_rq *cfs_rq = task_cfs_rq(p: curr);
8285 int cse_is_idle, pse_is_idle;
8286
8287 if (unlikely(se == pse))
8288 return;
8289
8290 /*
8291 * This is possible from callers such as attach_tasks(), in which we
8292 * unconditionally wakeup_preempt() after an enqueue (which may have
8293	 * led to a throttle). This both saves work and prevents false
8294 * next-buddy nomination below.
8295 */
8296 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8297 return;
8298
8299 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
8300 set_next_buddy(pse);
8301 }
8302
8303 /*
8304 * We can come here with TIF_NEED_RESCHED already set from new task
8305 * wake up path.
8306 *
8307 * Note: this also catches the edge-case of curr being in a throttled
8308 * group (e.g. via set_curr_task), since update_curr() (in the
8309 * enqueue of curr) will have resulted in resched being set. This
8310 * prevents us from potentially nominating it as a false LAST_BUDDY
8311 * below.
8312 */
8313 if (test_tsk_need_resched(tsk: curr))
8314 return;
8315
8316 /* Idle tasks are by definition preempted by non-idle tasks. */
8317 if (unlikely(task_has_idle_policy(curr)) &&
8318 likely(!task_has_idle_policy(p)))
8319 goto preempt;
8320
8321 /*
8322 * Batch and idle tasks do not preempt non-idle tasks (their preemption
8323 * is driven by the tick):
8324 */
8325 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
8326 return;
8327
8328 find_matching_se(se: &se, pse: &pse);
8329 WARN_ON_ONCE(!pse);
8330
8331 cse_is_idle = se_is_idle(se);
8332 pse_is_idle = se_is_idle(se: pse);
8333
8334 /*
8335 * Preempt an idle group in favor of a non-idle group (and don't preempt
8336 * in the inverse case).
8337 */
8338 if (cse_is_idle && !pse_is_idle)
8339 goto preempt;
8340 if (cse_is_idle != pse_is_idle)
8341 return;
8342
8343 cfs_rq = cfs_rq_of(se);
8344 update_curr(cfs_rq);
8345
8346 /*
8347 * XXX pick_eevdf(cfs_rq) != se ?
8348 */
8349 if (pick_eevdf(cfs_rq) == pse)
8350 goto preempt;
8351
8352 return;
8353
8354preempt:
8355 resched_curr(rq);
8356}
8357
8358#ifdef CONFIG_SMP
8359static struct task_struct *pick_task_fair(struct rq *rq)
8360{
8361 struct sched_entity *se;
8362 struct cfs_rq *cfs_rq;
8363
8364again:
8365 cfs_rq = &rq->cfs;
8366 if (!cfs_rq->nr_running)
8367 return NULL;
8368
8369 do {
8370 struct sched_entity *curr = cfs_rq->curr;
8371
8372 /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
8373 if (curr) {
8374 if (curr->on_rq)
8375 update_curr(cfs_rq);
8376 else
8377 curr = NULL;
8378
8379 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8380 goto again;
8381 }
8382
8383 se = pick_next_entity(cfs_rq);
8384 cfs_rq = group_cfs_rq(grp: se);
8385 } while (cfs_rq);
8386
8387 return task_of(se);
8388}
8389#endif
8390
8391struct task_struct *
8392pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8393{
8394 struct cfs_rq *cfs_rq = &rq->cfs;
8395 struct sched_entity *se;
8396 struct task_struct *p;
8397 int new_tasks;
8398
8399again:
8400 if (!sched_fair_runnable(rq))
8401 goto idle;
8402
8403#ifdef CONFIG_FAIR_GROUP_SCHED
8404 if (!prev || prev->sched_class != &fair_sched_class)
8405 goto simple;
8406
8407 /*
8408 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8409 * likely that a next task is from the same cgroup as the current.
8410 *
8411 * Therefore attempt to avoid putting and setting the entire cgroup
8412 * hierarchy, only change the part that actually changes.
8413 */
8414
8415 do {
8416 struct sched_entity *curr = cfs_rq->curr;
8417
8418 /*
8419 * Since we got here without doing put_prev_entity() we also
8420 * have to consider cfs_rq->curr. If it is still a runnable
8421 * entity, update_curr() will update its vruntime, otherwise
8422 * forget we've ever seen it.
8423 */
8424 if (curr) {
8425 if (curr->on_rq)
8426 update_curr(cfs_rq);
8427 else
8428 curr = NULL;
8429
8430 /*
8431 * This call to check_cfs_rq_runtime() will do the
8432 * throttle and dequeue its entity in the parent(s).
8433 * Therefore the nr_running test will indeed
8434 * be correct.
8435 */
8436 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
8437 cfs_rq = &rq->cfs;
8438
8439 if (!cfs_rq->nr_running)
8440 goto idle;
8441
8442 goto simple;
8443 }
8444 }
8445
8446 se = pick_next_entity(cfs_rq);
8447 cfs_rq = group_cfs_rq(grp: se);
8448 } while (cfs_rq);
8449
8450 p = task_of(se);
8451
8452 /*
8453 * Since we haven't yet done put_prev_entity and if the selected task
8454 * is a different task than we started out with, try and touch the
8455 * least amount of cfs_rqs.
8456 */
8457 if (prev != p) {
8458 struct sched_entity *pse = &prev->se;
8459
8460 while (!(cfs_rq = is_same_group(se, pse))) {
8461 int se_depth = se->depth;
8462 int pse_depth = pse->depth;
8463
8464 if (se_depth <= pse_depth) {
8465 put_prev_entity(cfs_rq: cfs_rq_of(se: pse), prev: pse);
8466 pse = parent_entity(se: pse);
8467 }
8468 if (se_depth >= pse_depth) {
8469 set_next_entity(cfs_rq: cfs_rq_of(se), se);
8470 se = parent_entity(se);
8471 }
8472 }
8473
8474 put_prev_entity(cfs_rq, prev: pse);
8475 set_next_entity(cfs_rq, se);
8476 }
8477
8478 goto done;
8479simple:
8480#endif
8481 if (prev)
8482 put_prev_task(rq, prev);
8483
8484 do {
8485 se = pick_next_entity(cfs_rq);
8486 set_next_entity(cfs_rq, se);
8487 cfs_rq = group_cfs_rq(grp: se);
8488 } while (cfs_rq);
8489
8490 p = task_of(se);
8491
8492done: __maybe_unused;
8493#ifdef CONFIG_SMP
8494 /*
8495 * Move the next running task to the front of
8496 * the list, so our cfs_tasks list becomes MRU
8497 * one.
8498 */
8499 list_move(list: &p->se.group_node, head: &rq->cfs_tasks);
8500#endif
8501
8502 if (hrtick_enabled_fair(rq))
8503 hrtick_start_fair(rq, p);
8504
8505 update_misfit_status(p, rq);
8506 sched_fair_update_stop_tick(rq, p);
8507
8508 return p;
8509
8510idle:
8511 if (!rf)
8512 return NULL;
8513
8514 new_tasks = newidle_balance(this_rq: rq, rf);
8515
8516 /*
8517 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
8518 * possible for any higher priority task to appear. In that case we
8519 * must re-start the pick_next_entity() loop.
8520 */
8521 if (new_tasks < 0)
8522 return RETRY_TASK;
8523
8524 if (new_tasks > 0)
8525 goto again;
8526
8527 /*
8528 * rq is about to be idle, check if we need to update the
8529 * lost_idle_time of clock_pelt
8530 */
8531 update_idle_rq_clock_pelt(rq);
8532
8533 return NULL;
8534}
8535
8536static struct task_struct *__pick_next_task_fair(struct rq *rq)
8537{
8538 return pick_next_task_fair(rq, NULL, NULL);
8539}
8540
8541/*
8542 * Account for a descheduled task:
8543 */
8544static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
8545{
8546 struct sched_entity *se = &prev->se;
8547 struct cfs_rq *cfs_rq;
8548
8549 for_each_sched_entity(se) {
8550 cfs_rq = cfs_rq_of(se);
8551 put_prev_entity(cfs_rq, prev: se);
8552 }
8553}
8554
8555/*
8556 * sched_yield() is very simple
8557 */
8558static void yield_task_fair(struct rq *rq)
8559{
8560 struct task_struct *curr = rq->curr;
8561 struct cfs_rq *cfs_rq = task_cfs_rq(p: curr);
8562 struct sched_entity *se = &curr->se;
8563
8564 /*
8565 * Are we the only task in the tree?
8566 */
8567 if (unlikely(rq->nr_running == 1))
8568 return;
8569
8570 clear_buddies(cfs_rq, se);
8571
8572 update_rq_clock(rq);
8573 /*
8574 * Update run-time statistics of the 'current'.
8575 */
8576 update_curr(cfs_rq);
8577 /*
8578 * Tell update_rq_clock() that we've just updated,
8579 * so we don't do microscopic update in schedule()
8580 * and double the fastpath cost.
8581 */
8582 rq_clock_skip_update(rq);
8583
8584 se->deadline += calc_delta_fair(delta: se->slice, se);
8585}
8586
8587static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
8588{
8589 struct sched_entity *se = &p->se;
8590
8591 /* throttled hierarchies are not runnable */
8592 if (!se->on_rq || throttled_hierarchy(cfs_rq: cfs_rq_of(se)))
8593 return false;
8594
8595 /* Tell the scheduler that we'd really like pse to run next. */
8596 set_next_buddy(se);
8597
8598 yield_task_fair(rq);
8599
8600 return true;
8601}
8602
8603#ifdef CONFIG_SMP
8604/**************************************************
8605 * Fair scheduling class load-balancing methods.
8606 *
8607 * BASICS
8608 *
8609 * The purpose of load-balancing is to achieve the same basic fairness the
8610 * per-CPU scheduler provides, namely provide a proportional amount of compute
8611 * time to each task. This is expressed in the following equation:
8612 *
8613 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
8614 *
8615 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
8616 * W_i,0 is defined as:
8617 *
8618 * W_i,0 = \Sum_j w_i,j (2)
8619 *
8620 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
8621 * is derived from the nice value as per sched_prio_to_weight[].
8622 *
8623 * The weight average is an exponential decay average of the instantaneous
8624 * weight:
8625 *
8626 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
8627 *
8628 * C_i is the compute capacity of CPU i; typically it is the
8629 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8630 * can also include other factors [XXX].
8631 *
8632 * To achieve this balance we define a measure of imbalance which follows
8633 * directly from (1):
8634 *
8635 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
8636 *
8637 * We then move tasks around to minimize the imbalance. In the continuous
8638 * function space it is obvious this converges, in the discrete case we get
8639 * a few fun cases generally called infeasible weight scenarios.
8640 *
8641 * [XXX expand on:
8642 * - infeasible weights;
8643 * - local vs global optima in the discrete case. ]
8644 *
8645 *
8646 * SCHED DOMAINS
8647 *
8648 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8649 * for all i,j solution, we create a tree of CPUs that follows the hardware
8650 * topology where each level pairs two lower groups (or better). This results
8651 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
8652 * tree to only the first of the previous level and we decrease the frequency
8653 * of load-balance at each level inv. proportional to the number of CPUs in
8654 * the groups.
8655 *
8656 * This yields:
8657 *
8658 * log_2 n 1 n
8659 * \Sum { --- * --- * 2^i } = O(n) (5)
8660 * i = 0 2^i 2^i
8661 * `- size of each group
8662 * | | `- number of CPUs doing load-balance
8663 * | `- freq
8664 * `- sum over all levels
8665 *
8666 * Coupled with a limit on how many tasks we can migrate every balance pass,
8667 * this makes (5) the runtime complexity of the balancer.
8668 *
8669 * An important property here is that each CPU is still (indirectly) connected
8670 * to every other CPU in at most O(log n) steps:
8671 *
8672 * The adjacency matrix of the resulting graph is given by:
8673 *
8674 * log_2 n
8675 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
8676 * k = 0
8677 *
8678 * And you'll find that:
8679 *
8680 * A^(log_2 n)_i,j != 0 for all i,j (7)
8681 *
8682 * Showing there's indeed a path between every CPU in at most O(log n) steps.
8683 * The task movement gives a factor of O(m), giving a convergence complexity
8684 * of:
8685 *
8686 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
8687 *
8688 *
8689 * WORK CONSERVING
8690 *
8691 * In order to avoid CPUs going idle while there's still work to do, new idle
8692 * balancing is more aggressive and has the newly idle CPU iterate up the domain
8693 * tree itself instead of relying on other CPUs to bring it work.
8694 *
8695 * This adds some complexity to both (5) and (8) but it reduces the total idle
8696 * time.
8697 *
8698 * [XXX more?]
8699 *
8700 *
8701 * CGROUPS
8702 *
8703 * Cgroups make a horror show out of (2), instead of a simple sum we get:
8704 *
8705 * s_k,i
8706 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
8707 * S_k
8708 *
8709 * Where
8710 *
8711 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
8712 *
8713 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
8714 *
8715 * The big problem is S_k: it's a global sum needed to compute a local (W_i)
8716 * property.
8717 *
8718 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
8719 * rewrite all of this once again.]
8720 */
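/*
 * A small numeric illustration of (4), with invented weights: two CPUs of
 * equal capacity C = 1024, W_1 = 3072 (three nice-0 tasks) and W_2 = 1024
 * (one task) give avg(W/C) = 2, so imb_1,2 = max{2, 3} - min{2, 1} = 2.
 * Moving one nice-0 task from CPU1 to CPU2 makes both ratios equal to the
 * average and drives the imbalance to 0.
 */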
8721
8722static unsigned long __read_mostly max_load_balance_interval = HZ/10;
8723
8724enum fbq_type { regular, remote, all };
8725
8726/*
8727 * 'group_type' describes the group of CPUs at the moment of load balancing.
8728 *
8729 * The enum is ordered by pulling priority, with the group with lowest priority
8730 * first so the group_type can simply be compared when selecting the busiest
8731 * group. See update_sd_pick_busiest().
8732 */
8733enum group_type {
8734 /* The group has spare capacity that can be used to run more tasks. */
8735 group_has_spare = 0,
8736 /*
8737 * The group is fully used and the tasks don't compete for more CPU
8738 * cycles. Nevertheless, some tasks might wait before running.
8739 */
8740 group_fully_busy,
8741 /*
8742 * One task doesn't fit with CPU's capacity and must be migrated to a
8743 * more powerful CPU.
8744 */
8745 group_misfit_task,
8746 /*
8747	 * Balance SMT group that's fully busy. Can benefit from migrating
8748	 * a task on an SMT core with a busy sibling to another CPU on an idle core.
8749 */
8750 group_smt_balance,
8751 /*
8752 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
8753 * and the task should be migrated to it instead of running on the
8754 * current CPU.
8755 */
8756 group_asym_packing,
8757 /*
8758 * The tasks' affinity constraints previously prevented the scheduler
8759 * from balancing the load across the system.
8760 */
8761 group_imbalanced,
8762 /*
8763 * The CPU is overloaded and can't provide expected CPU cycles to all
8764 * tasks.
8765 */
8766 group_overloaded
8767};
8768
8769enum migration_type {
8770 migrate_load = 0,
8771 migrate_util,
8772 migrate_task,
8773 migrate_misfit
8774};
8775
8776#define LBF_ALL_PINNED 0x01
8777#define LBF_NEED_BREAK 0x02
8778#define LBF_DST_PINNED 0x04
8779#define LBF_SOME_PINNED 0x08
8780#define LBF_ACTIVE_LB 0x10
8781
8782struct lb_env {
8783 struct sched_domain *sd;
8784
8785 struct rq *src_rq;
8786 int src_cpu;
8787
8788 int dst_cpu;
8789 struct rq *dst_rq;
8790
8791 struct cpumask *dst_grpmask;
8792 int new_dst_cpu;
8793 enum cpu_idle_type idle;
8794 long imbalance;
8795 /* The set of CPUs under consideration for load-balancing */
8796 struct cpumask *cpus;
8797
8798 unsigned int flags;
8799
8800 unsigned int loop;
8801 unsigned int loop_break;
8802 unsigned int loop_max;
8803
8804 enum fbq_type fbq_type;
8805 enum migration_type migration_type;
8806 struct list_head tasks;
8807};
8808
8809/*
8810 * Is this task likely cache-hot:
8811 */
8812static int task_hot(struct task_struct *p, struct lb_env *env)
8813{
8814 s64 delta;
8815
8816 lockdep_assert_rq_held(rq: env->src_rq);
8817
8818 if (p->sched_class != &fair_sched_class)
8819 return 0;
8820
8821 if (unlikely(task_has_idle_policy(p)))
8822 return 0;
8823
8824 /* SMT siblings share cache */
8825 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
8826 return 0;
8827
8828 /*
8829 * Buddy candidates are cache hot:
8830 */
8831 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
8832 (&p->se == cfs_rq_of(se: &p->se)->next))
8833 return 1;
8834
8835 if (sysctl_sched_migration_cost == -1)
8836 return 1;
8837
8838 /*
8839 * Don't migrate task if the task's cookie does not match
8840 * with the destination CPU's core cookie.
8841 */
8842 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
8843 return 1;
8844
8845 if (sysctl_sched_migration_cost == 0)
8846 return 0;
8847
8848 delta = rq_clock_task(rq: env->src_rq) - p->se.exec_start;
8849
8850 return delta < (s64)sysctl_sched_migration_cost;
8851}
8852
8853#ifdef CONFIG_NUMA_BALANCING
8854/*
8855 * Returns 1 if task migration degrades locality.
8856 * Returns 0 if task migration improves locality, i.e. migration is preferred.
8857 * Returns -1 if task migration is not affected by locality.
8858 */
8859static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
8860{
8861 struct numa_group *numa_group = rcu_dereference(p->numa_group);
8862 unsigned long src_weight, dst_weight;
8863 int src_nid, dst_nid, dist;
8864
8865 if (!static_branch_likely(&sched_numa_balancing))
8866 return -1;
8867
8868 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
8869 return -1;
8870
8871 src_nid = cpu_to_node(cpu: env->src_cpu);
8872 dst_nid = cpu_to_node(cpu: env->dst_cpu);
8873
8874 if (src_nid == dst_nid)
8875 return -1;
8876
8877 /* Migrating away from the preferred node is always bad. */
8878 if (src_nid == p->numa_preferred_nid) {
8879 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
8880 return 1;
8881 else
8882 return -1;
8883 }
8884
8885 /* Encourage migration to the preferred node. */
8886 if (dst_nid == p->numa_preferred_nid)
8887 return 0;
8888
8889 /* Leaving a core idle is often worse than degrading locality. */
8890 if (env->idle == CPU_IDLE)
8891 return -1;
8892
8893 dist = node_distance(src_nid, dst_nid);
8894 if (numa_group) {
8895 src_weight = group_weight(p, nid: src_nid, dist);
8896 dst_weight = group_weight(p, nid: dst_nid, dist);
8897 } else {
8898 src_weight = task_weight(p, nid: src_nid, dist);
8899 dst_weight = task_weight(p, nid: dst_nid, dist);
8900 }
8901
8902 return dst_weight < src_weight;
8903}
8904
8905#else
8906static inline int migrate_degrades_locality(struct task_struct *p,
8907 struct lb_env *env)
8908{
8909 return -1;
8910}
8911#endif
8912
8913/*
8914 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
8915 */
8916static
8917int can_migrate_task(struct task_struct *p, struct lb_env *env)
8918{
8919 int tsk_cache_hot;
8920
8921 lockdep_assert_rq_held(rq: env->src_rq);
8922
8923 /*
8924 * We do not migrate tasks that are:
8925 * 1) throttled_lb_pair, or
8926 * 2) cannot be migrated to this CPU due to cpus_ptr, or
8927 * 3) running (obviously), or
8928 * 4) are cache-hot on their current CPU.
8929 */
8930 if (throttled_lb_pair(tg: task_group(p), src_cpu: env->src_cpu, dest_cpu: env->dst_cpu))
8931 return 0;
8932
8933 /* Disregard pcpu kthreads; they are where they need to be. */
8934 if (kthread_is_per_cpu(k: p))
8935 return 0;
8936
8937 if (!cpumask_test_cpu(cpu: env->dst_cpu, cpumask: p->cpus_ptr)) {
8938 int cpu;
8939
8940 schedstat_inc(p->stats.nr_failed_migrations_affine);
8941
8942 env->flags |= LBF_SOME_PINNED;
8943
8944 /*
8945 * Remember if this task can be migrated to any other CPU in
8946 * our sched_group. We may want to revisit it if we couldn't
8947 * meet load balance goals by pulling other tasks on src_cpu.
8948 *
8949 * Avoid computing new_dst_cpu
8950 * - for NEWLY_IDLE
8951		 * - if we have already computed one in the current iteration
8952 * - if it's an active balance
8953 */
8954 if (env->idle == CPU_NEWLY_IDLE ||
8955 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
8956 return 0;
8957
8958		/* Prevent re-selecting dst_cpu via env's CPUs: */
8959 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8960 if (cpumask_test_cpu(cpu, cpumask: p->cpus_ptr)) {
8961 env->flags |= LBF_DST_PINNED;
8962 env->new_dst_cpu = cpu;
8963 break;
8964 }
8965 }
8966
8967 return 0;
8968 }
8969
8970 /* Record that we found at least one task that could run on dst_cpu */
8971 env->flags &= ~LBF_ALL_PINNED;
8972
8973 if (task_on_cpu(rq: env->src_rq, p)) {
8974 schedstat_inc(p->stats.nr_failed_migrations_running);
8975 return 0;
8976 }
8977
8978 /*
8979 * Aggressive migration if:
8980 * 1) active balance
8981 * 2) destination numa is preferred
8982 * 3) task is cache cold, or
8983 * 4) too many balance attempts have failed.
8984 */
8985 if (env->flags & LBF_ACTIVE_LB)
8986 return 1;
8987
8988 tsk_cache_hot = migrate_degrades_locality(p, env);
8989 if (tsk_cache_hot == -1)
8990 tsk_cache_hot = task_hot(p, env);
8991
8992 if (tsk_cache_hot <= 0 ||
8993 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
8994 if (tsk_cache_hot == 1) {
8995 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
8996 schedstat_inc(p->stats.nr_forced_migrations);
8997 }
8998 return 1;
8999 }
9000
9001 schedstat_inc(p->stats.nr_failed_migrations_hot);
9002 return 0;
9003}
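/*
 * Note that a cache-hot task (tsk_cache_hot == 1) is still allowed to migrate
 * once sd->nr_balance_failed exceeds sd->cache_nice_tries; such forced
 * migrations are accounted in lb_hot_gained[] and nr_forced_migrations above.
 */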
9004
9005/*
9006 * detach_task() -- detach the task for the migration specified in env
9007 */
9008static void detach_task(struct task_struct *p, struct lb_env *env)
9009{
9010 lockdep_assert_rq_held(rq: env->src_rq);
9011
9012 deactivate_task(rq: env->src_rq, p, DEQUEUE_NOCLOCK);
9013 set_task_cpu(p, cpu: env->dst_cpu);
9014}
9015
9016/*
9017 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
9018 * part of active balancing operations within "domain".
9019 *
9020 * Returns a task if successful and NULL otherwise.
9021 */
9022static struct task_struct *detach_one_task(struct lb_env *env)
9023{
9024 struct task_struct *p;
9025
9026 lockdep_assert_rq_held(rq: env->src_rq);
9027
9028 list_for_each_entry_reverse(p,
9029 &env->src_rq->cfs_tasks, se.group_node) {
9030 if (!can_migrate_task(p, env))
9031 continue;
9032
9033 detach_task(p, env);
9034
9035 /*
9036 * Right now, this is only the second place where
9037		 * lb_gained[env->idle] is updated (the other is detach_tasks())
9038 * so we can safely collect stats here rather than
9039 * inside detach_tasks().
9040 */
9041 schedstat_inc(env->sd->lb_gained[env->idle]);
9042 return p;
9043 }
9044 return NULL;
9045}
9046
9047/*
9048 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
9049 * busiest_rq, as part of a balancing operation within domain "sd".
9050 *
9051 * Returns number of detached tasks if successful and 0 otherwise.
9052 */
9053static int detach_tasks(struct lb_env *env)
9054{
9055 struct list_head *tasks = &env->src_rq->cfs_tasks;
9056 unsigned long util, load;
9057 struct task_struct *p;
9058 int detached = 0;
9059
9060 lockdep_assert_rq_held(rq: env->src_rq);
9061
9062 /*
9063 * Source run queue has been emptied by another CPU, clear
9064 * LBF_ALL_PINNED flag as we will not test any task.
9065 */
9066 if (env->src_rq->nr_running <= 1) {
9067 env->flags &= ~LBF_ALL_PINNED;
9068 return 0;
9069 }
9070
9071 if (env->imbalance <= 0)
9072 return 0;
9073
9074 while (!list_empty(head: tasks)) {
9075 /*
9076 * We don't want to steal all, otherwise we may be treated likewise,
9077 * which could at worst lead to a livelock crash.
9078 */
9079 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
9080 break;
9081
9082 env->loop++;
9083 /*
9084 * We've more or less seen every task there is, call it quits
9085 * unless we haven't found any movable task yet.
9086 */
9087 if (env->loop > env->loop_max &&
9088 !(env->flags & LBF_ALL_PINNED))
9089 break;
9090
9091 /* take a breather every nr_migrate tasks */
9092 if (env->loop > env->loop_break) {
9093 env->loop_break += SCHED_NR_MIGRATE_BREAK;
9094 env->flags |= LBF_NEED_BREAK;
9095 break;
9096 }
9097
9098 p = list_last_entry(tasks, struct task_struct, se.group_node);
9099
9100 if (!can_migrate_task(p, env))
9101 goto next;
9102
9103 switch (env->migration_type) {
9104 case migrate_load:
9105 /*
9106			 * Depending on the number of CPUs and tasks and the
9107			 * cgroup hierarchy, task_h_load() can return a zero
9108			 * value. Make sure that env->imbalance decreases
9109 * otherwise detach_tasks() will stop only after
9110 * detaching up to loop_max tasks.
9111 */
9112 load = max_t(unsigned long, task_h_load(p), 1);
9113
9114 if (sched_feat(LB_MIN) &&
9115 load < 16 && !env->sd->nr_balance_failed)
9116 goto next;
9117
9118 /*
9119 * Make sure that we don't migrate too much load.
9120			 * Nevertheless, let's relax the constraint if the
9121			 * scheduler fails to find a good waiting task to
9122 * migrate.
9123 */
9124 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
9125 goto next;
9126
9127 env->imbalance -= load;
9128 break;
9129
9130 case migrate_util:
9131 util = task_util_est(p);
9132
9133 if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
9134 goto next;
9135
9136 env->imbalance -= util;
9137 break;
9138
9139 case migrate_task:
9140 env->imbalance--;
9141 break;
9142
9143 case migrate_misfit:
9144 /* This is not a misfit task */
			if (task_fits_cpu(p, env->src_cpu))
9146 goto next;
9147
9148 env->imbalance = 0;
9149 break;
9150 }
9151
9152 detach_task(p, env);
		list_add(&p->se.group_node, &env->tasks);
9154
9155 detached++;
9156
9157#ifdef CONFIG_PREEMPTION
9158 /*
9159 * NEWIDLE balancing is a source of latency, so preemptible
9160 * kernels will stop after the first task is detached to minimize
9161 * the critical section.
9162 */
9163 if (env->idle == CPU_NEWLY_IDLE)
9164 break;
9165#endif
9166
9167 /*
9168 * We only want to steal up to the prescribed amount of
9169 * load/util/tasks.
9170 */
9171 if (env->imbalance <= 0)
9172 break;
9173
9174 continue;
9175next:
		list_move(&p->se.group_node, tasks);
9177 }
9178
9179 /*
9180 * Right now, this is one of only two places we collect this stat
9181 * so we can safely collect detach_one_task() stats here rather
9182 * than inside detach_one_task().
9183 */
9184 schedstat_add(env->sd->lb_gained[env->idle], detached);
9185
9186 return detached;
9187}
9188
9189/*
9190 * attach_task() -- attach the task detached by detach_task() to its new rq.
9191 */
9192static void attach_task(struct rq *rq, struct task_struct *p)
9193{
9194 lockdep_assert_rq_held(rq);
9195
9196 WARN_ON_ONCE(task_rq(p) != rq);
9197 activate_task(rq, p, ENQUEUE_NOCLOCK);
	wakeup_preempt(rq, p, 0);
9199}
9200
9201/*
9202 * attach_one_task() -- attaches the task returned from detach_one_task() to
9203 * its new rq.
9204 */
9205static void attach_one_task(struct rq *rq, struct task_struct *p)
9206{
9207 struct rq_flags rf;
9208
	rq_lock(rq, &rf);
	update_rq_clock(rq);
	attach_task(rq, p);
	rq_unlock(rq, &rf);
9213}
9214
9215/*
9216 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
9217 * new rq.
9218 */
9219static void attach_tasks(struct lb_env *env)
9220{
9221 struct list_head *tasks = &env->tasks;
9222 struct task_struct *p;
9223 struct rq_flags rf;
9224
	rq_lock(env->dst_rq, &rf);
	update_rq_clock(env->dst_rq);

	while (!list_empty(tasks)) {
		p = list_first_entry(tasks, struct task_struct, se.group_node);
		list_del_init(&p->se.group_node);

		attach_task(env->dst_rq, p);
	}

	rq_unlock(env->dst_rq, &rf);
9236}
9237
9238#ifdef CONFIG_NO_HZ_COMMON
9239static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
9240{
9241 if (cfs_rq->avg.load_avg)
9242 return true;
9243
9244 if (cfs_rq->avg.util_avg)
9245 return true;
9246
9247 return false;
9248}
9249
9250static inline bool others_have_blocked(struct rq *rq)
9251{
9252 if (cpu_util_rt(rq))
9253 return true;
9254
9255 if (cpu_util_dl(rq))
9256 return true;
9257
9258 if (thermal_load_avg(rq))
9259 return true;
9260
9261 if (cpu_util_irq(rq))
9262 return true;
9263
9264 return false;
9265}
9266
9267static inline void update_blocked_load_tick(struct rq *rq)
9268{
9269 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
9270}
9271
9272static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
9273{
9274 if (!has_blocked)
9275 rq->has_blocked_load = 0;
9276}
9277#else
9278static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
9279static inline bool others_have_blocked(struct rq *rq) { return false; }
9280static inline void update_blocked_load_tick(struct rq *rq) {}
9281static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
9282#endif
9283
9284static bool __update_blocked_others(struct rq *rq, bool *done)
9285{
9286 const struct sched_class *curr_class;
9287 u64 now = rq_clock_pelt(rq);
9288 unsigned long thermal_pressure;
9289 bool decayed;
9290
9291 /*
9292 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
9293 * DL and IRQ signals have been updated before updating CFS.
9294 */
9295 curr_class = rq->curr->sched_class;
9296
	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));

	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
		  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
		  update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
		  update_irq_load_avg(rq, 0);
9303
9304 if (others_have_blocked(rq))
9305 *done = false;
9306
9307 return decayed;
9308}
9309
9310#ifdef CONFIG_FAIR_GROUP_SCHED
9311
9312static bool __update_blocked_fair(struct rq *rq, bool *done)
9313{
9314 struct cfs_rq *cfs_rq, *pos;
9315 bool decayed = false;
9316 int cpu = cpu_of(rq);
9317
9318 /*
9319 * Iterates the task_group tree in a bottom up fashion, see
9320 * list_add_leaf_cfs_rq() for details.
9321 */
9322 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
9323 struct sched_entity *se;
9324
		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
9326 update_tg_load_avg(cfs_rq);
9327
9328 if (cfs_rq->nr_running == 0)
9329 update_idle_cfs_rq_clock_pelt(cfs_rq);
9330
9331 if (cfs_rq == &rq->cfs)
9332 decayed = true;
9333 }
9334
9335 /* Propagate pending load changes to the parent, if any: */
9336 se = cfs_rq->tg->se[cpu];
9337 if (se && !skip_blocked_update(se))
			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
9339
9340 /*
9341 * There can be a lot of idle CPU cgroups. Don't let fully
9342 * decayed cfs_rqs linger on the list.
9343 */
9344 if (cfs_rq_is_decayed(cfs_rq))
9345 list_del_leaf_cfs_rq(cfs_rq);
9346
9347 /* Don't need periodic decay once load/util_avg are null */
9348 if (cfs_rq_has_blocked(cfs_rq))
9349 *done = false;
9350 }
9351
9352 return decayed;
9353}
9354
9355/*
 * Compute the hierarchical load factor for cfs_rq and all its ancestors.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parent's load.
9359 */
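/*
 * Example: if the root cfs_rq has h_load == cfs_rq_load_avg() == 2048 and a
 * child group's sched_entity contributes load_avg == 1024 to it, the child
 * cfs_rq gets h_load ~= 2048 * 1024 / 2049 ~= 1023. A task with
 * load_avg == 512 on that child cfs_rq (whose cfs_rq_load_avg() == 1024)
 * then has task_h_load() ~= 1023 * 512 / 1025 ~= 511, i.e. roughly a quarter
 * of the root's load.
 */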
9360static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9361{
9362 struct rq *rq = rq_of(cfs_rq);
9363 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9364 unsigned long now = jiffies;
9365 unsigned long load;
9366
9367 if (cfs_rq->last_h_load_update == now)
9368 return;
9369
9370 WRITE_ONCE(cfs_rq->h_load_next, NULL);
9371 for_each_sched_entity(se) {
9372 cfs_rq = cfs_rq_of(se);
9373 WRITE_ONCE(cfs_rq->h_load_next, se);
9374 if (cfs_rq->last_h_load_update == now)
9375 break;
9376 }
9377
9378 if (!se) {
9379 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9380 cfs_rq->last_h_load_update = now;
9381 }
9382
9383 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9384 load = cfs_rq->h_load;
9385 load = div64_ul(load * se->avg.load_avg,
9386 cfs_rq_load_avg(cfs_rq) + 1);
		cfs_rq = group_cfs_rq(se);
9388 cfs_rq->h_load = load;
9389 cfs_rq->last_h_load_update = now;
9390 }
9391}
9392
9393static unsigned long task_h_load(struct task_struct *p)
9394{
9395 struct cfs_rq *cfs_rq = task_cfs_rq(p);
9396
9397 update_cfs_rq_h_load(cfs_rq);
9398 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9399 cfs_rq_load_avg(cfs_rq) + 1);
9400}
9401#else
9402static bool __update_blocked_fair(struct rq *rq, bool *done)
9403{
9404 struct cfs_rq *cfs_rq = &rq->cfs;
9405 bool decayed;
9406
9407 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
9408 if (cfs_rq_has_blocked(cfs_rq))
9409 *done = false;
9410
9411 return decayed;
9412}
9413
9414static unsigned long task_h_load(struct task_struct *p)
9415{
9416 return p->se.avg.load_avg;
9417}
9418#endif
9419
9420static void update_blocked_averages(int cpu)
9421{
9422 bool decayed = false, done = true;
9423 struct rq *rq = cpu_rq(cpu);
9424 struct rq_flags rf;
9425
	rq_lock_irqsave(rq, &rf);
	update_blocked_load_tick(rq);
	update_rq_clock(rq);

	decayed |= __update_blocked_others(rq, &done);
	decayed |= __update_blocked_fair(rq, &done);

	update_blocked_load_status(rq, !done);
	if (decayed)
		cpufreq_update_util(rq, 0);
	rq_unlock_irqrestore(rq, &rf);
9437}
9438
9439/********** Helpers for find_busiest_group ************************/
9440
9441/*
9442 * sg_lb_stats - stats of a sched_group required for load_balancing
9443 */
9444struct sg_lb_stats {
9445 unsigned long avg_load; /*Avg load across the CPUs of the group */
9446 unsigned long group_load; /* Total load over the CPUs of the group */
9447 unsigned long group_capacity;
9448 unsigned long group_util; /* Total utilization over the CPUs of the group */
9449 unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
9450 unsigned int sum_nr_running; /* Nr of tasks running in the group */
9451 unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
9452 unsigned int idle_cpus;
9453 unsigned int group_weight;
9454 enum group_type group_type;
9455 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
9456 unsigned int group_smt_balance; /* Task on busy SMT be moved */
9457 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
9458#ifdef CONFIG_NUMA_BALANCING
9459 unsigned int nr_numa_running;
9460 unsigned int nr_preferred_running;
9461#endif
9462};
9463
9464/*
9465 * sd_lb_stats - Structure to store the statistics of a sched_domain
9466 * during load balancing.
9467 */
9468struct sd_lb_stats {
9469 struct sched_group *busiest; /* Busiest group in this sd */
9470 struct sched_group *local; /* Local group in this sd */
9471 unsigned long total_load; /* Total load of all groups in sd */
9472 unsigned long total_capacity; /* Total capacity of all groups in sd */
9473 unsigned long avg_load; /* Average load across all groups in sd */
9474 unsigned int prefer_sibling; /* tasks should go to sibling first */
9475
9476 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
9477 struct sg_lb_stats local_stat; /* Statistics of the local group */
9478};
9479
9480static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9481{
9482 /*
9483 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9484 * local_stat because update_sg_lb_stats() does a full clear/assignment.
9485 * We must however set busiest_stat::group_type and
9486 * busiest_stat::idle_cpus to the worst busiest group because
9487 * update_sd_pick_busiest() reads these before assignment.
9488 */
9489 *sds = (struct sd_lb_stats){
9490 .busiest = NULL,
9491 .local = NULL,
9492 .total_load = 0UL,
9493 .total_capacity = 0UL,
9494 .busiest_stat = {
9495 .idle_cpus = UINT_MAX,
9496 .group_type = group_has_spare,
9497 },
9498 };
9499}
9500
9501static unsigned long scale_rt_capacity(int cpu)
9502{
9503 struct rq *rq = cpu_rq(cpu);
9504 unsigned long max = arch_scale_cpu_capacity(cpu);
9505 unsigned long used, free;
9506 unsigned long irq;
9507
9508 irq = cpu_util_irq(rq);
9509
9510 if (unlikely(irq >= max))
9511 return 1;
9512
9513 /*
9514 * avg_rt.util_avg and avg_dl.util_avg track binary signals
9515 * (running and not running) with weights 0 and 1024 respectively.
	 * avg_thermal.load_avg tracks thermal pressure and its weighted
	 * average uses the capacity actually lost to thermal pressure as load.
9518 */
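	/*
	 * Example: with max == 1024, RT + DL + thermal adding up to
	 * used == 300 and irq == 100, free == 724 and the returned
	 * capacity is scaled for IRQ time: 724 * (1024 - 100) / 1024 ~= 653.
	 */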
9519 used = cpu_util_rt(rq);
9520 used += cpu_util_dl(rq);
9521 used += thermal_load_avg(rq);
9522
9523 if (unlikely(used >= max))
9524 return 1;
9525
9526 free = max - used;
9527
	return scale_irq_capacity(free, irq, max);
9529}
9530
9531static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9532{
9533 unsigned long capacity = scale_rt_capacity(cpu);
9534 struct sched_group *sdg = sd->groups;
9535
9536 if (!capacity)
9537 capacity = 1;
9538
9539 cpu_rq(cpu)->cpu_capacity = capacity;
9540 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
9541
9542 sdg->sgc->capacity = capacity;
9543 sdg->sgc->min_capacity = capacity;
9544 sdg->sgc->max_capacity = capacity;
9545}
9546
9547void update_group_capacity(struct sched_domain *sd, int cpu)
9548{
9549 struct sched_domain *child = sd->child;
9550 struct sched_group *group, *sdg = sd->groups;
9551 unsigned long capacity, min_capacity, max_capacity;
9552 unsigned long interval;
9553
	interval = msecs_to_jiffies(sd->balance_interval);
9555 interval = clamp(interval, 1UL, max_load_balance_interval);
9556 sdg->sgc->next_update = jiffies + interval;
9557
9558 if (!child) {
9559 update_cpu_capacity(sd, cpu);
9560 return;
9561 }
9562
9563 capacity = 0;
9564 min_capacity = ULONG_MAX;
9565 max_capacity = 0;
9566
9567 if (child->flags & SD_OVERLAP) {
9568 /*
9569 * SD_OVERLAP domains cannot assume that child groups
9570 * span the current group.
9571 */
9572
9573 for_each_cpu(cpu, sched_group_span(sdg)) {
9574 unsigned long cpu_cap = capacity_of(cpu);
9575
9576 capacity += cpu_cap;
9577 min_capacity = min(cpu_cap, min_capacity);
9578 max_capacity = max(cpu_cap, max_capacity);
9579 }
9580 } else {
9581 /*
9582 * !SD_OVERLAP domains can assume that child groups
9583 * span the current group.
9584 */
9585
9586 group = child->groups;
9587 do {
9588 struct sched_group_capacity *sgc = group->sgc;
9589
9590 capacity += sgc->capacity;
9591 min_capacity = min(sgc->min_capacity, min_capacity);
9592 max_capacity = max(sgc->max_capacity, max_capacity);
9593 group = group->next;
9594 } while (group != child->groups);
9595 }
9596
9597 sdg->sgc->capacity = capacity;
9598 sdg->sgc->min_capacity = min_capacity;
9599 sdg->sgc->max_capacity = max_capacity;
9600}
9601
/*
 * Check whether the capacity of the rq has been noticeably reduced by side
 * activity. The imbalance_pct is used for the threshold.
 * Return true if the capacity is reduced.
 */
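/*
 * For example, with an imbalance_pct of 117 this returns true once
 * rq->cpu_capacity drops below 100/117 ~= 85% of the original CPU capacity,
 * i.e. once more than ~15% of the CPU is eaten by RT/DL/IRQ/thermal activity.
 */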
9607static inline int
9608check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9609{
9610 return ((rq->cpu_capacity * sd->imbalance_pct) <
				(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
9612}
9613
9614/*
9615 * Check whether a rq has a misfit task and if it looks like we can actually
9616 * help that task: we can migrate the task to a CPU of higher capacity, or
9617 * the task's current CPU is heavily pressured.
9618 */
9619static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
9620{
9621 return rq->misfit_task_load &&
		(arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
9623 check_cpu_capacity(rq, sd));
9624}
9625
9626/*
9627 * Group imbalance indicates (and tries to solve) the problem where balancing
9628 * groups is inadequate due to ->cpus_ptr constraints.
9629 *
9630 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
9631 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
9632 * Something like:
9633 *
9634 * { 0 1 2 3 } { 4 5 6 7 }
9635 * * * * *
9636 *
9637 * If we were to balance group-wise we'd place two tasks in the first group and
9638 * two tasks in the second group. Clearly this is undesired as it will overload
9639 * cpu 3 and leave one of the CPUs in the second group unused.
9640 *
9641 * The current solution to this issue is detecting the skew in the first group
9642 * by noticing the lower domain failed to reach balance and had difficulty
9643 * moving tasks due to affinity constraints.
9644 *
 * When this is so detected, this group becomes a candidate for busiest; see
9646 * update_sd_pick_busiest(). And calculate_imbalance() and
9647 * find_busiest_group() avoid some of the usual balance conditions to allow it
9648 * to create an effective group imbalance.
9649 *
9650 * This is a somewhat tricky proposition since the next run might not find the
9651 * group imbalance and decide the groups need to be balanced again. A most
9652 * subtle and fragile situation.
9653 */
9654
9655static inline int sg_imbalanced(struct sched_group *group)
9656{
9657 return group->sgc->imbalance;
9658}
9659
9660/*
9661 * group_has_capacity returns true if the group has spare capacity that could
9662 * be used by some tasks.
 * We consider that a group has spare capacity if the number of tasks is
 * smaller than the number of CPUs or if the utilization is lower than the
 * available capacity for CFS tasks.
 * For the latter, we use a threshold to stabilize the state, to take into
 * account the variance of the tasks' load and to return true only if the
 * available capacity is meaningful for the load balancer.
 * As an example, an available capacity of 1% can show up but it doesn't
 * bring any benefit to the load balancer.
9671 */
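/*
 * For example, with imbalance_pct == 117 a group that has at least as many
 * tasks as CPUs is only considered to have spare capacity if its runnable
 * time stays below ~117% of its capacity and its utilization below
 * 100/117 ~= 85% of it.
 */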
9672static inline bool
9673group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
9674{
9675 if (sgs->sum_nr_running < sgs->group_weight)
9676 return true;
9677
9678 if ((sgs->group_capacity * imbalance_pct) <
9679 (sgs->group_runnable * 100))
9680 return false;
9681
9682 if ((sgs->group_capacity * 100) >
9683 (sgs->group_util * imbalance_pct))
9684 return true;
9685
9686 return false;
9687}
9688
9689/*
9690 * group_is_overloaded returns true if the group has more tasks than it can
9691 * handle.
 * group_is_overloaded is not the exact opposite of group_has_capacity: a
 * group with exactly the right number of tasks has no spare capacity left
 * but is not overloaded either, so both group_has_capacity and
 * group_is_overloaded return false for it.
9696 */
9697static inline bool
9698group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
9699{
9700 if (sgs->sum_nr_running <= sgs->group_weight)
9701 return false;
9702
9703 if ((sgs->group_capacity * 100) <
9704 (sgs->group_util * imbalance_pct))
9705 return true;
9706
9707 if ((sgs->group_capacity * imbalance_pct) <
9708 (sgs->group_runnable * 100))
9709 return true;
9710
9711 return false;
9712}
9713
9714static inline enum
9715group_type group_classify(unsigned int imbalance_pct,
9716 struct sched_group *group,
9717 struct sg_lb_stats *sgs)
9718{
9719 if (group_is_overloaded(imbalance_pct, sgs))
9720 return group_overloaded;
9721
9722 if (sg_imbalanced(group))
9723 return group_imbalanced;
9724
9725 if (sgs->group_asym_packing)
9726 return group_asym_packing;
9727
9728 if (sgs->group_smt_balance)
9729 return group_smt_balance;
9730
9731 if (sgs->group_misfit_task_load)
9732 return group_misfit_task;
9733
9734 if (!group_has_capacity(imbalance_pct, sgs))
9735 return group_fully_busy;
9736
9737 return group_has_spare;
9738}
9739
9740/**
9741 * sched_use_asym_prio - Check whether asym_packing priority must be used
9742 * @sd: The scheduling domain of the load balancing
9743 * @cpu: A CPU
9744 *
9745 * Always use CPU priority when balancing load between SMT siblings. When
9746 * balancing load between cores, it is not sufficient that @cpu is idle. Only
9747 * use CPU priority if the whole core is idle.
9748 *
9749 * Returns: True if the priority of @cpu must be followed. False otherwise.
9750 */
9751static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
9752{
9753 if (!(sd->flags & SD_ASYM_PACKING))
9754 return false;
9755
9756 if (!sched_smt_active())
9757 return true;
9758
9759 return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
9760}
9761
9762static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
9763{
9764 /*
9765 * First check if @dst_cpu can do asym_packing load balance. Only do it
9766 * if it has higher priority than @src_cpu.
9767 */
	return sched_use_asym_prio(sd, dst_cpu) &&
	       sched_asym_prefer(dst_cpu, src_cpu);
9770}
9771
9772/**
9773 * sched_group_asym - Check if the destination CPU can do asym_packing balance
9774 * @env: The load balancing environment
9775 * @sgs: Load-balancing statistics of the candidate busiest group
9776 * @group: The candidate busiest group
9777 *
9778 * @env::dst_cpu can do asym_packing if it has higher priority than the
9779 * preferred CPU of @group.
9780 *
 * Return: true if @env::dst_cpu can do asym_packing load balance. False
9782 * otherwise.
9783 */
9784static inline bool
9785sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
9786{
9787 /*
9788 * CPU priorities do not make sense for SMT cores with more than one
9789 * busy sibling.
9790 */
9791 if ((group->flags & SD_SHARE_CPUCAPACITY) &&
9792 (sgs->group_weight - sgs->idle_cpus != 1))
9793 return false;
9794
	return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
9796}
9797
9798/* One group has more than one SMT CPU while the other group does not */
9799static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
9800 struct sched_group *sg2)
9801{
9802 if (!sg1 || !sg2)
9803 return false;
9804
9805 return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
9806 (sg2->flags & SD_SHARE_CPUCAPACITY);
9807}
9808
9809static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
9810 struct sched_group *group)
9811{
9812 if (env->idle == CPU_NOT_IDLE)
9813 return false;
9814
9815 /*
9816 * For SMT source group, it is better to move a task
9817 * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
9818 * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
9819 * will not be on.
9820 */
9821 if (group->flags & SD_SHARE_CPUCAPACITY &&
9822 sgs->sum_h_nr_running > 1)
9823 return true;
9824
9825 return false;
9826}
9827
9828static inline long sibling_imbalance(struct lb_env *env,
9829 struct sd_lb_stats *sds,
9830 struct sg_lb_stats *busiest,
9831 struct sg_lb_stats *local)
9832{
9833 int ncores_busiest, ncores_local;
9834 long imbalance;
9835
9836 if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
9837 return 0;
9838
9839 ncores_busiest = sds->busiest->cores;
9840 ncores_local = sds->local->cores;
9841
9842 if (ncores_busiest == ncores_local) {
9843 imbalance = busiest->sum_nr_running;
9844 lsub_positive(&imbalance, local->sum_nr_running);
9845 return imbalance;
9846 }
9847
	/* Balance so that the nr_running/ncores ratio is the same on both groups */
9849 imbalance = ncores_local * busiest->sum_nr_running;
9850 lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
9851 /* Normalize imbalance and do rounding on normalization */
9852 imbalance = 2 * imbalance + ncores_local + ncores_busiest;
9853 imbalance /= ncores_local + ncores_busiest;
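	/*
	 * Example: busiest has 6 tasks on 2 cores and local has 1 task on
	 * 4 cores: raw imbalance = 4 * 6 - 2 * 1 = 22, normalized to
	 * (2 * 22 + 6) / 6 = 8. calculate_imbalance() later halves this,
	 * so 4 tasks are moved, leaving 1 task/core on busiest and 1.25
	 * tasks/core on local.
	 */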
9854
9855 /* Take advantage of resource in an empty sched group */
9856 if (imbalance <= 1 && local->sum_nr_running == 0 &&
9857 busiest->sum_nr_running > 1)
9858 imbalance = 2;
9859
9860 return imbalance;
9861}
9862
9863static inline bool
9864sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
9865{
9866 /*
9867 * When there is more than 1 task, the group_overloaded case already
9868 * takes care of cpu with reduced capacity
9869 */
9870 if (rq->cfs.h_nr_running != 1)
9871 return false;
9872
9873 return check_cpu_capacity(rq, sd);
9874}
9875
9876/**
9877 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
9878 * @env: The load balancing environment.
9879 * @sds: Load-balancing data with statistics of the local group.
9880 * @group: sched_group whose statistics are to be updated.
9881 * @sgs: variable to hold the statistics for this group.
9882 * @sg_status: Holds flag indicating the status of the sched_group
9883 */
9884static inline void update_sg_lb_stats(struct lb_env *env,
9885 struct sd_lb_stats *sds,
9886 struct sched_group *group,
9887 struct sg_lb_stats *sgs,
9888 int *sg_status)
9889{
9890 int i, nr_running, local_group;
9891
9892 memset(sgs, 0, sizeof(*sgs));
9893
9894 local_group = group == sds->local;
9895
9896 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9897 struct rq *rq = cpu_rq(i);
9898 unsigned long load = cpu_load(rq);
9899
9900 sgs->group_load += load;
		sgs->group_util += cpu_util_cfs(i);
9902 sgs->group_runnable += cpu_runnable(rq);
9903 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
9904
9905 nr_running = rq->nr_running;
9906 sgs->sum_nr_running += nr_running;
9907
9908 if (nr_running > 1)
9909 *sg_status |= SG_OVERLOAD;
9910
		if (cpu_overutilized(i))
9912 *sg_status |= SG_OVERUTILIZED;
9913
9914#ifdef CONFIG_NUMA_BALANCING
9915 sgs->nr_numa_running += rq->nr_numa_running;
9916 sgs->nr_preferred_running += rq->nr_preferred_running;
9917#endif
9918 /*
9919 * No need to call idle_cpu() if nr_running is not 0
9920 */
		if (!nr_running && idle_cpu(i)) {
9922 sgs->idle_cpus++;
9923 /* Idle cpu can't have misfit task */
9924 continue;
9925 }
9926
9927 if (local_group)
9928 continue;
9929
9930 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
9931 /* Check for a misfit task on the cpu */
9932 if (sgs->group_misfit_task_load < rq->misfit_task_load) {
9933 sgs->group_misfit_task_load = rq->misfit_task_load;
9934 *sg_status |= SG_OVERLOAD;
9935 }
9936 } else if ((env->idle != CPU_NOT_IDLE) &&
			   sched_reduced_capacity(rq, env->sd)) {
9938 /* Check for a task running on a CPU with reduced capacity */
9939 if (sgs->group_misfit_task_load < load)
9940 sgs->group_misfit_task_load = load;
9941 }
9942 }
9943
9944 sgs->group_capacity = group->sgc->capacity;
9945
9946 sgs->group_weight = group->group_weight;
9947
9948 /* Check if dst CPU is idle and preferred to this group */
9949 if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
9950 sched_group_asym(env, sgs, group))
9951 sgs->group_asym_packing = 1;
9952
9953 /* Check for loaded SMT group to be balanced to dst CPU */
9954 if (!local_group && smt_balance(env, sgs, group))
9955 sgs->group_smt_balance = 1;
9956
	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
9958
9959 /* Computing avg_load makes sense only when group is overloaded */
9960 if (sgs->group_type == group_overloaded)
9961 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
9962 sgs->group_capacity;
9963}
9964
9965/**
9966 * update_sd_pick_busiest - return 1 on busiest group
9967 * @env: The load balancing environment.
9968 * @sds: sched_domain statistics
9969 * @sg: sched_group candidate to be checked for being the busiest
9970 * @sgs: sched_group statistics
9971 *
9972 * Determine if @sg is a busier group than the previously selected
9973 * busiest group.
9974 *
9975 * Return: %true if @sg is a busier group than the previously selected
9976 * busiest group. %false otherwise.
9977 */
9978static bool update_sd_pick_busiest(struct lb_env *env,
9979 struct sd_lb_stats *sds,
9980 struct sched_group *sg,
9981 struct sg_lb_stats *sgs)
9982{
9983 struct sg_lb_stats *busiest = &sds->busiest_stat;
9984
9985 /* Make sure that there is at least one task to pull */
9986 if (!sgs->sum_h_nr_running)
9987 return false;
9988
9989 /*
9990 * Don't try to pull misfit tasks we can't help.
9991 * We can use max_capacity here as reduction in capacity on some
9992 * CPUs in the group should either be possible to resolve
9993 * internally or be covered by avg_load imbalance (eventually).
9994 */
9995 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
9996 (sgs->group_type == group_misfit_task) &&
9997 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
9998 sds->local_stat.group_type != group_has_spare))
9999 return false;
10000
10001 if (sgs->group_type > busiest->group_type)
10002 return true;
10003
10004 if (sgs->group_type < busiest->group_type)
10005 return false;
10006
10007 /*
10008 * The candidate and the current busiest group are the same type of
	 * group. Let's check which one is the busiest according to the type.
10010 */
10011
10012 switch (sgs->group_type) {
10013 case group_overloaded:
10014 /* Select the overloaded group with highest avg_load. */
10015 return sgs->avg_load > busiest->avg_load;
10016
10017 case group_imbalanced:
10018 /*
10019 * Select the 1st imbalanced group as we don't have any way to
10020 * choose one more than another.
10021 */
10022 return false;
10023
10024 case group_asym_packing:
10025 /* Prefer to move from lowest priority CPU's work */
		return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
10027
10028 case group_misfit_task:
10029 /*
10030 * If we have more than one misfit sg go with the biggest
10031 * misfit.
10032 */
10033 return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
10034
10035 case group_smt_balance:
10036 /*
		 * Check if either SMT group has spare CPUs, to choose between
		 * the has_spare and fully_busy handling.
10039 */
10040 if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
10041 goto has_spare;
10042
10043 fallthrough;
10044
10045 case group_fully_busy:
10046 /*
10047 * Select the fully busy group with highest avg_load. In
10048 * theory, there is no need to pull task from such kind of
10049 * group because tasks have all compute capacity that they need
10050 * but we can still improve the overall throughput by reducing
10051 * contention when accessing shared HW resources.
10052 *
10053 * XXX for now avg_load is not computed and always 0 so we
10054 * select the 1st one, except if @sg is composed of SMT
10055 * siblings.
10056 */
10057
10058 if (sgs->avg_load < busiest->avg_load)
10059 return false;
10060
10061 if (sgs->avg_load == busiest->avg_load) {
10062 /*
10063 * SMT sched groups need more help than non-SMT groups.
10064 * If @sg happens to also be SMT, either choice is good.
10065 */
10066 if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
10067 return false;
10068 }
10069
10070 break;
10071
10072 case group_has_spare:
10073 /*
10074 * Do not pick sg with SMT CPUs over sg with pure CPUs,
10075 * as we do not want to pull task off SMT core with one task
10076 * and make the core idle.
10077 */
		if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
10079 if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
10080 return false;
10081 else
10082 return true;
10083 }
10084has_spare:
10085
10086 /*
		 * Select the non-overloaded group with the lowest number of
		 * idle CPUs and highest number of running tasks. We could also
		 * compare the spare capacity, which is more stable, but a
		 * group can end up with less spare capacity yet more idle
		 * CPUs, which means fewer opportunities to pull tasks.
10092 */
10093 if (sgs->idle_cpus > busiest->idle_cpus)
10094 return false;
10095 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
10096 (sgs->sum_nr_running <= busiest->sum_nr_running))
10097 return false;
10098
10099 break;
10100 }
10101
10102 /*
10103 * Candidate sg has no more than one task per CPU and has higher
10104 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
10105 * throughput. Maximize throughput, power/energy consequences are not
10106 * considered.
10107 */
10108 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10109 (sgs->group_type <= group_fully_busy) &&
10110 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
10111 return false;
10112
10113 return true;
10114}
10115
10116#ifdef CONFIG_NUMA_BALANCING
10117static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10118{
10119 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
10120 return regular;
10121 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
10122 return remote;
10123 return all;
10124}
10125
10126static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10127{
10128 if (rq->nr_running > rq->nr_numa_running)
10129 return regular;
10130 if (rq->nr_running > rq->nr_preferred_running)
10131 return remote;
10132 return all;
10133}
10134#else
10135static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10136{
10137 return all;
10138}
10139
10140static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10141{
10142 return regular;
10143}
10144#endif /* CONFIG_NUMA_BALANCING */
10145
10146
10147struct sg_lb_stats;
10148
10149/*
10150 * task_running_on_cpu - return 1 if @p is running on @cpu.
10151 */
10152
10153static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
10154{
10155 /* Task has no contribution or is new */
10156 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
10157 return 0;
10158
10159 if (task_on_rq_queued(p))
10160 return 1;
10161
10162 return 0;
10163}
10164
10165/**
10166 * idle_cpu_without - would a given CPU be idle without p ?
10167 * @cpu: the processor on which idleness is tested.
10168 * @p: task which should be ignored.
10169 *
10170 * Return: 1 if the CPU would be idle. 0 otherwise.
10171 */
10172static int idle_cpu_without(int cpu, struct task_struct *p)
10173{
10174 struct rq *rq = cpu_rq(cpu);
10175
10176 if (rq->curr != rq->idle && rq->curr != p)
10177 return 0;
10178
10179 /*
10180 * rq->nr_running can't be used but an updated version without the
10181 * impact of p on cpu must be used instead. The updated nr_running
10182 * be computed and tested before calling idle_cpu_without().
	 * must be computed and tested before calling idle_cpu_without().
10184
10185 if (rq->ttwu_pending)
10186 return 0;
10187
10188 return 1;
10189}
10190
10191/*
10192 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
10193 * @sd: The sched_domain level to look for idlest group.
10194 * @group: sched_group whose statistics are to be updated.
10195 * @sgs: variable to hold the statistics for this group.
10196 * @p: The task for which we look for the idlest group/CPU.
10197 */
10198static inline void update_sg_wakeup_stats(struct sched_domain *sd,
10199 struct sched_group *group,
10200 struct sg_lb_stats *sgs,
10201 struct task_struct *p)
10202{
10203 int i, nr_running;
10204
10205 memset(sgs, 0, sizeof(*sgs));
10206
10207 /* Assume that task can't fit any CPU of the group */
10208 if (sd->flags & SD_ASYM_CPUCAPACITY)
10209 sgs->group_misfit_task_load = 1;
10210
10211 for_each_cpu(i, sched_group_span(group)) {
10212 struct rq *rq = cpu_rq(i);
10213 unsigned int local;
10214
10215 sgs->group_load += cpu_load_without(rq, p);
		sgs->group_util += cpu_util_without(i, p);
		sgs->group_runnable += cpu_runnable_without(rq, p);
		local = task_running_on_cpu(i, p);
10219 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
10220
10221 nr_running = rq->nr_running - local;
10222 sgs->sum_nr_running += nr_running;
10223
10224 /*
10225 * No need to call idle_cpu_without() if nr_running is not 0
10226 */
		if (!nr_running && idle_cpu_without(i, p))
10228 sgs->idle_cpus++;
10229
10230 /* Check if task fits in the CPU */
10231 if (sd->flags & SD_ASYM_CPUCAPACITY &&
10232 sgs->group_misfit_task_load &&
		    task_fits_cpu(p, i))
10234 sgs->group_misfit_task_load = 0;
10235
10236 }
10237
10238 sgs->group_capacity = group->sgc->capacity;
10239
10240 sgs->group_weight = group->group_weight;
10241
	sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
10243
10244 /*
10245 * Computing avg_load makes sense only when group is fully busy or
10246 * overloaded
10247 */
10248 if (sgs->group_type == group_fully_busy ||
10249 sgs->group_type == group_overloaded)
10250 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10251 sgs->group_capacity;
10252}
10253
10254static bool update_pick_idlest(struct sched_group *idlest,
10255 struct sg_lb_stats *idlest_sgs,
10256 struct sched_group *group,
10257 struct sg_lb_stats *sgs)
10258{
10259 if (sgs->group_type < idlest_sgs->group_type)
10260 return true;
10261
10262 if (sgs->group_type > idlest_sgs->group_type)
10263 return false;
10264
10265 /*
10266 * The candidate and the current idlest group are the same type of
	 * group. Let's check which one is the idlest according to the type.
10268 */
10269
10270 switch (sgs->group_type) {
10271 case group_overloaded:
10272 case group_fully_busy:
10273 /* Select the group with lowest avg_load. */
10274 if (idlest_sgs->avg_load <= sgs->avg_load)
10275 return false;
10276 break;
10277
10278 case group_imbalanced:
10279 case group_asym_packing:
10280 case group_smt_balance:
10281 /* Those types are not used in the slow wakeup path */
10282 return false;
10283
10284 case group_misfit_task:
10285 /* Select group with the highest max capacity */
10286 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
10287 return false;
10288 break;
10289
10290 case group_has_spare:
10291 /* Select group with most idle CPUs */
10292 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
10293 return false;
10294
10295 /* Select group with lowest group_util */
10296 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
10297 idlest_sgs->group_util <= sgs->group_util)
10298 return false;
10299
10300 break;
10301 }
10302
10303 return true;
10304}
10305
10306/*
10307 * find_idlest_group() finds and returns the least busy CPU group within the
10308 * domain.
10309 *
10310 * Assumes p is allowed on at least one CPU in sd.
10311 */
10312static struct sched_group *
10313find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
10314{
10315 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
10316 struct sg_lb_stats local_sgs, tmp_sgs;
10317 struct sg_lb_stats *sgs;
10318 unsigned long imbalance;
10319 struct sg_lb_stats idlest_sgs = {
10320 .avg_load = UINT_MAX,
10321 .group_type = group_overloaded,
10322 };
10323
10324 do {
10325 int local_group;
10326
10327 /* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_span(group),
					p->cpus_ptr))
10330 continue;
10331
10332 /* Skip over this group if no cookie matched */
10333 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
10334 continue;
10335
		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_span(group));
10338
10339 if (local_group) {
10340 sgs = &local_sgs;
10341 local = group;
10342 } else {
10343 sgs = &tmp_sgs;
10344 }
10345
10346 update_sg_wakeup_stats(sd, group, sgs, p);
10347
		if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
10349 idlest = group;
10350 idlest_sgs = *sgs;
10351 }
10352
10353 } while (group = group->next, group != sd->groups);
10354
10355
10356 /* There is no idlest group to push tasks to */
10357 if (!idlest)
10358 return NULL;
10359
10360 /* The local group has been skipped because of CPU affinity */
10361 if (!local)
10362 return idlest;
10363
10364 /*
10365 * If the local group is idler than the selected idlest group
10366 * don't try and push the task.
10367 */
10368 if (local_sgs.group_type < idlest_sgs.group_type)
10369 return NULL;
10370
10371 /*
10372 * If the local group is busier than the selected idlest group
10373 * try and push the task.
10374 */
10375 if (local_sgs.group_type > idlest_sgs.group_type)
10376 return idlest;
10377
10378 switch (local_sgs.group_type) {
10379 case group_overloaded:
10380 case group_fully_busy:
10381
10382 /* Calculate allowed imbalance based on load */
10383 imbalance = scale_load_down(NICE_0_LOAD) *
10384 (sd->imbalance_pct-100) / 100;
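		/*
		 * E.g. with imbalance_pct == 117 and NICE_0_LOAD scaled down
		 * to 1024, the allowed imbalance is 1024 * 17 / 100 = 174.
		 */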
10385
10386 /*
10387 * When comparing groups across NUMA domains, it's possible for
10388 * the local domain to be very lightly loaded relative to the
10389 * remote domains but "imbalance" skews the comparison making
10390 * remote CPUs look much more favourable. When considering
10391 * cross-domain, add imbalance to the load on the remote node
10392 * and consider staying local.
10393 */
10394
10395 if ((sd->flags & SD_NUMA) &&
10396 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
10397 return NULL;
10398
10399 /*
10400 * If the local group is less loaded than the selected
10401 * idlest group don't try and push any tasks.
10402 */
10403 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
10404 return NULL;
10405
10406 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
10407 return NULL;
10408 break;
10409
10410 case group_imbalanced:
10411 case group_asym_packing:
10412 case group_smt_balance:
		/* Those types are not used in the slow wakeup path */
10414 return NULL;
10415
10416 case group_misfit_task:
10417 /* Select group with the highest max capacity */
10418 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
10419 return NULL;
10420 break;
10421
10422 case group_has_spare:
10423#ifdef CONFIG_NUMA
10424 if (sd->flags & SD_NUMA) {
10425 int imb_numa_nr = sd->imb_numa_nr;
10426#ifdef CONFIG_NUMA_BALANCING
10427 int idlest_cpu;
10428 /*
10429 * If there is spare capacity at NUMA, try to select
10430 * the preferred node
10431 */
			if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
10433 return NULL;
10434
			idlest_cpu = cpumask_first(sched_group_span(idlest));
			if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
10437 return idlest;
10438#endif /* CONFIG_NUMA_BALANCING */
10439 /*
10440 * Otherwise, keep the task close to the wakeup source
10441 * and improve locality if the number of running tasks
10442 * would remain below threshold where an imbalance is
10443 * allowed while accounting for the possibility the
10444 * task is pinned to a subset of CPUs. If there is a
10445 * real need of migration, periodic load balance will
10446 * take care of it.
10447 */
10448 if (p->nr_cpus_allowed != NR_CPUS) {
10449 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
10450
				cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
10452 imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
10453 }
10454
10455 imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
10456 if (!adjust_numa_imbalance(imbalance,
						   local_sgs.sum_nr_running + 1,
10458 imb_numa_nr)) {
10459 return NULL;
10460 }
10461 }
10462#endif /* CONFIG_NUMA */
10463
10464 /*
10465 * Select group with highest number of idle CPUs. We could also
10466 * compare the utilization which is more stable but it can end
10467 * up that the group has less spare capacity but finally more
		 * idle CPUs which means more opportunity to run tasks.
10469 */
10470 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
10471 return NULL;
10472 break;
10473 }
10474
10475 return idlest;
10476}
10477
10478static void update_idle_cpu_scan(struct lb_env *env,
10479 unsigned long sum_util)
10480{
10481 struct sched_domain_shared *sd_share;
10482 int llc_weight, pct;
10483 u64 x, y, tmp;
10484 /*
10485 * Update the number of CPUs to scan in LLC domain, which could
10486 * be used as a hint in select_idle_cpu(). The update of sd_share
10487 * could be expensive because it is within a shared cache line.
10488 * So the write of this hint only occurs during periodic load
10489 * balancing, rather than CPU_NEWLY_IDLE, because the latter
10490 * can fire way more frequently than the former.
10491 */
10492 if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
10493 return;
10494
10495 llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
10496 if (env->sd->span_weight != llc_weight)
10497 return;
10498
10499 sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
10500 if (!sd_share)
10501 return;
10502
10503 /*
10504 * The number of CPUs to search drops as sum_util increases, when
10505 * sum_util hits 85% or above, the scan stops.
10506 * The reason to choose 85% as the threshold is because this is the
10507 * imbalance_pct(117) when a LLC sched group is overloaded.
10508 *
10509 * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
10510 * and y'= y / SCHED_CAPACITY_SCALE
10511 *
10512 * x is the ratio of sum_util compared to the CPU capacity:
10513 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
10514 * y' is the ratio of CPUs to be scanned in the LLC domain,
10515 * and the number of CPUs to scan is calculated by:
10516 *
10517 * nr_scan = llc_weight * y' [2]
10518 *
10519 * When x hits the threshold of overloaded, AKA, when
10520 * x = 100 / pct, y drops to 0. According to [1],
10521 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
10522 *
10523 * Scale x by SCHED_CAPACITY_SCALE:
10524 * x' = sum_util / llc_weight; [3]
10525 *
10526 * and finally [1] becomes:
10527 * y = SCHED_CAPACITY_SCALE -
10528 * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
10529 *
10530 */
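	/*
	 * Example: with llc_weight == 16, imbalance_pct == 117 and
	 * sum_util == 8192 (the LLC half utilized): x' == 512,
	 * y == 1024 - 512*512*117*117 / (10000*1024) == 1024 - 350 == 674,
	 * so nr_scan == 674 * 16 / 1024 == 10 CPUs.
	 */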
10531 /* equation [3] */
10532 x = sum_util;
10533 do_div(x, llc_weight);
10534
10535 /* equation [4] */
10536 pct = env->sd->imbalance_pct;
10537 tmp = x * x * pct * pct;
10538 do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
10539 tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
10540 y = SCHED_CAPACITY_SCALE - tmp;
10541
10542 /* equation [2] */
10543 y *= llc_weight;
10544 do_div(y, SCHED_CAPACITY_SCALE);
10545 if ((int)y != sd_share->nr_idle_scan)
10546 WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
10547}
10548
10549/**
10550 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
10551 * @env: The load balancing environment.
10552 * @sds: variable to hold the statistics for this sched_domain.
10553 */
10554
10555static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
10556{
10557 struct sched_group *sg = env->sd->groups;
10558 struct sg_lb_stats *local = &sds->local_stat;
10559 struct sg_lb_stats tmp_sgs;
10560 unsigned long sum_util = 0;
10561 int sg_status = 0;
10562
10563 do {
10564 struct sg_lb_stats *sgs = &tmp_sgs;
10565 int local_group;
10566
		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
10568 if (local_group) {
10569 sds->local = sg;
10570 sgs = local;
10571
10572 if (env->idle != CPU_NEWLY_IDLE ||
10573 time_after_eq(jiffies, sg->sgc->next_update))
				update_group_capacity(env->sd, env->dst_cpu);
10575 }
10576
		update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
10578
10579 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
10580 sds->busiest = sg;
10581 sds->busiest_stat = *sgs;
10582 }
10583
10584 /* Now, start updating sd_lb_stats */
10585 sds->total_load += sgs->group_load;
10586 sds->total_capacity += sgs->group_capacity;
10587
10588 sum_util += sgs->group_util;
10589 sg = sg->next;
10590 } while (sg != env->sd->groups);
10591
10592 /*
10593 * Indicate that the child domain of the busiest group prefers tasks
10594 * go to a child's sibling domains first. NB the flags of a sched group
10595 * are those of the child domain.
10596 */
10597 if (sds->busiest)
10598 sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
10599
10600
10601 if (env->sd->flags & SD_NUMA)
		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
10603
10604 if (!env->sd->parent) {
10605 struct root_domain *rd = env->dst_rq->rd;
10606
10607 /* update overload indicator if we are at root domain */
10608 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
10609
10610 /* Update over-utilization (tipping point, U >= 0) indicator */
10611 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
		trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
10613 } else if (sg_status & SG_OVERUTILIZED) {
10614 struct root_domain *rd = env->dst_rq->rd;
10615
10616 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
10617 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
10618 }
10619
10620 update_idle_cpu_scan(env, sum_util);
10621}
10622
10623/**
10624 * calculate_imbalance - Calculate the amount of imbalance present within the
10625 * groups of a given sched_domain during load balance.
10626 * @env: load balance environment
10627 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
10628 */
10629static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
10630{
10631 struct sg_lb_stats *local, *busiest;
10632
10633 local = &sds->local_stat;
10634 busiest = &sds->busiest_stat;
10635
10636 if (busiest->group_type == group_misfit_task) {
10637 if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
10638 /* Set imbalance to allow misfit tasks to be balanced. */
10639 env->migration_type = migrate_misfit;
10640 env->imbalance = 1;
10641 } else {
10642 /*
10643 * Set load imbalance to allow moving task from cpu
10644 * with reduced capacity.
10645 */
10646 env->migration_type = migrate_load;
10647 env->imbalance = busiest->group_misfit_task_load;
10648 }
10649 return;
10650 }
10651
10652 if (busiest->group_type == group_asym_packing) {
10653 /*
10654 * In case of asym capacity, we will try to migrate all load to
10655 * the preferred CPU.
10656 */
10657 env->migration_type = migrate_task;
10658 env->imbalance = busiest->sum_h_nr_running;
10659 return;
10660 }
10661
10662 if (busiest->group_type == group_smt_balance) {
10663 /* Reduce number of tasks sharing CPU capacity */
10664 env->migration_type = migrate_task;
10665 env->imbalance = 1;
10666 return;
10667 }
10668
10669 if (busiest->group_type == group_imbalanced) {
10670 /*
10671 * In the group_imb case we cannot rely on group-wide averages
10672 * to ensure CPU-load equilibrium, try to move any task to fix
10673 * the imbalance. The next load balance will take care of
10674 * balancing back the system.
10675 */
10676 env->migration_type = migrate_task;
10677 env->imbalance = 1;
10678 return;
10679 }
10680
10681 /*
10682 * Try to use spare capacity of local group without overloading it or
10683 * emptying busiest.
10684 */
10685 if (local->group_type == group_has_spare) {
10686 if ((busiest->group_type > group_fully_busy) &&
10687 !(env->sd->flags & SD_SHARE_LLC)) {
10688 /*
10689 * If busiest is overloaded, try to fill spare
10690 * capacity. This might end up creating spare capacity
10691 * in busiest or busiest still being overloaded but
10692 * there is no simple way to directly compute the
10693 * amount of load to migrate in order to balance the
10694 * system.
10695 */
10696 env->migration_type = migrate_util;
10697 env->imbalance = max(local->group_capacity, local->group_util) -
10698 local->group_util;
10699
10700 /*
10701 * In some cases, the group's utilization is max or even
10702 * higher than capacity because of migrations but the
10703 * local CPU is (newly) idle. There is at least one
10704 * waiting task in this overloaded busiest group. Let's
10705 * try to pull it.
10706 */
10707 if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
10708 env->migration_type = migrate_task;
10709 env->imbalance = 1;
10710 }
10711
10712 return;
10713 }
10714
10715 if (busiest->group_weight == 1 || sds->prefer_sibling) {
10716 /*
10717 * When prefer sibling, evenly spread running tasks on
10718 * groups.
10719 */
10720 env->migration_type = migrate_task;
10721 env->imbalance = sibling_imbalance(env, sds, busiest, local);
10722 } else {
10723
10724 /*
10725 * If there is no overload, we just want to even the number of
10726 * idle cpus.
10727 */
10728 env->migration_type = migrate_task;
10729 env->imbalance = max_t(long, 0,
10730 (local->idle_cpus - busiest->idle_cpus));
10731 }
10732
10733#ifdef CONFIG_NUMA
10734 /* Consider allowing a small imbalance between NUMA groups */
10735 if (env->sd->flags & SD_NUMA) {
		env->imbalance = adjust_numa_imbalance(env->imbalance,
						       local->sum_nr_running + 1,
						       env->sd->imb_numa_nr);
10739 }
10740#endif
10741
10742 /* Number of tasks to move to restore balance */
10743 env->imbalance >>= 1;
10744
10745 return;
10746 }
10747
10748 /*
10749 * Local is fully busy but has to take more load to relieve the
10750 * busiest group
10751 */
10752 if (local->group_type < group_overloaded) {
10753 /*
10754 * Local will become overloaded so the avg_load metrics are
10755 * finally needed.
10756 */
10757
10758 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
10759 local->group_capacity;
10760
10761 /*
10762 * If the local group is more loaded than the selected
10763 * busiest group don't try to pull any tasks.
10764 */
10765 if (local->avg_load >= busiest->avg_load) {
10766 env->imbalance = 0;
10767 return;
10768 }
10769
10770 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
10771 sds->total_capacity;
10772
10773 /*
10774 * If the local group is more loaded than the average system
10775 * load, don't try to pull any tasks.
10776 */
10777 if (local->avg_load >= sds->avg_load) {
10778 env->imbalance = 0;
10779 return;
10780 }
10781
10782 }
10783
10784 /*
10785 * Both group are or will become overloaded and we're trying to get all
10786 * the CPUs to the average_load, so we don't want to push ourselves
10787 * above the average load, nor do we wish to reduce the max loaded CPU
10788 * below the average load. At the same time, we also don't want to
10789 * reduce the group load below the group capacity. Thus we look for
10790 * the minimum possible imbalance.
10791 */
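	/*
	 * Example: busiest avg_load == 1200, local avg_load == 800, domain
	 * avg_load == 1000, both group capacities == 1024:
	 * imbalance = min((1200 - 1000) * 1024, (1000 - 800) * 1024) / 1024 = 200.
	 */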
10792 env->migration_type = migrate_load;
10793 env->imbalance = min(
10794 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
10795 (sds->avg_load - local->avg_load) * local->group_capacity
10796 ) / SCHED_CAPACITY_SCALE;
10797}
10798
10799/******* find_busiest_group() helpers end here *********************/
10800
10801/*
10802 * Decision matrix according to the local and busiest group type:
10803 *
10804 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
10805 * has_spare nr_idle balanced N/A N/A balanced balanced
10806 * fully_busy nr_idle nr_idle N/A N/A balanced balanced
10807 * misfit_task force N/A N/A N/A N/A N/A
10808 * asym_packing force force N/A N/A force force
10809 * imbalanced force force N/A N/A force force
10810 * overloaded force force N/A N/A force avg_load
10811 *
10812 * N/A : Not Applicable because already filtered while updating
10813 * statistics.
10814 * balanced : The system is balanced for these 2 groups.
10815 * force : Calculate the imbalance as load migration is probably needed.
10816 * avg_load : Only if imbalance is significant enough.
10817 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
10818 * different in groups.
10819 */
10820
10821/**
10822 * find_busiest_group - Returns the busiest group within the sched_domain
10823 * if there is an imbalance.
10824 * @env: The load balancing environment.
10825 *
10826 * Also calculates the amount of runnable load which should be moved
10827 * to restore balance.
10828 *
10829 * Return: - The busiest group if imbalance exists.
10830 */
10831static struct sched_group *find_busiest_group(struct lb_env *env)
10832{
10833 struct sg_lb_stats *local, *busiest;
10834 struct sd_lb_stats sds;
10835
	init_sd_lb_stats(&sds);
10837
10838 /*
10839 * Compute the various statistics relevant for load balancing at
10840 * this level.
10841 */
	update_sd_lb_stats(env, &sds);
10843
10844 /* There is no busy sibling group to pull tasks from */
10845 if (!sds.busiest)
10846 goto out_balanced;
10847
10848 busiest = &sds.busiest_stat;
10849
10850 /* Misfit tasks should be dealt with regardless of the avg load */
10851 if (busiest->group_type == group_misfit_task)
10852 goto force_balance;
10853
10854 if (sched_energy_enabled()) {
10855 struct root_domain *rd = env->dst_rq->rd;
10856
10857 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
10858 goto out_balanced;
10859 }
10860
10861 /* ASYM feature bypasses nice load balance check */
10862 if (busiest->group_type == group_asym_packing)
10863 goto force_balance;
10864
10865 /*
10866 * If the busiest group is imbalanced the below checks don't
10867 * work because they assume all things are equal, which typically
10868 * isn't true due to cpus_ptr constraints and the like.
10869 */
10870 if (busiest->group_type == group_imbalanced)
10871 goto force_balance;
10872
10873 local = &sds.local_stat;
10874 /*
10875 * If the local group is busier than the selected busiest group
10876 * don't try and pull any tasks.
10877 */
10878 if (local->group_type > busiest->group_type)
10879 goto out_balanced;
10880
10881 /*
10882 * When groups are overloaded, use the avg_load to ensure fairness
10883 * between tasks.
10884 */
10885 if (local->group_type == group_overloaded) {
10886 /*
10887 * If the local group is more loaded than the selected
10888 * busiest group don't try to pull any tasks.
10889 */
10890 if (local->avg_load >= busiest->avg_load)
10891 goto out_balanced;
10892
10893 /* XXX broken for overlapping NUMA groups */
10894 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
10895 sds.total_capacity;
10896
10897 /*
10898 * Don't pull any tasks if this group is already above the
10899 * domain average load.
10900 */
10901 if (local->avg_load >= sds.avg_load)
10902 goto out_balanced;
10903
10904 /*
10905 * If the busiest group is more loaded, use imbalance_pct to be
10906 * conservative.
10907 */
10908 if (100 * busiest->avg_load <=
10909 env->sd->imbalance_pct * local->avg_load)
10910 goto out_balanced;
10911 }
10912
10913 /*
10914 * Try to move all excess tasks to a sibling domain of the busiest
10915 * group's child domain.
10916 */
10917 if (sds.prefer_sibling && local->group_type == group_has_spare &&
	    sibling_imbalance(env, &sds, busiest, local) > 1)
10919 goto force_balance;
10920
10921 if (busiest->group_type != group_overloaded) {
10922 if (env->idle == CPU_NOT_IDLE) {
10923 /*
10924 * If the busiest group is not overloaded (and as a
10925 * result the local one too) but this CPU is already
10926 * busy, let another idle CPU try to pull task.
10927 */
10928 goto out_balanced;
10929 }
10930
10931 if (busiest->group_type == group_smt_balance &&
10932 smt_vs_nonsmt_groups(sg1: sds.local, sg2: sds.busiest)) {
10933 /* Let non SMT CPU pull from SMT CPU sharing with sibling */
10934 goto force_balance;
10935 }
10936
10937 if (busiest->group_weight > 1 &&
10938 local->idle_cpus <= (busiest->idle_cpus + 1)) {
10939 /*
10940 * If the busiest group is not overloaded
10941 * and there is no imbalance between this and busiest
10942 * group wrt idle CPUs, it is balanced. The imbalance
10943 * becomes significant if the diff is greater than 1
10944 * otherwise we might end up just moving the imbalance
10945 * on another group. Of course this applies only if
10946 * there is more than 1 CPU per group.
10947 */
10948 goto out_balanced;
10949 }
10950
10951 if (busiest->sum_h_nr_running == 1) {
10952 /*
10953 * busiest doesn't have any tasks waiting to run
10954 */
10955 goto out_balanced;
10956 }
10957 }
10958
10959force_balance:
10960 /* Looks like there is an imbalance. Compute it */
10961 calculate_imbalance(env, sds: &sds);
10962 return env->imbalance ? sds.busiest : NULL;
10963
10964out_balanced:
10965 env->imbalance = 0;
10966 return NULL;
10967}
10968
10969/*
10970 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
10971 */
10972static struct rq *find_busiest_queue(struct lb_env *env,
10973 struct sched_group *group)
10974{
10975 struct rq *busiest = NULL, *rq;
10976 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
10977 unsigned int busiest_nr = 0;
10978 int i;
10979
10980 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
10981 unsigned long capacity, load, util;
10982 unsigned int nr_running;
10983 enum fbq_type rt;
10984
10985 rq = cpu_rq(i);
10986 rt = fbq_classify_rq(rq);
10987
10988 /*
10989 * We classify groups/runqueues into three groups:
10990 * - regular: there are !numa tasks
10991 * - remote: there are numa tasks that run on the 'wrong' node
10992 * - all: there is no distinction
10993 *
10994 * In order to avoid migrating ideally placed numa tasks,
10995 * ignore those when there's better options.
10996 *
10997 * If we ignore the actual busiest queue to migrate another
10998 * task, the next balance pass can still reduce the busiest
10999 * queue by moving tasks around inside the node.
11000 *
11001 * If we cannot move enough load due to this classification
11002 * the next pass will adjust the group classification and
11003 * allow migration of more tasks.
11004 *
11005 * Both cases only affect the total convergence complexity.
11006 */
11007 if (rt > env->fbq_type)
11008 continue;
11009
11010 nr_running = rq->cfs.h_nr_running;
11011 if (!nr_running)
11012 continue;
11013
11014 capacity = capacity_of(cpu: i);
11015
11016 /*
11017 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
11018 * eventually lead to active_balancing high->low capacity.
11019 * Higher per-CPU capacity is considered better than balancing
11020 * average load.
11021 */
11022 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
11023 !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
11024 nr_running == 1)
11025 continue;
11026
11027 /*
11028 * Make sure we only pull tasks from a CPU of lower priority
11029 * when balancing between SMT siblings.
11030 *
11031 * If balancing between cores, let lower priority CPUs help
11032 * SMT cores with more than one busy sibling.
11033 */
11034 if (sched_asym(sd: env->sd, dst_cpu: i, src_cpu: env->dst_cpu) && nr_running == 1)
11035 continue;
11036
11037 switch (env->migration_type) {
11038 case migrate_load:
11039 /*
11040 * When comparing with load imbalance, use cpu_load()
11041 * which is not scaled with the CPU capacity.
11042 */
11043 load = cpu_load(rq);
11044
11045 if (nr_running == 1 && load > env->imbalance &&
11046 !check_cpu_capacity(rq, sd: env->sd))
11047 break;
11048
11049 /*
11050 * For the load comparisons with the other CPUs,
11051 * consider the cpu_load() scaled with the CPU
11052 * capacity, so that the load can be moved away
11053 * from the CPU that is potentially running at a
11054 * lower capacity.
11055 *
11056 * Thus we're looking for max(load_i / capacity_i);
11057 * cross-multiplying to get rid of the division, this
11058 * works out to:
11059 * load_i * capacity_j > load_j * capacity_i;
11060 * where j is our previous maximum.
11061 */
11062 if (load * busiest_capacity > busiest_load * capacity) {
11063 busiest_load = load;
11064 busiest_capacity = capacity;
11065 busiest = rq;
11066 }
11067 break;
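 /*
 * Numerical sketch of the cross-multiplication above, with made-up
 * values: load_i = 600, capacity_i = 512, load_j = 800,
 * capacity_j = 1024. Comparing ratios, 600/512 (~1.17) is larger
 * than 800/1024 (~0.78); the integer form 600 * 1024 > 800 * 512
 * gives the same answer without a division.
 */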
11068
11069 case migrate_util:
11070 util = cpu_util_cfs_boost(cpu: i);
11071
11072 /*
11073 * Don't try to pull utilization from a CPU with one
11074 * running task. Whatever its utilization, we will fail to
11075 * detach the task.
11076 */
11077 if (nr_running <= 1)
11078 continue;
11079
11080 if (busiest_util < util) {
11081 busiest_util = util;
11082 busiest = rq;
11083 }
11084 break;
11085
11086 case migrate_task:
11087 if (busiest_nr < nr_running) {
11088 busiest_nr = nr_running;
11089 busiest = rq;
11090 }
11091 break;
11092
11093 case migrate_misfit:
11094 /*
11095 * For ASYM_CPUCAPACITY domains with misfit tasks we
11096 * simply seek the "biggest" misfit task.
11097 */
11098 if (rq->misfit_task_load > busiest_load) {
11099 busiest_load = rq->misfit_task_load;
11100 busiest = rq;
11101 }
11102
11103 break;
11104
11105 }
11106 }
11107
11108 return busiest;
11109}
11110
11111/*
11112 * Max backoff if we encounter pinned tasks. The value is pretty arbitrary;
11113 * it only matters that it is large enough.
11114 */
11115#define MAX_PINNED_INTERVAL 512
11116
11117static inline bool
11118asym_active_balance(struct lb_env *env)
11119{
11120 /*
11121 * ASYM_PACKING needs to force migrate tasks from busy but lower
11122 * priority CPUs in order to pack all tasks in the highest priority
11123 * CPUs. When done between cores, do it only if the whole core is
11124 * idle.
11125 *
11126 * If @env::src_cpu is an SMT core with busy siblings, let
11127 * the lower priority @env::dst_cpu help it. Do not follow
11128 * CPU priority.
11129 */
11130 return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(sd: env->sd, cpu: env->dst_cpu) &&
11131 (sched_asym_prefer(a: env->dst_cpu, b: env->src_cpu) ||
11132 !sched_use_asym_prio(sd: env->sd, cpu: env->src_cpu));
11133}
11134
11135static inline bool
11136imbalanced_active_balance(struct lb_env *env)
11137{
11138 struct sched_domain *sd = env->sd;
11139
11140 /*
11141 * The imbalanced case includes the case of pinned tasks preventing a fair
11142 * distribution of the load on the system, but also the even distribution of
11143 * threads on a system with spare capacity.
11144 */
11145 if ((env->migration_type == migrate_task) &&
11146 (sd->nr_balance_failed > sd->cache_nice_tries+2))
11147 return 1;
11148
11149 return 0;
11150}
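/*
 * As a rough illustration (cache_nice_tries varies per domain; the value here
 * is only an assumption): with cache_nice_tries == 1, a migrate_task style
 * balance has to fail more than 3 consecutive times
 * (nr_balance_failed > 1 + 2) before the imbalanced case resorts to an
 * active balance.
 */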
11151
11152static int need_active_balance(struct lb_env *env)
11153{
11154 struct sched_domain *sd = env->sd;
11155
11156 if (asym_active_balance(env))
11157 return 1;
11158
11159 if (imbalanced_active_balance(env))
11160 return 1;
11161
11162 /*
11163 * The dst_cpu is idle and the src_cpu has only 1 CFS task.
11164 * It's worth migrating the task if the src_cpu's capacity is reduced
11165 * because of other sched_class or IRQs if more capacity stays
11166 * available on dst_cpu.
11167 */
11168 if ((env->idle != CPU_NOT_IDLE) &&
11169 (env->src_rq->cfs.h_nr_running == 1)) {
11170 if ((check_cpu_capacity(rq: env->src_rq, sd)) &&
11171 (capacity_of(cpu: env->src_cpu)*sd->imbalance_pct < capacity_of(cpu: env->dst_cpu)*100))
11172 return 1;
11173 }
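 /*
 * To put the capacity check above in numbers (imbalance_pct is
 * domain-specific; 117 here is just an example): with
 * capacity_of(src_cpu) == 800 and capacity_of(dst_cpu) == 1000,
 * 800 * 117 = 93600 < 1000 * 100 = 100000, so migrating the single
 * task towards the less-pressured dst_cpu is considered worthwhile.
 */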
11174
11175 if (env->migration_type == migrate_misfit)
11176 return 1;
11177
11178 return 0;
11179}
11180
11181static int active_load_balance_cpu_stop(void *data);
11182
11183static int should_we_balance(struct lb_env *env)
11184{
11185 struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
11186 struct sched_group *sg = env->sd->groups;
11187 int cpu, idle_smt = -1;
11188
11189 /*
11190 * Ensure the balancing environment is consistent; can happen
11191 * when the softirq triggers 'during' hotplug.
11192 */
11193 if (!cpumask_test_cpu(cpu: env->dst_cpu, cpumask: env->cpus))
11194 return 0;
11195
11196 /*
11197 * In the newly idle case, we will allow all the CPUs
11198 * to do the newly idle load balance.
11199 *
11200 * However, we bail out if we already have tasks or a wakeup pending,
11201 * to optimize wakeup latency.
11202 */
11203 if (env->idle == CPU_NEWLY_IDLE) {
11204 if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
11205 return 0;
11206 return 1;
11207 }
11208
11209 cpumask_copy(dstp: swb_cpus, srcp: group_balance_mask(sg));
11210 /* Try to find first idle CPU */
11211 for_each_cpu_and(cpu, swb_cpus, env->cpus) {
11212 if (!idle_cpu(cpu))
11213 continue;
11214
11215 /*
11216 * Don't balance to idle SMT in busy core right away when
11217 * balancing cores, but remember the first idle SMT CPU for
11218 * later consideration. Find CPU on an idle core first.
11219 */
11220 if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
11221 if (idle_smt == -1)
11222 idle_smt = cpu;
11223 /*
11224 * If the core is not idle and the first idle SMT sibling has
11225 * already been found, there is no need to check the other
11226 * SMT siblings for idleness:
11227 */
11228#ifdef CONFIG_SCHED_SMT
11229 cpumask_andnot(dstp: swb_cpus, src1p: swb_cpus, src2p: cpu_smt_mask(cpu));
11230#endif
11231 continue;
11232 }
11233
11234 /*
11235 * Are we the first idle core in a non-SMT domain or higher,
11236 * or the first idle CPU in an SMT domain?
11237 */
11238 return cpu == env->dst_cpu;
11239 }
11240
11241 /* Are we the first idle CPU with busy siblings? */
11242 if (idle_smt != -1)
11243 return idle_smt == env->dst_cpu;
11244
11245 /* Are we the first CPU of this group ? */
11246 return group_balance_cpu(sg) == env->dst_cpu;
11247}
11248
11249/*
11250 * Check this_cpu to ensure it is balanced within domain. Attempt to move
11251 * tasks if there is an imbalance.
11252 */
11253static int load_balance(int this_cpu, struct rq *this_rq,
11254 struct sched_domain *sd, enum cpu_idle_type idle,
11255 int *continue_balancing)
11256{
11257 int ld_moved, cur_ld_moved, active_balance = 0;
11258 struct sched_domain *sd_parent = sd->parent;
11259 struct sched_group *group;
11260 struct rq *busiest;
11261 struct rq_flags rf;
11262 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
11263 struct lb_env env = {
11264 .sd = sd,
11265 .dst_cpu = this_cpu,
11266 .dst_rq = this_rq,
11267 .dst_grpmask = group_balance_mask(sg: sd->groups),
11268 .idle = idle,
11269 .loop_break = SCHED_NR_MIGRATE_BREAK,
11270 .cpus = cpus,
11271 .fbq_type = all,
11272 .tasks = LIST_HEAD_INIT(env.tasks),
11273 };
11274
11275 cpumask_and(dstp: cpus, src1p: sched_domain_span(sd), cpu_active_mask);
11276
11277 schedstat_inc(sd->lb_count[idle]);
11278
11279redo:
11280 if (!should_we_balance(env: &env)) {
11281 *continue_balancing = 0;
11282 goto out_balanced;
11283 }
11284
11285 group = find_busiest_group(env: &env);
11286 if (!group) {
11287 schedstat_inc(sd->lb_nobusyg[idle]);
11288 goto out_balanced;
11289 }
11290
11291 busiest = find_busiest_queue(env: &env, group);
11292 if (!busiest) {
11293 schedstat_inc(sd->lb_nobusyq[idle]);
11294 goto out_balanced;
11295 }
11296
11297 WARN_ON_ONCE(busiest == env.dst_rq);
11298
11299 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
11300
11301 env.src_cpu = busiest->cpu;
11302 env.src_rq = busiest;
11303
11304 ld_moved = 0;
11305 /* Clear this flag as soon as we find a pullable task */
11306 env.flags |= LBF_ALL_PINNED;
11307 if (busiest->nr_running > 1) {
11308 /*
11309 * Attempt to move tasks. If find_busiest_group has found
11310 * an imbalance but busiest->nr_running <= 1, the group is
11311 * still unbalanced. ld_moved simply stays zero, so it is
11312 * correctly treated as an imbalance.
11313 */
11314 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
11315
11316more_balance:
11317 rq_lock_irqsave(rq: busiest, rf: &rf);
11318 update_rq_clock(rq: busiest);
11319
11320 /*
11321 * cur_ld_moved - load moved in current iteration
11322 * ld_moved - cumulative load moved across iterations
11323 */
11324 cur_ld_moved = detach_tasks(env: &env);
11325
11326 /*
11327 * We've detached some tasks from busiest_rq. Every
11328 * task is marked "TASK_ON_RQ_MIGRATING", so we can safely
11329 * unlock busiest->lock and be sure that nobody can
11330 * manipulate the tasks in parallel.
11331 * See task_rq_lock() family for the details.
11332 */
11333
11334 rq_unlock(rq: busiest, rf: &rf);
11335
11336 if (cur_ld_moved) {
11337 attach_tasks(env: &env);
11338 ld_moved += cur_ld_moved;
11339 }
11340
11341 local_irq_restore(rf.flags);
11342
11343 if (env.flags & LBF_NEED_BREAK) {
11344 env.flags &= ~LBF_NEED_BREAK;
11345 /* Stop if we tried all running tasks */
11346 if (env.loop < busiest->nr_running)
11347 goto more_balance;
11348 }
11349
11350 /*
11351 * Revisit (affine) tasks on src_cpu that couldn't be moved to
11352 * us and move them to an alternate dst_cpu in our sched_group
11353 * where they can run. The upper limit on how many times we
11354 * iterate on same src_cpu is dependent on number of CPUs in our
11355 * sched_group.
11356 *
11357 * This changes load balance semantics a bit on who can move
11358 * load to a given_cpu. In addition to the given_cpu itself
11359 * (or an ilb_cpu acting on its behalf where given_cpu is
11360 * nohz-idle), we now have balance_cpu in a position to move
11361 * load to given_cpu. In rare situations, this may cause
11362 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
11363 * _independently_ and at _same_ time to move some load to
11364 * given_cpu) causing excess load to be moved to given_cpu.
11365 * This however should not happen so much in practice and
11366 * moreover subsequent load balance cycles should correct the
11367 * excess load moved.
11368 */
11369 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
11370
11371 /* Prevent re-selecting dst_cpu via env's CPUs */
11372 __cpumask_clear_cpu(cpu: env.dst_cpu, dstp: env.cpus);
11373
11374 env.dst_rq = cpu_rq(env.new_dst_cpu);
11375 env.dst_cpu = env.new_dst_cpu;
11376 env.flags &= ~LBF_DST_PINNED;
11377 env.loop = 0;
11378 env.loop_break = SCHED_NR_MIGRATE_BREAK;
11379
11380 /*
11381 * Go back to "more_balance" rather than "redo" since we
11382 * need to continue with same src_cpu.
11383 */
11384 goto more_balance;
11385 }
11386
11387 /*
11388 * We failed to reach balance because of affinity.
11389 */
11390 if (sd_parent) {
11391 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11392
11393 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
11394 *group_imbalance = 1;
11395 }
11396
11397 /* All tasks on this runqueue were pinned by CPU affinity */
11398 if (unlikely(env.flags & LBF_ALL_PINNED)) {
11399 __cpumask_clear_cpu(cpu: cpu_of(rq: busiest), dstp: cpus);
11400 /*
11401 * Attempting to continue load balancing at the current
11402 * sched_domain level only makes sense if there are
11403 * active CPUs remaining as possible busiest CPUs to
11404 * pull load from which are not contained within the
11405 * destination group that is receiving any migrated
11406 * load.
11407 */
11408 if (!cpumask_subset(src1p: cpus, src2p: env.dst_grpmask)) {
11409 env.loop = 0;
11410 env.loop_break = SCHED_NR_MIGRATE_BREAK;
11411 goto redo;
11412 }
11413 goto out_all_pinned;
11414 }
11415 }
11416
11417 if (!ld_moved) {
11418 schedstat_inc(sd->lb_failed[idle]);
11419 /*
11420 * Increment the failure counter only on periodic balance.
11421 * We do not want newidle balance, which can be very
11422 * frequent, to pollute the failure counter, causing
11423 * excessive cache_hot migrations and active balances.
11424 */
11425 if (idle != CPU_NEWLY_IDLE)
11426 sd->nr_balance_failed++;
11427
11428 if (need_active_balance(env: &env)) {
11429 unsigned long flags;
11430
11431 raw_spin_rq_lock_irqsave(busiest, flags);
11432
11433 /*
11434 * Don't kick the active_load_balance_cpu_stop,
11435 * if the curr task on busiest CPU can't be
11436 * moved to this_cpu:
11437 */
11438 if (!cpumask_test_cpu(cpu: this_cpu, cpumask: busiest->curr->cpus_ptr)) {
11439 raw_spin_rq_unlock_irqrestore(rq: busiest, flags);
11440 goto out_one_pinned;
11441 }
11442
11443 /* Record that we found at least one task that could run on this_cpu */
11444 env.flags &= ~LBF_ALL_PINNED;
11445
11446 /*
11447 * ->active_balance synchronizes accesses to
11448 * ->active_balance_work. Once set, it's cleared
11449 * only after active load balance is finished.
11450 */
11451 if (!busiest->active_balance) {
11452 busiest->active_balance = 1;
11453 busiest->push_cpu = this_cpu;
11454 active_balance = 1;
11455 }
11456
11457 preempt_disable();
11458 raw_spin_rq_unlock_irqrestore(rq: busiest, flags);
11459 if (active_balance) {
11460 stop_one_cpu_nowait(cpu: cpu_of(rq: busiest),
11461 fn: active_load_balance_cpu_stop, arg: busiest,
11462 work_buf: &busiest->active_balance_work);
11463 }
11464 preempt_enable();
11465 }
11466 } else {
11467 sd->nr_balance_failed = 0;
11468 }
11469
11470 if (likely(!active_balance) || need_active_balance(env: &env)) {
11471 /* We were unbalanced, so reset the balancing interval */
11472 sd->balance_interval = sd->min_interval;
11473 }
11474
11475 goto out;
11476
11477out_balanced:
11478 /*
11479 * We reach balance although we may have faced some affinity
11480 * constraints. Clear the imbalance flag only if other tasks got
11481 * a chance to move and fix the imbalance.
11482 */
11483 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
11484 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11485
11486 if (*group_imbalance)
11487 *group_imbalance = 0;
11488 }
11489
11490out_all_pinned:
11491 /*
11492 * We reach balance because all tasks are pinned at this level so
11493 * we can't migrate them. Let the imbalance flag set so parent level
11494 * can try to migrate them.
11495 */
11496 schedstat_inc(sd->lb_balanced[idle]);
11497
11498 sd->nr_balance_failed = 0;
11499
11500out_one_pinned:
11501 ld_moved = 0;
11502
11503 /*
11504 * newidle_balance() disregards balance intervals, so we could
11505 * repeatedly reach this code, which would lead to balance_interval
11506 * skyrocketing in a short amount of time. Skip the balance_interval
11507 * increase logic to avoid that.
11508 */
11509 if (env.idle == CPU_NEWLY_IDLE)
11510 goto out;
11511
11512 /* tune up the balancing interval */
11513 if ((env.flags & LBF_ALL_PINNED &&
11514 sd->balance_interval < MAX_PINNED_INTERVAL) ||
11515 sd->balance_interval < sd->max_interval)
11516 sd->balance_interval *= 2;
11517out:
11518 return ld_moved;
11519}
11520
11521static inline unsigned long
11522get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
11523{
11524 unsigned long interval = sd->balance_interval;
11525
11526 if (cpu_busy)
11527 interval *= sd->busy_factor;
11528
11529 /* scale ms to jiffies */
11530 interval = msecs_to_jiffies(m: interval);
11531
11532 /*
11533 * Reduce likelihood of busy balancing at higher domains racing with
11534 * balancing at lower domains by preventing their balancing periods
11535 * from being multiples of each other.
11536 */
11537 if (cpu_busy)
11538 interval -= 1;
11539
11540 interval = clamp(interval, 1UL, max_load_balance_interval);
11541
11542 return interval;
11543}
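/*
 * Example of the interval scaling above, with illustrative (not necessarily
 * default) values: balance_interval = 8ms, busy_factor = 16 and HZ = 1000
 * give 8 * 16 = 128ms -> 128 jiffies, minus 1 for the de-synchronization
 * step, then clamped against max_load_balance_interval.
 */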
11544
11545static inline void
11546update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
11547{
11548 unsigned long interval, next;
11549
11550 /* used by idle balance, so cpu_busy = 0 */
11551 interval = get_sd_balance_interval(sd, cpu_busy: 0);
11552 next = sd->last_balance + interval;
11553
11554 if (time_after(*next_balance, next))
11555 *next_balance = next;
11556}
11557
11558/*
11559 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
11560 * running tasks off the busiest CPU onto idle CPUs. It requires at
11561 * least 1 task to be running on each physical CPU where possible, and
11562 * avoids physical / logical imbalances.
11563 */
11564static int active_load_balance_cpu_stop(void *data)
11565{
11566 struct rq *busiest_rq = data;
11567 int busiest_cpu = cpu_of(rq: busiest_rq);
11568 int target_cpu = busiest_rq->push_cpu;
11569 struct rq *target_rq = cpu_rq(target_cpu);
11570 struct sched_domain *sd;
11571 struct task_struct *p = NULL;
11572 struct rq_flags rf;
11573
11574 rq_lock_irq(rq: busiest_rq, rf: &rf);
11575 /*
11576 * Between queueing the stop-work and running it is a hole in which
11577 * CPUs can become inactive. We should not move tasks from or to
11578 * inactive CPUs.
11579 */
11580 if (!cpu_active(cpu: busiest_cpu) || !cpu_active(cpu: target_cpu))
11581 goto out_unlock;
11582
11583 /* Make sure the requested CPU hasn't gone down in the meantime: */
11584 if (unlikely(busiest_cpu != smp_processor_id() ||
11585 !busiest_rq->active_balance))
11586 goto out_unlock;
11587
11588 /* Is there any task to move? */
11589 if (busiest_rq->nr_running <= 1)
11590 goto out_unlock;
11591
11592 /*
11593 * This condition is "impossible", if it occurs
11594 * we need to fix it. Originally reported by
11595 * Bjorn Helgaas on a 128-CPU setup.
11596 */
11597 WARN_ON_ONCE(busiest_rq == target_rq);
11598
11599 /* Search for an sd spanning us and the target CPU. */
11600 rcu_read_lock();
11601 for_each_domain(target_cpu, sd) {
11602 if (cpumask_test_cpu(cpu: busiest_cpu, cpumask: sched_domain_span(sd)))
11603 break;
11604 }
11605
11606 if (likely(sd)) {
11607 struct lb_env env = {
11608 .sd = sd,
11609 .dst_cpu = target_cpu,
11610 .dst_rq = target_rq,
11611 .src_cpu = busiest_rq->cpu,
11612 .src_rq = busiest_rq,
11613 .idle = CPU_IDLE,
11614 .flags = LBF_ACTIVE_LB,
11615 };
11616
11617 schedstat_inc(sd->alb_count);
11618 update_rq_clock(rq: busiest_rq);
11619
11620 p = detach_one_task(env: &env);
11621 if (p) {
11622 schedstat_inc(sd->alb_pushed);
11623 /* Active balancing done, reset the failure counter. */
11624 sd->nr_balance_failed = 0;
11625 } else {
11626 schedstat_inc(sd->alb_failed);
11627 }
11628 }
11629 rcu_read_unlock();
11630out_unlock:
11631 busiest_rq->active_balance = 0;
11632 rq_unlock(rq: busiest_rq, rf: &rf);
11633
11634 if (p)
11635 attach_one_task(rq: target_rq, p);
11636
11637 local_irq_enable();
11638
11639 return 0;
11640}
11641
11642static DEFINE_SPINLOCK(balancing);
11643
11644/*
11645 * Scale the max load_balance interval with the number of CPUs in the system.
11646 * This trades load-balance latency on larger machines for less cross talk.
11647 */
11648void update_max_interval(void)
11649{
11650 max_load_balance_interval = HZ*num_online_cpus()/10;
11651}
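/*
 * For example, with HZ == 250 and 8 CPUs online this yields
 * 250 * 8 / 10 = 200 jiffies, i.e. an 800ms cap on the balance interval.
 */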
11652
11653static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
11654{
11655 if (cost > sd->max_newidle_lb_cost) {
11656 /*
11657 * Track max cost of a domain to make sure to not delay the
11658 * next wakeup on the CPU.
11659 */
11660 sd->max_newidle_lb_cost = cost;
11661 sd->last_decay_max_lb_cost = jiffies;
11662 } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
11663 /*
11664 * it does not stay outdated while the current max cost is
11665 * actually shorter.
11666 * shorter.
11667 */
11668 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
11669 sd->last_decay_max_lb_cost = jiffies;
11670
11671 return true;
11672 }
11673
11674 return false;
11675}
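/*
 * The 253/256 factor above shaves roughly 1.2% off the tracked cost per
 * decay step; applied about once a second it takes on the order of a minute
 * (roughly 59 steps, since 0.988^59 ~= 0.5) for a stale max cost to halve.
 */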
11676
11677/*
11678 * It checks each scheduling domain to see if it is due to be balanced,
11679 * and initiates a balancing operation if so.
11680 *
11681 * Balancing parameters are set up in init_sched_domains.
11682 */
11683static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
11684{
11685 int continue_balancing = 1;
11686 int cpu = rq->cpu;
11687 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
11688 unsigned long interval;
11689 struct sched_domain *sd;
11690 /* Earliest time when we have to do rebalance again */
11691 unsigned long next_balance = jiffies + 60*HZ;
11692 int update_next_balance = 0;
11693 int need_serialize, need_decay = 0;
11694 u64 max_cost = 0;
11695
11696 rcu_read_lock();
11697 for_each_domain(cpu, sd) {
11698 /*
11699 * Decay the newidle max times here because this is a regular
11700 * visit to all the domains.
11701 */
11702 need_decay = update_newidle_cost(sd, cost: 0);
11703 max_cost += sd->max_newidle_lb_cost;
11704
11705 /*
11706 * Stop the load balance at this level. There is another
11707 * CPU in our sched group which is doing load balancing more
11708 * actively.
11709 */
11710 if (!continue_balancing) {
11711 if (need_decay)
11712 continue;
11713 break;
11714 }
11715
11716 interval = get_sd_balance_interval(sd, cpu_busy: busy);
11717
11718 need_serialize = sd->flags & SD_SERIALIZE;
11719 if (need_serialize) {
11720 if (!spin_trylock(lock: &balancing))
11721 goto out;
11722 }
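 /*
 * SD_SERIALIZE domains (typically the NUMA levels) are balanced
 * under the global 'balancing' spinlock; if another CPU already
 * holds it we simply skip this round instead of spinning.
 */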
11723
11724 if (time_after_eq(jiffies, sd->last_balance + interval)) {
11725 if (load_balance(this_cpu: cpu, this_rq: rq, sd, idle, continue_balancing: &continue_balancing)) {
11726 /*
11727 * The LBF_DST_PINNED logic could have changed
11728 * env->dst_cpu, so we can't know our idle
11729 * state even if we migrated tasks. Update it.
11730 */
11731 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
11732 busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
11733 }
11734 sd->last_balance = jiffies;
11735 interval = get_sd_balance_interval(sd, cpu_busy: busy);
11736 }
11737 if (need_serialize)
11738 spin_unlock(lock: &balancing);
11739out:
11740 if (time_after(next_balance, sd->last_balance + interval)) {
11741 next_balance = sd->last_balance + interval;
11742 update_next_balance = 1;
11743 }
11744 }
11745 if (need_decay) {
11746 /*
11747 * Ensure the rq-wide value also decays but keep it at a
11748 * reasonable floor to avoid funnies with rq->avg_idle.
11749 */
11750 rq->max_idle_balance_cost =
11751 max((u64)sysctl_sched_migration_cost, max_cost);
11752 }
11753 rcu_read_unlock();
11754
11755 /*
11756 * next_balance will be updated only when there is a need.
11757 * When the cpu is attached to null domain for ex, it will not be
11758 * updated.
11759 */
11760 if (likely(update_next_balance))
11761 rq->next_balance = next_balance;
11762
11763}
11764
11765static inline int on_null_domain(struct rq *rq)
11766{
11767 return unlikely(!rcu_dereference_sched(rq->sd));
11768}
11769
11770#ifdef CONFIG_NO_HZ_COMMON
11771/*
11772 * NOHZ idle load balancing (ILB) details:
11773 *
11774 * - When one of the busy CPUs notices that there may be an idle rebalancing
11775 * needed, it will kick the idle load balancer, which then does idle
11776 * load balancing for all the idle CPUs.
11777 *
11778 * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
11779 * anywhere yet.
11780 */
11781static inline int find_new_ilb(void)
11782{
11783 const struct cpumask *hk_mask;
11784 int ilb_cpu;
11785
11786 hk_mask = housekeeping_cpumask(type: HK_TYPE_MISC);
11787
11788 for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
11789
11790 if (ilb_cpu == smp_processor_id())
11791 continue;
11792
11793 if (idle_cpu(cpu: ilb_cpu))
11794 return ilb_cpu;
11795 }
11796
11797 return -1;
11798}
11799
11800/*
11801 * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
11802 * SMP function call (IPI).
11803 *
11804 * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
11805 */
11806static void kick_ilb(unsigned int flags)
11807{
11808 int ilb_cpu;
11809
11810 /*
11811 * Increase nohz.next_balance only if a full ilb is triggered, but
11812 * not if we only update stats.
11813 */
11814 if (flags & NOHZ_BALANCE_KICK)
11815 nohz.next_balance = jiffies+1;
11816
11817 ilb_cpu = find_new_ilb();
11818 if (ilb_cpu < 0)
11819 return;
11820
11821 /*
11822 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
11823 * the first flag owns it; cleared by nohz_csd_func().
11824 */
11825 flags = atomic_fetch_or(i: flags, nohz_flags(ilb_cpu));
11826 if (flags & NOHZ_KICK_MASK)
11827 return;
11828
11829 /*
11830 * This way we generate an IPI on the target CPU which
11831 * is idle, and the softirq performing NOHZ idle load balancing
11832 * will be run before returning from the IPI.
11833 */
11834 smp_call_function_single_async(cpu: ilb_cpu, csd: &cpu_rq(ilb_cpu)->nohz_csd);
11835}
11836
11837/*
11838 * Current decision point for kicking the idle load balancer in the presence
11839 * of idle CPUs in the system.
11840 */
11841static void nohz_balancer_kick(struct rq *rq)
11842{
11843 unsigned long now = jiffies;
11844 struct sched_domain_shared *sds;
11845 struct sched_domain *sd;
11846 int nr_busy, i, cpu = rq->cpu;
11847 unsigned int flags = 0;
11848
11849 if (unlikely(rq->idle_balance))
11850 return;
11851
11852 /*
11853 * We may have recently been in ticked or tickless idle mode. At the first
11854 * busy tick after returning from idle, we will update the busy stats.
11855 */
11856 nohz_balance_exit_idle(rq);
11857
11858 /*
11859 * None are in tickless mode and hence no need for NOHZ idle load
11860 * balancing:
11861 */
11862 if (likely(!atomic_read(&nohz.nr_cpus)))
11863 return;
11864
11865 if (READ_ONCE(nohz.has_blocked) &&
11866 time_after(now, READ_ONCE(nohz.next_blocked)))
11867 flags = NOHZ_STATS_KICK;
11868
11869 if (time_before(now, nohz.next_balance))
11870 goto out;
11871
11872 if (rq->nr_running >= 2) {
11873 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11874 goto out;
11875 }
11876
11877 rcu_read_lock();
11878
11879 sd = rcu_dereference(rq->sd);
11880 if (sd) {
11881 /*
11882 * If there's a runnable CFS task and the current CPU has reduced
11883 * capacity, kick the ILB to see if there's a better CPU to run on:
11884 */
11885 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
11886 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11887 goto unlock;
11888 }
11889 }
11890
11891 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
11892 if (sd) {
11893 /*
11894 * When ASYM_PACKING; see if there's a more preferred CPU
11895 * currently idle; in which case, kick the ILB to move tasks
11896 * around.
11897 *
11898 * When balancing between cores, all the SMT siblings of the
11899 * preferred CPU must be idle.
11900 */
11901 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
11902 if (sched_asym(sd, dst_cpu: i, src_cpu: cpu)) {
11903 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11904 goto unlock;
11905 }
11906 }
11907 }
11908
11909 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
11910 if (sd) {
11911 /*
11912 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
11913 * to run the misfit task on.
11914 */
11915 if (check_misfit_status(rq, sd)) {
11916 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11917 goto unlock;
11918 }
11919
11920 /*
11921 * For asymmetric systems, we do not want to nicely balance
11922 * cache use, instead we want to embrace asymmetry and only
11923 * ensure tasks have enough CPU capacity.
11924 *
11925 * Skip the LLC logic because it's not relevant in that case.
11926 */
11927 goto unlock;
11928 }
11929
11930 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
11931 if (sds) {
11932 /*
11933 * If there is an imbalance between LLC domains (IOW we could
11934 * increase the overall cache utilization), we need a less-loaded LLC
11935 * domain to pull some load from. Likewise, we may need to spread
11936 * load within the current LLC domain (e.g. packed SMT cores but
11937 * other CPUs are idle). We can't really know from here how busy
11938 * the others are - so just get a NOHZ balance going if it looks
11939 * like this LLC domain has tasks we could move.
11940 */
11941 nr_busy = atomic_read(v: &sds->nr_busy_cpus);
11942 if (nr_busy > 1) {
11943 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11944 goto unlock;
11945 }
11946 }
11947unlock:
11948 rcu_read_unlock();
11949out:
11950 if (READ_ONCE(nohz.needs_update))
11951 flags |= NOHZ_NEXT_KICK;
11952
11953 if (flags)
11954 kick_ilb(flags);
11955}
11956
11957static void set_cpu_sd_state_busy(int cpu)
11958{
11959 struct sched_domain *sd;
11960
11961 rcu_read_lock();
11962 sd = rcu_dereference(per_cpu(sd_llc, cpu));
11963
11964 if (!sd || !sd->nohz_idle)
11965 goto unlock;
11966 sd->nohz_idle = 0;
11967
11968 atomic_inc(v: &sd->shared->nr_busy_cpus);
11969unlock:
11970 rcu_read_unlock();
11971}
11972
11973void nohz_balance_exit_idle(struct rq *rq)
11974{
11975 SCHED_WARN_ON(rq != this_rq());
11976
11977 if (likely(!rq->nohz_tick_stopped))
11978 return;
11979
11980 rq->nohz_tick_stopped = 0;
11981 cpumask_clear_cpu(cpu: rq->cpu, dstp: nohz.idle_cpus_mask);
11982 atomic_dec(v: &nohz.nr_cpus);
11983
11984 set_cpu_sd_state_busy(rq->cpu);
11985}
11986
11987static void set_cpu_sd_state_idle(int cpu)
11988{
11989 struct sched_domain *sd;
11990
11991 rcu_read_lock();
11992 sd = rcu_dereference(per_cpu(sd_llc, cpu));
11993
11994 if (!sd || sd->nohz_idle)
11995 goto unlock;
11996 sd->nohz_idle = 1;
11997
11998 atomic_dec(v: &sd->shared->nr_busy_cpus);
11999unlock:
12000 rcu_read_unlock();
12001}
12002
12003/*
12004 * This routine will record that the CPU is going idle with the tick stopped.
12005 * This info will be used in performing idle load balancing in the future.
12006 */
12007void nohz_balance_enter_idle(int cpu)
12008{
12009 struct rq *rq = cpu_rq(cpu);
12010
12011 SCHED_WARN_ON(cpu != smp_processor_id());
12012
12013 /* If this CPU is going down, then nothing needs to be done: */
12014 if (!cpu_active(cpu))
12015 return;
12016
12017 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
12018 if (!housekeeping_cpu(cpu, type: HK_TYPE_SCHED))
12019 return;
12020
12021 /*
12022 * Can be set safely without rq->lock held.
12023 * If a clear happens, it will have evaluated the last additions because
12024 * rq->lock is held during the check and the clear.
12025 */
12026 rq->has_blocked_load = 1;
12027
12028 /*
12029 * The tick is still stopped but load could have been added in the
12030 * meantime. We set the nohz.has_blocked flag to trigger a check of the
12031 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
12032 * of nohz.has_blocked can only happen after checking the new load.
12033 */
12034 if (rq->nohz_tick_stopped)
12035 goto out;
12036
12037 /* If we're a completely isolated CPU, we don't play: */
12038 if (on_null_domain(rq))
12039 return;
12040
12041 rq->nohz_tick_stopped = 1;
12042
12043 cpumask_set_cpu(cpu, dstp: nohz.idle_cpus_mask);
12044 atomic_inc(v: &nohz.nr_cpus);
12045
12046 /*
12047 * Ensures that if nohz_idle_balance() fails to observe our
12048 * @idle_cpus_mask store, it must observe the @has_blocked
12049 * and @needs_update stores.
12050 */
12051 smp_mb__after_atomic();
12052
12053 set_cpu_sd_state_idle(cpu);
12054
12055 WRITE_ONCE(nohz.needs_update, 1);
12056out:
12057 /*
12058 * Each time a CPU enters idle, we assume that it has blocked load and
12059 * enable the periodic update of the load of idle CPUs.
12060 */
12061 WRITE_ONCE(nohz.has_blocked, 1);
12062}
12063
12064static bool update_nohz_stats(struct rq *rq)
12065{
12066 unsigned int cpu = rq->cpu;
12067
12068 if (!rq->has_blocked_load)
12069 return false;
12070
12071 if (!cpumask_test_cpu(cpu, cpumask: nohz.idle_cpus_mask))
12072 return false;
12073
12074 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
12075 return true;
12076
12077 update_blocked_averages(cpu);
12078
12079 return rq->has_blocked_load;
12080}
12081
12082/*
12083 * Internal function that runs load balance for all idle cpus. The load balance
12084 * can be a simple update of blocked load or a complete load balance with
12085 * tasks movement depending of flags.
12086 */
12087static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
12088{
12089 /* Earliest time when we have to do rebalance again */
12090 unsigned long now = jiffies;
12091 unsigned long next_balance = now + 60*HZ;
12092 bool has_blocked_load = false;
12093 int update_next_balance = 0;
12094 int this_cpu = this_rq->cpu;
12095 int balance_cpu;
12096 struct rq *rq;
12097
12098 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
12099
12100 /*
12101 * We assume there will be no idle load after this update and clear
12102 * the has_blocked flag. If a cpu enters idle in the meantime, it will
12103 * set the has_blocked flag and trigger another update of idle load.
12104 * Because a cpu that becomes idle is added to idle_cpus_mask before
12105 * setting the flag, we are sure to not clear the state and not
12106 * check the load of an idle cpu.
12107 *
12108 * Same applies to idle_cpus_mask vs needs_update.
12109 */
12110 if (flags & NOHZ_STATS_KICK)
12111 WRITE_ONCE(nohz.has_blocked, 0);
12112 if (flags & NOHZ_NEXT_KICK)
12113 WRITE_ONCE(nohz.needs_update, 0);
12114
12115 /*
12116 * Ensures that if we miss the CPU, we must see the has_blocked
12117 * store from nohz_balance_enter_idle().
12118 */
12119 smp_mb();
12120
12121 /*
12122 * Start with the next CPU after this_cpu so we will end with this_cpu and give
12123 * the other idle CPUs a chance to pull load.
12124 */
12125 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
12126 if (!idle_cpu(cpu: balance_cpu))
12127 continue;
12128
12129 /*
12130 * If this CPU gets work to do, stop the load balancing
12131 * work being done for other CPUs. Next load
12132 * balancing owner will pick it up.
12133 */
12134 if (need_resched()) {
12135 if (flags & NOHZ_STATS_KICK)
12136 has_blocked_load = true;
12137 if (flags & NOHZ_NEXT_KICK)
12138 WRITE_ONCE(nohz.needs_update, 1);
12139 goto abort;
12140 }
12141
12142 rq = cpu_rq(balance_cpu);
12143
12144 if (flags & NOHZ_STATS_KICK)
12145 has_blocked_load |= update_nohz_stats(rq);
12146
12147 /*
12148 * If time for next balance is due,
12149 * do the balance.
12150 */
12151 if (time_after_eq(jiffies, rq->next_balance)) {
12152 struct rq_flags rf;
12153
12154 rq_lock_irqsave(rq, rf: &rf);
12155 update_rq_clock(rq);
12156 rq_unlock_irqrestore(rq, rf: &rf);
12157
12158 if (flags & NOHZ_BALANCE_KICK)
12159 rebalance_domains(rq, idle: CPU_IDLE);
12160 }
12161
12162 if (time_after(next_balance, rq->next_balance)) {
12163 next_balance = rq->next_balance;
12164 update_next_balance = 1;
12165 }
12166 }
12167
12168 /*
12169 * next_balance will be updated only when there is a need.
12170 * When the CPU is attached to null domain for ex, it will not be
12171 * updated.
12172 */
12173 if (likely(update_next_balance))
12174 nohz.next_balance = next_balance;
12175
12176 if (flags & NOHZ_STATS_KICK)
12177 WRITE_ONCE(nohz.next_blocked,
12178 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
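 /*
 * LOAD_AVG_PERIOD is the PELT half-life in ms (32 in current trees), so
 * blocked-load-only passes are rate limited here to roughly one every 32ms.
 */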
12179
12180abort:
12181 /* There is still blocked load, enable periodic update */
12182 if (has_blocked_load)
12183 WRITE_ONCE(nohz.has_blocked, 1);
12184}
12185
12186/*
12187 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
12188 * rebalancing for all the cpus for whom scheduler ticks are stopped.
12189 */
12190static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12191{
12192 unsigned int flags = this_rq->nohz_idle_balance;
12193
12194 if (!flags)
12195 return false;
12196
12197 this_rq->nohz_idle_balance = 0;
12198
12199 if (idle != CPU_IDLE)
12200 return false;
12201
12202 _nohz_idle_balance(this_rq, flags);
12203
12204 return true;
12205}
12206
12207/*
12208 * Check if we need to directly run the ILB for updating blocked load before
12209 * entering idle state. Here we run ILB directly without issuing IPIs.
12210 *
12211 * Note that when this function is called, the tick may not be stopped on
12212 * this CPU yet. nohz.idle_cpus_mask is updated only when the tick is stopped and
12213 * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
12214 * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
12215 * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
12216 * called from this function on (this) CPU that's not yet in the mask. That's
12217 * OK because the goal of nohz_run_idle_balance() is to run ILB only for
12218 * updating the blocked load of already idle CPUs without waking up one of
12219 * those idle CPUs and outside the preempt disable / irq off phase of the local
12220 * cpu about to enter idle, because it can take a long time.
12221 */
12222void nohz_run_idle_balance(int cpu)
12223{
12224 unsigned int flags;
12225
12226 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
12227
12228 /*
12229 * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
12230 * (i.e. NOHZ_STATS_KICK is set) that would do the same update anyway.
12231 */
12232 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
12233 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
12234}
12235
12236static void nohz_newidle_balance(struct rq *this_rq)
12237{
12238 int this_cpu = this_rq->cpu;
12239
12240 /*
12241 * This CPU doesn't want to be disturbed by scheduler
12242 * housekeeping
12243 */
12244 if (!housekeeping_cpu(cpu: this_cpu, type: HK_TYPE_SCHED))
12245 return;
12246
12247 /* Will wake up very soon. No time for doing anything else */
12248 if (this_rq->avg_idle < sysctl_sched_migration_cost)
12249 return;
12250
12251 /* Don't need to update blocked load of idle CPUs */
12252 if (!READ_ONCE(nohz.has_blocked) ||
12253 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
12254 return;
12255
12256 /*
12257 * Set the need to trigger ILB in order to update blocked load
12258 * before entering idle state.
12259 */
12260 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
12261}
12262
12263#else /* !CONFIG_NO_HZ_COMMON */
12264static inline void nohz_balancer_kick(struct rq *rq) { }
12265
12266static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12267{
12268 return false;
12269}
12270
12271static inline void nohz_newidle_balance(struct rq *this_rq) { }
12272#endif /* CONFIG_NO_HZ_COMMON */
12273
12274/*
12275 * newidle_balance is called by schedule() if this_cpu is about to become
12276 * idle. Attempts to pull tasks from other CPUs.
12277 *
12278 * Returns:
12279 * < 0 - we released the lock and there are !fair tasks present
12280 * 0 - failed, no new tasks
12281 * > 0 - success, new (fair) tasks present
12282 */
12283static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
12284{
12285 unsigned long next_balance = jiffies + HZ;
12286 int this_cpu = this_rq->cpu;
12287 u64 t0, t1, curr_cost = 0;
12288 struct sched_domain *sd;
12289 int pulled_task = 0;
12290
12291 update_misfit_status(NULL, rq: this_rq);
12292
12293 /*
12294 * There is a task waiting to run. No need to search for one.
12295 * Return 0; the task will be enqueued when switching to idle.
12296 */
12297 if (this_rq->ttwu_pending)
12298 return 0;
12299
12300 /*
12301 * We must set idle_stamp _before_ calling idle_balance(), such that we
12302 * measure the duration of idle_balance() as idle time.
12303 */
12304 this_rq->idle_stamp = rq_clock(rq: this_rq);
12305
12306 /*
12307 * Do not pull tasks towards !active CPUs...
12308 */
12309 if (!cpu_active(cpu: this_cpu))
12310 return 0;
12311
12312 /*
12313 * This is OK, because current is on_cpu, which avoids it being picked
12314 * for load-balance and preemption/IRQs are still disabled avoiding
12315 * further scheduler activity on it and we're being very careful to
12316 * re-start the picking loop.
12317 */
12318 rq_unpin_lock(rq: this_rq, rf);
12319
12320 rcu_read_lock();
12321 sd = rcu_dereference_check_sched_domain(this_rq->sd);
12322
12323 if (!READ_ONCE(this_rq->rd->overload) ||
12324 (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
12325
12326 if (sd)
12327 update_next_balance(sd, next_balance: &next_balance);
12328 rcu_read_unlock();
12329
12330 goto out;
12331 }
12332 rcu_read_unlock();
12333
12334 raw_spin_rq_unlock(rq: this_rq);
12335
12336 t0 = sched_clock_cpu(cpu: this_cpu);
12337 update_blocked_averages(cpu: this_cpu);
12338
12339 rcu_read_lock();
12340 for_each_domain(this_cpu, sd) {
12341 int continue_balancing = 1;
12342 u64 domain_cost;
12343
12344 update_next_balance(sd, next_balance: &next_balance);
12345
12346 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
12347 break;
12348
12349 if (sd->flags & SD_BALANCE_NEWIDLE) {
12350
12351 pulled_task = load_balance(this_cpu, this_rq,
12352 sd, idle: CPU_NEWLY_IDLE,
12353 continue_balancing: &continue_balancing);
12354
12355 t1 = sched_clock_cpu(cpu: this_cpu);
12356 domain_cost = t1 - t0;
12357 update_newidle_cost(sd, cost: domain_cost);
12358
12359 curr_cost += domain_cost;
12360 t0 = t1;
12361 }
12362
12363 /*
12364 * Stop searching for tasks to pull if there are
12365 * now runnable tasks on this rq.
12366 */
12367 if (pulled_task || this_rq->nr_running > 0 ||
12368 this_rq->ttwu_pending)
12369 break;
12370 }
12371 rcu_read_unlock();
12372
12373 raw_spin_rq_lock(rq: this_rq);
12374
12375 if (curr_cost > this_rq->max_idle_balance_cost)
12376 this_rq->max_idle_balance_cost = curr_cost;
12377
12378 /*
12379 * While browsing the domains, we released the rq lock, a task could
12380 * have been enqueued in the meantime. Since we're not going idle,
12381 * pretend we pulled a task.
12382 */
12383 if (this_rq->cfs.h_nr_running && !pulled_task)
12384 pulled_task = 1;
12385
12386 /* Is there a task of a higher priority class? */
12387 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
12388 pulled_task = -1;
12389
12390out:
12391 /* Move the next balance forward */
12392 if (time_after(this_rq->next_balance, next_balance))
12393 this_rq->next_balance = next_balance;
12394
12395 if (pulled_task)
12396 this_rq->idle_stamp = 0;
12397 else
12398 nohz_newidle_balance(this_rq);
12399
12400 rq_repin_lock(rq: this_rq, rf);
12401
12402 return pulled_task;
12403}
12404
12405/*
12406 * run_rebalance_domains is triggered when needed from the scheduler tick.
12407 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
12408 */
12409static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
12410{
12411 struct rq *this_rq = this_rq();
12412 enum cpu_idle_type idle = this_rq->idle_balance ?
12413 CPU_IDLE : CPU_NOT_IDLE;
12414
12415 /*
12416 * If this CPU has a pending nohz_balance_kick, then do the
12417 * balancing on behalf of the other idle CPUs whose ticks are
12418 * stopped. Do nohz_idle_balance *before* rebalance_domains to
12419 * give the idle CPUs a chance to load balance. Else we may
12420 * load balance only within the local sched_domain hierarchy
12421 * and abort nohz_idle_balance altogether if we pull some load.
12422 */
12423 if (nohz_idle_balance(this_rq, idle))
12424 return;
12425
12426 /* normal load balance */
12427 update_blocked_averages(cpu: this_rq->cpu);
12428 rebalance_domains(rq: this_rq, idle);
12429}
12430
12431/*
12432 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
12433 */
12434void trigger_load_balance(struct rq *rq)
12435{
12436 /*
12437 * Don't need to rebalance while attached to NULL domain or
12438 * runqueue CPU is not active
12439 */
12440 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
12441 return;
12442
12443 if (time_after_eq(jiffies, rq->next_balance))
12444 raise_softirq(nr: SCHED_SOFTIRQ);
12445
12446 nohz_balancer_kick(rq);
12447}
12448
12449static void rq_online_fair(struct rq *rq)
12450{
12451 update_sysctl();
12452
12453 update_runtime_enabled(rq);
12454}
12455
12456static void rq_offline_fair(struct rq *rq)
12457{
12458 update_sysctl();
12459
12460 /* Ensure any throttled groups are reachable by pick_next_task */
12461 unthrottle_offline_cfs_rqs(rq);
12462
12463 /* Ensure that we remove rq contribution to group share: */
12464 clear_tg_offline_cfs_rqs(rq);
12465}
12466
12467#endif /* CONFIG_SMP */
12468
12469#ifdef CONFIG_SCHED_CORE
12470static inline bool
12471__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
12472{
12473 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
12474 u64 slice = se->slice;
12475
12476 return (rtime * min_nr_tasks > slice);
12477}
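/*
 * In other words, rtime * min_nr_tasks > slice is the division-free form of
 * rtime > slice / min_nr_tasks: with min_nr_tasks == 2 the entity is
 * considered to have used up its share once it has run for more than half
 * of its slice.
 */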
12478
12479#define MIN_NR_TASKS_DURING_FORCEIDLE 2
12480static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
12481{
12482 if (!sched_core_enabled(rq))
12483 return;
12484
12485 /*
12486 * If runqueue has only one task which used up its slice and
12487 * if the sibling is forced idle, then trigger schedule to
12488 * give forced idle task a chance.
12489 *
12490 * sched_slice() considers only this active rq and it gets the
12491 * whole slice. But during force idle, we have siblings acting
12492 * like a single runqueue and hence we need to consider runnable
12493 * tasks on this CPU and the forced idle CPU. Ideally, we should
12494 * go through the forced idle rq, but that would be a perf hit.
12495 * We can assume that the forced idle CPU has at least
12496 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
12497 * if we need to give up the CPU.
12498 */
12499 if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
12500 __entity_slice_used(se: &curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
12501 resched_curr(rq);
12502}
12503
12504/*
12505 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
12506 */
12507static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
12508 bool forceidle)
12509{
12510 for_each_sched_entity(se) {
12511 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12512
12513 if (forceidle) {
12514 if (cfs_rq->forceidle_seq == fi_seq)
12515 break;
12516 cfs_rq->forceidle_seq = fi_seq;
12517 }
12518
12519 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
12520 }
12521}
12522
12523void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
12524{
12525 struct sched_entity *se = &p->se;
12526
12527 if (p->sched_class != &fair_sched_class)
12528 return;
12529
12530 se_fi_update(se, fi_seq: rq->core->core_forceidle_seq, forceidle: in_fi);
12531}
12532
12533bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
12534 bool in_fi)
12535{
12536 struct rq *rq = task_rq(a);
12537 const struct sched_entity *sea = &a->se;
12538 const struct sched_entity *seb = &b->se;
12539 struct cfs_rq *cfs_rqa;
12540 struct cfs_rq *cfs_rqb;
12541 s64 delta;
12542
12543 SCHED_WARN_ON(task_rq(b)->core != rq->core);
12544
12545#ifdef CONFIG_FAIR_GROUP_SCHED
12546 /*
12547 * Find an se in the hierarchy for tasks a and b, such that the se's
12548 * are immediate siblings.
12549 */
12550 while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
12551 int sea_depth = sea->depth;
12552 int seb_depth = seb->depth;
12553
12554 if (sea_depth >= seb_depth)
12555 sea = parent_entity(se: sea);
12556 if (sea_depth <= seb_depth)
12557 seb = parent_entity(se: seb);
12558 }
12559
12560 se_fi_update(se: sea, fi_seq: rq->core->core_forceidle_seq, forceidle: in_fi);
12561 se_fi_update(se: seb, fi_seq: rq->core->core_forceidle_seq, forceidle: in_fi);
12562
12563 cfs_rqa = sea->cfs_rq;
12564 cfs_rqb = seb->cfs_rq;
12565#else
12566 cfs_rqa = &task_rq(a)->cfs;
12567 cfs_rqb = &task_rq(b)->cfs;
12568#endif
12569
12570 /*
12571 * Find delta after normalizing se's vruntime with its cfs_rq's
12572 * min_vruntime_fi, which would have been updated in prior calls
12573 * to se_fi_update().
12574 */
12575 delta = (s64)(sea->vruntime - seb->vruntime) +
12576 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
12577
12578 return delta > 0;
12579}
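/*
 * A positive delta means task 'a' has the larger normalized vruntime, i.e.
 * it has received relatively more service than 'b' within its own hierarchy,
 * so 'a' is reported as the lower-priority pick of the two.
 */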
12580
12581static int task_is_throttled_fair(struct task_struct *p, int cpu)
12582{
12583 struct cfs_rq *cfs_rq;
12584
12585#ifdef CONFIG_FAIR_GROUP_SCHED
12586 cfs_rq = task_group(p)->cfs_rq[cpu];
12587#else
12588 cfs_rq = &cpu_rq(cpu)->cfs;
12589#endif
12590 return throttled_hierarchy(cfs_rq);
12591}
12592#else
12593static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
12594#endif
12595
12596/*
12597 * scheduler tick hitting a task of our scheduling class.
12598 *
12599 * NOTE: This function can be called remotely by the tick offload that
12600 * goes along full dynticks. Therefore no local assumption can be made
12601 * and everything must be accessed through the @rq and @curr passed in
12602 * parameters.
12603 */
12604static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
12605{
12606 struct cfs_rq *cfs_rq;
12607 struct sched_entity *se = &curr->se;
12608
12609 for_each_sched_entity(se) {
12610 cfs_rq = cfs_rq_of(se);
12611 entity_tick(cfs_rq, curr: se, queued);
12612 }
12613
12614 if (static_branch_unlikely(&sched_numa_balancing))
12615 task_tick_numa(rq, curr);
12616
12617 update_misfit_status(p: curr, rq);
12618 update_overutilized_status(task_rq(curr));
12619
12620 task_tick_core(rq, curr);
12621}
12622
12623/*
12624 * called on fork with the child task as argument from the parent's context
12625 * - child not yet on the tasklist
12626 * - preemption disabled
12627 */
12628static void task_fork_fair(struct task_struct *p)
12629{
12630 struct sched_entity *se = &p->se, *curr;
12631 struct cfs_rq *cfs_rq;
12632 struct rq *rq = this_rq();
12633 struct rq_flags rf;
12634
12635 rq_lock(rq, rf: &rf);
12636 update_rq_clock(rq);
12637
12638 cfs_rq = task_cfs_rq(current);
12639 curr = cfs_rq->curr;
12640 if (curr)
12641 update_curr(cfs_rq);
12642 place_entity(cfs_rq, se, ENQUEUE_INITIAL);
12643 rq_unlock(rq, rf: &rf);
12644}
12645
12646/*
12647 * Priority of the task has changed. Check to see if we preempt
12648 * the current task.
12649 */
12650static void
12651prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
12652{
12653 if (!task_on_rq_queued(p))
12654 return;
12655
12656 if (rq->cfs.nr_running == 1)
12657 return;
12658
12659 /*
12660 * Reschedule if we are currently running on this runqueue and
12661 * our priority decreased, or if we are not currently running on
12662 * this runqueue and our priority is higher than the current's
12663 */
12664 if (task_current(rq, p)) {
12665 if (p->prio > oldprio)
12666 resched_curr(rq);
12667 } else
12668 wakeup_preempt(rq, p, flags: 0);
12669}
12670
12671#ifdef CONFIG_FAIR_GROUP_SCHED
12672/*
12673 * Propagate the changes of the sched_entity across the tg tree to make it
12674 * visible to the root
12675 */
12676static void propagate_entity_cfs_rq(struct sched_entity *se)
12677{
12678 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12679
12680 if (cfs_rq_throttled(cfs_rq))
12681 return;
12682
12683 if (!throttled_hierarchy(cfs_rq))
12684 list_add_leaf_cfs_rq(cfs_rq);
12685
12686 /* Start to propagate at parent */
12687 se = se->parent;
12688
12689 for_each_sched_entity(se) {
12690 cfs_rq = cfs_rq_of(se);
12691
12692 update_load_avg(cfs_rq, se, UPDATE_TG);
12693
12694 if (cfs_rq_throttled(cfs_rq))
12695 break;
12696
12697 if (!throttled_hierarchy(cfs_rq))
12698 list_add_leaf_cfs_rq(cfs_rq);
12699 }
12700}
12701#else
12702static void propagate_entity_cfs_rq(struct sched_entity *se) { }
12703#endif
12704
12705static void detach_entity_cfs_rq(struct sched_entity *se)
12706{
12707 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12708
12709#ifdef CONFIG_SMP
12710 /*
12711 * In case the task sched_avg hasn't been attached:
12712 * - A forked task which hasn't been woken up by wake_up_new_task().
12713 * - A task which has been woken up by try_to_wake_up() but is
12714 * still waiting to actually be woken up by sched_ttwu_pending().
12715 */
12716 if (!se->avg.last_update_time)
12717 return;
12718#endif
12719
12720 /* Catch up with the cfs_rq and remove our load when we leave */
12721 update_load_avg(cfs_rq, se, flags: 0);
12722 detach_entity_load_avg(cfs_rq, se);
12723 update_tg_load_avg(cfs_rq);
12724 propagate_entity_cfs_rq(se);
12725}
12726
12727static void attach_entity_cfs_rq(struct sched_entity *se)
12728{
12729 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12730
12731 /* Synchronize entity with its cfs_rq */
12732 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
12733 attach_entity_load_avg(cfs_rq, se);
12734 update_tg_load_avg(cfs_rq);
12735 propagate_entity_cfs_rq(se);
12736}
12737
12738static void detach_task_cfs_rq(struct task_struct *p)
12739{
12740 struct sched_entity *se = &p->se;
12741
12742 detach_entity_cfs_rq(se);
12743}
12744
12745static void attach_task_cfs_rq(struct task_struct *p)
12746{
12747 struct sched_entity *se = &p->se;
12748
12749 attach_entity_cfs_rq(se);
12750}
12751
12752static void switched_from_fair(struct rq *rq, struct task_struct *p)
12753{
12754 detach_task_cfs_rq(p);
12755}
12756
12757static void switched_to_fair(struct rq *rq, struct task_struct *p)
12758{
12759 attach_task_cfs_rq(p);
12760
12761 if (task_on_rq_queued(p)) {
12762 /*
12763		 * We were most likely switched from sched_rt, so force a
12764		 * reschedule if we are running; otherwise just see if we can
12765		 * still preempt the current task.
12766 */
12767 if (task_current(rq, p))
12768 resched_curr(rq);
12769 else
12770			wakeup_preempt(rq, p, 0);
12771 }
12772}
12773
12774/*
12775 * Account for a task changing its policy or group. This routine is mostly
12776 * called to set the cfs_rq->curr field when a task migrates between
12777 * groups/classes.
12778 */
12779static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
12780{
12781 struct sched_entity *se = &p->se;
12782
12783#ifdef CONFIG_SMP
12784 if (task_on_rq_queued(p)) {
12785 /*
12786		 * Move the next running task to the front of the list, so that
12787		 * our cfs_tasks list stays in most-recently-used order.
12788 */
12789		list_move(&se->group_node, &rq->cfs_tasks);
12790 }
12791#endif
12792
12793 for_each_sched_entity(se) {
12794 struct cfs_rq *cfs_rq = cfs_rq_of(se);
12795
12796 set_next_entity(cfs_rq, se);
12797 /* ensure bandwidth has been allocated on our new cfs_rq */
12798		account_cfs_rq_runtime(cfs_rq, 0);
12799 }
12800}
12801
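/*
 * Set up an empty cfs_rq: an empty rbtree of entities and a min_vruntime
 * starting close to the u64 wrap-around point, which appears intended to
 * exercise the vruntime wrap handling early rather than only after very long
 * uptimes.
 */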
12802void init_cfs_rq(struct cfs_rq *cfs_rq)
12803{
12804 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
12805 u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
12806#ifdef CONFIG_SMP
12807 raw_spin_lock_init(&cfs_rq->removed.lock);
12808#endif
12809}
12810
12811#ifdef CONFIG_FAIR_GROUP_SCHED
12812static void task_change_group_fair(struct task_struct *p)
12813{
12814 /*
12815	 * We can't detach or attach a forked task that hasn't
12816	 * yet been woken up by wake_up_new_task().
12817 */
12818 if (READ_ONCE(p->__state) == TASK_NEW)
12819 return;
12820
12821 detach_task_cfs_rq(p);
12822
12823#ifdef CONFIG_SMP
12824	/* Tell load tracking the se has changed cfs_rq -- treat it as a migration */
12825 p->se.avg.last_update_time = 0;
12826#endif
12827	set_task_rq(p, task_cpu(p));
12828 attach_task_cfs_rq(p);
12829}
12830
12831void free_fair_sched_group(struct task_group *tg)
12832{
12833 int i;
12834
12835 for_each_possible_cpu(i) {
12836 if (tg->cfs_rq)
12837			kfree(tg->cfs_rq[i]);
12838		if (tg->se)
12839			kfree(tg->se[i]);
12840	}
12841
12842	kfree(tg->cfs_rq);
12843	kfree(tg->se);
12844}
12845
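/*
 * Allocate the per-CPU cfs_rq and sched_entity arrays for a new task group
 * and hook each entry under the parent group's queues. Note the return
 * convention: 1 on success, 0 on allocation failure (callers treat it as a
 * boolean).
 */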
12846int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
12847{
12848 struct sched_entity *se;
12849 struct cfs_rq *cfs_rq;
12850 int i;
12851
12852	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
12853	if (!tg->cfs_rq)
12854		goto err;
12855	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
12856 if (!tg->se)
12857 goto err;
12858
12859 tg->shares = NICE_0_LOAD;
12860
12861	init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
12862
12863 for_each_possible_cpu(i) {
12864		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
12865				      GFP_KERNEL, cpu_to_node(i));
12866 if (!cfs_rq)
12867 goto err;
12868
12869		se = kzalloc_node(sizeof(struct sched_entity_stats),
12870				  GFP_KERNEL, cpu_to_node(i));
12871 if (!se)
12872 goto err_free_rq;
12873
12874 init_cfs_rq(cfs_rq);
12875		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
12876 init_entity_runnable_average(se);
12877 }
12878
12879 return 1;
12880
12881err_free_rq:
12882	kfree(cfs_rq);
12883err:
12884 return 0;
12885}
12886
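/*
 * Second stage of group creation: with each CPU's runqueue lock held, attach
 * the group's per-CPU entities to their parent cfs_rq and synchronize the new
 * queues' bandwidth throttling state with the parent's.
 */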
12887void online_fair_sched_group(struct task_group *tg)
12888{
12889 struct sched_entity *se;
12890 struct rq_flags rf;
12891 struct rq *rq;
12892 int i;
12893
12894 for_each_possible_cpu(i) {
12895 rq = cpu_rq(i);
12896 se = tg->se[i];
12897		rq_lock_irq(rq, &rf);
12898		update_rq_clock(rq);
12899		attach_entity_cfs_rq(se);
12900		sync_throttle(tg, i);
12901		rq_unlock_irq(rq, &rf);
12902 }
12903}
12904
12905void unregister_fair_sched_group(struct task_group *tg)
12906{
12907 unsigned long flags;
12908 struct rq *rq;
12909 int cpu;
12910
12911	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
12912
12913 for_each_possible_cpu(cpu) {
12914 if (tg->se[cpu])
12915			remove_entity_load_avg(tg->se[cpu]);
12916
12917 /*
12918 * Only empty task groups can be destroyed; so we can speculatively
12919 * check on_list without danger of it being re-added.
12920 */
12921 if (!tg->cfs_rq[cpu]->on_list)
12922 continue;
12923
12924 rq = cpu_rq(cpu);
12925
12926 raw_spin_rq_lock_irqsave(rq, flags);
12927		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
12928 raw_spin_rq_unlock_irqrestore(rq, flags);
12929 }
12930}
12931
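/*
 * Wire one per-CPU leg of a task group into the hierarchy: point the group
 * cfs_rq at its runqueue and task_group, and make the group's sched_entity
 * queue itself on the parent's cfs_rq (or on the root cfs_rq when there is no
 * parent entity).
 */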
12932void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
12933 struct sched_entity *se, int cpu,
12934 struct sched_entity *parent)
12935{
12936 struct rq *rq = cpu_rq(cpu);
12937
12938 cfs_rq->tg = tg;
12939 cfs_rq->rq = rq;
12940 init_cfs_rq_runtime(cfs_rq);
12941
12942 tg->cfs_rq[cpu] = cfs_rq;
12943 tg->se[cpu] = se;
12944
12945 /* se could be NULL for root_task_group */
12946 if (!se)
12947 return;
12948
12949 if (!parent) {
12950 se->cfs_rq = &rq->cfs;
12951 se->depth = 0;
12952 } else {
12953 se->cfs_rq = parent->my_q;
12954 se->depth = parent->depth + 1;
12955 }
12956
12957 se->my_q = cfs_rq;
12958 /* guarantee group entities always have weight */
12959	update_load_set(&se->load, NICE_0_LOAD);
12960 se->parent = parent;
12961}
12962
12963static DEFINE_MUTEX(shares_mutex);
12964
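/*
 * A group's shares set its CPU weight relative to its siblings: under full
 * contention, siblings receive CPU roughly in proportion to their shares.
 * For example, two busy sibling groups weighted 2048 and 1024 would get
 * about 2/3 and 1/3 of the CPU respectively. This is typically reached from
 * the cgroup cpu controller (cpu.shares on v1, derived from cpu.weight on
 * v2).
 */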
12965static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
12966{
12967 int i;
12968
12969 lockdep_assert_held(&shares_mutex);
12970
12971 /*
12972 * We can't change the weight of the root cgroup.
12973 */
12974 if (!tg->se[0])
12975 return -EINVAL;
12976
12977 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
12978
12979 if (tg->shares == shares)
12980 return 0;
12981
12982 tg->shares = shares;
12983 for_each_possible_cpu(i) {
12984 struct rq *rq = cpu_rq(i);
12985 struct sched_entity *se = tg->se[i];
12986 struct rq_flags rf;
12987
12988 /* Propagate contribution to hierarchy */
12989		rq_lock_irqsave(rq, &rf);
12990		update_rq_clock(rq);
12991		for_each_sched_entity(se) {
12992			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
12993			update_cfs_group(se);
12994		}
12995		rq_unlock_irqrestore(rq, &rf);
12996 }
12997
12998 return 0;
12999}
13000
13001int sched_group_set_shares(struct task_group *tg, unsigned long shares)
13002{
13003 int ret;
13004
13005 mutex_lock(&shares_mutex);
13006 if (tg_is_idle(tg))
13007 ret = -EINVAL;
13008 else
13009 ret = __sched_group_set_shares(tg, shares);
13010	mutex_unlock(&shares_mutex);
13011
13012 return ret;
13013}
13014
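/*
 * Mark a whole task group as "idle" (the group-level analogue of SCHED_IDLE):
 * its entities drop to minimum weight and the idle-task counts of every
 * enqueued level above it are adjusted so preemption decisions can treat the
 * group's tasks as idle. Typically reached from the cgroup cpu controller's
 * cpu.idle file.
 */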
13015int sched_group_set_idle(struct task_group *tg, long idle)
13016{
13017 int i;
13018
13019 if (tg == &root_task_group)
13020 return -EINVAL;
13021
13022 if (idle < 0 || idle > 1)
13023 return -EINVAL;
13024
13025 mutex_lock(&shares_mutex);
13026
13027 if (tg->idle == idle) {
13028		mutex_unlock(&shares_mutex);
13029 return 0;
13030 }
13031
13032 tg->idle = idle;
13033
13034 for_each_possible_cpu(i) {
13035 struct rq *rq = cpu_rq(i);
13036 struct sched_entity *se = tg->se[i];
13037 struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
13038		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
13039 long idle_task_delta;
13040 struct rq_flags rf;
13041
13042		rq_lock_irqsave(rq, &rf);
13043
13044 grp_cfs_rq->idle = idle;
13045 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
13046 goto next_cpu;
13047
13048 if (se->on_rq) {
13049 parent_cfs_rq = cfs_rq_of(se);
13050			if (cfs_rq_is_idle(grp_cfs_rq))
13051 parent_cfs_rq->idle_nr_running++;
13052 else
13053 parent_cfs_rq->idle_nr_running--;
13054 }
13055
13056 idle_task_delta = grp_cfs_rq->h_nr_running -
13057 grp_cfs_rq->idle_h_nr_running;
13058		if (!cfs_rq_is_idle(grp_cfs_rq))
13059 idle_task_delta *= -1;
13060
13061 for_each_sched_entity(se) {
13062 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13063
13064 if (!se->on_rq)
13065 break;
13066
13067 cfs_rq->idle_h_nr_running += idle_task_delta;
13068
13069 /* Already accounted at parent level and above. */
13070 if (cfs_rq_is_idle(cfs_rq))
13071 break;
13072 }
13073
13074next_cpu:
13075		rq_unlock_irqrestore(rq, &rf);
13076 }
13077
13078 /* Idle groups have minimum weight. */
13079 if (tg_is_idle(tg))
13080 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
13081 else
13082 __sched_group_set_shares(tg, NICE_0_LOAD);
13083
13084	mutex_unlock(&shares_mutex);
13085 return 0;
13086}
13087
13088#endif /* CONFIG_FAIR_GROUP_SCHED */
13089
13090
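/*
 * Used by the sched_rr_get_interval() syscall path: SCHED_OTHER tasks have no
 * fixed round-robin quantum, so report the entity's current slice as an
 * approximation, or 0 when the task sits on an otherwise idle runqueue.
 */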
13091static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
13092{
13093 struct sched_entity *se = &task->se;
13094 unsigned int rr_interval = 0;
13095
13096 /*
13097 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
13098 * idle runqueue:
13099 */
13100 if (rq->cfs.load.weight)
13101 rr_interval = NS_TO_JIFFIES(se->slice);
13102
13103 return rr_interval;
13104}
13105
13106/*
13107 * All the scheduling class methods:
13108 */
13109DEFINE_SCHED_CLASS(fair) = {
13110
13111 .enqueue_task = enqueue_task_fair,
13112 .dequeue_task = dequeue_task_fair,
13113 .yield_task = yield_task_fair,
13114 .yield_to_task = yield_to_task_fair,
13115
13116 .wakeup_preempt = check_preempt_wakeup_fair,
13117
13118 .pick_next_task = __pick_next_task_fair,
13119 .put_prev_task = put_prev_task_fair,
13120 .set_next_task = set_next_task_fair,
13121
13122#ifdef CONFIG_SMP
13123 .balance = balance_fair,
13124 .pick_task = pick_task_fair,
13125 .select_task_rq = select_task_rq_fair,
13126 .migrate_task_rq = migrate_task_rq_fair,
13127
13128 .rq_online = rq_online_fair,
13129 .rq_offline = rq_offline_fair,
13130
13131 .task_dead = task_dead_fair,
13132 .set_cpus_allowed = set_cpus_allowed_common,
13133#endif
13134
13135 .task_tick = task_tick_fair,
13136 .task_fork = task_fork_fair,
13137
13138 .prio_changed = prio_changed_fair,
13139 .switched_from = switched_from_fair,
13140 .switched_to = switched_to_fair,
13141
13142 .get_rr_interval = get_rr_interval_fair,
13143
13144 .update_curr = update_curr_fair,
13145
13146#ifdef CONFIG_FAIR_GROUP_SCHED
13147 .task_change_group = task_change_group_fair,
13148#endif
13149
13150#ifdef CONFIG_SCHED_CORE
13151 .task_is_throttled = task_is_throttled_fair,
13152#endif
13153
13154#ifdef CONFIG_UCLAMP_TASK
13155 .uclamp_enabled = 1,
13156#endif
13157};
13158
13159#ifdef CONFIG_SCHED_DEBUG
13160void print_cfs_stats(struct seq_file *m, int cpu)
13161{
13162 struct cfs_rq *cfs_rq, *pos;
13163
13164 rcu_read_lock();
13165 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
13166 print_cfs_rq(m, cpu, cfs_rq);
13167 rcu_read_unlock();
13168}
13169
13170#ifdef CONFIG_NUMA_BALANCING
13171void show_numa_stats(struct task_struct *p, struct seq_file *m)
13172{
13173 int node;
13174 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
13175 struct numa_group *ng;
13176
13177 rcu_read_lock();
13178 ng = rcu_dereference(p->numa_group);
13179 for_each_online_node(node) {
13180 if (p->numa_faults) {
13181			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
13182			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
13183		}
13184		if (ng) {
13185			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
13186			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
13187 }
13188 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
13189 }
13190 rcu_read_unlock();
13191}
13192#endif /* CONFIG_NUMA_BALANCING */
13193#endif /* CONFIG_SCHED_DEBUG */
13194
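/*
 * Boot-time setup for the fair class: allocate the per-CPU cpumasks used by
 * load balancing and wakeup placement, register the SCHED_SOFTIRQ handler
 * that runs periodic rebalancing, and initialize the shared NOHZ idle
 * balancing state.
 */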
13195__init void init_sched_fair_class(void)
13196{
13197#ifdef CONFIG_SMP
13198 int i;
13199
13200 for_each_possible_cpu(i) {
13201		zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
13202		zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
13203		zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
13204					GFP_KERNEL, cpu_to_node(i));
13205
13206#ifdef CONFIG_CFS_BANDWIDTH
13207 INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
13208		INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
13209#endif
13210 }
13211
13212	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
13213
13214#ifdef CONFIG_NO_HZ_COMMON
13215 nohz.next_balance = jiffies;
13216 nohz.next_blocked = jiffies;
13217	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
13218#endif
13219#endif /* SMP */
13220
13221}
13222

/* source code of linux/kernel/sched/fair.c */