/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A demo sched_ext flattened cgroup hierarchy scheduler. It implements
 * hierarchical weight-based cgroup CPU control by flattening the cgroup
 * hierarchy into a single layer by compounding the active weight share at each
 * level. Consider the following hierarchy with weights in parentheses:
 *
 * R + A (100) + B (100)
 *   |          \ C (100)
 *   \ D (200)
 *
 * Ignoring the root and threaded cgroups, only B, C and D can contain tasks.
 * Let's say all three have runnable tasks. The total share that each of these
 * three cgroups is entitled to can be calculated by compounding its share at
 * each level.
 *
 * For example, B is competing against C and in that competition its share is
 * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's
 * share in that competition is 100/(200+100) == 1/3. B's eventual share in the
 * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's
 * eventual share is the same at 1/6. D is only competing at the top level and
 * its share is 200/(100+200) == 2/3.
 *
 * So, instead of hierarchically scheduling level-by-level, we can consider it
 * as B, C and D competing with each other with respective shares of 1/6, 1/6
 * and 2/3 and keep updating the eventual shares as the cgroups' runnable states
 * change.
 *
 * This flattening of the hierarchy can bring a substantial performance gain
 * when the cgroup hierarchy is nested multiple levels. In a simple benchmark
 * using wrk[8] on apache serving a CGI script calculating sha1sum of a small
 * file, it outperforms CFS by ~3% with CPU controller disabled and by ~10% with
 * two apache instances competing with 2:1 weight ratio nested four levels deep.
 *
 * However, the gain comes at the cost of not being able to properly handle
 * thundering herds of cgroups. For example, if many cgroups which are nested
 * behind a low priority parent cgroup wake up around the same time, they may be
 * able to consume more CPU cycles than they are entitled to. In many use cases,
 * this isn't a real concern especially given the performance gain. Also, there
 * are ways to mitigate the problem further by e.g. introducing an extra
 * scheduling layer on cgroup delegation boundaries.
 *
 * The scheduler first picks the cgroup to run and then schedules the tasks
 * within by using nested weighted vtime scheduling by default. The
 * cgroup-internal scheduling can be switched to FIFO with the -f option.
 */
#include <scx/common.bpf.h>
#include "scx_flatcg.h"

/*
 * Maximum amount of retries to find a valid cgroup.
 */
enum {
	FALLBACK_DSQ = 0,
	CGROUP_MAX_RETRIES = 1024,
};

char _license[] SEC("license") = "GPL";

const volatile u32 nr_cpus = 32;	/* !0 for veristat, set during init */
const volatile u64 cgrp_slice_ns;
const volatile bool fifo_sched;

u64 cvtime_now;
UEI_DEFINE(uei);

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, u64);
	__uint(max_entries, FCG_NR_STATS);
} stats SEC(".maps");

static void stat_inc(enum fcg_stat_idx idx)
{
	u32 idx_v = idx;

	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
	if (cnt_p)
		(*cnt_p)++;
}

struct fcg_cpu_ctx {
	u64 cur_cgid;
	u64 cur_at;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct fcg_cpu_ctx);
	__uint(max_entries, 1);
} cpu_ctx SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct fcg_cgrp_ctx);
} cgrp_ctx SEC(".maps");

struct cgv_node {
	struct bpf_rb_node	rb_node;
	__u64			cvtime;
	__u64			cgid;
};

private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock;
private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node);

struct cgv_node_stash {
	struct cgv_node __kptr *node;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 16384);
	__type(key, __u64);
	__type(value, struct cgv_node_stash);
} cgv_node_stash SEC(".maps");

struct fcg_task_ctx {
	u64 bypassed_at;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct fcg_task_ctx);
} task_ctx SEC(".maps");

/* gets inc'd on weight tree changes to expire the cached hweights */
u64 hweight_gen = 1;

static u64 div_round_up(u64 dividend, u64 divisor)
{
	return (dividend + divisor - 1) / divisor;
}
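
/*
 * Illustration only, not used by the scheduler: the flattened share described
 * in the header comment is just the product of the weight/child_weight_sum
 * ratios along the path from the root, expressed in FCG_HWEIGHT_ONE fixed
 * point. The sketch below assumes the two-level path of B from the example
 * (B weight 100 under A, A weight 100 competing with D's 200) and mirrors the
 * rounding used in cgrp_refresh_hweight() below.
 */
static inline u64 fcg_example_flattened_hweight(void)
{
	/* A competes with D at the root: 100 / (100 + 200) */
	u64 hw_a = div_round_up(FCG_HWEIGHT_ONE * 100, 100 + 200);
	/* B competes with C under A: compound A's hweight with 100 / (100 + 100) */
	u64 hw_b = div_round_up(hw_a * 100, 100 + 100);

	return hw_b;	/* roughly FCG_HWEIGHT_ONE / 6 */
}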

static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct cgv_node *cgc_a, *cgc_b;

	cgc_a = container_of(a, struct cgv_node, rb_node);
	cgc_b = container_of(b, struct cgv_node, rb_node);

	return cgc_a->cvtime < cgc_b->cvtime;
}

static struct fcg_cpu_ctx *find_cpu_ctx(void)
{
	struct fcg_cpu_ctx *cpuc;
	u32 idx = 0;

	cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx);
	if (!cpuc) {
		scx_bpf_error("cpu_ctx lookup failed");
		return NULL;
	}
	return cpuc;
}

static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp)
{
	struct fcg_cgrp_ctx *cgc;

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (!cgc) {
		scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id);
		return NULL;
	}
	return cgc;
}

static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level)
{
	struct fcg_cgrp_ctx *cgc;

	cgrp = bpf_cgroup_ancestor(cgrp, level);
	if (!cgrp) {
		scx_bpf_error("ancestor cgroup lookup failed");
		return NULL;
	}

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		scx_bpf_error("ancestor cgrp_ctx lookup failed");
	bpf_cgroup_release(cgrp);
	return cgc;
}

static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
{
	int level;

	if (!cgc->nr_active) {
		stat_inc(FCG_STAT_HWT_SKIP);
		return;
	}

	if (cgc->hweight_gen == hweight_gen) {
		stat_inc(FCG_STAT_HWT_CACHE);
		return;
	}

	stat_inc(FCG_STAT_HWT_UPDATES);
	bpf_for(level, 0, cgrp->level + 1) {
		struct fcg_cgrp_ctx *cgc;
		bool is_active;

		cgc = find_ancestor_cgrp_ctx(cgrp, level);
		if (!cgc)
			break;

		if (!level) {
			cgc->hweight = FCG_HWEIGHT_ONE;
			cgc->hweight_gen = hweight_gen;
		} else {
			struct fcg_cgrp_ctx *pcgc;

			pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
			if (!pcgc)
				break;

			/*
			 * We can be opportunistic here and not grab the
			 * cgv_tree_lock and deal with the occasional races.
			 * However, hweight updates are already cached and
			 * relatively low-frequency. Let's just do the
			 * straightforward thing.
			 */
			bpf_spin_lock(&cgv_tree_lock);
			is_active = cgc->nr_active;
			if (is_active) {
				cgc->hweight_gen = pcgc->hweight_gen;
				cgc->hweight =
					div_round_up(pcgc->hweight * cgc->weight,
						     pcgc->child_weight_sum);
			}
			bpf_spin_unlock(&cgv_tree_lock);

			if (!is_active) {
				stat_inc(FCG_STAT_HWT_RACE);
				break;
			}
		}
	}
}

static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc)
{
	u64 delta, cvtime, max_budget;

	/*
	 * A node which is on the rbtree can't be pointed to from elsewhere yet
	 * and thus can't be updated and repositioned. Instead, we collect the
	 * vtime deltas separately and apply it asynchronously here.
	 */
	delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta);
	cvtime = cgv_node->cvtime + delta;

	/*
	 * Allow a cgroup to carry the maximum budget proportional to its
	 * hweight such that a full-hweight cgroup can immediately take up half
	 * of the CPUs at the most while staying at the front of the rbtree.
	 */
	max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) /
		(2 * FCG_HWEIGHT_ONE);
	if (time_before(cvtime, cvtime_now - max_budget))
		cvtime = cvtime_now - max_budget;

	cgv_node->cvtime = cvtime;
}
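
/*
 * Worked example for the budget cap above, with illustrative numbers not taken
 * from the source: with 8 CPUs, a 50us cgrp_slice_ns and a cgroup at half
 * hweight (FCG_HWEIGHT_ONE / 2), max_budget = 50us * 8 * (1/2) / 2 = 100us,
 * i.e. the cgroup's cvtime may run ahead of (lag behind) cvtime_now by at most
 * the equivalent of two full cgroup slices.
 */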

static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
{
	struct cgv_node_stash *stash;
	struct cgv_node *cgv_node;
	u64 cgid = cgrp->kn->id;

	/* paired with cmpxchg in try_pick_next_cgroup() */
	if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) {
		stat_inc(FCG_STAT_ENQ_SKIP);
		return;
	}

	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid);
		return;
	}

	/* NULL if the node is already on the rbtree */
	cgv_node = bpf_kptr_xchg(&stash->node, NULL);
	if (!cgv_node) {
		stat_inc(FCG_STAT_ENQ_RACE);
		return;
	}

	bpf_spin_lock(&cgv_tree_lock);
	cgrp_cap_budget(cgv_node, cgc);
	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
	bpf_spin_unlock(&cgv_tree_lock);
}

static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
{
	/*
	 * Tell fcg_stopping() that this bypassed the regular scheduling path
	 * and should be force charged to the cgroup. 0 is used to indicate that
	 * the task isn't bypassing, so if the current runtime is 0, go back by
	 * one nanosecond.
	 */
	taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
}

s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct fcg_task_ctx *taskc;
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return cpu;
	}

	/*
	 * If select_cpu_dfl() is recommending local enqueue, the target CPU is
	 * idle. Follow it and charge the cgroup later in fcg_stopping() after
	 * the fact.
	 */
	if (is_idle) {
		set_bypassed_at(p, taskc);
		stat_inc(FCG_STAT_LOCAL);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;
}

void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct fcg_task_ctx *taskc;
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	/*
	 * Use the direct dispatching and force charging to deal with tasks with
	 * custom affinities so that we don't have to worry about per-cgroup
	 * dq's containing tasks that can't be executed from some CPUs.
	 */
	if (p->nr_cpus_allowed != nr_cpus) {
		set_bypassed_at(p, taskc);

		/*
		 * The global dq is deprioritized as we don't want to let tasks
		 * boost themselves by constraining their cpumask. The
		 * deprioritization is rather severe, so let's not apply that to
		 * per-cpu kernel threads. This is ham-fisted. We probably wanna
		 * implement per-cgroup fallback dq's instead so that we have
		 * more control over when tasks with custom cpumask get issued.
		 */
		if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
			stat_inc(FCG_STAT_LOCAL);
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
					   enq_flags);
		} else {
			stat_inc(FCG_STAT_GLOBAL);
			scx_bpf_dsq_insert(p, FALLBACK_DSQ, SCX_SLICE_DFL,
					   enq_flags);
		}
		return;
	}

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		goto out_release;

	if (fifo_sched) {
		scx_bpf_dsq_insert(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
	} else {
		u64 tvtime = p->scx.dsq_vtime;

		/*
		 * Limit the amount of budget that an idling task can accumulate
		 * to one slice.
		 */
		if (time_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
			tvtime = cgc->tvtime_now - SCX_SLICE_DFL;

		scx_bpf_dsq_insert_vtime(p, cgrp->kn->id, SCX_SLICE_DFL,
					 tvtime, enq_flags);
	}

	cgrp_enqueued(cgrp, cgc);
out_release:
	bpf_cgroup_release(cgrp);
}

/*
 * Walk the cgroup tree to update the active weight sums as tasks wake up and
 * sleep. The weight sums are used as the base when calculating the proportion a
 * given cgroup or task is entitled to at each level.
 */
static void update_active_weight_sums(struct cgroup *cgrp, bool runnable)
{
	struct fcg_cgrp_ctx *cgc;
	bool updated = false;
	int idx;

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		return;

	/*
	 * In most cases, a hot cgroup would have multiple threads going to
	 * sleep and waking up while the whole cgroup stays active. In leaf
	 * cgroups, ->nr_runnable which is updated with __sync operations gates
	 * ->nr_active updates, so that we don't have to grab the cgv_tree_lock
	 * repeatedly for a busy cgroup which is staying active.
	 */
	if (runnable) {
		if (__sync_fetch_and_add(&cgc->nr_runnable, 1))
			return;
		stat_inc(FCG_STAT_ACT);
	} else {
		if (__sync_sub_and_fetch(&cgc->nr_runnable, 1))
			return;
		stat_inc(FCG_STAT_DEACT);
	}

	/*
	 * If @cgrp is becoming runnable, its hweight should be refreshed after
	 * it's added to the weight tree so that enqueue has the up-to-date
	 * value. If @cgrp is becoming quiescent, the hweight should be
	 * refreshed before it's removed from the weight tree so that the usage
	 * charging which happens afterwards has access to the latest value.
	 */
	if (!runnable)
		cgrp_refresh_hweight(cgrp, cgc);

	/* propagate upwards */
	bpf_for(idx, 0, cgrp->level) {
		int level = cgrp->level - idx;
		struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
		bool propagate = false;

		cgc = find_ancestor_cgrp_ctx(cgrp, level);
		if (!cgc)
			break;
		if (level) {
			pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
			if (!pcgc)
				break;
		}

		/*
		 * We need the propagation protected by a lock to synchronize
		 * against weight changes. There's no reason to drop the lock at
		 * each level but bpf_spin_lock() doesn't want any function
		 * calls while locked.
		 */
		bpf_spin_lock(&cgv_tree_lock);

		if (runnable) {
			if (!cgc->nr_active++) {
				updated = true;
				if (pcgc) {
					propagate = true;
					pcgc->child_weight_sum += cgc->weight;
				}
			}
		} else {
			if (!--cgc->nr_active) {
				updated = true;
				if (pcgc) {
					propagate = true;
					pcgc->child_weight_sum -= cgc->weight;
				}
			}
		}

		bpf_spin_unlock(&cgv_tree_lock);

		if (!propagate)
			break;
	}

	if (updated)
		__sync_fetch_and_add(&hweight_gen, 1);

	if (runnable)
		cgrp_refresh_hweight(cgrp, cgc);
}
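
/*
 * Illustration of the propagation above using the hierarchy from the header
 * comment: when the first task in B becomes runnable, B's nr_runnable goes
 * 0 -> 1, so B's nr_active is bumped and B's weight (100) is added to A's
 * child_weight_sum. If that also took A's nr_active from 0 to 1, A's weight
 * (100) is in turn added to the root's child_weight_sum alongside D's 200.
 * Any such change bumps hweight_gen, which lazily invalidates every cached
 * hweight the next time cgrp_refresh_hweight() runs.
 */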

void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
{
	struct cgroup *cgrp;

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	update_active_weight_sums(cgrp, true);
	bpf_cgroup_release(cgrp);
}

void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
{
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	if (fifo_sched)
		return;

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (cgc) {
		/*
		 * @cgc->tvtime_now always progresses forward as tasks start
		 * executing. The test and update can be performed concurrently
		 * from multiple CPUs and are thus racy. Any error should be
		 * contained and temporary. Let's just live with it.
		 */
		if (time_before(cgc->tvtime_now, p->scx.dsq_vtime))
			cgc->tvtime_now = p->scx.dsq_vtime;
	}
	bpf_cgroup_release(cgrp);
}

void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
{
	struct fcg_task_ctx *taskc;
	struct cgroup *cgrp;
	struct fcg_cgrp_ctx *cgc;

	/*
	 * Scale the execution time by the inverse of the weight and charge.
	 *
	 * Note that the default yield implementation yields by setting
	 * @p->scx.slice to zero and the following would treat the yielding task
	 * as if it has consumed all its slice. If this penalizes yielding tasks
	 * too much, determine the execution time by taking explicit timestamps
	 * instead of depending on @p->scx.slice.
	 */
	if (!fifo_sched)
		p->scx.dsq_vtime +=
			(SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;

	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
	if (!taskc) {
		scx_bpf_error("task_ctx lookup failed");
		return;
	}

	if (!taskc->bypassed_at)
		return;

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	cgc = find_cgrp_ctx(cgrp);
	if (cgc) {
		__sync_fetch_and_add(&cgc->cvtime_delta,
				     p->se.sum_exec_runtime - taskc->bypassed_at);
		taskc->bypassed_at = 0;
	}
	bpf_cgroup_release(cgrp);
}
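
/*
 * Worked example for the vtime scaling in fcg_stopping(), with illustrative
 * numbers: a task with weight 200 that consumed its full SCX_SLICE_DFL has its
 * dsq_vtime advanced by SCX_SLICE_DFL * 100 / 200, i.e. half of the wall time,
 * while a weight-50 task consuming the same amount is charged double. Higher
 * weight therefore means slower vtime progression and earlier placement under
 * the vtime ordering used in fcg_enqueue().
 */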

void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
{
	struct cgroup *cgrp;

	cgrp = __COMPAT_scx_bpf_task_cgroup(p);
	update_active_weight_sums(cgrp, false);
	bpf_cgroup_release(cgrp);
}

void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	struct fcg_cgrp_ctx *cgc, *pcgc = NULL;

	cgc = find_cgrp_ctx(cgrp);
	if (!cgc)
		return;

	if (cgrp->level) {
		pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1);
		if (!pcgc)
			return;
	}

	bpf_spin_lock(&cgv_tree_lock);
	if (pcgc && cgc->nr_active)
		pcgc->child_weight_sum += (s64)weight - cgc->weight;
	cgc->weight = weight;
	bpf_spin_unlock(&cgv_tree_lock);
}

static bool try_pick_next_cgroup(u64 *cgidp)
{
	struct bpf_rb_node *rb_node;
	struct cgv_node_stash *stash;
	struct cgv_node *cgv_node;
	struct fcg_cgrp_ctx *cgc;
	struct cgroup *cgrp;
	u64 cgid;

	/* pop the front cgroup and wind cvtime_now accordingly */
	bpf_spin_lock(&cgv_tree_lock);

	rb_node = bpf_rbtree_first(&cgv_tree);
	if (!rb_node) {
		bpf_spin_unlock(&cgv_tree_lock);
		stat_inc(FCG_STAT_PNC_NO_CGRP);
		*cgidp = 0;
		return true;
	}

	rb_node = bpf_rbtree_remove(&cgv_tree, rb_node);
	bpf_spin_unlock(&cgv_tree_lock);

	if (!rb_node) {
		/*
		 * This should never happen. bpf_rbtree_first() was called
		 * above while the tree lock was held, so the node should
		 * always be present.
		 */
		scx_bpf_error("node could not be removed");
		return true;
	}

	cgv_node = container_of(rb_node, struct cgv_node, rb_node);
	cgid = cgv_node->cgid;

	if (time_before(cvtime_now, cgv_node->cvtime))
		cvtime_now = cgv_node->cvtime;

	/*
	 * If lookup fails, the cgroup's gone. Free and move on. See
	 * fcg_cgroup_exit().
	 */
	cgrp = bpf_cgroup_from_id(cgid);
	if (!cgrp) {
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (!cgc) {
		bpf_cgroup_release(cgrp);
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	if (!scx_bpf_dsq_move_to_local(cgid)) {
		bpf_cgroup_release(cgrp);
		stat_inc(FCG_STAT_PNC_EMPTY);
		goto out_stash;
	}

	/*
	 * Successfully consumed from the cgroup. This will be our current
	 * cgroup for the new slice. Refresh its hweight.
	 */
	cgrp_refresh_hweight(cgrp, cgc);

	bpf_cgroup_release(cgrp);

	/*
	 * As the cgroup may have more tasks, add it back to the rbtree. Note
	 * that here we charge the full slice upfront and then exact later
	 * according to the actual consumption. This prevents lowpri thundering
	 * herd from saturating the machine.
	 */
	bpf_spin_lock(&cgv_tree_lock);
	cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1);
	cgrp_cap_budget(cgv_node, cgc);
	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
	bpf_spin_unlock(&cgv_tree_lock);

	*cgidp = cgid;
	stat_inc(FCG_STAT_PNC_NEXT);
	return true;

out_stash:
	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		stat_inc(FCG_STAT_PNC_GONE);
		goto out_free;
	}

	/*
	 * Paired with cmpxchg in cgrp_enqueued(). If they see the following
	 * transition, they'll enqueue the cgroup. If they are earlier, we'll
	 * see their task in the dq below and requeue the cgroup.
	 */
	__sync_val_compare_and_swap(&cgc->queued, 1, 0);

	if (scx_bpf_dsq_nr_queued(cgid)) {
		bpf_spin_lock(&cgv_tree_lock);
		bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
		bpf_spin_unlock(&cgv_tree_lock);
		stat_inc(FCG_STAT_PNC_RACE);
	} else {
		cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
		if (cgv_node) {
			scx_bpf_error("unexpected !NULL cgv_node stash");
			goto out_free;
		}
	}

	return false;

out_free:
	bpf_obj_drop(cgv_node);
	return false;
}
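
/*
 * Worked example for the upfront charge above, with illustrative numbers: a
 * cgroup at half hweight picked for a 50us cgrp_slice_ns has its cvtime
 * advanced by 50us * FCG_HWEIGHT_ONE / (FCG_HWEIGHT_ONE / 2) = 100us right
 * away. If it then runs for only 20us before fcg_dispatch() moves on, the
 * unused 30us is scaled the same way and refunded through cvtime_delta, so the
 * net charge tracks actual consumption while the upfront charge keeps a
 * thundering herd of low-hweight cgroups from being picked back-to-back.
 */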

void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
{
	struct fcg_cpu_ctx *cpuc;
	struct fcg_cgrp_ctx *cgc;
	struct cgroup *cgrp;
	u64 now = scx_bpf_now();
	bool picked_next = false;

	cpuc = find_cpu_ctx();
	if (!cpuc)
		return;

	if (!cpuc->cur_cgid)
		goto pick_next_cgroup;

	if (time_before(now, cpuc->cur_at + cgrp_slice_ns)) {
		if (scx_bpf_dsq_move_to_local(cpuc->cur_cgid)) {
			stat_inc(FCG_STAT_CNS_KEEP);
			return;
		}
		stat_inc(FCG_STAT_CNS_EMPTY);
	} else {
		stat_inc(FCG_STAT_CNS_EXPIRE);
	}

	/*
	 * The current cgroup is expiring. It was already charged a full slice.
	 * Calculate the actual usage and accumulate the delta.
	 */
	cgrp = bpf_cgroup_from_id(cpuc->cur_cgid);
	if (!cgrp) {
		stat_inc(FCG_STAT_CNS_GONE);
		goto pick_next_cgroup;
	}

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
	if (cgc) {
		/*
		 * We want to update the vtime delta and then look for the next
		 * cgroup to execute but the latter needs to be done in a loop
		 * and we can't keep the lock held. Oh well...
		 */
		bpf_spin_lock(&cgv_tree_lock);
		__sync_fetch_and_add(&cgc->cvtime_delta,
				     (cpuc->cur_at + cgrp_slice_ns - now) *
				     FCG_HWEIGHT_ONE / (cgc->hweight ?: 1));
		bpf_spin_unlock(&cgv_tree_lock);
	} else {
		stat_inc(FCG_STAT_CNS_GONE);
	}

	bpf_cgroup_release(cgrp);

pick_next_cgroup:
	cpuc->cur_at = now;

	if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ)) {
		cpuc->cur_cgid = 0;
		return;
	}

	bpf_repeat(CGROUP_MAX_RETRIES) {
		if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
			picked_next = true;
			break;
		}
	}

	/*
	 * This only happens if try_pick_next_cgroup() races against the enqueue
	 * path for more than CGROUP_MAX_RETRIES times, which is extremely
	 * unlikely and likely indicates an underlying bug. There shouldn't be
	 * any stall risk as the race is against enqueue.
	 */
	if (!picked_next)
		stat_inc(FCG_STAT_PNC_FAIL);
}

s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	struct fcg_task_ctx *taskc;
	struct fcg_cgrp_ctx *cgc;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	taskc = bpf_task_storage_get(&task_ctx, p, 0,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!taskc)
		return -ENOMEM;

	taskc->bypassed_at = 0;

	if (!(cgc = find_cgrp_ctx(args->cgroup)))
		return -ENOENT;

	p->scx.dsq_vtime = cgc->tvtime_now;

	return 0;
}

int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
			     struct scx_cgroup_init_args *args)
{
	struct fcg_cgrp_ctx *cgc;
	struct cgv_node *cgv_node;
	struct cgv_node_stash empty_stash = {}, *stash;
	u64 cgid = cgrp->kn->id;
	int ret;

	/*
	 * Technically incorrect as cgroup ID is full 64bit while dsq ID is
	 * 63bit. Should not be a problem in practice and easy to spot in the
	 * unlikely case that it breaks.
	 */
	ret = scx_bpf_create_dsq(cgid, -1);
	if (ret)
		return ret;

	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!cgc) {
		ret = -ENOMEM;
		goto err_destroy_dsq;
	}

	cgc->weight = args->weight;
	cgc->hweight = FCG_HWEIGHT_ONE;

	ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash,
				  BPF_NOEXIST);
	if (ret) {
		if (ret != -ENOMEM)
			scx_bpf_error("unexpected stash creation error (%d)",
				      ret);
		goto err_destroy_dsq;
	}

	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
	if (!stash) {
		scx_bpf_error("unexpected cgv_node stash lookup failure");
		ret = -ENOENT;
		goto err_destroy_dsq;
	}

	cgv_node = bpf_obj_new(struct cgv_node);
	if (!cgv_node) {
		ret = -ENOMEM;
		goto err_del_cgv_node;
	}

	cgv_node->cgid = cgid;
	cgv_node->cvtime = cvtime_now;

	cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
	if (cgv_node) {
		scx_bpf_error("unexpected !NULL cgv_node stash");
		ret = -EBUSY;
		goto err_drop;
	}

	return 0;

err_drop:
	bpf_obj_drop(cgv_node);
err_del_cgv_node:
	bpf_map_delete_elem(&cgv_node_stash, &cgid);
err_destroy_dsq:
	scx_bpf_destroy_dsq(cgid);
	return ret;
}

void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp)
{
	u64 cgid = cgrp->kn->id;

	/*
	 * For now, there's no way to find and remove the cgv_node if it's on
	 * the cgv_tree. Let's drain them in the dispatch path as they get
	 * popped off the front of the tree.
	 */
	bpf_map_delete_elem(&cgv_node_stash, &cgid);
	scx_bpf_destroy_dsq(cgid);
}

void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
		    struct cgroup *from, struct cgroup *to)
{
	struct fcg_cgrp_ctx *from_cgc, *to_cgc;
	s64 delta;

	/* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
	if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
		return;

	delta = time_delta(p->scx.dsq_vtime, from_cgc->tvtime_now);
	p->scx.dsq_vtime = to_cgc->tvtime_now + delta;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)
{
	return scx_bpf_create_dsq(FALLBACK_DSQ, -1);
}

void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(flatcg_ops,
	       .select_cpu = (void *)fcg_select_cpu,
	       .enqueue = (void *)fcg_enqueue,
	       .dispatch = (void *)fcg_dispatch,
	       .runnable = (void *)fcg_runnable,
	       .running = (void *)fcg_running,
	       .stopping = (void *)fcg_stopping,
	       .quiescent = (void *)fcg_quiescent,
	       .init_task = (void *)fcg_init_task,
	       .cgroup_set_weight = (void *)fcg_cgroup_set_weight,
	       .cgroup_init = (void *)fcg_cgroup_init,
	       .cgroup_exit = (void *)fcg_cgroup_exit,
	       .cgroup_move = (void *)fcg_cgroup_move,
	       .init = (void *)fcg_init,
	       .exit = (void *)fcg_exit,
	       .flags = SCX_OPS_ENQ_EXITING,
	       .name = "flatcg");