// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

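/*
 * cgroup_rstat_lock serializes whole-tree flushes, while the per-cpu
 * cgroup_rstat_cpu_lock protects each CPU's updated-cgroup tree. The
 * per-cpu locks are raw spinlocks so updaters stay non-sleeping even
 * on PREEMPT_RT.
 */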
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

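/* return the cgroup_rstat_cpu structure of @cgrp for @cpu */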
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	while (true) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup *parent = cgroup_parent(cgrp);
		struct cgroup_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up. If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = cgrp;
			break;
		}

		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}
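
/*
 * Illustrative example (not part of the original source): with a hierarchy
 * A -> B -> C where C is updated on some CPU, the per-cpu linkage after
 * cgroup_rstat_updated(C, cpu) looks like:
 *
 *	A->updated_next     = A	(root, marked busy by pointing to itself)
 *	A->updated_children = B
 *	B->updated_next     = A	(sibling list terminated by the parent)
 *	B->updated_children = C
 *	C->updated_next     = B
 *	C->updated_children = C	(self-terminated, i.e. empty)
 */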

/**
 * cgroup_rstat_push_children - push children cgroups into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 * Return: A new singly linked list of cgroups to be flushed
 *
 * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list built from the tail backward like "pushing"
 * cgroups into a stack. The root is pushed by the caller.
 */
static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
						 struct cgroup *child, int cpu)
{
	struct cgroup *chead = child;	/* Head of child cgroup level */
	struct cgroup *ghead = NULL;	/* Head of grandchild cgroup level */
	struct cgroup *parent, *grandchild;
	struct cgroup_rstat_cpu *crstatc;

	child->rstat_flush_next = NULL;

next_level:
	while (chead) {
		child = chead;
		chead = child->rstat_flush_next;
		parent = cgroup_parent(child);

		/* updated_next is parent cgroup terminated */
		while (child != parent) {
			child->rstat_flush_next = head;
			head = child;
			crstatc = cgroup_rstat_cpu(child, cpu);
			grandchild = crstatc->updated_children;
			if (grandchild != child) {
				/* Push the grandchild to the next level */
				crstatc->updated_children = child;
				grandchild->rstat_flush_next = ghead;
				ghead = grandchild;
			}
			child = crstatc->updated_next;
			crstatc->updated_next = NULL;
		}
	}

	if (ghead) {
		chead = ghead;
		ghead = NULL;
		goto next_level;
	}
	return head;
}

/**
 * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
 * @root: root of the cgroup subtree to traverse
 * @cpu: target cpu
 * Return: A singly linked list of cgroups to be flushed
 *
 * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
 * each returned cgroup is unlinked from the updated tree.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, the child is before its parent in
 * the list.
 *
 * Note that updated_children is self terminated and points to a list of
 * child cgroups if not empty. Whereas updated_next is like a sibling link
 * within the children list and terminated by the parent cgroup. An exception
 * here is the cgroup root whose updated_next can be self terminated.
 */
static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
	struct cgroup *head = NULL, *parent, *child;
	unsigned long flags;

	/*
	 * The _irqsave() is needed because cgroup_rstat_lock is
	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
	 * this lock with the _irq() suffix only disables interrupts on
	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
	 * interrupts on both configurations. The _irqsave() ensures
	 * that interrupts are always disabled and later restored.
	 */
	raw_spin_lock_irqsave(cpu_lock, flags);

	/* Return NULL if this subtree is not on-list */
	if (!rstatc->updated_next)
		goto unlock_ret;

	/*
	 * Unlink @root from its parent. As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 */
	parent = cgroup_parent(root);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		while (*nextp != root) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		*nextp = rstatc->updated_next;
	}

	rstatc->updated_next = NULL;

	/* Push @root to the list first before pushing the children */
	head = root;
	root->rstat_flush_next = NULL;
	child = rstatc->updated_children;
	rstatc->updated_children = root;
	if (child != root)
		head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
	raw_spin_unlock_irqrestore(cpu_lock, flags);
	return head;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();

/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);

		for (; pos; pos = pos->rstat_flush_next) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);
			bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}

		/* play nice and yield if necessary */
		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}

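/*
 * Allocate @cgrp's per-cpu rstat structures (unless preallocated, as for
 * the root cgroup) and initialize each CPU's updated_children list and
 * u64_stats sync point.
 */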
int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

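/*
 * Tear down @cgrp's rstat state: flush any pending per-cpu stats, verify
 * the cgroup is off every per-cpu updated tree, then free the per-cpu data.
 */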
void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

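/* initialize the per-cpu update-tree locks at boot */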
void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
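/* field-wise add @src_bstat into @dst_bstat; _sub() below is the inverse */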
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

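/*
 * Fold @cgrp's per-cpu base stats on @cpu into the cgroup-wide counters:
 * take a consistent snapshot of the per-cpu bstat, compute the delta since
 * the last flush, and propagate it to @cgrp and its parent's counters.
 */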
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *prstatc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatc->subtree_bstat;
		prstatc = cgroup_rstat_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
	}
}

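/*
 * Open/close an accounting section on the local CPU: _begin pins the CPU and
 * starts a u64_stats write section with IRQs saved; _end closes it and marks
 * the cgroup updated on this CPU so a later flush picks the change up.
 */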
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

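/* charge @delta_exec of raw execution time to @cgrp on the current CPU */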
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

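/*
 * Charge @delta_exec to the user/system (and, with core scheduling, forced
 * idle) bucket corresponding to @index, mirroring how the root cgroup
 * categorizes kcpustat fields in root_cgroup_cputime() below.
 */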
void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatc->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;
		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
	}
}

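/*
 * Emit the cpu.stat time fields in microseconds. Non-root cgroups flush
 * rstat under cgroup_rstat_flush_hold(); the root reads system-wide
 * kcpustat via root_cgroup_cputime() instead.
 */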
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;
	struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

	if (cgroup_parent(cgrp)) {
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	} else {
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	}

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);
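
/*
 * Illustrative usage sketch (not part of the original file): a BPF tracing
 * program can declare these kfuncs as extern and drive rstat from its own
 * collector. The attach point and program name below are hypothetical:
 *
 *	extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;
 *	extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym;
 *
 *	SEC("fentry/some_event")	// hypothetical attach point
 *	int BPF_PROG(count_event, struct cgroup *cgrp)
 *	{
 *		// ... update a per-cpu counter keyed by cgrp ...
 *		cgroup_rstat_updated(cgrp, bpf_get_smp_processor_id());
 *		return 0;
 *	}
 *
 * A sleepable program (or a flusher attached to bpf_rstat_flush()) can later
 * call cgroup_rstat_flush(cgrp) to fold the per-cpu counts.
 */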

/* source: linux/kernel/cgroup/rstat.c */