1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Simple CPU accounting cgroup controller |
4 | */ |
5 | |
6 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
7 | #include <asm/cputime.h> |
8 | #endif |
9 | |
10 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
11 | |
12 | /* |
13 | * There are no locks covering percpu hardirq/softirq time. |
14 | * They are only modified in vtime_account, on the corresponding CPU |
15 | * with interrupts disabled. So, writes are safe. |
16 | * They are read and saved off onto struct rq in update_rq_clock(). |
17 | * This may result in another CPU reading this CPU's irq time and |
18 | * racing with irq/vtime_account on this CPU. We would either get the old |
19 | * or the new value, with a side effect of accounting a slice of irq time to |
20 | * the wrong task when an irq is in progress while we read rq->clock. That is |
21 | * a worthy compromise in place of having locks on each irq in account_system_time. |
22 | */ |
23 | DEFINE_PER_CPU(struct irqtime, cpu_irqtime); |
24 | |
25 | static int sched_clock_irqtime; |
26 | |
27 | void enable_sched_clock_irqtime(void) |
28 | { |
29 | sched_clock_irqtime = 1; |
30 | } |
31 | |
32 | void disable_sched_clock_irqtime(void) |
33 | { |
34 | sched_clock_irqtime = 0; |
35 | } |
36 | |
37 | static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, |
38 | enum cpu_usage_stat idx) |
39 | { |
40 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
41 | |
42 | u64_stats_update_begin(&irqtime->sync); |
43 | cpustat[idx] += delta; |
44 | irqtime->total += delta; |
45 | irqtime->tick_delta += delta; |
46 | u64_stats_update_end(&irqtime->sync); |
47 | } |
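/*
 * Reader-side sketch (illustrative, not part of the original file): how a
 * remote CPU can sample the irq time written by irqtime_account_delta()
 * above. On 64-bit the u64_stats seqcount compiles away because 64-bit
 * loads cannot tear; on 32-bit it guards against torn reads. The in-tree
 * counterpart of this helper is irq_time_read() in kernel/sched/sched.h;
 * the name below is made up for the example.
 */
static inline u64 irq_time_read_example(int cpu)
{
	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
	unsigned int seq;
	u64 total;

	do {
		seq = u64_stats_fetch_begin(&irqtime->sync);
		total = irqtime->total;
	} while (u64_stats_fetch_retry(&irqtime->sync, seq));

	return total;
}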
48 | |
49 | /* |
50 | * Called after incrementing preempt_count on {soft,}irq_enter |
51 | * and before decrementing preempt_count on {soft,}irq_exit. |
52 | */ |
53 | void irqtime_account_irq(struct task_struct *curr, unsigned int offset) |
54 | { |
55 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); |
56 | unsigned int pc; |
57 | s64 delta; |
58 | int cpu; |
59 | |
60 | if (!sched_clock_irqtime) |
61 | return; |
62 | |
63 | cpu = smp_processor_id(); |
64 | delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; |
65 | irqtime->irq_start_time += delta; |
66 | pc = irq_count() - offset; |
67 | |
68 | /* |
69 | * We do not account for softirq time from ksoftirqd here. |
70 | * We want to continue accounting softirq time to the ksoftirqd thread |
71 | * in that case, so as not to confuse the scheduler with a special task |
72 | * that does not consume any time, but still wants to run. |
73 | */ |
74 | if (pc & HARDIRQ_MASK) |
75 | irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); |
76 | else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) |
77 | irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); |
78 | } |
79 | |
80 | static u64 irqtime_tick_accounted(u64 maxtime) |
81 | { |
82 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); |
83 | u64 delta; |
84 | |
85 | delta = min(irqtime->tick_delta, maxtime); |
86 | irqtime->tick_delta -= delta; |
87 | |
88 | return delta; |
89 | } |
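/*
 * For example, with HZ=250 (TICK_NSEC == 4,000,000 ns): if 1,500,000 ns of
 * hard/soft irq time accrued since the last tick, irqtime_tick_accounted(TICK_NSEC)
 * returns 1,500,000 and leaves tick_delta at 0; if 5,000,000 ns accrued, it
 * returns 4,000,000 and carries the remaining 1,000,000 ns over to the next
 * tick.
 */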
90 | |
91 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
92 | |
93 | #define sched_clock_irqtime (0) |
94 | |
95 | static u64 irqtime_tick_accounted(u64 dummy) |
96 | { |
97 | return 0; |
98 | } |
99 | |
100 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ |
101 | |
102 | static inline void task_group_account_field(struct task_struct *p, int index, |
103 | u64 tmp) |
104 | { |
105 | /* |
106 | * Since all updates are sure to touch the root cgroup, we |
107 | * get ourselves ahead and touch it first. If the root cgroup |
108 | * is the only cgroup, then nothing else should be necessary. |
109 | * |
110 | */ |
111 | __this_cpu_add(kernel_cpustat.cpustat[index], tmp); |
112 | |
113 | cgroup_account_cputime_field(p, index, tmp); |
114 | } |
115 | |
116 | /* |
117 | * Account user CPU time to a process. |
118 | * @p: the process that the CPU time gets accounted to |
119 | * @cputime: the CPU time spent in user space since the last update |
120 | */ |
121 | void account_user_time(struct task_struct *p, u64 cputime) |
122 | { |
123 | int index; |
124 | |
125 | /* Add user time to process. */ |
126 | p->utime += cputime; |
127 | account_group_user_time(p, cputime); |
128 | |
129 | index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; |
130 | |
131 | /* Add user time to cpustat. */ |
132 | task_group_account_field(p, index, cputime); |
133 | |
134 | /* Account for user time used */ |
135 | acct_account_cputime(p); |
136 | } |
137 | |
138 | /* |
139 | * Account guest CPU time to a process. |
140 | * @p: the process that the CPU time gets accounted to |
141 | * @cputime: the CPU time spent in virtual machine since the last update |
142 | */ |
143 | void account_guest_time(struct task_struct *p, u64 cputime) |
144 | { |
145 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
146 | |
147 | /* Add guest time to process. */ |
148 | p->utime += cputime; |
149 | account_group_user_time(p, cputime); |
150 | p->gtime += cputime; |
151 | |
152 | /* Add guest time to cpustat. */ |
153 | if (task_nice(p) > 0) { |
154 | task_group_account_field(p, CPUTIME_NICE, cputime); |
155 | cpustat[CPUTIME_GUEST_NICE] += cputime; |
156 | } else { |
157 | task_group_account_field(p, CPUTIME_USER, cputime); |
158 | cpustat[CPUTIME_GUEST] += cputime; |
159 | } |
160 | } |
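/*
 * Note that guest time is deliberately charged twice: once into
 * CPUTIME_USER/CPUTIME_NICE via task_group_account_field() and once into
 * CPUTIME_GUEST/CPUTIME_GUEST_NICE. The "user" figure exported through
 * /proc/stat therefore already includes guest time, and userspace must not
 * add the "guest" column on top of it.
 */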
161 | |
162 | /* |
163 | * Account system CPU time to a process and desired cpustat field |
164 | * @p: the process that the CPU time gets accounted to |
165 | * @cputime: the CPU time spent in kernel space since the last update |
166 | * @index: index of the cpustat field that has to be updated |
167 | */ |
168 | void account_system_index_time(struct task_struct *p, |
169 | u64 cputime, enum cpu_usage_stat index) |
170 | { |
171 | /* Add system time to process. */ |
172 | p->stime += cputime; |
173 | account_group_system_time(p, cputime); |
174 | |
175 | /* Add system time to cpustat. */ |
176 | task_group_account_field(p, index, cputime); |
177 | |
178 | /* Account for system time used */ |
179 | acct_account_cputime(p); |
180 | } |
181 | |
182 | /* |
183 | * Account system CPU time to a process. |
184 | * @p: the process that the CPU time gets accounted to |
185 | * @hardirq_offset: the offset to subtract from hardirq_count() |
186 | * @cputime: the CPU time spent in kernel space since the last update |
187 | */ |
188 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) |
189 | { |
190 | int index; |
191 | |
192 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
193 | account_guest_time(p, cputime); |
194 | return; |
195 | } |
196 | |
197 | if (hardirq_count() - hardirq_offset) |
198 | index = CPUTIME_IRQ; |
199 | else if (in_serving_softirq()) |
200 | index = CPUTIME_SOFTIRQ; |
201 | else |
202 | index = CPUTIME_SYSTEM; |
203 | |
204 | account_system_index_time(p, cputime, index); |
205 | } |
206 | |
207 | /* |
208 | * Account for involuntary wait time. |
209 | * @cputime: the CPU time spent in involuntary wait |
210 | */ |
211 | void account_steal_time(u64 cputime) |
212 | { |
213 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
214 | |
215 | cpustat[CPUTIME_STEAL] += cputime; |
216 | } |
217 | |
218 | /* |
219 | * Account for idle time. |
220 | * @cputime: the CPU time spent in idle wait |
221 | */ |
222 | void account_idle_time(u64 cputime) |
223 | { |
224 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
225 | struct rq *rq = this_rq(); |
226 | |
227 | if (atomic_read(&rq->nr_iowait) > 0) |
228 | cpustat[CPUTIME_IOWAIT] += cputime; |
229 | else |
230 | cpustat[CPUTIME_IDLE] += cputime; |
231 | } |
232 | |
233 | |
234 | #ifdef CONFIG_SCHED_CORE |
235 | /* |
236 | * Account for forceidle time due to core scheduling. |
237 | * |
238 | * REQUIRES: schedstat is enabled. |
239 | */ |
240 | void __account_forceidle_time(struct task_struct *p, u64 delta) |
241 | { |
242 | __schedstat_add(p->stats.core_forceidle_sum, delta); |
243 | |
244 | task_group_account_field(p, CPUTIME_FORCEIDLE, delta); |
245 | } |
246 | #endif |
247 | |
248 | /* |
249 | * When a guest is interrupted for a longer amount of time, missed clock |
250 | * ticks are not redelivered later. Due to that, this function may on |
251 | * occasion account more time than the calling functions think elapsed. |
252 | */ |
253 | static __always_inline u64 steal_account_process_time(u64 maxtime) |
254 | { |
255 | #ifdef CONFIG_PARAVIRT |
256 | if (static_key_false(&paravirt_steal_enabled)) { |
257 | u64 steal; |
258 | |
259 | steal = paravirt_steal_clock(smp_processor_id()); |
260 | steal -= this_rq()->prev_steal_time; |
261 | steal = min(steal, maxtime); |
262 | account_steal_time(steal); |
263 | this_rq()->prev_steal_time += steal; |
264 | |
265 | return steal; |
266 | } |
267 | #endif |
268 | return 0; |
269 | } |
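/*
 * For example, with HZ=1000 (a 1,000,000 ns tick): if the hypervisor's steal
 * clock advanced by 250,000 ns since the last update, a call with
 * maxtime == TICK_NSEC charges 250,000 ns to CPUTIME_STEAL, advances
 * prev_steal_time by the same amount, and leaves the caller 750,000 ns to
 * charge to user/system/idle. Steal time beyond maxtime stays in the steal
 * clock and is accounted by later calls.
 */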
270 | |
271 | /* |
272 | * Account how much elapsed time was spent in steal, irq, or softirq time. |
273 | */ |
274 | static inline u64 account_other_time(u64 max) |
275 | { |
276 | u64 accounted; |
277 | |
278 | lockdep_assert_irqs_disabled(); |
279 | |
280 | accounted = steal_account_process_time(max); |
281 | |
282 | if (accounted < max) |
283 | accounted += irqtime_tick_accounted(max - accounted); |
284 | |
285 | return accounted; |
286 | } |
287 | |
288 | #ifdef CONFIG_64BIT |
289 | static inline u64 read_sum_exec_runtime(struct task_struct *t) |
290 | { |
291 | return t->se.sum_exec_runtime; |
292 | } |
293 | #else |
294 | static u64 read_sum_exec_runtime(struct task_struct *t) |
295 | { |
296 | u64 ns; |
297 | struct rq_flags rf; |
298 | struct rq *rq; |
299 | |
300 | rq = task_rq_lock(t, &rf); |
301 | ns = t->se.sum_exec_runtime; |
302 | task_rq_unlock(rq, t, &rf); |
303 | |
304 | return ns; |
305 | } |
306 | #endif |
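/*
 * On 64-bit, sum_exec_runtime can be read with a plain load because an
 * aligned 64-bit load cannot tear. On 32-bit the counter may be updated
 * concurrently in two halves, so the task's rq lock is taken to obtain a
 * consistent snapshot.
 */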
307 | |
308 | /* |
309 | * Accumulate raw cputime values of dead tasks (sig->[us]time) and live |
310 | * tasks (sum on group iteration) belonging to @tsk's group. |
311 | */ |
312 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) |
313 | { |
314 | struct signal_struct *sig = tsk->signal; |
315 | u64 utime, stime; |
316 | struct task_struct *t; |
317 | unsigned int seq, nextseq; |
318 | unsigned long flags; |
319 | |
320 | /* |
321 | * Update current task runtime to account pending time since last |
322 | * scheduler action or thread_group_cputime() call. This thread group |
323 | * might have other running tasks on different CPUs, but updating |
324 | * their runtime can affect syscall performance, so we skip accounting |
325 | * those pending times and rely only on values updated on tick or |
326 | * other scheduler action. |
327 | */ |
328 | if (same_thread_group(current, tsk)) |
329 | (void) task_sched_runtime(current); |
330 | |
331 | rcu_read_lock(); |
332 | /* Attempt a lockless read on the first round. */ |
333 | nextseq = 0; |
334 | do { |
335 | seq = nextseq; |
336 | flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); |
337 | times->utime = sig->utime; |
338 | times->stime = sig->stime; |
339 | times->sum_exec_runtime = sig->sum_sched_runtime; |
340 | |
341 | for_each_thread(tsk, t) { |
342 | task_cputime(t, &utime, &stime); |
343 | times->utime += utime; |
344 | times->stime += stime; |
345 | times->sum_exec_runtime += read_sum_exec_runtime(t); |
346 | } |
347 | /* If lockless access failed, take the lock. */ |
348 | nextseq = 1; |
349 | } while (need_seqretry(&sig->stats_lock, seq)); |
350 | done_seqretry_irqrestore(&sig->stats_lock, seq, flags); |
351 | rcu_read_unlock(); |
352 | } |
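/*
 * The loop above uses the "seqbegin or lock" pattern: the first pass
 * (nextseq == 0, even) samples the data protected by sig->stats_lock
 * locklessly; if a writer raced with it, need_seqretry() requests a second
 * pass with nextseq == 1 (odd), for which read_seqbegin_or_lock_irqsave()
 * takes the lock so the retry is guaranteed to succeed.
 */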
353 | |
354 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
355 | /* |
356 | * Account a tick to a process and cpustat |
357 | * @p: the process that the CPU time gets accounted to |
358 | * @user_tick: whether the tick happened in user context |
359 | * @ticks: number of ticks to account |
360 | * |
361 | * Tick demultiplexing follows the order |
362 | * - pending hardirq update |
363 | * - pending softirq update |
364 | * - user_time |
365 | * - idle_time |
366 | * - system time |
367 | * - check for guest_time |
368 | * - else account as system_time |
369 | * |
370 | * The check for hardirq is done for both system and user time, as there is |
371 | * no timer going off while we are in a hardirq and hence we may never get an |
372 | * opportunity to update it solely in system time. |
373 | * p->stime and friends are only updated on system time and not on |
374 | * irq/softirq, as those do not count in task exec_runtime any more. |
375 | */ |
376 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
377 | int ticks) |
378 | { |
379 | u64 other, cputime = TICK_NSEC * ticks; |
380 | |
381 | /* |
382 | * When returning from idle, many ticks can get accounted at |
383 | * once, including some ticks of steal, irq, and softirq time. |
384 | * Subtract those ticks from the amount of time accounted to |
385 | * idle, or potentially user or system time. Due to rounding, |
386 | * other time can exceed ticks occasionally. |
387 | */ |
388 | other = account_other_time(ULONG_MAX); |
389 | if (other >= cputime) |
390 | return; |
391 | |
392 | cputime -= other; |
393 | |
394 | if (this_cpu_ksoftirqd() == p) { |
395 | /* |
396 | * ksoftirqd time does not get accounted in cpu_softirq_time. |
397 | * So, we have to handle it separately here. |
398 | * Also, p->stime needs to be updated for ksoftirqd. |
399 | */ |
400 | account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); |
401 | } else if (user_tick) { |
402 | account_user_time(p, cputime); |
403 | } else if (p == this_rq()->idle) { |
404 | account_idle_time(cputime); |
405 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ |
406 | account_guest_time(p, cputime); |
407 | } else { |
408 | account_system_index_time(p, cputime, CPUTIME_SYSTEM); |
409 | } |
410 | } |
411 | |
412 | static void irqtime_account_idle_ticks(int ticks) |
413 | { |
414 | irqtime_account_process_tick(current, 0, ticks); |
415 | } |
416 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
417 | static inline void irqtime_account_idle_ticks(int ticks) { } |
418 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
419 | int nr_ticks) { } |
420 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
421 | |
422 | /* |
423 | * Use precise platform statistics if available: |
424 | */ |
425 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
426 | |
427 | # ifndef __ARCH_HAS_VTIME_TASK_SWITCH |
428 | void vtime_task_switch(struct task_struct *prev) |
429 | { |
430 | if (is_idle_task(prev)) |
431 | vtime_account_idle(prev); |
432 | else |
433 | vtime_account_kernel(prev); |
434 | |
435 | vtime_flush(prev); |
436 | arch_vtime_task_switch(prev); |
437 | } |
438 | # endif |
439 | |
440 | void vtime_account_irq(struct task_struct *tsk, unsigned int offset) |
441 | { |
442 | unsigned int pc = irq_count() - offset; |
443 | |
444 | if (pc & HARDIRQ_OFFSET) { |
445 | vtime_account_hardirq(tsk); |
446 | } else if (pc & SOFTIRQ_OFFSET) { |
447 | vtime_account_softirq(tsk); |
448 | } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && |
449 | is_idle_task(tsk)) { |
450 | vtime_account_idle(tsk); |
451 | } else { |
452 | vtime_account_kernel(tsk); |
453 | } |
454 | } |
455 | |
456 | void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, |
457 | u64 *ut, u64 *st) |
458 | { |
459 | *ut = curr->utime; |
460 | *st = curr->stime; |
461 | } |
462 | |
463 | void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
464 | { |
465 | *ut = p->utime; |
466 | *st = p->stime; |
467 | } |
468 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); |
469 | |
470 | void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
471 | { |
472 | struct task_cputime cputime; |
473 | |
474 | thread_group_cputime(p, &cputime); |
475 | |
476 | *ut = cputime.utime; |
477 | *st = cputime.stime; |
478 | } |
479 | |
480 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ |
481 | |
482 | /* |
483 | * Account a single tick of CPU time. |
484 | * @p: the process that the CPU time gets accounted to |
485 | * @user_tick: indicates if the tick is a user or a system tick |
486 | */ |
487 | void account_process_tick(struct task_struct *p, int user_tick) |
488 | { |
489 | u64 cputime, steal; |
490 | |
491 | if (vtime_accounting_enabled_this_cpu()) |
492 | return; |
493 | |
494 | if (sched_clock_irqtime) { |
495 | irqtime_account_process_tick(p, user_tick, 1); |
496 | return; |
497 | } |
498 | |
499 | cputime = TICK_NSEC; |
500 | steal = steal_account_process_time(ULONG_MAX); |
501 | |
502 | if (steal >= cputime) |
503 | return; |
504 | |
505 | cputime -= steal; |
506 | |
507 | if (user_tick) |
508 | account_user_time(p, cputime); |
509 | else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET)) |
510 | account_system_time(p, HARDIRQ_OFFSET, cputime); |
511 | else |
512 | account_idle_time(cputime); |
513 | } |
514 | |
515 | /* |
516 | * Account multiple ticks of idle time. |
517 | * @ticks: number of ticks the CPU spent idle |
518 | */ |
519 | void account_idle_ticks(unsigned long ticks) |
520 | { |
521 | u64 cputime, steal; |
522 | |
523 | if (sched_clock_irqtime) { |
524 | irqtime_account_idle_ticks(ticks); |
525 | return; |
526 | } |
527 | |
528 | cputime = ticks * TICK_NSEC; |
529 | steal = steal_account_process_time(ULONG_MAX); |
530 | |
531 | if (steal >= cputime) |
532 | return; |
533 | |
534 | cputime -= steal; |
535 | account_idle_time(cputime); |
536 | } |
537 | |
538 | /* |
539 | * Adjust tick based cputime random precision against scheduler runtime |
540 | * accounting. |
541 | * |
542 | * Tick based cputime accounting depends on whether a task's scheduling |
543 | * timeslices happen to be interrupted by the timer. Depending on these |
544 | * circumstances, the number of these interrupts may over- or under-estimate |
545 | * the real user and system cputime, so the tick based values match reality |
546 | * only with a variable precision. |
547 | * |
548 | * Fix this by scaling these tick based values against the total runtime |
549 | * accounted by the CFS scheduler. |
550 | * |
551 | * This code provides the following guarantees: |
552 | * |
553 | * stime + utime == rtime |
554 | * stime_i+1 >= stime_i, utime_i+1 >= utime_i |
555 | * |
556 | * Assuming that rtime_i+1 >= rtime_i. |
557 | */ |
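/*
 * Worked example: suppose the scheduler-accounted rtime is 100 ms while tick
 * sampling has seen stime = 20 ticks and utime = 60 ticks. The scaling below
 * yields
 *
 *	stime = 100 ms * 20 / (20 + 60) = 25 ms
 *	utime = 100 ms - 25 ms          = 75 ms
 *
 * If an earlier call had already reported prev->stime = 30 ms, stime is
 * clamped up to 30 ms and utime becomes 70 ms, so neither value ever goes
 * backwards between calls.
 */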
558 | void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, |
559 | u64 *ut, u64 *st) |
560 | { |
561 | u64 rtime, stime, utime; |
562 | unsigned long flags; |
563 | |
564 | /* Serialize concurrent callers such that we can honour our guarantees */ |
565 | raw_spin_lock_irqsave(&prev->lock, flags); |
566 | rtime = curr->sum_exec_runtime; |
567 | |
568 | /* |
569 | * This is possible under two circumstances: |
570 | * - rtime isn't monotonic after all (a bug); |
571 | * - we got reordered by the lock. |
572 | * |
573 | * In both cases this acts as a filter such that the rest of the code |
574 | * can assume it is monotonic regardless of anything else. |
575 | */ |
576 | if (prev->stime + prev->utime >= rtime) |
577 | goto out; |
578 | |
579 | stime = curr->stime; |
580 | utime = curr->utime; |
581 | |
582 | /* |
583 | * If either stime or utime are 0, assume all runtime is userspace. |
584 | * Once a task gets some ticks, the monotonicity code at 'update:' |
585 | * will ensure things converge to the observed ratio. |
586 | */ |
587 | if (stime == 0) { |
588 | utime = rtime; |
589 | goto update; |
590 | } |
591 | |
592 | if (utime == 0) { |
593 | stime = rtime; |
594 | goto update; |
595 | } |
596 | |
597 | stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); |
598 | |
599 | update: |
600 | /* |
601 | * Make sure stime doesn't go backwards; this preserves monotonicity |
602 | * for utime because rtime is monotonic. |
603 | * |
604 | * utime_i+1 = rtime_i+1 - stime_i |
605 | * = rtime_i+1 - (rtime_i - utime_i) |
606 | * = (rtime_i+1 - rtime_i) + utime_i |
607 | * >= utime_i |
608 | */ |
609 | if (stime < prev->stime) |
610 | stime = prev->stime; |
611 | utime = rtime - stime; |
612 | |
613 | /* |
614 | * Make sure utime doesn't go backwards; this still preserves |
615 | * monotonicity for stime, analogous argument to above. |
616 | */ |
617 | if (utime < prev->utime) { |
618 | utime = prev->utime; |
619 | stime = rtime - utime; |
620 | } |
621 | |
622 | prev->stime = stime; |
623 | prev->utime = utime; |
624 | out: |
625 | *ut = prev->utime; |
626 | *st = prev->stime; |
627 | raw_spin_unlock_irqrestore(&prev->lock, flags); |
628 | } |
629 | |
630 | void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
631 | { |
632 | struct task_cputime cputime = { |
633 | .sum_exec_runtime = p->se.sum_exec_runtime, |
634 | }; |
635 | |
636 | if (task_cputime(p, &cputime.utime, &cputime.stime)) |
637 | cputime.sum_exec_runtime = task_sched_runtime(p); |
638 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); |
639 | } |
640 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); |
641 | |
642 | void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
643 | { |
644 | struct task_cputime cputime; |
645 | |
646 | thread_group_cputime(p, &cputime); |
647 | cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); |
648 | } |
649 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
650 | |
651 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
652 | static u64 vtime_delta(struct vtime *vtime) |
653 | { |
654 | unsigned long long clock; |
655 | |
656 | clock = sched_clock(); |
657 | if (clock < vtime->starttime) |
658 | return 0; |
659 | |
660 | return clock - vtime->starttime; |
661 | } |
662 | |
663 | static u64 get_vtime_delta(struct vtime *vtime) |
664 | { |
665 | u64 delta = vtime_delta(vtime); |
666 | u64 other; |
667 | |
668 | /* |
669 | * Unlike tick based timing, vtime based timing never has lost |
670 | * ticks, and there is no need for steal time accounting to make up for |
671 | * lost ticks. Vtime accounts a rounded version of actual |
672 | * elapsed time. Limit account_other_time to prevent rounding |
673 | * errors from causing elapsed vtime to go negative. |
674 | */ |
675 | other = account_other_time(delta); |
676 | WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); |
677 | vtime->starttime += delta; |
678 | |
679 | return delta - other; |
680 | } |
681 | |
682 | static void vtime_account_system(struct task_struct *tsk, |
683 | struct vtime *vtime) |
684 | { |
685 | vtime->stime += get_vtime_delta(vtime); |
686 | if (vtime->stime >= TICK_NSEC) { |
687 | account_system_time(tsk, irq_count(), vtime->stime); |
688 | vtime->stime = 0; |
689 | } |
690 | } |
691 | |
692 | static void vtime_account_guest(struct task_struct *tsk, |
693 | struct vtime *vtime) |
694 | { |
695 | vtime->gtime += get_vtime_delta(vtime); |
696 | if (vtime->gtime >= TICK_NSEC) { |
697 | account_guest_time(tsk, vtime->gtime); |
698 | vtime->gtime = 0; |
699 | } |
700 | } |
701 | |
702 | static void __vtime_account_kernel(struct task_struct *tsk, |
703 | struct vtime *vtime) |
704 | { |
705 | /* We might have scheduled out from guest path */ |
706 | if (vtime->state == VTIME_GUEST) |
707 | vtime_account_guest(tsk, vtime); |
708 | else |
709 | vtime_account_system(tsk, vtime); |
710 | } |
711 | |
712 | void vtime_account_kernel(struct task_struct *tsk) |
713 | { |
714 | struct vtime *vtime = &tsk->vtime; |
715 | |
716 | if (!vtime_delta(vtime)) |
717 | return; |
718 | |
719 | write_seqcount_begin(&vtime->seqcount); |
720 | __vtime_account_kernel(tsk, vtime); |
721 | write_seqcount_end(&vtime->seqcount); |
722 | } |
723 | |
724 | void vtime_user_enter(struct task_struct *tsk) |
725 | { |
726 | struct vtime *vtime = &tsk->vtime; |
727 | |
728 | write_seqcount_begin(&vtime->seqcount); |
729 | vtime_account_system(tsk, vtime); |
730 | vtime->state = VTIME_USER; |
731 | write_seqcount_end(&vtime->seqcount); |
732 | } |
733 | |
734 | void vtime_user_exit(struct task_struct *tsk) |
735 | { |
736 | struct vtime *vtime = &tsk->vtime; |
737 | |
738 | write_seqcount_begin(&vtime->seqcount); |
739 | vtime->utime += get_vtime_delta(vtime); |
740 | if (vtime->utime >= TICK_NSEC) { |
741 | account_user_time(tsk, vtime->utime); |
742 | vtime->utime = 0; |
743 | } |
744 | vtime->state = VTIME_SYS; |
745 | write_seqcount_end(&vtime->seqcount); |
746 | } |
747 | |
748 | void vtime_guest_enter(struct task_struct *tsk) |
749 | { |
750 | struct vtime *vtime = &tsk->vtime; |
751 | /* |
752 | * The flags must be updated under the lock with |
753 | * the vtime_starttime flush and update. |
754 | * That enforces the right ordering and synchronizes the update |
755 | * sequence against the reader (task_gtime()), which can thus |
756 | * safely catch up with a tickless delta. |
757 | */ |
758 | write_seqcount_begin(&vtime->seqcount); |
759 | vtime_account_system(tsk, vtime); |
760 | tsk->flags |= PF_VCPU; |
761 | vtime->state = VTIME_GUEST; |
762 | write_seqcount_end(&vtime->seqcount); |
763 | } |
764 | EXPORT_SYMBOL_GPL(vtime_guest_enter); |
765 | |
766 | void vtime_guest_exit(struct task_struct *tsk) |
767 | { |
768 | struct vtime *vtime = &tsk->vtime; |
769 | |
770 | write_seqcount_begin(&vtime->seqcount); |
771 | vtime_account_guest(tsk, vtime); |
772 | tsk->flags &= ~PF_VCPU; |
773 | vtime->state = VTIME_SYS; |
774 | write_seqcount_end(&vtime->seqcount); |
775 | } |
776 | EXPORT_SYMBOL_GPL(vtime_guest_exit); |
777 | |
778 | void vtime_account_idle(struct task_struct *tsk) |
779 | { |
780 | account_idle_time(get_vtime_delta(&tsk->vtime)); |
781 | } |
782 | |
783 | void vtime_task_switch_generic(struct task_struct *prev) |
784 | { |
785 | struct vtime *vtime = &prev->vtime; |
786 | |
787 | write_seqcount_begin(&vtime->seqcount); |
788 | if (vtime->state == VTIME_IDLE) |
789 | vtime_account_idle(prev); |
790 | else |
791 | __vtime_account_kernel(prev, vtime); |
792 | vtime->state = VTIME_INACTIVE; |
793 | vtime->cpu = -1; |
794 | write_seqcount_end(&vtime->seqcount); |
795 | |
796 | vtime = ¤t->vtime; |
797 | |
798 | write_seqcount_begin(&vtime->seqcount); |
799 | if (is_idle_task(current)) |
800 | vtime->state = VTIME_IDLE; |
801 | else if (current->flags & PF_VCPU) |
802 | vtime->state = VTIME_GUEST; |
803 | else |
804 | vtime->state = VTIME_SYS; |
805 | vtime->starttime = sched_clock(); |
806 | vtime->cpu = smp_processor_id(); |
807 | write_seqcount_end(&vtime->seqcount); |
808 | } |
809 | |
810 | void vtime_init_idle(struct task_struct *t, int cpu) |
811 | { |
812 | struct vtime *vtime = &t->vtime; |
813 | unsigned long flags; |
814 | |
815 | local_irq_save(flags); |
816 | write_seqcount_begin(&vtime->seqcount); |
817 | vtime->state = VTIME_IDLE; |
818 | vtime->starttime = sched_clock(); |
819 | vtime->cpu = cpu; |
820 | write_seqcount_end(&vtime->seqcount); |
821 | local_irq_restore(flags); |
822 | } |
823 | |
824 | u64 task_gtime(struct task_struct *t) |
825 | { |
826 | struct vtime *vtime = &t->vtime; |
827 | unsigned int seq; |
828 | u64 gtime; |
829 | |
830 | if (!vtime_accounting_enabled()) |
831 | return t->gtime; |
832 | |
833 | do { |
834 | seq = read_seqcount_begin(&vtime->seqcount); |
835 | |
836 | gtime = t->gtime; |
837 | if (vtime->state == VTIME_GUEST) |
838 | gtime += vtime->gtime + vtime_delta(vtime); |
839 | |
840 | } while (read_seqcount_retry(&vtime->seqcount, seq)); |
841 | |
842 | return gtime; |
843 | } |
844 | |
845 | /* |
846 | * Fetch cputime raw values from fields of task_struct and |
847 | * add up the pending nohz execution time since the last |
848 | * cputime snapshot. |
849 | */ |
850 | bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) |
851 | { |
852 | struct vtime *vtime = &t->vtime; |
853 | unsigned int seq; |
854 | u64 delta; |
855 | int ret; |
856 | |
857 | if (!vtime_accounting_enabled()) { |
858 | *utime = t->utime; |
859 | *stime = t->stime; |
860 | return false; |
861 | } |
862 | |
863 | do { |
864 | ret = false; |
865 | seq = read_seqcount_begin(&vtime->seqcount); |
866 | |
867 | *utime = t->utime; |
868 | *stime = t->stime; |
869 | |
870 | /* Task is sleeping or idle, nothing to add */ |
871 | if (vtime->state < VTIME_SYS) |
872 | continue; |
873 | |
874 | ret = true; |
875 | delta = vtime_delta(vtime); |
876 | |
877 | /* |
878 | * Task runs either in user (including guest) or kernel space, |
879 | * add pending nohz time to the right place. |
880 | */ |
881 | if (vtime->state == VTIME_SYS) |
882 | *stime += vtime->stime + delta; |
883 | else |
884 | *utime += vtime->utime + delta; |
885 | } while (read_seqcount_retry(&vtime->seqcount, seq)); |
886 | |
887 | return ret; |
888 | } |
889 | |
890 | static int vtime_state_fetch(struct vtime *vtime, int cpu) |
891 | { |
892 | int state = READ_ONCE(vtime->state); |
893 | |
894 | /* |
895 | * We raced against a context switch, fetch the |
896 | * kcpustat task again. |
897 | */ |
898 | if (vtime->cpu != cpu && vtime->cpu != -1) |
899 | return -EAGAIN; |
900 | |
901 | /* |
902 | * Two possible things here: |
903 | * 1) We are seeing the scheduling out task (prev) or any past one. |
904 | * 2) We are seeing the scheduling in task (next) but it hasn't |
905 | * passed through vtime_task_switch() yet, so the pending |
906 | * cputime of the prev task may not be flushed yet. |
907 | * |
908 | * Case 1) is ok but 2) is not. So wait for a safe VTIME state. |
909 | */ |
910 | if (state == VTIME_INACTIVE) |
911 | return -EAGAIN; |
912 | |
913 | return state; |
914 | } |
915 | |
916 | static u64 kcpustat_user_vtime(struct vtime *vtime) |
917 | { |
918 | if (vtime->state == VTIME_USER) |
919 | return vtime->utime + vtime_delta(vtime); |
920 | else if (vtime->state == VTIME_GUEST) |
921 | return vtime->gtime + vtime_delta(vtime); |
922 | return 0; |
923 | } |
924 | |
925 | static int kcpustat_field_vtime(u64 *cpustat, |
926 | struct task_struct *tsk, |
927 | enum cpu_usage_stat usage, |
928 | int cpu, u64 *val) |
929 | { |
930 | struct vtime *vtime = &tsk->vtime; |
931 | unsigned int seq; |
932 | |
933 | do { |
934 | int state; |
935 | |
936 | seq = read_seqcount_begin(&vtime->seqcount); |
937 | |
938 | state = vtime_state_fetch(vtime, cpu); |
939 | if (state < 0) |
940 | return state; |
941 | |
942 | *val = cpustat[usage]; |
943 | |
944 | /* |
945 | * Nice vs. unnice cputime accounting may be inaccurate if |
946 | * the nice value has changed since the last vtime update. |
947 | * But a proper fix would involve interrupting the target on nice |
948 | * updates, which is a no-go on nohz_full (although the scheduler |
949 | * may still interrupt the target if rescheduling is needed...) |
950 | */ |
951 | switch (usage) { |
952 | case CPUTIME_SYSTEM: |
953 | if (state == VTIME_SYS) |
954 | *val += vtime->stime + vtime_delta(vtime); |
955 | break; |
956 | case CPUTIME_USER: |
957 | if (task_nice(tsk) <= 0) |
958 | *val += kcpustat_user_vtime(vtime); |
959 | break; |
960 | case CPUTIME_NICE: |
961 | if (task_nice(tsk) > 0) |
962 | *val += kcpustat_user_vtime(vtime); |
963 | break; |
964 | case CPUTIME_GUEST: |
965 | if (state == VTIME_GUEST && task_nice(tsk) <= 0) |
966 | *val += vtime->gtime + vtime_delta(vtime); |
967 | break; |
968 | case CPUTIME_GUEST_NICE: |
969 | if (state == VTIME_GUEST && task_nice(tsk) > 0) |
970 | *val += vtime->gtime + vtime_delta(vtime); |
971 | break; |
972 | default: |
973 | break; |
974 | } |
975 | } while (read_seqcount_retry(&vtime->seqcount, seq)); |
976 | |
977 | return 0; |
978 | } |
979 | |
980 | u64 kcpustat_field(struct kernel_cpustat *kcpustat, |
981 | enum cpu_usage_stat usage, int cpu) |
982 | { |
983 | u64 *cpustat = kcpustat->cpustat; |
984 | u64 val = cpustat[usage]; |
985 | struct rq *rq; |
986 | int err; |
987 | |
988 | if (!vtime_accounting_enabled_cpu(cpu)) |
989 | return val; |
990 | |
991 | rq = cpu_rq(cpu); |
992 | |
993 | for (;;) { |
994 | struct task_struct *curr; |
995 | |
996 | rcu_read_lock(); |
997 | curr = rcu_dereference(rq->curr); |
998 | if (WARN_ON_ONCE(!curr)) { |
999 | rcu_read_unlock(); |
1000 | return cpustat[usage]; |
1001 | } |
1002 | |
1003 | err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); |
1004 | rcu_read_unlock(); |
1005 | |
1006 | if (!err) |
1007 | return val; |
1008 | |
1009 | cpu_relax(); |
1010 | } |
1011 | } |
1012 | EXPORT_SYMBOL_GPL(kcpustat_field); |
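/*
 * Usage sketch (hypothetical caller, not part of this file): reading the
 * user time of a nohz_full CPU through kcpustat_field(), which folds in the
 * not-yet-flushed vtime delta of the task currently running there.
 */
static inline u64 example_cpu_user_time(int cpu)
{
	return kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_USER, cpu);
}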
1013 | |
1014 | static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, |
1015 | const struct kernel_cpustat *src, |
1016 | struct task_struct *tsk, int cpu) |
1017 | { |
1018 | struct vtime *vtime = &tsk->vtime; |
1019 | unsigned int seq; |
1020 | |
1021 | do { |
1022 | u64 *cpustat; |
1023 | u64 delta; |
1024 | int state; |
1025 | |
1026 | seq = read_seqcount_begin(&vtime->seqcount); |
1027 | |
1028 | state = vtime_state_fetch(vtime, cpu); |
1029 | if (state < 0) |
1030 | return state; |
1031 | |
1032 | *dst = *src; |
1033 | cpustat = dst->cpustat; |
1034 | |
1035 | /* Task is sleeping, dead or idle, nothing to add */ |
1036 | if (state < VTIME_SYS) |
1037 | continue; |
1038 | |
1039 | delta = vtime_delta(vtime); |
1040 | |
1041 | /* |
1042 | * Task runs either in user (including guest) or kernel space, |
1043 | * add pending nohz time to the right place. |
1044 | */ |
1045 | if (state == VTIME_SYS) { |
1046 | cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; |
1047 | } else if (state == VTIME_USER) { |
1048 | if (task_nice(tsk) > 0) |
1049 | cpustat[CPUTIME_NICE] += vtime->utime + delta; |
1050 | else |
1051 | cpustat[CPUTIME_USER] += vtime->utime + delta; |
1052 | } else { |
1053 | WARN_ON_ONCE(state != VTIME_GUEST); |
1054 | if (task_nice(tsk) > 0) { |
1055 | cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; |
1056 | cpustat[CPUTIME_NICE] += vtime->gtime + delta; |
1057 | } else { |
1058 | cpustat[CPUTIME_GUEST] += vtime->gtime + delta; |
1059 | cpustat[CPUTIME_USER] += vtime->gtime + delta; |
1060 | } |
1061 | } |
1062 | } while (read_seqcount_retry(&vtime->seqcount, seq)); |
1063 | |
1064 | return 0; |
1065 | } |
1066 | |
1067 | void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) |
1068 | { |
1069 | const struct kernel_cpustat *src = &kcpustat_cpu(cpu); |
1070 | struct rq *rq; |
1071 | int err; |
1072 | |
1073 | if (!vtime_accounting_enabled_cpu(cpu)) { |
1074 | *dst = *src; |
1075 | return; |
1076 | } |
1077 | |
1078 | rq = cpu_rq(cpu); |
1079 | |
1080 | for (;;) { |
1081 | struct task_struct *curr; |
1082 | |
1083 | rcu_read_lock(); |
1084 | curr = rcu_dereference(rq->curr); |
1085 | if (WARN_ON_ONCE(!curr)) { |
1086 | rcu_read_unlock(); |
1087 | *dst = *src; |
1088 | return; |
1089 | } |
1090 | |
1091 | err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); |
1092 | rcu_read_unlock(); |
1093 | |
1094 | if (!err) |
1095 | return; |
1096 | |
1097 | cpu_relax(); |
1098 | } |
1099 | } |
1100 | EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); |
1101 | |
1102 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ |
1103 |