/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KERNEL_STATS_H
#define _KERNEL_STATS_H

#ifdef CONFIG_SCHEDSTATS

extern struct static_key_false sched_schedstats;

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
	if (rq) {
		rq->rq_sched_info.run_delay += delta;
		rq->rq_sched_info.pcount++;
	}
}

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_cpu_time += delta;
}

static inline void
rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_sched_info.run_delay += delta;
}
#define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
#define __schedstat_inc(var)		do { var++; } while (0)
#define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
#define __schedstat_add(var, amt)	do { var += (amt); } while (0)
#define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
#define __schedstat_set(var, val)	do { var = (val); } while (0)
#define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var)		(var)
#define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
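
/*
 * Usage sketch (illustrative; the field names below are examples only):
 * the plain schedstat_*() helpers test the static key themselves, while
 * the __schedstat_*() variants assume the caller has already checked
 * schedstat_enabled():
 *
 *	schedstat_inc(rq->yld_count);
 *
 *	if (schedstat_enabled()) {
 *		__schedstat_set(stats->wait_start, wait_start);
 *		__schedstat_add(stats->wait_sum, delta);
 *	}
 */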

void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
			       struct sched_statistics *stats);

void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
			     struct sched_statistics *stats);
void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
				    struct sched_statistics *stats);

static inline void
check_schedstat_required(void)
{
	if (schedstat_enabled())
		return;

	/* Force schedstat enabled if a dependent tracepoint is active */
	if (trace_sched_stat_wait_enabled()	||
	    trace_sched_stat_sleep_enabled()	||
	    trace_sched_stat_iowait_enabled()	||
	    trace_sched_stat_blocked_enabled()	||
	    trace_sched_stat_runtime_enabled())
		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n");
}
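
/*
 * For reference (a sketch, not an exhaustive list): schedstats can be
 * enabled with the schedstats=enable boot parameter, or at runtime via
 * the kernel.sched_schedstats sysctl:
 *
 *	# sysctl -w kernel.sched_schedstats=1
 */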

#else /* !CONFIG_SCHEDSTATS: */

static inline void rq_sched_info_arrive  (struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_depart  (struct rq *rq, unsigned long long delta) { }
# define schedstat_enabled()		0
# define __schedstat_inc(var)		do { } while (0)
# define schedstat_inc(var)		do { } while (0)
# define __schedstat_add(var, amt)	do { } while (0)
# define schedstat_add(var, amt)	do { } while (0)
# define __schedstat_set(var, val)	do { } while (0)
# define schedstat_set(var, val)	do { } while (0)
# define schedstat_val(var)		0
# define schedstat_val_or_zero(var)	0

# define __update_stats_wait_start(rq, p, stats)	do { } while (0)
# define __update_stats_wait_end(rq, p, stats)		do { } while (0)
# define __update_stats_enqueue_sleeper(rq, p, stats)	do { } while (0)
# define check_schedstat_required()			do { } while (0)

#endif /* CONFIG_SCHEDSTATS */

#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity_stats {
	struct sched_entity	se;
	struct sched_statistics	stats;
} __no_randomize_layout;
#endif

static inline struct sched_statistics *
__schedstats_from_se(struct sched_entity *se)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	if (!entity_is_task(se))
		return &container_of(se, struct sched_entity_stats, se)->stats;
#endif
	return &task_of(se)->stats;
}
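
/*
 * Note (informational sketch): the container_of() above relies on group
 * scheduling entities being allocated as part of a struct
 * sched_entity_stats, so that the statistics sit directly behind the
 * embedded sched_entity, roughly:
 *
 *	struct sched_entity *se;
 *
 *	se = kzalloc(sizeof(struct sched_entity_stats), GFP_KERNEL);
 */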

#ifdef CONFIG_PSI
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		     bool sleep);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
#else
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
				       struct task_struct *prev) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
 * PSI tracks state that persists across sleeps, such as iowaits and
 * memory stalls. As a result, it has to distinguish between sleeps,
 * where a task's runnable state changes, and migrations, where a task
 * and its runnable state are being moved between CPUs and runqueues.
 *
 * A notable case is a task whose dequeue is delayed. PSI considers
 * those sleeping, but because they are still on the runqueue they can
 * go through migration requeues. In this case, *sleeping* states need
 * to be transferred.
 */
static inline void psi_enqueue(struct task_struct *p, int flags)
{
	int clear = 0, set = 0;

	if (static_branch_likely(&psi_disabled))
		return;

	/* Same runqueue, nothing changed for psi */
	if (flags & ENQUEUE_RESTORE)
		return;

	/* psi_sched_switch() will handle the flags */
	if (task_on_cpu(task_rq(p), p))
		return;

	if (p->se.sched_delayed) {
		/* CPU migration of "sleeping" task */
		WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED));
		if (p->in_memstall)
			set |= TSK_MEMSTALL;
		if (p->in_iowait)
			set |= TSK_IOWAIT;
	} else if (flags & ENQUEUE_MIGRATED) {
		/* CPU migration of runnable task */
		set = TSK_RUNNING;
		if (p->in_memstall)
			set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
	} else {
		/* Wakeup of new or sleeping task */
		if (p->in_iowait)
			clear |= TSK_IOWAIT;
		set = TSK_RUNNING;
		if (p->in_memstall)
			set |= TSK_MEMSTALL_RUNNING;
	}

	psi_task_change(p, clear, set);
}

static inline void psi_dequeue(struct task_struct *p, int flags)
{
	if (static_branch_likely(&psi_disabled))
		return;

	/* Same runqueue, nothing changed for psi */
	if (flags & DEQUEUE_SAVE)
		return;

	/*
	 * A voluntary sleep is a dequeue followed by a task switch. To
	 * avoid walking all ancestors twice, psi_task_switch() handles
	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
	 * Do nothing here.
	 */
	if (flags & DEQUEUE_SLEEP)
		return;

	/*
	 * When migrating a task to another CPU, clear all psi
	 * state. The enqueue callback above will work it out.
	 */
	psi_task_change(p, p->psi_flags, 0);
}

static inline void psi_ttwu_dequeue(struct task_struct *p)
{
	if (static_branch_likely(&psi_disabled))
		return;
	/*
	 * Is the task being migrated during a wakeup? Make sure to
	 * deregister its sleep-persistent psi states from the old
	 * queue, and let psi_enqueue() know it has to requeue.
	 */
	if (unlikely(p->psi_flags)) {
		struct rq_flags rf;
		struct rq *rq;

		rq = __task_rq_lock(p, &rf);
		psi_task_change(p, p->psi_flags, 0);
		__task_rq_unlock(rq, &rf);
	}
}

static inline void psi_sched_switch(struct task_struct *prev,
				    struct task_struct *next,
				    bool sleep)
{
	if (static_branch_likely(&psi_disabled))
		return;

	psi_task_switch(prev, next, sleep);
}
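
/*
 * Rough flow sketch for a voluntary sleep (informational): dequeue_task()
 * passes DEQUEUE_SLEEP, so psi_dequeue() above returns early; the
 * subsequent context switch then reaches psi_sched_switch() ->
 * psi_task_switch(), which handles TSK_RUNNING and TSK_IOWAIT for the
 * sleeping task in the same pass that moves TSK_ONCPU.
 */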

#else /* CONFIG_PSI */
static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
				    struct task_struct *next,
				    bool sleep) {}
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
				       struct task_struct *prev) {}
#endif /* CONFIG_PSI */

#ifdef CONFIG_SCHED_INFO
/*
 * We are interested in knowing how long it was from the *first* time a
 * task was queued to the time that it finally hit a CPU. We call this
 * routine from dequeue_task() to account for possible rq->clock skew
 * across CPUs; taking the delta on each CPU cancels out the skew.
 */
static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
{
	unsigned long long delta = 0;

	if (!t->sched_info.last_queued)
		return;

	delta = rq_clock(rq) - t->sched_info.last_queued;
	t->sched_info.last_queued = 0;
	t->sched_info.run_delay += delta;
	if (delta > t->sched_info.max_run_delay)
		t->sched_info.max_run_delay = delta;
	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
		t->sched_info.min_run_delay = delta;
	rq_sched_info_dequeue(rq, delta);
}

/*
 * Called when a task finally hits the CPU. We can now calculate how
 * long it was waiting to run. We also note when it began so that we
 * can keep stats on how long its time-slice is.
 */
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
	unsigned long long now, delta = 0;

	if (!t->sched_info.last_queued)
		return;

	now = rq_clock(rq);
	delta = now - t->sched_info.last_queued;
	t->sched_info.last_queued = 0;
	t->sched_info.run_delay += delta;
	t->sched_info.last_arrival = now;
	t->sched_info.pcount++;
	if (delta > t->sched_info.max_run_delay)
		t->sched_info.max_run_delay = delta;
	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
		t->sched_info.min_run_delay = delta;

	rq_sched_info_arrive(rq, delta);
}

/*
 * This function is only called from enqueue_task(), but also only updates
 * the timestamp if it is not already set. It's assumed that
 * sched_info_dequeue() will clear that stamp when appropriate.
 */
static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
{
	if (!t->sched_info.last_queued)
		t->sched_info.last_queued = rq_clock(rq);
}

/*
 * Called when a process ceases being the active-running process involuntarily
 * due, typically, to expiring its time slice (this may also be called when
 * switching to the idle task). Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_enqueue() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
	unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;

	rq_sched_info_depart(rq, delta);

	if (task_is_running(t))
		sched_info_enqueue(rq, t);
}

/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice. (This may also be called when switching to or from
 * the idle task.) We are only called when prev != next.
 */
static inline void
sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
	/*
	 * prev now departs the CPU. It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)
		sched_info_depart(rq, prev);

	if (next != rq->idle)
		sched_info_arrive(rq, next);
}
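
/*
 * Call-site sketch (informational): the expected caller is
 * prepare_task_switch() in kernel/sched/core.c, roughly:
 *
 *	sched_info_switch(rq, prev, next);
 */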

#else /* !CONFIG_SCHED_INFO: */
# define sched_info_enqueue(rq, t)	do { } while (0)
# define sched_info_dequeue(rq, t)	do { } while (0)
# define sched_info_switch(rq, t, next)	do { } while (0)
#endif /* CONFIG_SCHED_INFO */

#endif /* _KERNEL_STATS_H */