// SPDX-License-Identifier: GPL-2.0
/*
 * Kernel internal timers
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
 *
 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
 *            "A Kernel Model for Precision Timekeeping" by Dave Mills
 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *            serialize accesses to xtime/lost_ticks).
 *            Copyright (C) 1998 Andrea Arcangeli
 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
 *            Copyright (C) 2000, 2001, 2002 Ingo Molnar
 *            Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

#include "tick-internal.h"
#include "timer_migration.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:		LVL_CLK_DIV ^ level
 * The level clock frequency is:	HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther the expiry time is away the higher the array level and
 * therefore the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity. Those fit into the first wheel level,
 * which has HZ granularity.
 * There is no cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constants are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0      0         3 ms                0 ms -        210 ms
 *  1     64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2    128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3    192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4    256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5    320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6    384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7    448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0      0         4 ms                0 ms -        255 ms
 *  1     64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2    128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3    192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4    256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5    320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6    384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7    448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0      0        10 ms                0 ms -        630 ms
 *  1     64        80 ms              640 ms -       5110 ms (640ms - ~5s)
 *  2    128       640 ms             5120 ms -      40950 ms (~5s - ~40s)
 *  3    192      5120 ms (~5s)      40960 ms -     327670 ms (~40s - ~5m)
 *  4    256     40960 ms (~40s)    327680 ms -    2621430 ms (~5m - ~43m)
 *  5    320    327680 ms (~5m)    2621440 ms -   20971510 ms (~43m - ~5h)
 *  6    384   2621440 ms (~43m)  20971520 ms -  167772150 ms (~5h - ~1d)
 *  7    448  20971520 ms (~5h)  167772160 ms - 1342177270 ms (~1d - ~15d)
 */
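
/*
 * A concrete example, assuming HZ=1000: a timeout armed 300 jiffies
 * (300 ms) ahead is beyond the level 0 range of 0-63 ms, i.e. past
 * LVL_START(1) = 63, but below LVL_START(2) = 504, so it is queued in
 * level 1 where LVL_GRAN(1) = 8 ms. Its effective expiry is rounded up
 * to the next multiple of 8 jiffies, so it may fire up to ~8 ms late;
 * this rounding is the implicit batching described above.
 */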

/* Clock divisor for the next level */
#define LVL_CLK_SHIFT	3
#define LVL_CLK_DIV	(1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK	(LVL_CLK_DIV - 1)
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 */
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/* Size of each clock level */
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_OFFS(n)	((n) * LVL_SIZE)

/* Level depth */
#if HZ > 100
# define LVL_DEPTH	9
# else
# define LVL_DEPTH	8
#endif

/* The cutoff (max. capacity of the wheel) */
#define WHEEL_TIMEOUT_CUTOFF	(LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX	(WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting wheel size. If NOHZ is configured we allocate two
 * wheels so we have a separate storage for the deferrable timers.
 */
#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)

#ifdef CONFIG_NO_HZ_COMMON
/*
 * If multiple bases need to be locked, use the base ordering for lock
 * nesting, i.e. lowest number first.
 */
# define NR_BASES	3
# define BASE_LOCAL	0
# define BASE_GLOBAL	1
# define BASE_DEF	2
#else
# define NR_BASES	1
# define BASE_LOCAL	0
# define BASE_GLOBAL	0
# define BASE_DEF	0
#endif

/**
 * struct timer_base - Per CPU timer base (number of bases depends on config)
 * @lock:		Lock protecting the timer_base
 * @running_timer:	When expiring timers, the lock is dropped. To make
 *			sure not to race against deleting/modifying a
 *			currently running timer, the pointer is set to the
 *			timer which is expiring at the moment. If no timer is
 *			running, the pointer is NULL.
 * @expiry_lock:	PREEMPT_RT only: Lock is taken in softirq around
 *			timer expiry callback execution and when trying to
 *			delete a running timer when that was not successful
 *			at the first attempt. It prevents priority inversion
 *			when the callback was preempted on a remote CPU and a
 *			caller tries to delete the running timer. It also
 *			prevents a livelock, when the task which tries to
 *			delete a timer preempted the softirq thread which
 *			is running the timer callback function.
 * @timer_waiters:	PREEMPT_RT only: Tells whether there is a waiter
 *			waiting for the end of the timer callback function
 *			execution.
 * @clk:		clock of the timer base; is updated before enqueue
 *			of a timer; during expiry, it is 1 offset ahead of
 *			jiffies to avoid endless requeuing to current
 *			jiffies
 * @next_expiry:	expiry value of the first timer; it is updated when
 *			finding the next timer and during enqueue; the
 *			value is not valid, when next_expiry_recalc is set
 * @cpu:		Number of CPU the timer base belongs to
 * @next_expiry_recalc: States whether a recalculation of next_expiry is
 *			required. It is set to true when a timer was
 *			deleted.
 * @is_idle:		Is set when the timer_base is idle. It is triggered
 *			by NOHZ code. This state is only used in the standard
 *			bases. Deferrable timers, which are enqueued remotely,
 *			never wake up an idle CPU, so there is no need to
 *			support it for this base.
 * @timers_pending:	Is set when a timer is pending in the base. It is only
 *			reliable when next_expiry_recalc is not set.
 * @pending_map:	bitmap of the timer wheel; each bit reflects a
 *			bucket of the wheel. When a bit is set, at least a
 *			single timer is enqueued in the related bucket.
 * @vectors:		Array of lists; Each array member reflects a bucket
 *			of the timer wheel. The list contains all timers
 *			which are enqueued into a specific bucket.
 */
struct timer_base {
	raw_spinlock_t		lock;
	struct timer_list	*running_timer;
#ifdef CONFIG_PREEMPT_RT
	spinlock_t		expiry_lock;
	atomic_t		timer_waiters;
#endif
	unsigned long		clk;
	unsigned long		next_expiry;
	unsigned int		cpu;
	bool			next_expiry_recalc;
	bool			is_idle;
	bool			timers_pending;
	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
	struct hlist_head	vectors[WHEEL_SIZE];
} ____cacheline_aligned;

static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

#ifdef CONFIG_NO_HZ_COMMON

static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
static DEFINE_MUTEX(timer_keys_mutex);

static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);

#ifdef CONFIG_SMP
static unsigned int sysctl_timer_migration = 1;

DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

static void timers_update_migration(void)
{
	if (sysctl_timer_migration && tick_nohz_active)
		static_branch_enable(&timers_migration_enabled);
	else
		static_branch_disable(&timers_migration_enabled);
}

#ifdef CONFIG_SYSCTL
static int timer_migration_handler(struct ctl_table *table, int write,
				   void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	mutex_lock(&timer_keys_mutex);
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (!ret && write)
		timers_update_migration();
	mutex_unlock(&timer_keys_mutex);
	return ret;
}

static struct ctl_table timer_sysctl[] = {
	{
		.procname	= "timer_migration",
		.data		= &sysctl_timer_migration,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= timer_migration_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{}
};

static int __init timer_sysctl_init(void)
{
	register_sysctl("kernel", timer_sysctl);
	return 0;
}
device_initcall(timer_sysctl_init);
#endif /* CONFIG_SYSCTL */
#else /* CONFIG_SMP */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */

static void timer_update_keys(struct work_struct *work)
{
	mutex_lock(&timer_keys_mutex);
	timers_update_migration();
	static_branch_enable(&timers_nohz_active);
	mutex_unlock(&timer_keys_mutex);
}

void timers_update_nohz(void)
{
	schedule_work(&timer_update_work);
}

static inline bool is_timers_nohz_active(void)
{
	return static_branch_unlikely(&timers_nohz_active);
}
#else
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */

static unsigned long round_jiffies_common(unsigned long j, int cpu,
					  bool force_up)
{
	int rem;
	unsigned long original = j;

	/*
	 * We don't want all cpus firing their timers at once hitting the
	 * same lock or cachelines, so we skew each extra cpu with an extra
	 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
	 * already did this.
	 * The skew is done by adding 3*cpunr, then rounding, then subtracting
	 * this extra offset again.
	 */
	j += cpu * 3;

	rem = j % HZ;

	/*
	 * If the target jiffy is just after a whole second (which can happen
	 * due to delays of the timer irq, long irq off times etc etc) then
	 * we should round down to the whole second, not up. Use 1/4th second
	 * as cutoff for this rounding as an extreme upper bound for this.
	 * But never round down if @force_up is set.
	 */
	if (rem < HZ/4 && !force_up) /* round down */
		j = j - rem;
	else /* round up */
		j = j - rem + HZ;

	/* now that we have rounded, subtract the extra skew again */
	j -= cpu * 3;

	/*
	 * Make sure j is still in the future. Otherwise return the
	 * unmodified value.
	 */
	return time_is_after_jiffies(j) ? j : original;
}
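
/*
 * A concrete walk-through, assuming HZ=1000 and cpu=2: the value is
 * first skewed by cpu * 3 = 6 jiffies, then rem = j % HZ decides the
 * direction: rem < HZ/4 rounds down to the previous full second
 * (unless @force_up is set), anything else rounds up to the next full
 * second, and finally the 6 jiffies of skew are subtracted again. Each
 * CPU thus settles on a slightly different "full second" target, which
 * avoids a thundering herd on the timer base locks.
 */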

/**
 * __round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
	return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);

/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
	unsigned long j0 = jiffies;

	/* Use j0 because jiffies might change while we run */
	return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies(unsigned long j)
{
	return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
	return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);

/**
 * __round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
	return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);

/**
 * __round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies_relative() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
	unsigned long j0 = jiffies;

	/* Use j0 because jiffies might change while we run */
	return round_jiffies_common(j + j0, cpu, true) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);

/**
 * round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * This is the same as round_jiffies() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up(unsigned long j)
{
	return round_jiffies_common(j, raw_smp_processor_id(), true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);

/**
 * round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * This is the same as round_jiffies_relative() except that it will never
 * round down. This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
	return __round_jiffies_up_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
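
/*
 * A rough usage sketch for illustration; the my_dev / my_housekeeping
 * names below are made up and not part of this file:
 *
 *	static void my_housekeeping(struct timer_list *t)
 *	{
 *		struct my_dev *dev = from_timer(dev, t, hk_timer);
 *
 *		my_dev_do_housekeeping(dev);
 *		// Re-arm roughly every 5 seconds, batched to full seconds.
 *		mod_timer(&dev->hk_timer,
 *			  jiffies + round_jiffies_up_relative(5 * HZ));
 *	}
 */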


static inline unsigned int timer_get_idx(struct timer_list *timer)
{
	return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
}

static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
	timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
			idx << TIMER_ARRAYSHIFT;
}

/*
 * Helper function to calculate the array index for a given expiry
 * time.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
				  unsigned long *bucket_expiry)
{

	/*
	 * The timer wheel has to guarantee that a timer does not fire
	 * early. Early expiry can happen due to:
	 * - Timer is armed at the edge of a tick
	 * - Truncation of the expiry time in the outer wheel levels
	 *
	 * Round up with level granularity to prevent this.
	 */
	expires = (expires >> LVL_SHIFT(lvl)) + 1;
	*bucket_expiry = expires << LVL_SHIFT(lvl);
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
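
/*
 * Small numeric example, assuming HZ=1000: for expires = 1000 handled
 * in level 1 (LVL_SHIFT(1) == 3), the calculation is (1000 >> 3) + 1 =
 * 126, so *bucket_expiry becomes 126 << 3 = 1008 and the returned index
 * is LVL_OFFS(1) + (126 & LVL_MASK). The "+ 1" is what guarantees that
 * the bucket never expires before the requested time, at the cost of up
 * to one level granularity of extra delay.
 */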

static int calc_wheel_index(unsigned long expires, unsigned long clk,
			    unsigned long *bucket_expiry)
{
	unsigned long delta = expires - clk;
	unsigned int idx;

	if (delta < LVL_START(1)) {
		idx = calc_index(expires, 0, bucket_expiry);
	} else if (delta < LVL_START(2)) {
		idx = calc_index(expires, 1, bucket_expiry);
	} else if (delta < LVL_START(3)) {
		idx = calc_index(expires, 2, bucket_expiry);
	} else if (delta < LVL_START(4)) {
		idx = calc_index(expires, 3, bucket_expiry);
	} else if (delta < LVL_START(5)) {
		idx = calc_index(expires, 4, bucket_expiry);
	} else if (delta < LVL_START(6)) {
		idx = calc_index(expires, 5, bucket_expiry);
	} else if (delta < LVL_START(7)) {
		idx = calc_index(expires, 6, bucket_expiry);
	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
		idx = calc_index(expires, 7, bucket_expiry);
	} else if ((long) delta < 0) {
		idx = clk & LVL_MASK;
		*bucket_expiry = clk;
	} else {
		/*
		 * Force expire obscenely large timeouts to expire at the
		 * capacity limit of the wheel.
		 */
		if (delta >= WHEEL_TIMEOUT_CUTOFF)
			expires = clk + WHEEL_TIMEOUT_MAX;

		idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
	}
	return idx;
}

static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
	/*
	 * Deferrable timers do not prevent the CPU from entering dynticks and
	 * are not taken into account on the idle/nohz_full path. An IPI when a
	 * new deferrable timer is enqueued will wake up the remote CPU but
	 * nothing will be done with the deferrable timer base. Therefore skip
	 * the remote IPI for deferrable timers completely.
	 */
	if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
		return;

	/*
	 * We might have to IPI the remote CPU if the base is idle and the
	 * timer is pinned. If it is a non-pinned timer, it is only queued
	 * on the remote CPU when the timer was running during queueing. Then
	 * everything is handled by the remote CPU anyway. If the other CPU is
	 * on the way to idle then it can't set base->is_idle as we hold
	 * the base lock:
	 */
	if (base->is_idle) {
		WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
			       tick_nohz_full_cpu(base->cpu)));
		wake_up_nohz_cpu(base->cpu);
	}
}

/*
 * Enqueue the timer into the hash bucket, mark it pending in
 * the bitmap, store the index in the timer flags then wake up
 * the target CPU if needed.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
			  unsigned int idx, unsigned long bucket_expiry)
{

	hlist_add_head(&timer->entry, base->vectors + idx);
	__set_bit(idx, base->pending_map);
	timer_set_idx(timer, idx);

	trace_timer_start(timer, bucket_expiry);

	/*
	 * Check whether this is the new first expiring timer. The
	 * effective expiry time of the timer is required here
	 * (bucket_expiry) instead of timer->expires.
	 */
	if (time_before(bucket_expiry, base->next_expiry)) {
		/*
		 * Set the next expiry time and kick the CPU so it
		 * can reevaluate the wheel:
		 */
		base->next_expiry = bucket_expiry;
		base->timers_pending = true;
		base->next_expiry_recalc = false;
		trigger_dyntick_cpu(base, timer);
	}
}

static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
	unsigned long bucket_expiry;
	unsigned int idx;

	idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
	enqueue_timer(base, timer, idx, bucket_expiry);
}

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

static const struct debug_obj_descr timer_debug_descr;

struct timer_hint {
	void	(*function)(struct timer_list *t);
	long	offset;
};

#define TIMER_HINT(fn, container, timr, hintfn)			\
	{							\
		.function = fn,					\
		.offset	  = offsetof(container, hintfn) -	\
			    offsetof(container, timr)		\
	}

static const struct timer_hint timer_hints[] = {
	TIMER_HINT(delayed_work_timer_fn,
		   struct delayed_work, timer, work.func),
	TIMER_HINT(kthread_delayed_work_timer_fn,
		   struct kthread_delayed_work, timer, work.func),
};

static void *timer_debug_hint(void *addr)
{
	struct timer_list *timer = addr;
	int i;

	for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
		if (timer_hints[i].function == timer->function) {
			void (**fn)(void) = addr + timer_hints[i].offset;

			return *fn;
		}
	}

	return timer->function;
}

static bool timer_is_static_object(void *addr)
{
	struct timer_list *timer = addr;

	return (timer->entry.pprev == NULL &&
		timer->entry.next == TIMER_ENTRY_STATIC);
}

/*
 * timer_fixup_init is called when:
 * - an active object is initialized
 */
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		del_timer_sync(timer);
		debug_object_init(timer, &timer_debug_descr);
		return true;
	default:
		return false;
	}
}

/* Stub timer callback for improperly used timers. */
static void stub_timer(struct timer_list *unused)
{
	WARN_ON(1);
}

/*
 * timer_fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_NOTAVAILABLE:
		timer_setup(timer, stub_timer, 0);
		return true;

	case ODEBUG_STATE_ACTIVE:
		WARN_ON(1);
		fallthrough;
	default:
		return false;
	}
}

/*
 * timer_fixup_free is called when:
 * - an active object is freed
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		del_timer_sync(timer);
		debug_object_free(timer, &timer_debug_descr);
		return true;
	default:
		return false;
	}
}

/*
 * timer_fixup_assert_init is called when:
 * - an untracked/uninit-ed object is found
 */
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
	struct timer_list *timer = addr;

	switch (state) {
	case ODEBUG_STATE_NOTAVAILABLE:
		timer_setup(timer, stub_timer, 0);
		return true;
	default:
		return false;
	}
}

static const struct debug_obj_descr timer_debug_descr = {
	.name			= "timer_list",
	.debug_hint		= timer_debug_hint,
	.is_static_object	= timer_is_static_object,
	.fixup_init		= timer_fixup_init,
	.fixup_activate		= timer_fixup_activate,
	.fixup_free		= timer_fixup_free,
	.fixup_assert_init	= timer_fixup_assert_init,
};

static inline void debug_timer_init(struct timer_list *timer)
{
	debug_object_init(timer, &timer_debug_descr);
}

static inline void debug_timer_activate(struct timer_list *timer)
{
	debug_object_activate(timer, &timer_debug_descr);
}

static inline void debug_timer_deactivate(struct timer_list *timer)
{
	debug_object_deactivate(timer, &timer_debug_descr);
}

static inline void debug_timer_assert_init(struct timer_list *timer)
{
	debug_object_assert_init(timer, &timer_debug_descr);
}

static void do_init_timer(struct timer_list *timer,
			  void (*func)(struct timer_list *),
			  unsigned int flags,
			  const char *name, struct lock_class_key *key);

void init_timer_on_stack_key(struct timer_list *timer,
			     void (*func)(struct timer_list *),
			     unsigned int flags,
			     const char *name, struct lock_class_key *key)
{
	debug_object_init_on_stack(timer, &timer_debug_descr);
	do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);

void destroy_timer_on_stack(struct timer_list *timer)
{
	debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);

#else
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif

static inline void debug_init(struct timer_list *timer)
{
	debug_timer_init(timer);
	trace_timer_init(timer);
}

static inline void debug_deactivate(struct timer_list *timer)
{
	debug_timer_deactivate(timer);
	trace_timer_cancel(timer);
}

static inline void debug_assert_init(struct timer_list *timer)
{
	debug_timer_assert_init(timer);
}

static void do_init_timer(struct timer_list *timer,
			  void (*func)(struct timer_list *),
			  unsigned int flags,
			  const char *name, struct lock_class_key *key)
{
	timer->entry.pprev = NULL;
	timer->function = func;
	if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
		flags &= TIMER_INIT_FLAGS;
	timer->flags = flags | raw_smp_processor_id();
	lockdep_init_map(&timer->lockdep_map, name, key, 0);
}

/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 */
void init_timer_key(struct timer_list *timer,
		    void (*func)(struct timer_list *), unsigned int flags,
		    const char *name, struct lock_class_key *key)
{
	debug_init(timer);
	do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
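
/*
 * A rough usage sketch for illustration; callers normally reach
 * init_timer_key() through the timer_setup() wrapper, which supplies
 * the lockdep key and name. The my_dev / my_watchdog_fn names below are
 * made up and not part of this file:
 *
 *	static void my_watchdog_fn(struct timer_list *t)
 *	{
 *		struct my_dev *dev = from_timer(dev, t, watchdog);
 *
 *		my_dev_handle_timeout(dev);
 *	}
 *
 *	timer_setup(&dev->watchdog, my_watchdog_fn, 0);
 *	mod_timer(&dev->watchdog, jiffies + msecs_to_jiffies(500));
 */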

static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
	struct hlist_node *entry = &timer->entry;

	debug_deactivate(timer);

	__hlist_del(entry);
	if (clear_pending)
		entry->pprev = NULL;
	entry->next = LIST_POISON2;
}

static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
			     bool clear_pending)
{
	unsigned idx = timer_get_idx(timer);

	if (!timer_pending(timer))
		return 0;

	if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
		__clear_bit(idx, base->pending_map);
		base->next_expiry_recalc = true;
	}

	detach_timer(timer, clear_pending);
	return 1;
}

static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
	struct timer_base *base;

	base = per_cpu_ptr(&timer_bases[index], cpu);

	/*
	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
	 * to use the deferrable base.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
	return base;
}

static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
	int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
	struct timer_base *base;

	base = this_cpu_ptr(&timer_bases[index]);

	/*
	 * If the timer is deferrable and NO_HZ_COMMON is set then we need
	 * to use the deferrable base.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
		base = this_cpu_ptr(&timer_bases[BASE_DEF]);
	return base;
}

static inline struct timer_base *get_timer_base(u32 tflags)
{
	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}

static inline void __forward_timer_base(struct timer_base *base,
					unsigned long basej)
{
	/*
	 * Check whether we can forward the base. We can only do that when
	 * @basej is past base->clk otherwise we might rewind base->clk.
	 */
	if (time_before_eq(basej, base->clk))
		return;

	/*
	 * If the next expiry value is > jiffies, then we fast forward to
	 * jiffies otherwise we forward to the next expiry value.
	 */
	if (time_after(base->next_expiry, basej)) {
		base->clk = basej;
	} else {
		if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
			return;
		base->clk = base->next_expiry;
	}

}

static inline void forward_timer_base(struct timer_base *base)
{
	__forward_timer_base(base, READ_ONCE(jiffies));
}

/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
					  unsigned long *flags)
	__acquires(timer->base->lock)
{
	for (;;) {
		struct timer_base *base;
		u32 tf;

		/*
		 * We need to use READ_ONCE() here, otherwise the compiler
		 * might re-read @tf between the check for TIMER_MIGRATING
		 * and spin_lock().
		 */
		tf = READ_ONCE(timer->flags);

		if (!(tf & TIMER_MIGRATING)) {
			base = get_timer_base(tf);
			raw_spin_lock_irqsave(&base->lock, *flags);
			if (timer->flags == tf)
				return base;
			raw_spin_unlock_irqrestore(&base->lock, *flags);
		}
		cpu_relax();
	}
}

#define MOD_TIMER_PENDING_ONLY		0x01
#define MOD_TIMER_REDUCE		0x02
#define MOD_TIMER_NOTPENDING		0x04

static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
	unsigned long clk = 0, flags, bucket_expiry;
	struct timer_base *base, *new_base;
	unsigned int idx = UINT_MAX;
	int ret = 0;

	debug_assert_init(timer);

	/*
	 * This is a common optimization triggered by the networking code - if
	 * the timer is re-modified to have the same timeout or ends up in the
	 * same array bucket then just return:
	 */
	if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
		/*
		 * The downside of this optimization is that it can result in
		 * larger granularity than you would get from adding a new
		 * timer with this expiry.
		 */
		long diff = timer->expires - expires;

		if (!diff)
			return 1;
		if (options & MOD_TIMER_REDUCE && diff <= 0)
			return 1;

		/*
		 * We lock timer base and calculate the bucket index right
		 * here. If the timer ends up in the same bucket, then we
		 * just update the expiry time and avoid the whole
		 * dequeue/enqueue dance.
		 */
		base = lock_timer_base(timer, &flags);
		/*
		 * Has @timer been shutdown? This needs to be evaluated
		 * while holding base lock to prevent a race against the
		 * shutdown code.
		 */
		if (!timer->function)
			goto out_unlock;

		forward_timer_base(base);

		if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
		    time_before_eq(timer->expires, expires)) {
			ret = 1;
			goto out_unlock;
		}

		clk = base->clk;
		idx = calc_wheel_index(expires, clk, &bucket_expiry);

		/*
		 * Retrieve and compare the array index of the pending
		 * timer. If it matches set the expiry to the new value so a
		 * subsequent call will exit in the expires check above.
		 */
		if (idx == timer_get_idx(timer)) {
			if (!(options & MOD_TIMER_REDUCE))
				timer->expires = expires;
			else if (time_after(timer->expires, expires))
				timer->expires = expires;
			ret = 1;
			goto out_unlock;
		}
	} else {
		base = lock_timer_base(timer, &flags);
		/*
		 * Has @timer been shutdown? This needs to be evaluated
		 * while holding base lock to prevent a race against the
		 * shutdown code.
		 */
		if (!timer->function)
			goto out_unlock;

		forward_timer_base(base);
	}

	ret = detach_if_pending(timer, base, false);
	if (!ret && (options & MOD_TIMER_PENDING_ONLY))
		goto out_unlock;

	new_base = get_timer_this_cpu_base(timer->flags);

	if (base != new_base) {
		/*
		 * We are trying to schedule the timer on the new base.
		 * However we can't change timer's base while it is running,
		 * otherwise timer_delete_sync() can't detect that the timer's
		 * handler has not yet finished. This also guarantees that the
		 * timer is serialized wrt itself.
		 */
		if (likely(base->running_timer != timer)) {
			/* See the comment in lock_timer_base() */
			timer->flags |= TIMER_MIGRATING;

			raw_spin_unlock(&base->lock);
			base = new_base;
			raw_spin_lock(&base->lock);
			WRITE_ONCE(timer->flags,
				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
			forward_timer_base(base);
		}
	}

	debug_timer_activate(timer);

	timer->expires = expires;
	/*
	 * If 'idx' was calculated above and the base time did not advance
	 * between calculating 'idx' and possibly switching the base, only
	 * enqueue_timer() is required. Otherwise we need to (re)calculate
	 * the wheel index via internal_add_timer().
	 */
	if (idx != UINT_MAX && clk == base->clk)
		enqueue_timer(base, timer, idx, bucket_expiry);
	else
		internal_add_timer(base, timer);

out_unlock:
	raw_spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}

/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer: The pending timer to be modified
 * @expires: New absolute timeout in jiffies
 *
 * mod_timer_pending() is the same for pending timers as mod_timer(), but
 * will not activate inactive timers.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and not modified or was in
 *        shutdown state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);

/**
 * mod_timer - Modify a timer's timeout
 * @timer: The timer to be modified
 * @expires: New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the del_timer() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *        state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *        the timer was active and not modified because @expires did
 *        not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);
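
/*
 * A rough usage sketch for illustration; dev->tx_watchdog is a made-up
 * example timer. Because mod_timer() both activates an inactive timer
 * and requeues an active one, it is the usual way to push an existing
 * deadline out:
 *
 *	// On TX completion: postpone the watchdog by another second.
 *	mod_timer(&dev->tx_watchdog, jiffies + HZ);
 *
 * Most callers ignore the return value; it only distinguishes "was
 * inactive" (0) from "was already pending" (1).
 */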

/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer: The timer to be modified
 * @expires: New absolute timeout in jiffies
 *
 * timer_reduce() is very similar to mod_timer(), except that it will only
 * modify an enqueued timer if that would reduce the expiration time. If
 * @timer is not enqueued it starts the timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *        state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *        the timer was active and not modified because @expires
 *        did not change the effective expiry time such that the
 *        timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
	return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
}
EXPORT_SYMBOL(timer_reduce);

/**
 * add_timer - Start a timer
 * @timer: The timer to be started
 *
 * Start @timer to expire at @timer->expires in the future. @timer->expires
 * is the absolute expiry time measured in 'jiffies'. When the timer expires
 * timer->function(timer) will be invoked from soft interrupt context.
 *
 * The @timer->expires and @timer->function fields must be set prior
 * to calling this function.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * If @timer->expires is already in the past @timer will be queued to
 * expire at the next timer tick.
 *
 * This can only operate on an inactive timer. Attempts to invoke this on
 * an active timer are rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);

/**
 * add_timer_local() - Start a timer on the local CPU
 * @timer: The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is set.
 *
 * See add_timer() for further details.
 */
void add_timer_local(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	timer->flags |= TIMER_PINNED;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_local);

/**
 * add_timer_global() - Start a timer without TIMER_PINNED flag set
 * @timer: The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is unset.
 *
 * See add_timer() for further details.
 */
void add_timer_global(struct timer_list *timer)
{
	if (WARN_ON_ONCE(timer_pending(timer)))
		return;
	timer->flags &= ~TIMER_PINNED;
	__mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_global);

/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer: The timer to be started
 * @cpu: The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU and
 * the TIMER_PINNED flag is set. When the timer should not be a pinned timer
 * in the next round, add_timer_global() should be used instead as it unsets
 * the TIMER_PINNED flag.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
	struct timer_base *new_base, *base;
	unsigned long flags;

	debug_assert_init(timer);

	if (WARN_ON_ONCE(timer_pending(timer)))
		return;

	/* Make sure timer flags have TIMER_PINNED flag set */
	timer->flags |= TIMER_PINNED;

	new_base = get_timer_cpu_base(timer->flags, cpu);

	/*
	 * If @timer was on a different CPU, it should be migrated with the
	 * old base locked to prevent other operations proceeding with the
	 * wrong base locked. See lock_timer_base().
	 */
	base = lock_timer_base(timer, &flags);
	/*
	 * Has @timer been shutdown? This needs to be evaluated while
	 * holding base lock to prevent a race against the shutdown code.
	 */
	if (!timer->function)
		goto out_unlock;

	if (base != new_base) {
		timer->flags |= TIMER_MIGRATING;

		raw_spin_unlock(&base->lock);
		base = new_base;
		raw_spin_lock(&base->lock);
		WRITE_ONCE(timer->flags,
			   (timer->flags & ~TIMER_BASEMASK) | cpu);
	}
	forward_timer_base(base);

	debug_timer_activate(timer);
	internal_add_timer(base, timer);
out_unlock:
	raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);

/**
 * __timer_delete - Internal function: Deactivate a timer
 * @timer: The timer to be deactivated
 * @shutdown: If true, this indicates that the timer is about to be
 *	      shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. In that
 * case any attempt to rearm @timer after this function returns will be
 * silently ignored.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete(struct timer_list *timer, bool shutdown)
{
	struct timer_base *base;
	unsigned long flags;
	int ret = 0;

	debug_assert_init(timer);

	/*
	 * If @shutdown is set then the lock has to be taken whether the
	 * timer is pending or not to protect against a concurrent rearm
	 * which might hit between the lockless pending check and the lock
	 * acquisition. By taking the lock it is ensured that such a newly
	 * enqueued timer is dequeued and cannot end up with
	 * timer->function == NULL in the expiry code.
	 *
	 * If timer->function is currently executed, then this makes sure
	 * that the callback cannot requeue the timer.
	 */
	if (timer_pending(timer) || shutdown) {
		base = lock_timer_base(timer, &flags);
		ret = detach_if_pending(timer, base, true);
		if (shutdown)
			timer->function = NULL;
		raw_spin_unlock_irqrestore(&base->lock, flags);
	}

	return ret;
}

/**
 * timer_delete - Deactivate a timer
 * @timer: The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * Neither does it prevent rearming of the timer. If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete(struct timer_list *timer)
{
	return __timer_delete(timer, false);
}
EXPORT_SYMBOL(timer_delete);
1457 | |
1458 | /** |
1459 | * timer_shutdown - Deactivate a timer and prevent rearming |
1460 | * @timer: The timer to be deactivated |
1461 | * |
1462 | * The function does not wait for an eventually running timer callback on a |
1463 | * different CPU but it prevents rearming of the timer. Any attempt to arm |
1464 | * @timer after this function returns will be silently ignored. |
1465 | * |
1466 | * This function is useful for teardown code and should only be used when |
1467 | * timer_shutdown_sync() cannot be invoked due to locking or context constraints. |
1468 | * |
1469 | * Return: |
1470 | * * %0 - The timer was not pending |
1471 | * * %1 - The timer was pending |
1472 | */ |
1473 | int timer_shutdown(struct timer_list *timer) |
1474 | { |
1475 | return __timer_delete(timer, shutdown: true); |
1476 | } |
1477 | EXPORT_SYMBOL_GPL(timer_shutdown); |
1478 | |
1479 | /** |
1480 | * __try_to_del_timer_sync - Internal function: Try to deactivate a timer |
1481 | * @timer: Timer to deactivate |
1482 | * @shutdown: If true, this indicates that the timer is about to be |
1483 | * shutdown permanently. |
1484 | * |
1485 | * If @shutdown is true then @timer->function is set to NULL under the |
1486 | * timer base lock which prevents further rearming of the timer. Any |
1487 | * attempt to rearm @timer after this function returns will be silently |
1488 | * ignored. |
1489 | * |
1490 | * This function cannot guarantee that the timer cannot be rearmed |
1491 | * right after dropping the base lock if @shutdown is false. That |
1492 | * needs to be prevented by the calling code if necessary. |
1493 | * |
1494 | * Return: |
1495 | * * %0 - The timer was not pending |
1496 | * * %1 - The timer was pending and deactivated |
1497 | * * %-1 - The timer callback function is running on a different CPU |
1498 | */ |
1499 | static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) |
1500 | { |
1501 | struct timer_base *base; |
1502 | unsigned long flags; |
1503 | int ret = -1; |
1504 | |
1505 | debug_assert_init(timer); |
1506 | |
1507 | base = lock_timer_base(timer, &flags); |
1508 | |
1509 | if (base->running_timer != timer) |
1510 | ret = detach_if_pending(timer, base, true); |
1511 | if (shutdown) |
1512 | timer->function = NULL; |
1513 | |
1514 | raw_spin_unlock_irqrestore(&base->lock, flags); |
1515 | |
1516 | return ret; |
1517 | } |
1518 | |
1519 | /** |
1520 | * try_to_del_timer_sync - Try to deactivate a timer |
1521 | * @timer: Timer to deactivate |
1522 | * |
1523 | * This function tries to deactivate a timer. On success the timer is not |
1524 | * queued and the timer callback function is not running on any CPU. |
1525 | * |
1526 | * This function does not guarantee that the timer cannot be rearmed right |
1527 | * after dropping the base lock. That needs to be prevented by the calling |
1528 | * code if necessary. |
1529 | * |
1530 | * Return: |
1531 | * * %0 - The timer was not pending |
1532 | * * %1 - The timer was pending and deactivated |
1533 | * * %-1 - The timer callback function is running on a different CPU |
1534 | */ |
1535 | int try_to_del_timer_sync(struct timer_list *timer) |
1536 | { |
1537 | return __try_to_del_timer_sync(timer, false); |
1538 | } |
1539 | EXPORT_SYMBOL(try_to_del_timer_sync); |
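
/*
 * A minimal usage sketch for try_to_del_timer_sync() (illustrative only;
 * 'dev', its lock and timer are hypothetical). The caller holds a lock
 * which the timer callback also acquires, so timer_delete_sync() would
 * deadlock. Instead the lock is dropped and the deletion retried while the
 * callback is running on another CPU:
 *
 *	spin_lock_irq(&dev->lock);
 *	while (try_to_del_timer_sync(&dev->timer) < 0) {
 *		spin_unlock_irq(&dev->lock);
 *		cpu_relax();
 *		spin_lock_irq(&dev->lock);
 *	}
 *	spin_unlock_irq(&dev->lock);
 *
 * When the loop exits the timer is neither queued nor running, and holding
 * dev->lock is what prevents a concurrent rearm in this sketch.
 */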
1540 | |
1541 | #ifdef CONFIG_PREEMPT_RT |
1542 | static __init void timer_base_init_expiry_lock(struct timer_base *base) |
1543 | { |
1544 | spin_lock_init(&base->expiry_lock); |
1545 | } |
1546 | |
1547 | static inline void timer_base_lock_expiry(struct timer_base *base) |
1548 | { |
1549 | spin_lock(&base->expiry_lock); |
1550 | } |
1551 | |
1552 | static inline void timer_base_unlock_expiry(struct timer_base *base) |
1553 | { |
1554 | spin_unlock(&base->expiry_lock); |
1555 | } |
1556 | |
1557 | /* |
1558 | * The counterpart to del_timer_wait_running(). |
1559 | * |
1560 | * If there is a waiter for base->expiry_lock, then it was waiting for the |
1561 | * timer callback to finish. Drop expiry_lock and reacquire it. That allows |
1562 | * the waiter to acquire the lock and make progress. |
1563 | */ |
1564 | static void timer_sync_wait_running(struct timer_base *base) |
1565 | { |
1566 | if (atomic_read(&base->timer_waiters)) { |
1567 | raw_spin_unlock_irq(&base->lock); |
1568 | spin_unlock(&base->expiry_lock); |
1569 | spin_lock(&base->expiry_lock); |
1570 | raw_spin_lock_irq(&base->lock); |
1571 | } |
1572 | } |
1573 | |
1574 | /* |
1575 | * This function is called on PREEMPT_RT kernels when the fast path |
1576 | * deletion of a timer failed because the timer callback function was |
1577 | * running. |
1578 | * |
1579 | * This prevents priority inversion, if the softirq thread on a remote CPU |
1580 | * got preempted, and it prevents a livelock when the task which tries to |
1581 | * delete a timer preempted the softirq thread running the timer callback |
1582 | * function. |
1583 | */ |
1584 | static void del_timer_wait_running(struct timer_list *timer) |
1585 | { |
1586 | u32 tf; |
1587 | |
1588 | tf = READ_ONCE(timer->flags); |
1589 | if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { |
1590 | struct timer_base *base = get_timer_base(tf); |
1591 | |
1592 | /* |
1593 | * Mark the base as contended and grab the expiry lock, |
1594 | * which is held by the softirq across the timer |
1595 | * callback. Drop the lock immediately so the softirq can |
1596 | * expire the next timer. In theory the timer could already |
1597 | * be running again, but that's more than unlikely and just |
1598 | * causes another wait loop. |
1599 | */ |
1600 | atomic_inc(&base->timer_waiters); |
1601 | spin_lock_bh(&base->expiry_lock); |
1602 | atomic_dec(&base->timer_waiters); |
1603 | spin_unlock_bh(&base->expiry_lock); |
1604 | } |
1605 | } |
1606 | #else |
1607 | static inline void timer_base_init_expiry_lock(struct timer_base *base) { } |
1608 | static inline void timer_base_lock_expiry(struct timer_base *base) { } |
1609 | static inline void timer_base_unlock_expiry(struct timer_base *base) { } |
1610 | static inline void timer_sync_wait_running(struct timer_base *base) { } |
1611 | static inline void del_timer_wait_running(struct timer_list *timer) { } |
1612 | #endif |
1613 | |
1614 | /** |
1615 | * __timer_delete_sync - Internal function: Deactivate a timer and wait |
1616 | * for the handler to finish. |
1617 | * @timer: The timer to be deactivated |
1618 | * @shutdown: If true, @timer->function will be set to NULL under the |
1619 | * timer base lock which prevents rearming of @timer |
1620 | * |
1621 | * If @shutdown is not set the timer can be rearmed later. If the timer can |
1622 | * be rearmed concurrently, i.e. after dropping the base lock then the |
1623 | * return value is meaningless. |
1624 | * |
1625 | * If @shutdown is set then @timer->function is set to NULL under timer |
1626 | * base lock which prevents rearming of the timer. Any attempt to rearm |
1627 | * a shutdown timer is silently ignored. |
1628 | * |
1629 | * If the timer should be reused after shutdown it has to be initialized |
1630 | * again. |
1631 | * |
1632 | * Return: |
1633 | * * %0 - The timer was not pending |
1634 | * * %1 - The timer was pending and deactivated |
1635 | */ |
1636 | static int __timer_delete_sync(struct timer_list *timer, bool shutdown) |
1637 | { |
1638 | int ret; |
1639 | |
1640 | #ifdef CONFIG_LOCKDEP |
1641 | unsigned long flags; |
1642 | |
1643 | /* |
1644 | * If lockdep gives a backtrace here, please reference |
1645 | * the synchronization rules above. |
1646 | */ |
1647 | local_irq_save(flags); |
1648 | lock_map_acquire(&timer->lockdep_map); |
1649 | lock_map_release(&timer->lockdep_map); |
1650 | local_irq_restore(flags); |
1651 | #endif |
1652 | /* |
1653 | * Don't use this function in hardirq context, because it |
1654 | * could lead to deadlock. |
1655 | */ |
1656 | WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE)); |
1657 | |
1658 | /* |
1659 | * Must be able to sleep on PREEMPT_RT because of the slowpath in |
1660 | * del_timer_wait_running(). |
1661 | */ |
1662 | if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) |
1663 | lockdep_assert_preemption_enabled(); |
1664 | |
1665 | do { |
1666 | ret = __try_to_del_timer_sync(timer, shutdown); |
1667 | |
1668 | if (unlikely(ret < 0)) { |
1669 | del_timer_wait_running(timer); |
1670 | cpu_relax(); |
1671 | } |
1672 | } while (ret < 0); |
1673 | |
1674 | return ret; |
1675 | } |
1676 | |
1677 | /** |
1678 | * timer_delete_sync - Deactivate a timer and wait for the handler to finish. |
1679 | * @timer: The timer to be deactivated |
1680 | * |
1681 | * Synchronization rules: Callers must prevent restarting of the timer, |
1682 | * otherwise this function is meaningless. It must not be called from |
1683 | * interrupt contexts unless the timer is an irqsafe one. The caller must |
1684 | * not hold locks which would prevent completion of the timer's callback |
1685 | * function. The timer's handler must not call add_timer_on(). Upon exit |
1686 | * the timer is not queued and the handler is not running on any CPU. |
1687 | * |
1688 | * For !irqsafe timers, the caller must not hold locks that are held in |
1689 | * interrupt context. Even if the lock has nothing to do with the timer in |
1690 | * question. Here's why:: |
1691 | * |
1692 | * CPU0 CPU1 |
1693 | * ---- ---- |
1694 | * <SOFTIRQ> |
1695 | * call_timer_fn(); |
1696 | * base->running_timer = mytimer; |
1697 | * spin_lock_irq(somelock); |
1698 | * <IRQ> |
1699 | * spin_lock(somelock); |
1700 | * timer_delete_sync(mytimer); |
1701 | * while (base->running_timer == mytimer); |
1702 | * |
1703 | * Now timer_delete_sync() will never return and never release somelock. |
1704 | * The interrupt on the other CPU is waiting to grab somelock but it has |
1705 | * interrupted the softirq that CPU0 is waiting to finish. |
1706 | * |
1707 | * This function cannot guarantee that the timer is not rearmed again by |
1708 | * some concurrent or preempting code, right after it dropped the base |
1709 | * lock. If there is the possibility of a concurrent rearm then the return |
1710 | * value of the function is meaningless. |
1711 | * |
1712 | * If such a guarantee is needed, e.g. for teardown situations then use |
1713 | * timer_shutdown_sync() instead. |
1714 | * |
1715 | * Return: |
1716 | * * %0 - The timer was not pending |
1717 | * * %1 - The timer was pending and deactivated |
1718 | */ |
1719 | int timer_delete_sync(struct timer_list *timer) |
1720 | { |
1721 | return __timer_delete_sync(timer, false); |
1722 | } |
1723 | EXPORT_SYMBOL(timer_delete_sync); |
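
/*
 * A minimal usage sketch for timer_delete_sync() (illustrative only; the
 * 'mydev' members and mydev_remove() are hypothetical). Rearming is stopped
 * before the call so that the "not queued and not running" guarantee is
 * meaningful, and no lock taken by the timer callback is held here:
 *
 *	static void mydev_remove(struct mydev *dev)
 *	{
 *		WRITE_ONCE(dev->stopping, true);
 *		timer_delete_sync(&dev->timer);
 *	}
 *
 * The timer callback is assumed to check dev->stopping before calling
 * mod_timer() again, which is exactly the kind of conditional that
 * timer_shutdown_sync() below makes unnecessary.
 */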
1724 | |
1725 | /** |
1726 | * timer_shutdown_sync - Shutdown a timer and prevent rearming |
1727 | * @timer: The timer to be shutdown |
1728 | * |
1729 | * When the function returns it is guaranteed that: |
1730 | * - @timer is not queued |
1731 | * - The callback function of @timer is not running |
1732 | * - @timer cannot be enqueued again. Any attempt to rearm |
1733 | * @timer is silently ignored. |
1734 | * |
1735 | * See timer_delete_sync() for synchronization rules. |
1736 | * |
1737 | * This function is useful for final teardown of an infrastructure where |
1738 | * the timer is subject to a circular dependency problem. |
1739 | * |
1740 | * A common pattern for this is a timer and a workqueue where the timer can |
1741 | * schedule work and work can arm the timer. On shutdown the workqueue must |
1742 | * be destroyed and the timer must be prevented from rearming. Unless the |
1743 | * code has conditionals like 'if (mything->in_shutdown)' to prevent that, |
1744 | * there is no way to get this correct with timer_delete_sync(). |
1745 | * |
1746 | * timer_shutdown_sync() solves this problem. The correct ordering of |
1747 | * calls in this case is: |
1748 | * |
1749 | * timer_shutdown_sync(&mything->timer); |
1750 | * workqueue_destroy(&mything->workqueue); |
1751 | * |
1752 | * After this 'mything' can be safely freed. |
1753 | * |
1754 | * This obviously implies that the timer is not required to be functional |
1755 | * for the rest of the shutdown operation. |
1756 | * |
1757 | * Return: |
1758 | * * %0 - The timer was not pending |
1759 | * * %1 - The timer was pending |
1760 | */ |
1761 | int timer_shutdown_sync(struct timer_list *timer) |
1762 | { |
1763 | return __timer_delete_sync(timer, true); |
1764 | } |
1765 | EXPORT_SYMBOL_GPL(timer_shutdown_sync); |
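
/*
 * A sketch of the timer/workqueue teardown pattern described above
 * (illustrative only; 'mything', its members and the function names are
 * hypothetical, and <linux/workqueue.h> plus <linux/slab.h> are assumed):
 *
 *	static void mything_timer_fn(struct timer_list *t)
 *	{
 *		struct mything *m = from_timer(m, t, timer);
 *
 *		queue_work(m->wq, &m->work);
 *	}
 *
 *	static void mything_teardown(struct mything *m)
 *	{
 *		timer_shutdown_sync(&m->timer);
 *		destroy_workqueue(m->wq);
 *		kfree(m);
 *	}
 *
 * Even if a still-queued work item tries to rearm m->timer after the
 * shutdown, that attempt is silently ignored, so destroy_workqueue() and
 * kfree() can follow without any 'in_shutdown' conditionals.
 */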
1766 | |
1767 | static void call_timer_fn(struct timer_list *timer, |
1768 | void (*fn)(struct timer_list *), |
1769 | unsigned long baseclk) |
1770 | { |
1771 | int count = preempt_count(); |
1772 | |
1773 | #ifdef CONFIG_LOCKDEP |
1774 | /* |
1775 | * It is permissible to free the timer from inside the |
1776 | * function that is called from it; we need to take this into |
1777 | * account for lockdep too. To avoid bogus "held lock freed" |
1778 | * warnings as well as problems when looking into |
1779 | * timer->lockdep_map, make a copy and use that here. |
1780 | */ |
1781 | struct lockdep_map lockdep_map; |
1782 | |
1783 | lockdep_copy_map(&lockdep_map, &timer->lockdep_map); |
1784 | #endif |
1785 | /* |
1786 | * Couple the lock chain with the lock chain at |
1787 | * timer_delete_sync() by acquiring the lock_map around the fn() |
1788 | * call here and in timer_delete_sync(). |
1789 | */ |
1790 | lock_map_acquire(&lockdep_map); |
1791 | |
1792 | trace_timer_expire_entry(timer, baseclk); |
1793 | fn(timer); |
1794 | trace_timer_expire_exit(timer); |
1795 | |
1796 | lock_map_release(&lockdep_map); |
1797 | |
1798 | if (count != preempt_count()) { |
1799 | WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", |
1800 | fn, count, preempt_count()); |
1801 | /* |
1802 | * Restore the preempt count. That gives us a decent |
1803 | * chance to survive and extract information. If the |
1804 | * callback kept a lock held, bad luck, but not worse |
1805 | * than the BUG() we had. |
1806 | */ |
1807 | preempt_count_set(count); |
1808 | } |
1809 | } |
1810 | |
1811 | static void expire_timers(struct timer_base *base, struct hlist_head *head) |
1812 | { |
1813 | /* |
1814 | * This value is required only for tracing. base->clk was |
1815 | * incremented directly before expire_timers was called. But expiry |
1816 | * is related to the old base->clk value. |
1817 | */ |
1818 | unsigned long baseclk = base->clk - 1; |
1819 | |
1820 | while (!hlist_empty(head)) { |
1821 | struct timer_list *timer; |
1822 | void (*fn)(struct timer_list *); |
1823 | |
1824 | timer = hlist_entry(head->first, struct timer_list, entry); |
1825 | |
1826 | base->running_timer = timer; |
1827 | detach_timer(timer, true); |
1828 | |
1829 | fn = timer->function; |
1830 | |
1831 | if (WARN_ON_ONCE(!fn)) { |
1832 | /* Should never happen. Emphasis on should! */ |
1833 | base->running_timer = NULL; |
1834 | continue; |
1835 | } |
1836 | |
1837 | if (timer->flags & TIMER_IRQSAFE) { |
1838 | raw_spin_unlock(&base->lock); |
1839 | call_timer_fn(timer, fn, baseclk); |
1840 | raw_spin_lock(&base->lock); |
1841 | base->running_timer = NULL; |
1842 | } else { |
1843 | raw_spin_unlock_irq(&base->lock); |
1844 | call_timer_fn(timer, fn, baseclk); |
1845 | raw_spin_lock_irq(&base->lock); |
1846 | base->running_timer = NULL; |
1847 | timer_sync_wait_running(base); |
1848 | } |
1849 | } |
1850 | } |
1851 | |
1852 | static int collect_expired_timers(struct timer_base *base, |
1853 | struct hlist_head *heads) |
1854 | { |
1855 | unsigned long clk = base->clk = base->next_expiry; |
1856 | struct hlist_head *vec; |
1857 | int i, levels = 0; |
1858 | unsigned int idx; |
1859 | |
1860 | for (i = 0; i < LVL_DEPTH; i++) { |
1861 | idx = (clk & LVL_MASK) + i * LVL_SIZE; |
1862 | |
1863 | if (__test_and_clear_bit(idx, base->pending_map)) { |
1864 | vec = base->vectors + idx; |
1865 | hlist_move_list(vec, heads++); |
1866 | levels++; |
1867 | } |
1868 | /* Is it time to look at the next level? */ |
1869 | if (clk & LVL_CLK_MASK) |
1870 | break; |
1871 | /* Shift clock for the next level granularity */ |
1872 | clk >>= LVL_CLK_SHIFT; |
1873 | } |
1874 | return levels; |
1875 | } |
1876 | |
1877 | /* |
1878 | * Find the next pending bucket of a level. Search from level start (@offset) |
1879 | * + @clk upwards and if nothing there, search from start of the level |
1880 | * (@offset) up to @offset + clk. |
1881 | */ |
1882 | static int next_pending_bucket(struct timer_base *base, unsigned offset, |
1883 | unsigned clk) |
1884 | { |
1885 | unsigned pos, start = offset + clk; |
1886 | unsigned end = offset + LVL_SIZE; |
1887 | |
1888 | pos = find_next_bit(base->pending_map, end, start); |
1889 | if (pos < end) |
1890 | return pos - start; |
1891 | |
1892 | pos = find_next_bit(base->pending_map, start, offset); |
1893 | return pos < start ? pos + LVL_SIZE - start : -1; |
1894 | } |
1895 | |
1896 | /* |
1897 | * Search the first expiring timer in the various clock levels. Caller must |
1898 | * hold base->lock. |
1899 | * |
1900 | * Store next expiry time in base->next_expiry. |
1901 | */ |
1902 | static void next_expiry_recalc(struct timer_base *base) |
1903 | { |
1904 | unsigned long clk, next, adj; |
1905 | unsigned lvl, offset = 0; |
1906 | |
1907 | next = base->clk + NEXT_TIMER_MAX_DELTA; |
1908 | clk = base->clk; |
1909 | for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { |
1910 | int pos = next_pending_bucket(base, offset, clk & LVL_MASK); |
1911 | unsigned long lvl_clk = clk & LVL_CLK_MASK; |
1912 | |
1913 | if (pos >= 0) { |
1914 | unsigned long tmp = clk + (unsigned long) pos; |
1915 | |
1916 | tmp <<= LVL_SHIFT(lvl); |
1917 | if (time_before(tmp, next)) |
1918 | next = tmp; |
1919 | |
1920 | /* |
1921 | * If the next expiration happens before we reach |
1922 | * the next level, no need to check further. |
1923 | */ |
1924 | if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) |
1925 | break; |
1926 | } |
1927 | /* |
1928 | * Clock for the next level. If the current level clock lower |
1929 | * bits are zero, we look at the next level as is. If not, we |
1930 | * need to advance it by one because that's going to be the |
1931 | * next expiring bucket in that level. base->clk is the next |
1932 | * expiring jiffie. So in case of: |
1933 | * |
1934 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
1935 | * 0 0 0 0 0 0 |
1936 | * |
1937 | * we have to look at all levels @index 0. With |
1938 | * |
1939 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
1940 | * 0 0 0 0 0 2 |
1941 | * |
1942 | * LVL0 has the next expiring bucket @index 2. The upper |
1943 | * levels have the next expiring bucket @index 1. |
1944 | * |
1945 | * In case that the propagation wraps the next level the same |
1946 | * rules apply: |
1947 | * |
1948 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
1949 | * 0 0 0 0 F 2 |
1950 | * |
1951 | * So after looking at LVL0 we get: |
1952 | * |
1953 | * LVL5 LVL4 LVL3 LVL2 LVL1 |
1954 | * 0 0 0 1 0 |
1955 | * |
1956 | * So no propagation from LVL1 to LVL2 because that happened |
1957 | * with the add already, but then we need to propagate further |
1958 | * from LVL2 to LVL3. |
1959 | * |
1960 | * So the simple check whether the lower bits of the current |
1961 | * level are 0 or not is sufficient for all cases. |
1962 | */ |
1963 | adj = lvl_clk ? 1 : 0; |
1964 | clk >>= LVL_CLK_SHIFT; |
1965 | clk += adj; |
1966 | } |
1967 | |
1968 | base->next_expiry = next; |
1969 | base->next_expiry_recalc = false; |
1970 | base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); |
1971 | } |
1972 | |
1973 | #ifdef CONFIG_NO_HZ_COMMON |
1974 | /* |
1975 | * Check, if the next hrtimer event is before the next timer wheel |
1976 | * event: |
1977 | */ |
1978 | static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) |
1979 | { |
1980 | u64 nextevt = hrtimer_get_next_event(); |
1981 | |
1982 | /* |
1983 | * If high resolution timers are enabled |
1984 | * hrtimer_get_next_event() returns KTIME_MAX. |
1985 | */ |
1986 | if (expires <= nextevt) |
1987 | return expires; |
1988 | |
1989 | /* |
1990 | * If the next timer is already expired, return the tick base |
1991 | * time so the tick is fired immediately. |
1992 | */ |
1993 | if (nextevt <= basem) |
1994 | return basem; |
1995 | |
1996 | /* |
1997 | * Round up to the next jiffie. High resolution timers are |
1998 | * off, so the hrtimers are expired in the tick and we need to |
1999 | * make sure that this tick really expires the timer to avoid |
2000 | * a ping pong of the nohz stop code. |
2001 | * |
2002 | * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 |
2003 | */ |
2004 | return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; |
2005 | } |
2006 | |
2007 | static unsigned long next_timer_interrupt(struct timer_base *base, |
2008 | unsigned long basej) |
2009 | { |
2010 | if (base->next_expiry_recalc) |
2011 | next_expiry_recalc(base); |
2012 | |
2013 | /* |
2014 | * Move next_expiry for the empty base into the future to prevent an |
2015 | * unnecessary raise of the timer softirq when the next_expiry value |
2016 | * will be reached even if there is no timer pending. |
2017 | * |
2018 | * This update is also required to make timer_base::next_expiry values |
2019 | * easily comparable to find out which base holds the first pending timer. |
2020 | */ |
2021 | if (!base->timers_pending) |
2022 | base->next_expiry = basej + NEXT_TIMER_MAX_DELTA; |
2023 | |
2024 | return base->next_expiry; |
2025 | } |
2026 | |
2027 | static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, |
2028 | struct timer_base *base_local, |
2029 | struct timer_base *base_global, |
2030 | struct timer_events *tevt) |
2031 | { |
2032 | unsigned long nextevt, nextevt_local, nextevt_global; |
2033 | bool local_first; |
2034 | |
2035 | nextevt_local = next_timer_interrupt(base_local, basej); |
2036 | nextevt_global = next_timer_interrupt(base_global, basej); |
2037 | |
2038 | local_first = time_before_eq(nextevt_local, nextevt_global); |
2039 | |
2040 | nextevt = local_first ? nextevt_local : nextevt_global; |
2041 | |
2042 | /* |
2043 | * If @nextevt is at most one tick away, use @nextevt and store |
2044 | * it in the local expiry value. The next global event is irrelevant in |
2045 | * this case and can be left as KTIME_MAX. |
2046 | */ |
2047 | if (time_before_eq(nextevt, basej + 1)) { |
2048 | /* If we missed a tick already, force 0 delta */ |
2049 | if (time_before(nextevt, basej)) |
2050 | nextevt = basej; |
2051 | tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; |
2052 | |
2053 | /* |
2054 | * This is required for the remote check only but it doesn't |
2055 | * hurt, when it is done for both call sites: |
2056 | * |
2057 | * * The remote callers will only take care of the global timers |
2058 | * as local timers will be handled by the CPU itself. When not |
2059 | * updating tevt->global with the already missed first global |
2060 | * timer, it is possible that it will be missed completely. |
2061 | * |
2062 | * * The local callers will ignore the tevt->global anyway, when |
2063 | * nextevt is at most one tick away. |
2064 | */ |
2065 | if (!local_first) |
2066 | tevt->global = tevt->local; |
2067 | return nextevt; |
2068 | } |
2069 | |
2070 | /* |
2071 | * Update tevt.* values: |
2072 | * |
2073 | * If the local queue expires first, then the global event can be |
2074 | * ignored. If the global queue is empty, nothing to do either. |
2075 | */ |
2076 | if (!local_first && base_global->timers_pending) |
2077 | tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; |
2078 | |
2079 | if (base_local->timers_pending) |
2080 | tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; |
2081 | |
2082 | return nextevt; |
2083 | } |
2084 | |
2085 | # ifdef CONFIG_SMP |
2086 | /** |
2087 | * fetch_next_timer_interrupt_remote() - Store next timers into @tevt |
2088 | * @basej: base time jiffies |
2089 | * @basem: base time clock monotonic |
2090 | * @tevt: Pointer to the storage for the expiry values |
2091 | * @cpu: Remote CPU |
2092 | * |
2093 | * Stores the next pending local and global timer expiry values in the |
2094 | * struct pointed to by @tevt. If a queue is empty the corresponding |
2095 | * field is set to KTIME_MAX. If local event expires before global |
2096 | * event, global event is set to KTIME_MAX as well. |
2097 | * |
2098 | * Caller needs to make sure timer base locks are held (use |
2099 | * timer_lock_remote_bases() for this purpose). |
2100 | */ |
2101 | void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, |
2102 | struct timer_events *tevt, |
2103 | unsigned int cpu) |
2104 | { |
2105 | struct timer_base *base_local, *base_global; |
2106 | |
2107 | /* Preset local / global events */ |
2108 | tevt->local = tevt->global = KTIME_MAX; |
2109 | |
2110 | base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); |
2111 | base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); |
2112 | |
2113 | lockdep_assert_held(&base_local->lock); |
2114 | lockdep_assert_held(&base_global->lock); |
2115 | |
2116 | fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); |
2117 | } |
2118 | |
2119 | /** |
2120 | * timer_unlock_remote_bases - unlock timer bases of cpu |
2121 | * @cpu: Remote CPU |
2122 | * |
2123 | * Unlocks the remote timer bases. |
2124 | */ |
2125 | void timer_unlock_remote_bases(unsigned int cpu) |
2126 | __releases(timer_bases[BASE_LOCAL]->lock) |
2127 | __releases(timer_bases[BASE_GLOBAL]->lock) |
2128 | { |
2129 | struct timer_base *base_local, *base_global; |
2130 | |
2131 | base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); |
2132 | base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); |
2133 | |
2134 | raw_spin_unlock(&base_global->lock); |
2135 | raw_spin_unlock(&base_local->lock); |
2136 | } |
2137 | |
2138 | /** |
2139 | * timer_lock_remote_bases - lock timer bases of cpu |
2140 | * @cpu: Remote CPU |
2141 | * |
2142 | * Locks the remote timer bases. |
2143 | */ |
2144 | void timer_lock_remote_bases(unsigned int cpu) |
2145 | __acquires(timer_bases[BASE_LOCAL]->lock) |
2146 | __acquires(timer_bases[BASE_GLOBAL]->lock) |
2147 | { |
2148 | struct timer_base *base_local, *base_global; |
2149 | |
2150 | base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); |
2151 | base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); |
2152 | |
2153 | lockdep_assert_irqs_disabled(); |
2154 | |
2155 | raw_spin_lock(&base_local->lock); |
2156 | raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); |
2157 | } |
2158 | |
2159 | /** |
2160 | * timer_base_is_idle() - Return whether timer base is set idle |
2161 | * |
2162 | * Returns the is_idle value of the local timer base. |
2163 | */ |
2164 | bool timer_base_is_idle(void) |
2165 | { |
2166 | return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); |
2167 | } |
2168 | |
2169 | static void __run_timer_base(struct timer_base *base); |
2170 | |
2171 | /** |
2172 | * timer_expire_remote() - expire global timers of cpu |
2173 | * @cpu: Remote CPU |
2174 | * |
2175 | * Expire timers of global base of remote CPU. |
2176 | */ |
2177 | void timer_expire_remote(unsigned int cpu) |
2178 | { |
2179 | struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); |
2180 | |
2181 | __run_timer_base(base); |
2182 | } |
2183 | |
2184 | static void timer_use_tmigr(unsigned long basej, u64 basem, |
2185 | unsigned long *nextevt, bool *tick_stop_path, |
2186 | bool timer_base_idle, struct timer_events *tevt) |
2187 | { |
2188 | u64 next_tmigr; |
2189 | |
2190 | if (timer_base_idle) |
2191 | next_tmigr = tmigr_cpu_new_timer(tevt->global); |
2192 | else if (tick_stop_path) |
2193 | next_tmigr = tmigr_cpu_deactivate(tevt->global); |
2194 | else |
2195 | next_tmigr = tmigr_quick_check(tevt->global); |
2196 | |
2197 | /* |
2198 | * If the CPU is the last going idle in timer migration hierarchy, make |
2199 | * sure the CPU will wake up in time to handle remote timers. |
2200 | * next_tmigr == KTIME_MAX if other CPUs are still active. |
2201 | */ |
2202 | if (next_tmigr < tevt->local) { |
2203 | u64 tmp; |
2204 | |
2205 | /* If we missed a tick already, force 0 delta */ |
2206 | if (next_tmigr < basem) |
2207 | next_tmigr = basem; |
2208 | |
2209 | tmp = div_u64(next_tmigr - basem, TICK_NSEC); |
2210 | |
2211 | *nextevt = basej + (unsigned long)tmp; |
2212 | tevt->local = next_tmigr; |
2213 | } |
2214 | } |
2215 | # else |
2216 | static void timer_use_tmigr(unsigned long basej, u64 basem, |
2217 | unsigned long *nextevt, bool *tick_stop_path, |
2218 | bool timer_base_idle, struct timer_events *tevt) |
2219 | { |
2220 | /* |
2221 | * Make sure first event is written into tevt->local to not miss a |
2222 | * timer on !SMP systems. |
2223 | */ |
2224 | tevt->local = min_t(u64, tevt->local, tevt->global); |
2225 | } |
2226 | # endif /* CONFIG_SMP */ |
2227 | |
2228 | static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, |
2229 | bool *idle) |
2230 | { |
2231 | struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; |
2232 | struct timer_base *base_local, *base_global; |
2233 | unsigned long nextevt; |
2234 | bool idle_is_possible; |
2235 | |
2236 | /* |
2237 | * When the CPU is offline, the tick is cancelled and nothing is supposed |
2238 | * to try to stop it. |
2239 | */ |
2240 | if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) { |
2241 | if (idle) |
2242 | *idle = true; |
2243 | return tevt.local; |
2244 | } |
2245 | |
2246 | base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); |
2247 | base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); |
2248 | |
2249 | raw_spin_lock(&base_local->lock); |
2250 | raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); |
2251 | |
2252 | nextevt = fetch_next_timer_interrupt(basej, basem, base_local, |
2253 | base_global, &tevt); |
2254 | |
2255 | /* |
2256 | * If the next event is only one jiffie ahead there is no need to call |
2257 | * timer migration hierarchy related functions. The value for the next |
2258 | * global timer in the @tevt struct then equals KTIME_MAX. This is also |
2259 | * true when the timer base is idle. |
2260 | * |
2261 | * The proper timer migration hierarchy function depends on the callsite |
2262 | * and whether timer base is idle or not. @nextevt will be updated when |
2263 | * this CPU needs to handle the first timer migration hierarchy |
2264 | * event. See timer_use_tmigr() for detailed information. |
2265 | */ |
2266 | idle_is_possible = time_after(nextevt, basej + 1); |
2267 | if (idle_is_possible) |
2268 | timer_use_tmigr(basej, basem, &nextevt, idle, |
2269 | base_local->is_idle, &tevt); |
2270 | |
2271 | /* |
2272 | * We have a fresh next event. Check whether we can forward the |
2273 | * base. |
2274 | */ |
2275 | __forward_timer_base(base_local, basej); |
2276 | __forward_timer_base(base_global, basej); |
2277 | |
2278 | /* |
2279 | * Set base->is_idle only when caller is timer_base_try_to_set_idle() |
2280 | */ |
2281 | if (idle) { |
2282 | /* |
2283 | * Bases are idle if the next event is more than a tick |
2284 | * away. Caution: @nextevt could have changed by enqueueing a |
2285 | * global timer into timer migration hierarchy. Therefore a new |
2286 | * check is required here. |
2287 | * |
2288 | * If the base is marked idle then any timer add operation must |
2289 | * forward the base clk itself to keep granularity small. This |
2290 | * idle logic is only maintained for the BASE_LOCAL and |
2291 | * BASE_GLOBAL base, deferrable timers may still see large |
2292 | * granularity skew (by design). |
2293 | */ |
2294 | if (!base_local->is_idle && time_after(nextevt, basej + 1)) { |
2295 | base_local->is_idle = true; |
2296 | /* |
2297 | * Global timers queued locally while running in a task |
2298 | * in nohz_full mode need a self-IPI to kick reprogramming |
2299 | * in IRQ tail. |
2300 | */ |
2301 | if (tick_nohz_full_cpu(base_local->cpu)) |
2302 | base_global->is_idle = true; |
2303 | trace_timer_base_idle(true, base_local->cpu); |
2304 | } |
2305 | *idle = base_local->is_idle; |
2306 | |
2307 | /* |
2308 | * When timer base is not set idle, undo the effect of |
2309 | * tmigr_cpu_deactivate() to prevent inconsistent states - active |
2310 | * timer base but inactive timer migration hierarchy. |
2311 | * |
2312 | * When timer base was already marked idle, nothing will be |
2313 | * changed here. |
2314 | */ |
2315 | if (!base_local->is_idle && idle_is_possible) |
2316 | tmigr_cpu_activate(); |
2317 | } |
2318 | |
2319 | raw_spin_unlock(&base_global->lock); |
2320 | raw_spin_unlock(&base_local->lock); |
2321 | |
2322 | return cmp_next_hrtimer_event(basem, tevt.local); |
2323 | } |
2324 | |
2325 | /** |
2326 | * get_next_timer_interrupt() - return the time (clock mono) of the next timer |
2327 | * @basej: base time jiffies |
2328 | * @basem: base time clock monotonic |
2329 | * |
2330 | * Returns the tick aligned clock monotonic time of the next pending timer or |
2331 | * KTIME_MAX if no timer is pending. If a timer of the global base was queued |
2332 | * into the timer migration hierarchy, the first global timer is not taken into |
2333 | * account. If this was the last CPU of the timer migration hierarchy going |
2334 | * idle, the first global event is taken into account. |
2335 | */ |
2336 | u64 get_next_timer_interrupt(unsigned long basej, u64 basem) |
2337 | { |
2338 | return __get_next_timer_interrupt(basej, basem, NULL); |
2339 | } |
2340 | |
2341 | /** |
2342 | * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases |
2343 | * @basej: base time jiffies |
2344 | * @basem: base time clock monotonic |
2345 | * @idle: pointer to store the value of timer_base->is_idle on return; |
2346 | * *idle contains the information whether the tick was already stopped |
2347 | * |
2348 | * Returns the tick aligned clock monotonic time of the next pending timer or |
2349 | * KTIME_MAX if no timer is pending. When the tick was already stopped, |
2350 | * KTIME_MAX is returned as well. |
2351 | */ |
2352 | u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) |
2353 | { |
2354 | if (*idle) |
2355 | return KTIME_MAX; |
2356 | |
2357 | return __get_next_timer_interrupt(basej, basem, idle); |
2358 | } |
2359 | |
2360 | /** |
2361 | * timer_clear_idle - Clear the idle state of the timer base |
2362 | * |
2363 | * Called with interrupts disabled |
2364 | */ |
2365 | void timer_clear_idle(void) |
2366 | { |
2367 | /* |
2368 | * We do this unlocked. The worst outcome is a remote pinned timer |
2369 | * enqueue sending a pointless IPI, but taking the lock would just |
2370 | * make the window for sending the IPI a few instructions smaller |
2371 | * for the cost of taking the lock in the exit from idle |
2372 | * path. Required for BASE_LOCAL only. |
2373 | */ |
2374 | __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); |
2375 | if (tick_nohz_full_cpu(smp_processor_id())) |
2376 | __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); |
2377 | trace_timer_base_idle(false, smp_processor_id()); |
2378 | |
2379 | /* Activate without holding the timer_base->lock */ |
2380 | tmigr_cpu_activate(); |
2381 | } |
2382 | #endif |
2383 | |
2384 | /** |
2385 | * __run_timers - run all expired timers (if any) on this CPU. |
2386 | * @base: the timer vector to be processed. |
2387 | */ |
2388 | static inline void __run_timers(struct timer_base *base) |
2389 | { |
2390 | struct hlist_head heads[LVL_DEPTH]; |
2391 | int levels; |
2392 | |
2393 | lockdep_assert_held(&base->lock); |
2394 | |
2395 | if (base->running_timer) |
2396 | return; |
2397 | |
2398 | while (time_after_eq(jiffies, base->clk) && |
2399 | time_after_eq(jiffies, base->next_expiry)) { |
2400 | levels = collect_expired_timers(base, heads); |
2401 | /* |
2402 | * The two possible reasons for not finding any expired |
2403 | * timer at this clk are that all matching timers have been |
2404 | * dequeued or no timer has been queued since |
2405 | * base::next_expiry was set to base::clk + |
2406 | * NEXT_TIMER_MAX_DELTA. |
2407 | */ |
2408 | WARN_ON_ONCE(!levels && !base->next_expiry_recalc |
2409 | && base->timers_pending); |
2410 | /* |
2411 | * While executing timers, base->clk is set 1 offset ahead of |
2412 | * jiffies to avoid endless requeuing to current jiffies. |
2413 | */ |
2414 | base->clk++; |
2415 | next_expiry_recalc(base); |
2416 | |
2417 | while (levels--) |
2418 | expire_timers(base, heads + levels); |
2419 | } |
2420 | } |
2421 | |
2422 | static void __run_timer_base(struct timer_base *base) |
2423 | { |
2424 | if (time_before(jiffies, base->next_expiry)) |
2425 | return; |
2426 | |
2427 | timer_base_lock_expiry(base); |
2428 | raw_spin_lock_irq(&base->lock); |
2429 | __run_timers(base); |
2430 | raw_spin_unlock_irq(&base->lock); |
2431 | timer_base_unlock_expiry(base); |
2432 | } |
2433 | |
2434 | static void run_timer_base(int index) |
2435 | { |
2436 | struct timer_base *base = this_cpu_ptr(&timer_bases[index]); |
2437 | |
2438 | __run_timer_base(base); |
2439 | } |
2440 | |
2441 | /* |
2442 | * This function runs timers and the timer-tq in bottom half context. |
2443 | */ |
2444 | static __latent_entropy void run_timer_softirq(struct softirq_action *h) |
2445 | { |
2446 | run_timer_base(BASE_LOCAL); |
2447 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { |
2448 | run_timer_base(BASE_GLOBAL); |
2449 | run_timer_base(BASE_DEF); |
2450 | |
2451 | if (is_timers_nohz_active()) |
2452 | tmigr_handle_remote(); |
2453 | } |
2454 | } |
2455 | |
2456 | /* |
2457 | * Called by the local, per-CPU timer interrupt on SMP. |
2458 | */ |
2459 | static void run_local_timers(void) |
2460 | { |
2461 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); |
2462 | |
2463 | hrtimer_run_queues(); |
2464 | |
2465 | for (int i = 0; i < NR_BASES; i++, base++) { |
2466 | /* Raise the softirq only if required. */ |
2467 | if (time_after_eq(jiffies, base->next_expiry) || |
2468 | (i == BASE_DEF && tmigr_requires_handle_remote())) { |
2469 | raise_softirq(TIMER_SOFTIRQ); |
2470 | return; |
2471 | } |
2472 | } |
2473 | } |
2474 | |
2475 | /* |
2476 | * Called from the timer interrupt handler to charge one tick to the current |
2477 | * process. user_tick is 1 if the tick is user time, 0 for system. |
2478 | */ |
2479 | void update_process_times(int user_tick) |
2480 | { |
2481 | struct task_struct *p = current; |
2482 | |
2483 | /* Note: this timer irq context must be accounted for as well. */ |
2484 | account_process_tick(p, user_tick); |
2485 | run_local_timers(); |
2486 | rcu_sched_clock_irq(user_tick); |
2487 | #ifdef CONFIG_IRQ_WORK |
2488 | if (in_irq()) |
2489 | irq_work_tick(); |
2490 | #endif |
2491 | scheduler_tick(); |
2492 | if (IS_ENABLED(CONFIG_POSIX_TIMERS)) |
2493 | run_posix_cpu_timers(); |
2494 | } |
2495 | |
2496 | /* |
2497 | * Since schedule_timeout()'s timer is defined on the stack, it must store |
2498 | * the target task on the stack as well. |
2499 | */ |
2500 | struct process_timer { |
2501 | struct timer_list timer; |
2502 | struct task_struct *task; |
2503 | }; |
2504 | |
2505 | static void process_timeout(struct timer_list *t) |
2506 | { |
2507 | struct process_timer *timeout = from_timer(timeout, t, timer); |
2508 | |
2509 | wake_up_process(timeout->task); |
2510 | } |
2511 | |
2512 | /** |
2513 | * schedule_timeout - sleep until timeout |
2514 | * @timeout: timeout value in jiffies |
2515 | * |
2516 | * Make the current task sleep until @timeout jiffies have elapsed. |
2517 | * The function behavior depends on the current task state |
2518 | * (see also set_current_state() description): |
2519 | * |
2520 | * %TASK_RUNNING - the scheduler is called, but the task does not sleep |
2521 | * at all. That happens because sched_submit_work() does nothing for |
2522 | * tasks in %TASK_RUNNING state. |
2523 | * |
2524 | * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to |
2525 | * pass before the routine returns unless the current task is explicitly |
2526 | * woken up, (e.g. by wake_up_process()). |
2527 | * |
2528 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is |
2529 | * delivered to the current task or the current task is explicitly woken |
2530 | * up. |
2531 | * |
2532 | * The current task state is guaranteed to be %TASK_RUNNING when this |
2533 | * routine returns. |
2534 | * |
2535 | * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule |
2536 | * the CPU away without a bound on the timeout. In this case the return |
2537 | * value will be %MAX_SCHEDULE_TIMEOUT. |
2538 | * |
2539 | * Returns 0 when the timer has expired otherwise the remaining time in |
2540 | * jiffies will be returned. In all cases the return value is guaranteed |
2541 | * to be non-negative. |
2542 | */ |
2543 | signed long __sched schedule_timeout(signed long timeout) |
2544 | { |
2545 | struct process_timer timer; |
2546 | unsigned long expire; |
2547 | |
2548 | switch (timeout) |
2549 | { |
2550 | case MAX_SCHEDULE_TIMEOUT: |
2551 | /* |
2552 | * These special cases are here purely for the caller's |
2553 | * convenience. Nothing more. We could take |
2554 | * MAX_SCHEDULE_TIMEOUT from one of the negative values, |
2555 | * but I'd like to return a valid offset (>=0) to allow |
2556 | * the caller to do everything it wants with the retval. |
2557 | */ |
2558 | schedule(); |
2559 | goto out; |
2560 | default: |
2561 | /* |
2562 | * Another bit of PARANOID. Note that the retval will be |
2563 | * 0 since no piece of the kernel is supposed to check |
2564 | * for a negative retval of schedule_timeout() (since it |
2565 | * should never happen anyway). You just have the printk() |
2566 | * that will tell you if something has gone wrong and where. |
2567 | */ |
2568 | if (timeout < 0) { |
2569 | printk(KERN_ERR "schedule_timeout: wrong timeout " |
2570 | "value %lx\n", timeout); |
2571 | dump_stack(); |
2572 | __set_current_state(TASK_RUNNING); |
2573 | goto out; |
2574 | } |
2575 | } |
2576 | |
2577 | expire = timeout + jiffies; |
2578 | |
2579 | timer.task = current; |
2580 | timer_setup_on_stack(&timer.timer, process_timeout, 0); |
2581 | __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); |
2582 | schedule(); |
2583 | del_timer_sync(&timer.timer); |
2584 | |
2585 | /* Remove the timer from the object tracker */ |
2586 | destroy_timer_on_stack(&timer.timer); |
2587 | |
2588 | timeout = expire - jiffies; |
2589 | |
2590 | out: |
2591 | return timeout < 0 ? 0 : timeout; |
2592 | } |
2593 | EXPORT_SYMBOL(schedule_timeout); |
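
/*
 * A minimal usage sketch for schedule_timeout() (illustrative only): wait
 * up to one second for an event, honouring the rule that the task state
 * must be set before the call:
 *
 *	signed long remaining;
 *
 *	set_current_state(TASK_INTERRUPTIBLE);
 *	remaining = schedule_timeout(HZ);
 *
 * A return value of 0 means the full second elapsed; a non-zero value is
 * the number of jiffies that were left when the task was woken up early
 * (e.g. by wake_up_process() or a signal).
 */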
2594 | |
2595 | /* |
2596 | * We can use __set_current_state() here because schedule_timeout() calls |
2597 | * schedule() unconditionally. |
2598 | */ |
2599 | signed long __sched schedule_timeout_interruptible(signed long timeout) |
2600 | { |
2601 | __set_current_state(TASK_INTERRUPTIBLE); |
2602 | return schedule_timeout(timeout); |
2603 | } |
2604 | EXPORT_SYMBOL(schedule_timeout_interruptible); |
2605 | |
2606 | signed long __sched schedule_timeout_killable(signed long timeout) |
2607 | { |
2608 | __set_current_state(TASK_KILLABLE); |
2609 | return schedule_timeout(timeout); |
2610 | } |
2611 | EXPORT_SYMBOL(schedule_timeout_killable); |
2612 | |
2613 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) |
2614 | { |
2615 | __set_current_state(TASK_UNINTERRUPTIBLE); |
2616 | return schedule_timeout(timeout); |
2617 | } |
2618 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
2619 | |
2620 | /* |
2621 | * Like schedule_timeout_uninterruptible(), except this task will not contribute |
2622 | * to load average. |
2623 | */ |
2624 | signed long __sched schedule_timeout_idle(signed long timeout) |
2625 | { |
2626 | __set_current_state(TASK_IDLE); |
2627 | return schedule_timeout(timeout); |
2628 | } |
2629 | EXPORT_SYMBOL(schedule_timeout_idle); |
2630 | |
2631 | #ifdef CONFIG_HOTPLUG_CPU |
2632 | static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) |
2633 | { |
2634 | struct timer_list *timer; |
2635 | int cpu = new_base->cpu; |
2636 | |
2637 | while (!hlist_empty(head)) { |
2638 | timer = hlist_entry(head->first, struct timer_list, entry); |
2639 | detach_timer(timer, false); |
2640 | timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; |
2641 | internal_add_timer(new_base, timer); |
2642 | } |
2643 | } |
2644 | |
2645 | int timers_prepare_cpu(unsigned int cpu) |
2646 | { |
2647 | struct timer_base *base; |
2648 | int b; |
2649 | |
2650 | for (b = 0; b < NR_BASES; b++) { |
2651 | base = per_cpu_ptr(&timer_bases[b], cpu); |
2652 | base->clk = jiffies; |
2653 | base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; |
2654 | base->next_expiry_recalc = false; |
2655 | base->timers_pending = false; |
2656 | base->is_idle = false; |
2657 | } |
2658 | return 0; |
2659 | } |
2660 | |
2661 | int timers_dead_cpu(unsigned int cpu) |
2662 | { |
2663 | struct timer_base *old_base; |
2664 | struct timer_base *new_base; |
2665 | int b, i; |
2666 | |
2667 | for (b = 0; b < NR_BASES; b++) { |
2668 | old_base = per_cpu_ptr(&timer_bases[b], cpu); |
2669 | new_base = get_cpu_ptr(&timer_bases[b]); |
2670 | /* |
2671 | * The caller is globally serialized and nobody else |
2672 | * takes two locks at once, so deadlock is not possible. |
2673 | */ |
2674 | raw_spin_lock_irq(&new_base->lock); |
2675 | raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); |
2676 | |
2677 | /* |
2678 | * The current CPU's base clock might be stale. Update it |
2679 | * before moving the timers over. |
2680 | */ |
2681 | forward_timer_base(new_base); |
2682 | |
2683 | WARN_ON_ONCE(old_base->running_timer); |
2684 | old_base->running_timer = NULL; |
2685 | |
2686 | for (i = 0; i < WHEEL_SIZE; i++) |
2687 | migrate_timer_list(new_base, old_base->vectors + i); |
2688 | |
2689 | raw_spin_unlock(&old_base->lock); |
2690 | raw_spin_unlock_irq(&new_base->lock); |
2691 | put_cpu_ptr(&timer_bases); |
2692 | } |
2693 | return 0; |
2694 | } |
2695 | |
2696 | #endif /* CONFIG_HOTPLUG_CPU */ |
2697 | |
2698 | static void __init init_timer_cpu(int cpu) |
2699 | { |
2700 | struct timer_base *base; |
2701 | int i; |
2702 | |
2703 | for (i = 0; i < NR_BASES; i++) { |
2704 | base = per_cpu_ptr(&timer_bases[i], cpu); |
2705 | base->cpu = cpu; |
2706 | raw_spin_lock_init(&base->lock); |
2707 | base->clk = jiffies; |
2708 | base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; |
2709 | timer_base_init_expiry_lock(base); |
2710 | } |
2711 | } |
2712 | |
2713 | static void __init init_timer_cpus(void) |
2714 | { |
2715 | int cpu; |
2716 | |
2717 | for_each_possible_cpu(cpu) |
2718 | init_timer_cpu(cpu); |
2719 | } |
2720 | |
2721 | void __init init_timers(void) |
2722 | { |
2723 | init_timer_cpus(); |
2724 | posix_cputimers_init_work(); |
2725 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); |
2726 | } |
2727 | |
2728 | /** |
2729 | * msleep - sleep safely even with waitqueue interruptions |
2730 | * @msecs: Time in milliseconds to sleep for |
2731 | */ |
2732 | void msleep(unsigned int msecs) |
2733 | { |
2734 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
2735 | |
2736 | while (timeout) |
2737 | timeout = schedule_timeout_uninterruptible(timeout); |
2738 | } |
2739 | |
2740 | EXPORT_SYMBOL(msleep); |
2741 | |
2742 | /** |
2743 | * msleep_interruptible - sleep waiting for signals |
2744 | * @msecs: Time in milliseconds to sleep for |
2745 | */ |
2746 | unsigned long msleep_interruptible(unsigned int msecs) |
2747 | { |
2748 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
2749 | |
2750 | while (timeout && !signal_pending(current)) |
2751 | timeout = schedule_timeout_interruptible(timeout); |
2752 | return jiffies_to_msecs(timeout); |
2753 | } |
2754 | |
2755 | EXPORT_SYMBOL(msleep_interruptible); |
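
/*
 * A minimal usage sketch for msleep_interruptible() (illustrative only;
 * the surrounding retry-loop context is hypothetical): sleep between
 * polling attempts but let a signal abort the wait early:
 *
 *	unsigned long left = msleep_interruptible(500);
 *
 *	if (left)
 *		return -ERESTARTSYS;
 *
 * A non-zero return value is (approximately) the requested sleep time in
 * milliseconds that had not yet elapsed when the signal arrived.
 */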
2756 | |
2757 | /** |
2758 | * usleep_range_state - Sleep for an approximate time in a given state |
2759 | * @min: Minimum time in usecs to sleep |
2760 | * @max: Maximum time in usecs to sleep |
2761 | * @state: State in which the current task will sleep |
2762 | * |
2763 | * In non-atomic context where the exact wakeup time is flexible, use |
2764 | * usleep_range_state() instead of udelay(). The sleep improves responsiveness |
2765 | * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces |
2766 | * power usage by allowing hrtimers to take advantage of an already- |
2767 | * scheduled interrupt instead of scheduling a new one just for this sleep. |
2768 | */ |
2769 | void __sched usleep_range_state(unsigned long min, unsigned long max, |
2770 | unsigned int state) |
2771 | { |
2772 | ktime_t exp = ktime_add_us(ktime_get(), min); |
2773 | u64 delta = (u64)(max - min) * NSEC_PER_USEC; |
2774 | |
2775 | for (;;) { |
2776 | __set_current_state(state); |
2777 | /* Do not return before the requested sleep time has elapsed */ |
2778 | if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) |
2779 | break; |
2780 | } |
2781 | } |
2782 | EXPORT_SYMBOL(usleep_range_state); |
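
/*
 * A minimal usage sketch (illustrative only; the register write and the
 * 100-200us settle time are hypothetical). Most callers use the
 * usleep_range() wrapper from <linux/delay.h>, which invokes this function
 * with TASK_UNINTERRUPTIBLE; the min/max window lets the hrtimer coalesce
 * the wakeup with an already scheduled interrupt:
 *
 *	writel(val, dev->regs + MYDEV_CTRL);
 *	usleep_range(100, 200);
 */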
2783 |