| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Kernel internal timers |
| 4 | * |
| 5 | * Copyright (C) 1991, 1992 Linus Torvalds |
| 6 | * |
| 7 | * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. |
| 8 | * |
| 9 | * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 |
| 10 | * "A Kernel Model for Precision Timekeeping" by Dave Mills |
| 11 | * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to |
| 12 | * serialize accesses to xtime/lost_ticks). |
| 13 | * Copyright (C) 1998 Andrea Arcangeli |
| 14 | * 1999-03-10 Improved NTP compatibility by Ulrich Windl |
| 15 | * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love |
| 16 | * 2000-10-05 Implemented scalable SMP per-CPU timer handling. |
| 17 | * Copyright (C) 2000, 2001, 2002 Ingo Molnar |
| 18 | * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar |
| 19 | */ |
| 20 | |
| 21 | #include <linux/kernel_stat.h> |
| 22 | #include <linux/export.h> |
| 23 | #include <linux/interrupt.h> |
| 24 | #include <linux/percpu.h> |
| 25 | #include <linux/init.h> |
| 26 | #include <linux/mm.h> |
| 27 | #include <linux/swap.h> |
| 28 | #include <linux/pid_namespace.h> |
| 29 | #include <linux/notifier.h> |
| 30 | #include <linux/thread_info.h> |
| 31 | #include <linux/time.h> |
| 32 | #include <linux/jiffies.h> |
| 33 | #include <linux/posix-timers.h> |
| 34 | #include <linux/cpu.h> |
| 35 | #include <linux/syscalls.h> |
| 36 | #include <linux/delay.h> |
| 37 | #include <linux/tick.h> |
| 38 | #include <linux/kallsyms.h> |
| 39 | #include <linux/irq_work.h> |
| 40 | #include <linux/sched/sysctl.h> |
| 41 | #include <linux/sched/nohz.h> |
| 42 | #include <linux/sched/debug.h> |
| 43 | #include <linux/slab.h> |
| 44 | #include <linux/compat.h> |
| 45 | #include <linux/random.h> |
| 46 | #include <linux/sysctl.h> |
| 47 | |
| 48 | #include <linux/uaccess.h> |
| 49 | #include <asm/unistd.h> |
| 50 | #include <asm/div64.h> |
| 51 | #include <asm/timex.h> |
| 52 | #include <asm/io.h> |
| 53 | |
| 54 | #include "tick-internal.h" |
| 55 | #include "timer_migration.h" |
| 56 | |
| 57 | #define CREATE_TRACE_POINTS |
| 58 | #include <trace/events/timer.h> |
| 59 | |
| 60 | __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; |
| 61 | |
| 62 | EXPORT_SYMBOL(jiffies_64); |
| 63 | |
| 64 | /* |
| 65 | * The timer wheel has LVL_DEPTH array levels. Each level provides an array of |
| 66 | * LVL_SIZE buckets. Each level is driven by its own clock and therefore each |
| 67 | * level has a different granularity. |
| 68 | * |
| 69 | * The level granularity is: LVL_CLK_DIV ^ level |
| 70 | * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) |
| 71 | * |
| 72 | * The array level of a newly armed timer depends on the relative expiry |
| 73 | * time. The farther away the expiry time is, the higher the array level and |
| 74 | * therefore the coarser the granularity becomes. |
| 75 | * |
| 76 | * Contrary to the original timer wheel implementation, which aims for 'exact' |
| 77 | * expiry of the timers, this implementation removes the need for recascading |
| 78 | * the timers into the lower array levels. The previous 'classic' timer wheel |
| 79 | * implementation of the kernel already violated the 'exact' expiry by adding |
| 80 | * slack to the expiry time to provide batched expiration. The granularity |
| 81 | * levels provide implicit batching. |
| 82 | * |
| 83 | * This is an optimization of the original timer wheel implementation for the |
| 84 | * majority of the timer wheel use cases: timeouts. The vast majority of |
| 85 | * timeout timers (networking, disk I/O ...) are canceled before expiry. If |
| 86 | * the timeout expires it indicates that normal operation is disturbed, so it |
| 87 | * does not matter much whether the timeout comes with a slight delay. |
| 88 | * |
| 89 | * The only exception to this are networking timers with a small expiry |
| 90 | * time. They rely on the granularity. Those fit into the first wheel level, |
| 91 | * which has HZ granularity. |
| 92 | * |
| 93 | * We don't have cascading anymore. Timers with an expiry time above the |
| 94 | * capacity of the last wheel level are force expired at the maximum timeout |
| 95 | * value of the last wheel level. From data sampling we know that the maximum |
| 96 | * value observed is 5 days (network connection tracking), so this should not |
| 97 | * be an issue. |
| 98 | * |
| 99 | * The currently chosen array constants are a good compromise between |
| 100 | * array size and granularity. |
| 101 | * |
| 102 | * This results in the following granularity and range levels: |
| 103 | * |
| 104 | * HZ 1000 steps |
| 105 | * Level Offset Granularity Range |
| 106 | * 0 0 1 ms 0 ms - 63 ms |
| 107 | * 1 64 8 ms 64 ms - 511 ms |
| 108 | * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) |
| 109 | * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) |
| 110 | * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) |
| 111 | * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) |
| 112 | * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) |
| 113 | * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) |
| 114 | * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) |
| 115 | * |
| 116 | * HZ 300 |
| 117 | * Level Offset Granularity Range |
| 118 | * 0 0 3 ms 0 ms - 210 ms |
| 119 | * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) |
| 120 | * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) |
| 121 | * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) |
| 122 | * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) |
| 123 | * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) |
| 124 | * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) |
| 125 | * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) |
| 126 | * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) |
| 127 | * |
| 128 | * HZ 250 |
| 129 | * Level Offset Granularity Range |
| 130 | * 0 0 4 ms 0 ms - 255 ms |
| 131 | * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) |
| 132 | * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) |
| 133 | * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) |
| 134 | * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) |
| 135 | * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) |
| 136 | * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) |
| 137 | * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) |
| 138 | * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) |
| 139 | * |
| 140 | * HZ 100 |
| 141 | * Level Offset Granularity Range |
| 142 | * 0 0 10 ms 0 ms - 630 ms |
| 143 | * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) |
| 144 | * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) |
| 145 | * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) |
| 146 | * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) |
| 147 | * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) |
| 148 | * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) |
| 149 | * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) |
| 150 | */ |
| 151 | |
| 152 | /* Clock divisor for the next level */ |
| 153 | #define LVL_CLK_SHIFT 3 |
| 154 | #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) |
| 155 | #define LVL_CLK_MASK (LVL_CLK_DIV - 1) |
| 156 | #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) |
| 157 | #define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) |
| 158 | |
| 159 | /* |
| 160 | * The time start value for each level to select the bucket at enqueue |
| 161 | * time. We start from the last possible delta of the previous level |
| 162 | * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). |
| 163 | */ |
| 164 | #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) |
| 165 | |
| 166 | /* Size of each clock level */ |
| 167 | #define LVL_BITS 6 |
| 168 | #define LVL_SIZE (1UL << LVL_BITS) |
| 169 | #define LVL_MASK (LVL_SIZE - 1) |
| 170 | #define LVL_OFFS(n) ((n) * LVL_SIZE) |
| 171 | |
| 172 | /* Level depth */ |
| 173 | #if HZ > 100 |
| 174 | # define LVL_DEPTH 9 |
| 175 | # else |
| 176 | # define LVL_DEPTH 8 |
| 177 | #endif |
| 178 | |
| 179 | /* The cutoff (max. capacity of the wheel) */ |
| 180 | #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) |
| 181 | #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) |
| 182 | |
| 183 | /* |
| 184 | * The resulting wheel size. If NOHZ is configured, separate wheels are |
| 185 | * allocated so that the deferrable timers get their own storage. |
| 186 | */ |
| 187 | #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) |
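| | /* |
| | * Worked numbers (editorial sketch, derived from the macros above for |
| | * HZ=1000, LVL_BITS=6, LVL_CLK_SHIFT=3, LVL_DEPTH=9): |
| | * |
| | * LVL_GRAN(8) = 1 << 24 = 16777216 jiffies (~4.7h) |
| | * WHEEL_TIMEOUT_CUTOFF = LVL_START(9) = 63 << 24 = 1056964608 jiffies (~12d) |
| | * WHEEL_TIMEOUT_MAX = 1056964608 - 16777216 = 1040187392 jiffies (~12d) |
| | * WHEEL_SIZE = 64 * 9 = 576 buckets per timer_base |
| | */ |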
| 188 | |
| 189 | #ifdef CONFIG_NO_HZ_COMMON |
| 190 | /* |
| 191 | * If multiple bases need to be locked, use the base ordering for lock |
| 192 | * nesting, i.e. lowest number first. |
| 193 | */ |
| 194 | # define NR_BASES 3 |
| 195 | # define BASE_LOCAL 0 |
| 196 | # define BASE_GLOBAL 1 |
| 197 | # define BASE_DEF 2 |
| 198 | #else |
| 199 | # define NR_BASES 1 |
| 200 | # define BASE_LOCAL 0 |
| 201 | # define BASE_GLOBAL 0 |
| 202 | # define BASE_DEF 0 |
| 203 | #endif |
| 204 | |
| 205 | /** |
| 206 | * struct timer_base - Per CPU timer base (number of bases depends on config) |
| 207 | * @lock: Lock protecting the timer_base |
| 208 | * @running_timer: When expiring timers, the lock is dropped. To make |
| 209 | * sure not to race against deleting/modifying a |
| 210 | * currently running timer, the pointer is set to the |
| 211 | * timer which is currently expiring. If no timer is |
| 212 | * running, the pointer is NULL. |
| 213 | * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around |
| 214 | * timer expiry callback execution and when trying to |
| 215 | * delete a running timer that could not be stopped on |
| 216 | * the first attempt. It prevents priority inversion |
| 217 | * when the callback was preempted on a remote CPU and |
| 218 | * a caller tries to delete the running timer. It also |
| 219 | * prevents a live lock, when the task which tries to |
| 220 | * delete a timer preempted the softirq thread which |
| 221 | * is running the timer callback function. |
| 222 | * @timer_waiters: PREEMPT_RT only: Tells whether there is a waiter |
| 223 | * waiting for the end of the timer callback function |
| 224 | * execution. |
| 225 | * @clk: clock of the timer base; is updated before enqueue |
| 226 | * of a timer; during expiry, it is 1 offset ahead of |
| 227 | * jiffies to avoid endless requeuing to current |
| 228 | * jiffies |
| 229 | * @next_expiry: expiry value of the first timer; it is updated when |
| 230 | * finding the next timer and during enqueue; the |
| 231 | * value is not valid, when next_expiry_recalc is set |
| 232 | * @cpu: Number of CPU the timer base belongs to |
| 233 | * @next_expiry_recalc: States whether a recalculation of next_expiry is |
| 234 | * required. It is set to true when a timer was |
| 235 | * deleted. |
| 236 | * @is_idle: Is set when the timer_base is idle. It is triggered by NOHZ |
| 237 | * code. This state is only used in the standard |
| 238 | * bases. Deferrable timers, which are enqueued remotely, |
| 239 | * never wake up an idle CPU, so there is no need to |
| 240 | * support it for the deferrable base. |
| 241 | * @timers_pending: Is set when a timer is pending in the base. It is only |
| 242 | * reliable when next_expiry_recalc is not set. |
| 243 | * @pending_map: bitmap of the timer wheel; each bit reflects a |
| 244 | * bucket of the wheel. When a bit is set, at least a |
| 245 | * single timer is enqueued in the related bucket. |
| 246 | * @vectors: Array of lists; Each array member reflects a bucket |
| 247 | * of the timer wheel. The list contains all timers |
| 248 | * which are enqueued into a specific bucket. |
| 249 | */ |
| 250 | struct timer_base { |
| 251 | raw_spinlock_t lock; |
| 252 | struct timer_list *running_timer; |
| 253 | #ifdef CONFIG_PREEMPT_RT |
| 254 | spinlock_t expiry_lock; |
| 255 | atomic_t timer_waiters; |
| 256 | #endif |
| 257 | unsigned long clk; |
| 258 | unsigned long next_expiry; |
| 259 | unsigned int cpu; |
| 260 | bool next_expiry_recalc; |
| 261 | bool is_idle; |
| 262 | bool timers_pending; |
| 263 | DECLARE_BITMAP(pending_map, WHEEL_SIZE); |
| 264 | struct hlist_head vectors[WHEEL_SIZE]; |
| 265 | } ____cacheline_aligned; |
| 266 | |
| 267 | static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); |
| 268 | |
| 269 | #ifdef CONFIG_NO_HZ_COMMON |
| 270 | |
| 271 | static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); |
| 272 | static DEFINE_MUTEX(timer_keys_mutex); |
| 273 | |
| 274 | static void timer_update_keys(struct work_struct *work); |
| 275 | static DECLARE_WORK(timer_update_work, timer_update_keys); |
| 276 | |
| 277 | #ifdef CONFIG_SMP |
| 278 | static unsigned int sysctl_timer_migration = 1; |
| 279 | |
| 280 | DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); |
| 281 | |
| 282 | static void timers_update_migration(void) |
| 283 | { |
| 284 | if (sysctl_timer_migration && tick_nohz_active) |
| 285 | static_branch_enable(&timers_migration_enabled); |
| 286 | else |
| 287 | static_branch_disable(&timers_migration_enabled); |
| 288 | } |
| 289 | |
| 290 | #ifdef CONFIG_SYSCTL |
| 291 | static int timer_migration_handler(const struct ctl_table *table, int write, |
| 292 | void *buffer, size_t *lenp, loff_t *ppos) |
| 293 | { |
| 294 | int ret; |
| 295 | |
| 296 | mutex_lock(&timer_keys_mutex); |
| 297 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 298 | if (!ret && write) |
| 299 | timers_update_migration(); |
| 300 | mutex_unlock(&timer_keys_mutex); |
| 301 | return ret; |
| 302 | } |
| 303 | |
| 304 | static const struct ctl_table timer_sysctl[] = { |
| 305 | { |
| 306 | .procname = "timer_migration" , |
| 307 | .data = &sysctl_timer_migration, |
| 308 | .maxlen = sizeof(unsigned int), |
| 309 | .mode = 0644, |
| 310 | .proc_handler = timer_migration_handler, |
| 311 | .extra1 = SYSCTL_ZERO, |
| 312 | .extra2 = SYSCTL_ONE, |
| 313 | }, |
| 314 | }; |
| 315 | |
| 316 | static int __init timer_sysctl_init(void) |
| 317 | { |
| 318 | register_sysctl("kernel" , timer_sysctl); |
| 319 | return 0; |
| 320 | } |
| 321 | device_initcall(timer_sysctl_init); |
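| | /* |
| | * Editorial note: the table above exposes the knob as |
| | * /proc/sys/kernel/timer_migration (valid values 0 and 1). Writes go |
| | * through timer_migration_handler(), which updates the |
| | * timers_migration_enabled static key under timer_keys_mutex. |
| | */ |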
| 322 | #endif /* CONFIG_SYSCTL */ |
| 323 | #else /* CONFIG_SMP */ |
| 324 | static inline void timers_update_migration(void) { } |
| 325 | #endif /* !CONFIG_SMP */ |
| 326 | |
| 327 | static void timer_update_keys(struct work_struct *work) |
| 328 | { |
| 329 | mutex_lock(&timer_keys_mutex); |
| 330 | timers_update_migration(); |
| 331 | static_branch_enable(&timers_nohz_active); |
| 332 | mutex_unlock(&timer_keys_mutex); |
| 333 | } |
| 334 | |
| 335 | void timers_update_nohz(void) |
| 336 | { |
| 337 | schedule_work(&timer_update_work); |
| 338 | } |
| 339 | |
| 340 | static inline bool is_timers_nohz_active(void) |
| 341 | { |
| 342 | return static_branch_unlikely(&timers_nohz_active); |
| 343 | } |
| 344 | #else |
| 345 | static inline bool is_timers_nohz_active(void) { return false; } |
| 346 | #endif /* NO_HZ_COMMON */ |
| 347 | |
| 348 | static unsigned long round_jiffies_common(unsigned long j, int cpu, |
| 349 | bool force_up) |
| 350 | { |
| 351 | int rem; |
| 352 | unsigned long original = j; |
| 353 | |
| 354 | /* |
| 355 | * We don't want all cpus firing their timers at once hitting the |
| 356 | * same lock or cachelines, so we skew each extra cpu with an extra |
| 357 | * 3 jiffies. This 3 jiffies came originally from the mm/ code which |
| 358 | * already did this. |
| 359 | * The skew is done by adding 3*cpunr, then round, then subtract this |
| 360 | * extra offset again. |
| 361 | */ |
| 362 | j += cpu * 3; |
| 363 | |
| 364 | rem = j % HZ; |
| 365 | |
| 366 | /* |
| 367 | * If the target jiffy is just after a whole second (which can happen |
| 368 | * due to delays of the timer irq, long irq off times etc etc) then |
| 369 | * we should round down to the whole second, not up. Use 1/4th second |
| 370 | * as cutoff for this rounding as an extreme upper bound for this. |
| 371 | * But never round down if @force_up is set. |
| 372 | */ |
| 373 | if (rem < HZ/4 && !force_up) /* round down */ |
| 374 | j = j - rem; |
| 375 | else /* round up */ |
| 376 | j = j - rem + HZ; |
| 377 | |
| 378 | /* now that we have rounded, subtract the extra skew again */ |
| 379 | j -= cpu * 3; |
| 380 | |
| 381 | /* |
| 382 | * Make sure j is still in the future. Otherwise return the |
| 383 | * unmodified value. |
| 384 | */ |
| 385 | return time_is_after_jiffies(j) ? j : original; |
| 386 | } |
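| | /* |
| | * Worked example (editorial; assumes HZ=100 and that 'base' below is a |
| | * whole-second boundary in jiffies): for cpu=2 the skew is 6 jiffies. |
| | * |
| | * j = base + 90: skewed to base + 96, rem = 96 >= HZ/4, rounded up to |
| | * base + 100, skew removed -> base + 94. |
| | * j = base + 110: skewed to base + 116, rem = 16 < HZ/4, rounded down to |
| | * base + 100, skew removed -> base + 94. |
| | * |
| | * Both requests land on the same per-CPU skewed second boundary, which is |
| | * exactly the batching effect this helper is after. |
| | */ |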
| 387 | |
| 388 | /** |
| 389 | * __round_jiffies_relative - function to round jiffies to a full second |
| 390 | * @j: the time in (relative) jiffies that should be rounded |
| 391 | * @cpu: the processor number on which the timeout will happen |
| 392 | * |
| 393 | * __round_jiffies_relative() rounds a time delta in the future (in jiffies) |
| 394 | * up or down to (approximately) full seconds. This is useful for timers |
| 395 | * for which the exact time they fire does not matter too much, as long as |
| 396 | * they fire approximately every X seconds. |
| 397 | * |
| 398 | * By rounding these timers to whole seconds, all such timers will fire |
| 399 | * at the same time, rather than at various times spread out. The goal |
| 400 | * of this is to have the CPU wake up less, which saves power. |
| 401 | * |
| 402 | * The exact rounding is skewed for each processor to avoid all |
| 403 | * processors firing at the exact same time, which could lead |
| 404 | * to lock contention or spurious cache line bouncing. |
| 405 | * |
| 406 | * The return value is the rounded version of the @j parameter. |
| 407 | */ |
| 408 | unsigned long __round_jiffies_relative(unsigned long j, int cpu) |
| 409 | { |
| 410 | unsigned long j0 = jiffies; |
| 411 | |
| 412 | /* Use j0 because jiffies might change while we run */ |
| 413 | return round_jiffies_common(j + j0, cpu, false) - j0; |
| 414 | } |
| 415 | EXPORT_SYMBOL_GPL(__round_jiffies_relative); |
| 416 | |
| 417 | /** |
| 418 | * round_jiffies - function to round jiffies to a full second |
| 419 | * @j: the time in (absolute) jiffies that should be rounded |
| 420 | * |
| 421 | * round_jiffies() rounds an absolute time in the future (in jiffies) |
| 422 | * up or down to (approximately) full seconds. This is useful for timers |
| 423 | * for which the exact time they fire does not matter too much, as long as |
| 424 | * they fire approximately every X seconds. |
| 425 | * |
| 426 | * By rounding these timers to whole seconds, all such timers will fire |
| 427 | * at the same time, rather than at various times spread out. The goal |
| 428 | * of this is to have the CPU wake up less, which saves power. |
| 429 | * |
| 430 | * The return value is the rounded version of the @j parameter. |
| 431 | */ |
| 432 | unsigned long round_jiffies(unsigned long j) |
| 433 | { |
| 434 | return round_jiffies_common(j, raw_smp_processor_id(), false); |
| 435 | } |
| 436 | EXPORT_SYMBOL_GPL(round_jiffies); |
| 437 | |
| 438 | /** |
| 439 | * round_jiffies_relative - function to round jiffies to a full second |
| 440 | * @j: the time in (relative) jiffies that should be rounded |
| 441 | * |
| 442 | * round_jiffies_relative() rounds a time delta in the future (in jiffies) |
| 443 | * up or down to (approximately) full seconds. This is useful for timers |
| 444 | * for which the exact time they fire does not matter too much, as long as |
| 445 | * they fire approximately every X seconds. |
| 446 | * |
| 447 | * By rounding these timers to whole seconds, all such timers will fire |
| 448 | * at the same time, rather than at various times spread out. The goal |
| 449 | * of this is to have the CPU wake up less, which saves power. |
| 450 | * |
| 451 | * The return value is the rounded version of the @j parameter. |
| 452 | */ |
| 453 | unsigned long round_jiffies_relative(unsigned long j) |
| 454 | { |
| 455 | return __round_jiffies_relative(j, raw_smp_processor_id()); |
| 456 | } |
| 457 | EXPORT_SYMBOL_GPL(round_jiffies_relative); |
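| | /* |
| | * Minimal usage sketch (editorial; 'gc_timer' and the 5 second period are |
| | * made up): batch a non-critical periodic timer onto second boundaries so |
| | * idle CPUs wake up less often. |
| | * |
| | * mod_timer(&gc_timer, jiffies + round_jiffies_relative(5 * HZ)); |
| | * |
| | * The absolute-time equivalent is round_jiffies(jiffies + 5 * HZ). |
| | */ |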
| 458 | |
| 459 | /** |
| 460 | * __round_jiffies_up_relative - function to round jiffies up to a full second |
| 461 | * @j: the time in (relative) jiffies that should be rounded |
| 462 | * @cpu: the processor number on which the timeout will happen |
| 463 | * |
| 464 | * This is the same as __round_jiffies_relative() except that it will never |
| 465 | * round down. This is useful for timeouts for which the exact time |
| 466 | * of firing does not matter too much, as long as they don't fire too |
| 467 | * early. |
| 468 | */ |
| 469 | unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) |
| 470 | { |
| 471 | unsigned long j0 = jiffies; |
| 472 | |
| 473 | /* Use j0 because jiffies might change while we run */ |
| 474 | return round_jiffies_common(j + j0, cpu, true) - j0; |
| 475 | } |
| 476 | EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); |
| 477 | |
| 478 | /** |
| 479 | * round_jiffies_up - function to round jiffies up to a full second |
| 480 | * @j: the time in (absolute) jiffies that should be rounded |
| 481 | * |
| 482 | * This is the same as round_jiffies() except that it will never |
| 483 | * round down. This is useful for timeouts for which the exact time |
| 484 | * of firing does not matter too much, as long as they don't fire too |
| 485 | * early. |
| 486 | */ |
| 487 | unsigned long round_jiffies_up(unsigned long j) |
| 488 | { |
| 489 | return round_jiffies_common(j, raw_smp_processor_id(), true); |
| 490 | } |
| 491 | EXPORT_SYMBOL_GPL(round_jiffies_up); |
| 492 | |
| 493 | /** |
| 494 | * round_jiffies_up_relative - function to round jiffies up to a full second |
| 495 | * @j: the time in (relative) jiffies that should be rounded |
| 496 | * |
| 497 | * This is the same as round_jiffies_relative() except that it will never |
| 498 | * round down. This is useful for timeouts for which the exact time |
| 499 | * of firing does not matter too much, as long as they don't fire too |
| 500 | * early. |
| 501 | */ |
| 502 | unsigned long round_jiffies_up_relative(unsigned long j) |
| 503 | { |
| 504 | return __round_jiffies_up_relative(j, raw_smp_processor_id()); |
| 505 | } |
| 506 | EXPORT_SYMBOL_GPL(round_jiffies_up_relative); |
| 507 | |
| 508 | |
| 509 | static inline unsigned int timer_get_idx(struct timer_list *timer) |
| 510 | { |
| 511 | return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; |
| 512 | } |
| 513 | |
| 514 | static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) |
| 515 | { |
| 516 | timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | |
| 517 | idx << TIMER_ARRAYSHIFT; |
| 518 | } |
| 519 | |
| 520 | /* |
| 521 | * Helper function to calculate the array index for a given expiry |
| 522 | * time. |
| 523 | */ |
| 524 | static inline unsigned calc_index(unsigned long expires, unsigned lvl, |
| 525 | unsigned long *bucket_expiry) |
| 526 | { |
| 527 | |
| 528 | /* |
| 529 | * The timer wheel has to guarantee that a timer does not fire |
| 530 | * early. Early expiry can happen due to: |
| 531 | * - Timer is armed at the edge of a tick |
| 532 | * - Truncation of the expiry time in the outer wheel levels |
| 533 | * |
| 534 | * Round up with level granularity to prevent this. |
| 535 | */ |
| 536 | expires = (expires >> LVL_SHIFT(lvl)) + 1; |
| 537 | *bucket_expiry = expires << LVL_SHIFT(lvl); |
| 538 | return LVL_OFFS(lvl) + (expires & LVL_MASK); |
| 539 | } |
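| | /* |
| | * Worked example (editorial): expires = 200, lvl = 1 (LVL_SHIFT(1) = 3): |
| | * (200 >> 3) + 1 = 26, so *bucket_expiry = 26 << 3 = 208 and the returned |
| | * index is LVL_OFFS(1) + (26 & LVL_MASK) = 64 + 26 = 90. The rounding is |
| | * always up, so the timer can never fire before 'expires'. |
| | */ |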
| 540 | |
| 541 | static int calc_wheel_index(unsigned long expires, unsigned long clk, |
| 542 | unsigned long *bucket_expiry) |
| 543 | { |
| 544 | unsigned long delta = expires - clk; |
| 545 | unsigned int idx; |
| 546 | |
| 547 | if (delta < LVL_START(1)) { |
| 548 | idx = calc_index(expires, 0, bucket_expiry); |
| 549 | } else if (delta < LVL_START(2)) { |
| 550 | idx = calc_index(expires, 1, bucket_expiry); |
| 551 | } else if (delta < LVL_START(3)) { |
| 552 | idx = calc_index(expires, 2, bucket_expiry); |
| 553 | } else if (delta < LVL_START(4)) { |
| 554 | idx = calc_index(expires, 3, bucket_expiry); |
| 555 | } else if (delta < LVL_START(5)) { |
| 556 | idx = calc_index(expires, 4, bucket_expiry); |
| 557 | } else if (delta < LVL_START(6)) { |
| 558 | idx = calc_index(expires, 5, bucket_expiry); |
| 559 | } else if (delta < LVL_START(7)) { |
| 560 | idx = calc_index(expires, 6, bucket_expiry); |
| 561 | } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { |
| 562 | idx = calc_index(expires, 7, bucket_expiry); |
| 563 | } else if ((long) delta < 0) { |
| 564 | idx = clk & LVL_MASK; |
| 565 | *bucket_expiry = clk; |
| 566 | } else { |
| 567 | /* |
| 568 | * Force expire obscenely large timeouts to expire at the |
| 569 | * capacity limit of the wheel. |
| 570 | */ |
| 571 | if (delta >= WHEEL_TIMEOUT_CUTOFF) |
| 572 | expires = clk + WHEEL_TIMEOUT_MAX; |
| 573 | |
| 574 | idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); |
| 575 | } |
| 576 | return idx; |
| 577 | } |
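| | /* |
| | * Level selection example (editorial, values derived from the macros): a |
| | * delta of 5000 jiffies lies between LVL_START(3) = 4032 and |
| | * LVL_START(4) = 32256, so the timer is queued at level 3 with |
| | * LVL_GRAN(3) = 512 jiffies granularity. |
| | */ |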
| 578 | |
| 579 | static void |
| 580 | trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) |
| 581 | { |
| 582 | /* |
| 583 | * Deferrable timers do not prevent the CPU from entering dynticks and |
| 584 | * are not taken into account on the idle/nohz_full path. An IPI when a |
| 585 | * new deferrable timer is enqueued will wake up the remote CPU but |
| 586 | * nothing will be done with the deferrable timer base. Therefore skip |
| 587 | * the remote IPI for deferrable timers completely. |
| 588 | */ |
| 589 | if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) |
| 590 | return; |
| 591 | |
| 592 | /* |
| 593 | * We might have to IPI the remote CPU if the base is idle and the |
| 594 | * timer is pinned. A non-pinned timer is only queued on the remote |
| 595 | * CPU when the timer was running during queueing. Then everything |
| 596 | * is handled by the remote CPU anyway. If the other CPU is |
| 597 | * on the way to idle then it can't set base->is_idle as we hold |
| 598 | * the base lock: |
| 599 | */ |
| 600 | if (base->is_idle) { |
| 601 | WARN_ON_ONCE(!(timer->flags & TIMER_PINNED || |
| 602 | tick_nohz_full_cpu(base->cpu))); |
| 603 | wake_up_nohz_cpu(base->cpu); |
| 604 | } |
| 605 | } |
| 606 | |
| 607 | /* |
| 608 | * Enqueue the timer into the hash bucket, mark it pending in |
| 609 | * the bitmap, store the index in the timer flags then wake up |
| 610 | * the target CPU if needed. |
| 611 | */ |
| 612 | static void enqueue_timer(struct timer_base *base, struct timer_list *timer, |
| 613 | unsigned int idx, unsigned long bucket_expiry) |
| 614 | { |
| 615 | |
| 616 | hlist_add_head(&timer->entry, base->vectors + idx); |
| 617 | __set_bit(idx, base->pending_map); |
| 618 | timer_set_idx(timer, idx); |
| 619 | |
| 620 | trace_timer_start(timer, bucket_expiry); |
| 621 | |
| 622 | /* |
| 623 | * Check whether this is the new first expiring timer. The |
| 624 | * effective expiry time of the timer is required here |
| 625 | * (bucket_expiry) instead of timer->expires. |
| 626 | */ |
| 627 | if (time_before(bucket_expiry, base->next_expiry)) { |
| 628 | /* |
| 629 | * Set the next expiry time and kick the CPU so it |
| 630 | * can reevaluate the wheel: |
| 631 | */ |
| 632 | WRITE_ONCE(base->next_expiry, bucket_expiry); |
| 633 | base->timers_pending = true; |
| 634 | base->next_expiry_recalc = false; |
| 635 | trigger_dyntick_cpu(base, timer); |
| 636 | } |
| 637 | } |
| 638 | |
| 639 | static void internal_add_timer(struct timer_base *base, struct timer_list *timer) |
| 640 | { |
| 641 | unsigned long bucket_expiry; |
| 642 | unsigned int idx; |
| 643 | |
| 644 | idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); |
| 645 | enqueue_timer(base, timer, idx, bucket_expiry); |
| 646 | } |
| 647 | |
| 648 | #ifdef CONFIG_DEBUG_OBJECTS_TIMERS |
| 649 | |
| 650 | static const struct debug_obj_descr timer_debug_descr; |
| 651 | |
| 652 | struct timer_hint { |
| 653 | void (*function)(struct timer_list *t); |
| 654 | long offset; |
| 655 | }; |
| 656 | |
| 657 | #define TIMER_HINT(fn, container, timr, hintfn) \ |
| 658 | { \ |
| 659 | .function = fn, \ |
| 660 | .offset = offsetof(container, hintfn) - \ |
| 661 | offsetof(container, timr) \ |
| 662 | } |
| 663 | |
| 664 | static const struct timer_hint timer_hints[] = { |
| 665 | TIMER_HINT(delayed_work_timer_fn, |
| 666 | struct delayed_work, timer, work.func), |
| 667 | TIMER_HINT(kthread_delayed_work_timer_fn, |
| 668 | struct kthread_delayed_work, timer, work.func), |
| 669 | }; |
| 670 | |
| 671 | static void *timer_debug_hint(void *addr) |
| 672 | { |
| 673 | struct timer_list *timer = addr; |
| 674 | int i; |
| 675 | |
| 676 | for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { |
| 677 | if (timer_hints[i].function == timer->function) { |
| 678 | void (**fn)(void) = addr + timer_hints[i].offset; |
| 679 | |
| 680 | return *fn; |
| 681 | } |
| 682 | } |
| 683 | |
| 684 | return timer->function; |
| 685 | } |
| 686 | |
| 687 | static bool timer_is_static_object(void *addr) |
| 688 | { |
| 689 | struct timer_list *timer = addr; |
| 690 | |
| 691 | return (timer->entry.pprev == NULL && |
| 692 | timer->entry.next == TIMER_ENTRY_STATIC); |
| 693 | } |
| 694 | |
| 695 | /* |
| 696 | * timer_fixup_init is called when: |
| 697 | * - an active object is initialized |
| 698 | */ |
| 699 | static bool timer_fixup_init(void *addr, enum debug_obj_state state) |
| 700 | { |
| 701 | struct timer_list *timer = addr; |
| 702 | |
| 703 | switch (state) { |
| 704 | case ODEBUG_STATE_ACTIVE: |
| 705 | timer_delete_sync(timer); |
| 706 | debug_object_init(timer, &timer_debug_descr); |
| 707 | return true; |
| 708 | default: |
| 709 | return false; |
| 710 | } |
| 711 | } |
| 712 | |
| 713 | /* Stub timer callback for improperly used timers. */ |
| 714 | static void stub_timer(struct timer_list *unused) |
| 715 | { |
| 716 | WARN_ON(1); |
| 717 | } |
| 718 | |
| 719 | /* |
| 720 | * timer_fixup_activate is called when: |
| 721 | * - an active object is activated |
| 722 | * - an unknown non-static object is activated |
| 723 | */ |
| 724 | static bool timer_fixup_activate(void *addr, enum debug_obj_state state) |
| 725 | { |
| 726 | struct timer_list *timer = addr; |
| 727 | |
| 728 | switch (state) { |
| 729 | case ODEBUG_STATE_NOTAVAILABLE: |
| 730 | timer_setup(timer, stub_timer, 0); |
| 731 | return true; |
| 732 | |
| 733 | case ODEBUG_STATE_ACTIVE: |
| 734 | WARN_ON(1); |
| 735 | fallthrough; |
| 736 | default: |
| 737 | return false; |
| 738 | } |
| 739 | } |
| 740 | |
| 741 | /* |
| 742 | * timer_fixup_free is called when: |
| 743 | * - an active object is freed |
| 744 | */ |
| 745 | static bool timer_fixup_free(void *addr, enum debug_obj_state state) |
| 746 | { |
| 747 | struct timer_list *timer = addr; |
| 748 | |
| 749 | switch (state) { |
| 750 | case ODEBUG_STATE_ACTIVE: |
| 751 | timer_delete_sync(timer); |
| 752 | debug_object_free(timer, &timer_debug_descr); |
| 753 | return true; |
| 754 | default: |
| 755 | return false; |
| 756 | } |
| 757 | } |
| 758 | |
| 759 | /* |
| 760 | * timer_fixup_assert_init is called when: |
| 761 | * - an untracked/uninit-ed object is found |
| 762 | */ |
| 763 | static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) |
| 764 | { |
| 765 | struct timer_list *timer = addr; |
| 766 | |
| 767 | switch (state) { |
| 768 | case ODEBUG_STATE_NOTAVAILABLE: |
| 769 | timer_setup(timer, stub_timer, 0); |
| 770 | return true; |
| 771 | default: |
| 772 | return false; |
| 773 | } |
| 774 | } |
| 775 | |
| 776 | static const struct debug_obj_descr timer_debug_descr = { |
| 777 | .name = "timer_list" , |
| 778 | .debug_hint = timer_debug_hint, |
| 779 | .is_static_object = timer_is_static_object, |
| 780 | .fixup_init = timer_fixup_init, |
| 781 | .fixup_activate = timer_fixup_activate, |
| 782 | .fixup_free = timer_fixup_free, |
| 783 | .fixup_assert_init = timer_fixup_assert_init, |
| 784 | }; |
| 785 | |
| 786 | static inline void debug_timer_init(struct timer_list *timer) |
| 787 | { |
| 788 | debug_object_init(timer, &timer_debug_descr); |
| 789 | } |
| 790 | |
| 791 | static inline void debug_timer_activate(struct timer_list *timer) |
| 792 | { |
| 793 | debug_object_activate(timer, &timer_debug_descr); |
| 794 | } |
| 795 | |
| 796 | static inline void debug_timer_deactivate(struct timer_list *timer) |
| 797 | { |
| 798 | debug_object_deactivate(timer, &timer_debug_descr); |
| 799 | } |
| 800 | |
| 801 | static inline void debug_timer_assert_init(struct timer_list *timer) |
| 802 | { |
| 803 | debug_object_assert_init(timer, &timer_debug_descr); |
| 804 | } |
| 805 | |
| 806 | static void do_init_timer(struct timer_list *timer, |
| 807 | void (*func)(struct timer_list *), |
| 808 | unsigned int flags, |
| 809 | const char *name, struct lock_class_key *key); |
| 810 | |
| 811 | void timer_init_key_on_stack(struct timer_list *timer, |
| 812 | void (*func)(struct timer_list *), |
| 813 | unsigned int flags, |
| 814 | const char *name, struct lock_class_key *key) |
| 815 | { |
| 816 | debug_object_init_on_stack(timer, &timer_debug_descr); |
| 817 | do_init_timer(timer, func, flags, name, key); |
| 818 | } |
| 819 | EXPORT_SYMBOL_GPL(timer_init_key_on_stack); |
| 820 | |
| 821 | void timer_destroy_on_stack(struct timer_list *timer) |
| 822 | { |
| 823 | debug_object_free(timer, &timer_debug_descr); |
| 824 | } |
| 825 | EXPORT_SYMBOL_GPL(timer_destroy_on_stack); |
| 826 | |
| 827 | #else |
| 828 | static inline void debug_timer_init(struct timer_list *timer) { } |
| 829 | static inline void debug_timer_activate(struct timer_list *timer) { } |
| 830 | static inline void debug_timer_deactivate(struct timer_list *timer) { } |
| 831 | static inline void debug_timer_assert_init(struct timer_list *timer) { } |
| 832 | #endif |
| 833 | |
| 834 | static inline void debug_init(struct timer_list *timer) |
| 835 | { |
| 836 | debug_timer_init(timer); |
| 837 | trace_timer_init(timer); |
| 838 | } |
| 839 | |
| 840 | static inline void debug_deactivate(struct timer_list *timer) |
| 841 | { |
| 842 | debug_timer_deactivate(timer); |
| 843 | trace_timer_cancel(timer); |
| 844 | } |
| 845 | |
| 846 | static inline void debug_assert_init(struct timer_list *timer) |
| 847 | { |
| 848 | debug_timer_assert_init(timer); |
| 849 | } |
| 850 | |
| 851 | static void do_init_timer(struct timer_list *timer, |
| 852 | void (*func)(struct timer_list *), |
| 853 | unsigned int flags, |
| 854 | const char *name, struct lock_class_key *key) |
| 855 | { |
| 856 | timer->entry.pprev = NULL; |
| 857 | timer->function = func; |
| 858 | if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) |
| 859 | flags &= TIMER_INIT_FLAGS; |
| 860 | timer->flags = flags | raw_smp_processor_id(); |
| 861 | lockdep_init_map(&timer->lockdep_map, name, key, 0); |
| 862 | } |
| 863 | |
| 864 | /** |
| 865 | * timer_init_key - initialize a timer |
| 866 | * @timer: the timer to be initialized |
| 867 | * @func: timer callback function |
| 868 | * @flags: timer flags |
| 869 | * @name: name of the timer |
| 870 | * @key: lockdep class key of the fake lock used for tracking timer |
| 871 | * sync lock dependencies |
| 872 | * |
| 873 | * timer_init_key() must be done to a timer prior to calling *any* of the |
| 874 | * other timer functions. |
| 875 | */ |
| 876 | void timer_init_key(struct timer_list *timer, |
| 877 | void (*func)(struct timer_list *), unsigned int flags, |
| 878 | const char *name, struct lock_class_key *key) |
| 879 | { |
| 880 | debug_init(timer); |
| 881 | do_init_timer(timer, func, flags, name, key); |
| 882 | } |
| 883 | EXPORT_SYMBOL(timer_init_key); |
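| | /* |
| | * Initialization sketch (editorial; 'struct foo', foo_timeout() and the |
| | * flag choice are made up). Most users reach this function through the |
| | * timer_setup() wrapper from <linux/timer.h>, and resolve their containing |
| | * object with timer_container_of() (from_timer() in older trees): |
| | * |
| | * struct foo { |
| | * struct timer_list timer; |
| | * }; |
| | * |
| | * static void foo_timeout(struct timer_list *t) |
| | * { |
| | * struct foo *f = timer_container_of(f, t, timer); |
| | * } |
| | * |
| | * timer_setup(&f->timer, foo_timeout, TIMER_DEFERRABLE); |
| | */ |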
| 884 | |
| 885 | static inline void detach_timer(struct timer_list *timer, bool clear_pending) |
| 886 | { |
| 887 | struct hlist_node *entry = &timer->entry; |
| 888 | |
| 889 | debug_deactivate(timer); |
| 890 | |
| 891 | __hlist_del(entry); |
| 892 | if (clear_pending) |
| 893 | entry->pprev = NULL; |
| 894 | entry->next = LIST_POISON2; |
| 895 | } |
| 896 | |
| 897 | static int detach_if_pending(struct timer_list *timer, struct timer_base *base, |
| 898 | bool clear_pending) |
| 899 | { |
| 900 | unsigned idx = timer_get_idx(timer); |
| 901 | |
| 902 | if (!timer_pending(timer)) |
| 903 | return 0; |
| 904 | |
| 905 | if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { |
| 906 | __clear_bit(idx, base->pending_map); |
| 907 | base->next_expiry_recalc = true; |
| 908 | } |
| 909 | |
| 910 | detach_timer(timer, clear_pending); |
| 911 | return 1; |
| 912 | } |
| 913 | |
| 914 | static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) |
| 915 | { |
| 916 | int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; |
| 917 | |
| 918 | /* |
| 919 | * If the timer is deferrable and NO_HZ_COMMON is set then we need |
| 920 | * to use the deferrable base. |
| 921 | */ |
| 922 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) |
| 923 | index = BASE_DEF; |
| 924 | |
| 925 | return per_cpu_ptr(&timer_bases[index], cpu); |
| 926 | } |
| 927 | |
| 928 | static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) |
| 929 | { |
| 930 | int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; |
| 931 | |
| 932 | /* |
| 933 | * If the timer is deferrable and NO_HZ_COMMON is set then we need |
| 934 | * to use the deferrable base. |
| 935 | */ |
| 936 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) |
| 937 | index = BASE_DEF; |
| 938 | |
| 939 | return this_cpu_ptr(&timer_bases[index]); |
| 940 | } |
| 941 | |
| 942 | static inline struct timer_base *get_timer_base(u32 tflags) |
| 943 | { |
| 944 | return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); |
| 945 | } |
| 946 | |
| 947 | static inline void __forward_timer_base(struct timer_base *base, |
| 948 | unsigned long basej) |
| 949 | { |
| 950 | /* |
| 951 | * Check whether we can forward the base. We can only do that when |
| 952 | * @basej is past base->clk otherwise we might rewind base->clk. |
| 953 | */ |
| 954 | if (time_before_eq(basej, base->clk)) |
| 955 | return; |
| 956 | |
| 957 | /* |
| 958 | * If the next expiry value is > jiffies, then we fast forward to |
| 959 | * jiffies otherwise we forward to the next expiry value. |
| 960 | */ |
| 961 | if (time_after(base->next_expiry, basej)) { |
| 962 | base->clk = basej; |
| 963 | } else { |
| 964 | if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) |
| 965 | return; |
| 966 | base->clk = base->next_expiry; |
| 967 | } |
| 968 | |
| 969 | } |
| 970 | |
| 971 | static inline void forward_timer_base(struct timer_base *base) |
| 972 | { |
| 973 | __forward_timer_base(base, READ_ONCE(jiffies)); |
| 974 | } |
| 975 | |
| 976 | /* |
| 977 | * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means |
| 978 | * that all timers which are tied to this base are locked, and the base itself |
| 979 | * is locked too. |
| 980 | * |
| 981 | * So __run_timers/migrate_timers can safely modify all timers which could |
| 982 | * be found in the base->vectors array. |
| 983 | * |
| 984 | * When a timer is migrating then the TIMER_MIGRATING flag is set and we need |
| 985 | * to wait until the migration is done. |
| 986 | */ |
| 987 | static struct timer_base *lock_timer_base(struct timer_list *timer, |
| 988 | unsigned long *flags) |
| 989 | __acquires(timer->base->lock) |
| 990 | { |
| 991 | for (;;) { |
| 992 | struct timer_base *base; |
| 993 | u32 tf; |
| 994 | |
| 995 | /* |
| 996 | * We need to use READ_ONCE() here, otherwise the compiler |
| 997 | * might re-read @tf between the check for TIMER_MIGRATING |
| 998 | * and spin_lock(). |
| 999 | */ |
| 1000 | tf = READ_ONCE(timer->flags); |
| 1001 | |
| 1002 | if (!(tf & TIMER_MIGRATING)) { |
| 1003 | base = get_timer_base(tf); |
| 1004 | raw_spin_lock_irqsave(&base->lock, *flags); |
| 1005 | if (timer->flags == tf) |
| 1006 | return base; |
| 1007 | raw_spin_unlock_irqrestore(&base->lock, *flags); |
| 1008 | } |
| 1009 | cpu_relax(); |
| 1010 | } |
| 1011 | } |
| 1012 | |
| 1013 | #define MOD_TIMER_PENDING_ONLY 0x01 |
| 1014 | #define MOD_TIMER_REDUCE 0x02 |
| 1015 | #define MOD_TIMER_NOTPENDING 0x04 |
| 1016 | |
| 1017 | static inline int |
| 1018 | __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) |
| 1019 | { |
| 1020 | unsigned long clk = 0, flags, bucket_expiry; |
| 1021 | struct timer_base *base, *new_base; |
| 1022 | unsigned int idx = UINT_MAX; |
| 1023 | int ret = 0; |
| 1024 | |
| 1025 | debug_assert_init(timer); |
| 1026 | |
| 1027 | /* |
| 1028 | * This is a common optimization triggered by the networking code - if |
| 1029 | * the timer is re-modified to have the same timeout or ends up in the |
| 1030 | * same array bucket then just return: |
| 1031 | */ |
| 1032 | if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { |
| 1033 | /* |
| 1034 | * The downside of this optimization is that it can result in |
| 1035 | * larger granularity than you would get from adding a new |
| 1036 | * timer with this expiry. |
| 1037 | */ |
| 1038 | long diff = timer->expires - expires; |
| 1039 | |
| 1040 | if (!diff) |
| 1041 | return 1; |
| 1042 | if (options & MOD_TIMER_REDUCE && diff <= 0) |
| 1043 | return 1; |
| 1044 | |
| 1045 | /* |
| 1046 | * We lock timer base and calculate the bucket index right |
| 1047 | * here. If the timer ends up in the same bucket, then we |
| 1048 | * just update the expiry time and avoid the whole |
| 1049 | * dequeue/enqueue dance. |
| 1050 | */ |
| 1051 | base = lock_timer_base(timer, &flags); |
| 1052 | /* |
| 1053 | * Has @timer been shutdown? This needs to be evaluated |
| 1054 | * while holding base lock to prevent a race against the |
| 1055 | * shutdown code. |
| 1056 | */ |
| 1057 | if (!timer->function) |
| 1058 | goto out_unlock; |
| 1059 | |
| 1060 | forward_timer_base(base); |
| 1061 | |
| 1062 | if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && |
| 1063 | time_before_eq(timer->expires, expires)) { |
| 1064 | ret = 1; |
| 1065 | goto out_unlock; |
| 1066 | } |
| 1067 | |
| 1068 | clk = base->clk; |
| 1069 | idx = calc_wheel_index(expires, clk, &bucket_expiry); |
| 1070 | |
| 1071 | /* |
| 1072 | * Retrieve and compare the array index of the pending |
| 1073 | * timer. If it matches set the expiry to the new value so a |
| 1074 | * subsequent call will exit in the expires check above. |
| 1075 | */ |
| 1076 | if (idx == timer_get_idx(timer)) { |
| 1077 | if (!(options & MOD_TIMER_REDUCE)) |
| 1078 | timer->expires = expires; |
| 1079 | else if (time_after(timer->expires, expires)) |
| 1080 | timer->expires = expires; |
| 1081 | ret = 1; |
| 1082 | goto out_unlock; |
| 1083 | } |
| 1084 | } else { |
| 1085 | base = lock_timer_base(timer, &flags); |
| 1086 | /* |
| 1087 | * Has @timer been shutdown? This needs to be evaluated |
| 1088 | * while holding base lock to prevent a race against the |
| 1089 | * shutdown code. |
| 1090 | */ |
| 1091 | if (!timer->function) |
| 1092 | goto out_unlock; |
| 1093 | |
| 1094 | forward_timer_base(base); |
| 1095 | } |
| 1096 | |
| 1097 | ret = detach_if_pending(timer, base, false); |
| 1098 | if (!ret && (options & MOD_TIMER_PENDING_ONLY)) |
| 1099 | goto out_unlock; |
| 1100 | |
| 1101 | new_base = get_timer_this_cpu_base(timer->flags); |
| 1102 | |
| 1103 | if (base != new_base) { |
| 1104 | /* |
| 1105 | * We are trying to schedule the timer on the new base. |
| 1106 | * However we can't change timer's base while it is running, |
| 1107 | * otherwise timer_delete_sync() can't detect that the timer's |
| 1108 | * handler has not finished yet. This also guarantees that the |
| 1109 | * timer is serialized wrt itself. |
| 1110 | */ |
| 1111 | if (likely(base->running_timer != timer)) { |
| 1112 | /* See the comment in lock_timer_base() */ |
| 1113 | timer->flags |= TIMER_MIGRATING; |
| 1114 | |
| 1115 | raw_spin_unlock(&base->lock); |
| 1116 | base = new_base; |
| 1117 | raw_spin_lock(&base->lock); |
| 1118 | WRITE_ONCE(timer->flags, |
| 1119 | (timer->flags & ~TIMER_BASEMASK) | base->cpu); |
| 1120 | forward_timer_base(base); |
| 1121 | } |
| 1122 | } |
| 1123 | |
| 1124 | debug_timer_activate(timer); |
| 1125 | |
| 1126 | timer->expires = expires; |
| 1127 | /* |
| 1128 | * If 'idx' was calculated above and the base time did not advance |
| 1129 | * between calculating 'idx' and possibly switching the base, only |
| 1130 | * enqueue_timer() is required. Otherwise we need to (re)calculate |
| 1131 | * the wheel index via internal_add_timer(). |
| 1132 | */ |
| 1133 | if (idx != UINT_MAX && clk == base->clk) |
| 1134 | enqueue_timer(base, timer, idx, bucket_expiry); |
| 1135 | else |
| 1136 | internal_add_timer(base, timer); |
| 1137 | |
| 1138 | out_unlock: |
| 1139 | raw_spin_unlock_irqrestore(&base->lock, flags); |
| 1140 | |
| 1141 | return ret; |
| 1142 | } |
| 1143 | |
| 1144 | /** |
| 1145 | * mod_timer_pending - Modify a pending timer's timeout |
| 1146 | * @timer: The pending timer to be modified |
| 1147 | * @expires: New absolute timeout in jiffies |
| 1148 | * |
| 1149 | * mod_timer_pending() is the same for pending timers as mod_timer(), but |
| 1150 | * will not activate inactive timers. |
| 1151 | * |
| 1152 | * If @timer->function == NULL then the start operation is silently |
| 1153 | * discarded. |
| 1154 | * |
| 1155 | * Return: |
| 1156 | * * %0 - The timer was inactive and not modified or was in |
| 1157 | * shutdown state and the operation was discarded |
| 1158 | * * %1 - The timer was active and requeued to expire at @expires |
| 1159 | */ |
| 1160 | int mod_timer_pending(struct timer_list *timer, unsigned long expires) |
| 1161 | { |
| 1162 | return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); |
| 1163 | } |
| 1164 | EXPORT_SYMBOL(mod_timer_pending); |
| 1165 | |
| 1166 | /** |
| 1167 | * mod_timer - Modify a timer's timeout |
| 1168 | * @timer: The timer to be modified |
| 1169 | * @expires: New absolute timeout in jiffies |
| 1170 | * |
| 1171 | * mod_timer(timer, expires) is equivalent to: |
| 1172 | * |
| 1173 | * timer_delete(timer); timer->expires = expires; add_timer(timer); |
| 1174 | * |
| 1175 | * mod_timer() is more efficient than the above open coded sequence. In |
| 1176 | * case that the timer is inactive, the timer_delete() part is a NOP. The |
| 1177 | * timer is in any case activated with the new expiry time @expires. |
| 1178 | * |
| 1179 | * Note that if there are multiple unserialized concurrent users of the |
| 1180 | * same timer, then mod_timer() is the only safe way to modify the timeout, |
| 1181 | * since add_timer() cannot modify an already running timer. |
| 1182 | * |
| 1183 | * If @timer->function == NULL then the start operation is silently |
| 1184 | * discarded. In this case the return value is 0 and meaningless. |
| 1185 | * |
| 1186 | * Return: |
| 1187 | * * %0 - The timer was inactive and started or was in shutdown |
| 1188 | * state and the operation was discarded |
| 1189 | * * %1 - The timer was active and requeued to expire at @expires or |
| 1190 | * the timer was active and not modified because @expires did |
| 1191 | * not change the effective expiry time |
| 1192 | */ |
| 1193 | int mod_timer(struct timer_list *timer, unsigned long expires) |
| 1194 | { |
| 1195 | return __mod_timer(timer, expires, 0); |
| 1196 | } |
| 1197 | EXPORT_SYMBOL(mod_timer); |
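| | /* |
| | * Typical rearm pattern (editorial; foo_poll(), do_periodic_work() and the |
| | * one second period are made up): a callback that requeues itself from |
| | * softirq context. |
| | * |
| | * static void foo_poll(struct timer_list *t) |
| | * { |
| | * do_periodic_work(t); |
| | * mod_timer(t, jiffies + HZ); |
| | * } |
| | */ |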
| 1198 | |
| 1199 | /** |
| 1200 | * timer_reduce - Modify a timer's timeout if it would reduce the timeout |
| 1201 | * @timer: The timer to be modified |
| 1202 | * @expires: New absolute timeout in jiffies |
| 1203 | * |
| 1204 | * timer_reduce() is very similar to mod_timer(), except that it will only |
| 1205 | * modify an enqueued timer if that would reduce the expiration time. If |
| 1206 | * @timer is not enqueued it starts the timer. |
| 1207 | * |
| 1208 | * If @timer->function == NULL then the start operation is silently |
| 1209 | * discarded. |
| 1210 | * |
| 1211 | * Return: |
| 1212 | * * %0 - The timer was inactive and started or was in shutdown |
| 1213 | * state and the operation was discarded |
| 1214 | * * %1 - The timer was active and requeued to expire at @expires or |
| 1215 | * the timer was active and not modified because @expires |
| 1216 | * did not change the effective expiry time such that the |
| 1217 | * timer would expire earlier than already scheduled |
| 1218 | */ |
| 1219 | int timer_reduce(struct timer_list *timer, unsigned long expires) |
| 1220 | { |
| 1221 | return __mod_timer(timer, expires, MOD_TIMER_REDUCE); |
| 1222 | } |
| 1223 | EXPORT_SYMBOL(timer_reduce); |
| 1224 | |
| 1225 | /** |
| 1226 | * add_timer - Start a timer |
| 1227 | * @timer: The timer to be started |
| 1228 | * |
| 1229 | * Start @timer to expire at @timer->expires in the future. @timer->expires |
| 1230 | * is the absolute expiry time measured in 'jiffies'. When the timer expires |
| 1231 | * timer->function(timer) will be invoked from soft interrupt context. |
| 1232 | * |
| 1233 | * The @timer->expires and @timer->function fields must be set prior |
| 1234 | * to calling this function. |
| 1235 | * |
| 1236 | * If @timer->function == NULL then the start operation is silently |
| 1237 | * discarded. |
| 1238 | * |
| 1239 | * If @timer->expires is already in the past @timer will be queued to |
| 1240 | * expire at the next timer tick. |
| 1241 | * |
| 1242 | * This can only operate on an inactive timer. Attempts to invoke this on |
| 1243 | * an active timer are rejected with a warning. |
| 1244 | */ |
| 1245 | void add_timer(struct timer_list *timer) |
| 1246 | { |
| 1247 | if (WARN_ON_ONCE(timer_pending(timer))) |
| 1248 | return; |
| 1249 | __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); |
| 1250 | } |
| 1251 | EXPORT_SYMBOL(add_timer); |
| 1252 | |
| 1253 | /** |
| 1254 | * add_timer_local() - Start a timer on the local CPU |
| 1255 | * @timer: The timer to be started |
| 1256 | * |
| 1257 | * Same as add_timer() except that the timer flag TIMER_PINNED is set. |
| 1258 | * |
| 1259 | * See add_timer() for further details. |
| 1260 | */ |
| 1261 | void add_timer_local(struct timer_list *timer) |
| 1262 | { |
| 1263 | if (WARN_ON_ONCE(timer_pending(timer))) |
| 1264 | return; |
| 1265 | timer->flags |= TIMER_PINNED; |
| 1266 | __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); |
| 1267 | } |
| 1268 | EXPORT_SYMBOL(add_timer_local); |
| 1269 | |
| 1270 | /** |
| 1271 | * add_timer_global() - Start a timer without TIMER_PINNED flag set |
| 1272 | * @timer: The timer to be started |
| 1273 | * |
| 1274 | * Same as add_timer() except that the timer flag TIMER_PINNED is unset. |
| 1275 | * |
| 1276 | * See add_timer() for further details. |
| 1277 | */ |
| 1278 | void add_timer_global(struct timer_list *timer) |
| 1279 | { |
| 1280 | if (WARN_ON_ONCE(timer_pending(timer))) |
| 1281 | return; |
| 1282 | timer->flags &= ~TIMER_PINNED; |
| 1283 | __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); |
| 1284 | } |
| 1285 | EXPORT_SYMBOL(add_timer_global); |
| 1286 | |
| 1287 | /** |
| 1288 | * add_timer_on - Start a timer on a particular CPU |
| 1289 | * @timer: The timer to be started |
| 1290 | * @cpu: The CPU to start it on |
| 1291 | * |
| 1292 | * Same as add_timer() except that it starts the timer on the given CPU and |
| 1293 | * the TIMER_PINNED flag is set. If the timer should not be pinned on |
| 1294 | * the next arming, add_timer_global() should be used instead as it unsets |
| 1295 | * the TIMER_PINNED flag. |
| 1296 | * |
| 1297 | * See add_timer() for further details. |
| 1298 | */ |
| 1299 | void add_timer_on(struct timer_list *timer, int cpu) |
| 1300 | { |
| 1301 | struct timer_base *new_base, *base; |
| 1302 | unsigned long flags; |
| 1303 | |
| 1304 | debug_assert_init(timer); |
| 1305 | |
| 1306 | if (WARN_ON_ONCE(timer_pending(timer))) |
| 1307 | return; |
| 1308 | |
| 1309 | /* Make sure timer flags have TIMER_PINNED flag set */ |
| 1310 | timer->flags |= TIMER_PINNED; |
| 1311 | |
| 1312 | new_base = get_timer_cpu_base(timer->flags, cpu); |
| 1313 | |
| 1314 | /* |
| 1315 | * If @timer was on a different CPU, it should be migrated with the |
| 1316 | * old base locked to prevent other operations proceeding with the |
| 1317 | * wrong base locked. See lock_timer_base(). |
| 1318 | */ |
| 1319 | base = lock_timer_base(timer, &flags); |
| 1320 | /* |
| 1321 | * Has @timer been shutdown? This needs to be evaluated while |
| 1322 | * holding base lock to prevent a race against the shutdown code. |
| 1323 | */ |
| 1324 | if (!timer->function) |
| 1325 | goto out_unlock; |
| 1326 | |
| 1327 | if (base != new_base) { |
| 1328 | timer->flags |= TIMER_MIGRATING; |
| 1329 | |
| 1330 | raw_spin_unlock(&base->lock); |
| 1331 | base = new_base; |
| 1332 | raw_spin_lock(&base->lock); |
| 1333 | WRITE_ONCE(timer->flags, |
| 1334 | (timer->flags & ~TIMER_BASEMASK) | cpu); |
| 1335 | } |
| 1336 | forward_timer_base(base); |
| 1337 | |
| 1338 | debug_timer_activate(timer); |
| 1339 | internal_add_timer(base, timer); |
| 1340 | out_unlock: |
| 1341 | raw_spin_unlock_irqrestore(&base->lock, flags); |
| 1342 | } |
| 1343 | EXPORT_SYMBOL_GPL(add_timer_on); |
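| | /* |
| | * Usage sketch (editorial; 'watchdog_timer' and the two second period are |
| | * made up): arm a timer that must expire on the CPU being monitored. |
| | * |
| | * watchdog_timer.expires = jiffies + 2 * HZ; |
| | * add_timer_on(&watchdog_timer, cpu); |
| | */ |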
| 1344 | |
| 1345 | /** |
| 1346 | * __timer_delete - Internal function: Deactivate a timer |
| 1347 | * @timer: The timer to be deactivated |
| 1348 | * @shutdown: If true, this indicates that the timer is about to be |
| 1349 | * shutdown permanently. |
| 1350 | * |
| 1351 | * If @shutdown is true then @timer->function is set to NULL under the |
| 1352 | * timer base lock which prevents further rearming of the timer. In that |
| 1353 | * case any attempt to rearm @timer after this function returns will be |
| 1354 | * silently ignored. |
| 1355 | * |
| 1356 | * Return: |
| 1357 | * * %0 - The timer was not pending |
| 1358 | * * %1 - The timer was pending and deactivated |
| 1359 | */ |
| 1360 | static int __timer_delete(struct timer_list *timer, bool shutdown) |
| 1361 | { |
| 1362 | struct timer_base *base; |
| 1363 | unsigned long flags; |
| 1364 | int ret = 0; |
| 1365 | |
| 1366 | debug_assert_init(timer); |
| 1367 | |
| 1368 | /* |
| 1369 | * If @shutdown is set then the lock has to be taken whether the |
| 1370 | * timer is pending or not to protect against a concurrent rearm |
| 1371 | * which might hit between the lockless pending check and the lock |
| 1372 | * acquisition. By taking the lock it is ensured that such a newly |
| 1373 | * enqueued timer is dequeued and cannot end up with |
| 1374 | * timer->function == NULL in the expiry code. |
| 1375 | * |
| 1376 | * If timer->function is currently executed, then this makes sure |
| 1377 | * that the callback cannot requeue the timer. |
| 1378 | */ |
| 1379 | if (timer_pending(timer) || shutdown) { |
| 1380 | base = lock_timer_base(timer, &flags); |
| 1381 | ret = detach_if_pending(timer, base, true); |
| 1382 | if (shutdown) |
| 1383 | timer->function = NULL; |
| 1384 | raw_spin_unlock_irqrestore(&base->lock, flags); |
| 1385 | } |
| 1386 | |
| 1387 | return ret; |
| 1388 | } |
| 1389 | |
| 1390 | /** |
| 1391 | * timer_delete - Deactivate a timer |
| 1392 | * @timer: The timer to be deactivated |
| 1393 | * |
| 1394 | * The function only deactivates a pending timer, but contrary to |
| 1395 | * timer_delete_sync() it does not take into account whether the timer's |
| 1396 | * callback function is concurrently executed on a different CPU or not. |
| 1397 | * Nor does it prevent rearming of the timer. If @timer can be rearmed |
| 1398 | * concurrently then the return value of this function is meaningless. |
| 1399 | * |
| 1400 | * Return: |
| 1401 | * * %0 - The timer was not pending |
| 1402 | * * %1 - The timer was pending and deactivated |
| 1403 | */ |
| 1404 | int timer_delete(struct timer_list *timer) |
| 1405 | { |
| 1406 | return __timer_delete(timer, false); |
| 1407 | } |
| 1408 | EXPORT_SYMBOL(timer_delete); |
| 1409 | |
| 1410 | /** |
| 1411 | * timer_shutdown - Deactivate a timer and prevent rearming |
| 1412 | * @timer: The timer to be deactivated |
| 1413 | * |
| 1414 | * The function does not wait for a possibly running timer callback on a |
| 1415 | * different CPU but it prevents rearming of the timer. Any attempt to arm |
| 1416 | * @timer after this function returns will be silently ignored. |
| 1417 | * |
| 1418 | * This function is useful for teardown code and should only be used when |
| 1419 | * timer_shutdown_sync() cannot be invoked due to locking or context constraints. |
| 1420 | * |
| 1421 | * Return: |
| 1422 | * * %0 - The timer was not pending |
| 1423 | * * %1 - The timer was pending |
| 1424 | */ |
| 1425 | int timer_shutdown(struct timer_list *timer) |
| 1426 | { |
	return __timer_delete(timer, true);
| 1428 | } |
| 1429 | EXPORT_SYMBOL_GPL(timer_shutdown); |
| 1430 | |
| 1431 | /** |
| 1432 | * __try_to_del_timer_sync - Internal function: Try to deactivate a timer |
| 1433 | * @timer: Timer to deactivate |
| 1434 | * @shutdown: If true, this indicates that the timer is about to be |
| 1435 | * shutdown permanently. |
| 1436 | * |
| 1437 | * If @shutdown is true then @timer->function is set to NULL under the |
| 1438 | * timer base lock which prevents further rearming of the timer. Any |
| 1439 | * attempt to rearm @timer after this function returns will be silently |
| 1440 | * ignored. |
| 1441 | * |
| 1442 | * This function cannot guarantee that the timer cannot be rearmed |
| 1443 | * right after dropping the base lock if @shutdown is false. That |
| 1444 | * needs to be prevented by the calling code if necessary. |
| 1445 | * |
| 1446 | * Return: |
| 1447 | * * %0 - The timer was not pending |
| 1448 | * * %1 - The timer was pending and deactivated |
| 1449 | * * %-1 - The timer callback function is running on a different CPU |
| 1450 | */ |
| 1451 | static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) |
| 1452 | { |
| 1453 | struct timer_base *base; |
| 1454 | unsigned long flags; |
| 1455 | int ret = -1; |
| 1456 | |
| 1457 | debug_assert_init(timer); |
| 1458 | |
	base = lock_timer_base(timer, &flags);
| 1460 | |
| 1461 | if (base->running_timer != timer) |
		ret = detach_if_pending(timer, base, true);
| 1463 | if (shutdown) |
| 1464 | timer->function = NULL; |
| 1465 | |
| 1466 | raw_spin_unlock_irqrestore(&base->lock, flags); |
| 1467 | |
| 1468 | return ret; |
| 1469 | } |
| 1470 | |
| 1471 | /** |
| 1472 | * timer_delete_sync_try - Try to deactivate a timer |
| 1473 | * @timer: Timer to deactivate |
| 1474 | * |
| 1475 | * This function tries to deactivate a timer. On success the timer is not |
| 1476 | * queued and the timer callback function is not running on any CPU. |
| 1477 | * |
| 1478 | * This function does not guarantee that the timer cannot be rearmed right |
| 1479 | * after dropping the base lock. That needs to be prevented by the calling |
| 1480 | * code if necessary. |
| 1481 | * |
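 * A minimal retry sketch, assuming a hypothetical 'mydev' structure whose
 * lock is also taken by the timer callback, which rules out a straight
 * timer_delete_sync() call::
 *
 *	for (;;) {
 *		spin_lock_irq(&mydev->lock);
 *		if (timer_delete_sync_try(&mydev->timer) >= 0)
 *			break;
 *		spin_unlock_irq(&mydev->lock);
 *		cpu_relax();
 *	}
 *	... continue teardown with mydev->lock held ...
 *	spin_unlock_irq(&mydev->lock);
 *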
| 1482 | * Return: |
| 1483 | * * %0 - The timer was not pending |
| 1484 | * * %1 - The timer was pending and deactivated |
| 1485 | * * %-1 - The timer callback function is running on a different CPU |
| 1486 | */ |
| 1487 | int timer_delete_sync_try(struct timer_list *timer) |
| 1488 | { |
	return __try_to_del_timer_sync(timer, false);
| 1490 | } |
| 1491 | EXPORT_SYMBOL(timer_delete_sync_try); |
| 1492 | |
| 1493 | #ifdef CONFIG_PREEMPT_RT |
| 1494 | static __init void timer_base_init_expiry_lock(struct timer_base *base) |
| 1495 | { |
| 1496 | spin_lock_init(&base->expiry_lock); |
| 1497 | } |
| 1498 | |
| 1499 | static inline void timer_base_lock_expiry(struct timer_base *base) |
| 1500 | { |
| 1501 | spin_lock(&base->expiry_lock); |
| 1502 | } |
| 1503 | |
| 1504 | static inline void timer_base_unlock_expiry(struct timer_base *base) |
| 1505 | { |
| 1506 | spin_unlock(&base->expiry_lock); |
| 1507 | } |
| 1508 | |
| 1509 | /* |
| 1510 | * The counterpart to del_timer_wait_running(). |
| 1511 | * |
| 1512 | * If there is a waiter for base->expiry_lock, then it was waiting for the |
| 1513 | * timer callback to finish. Drop expiry_lock and reacquire it. That allows |
| 1514 | * the waiter to acquire the lock and make progress. |
| 1515 | */ |
| 1516 | static void timer_sync_wait_running(struct timer_base *base) |
| 1517 | __releases(&base->lock) __releases(&base->expiry_lock) |
| 1518 | __acquires(&base->expiry_lock) __acquires(&base->lock) |
| 1519 | { |
| 1520 | if (atomic_read(&base->timer_waiters)) { |
| 1521 | raw_spin_unlock_irq(&base->lock); |
| 1522 | spin_unlock(&base->expiry_lock); |
| 1523 | spin_lock(&base->expiry_lock); |
| 1524 | raw_spin_lock_irq(&base->lock); |
| 1525 | } |
| 1526 | } |
| 1527 | |
| 1528 | /* |
| 1529 | * This function is called on PREEMPT_RT kernels when the fast path |
| 1530 | * deletion of a timer failed because the timer callback function was |
| 1531 | * running. |
| 1532 | * |
| 1533 | * This prevents priority inversion, if the softirq thread on a remote CPU |
 * got preempted, and it prevents a livelock when the task which tries to
| 1535 | * delete a timer preempted the softirq thread running the timer callback |
| 1536 | * function. |
| 1537 | */ |
| 1538 | static void del_timer_wait_running(struct timer_list *timer) |
| 1539 | { |
| 1540 | u32 tf; |
| 1541 | |
| 1542 | tf = READ_ONCE(timer->flags); |
| 1543 | if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { |
| 1544 | struct timer_base *base = get_timer_base(tf); |
| 1545 | |
| 1546 | /* |
| 1547 | * Mark the base as contended and grab the expiry lock, |
| 1548 | * which is held by the softirq across the timer |
| 1549 | * callback. Drop the lock immediately so the softirq can |
| 1550 | * expire the next timer. In theory the timer could already |
		 * be running again, but that's highly unlikely and just
| 1552 | * causes another wait loop. |
| 1553 | */ |
| 1554 | atomic_inc(&base->timer_waiters); |
| 1555 | spin_lock_bh(&base->expiry_lock); |
| 1556 | atomic_dec(&base->timer_waiters); |
| 1557 | spin_unlock_bh(&base->expiry_lock); |
| 1558 | } |
| 1559 | } |
| 1560 | #else |
| 1561 | static inline void timer_base_init_expiry_lock(struct timer_base *base) { } |
| 1562 | static inline void timer_base_lock_expiry(struct timer_base *base) { } |
| 1563 | static inline void timer_base_unlock_expiry(struct timer_base *base) { } |
| 1564 | static inline void timer_sync_wait_running(struct timer_base *base) { } |
| 1565 | static inline void del_timer_wait_running(struct timer_list *timer) { } |
| 1566 | #endif |
| 1567 | |
| 1568 | /** |
| 1569 | * __timer_delete_sync - Internal function: Deactivate a timer and wait |
| 1570 | * for the handler to finish. |
| 1571 | * @timer: The timer to be deactivated |
| 1572 | * @shutdown: If true, @timer->function will be set to NULL under the |
| 1573 | * timer base lock which prevents rearming of @timer |
| 1574 | * |
| 1575 | * If @shutdown is not set the timer can be rearmed later. If the timer can |
| 1576 | * be rearmed concurrently, i.e. after dropping the base lock then the |
| 1577 | * return value is meaningless. |
| 1578 | * |
| 1579 | * If @shutdown is set then @timer->function is set to NULL under timer |
| 1580 | * base lock which prevents rearming of the timer. Any attempt to rearm |
| 1581 | * a shutdown timer is silently ignored. |
| 1582 | * |
| 1583 | * If the timer should be reused after shutdown it has to be initialized |
| 1584 | * again. |
| 1585 | * |
| 1586 | * Return: |
| 1587 | * * %0 - The timer was not pending |
| 1588 | * * %1 - The timer was pending and deactivated |
| 1589 | */ |
| 1590 | static int __timer_delete_sync(struct timer_list *timer, bool shutdown) |
| 1591 | { |
| 1592 | int ret; |
| 1593 | |
| 1594 | #ifdef CONFIG_LOCKDEP |
| 1595 | unsigned long flags; |
| 1596 | |
| 1597 | /* |
| 1598 | * If lockdep gives a backtrace here, please reference |
| 1599 | * the synchronization rules above. |
| 1600 | */ |
| 1601 | local_irq_save(flags); |
| 1602 | lock_map_acquire(&timer->lockdep_map); |
| 1603 | lock_map_release(&timer->lockdep_map); |
| 1604 | local_irq_restore(flags); |
| 1605 | #endif |
| 1606 | /* |
	 * Don't use this function in hardirq context, because it
	 * could lead to deadlock.
| 1609 | */ |
| 1610 | WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE)); |
| 1611 | |
| 1612 | /* |
| 1613 | * Must be able to sleep on PREEMPT_RT because of the slowpath in |
| 1614 | * del_timer_wait_running(). |
| 1615 | */ |
| 1616 | if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) |
| 1617 | lockdep_assert_preemption_enabled(); |
| 1618 | |
| 1619 | do { |
| 1620 | ret = __try_to_del_timer_sync(timer, shutdown); |
| 1621 | |
| 1622 | if (unlikely(ret < 0)) { |
| 1623 | del_timer_wait_running(timer); |
| 1624 | cpu_relax(); |
| 1625 | } |
| 1626 | } while (ret < 0); |
| 1627 | |
| 1628 | return ret; |
| 1629 | } |
| 1630 | |
| 1631 | /** |
| 1632 | * timer_delete_sync - Deactivate a timer and wait for the handler to finish. |
| 1633 | * @timer: The timer to be deactivated |
| 1634 | * |
| 1635 | * Synchronization rules: Callers must prevent restarting of the timer, |
| 1636 | * otherwise this function is meaningless. It must not be called from |
| 1637 | * interrupt contexts unless the timer is an irqsafe one. The caller must |
| 1638 | * not hold locks which would prevent completion of the timer's callback |
| 1639 | * function. The timer's handler must not call add_timer_on(). Upon exit |
| 1640 | * the timer is not queued and the handler is not running on any CPU. |
| 1641 | * |
| 1642 | * For !irqsafe timers, the caller must not hold locks that are held in |
| 1643 | * interrupt context. Even if the lock has nothing to do with the timer in |
| 1644 | * question. Here's why:: |
| 1645 | * |
| 1646 | * CPU0 CPU1 |
| 1647 | * ---- ---- |
| 1648 | * <SOFTIRQ> |
| 1649 | * call_timer_fn(); |
| 1650 | * base->running_timer = mytimer; |
| 1651 | * spin_lock_irq(somelock); |
| 1652 | * <IRQ> |
| 1653 | * spin_lock(somelock); |
| 1654 | * timer_delete_sync(mytimer); |
| 1655 | * while (base->running_timer == mytimer); |
| 1656 | * |
| 1657 | * Now timer_delete_sync() will never return and never release somelock. |
| 1658 | * The interrupt on the other CPU is waiting to grab somelock but it has |
| 1659 | * interrupted the softirq that CPU0 is waiting to finish. |
| 1660 | * |
| 1661 | * This function cannot guarantee that the timer is not rearmed again by |
| 1662 | * some concurrent or preempting code, right after it dropped the base |
| 1663 | * lock. If there is the possibility of a concurrent rearm then the return |
| 1664 | * value of the function is meaningless. |
| 1665 | * |
| 1666 | * If such a guarantee is needed, e.g. for teardown situations then use |
| 1667 | * timer_shutdown_sync() instead. |
| 1668 | * |
| 1669 | * Return: |
| 1670 | * * %0 - The timer was not pending |
| 1671 | * * %1 - The timer was pending and deactivated |
| 1672 | */ |
| 1673 | int timer_delete_sync(struct timer_list *timer) |
| 1674 | { |
	return __timer_delete_sync(timer, false);
| 1676 | } |
| 1677 | EXPORT_SYMBOL(timer_delete_sync); |
| 1678 | |
| 1679 | /** |
| 1680 | * timer_shutdown_sync - Shutdown a timer and prevent rearming |
| 1681 | * @timer: The timer to be shutdown |
| 1682 | * |
| 1683 | * When the function returns it is guaranteed that: |
| 1684 | * - @timer is not queued |
| 1685 | * - The callback function of @timer is not running |
| 1686 | * - @timer cannot be enqueued again. Any attempt to rearm |
| 1687 | * @timer is silently ignored. |
| 1688 | * |
| 1689 | * See timer_delete_sync() for synchronization rules. |
| 1690 | * |
| 1691 | * This function is useful for final teardown of an infrastructure where |
| 1692 | * the timer is subject to a circular dependency problem. |
| 1693 | * |
| 1694 | * A common pattern for this is a timer and a workqueue where the timer can |
| 1695 | * schedule work and work can arm the timer. On shutdown the workqueue must |
| 1696 | * be destroyed and the timer must be prevented from rearming. Unless the |
| 1697 | * code has conditionals like 'if (mything->in_shutdown)' to prevent that |
| 1698 | * there is no way to get this correct with timer_delete_sync(). |
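 *
 * A minimal sketch of such a circular dependency, with hypothetical
 * 'mything' callbacks::
 *
 *	static void mything_timer_fn(struct timer_list *t)
 *	{
 *		struct mything *m = container_of(t, struct mything, timer);
 *
 *		queue_work(m->workqueue, &m->work);
 *	}
 *
 *	static void mything_work_fn(struct work_struct *work)
 *	{
 *		struct mything *m = container_of(work, struct mything, work);
 *
 *		mod_timer(&m->timer, jiffies + HZ);
 *	}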
| 1699 | * |
 * timer_shutdown_sync() solves this problem. The correct ordering of
| 1701 | * calls in this case is: |
| 1702 | * |
| 1703 | * timer_shutdown_sync(&mything->timer); |
| 1704 | * workqueue_destroy(&mything->workqueue); |
| 1705 | * |
| 1706 | * After this 'mything' can be safely freed. |
| 1707 | * |
| 1708 | * This obviously implies that the timer is not required to be functional |
| 1709 | * for the rest of the shutdown operation. |
| 1710 | * |
| 1711 | * Return: |
| 1712 | * * %0 - The timer was not pending |
| 1713 | * * %1 - The timer was pending |
| 1714 | */ |
| 1715 | int timer_shutdown_sync(struct timer_list *timer) |
| 1716 | { |
	return __timer_delete_sync(timer, true);
| 1718 | } |
| 1719 | EXPORT_SYMBOL_GPL(timer_shutdown_sync); |
| 1720 | |
| 1721 | static void call_timer_fn(struct timer_list *timer, |
| 1722 | void (*fn)(struct timer_list *), |
| 1723 | unsigned long baseclk) |
| 1724 | { |
| 1725 | int count = preempt_count(); |
| 1726 | |
| 1727 | #ifdef CONFIG_LOCKDEP |
| 1728 | /* |
| 1729 | * It is permissible to free the timer from inside the |
	 * function that is called from it; we need to take that into
	 * account for lockdep too. To avoid bogus "held lock freed"
| 1732 | * warnings as well as problems when looking into |
| 1733 | * timer->lockdep_map, make a copy and use that here. |
| 1734 | */ |
| 1735 | struct lockdep_map lockdep_map; |
| 1736 | |
	lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
| 1738 | #endif |
| 1739 | /* |
| 1740 | * Couple the lock chain with the lock chain at |
| 1741 | * timer_delete_sync() by acquiring the lock_map around the fn() |
| 1742 | * call here and in timer_delete_sync(). |
| 1743 | */ |
| 1744 | lock_map_acquire(&lockdep_map); |
| 1745 | |
| 1746 | trace_timer_expire_entry(timer, baseclk); |
| 1747 | fn(timer); |
| 1748 | trace_timer_expire_exit(timer); |
| 1749 | |
| 1750 | lock_map_release(&lockdep_map); |
| 1751 | |
| 1752 | if (count != preempt_count()) { |
		WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
| 1754 | fn, count, preempt_count()); |
| 1755 | /* |
| 1756 | * Restore the preempt count. That gives us a decent |
| 1757 | * chance to survive and extract information. If the |
| 1758 | * callback kept a lock held, bad luck, but not worse |
| 1759 | * than the BUG() we had. |
| 1760 | */ |
		preempt_count_set(count);
| 1762 | } |
| 1763 | } |
| 1764 | |
| 1765 | static void expire_timers(struct timer_base *base, struct hlist_head *head) |
| 1766 | { |
| 1767 | /* |
| 1768 | * This value is required only for tracing. base->clk was |
| 1769 | * incremented directly before expire_timers was called. But expiry |
| 1770 | * is related to the old base->clk value. |
| 1771 | */ |
| 1772 | unsigned long baseclk = base->clk - 1; |
| 1773 | |
	while (!hlist_empty(head)) {
| 1775 | struct timer_list *timer; |
| 1776 | void (*fn)(struct timer_list *); |
| 1777 | |
| 1778 | timer = hlist_entry(head->first, struct timer_list, entry); |
| 1779 | |
| 1780 | base->running_timer = timer; |
		detach_timer(timer, true);
| 1782 | |
| 1783 | fn = timer->function; |
| 1784 | |
| 1785 | if (WARN_ON_ONCE(!fn)) { |
| 1786 | /* Should never happen. Emphasis on should! */ |
| 1787 | base->running_timer = NULL; |
| 1788 | continue; |
| 1789 | } |
| 1790 | |
| 1791 | if (timer->flags & TIMER_IRQSAFE) { |
| 1792 | raw_spin_unlock(&base->lock); |
| 1793 | call_timer_fn(timer, fn, baseclk); |
| 1794 | raw_spin_lock(&base->lock); |
| 1795 | base->running_timer = NULL; |
| 1796 | } else { |
| 1797 | raw_spin_unlock_irq(&base->lock); |
| 1798 | call_timer_fn(timer, fn, baseclk); |
| 1799 | raw_spin_lock_irq(&base->lock); |
| 1800 | base->running_timer = NULL; |
| 1801 | timer_sync_wait_running(base); |
| 1802 | } |
| 1803 | } |
| 1804 | } |
| 1805 | |
| 1806 | static int collect_expired_timers(struct timer_base *base, |
| 1807 | struct hlist_head *heads) |
| 1808 | { |
| 1809 | unsigned long clk = base->clk = base->next_expiry; |
| 1810 | struct hlist_head *vec; |
| 1811 | int i, levels = 0; |
| 1812 | unsigned int idx; |
| 1813 | |
| 1814 | for (i = 0; i < LVL_DEPTH; i++) { |
| 1815 | idx = (clk & LVL_MASK) + i * LVL_SIZE; |
| 1816 | |
| 1817 | if (__test_and_clear_bit(idx, base->pending_map)) { |
| 1818 | vec = base->vectors + idx; |
			hlist_move_list(vec, heads++);
| 1820 | levels++; |
| 1821 | } |
| 1822 | /* Is it time to look at the next level? */ |
| 1823 | if (clk & LVL_CLK_MASK) |
| 1824 | break; |
| 1825 | /* Shift clock for the next level granularity */ |
| 1826 | clk >>= LVL_CLK_SHIFT; |
| 1827 | } |
| 1828 | return levels; |
| 1829 | } |
| 1830 | |
| 1831 | /* |
| 1832 | * Find the next pending bucket of a level. Search from level start (@offset) |
| 1833 | * + @clk upwards and if nothing there, search from start of the level |
| 1834 | * (@offset) up to @offset + clk. |
| 1835 | */ |
| 1836 | static int next_pending_bucket(struct timer_base *base, unsigned offset, |
| 1837 | unsigned clk) |
| 1838 | { |
| 1839 | unsigned pos, start = offset + clk; |
| 1840 | unsigned end = offset + LVL_SIZE; |
| 1841 | |
	pos = find_next_bit(base->pending_map, end, start);
| 1843 | if (pos < end) |
| 1844 | return pos - start; |
| 1845 | |
	pos = find_next_bit(base->pending_map, start, offset);
| 1847 | return pos < start ? pos + LVL_SIZE - start : -1; |
| 1848 | } |
| 1849 | |
| 1850 | /* |
| 1851 | * Search the first expiring timer in the various clock levels. Caller must |
| 1852 | * hold base->lock. |
| 1853 | * |
| 1854 | * Store next expiry time in base->next_expiry. |
| 1855 | */ |
| 1856 | static void timer_recalc_next_expiry(struct timer_base *base) |
| 1857 | { |
| 1858 | unsigned long clk, next, adj; |
| 1859 | unsigned lvl, offset = 0; |
| 1860 | |
| 1861 | next = base->clk + TIMER_NEXT_MAX_DELTA; |
| 1862 | clk = base->clk; |
| 1863 | for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { |
		int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
| 1865 | unsigned long lvl_clk = clk & LVL_CLK_MASK; |
| 1866 | |
| 1867 | if (pos >= 0) { |
| 1868 | unsigned long tmp = clk + (unsigned long) pos; |
| 1869 | |
| 1870 | tmp <<= LVL_SHIFT(lvl); |
| 1871 | if (time_before(tmp, next)) |
| 1872 | next = tmp; |
| 1873 | |
| 1874 | /* |
| 1875 | * If the next expiration happens before we reach |
| 1876 | * the next level, no need to check further. |
| 1877 | */ |
| 1878 | if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) |
| 1879 | break; |
| 1880 | } |
| 1881 | /* |
| 1882 | * Clock for the next level. If the current level clock lower |
| 1883 | * bits are zero, we look at the next level as is. If not we |
| 1884 | * need to advance it by one because that's going to be the |
| 1885 | * next expiring bucket in that level. base->clk is the next |
| 1886 | * expiring jiffy. So in case of: |
| 1887 | * |
| 1888 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
| 1889 | * 0 0 0 0 0 0 |
| 1890 | * |
| 1891 | * we have to look at all levels @index 0. With |
| 1892 | * |
| 1893 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
| 1894 | * 0 0 0 0 0 2 |
| 1895 | * |
| 1896 | * LVL0 has the next expiring bucket @index 2. The upper |
| 1897 | * levels have the next expiring bucket @index 1. |
| 1898 | * |
| 1899 | * In case that the propagation wraps the next level the same |
| 1900 | * rules apply: |
| 1901 | * |
| 1902 | * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 |
| 1903 | * 0 0 0 0 F 2 |
| 1904 | * |
| 1905 | * So after looking at LVL0 we get: |
| 1906 | * |
| 1907 | * LVL5 LVL4 LVL3 LVL2 LVL1 |
| 1908 | * 0 0 0 1 0 |
| 1909 | * |
| 1910 | * So no propagation from LVL1 to LVL2 because that happened |
| 1911 | * with the add already, but then we need to propagate further |
| 1912 | * from LVL2 to LVL3. |
| 1913 | * |
| 1914 | * So the simple check whether the lower bits of the current |
| 1915 | * level are 0 or not is sufficient for all cases. |
| 1916 | */ |
| 1917 | adj = lvl_clk ? 1 : 0; |
| 1918 | clk >>= LVL_CLK_SHIFT; |
| 1919 | clk += adj; |
| 1920 | } |
| 1921 | |
| 1922 | WRITE_ONCE(base->next_expiry, next); |
| 1923 | base->next_expiry_recalc = false; |
| 1924 | base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA); |
| 1925 | } |
| 1926 | |
| 1927 | #ifdef CONFIG_NO_HZ_COMMON |
| 1928 | /* |
| 1929 | * Check, if the next hrtimer event is before the next timer wheel |
| 1930 | * event: |
| 1931 | */ |
| 1932 | static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) |
| 1933 | { |
| 1934 | u64 nextevt = hrtimer_get_next_event(); |
| 1935 | |
| 1936 | /* |
| 1937 | * If high resolution timers are enabled |
| 1938 | * hrtimer_get_next_event() returns KTIME_MAX. |
| 1939 | */ |
| 1940 | if (expires <= nextevt) |
| 1941 | return expires; |
| 1942 | |
| 1943 | /* |
| 1944 | * If the next timer is already expired, return the tick base |
| 1945 | * time so the tick is fired immediately. |
| 1946 | */ |
| 1947 | if (nextevt <= basem) |
| 1948 | return basem; |
| 1949 | |
| 1950 | /* |
| 1951 | * Round up to the next jiffy. High resolution timers are |
| 1952 | * off, so the hrtimers are expired in the tick and we need to |
| 1953 | * make sure that this tick really expires the timer to avoid |
| 1954 | * a ping pong of the nohz stop code. |
| 1955 | * |
| 1956 | * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 |
| 1957 | */ |
| 1958 | return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; |
| 1959 | } |
| 1960 | |
| 1961 | static unsigned long next_timer_interrupt(struct timer_base *base, |
| 1962 | unsigned long basej) |
| 1963 | { |
| 1964 | if (base->next_expiry_recalc) |
| 1965 | timer_recalc_next_expiry(base); |
| 1966 | |
| 1967 | /* |
| 1968 | * Move next_expiry for the empty base into the future to prevent an |
| 1969 | * unnecessary raise of the timer softirq when the next_expiry value |
| 1970 | * will be reached even if there is no timer pending. |
| 1971 | * |
| 1972 | * This update is also required to make timer_base::next_expiry values |
	 * easily comparable to find out which base holds the first pending timer.
| 1974 | */ |
| 1975 | if (!base->timers_pending) |
| 1976 | WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA); |
| 1977 | |
| 1978 | return base->next_expiry; |
| 1979 | } |
| 1980 | |
| 1981 | static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, |
| 1982 | struct timer_base *base_local, |
| 1983 | struct timer_base *base_global, |
| 1984 | struct timer_events *tevt) |
| 1985 | { |
| 1986 | unsigned long nextevt, nextevt_local, nextevt_global; |
| 1987 | bool local_first; |
| 1988 | |
	nextevt_local = next_timer_interrupt(base_local, basej);
	nextevt_global = next_timer_interrupt(base_global, basej);
| 1991 | |
| 1992 | local_first = time_before_eq(nextevt_local, nextevt_global); |
| 1993 | |
| 1994 | nextevt = local_first ? nextevt_local : nextevt_global; |
| 1995 | |
| 1996 | /* |
	 * If @nextevt is at most one tick away, use @nextevt and store
| 1998 | * it in the local expiry value. The next global event is irrelevant in |
| 1999 | * this case and can be left as KTIME_MAX. |
| 2000 | */ |
| 2001 | if (time_before_eq(nextevt, basej + 1)) { |
| 2002 | /* If we missed a tick already, force 0 delta */ |
| 2003 | if (time_before(nextevt, basej)) |
| 2004 | nextevt = basej; |
| 2005 | tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; |
| 2006 | |
| 2007 | /* |
		 * This is required only for the remote check, but it doesn't
		 * hurt when it is done for both call sites:
		 *
		 * * The remote callers will only take care of the global timers,
		 *   as local timers will be handled by the CPU itself. If
		 *   tevt->global is not updated with the already missed first
		 *   global timer, that timer may be missed completely.
		 *
		 * * The local callers will ignore tevt->global anyway when
		 *   nextevt is at most one tick away.
| 2018 | */ |
| 2019 | if (!local_first) |
| 2020 | tevt->global = tevt->local; |
| 2021 | return nextevt; |
| 2022 | } |
| 2023 | |
| 2024 | /* |
| 2025 | * Update tevt.* values: |
| 2026 | * |
| 2027 | * If the local queue expires first, then the global event can be |
| 2028 | * ignored. If the global queue is empty, nothing to do either. |
| 2029 | */ |
| 2030 | if (!local_first && base_global->timers_pending) |
| 2031 | tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; |
| 2032 | |
| 2033 | if (base_local->timers_pending) |
| 2034 | tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; |
| 2035 | |
| 2036 | return nextevt; |
| 2037 | } |
| 2038 | |
| 2039 | # ifdef CONFIG_SMP |
| 2040 | /** |
| 2041 | * fetch_next_timer_interrupt_remote() - Store next timers into @tevt |
| 2042 | * @basej: base time jiffies |
| 2043 | * @basem: base time clock monotonic |
| 2044 | * @tevt: Pointer to the storage for the expiry values |
| 2045 | * @cpu: Remote CPU |
| 2046 | * |
| 2047 | * Stores the next pending local and global timer expiry values in the |
| 2048 | * struct pointed to by @tevt. If a queue is empty the corresponding |
| 2049 | * field is set to KTIME_MAX. If local event expires before global |
| 2050 | * event, global event is set to KTIME_MAX as well. |
| 2051 | * |
| 2052 | * Caller needs to make sure timer base locks are held (use |
| 2053 | * timer_lock_remote_bases() for this purpose). |
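 *
 * A minimal calling sketch, with interrupts disabled as required by
 * timer_lock_remote_bases()::
 *
 *	struct timer_events tevt;
 *
 *	timer_lock_remote_bases(cpu);
 *	fetch_next_timer_interrupt_remote(basej, basem, &tevt, cpu);
 *	timer_unlock_remote_bases(cpu);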
| 2054 | */ |
| 2055 | void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, |
| 2056 | struct timer_events *tevt, |
| 2057 | unsigned int cpu) |
| 2058 | { |
| 2059 | struct timer_base *base_local, *base_global; |
| 2060 | |
| 2061 | /* Preset local / global events */ |
| 2062 | tevt->local = tevt->global = KTIME_MAX; |
| 2063 | |
| 2064 | base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); |
| 2065 | base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); |
| 2066 | |
| 2067 | lockdep_assert_held(&base_local->lock); |
| 2068 | lockdep_assert_held(&base_global->lock); |
| 2069 | |
| 2070 | fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); |
| 2071 | } |
| 2072 | |
| 2073 | /** |
| 2074 | * timer_unlock_remote_bases - unlock timer bases of cpu |
| 2075 | * @cpu: Remote CPU |
| 2076 | * |
| 2077 | * Unlocks the remote timer bases. |
| 2078 | */ |
| 2079 | void timer_unlock_remote_bases(unsigned int cpu) |
| 2080 | __releases(timer_bases[BASE_LOCAL]->lock) |
| 2081 | __releases(timer_bases[BASE_GLOBAL]->lock) |
| 2082 | { |
| 2083 | struct timer_base *base_local, *base_global; |
| 2084 | |
| 2085 | base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); |
| 2086 | base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); |
| 2087 | |
| 2088 | raw_spin_unlock(&base_global->lock); |
| 2089 | raw_spin_unlock(&base_local->lock); |
| 2090 | } |
| 2091 | |
| 2092 | /** |
| 2093 | * timer_lock_remote_bases - lock timer bases of cpu |
| 2094 | * @cpu: Remote CPU |
| 2095 | * |
| 2096 | * Locks the remote timer bases. |
| 2097 | */ |
| 2098 | void timer_lock_remote_bases(unsigned int cpu) |
| 2099 | __acquires(timer_bases[BASE_LOCAL]->lock) |
| 2100 | __acquires(timer_bases[BASE_GLOBAL]->lock) |
| 2101 | { |
| 2102 | struct timer_base *base_local, *base_global; |
| 2103 | |
| 2104 | base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); |
| 2105 | base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); |
| 2106 | |
| 2107 | lockdep_assert_irqs_disabled(); |
| 2108 | |
| 2109 | raw_spin_lock(&base_local->lock); |
| 2110 | raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); |
| 2111 | } |
| 2112 | |
| 2113 | /** |
| 2114 | * timer_base_is_idle() - Return whether timer base is set idle |
| 2115 | * |
 * Returns the is_idle value of the local timer base.
| 2117 | */ |
| 2118 | bool timer_base_is_idle(void) |
| 2119 | { |
| 2120 | return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); |
| 2121 | } |
| 2122 | |
| 2123 | static void __run_timer_base(struct timer_base *base); |
| 2124 | |
| 2125 | /** |
| 2126 | * timer_expire_remote() - expire global timers of cpu |
| 2127 | * @cpu: Remote CPU |
| 2128 | * |
| 2129 | * Expire timers of global base of remote CPU. |
| 2130 | */ |
| 2131 | void timer_expire_remote(unsigned int cpu) |
| 2132 | { |
| 2133 | struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); |
| 2134 | |
| 2135 | __run_timer_base(base); |
| 2136 | } |
| 2137 | |
| 2138 | static void timer_use_tmigr(unsigned long basej, u64 basem, |
| 2139 | unsigned long *nextevt, bool *tick_stop_path, |
| 2140 | bool timer_base_idle, struct timer_events *tevt) |
| 2141 | { |
| 2142 | u64 next_tmigr; |
| 2143 | |
	if (timer_base_idle)
		next_tmigr = tmigr_cpu_new_timer(tevt->global);
	else if (tick_stop_path)
		next_tmigr = tmigr_cpu_deactivate(tevt->global);
	else
		next_tmigr = tmigr_quick_check(tevt->global);
| 2150 | |
| 2151 | /* |
| 2152 | * If the CPU is the last going idle in timer migration hierarchy, make |
| 2153 | * sure the CPU will wake up in time to handle remote timers. |
| 2154 | * next_tmigr == KTIME_MAX if other CPUs are still active. |
| 2155 | */ |
| 2156 | if (next_tmigr < tevt->local) { |
| 2157 | u64 tmp; |
| 2158 | |
| 2159 | /* If we missed a tick already, force 0 delta */ |
| 2160 | if (next_tmigr < basem) |
| 2161 | next_tmigr = basem; |
| 2162 | |
		tmp = div_u64(next_tmigr - basem, TICK_NSEC);
| 2164 | |
| 2165 | *nextevt = basej + (unsigned long)tmp; |
| 2166 | tevt->local = next_tmigr; |
| 2167 | } |
| 2168 | } |
| 2169 | # else |
| 2170 | static void timer_use_tmigr(unsigned long basej, u64 basem, |
| 2171 | unsigned long *nextevt, bool *tick_stop_path, |
| 2172 | bool timer_base_idle, struct timer_events *tevt) |
| 2173 | { |
| 2174 | /* |
| 2175 | * Make sure first event is written into tevt->local to not miss a |
| 2176 | * timer on !SMP systems. |
| 2177 | */ |
| 2178 | tevt->local = min_t(u64, tevt->local, tevt->global); |
| 2179 | } |
| 2180 | # endif /* CONFIG_SMP */ |
| 2181 | |
| 2182 | static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, |
| 2183 | bool *idle) |
| 2184 | { |
| 2185 | struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; |
| 2186 | struct timer_base *base_local, *base_global; |
| 2187 | unsigned long nextevt; |
| 2188 | bool idle_is_possible; |
| 2189 | |
| 2190 | /* |
| 2191 | * When the CPU is offline, the tick is cancelled and nothing is supposed |
| 2192 | * to try to stop it. |
| 2193 | */ |
| 2194 | if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) { |
| 2195 | if (idle) |
| 2196 | *idle = true; |
| 2197 | return tevt.local; |
| 2198 | } |
| 2199 | |
| 2200 | base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); |
| 2201 | base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); |
| 2202 | |
| 2203 | raw_spin_lock(&base_local->lock); |
| 2204 | raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); |
| 2205 | |
	nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
					     base_global, &tevt);
| 2208 | |
| 2209 | /* |
| 2210 | * If the next event is only one jiffy ahead there is no need to call |
| 2211 | * timer migration hierarchy related functions. The value for the next |
	 * global timer in the @tevt struct then equals KTIME_MAX. This is also
	 * true when the timer base is idle.
| 2214 | * |
| 2215 | * The proper timer migration hierarchy function depends on the callsite |
| 2216 | * and whether timer base is idle or not. @nextevt will be updated when |
| 2217 | * this CPU needs to handle the first timer migration hierarchy |
| 2218 | * event. See timer_use_tmigr() for detailed information. |
| 2219 | */ |
| 2220 | idle_is_possible = time_after(nextevt, basej + 1); |
| 2221 | if (idle_is_possible) |
		timer_use_tmigr(basej, basem, &nextevt, idle,
				base_local->is_idle, &tevt);
| 2224 | |
| 2225 | /* |
| 2226 | * We have a fresh next event. Check whether we can forward the |
| 2227 | * base. |
| 2228 | */ |
	__forward_timer_base(base_local, basej);
	__forward_timer_base(base_global, basej);
| 2231 | |
| 2232 | /* |
| 2233 | * Set base->is_idle only when caller is timer_base_try_to_set_idle() |
| 2234 | */ |
| 2235 | if (idle) { |
| 2236 | /* |
| 2237 | * Bases are idle if the next event is more than a tick |
| 2238 | * away. Caution: @nextevt could have changed by enqueueing a |
| 2239 | * global timer into timer migration hierarchy. Therefore a new |
| 2240 | * check is required here. |
| 2241 | * |
| 2242 | * If the base is marked idle then any timer add operation must |
| 2243 | * forward the base clk itself to keep granularity small. This |
| 2244 | * idle logic is only maintained for the BASE_LOCAL and |
| 2245 | * BASE_GLOBAL base, deferrable timers may still see large |
| 2246 | * granularity skew (by design). |
| 2247 | */ |
| 2248 | if (!base_local->is_idle && time_after(nextevt, basej + 1)) { |
| 2249 | base_local->is_idle = true; |
| 2250 | /* |
| 2251 | * Global timers queued locally while running in a task |
| 2252 | * in nohz_full mode need a self-IPI to kick reprogramming |
| 2253 | * in IRQ tail. |
| 2254 | */ |
			if (tick_nohz_full_cpu(base_local->cpu))
				base_global->is_idle = true;
			trace_timer_base_idle(true, base_local->cpu);
| 2258 | } |
| 2259 | *idle = base_local->is_idle; |
| 2260 | |
| 2261 | /* |
| 2262 | * When timer base is not set idle, undo the effect of |
| 2263 | * tmigr_cpu_deactivate() to prevent inconsistent states - active |
| 2264 | * timer base but inactive timer migration hierarchy. |
| 2265 | * |
| 2266 | * When timer base was already marked idle, nothing will be |
| 2267 | * changed here. |
| 2268 | */ |
| 2269 | if (!base_local->is_idle && idle_is_possible) |
| 2270 | tmigr_cpu_activate(); |
| 2271 | } |
| 2272 | |
| 2273 | raw_spin_unlock(&base_global->lock); |
| 2274 | raw_spin_unlock(&base_local->lock); |
| 2275 | |
	return cmp_next_hrtimer_event(basem, tevt.local);
| 2277 | } |
| 2278 | |
| 2279 | /** |
| 2280 | * get_next_timer_interrupt() - return the time (clock mono) of the next timer |
| 2281 | * @basej: base time jiffies |
| 2282 | * @basem: base time clock monotonic |
| 2283 | * |
| 2284 | * Returns the tick aligned clock monotonic time of the next pending timer or |
| 2285 | * KTIME_MAX if no timer is pending. If timer of global base was queued into |
| 2286 | * timer migration hierarchy, first global timer is not taken into account. If |
| 2287 | * it was the last CPU of timer migration hierarchy going idle, first global |
| 2288 | * event is taken into account. |
| 2289 | */ |
| 2290 | u64 get_next_timer_interrupt(unsigned long basej, u64 basem) |
| 2291 | { |
| 2292 | return __get_next_timer_interrupt(basej, basem, NULL); |
| 2293 | } |
| 2294 | |
| 2295 | /** |
| 2296 | * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases |
| 2297 | * @basej: base time jiffies |
| 2298 | * @basem: base time clock monotonic |
| 2299 | * @idle: pointer to store the value of timer_base->is_idle on return; |
| 2300 | * *idle contains the information whether tick was already stopped |
| 2301 | * |
| 2302 | * Returns the tick aligned clock monotonic time of the next pending timer or |
| 2303 | * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is |
| 2304 | * returned as well. |
| 2305 | */ |
| 2306 | u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) |
| 2307 | { |
| 2308 | if (*idle) |
| 2309 | return KTIME_MAX; |
| 2310 | |
| 2311 | return __get_next_timer_interrupt(basej, basem, idle); |
| 2312 | } |
| 2313 | |
| 2314 | /** |
| 2315 | * timer_clear_idle - Clear the idle state of the timer base |
| 2316 | * |
| 2317 | * Called with interrupts disabled |
| 2318 | */ |
| 2319 | void timer_clear_idle(void) |
| 2320 | { |
| 2321 | /* |
| 2322 | * We do this unlocked. The worst outcome is a remote pinned timer |
| 2323 | * enqueue sending a pointless IPI, but taking the lock would just |
| 2324 | * make the window for sending the IPI a few instructions smaller |
| 2325 | * for the cost of taking the lock in the exit from idle |
| 2326 | * path. Required for BASE_LOCAL only. |
| 2327 | */ |
| 2328 | __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); |
| 2329 | if (tick_nohz_full_cpu(smp_processor_id())) |
| 2330 | __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); |
	trace_timer_base_idle(false, smp_processor_id());
| 2332 | |
| 2333 | /* Activate without holding the timer_base->lock */ |
| 2334 | tmigr_cpu_activate(); |
| 2335 | } |
| 2336 | #endif |
| 2337 | |
| 2338 | /** |
| 2339 | * __run_timers - run all expired timers (if any) on this CPU. |
| 2340 | * @base: the timer vector to be processed. |
| 2341 | */ |
| 2342 | static inline void __run_timers(struct timer_base *base) |
| 2343 | { |
| 2344 | struct hlist_head heads[LVL_DEPTH]; |
| 2345 | int levels; |
| 2346 | |
| 2347 | lockdep_assert_held(&base->lock); |
| 2348 | |
| 2349 | if (base->running_timer) |
| 2350 | return; |
| 2351 | |
| 2352 | while (time_after_eq(jiffies, base->clk) && |
| 2353 | time_after_eq(jiffies, base->next_expiry)) { |
| 2354 | levels = collect_expired_timers(base, heads); |
| 2355 | /* |
| 2356 | * The two possible reasons for not finding any expired |
| 2357 | * timer at this clk are that all matching timers have been |
| 2358 | * dequeued or no timer has been queued since |
| 2359 | * base::next_expiry was set to base::clk + |
| 2360 | * TIMER_NEXT_MAX_DELTA. |
| 2361 | */ |
| 2362 | WARN_ON_ONCE(!levels && !base->next_expiry_recalc |
| 2363 | && base->timers_pending); |
| 2364 | /* |
| 2365 | * While executing timers, base->clk is set 1 offset ahead of |
| 2366 | * jiffies to avoid endless requeuing to current jiffies. |
| 2367 | */ |
| 2368 | base->clk++; |
| 2369 | timer_recalc_next_expiry(base); |
| 2370 | |
| 2371 | while (levels--) |
			expire_timers(base, heads + levels);
| 2373 | } |
| 2374 | } |
| 2375 | |
| 2376 | static void __run_timer_base(struct timer_base *base) |
| 2377 | { |
| 2378 | /* Can race against a remote CPU updating next_expiry under the lock */ |
| 2379 | if (time_before(jiffies, READ_ONCE(base->next_expiry))) |
| 2380 | return; |
| 2381 | |
| 2382 | timer_base_lock_expiry(base); |
| 2383 | raw_spin_lock_irq(&base->lock); |
| 2384 | __run_timers(base); |
| 2385 | raw_spin_unlock_irq(&base->lock); |
| 2386 | timer_base_unlock_expiry(base); |
| 2387 | } |
| 2388 | |
| 2389 | static void run_timer_base(int index) |
| 2390 | { |
| 2391 | struct timer_base *base = this_cpu_ptr(&timer_bases[index]); |
| 2392 | |
| 2393 | __run_timer_base(base); |
| 2394 | } |
| 2395 | |
| 2396 | /* |
| 2397 | * This function runs timers and the timer-tq in bottom half context. |
| 2398 | */ |
| 2399 | static __latent_entropy void run_timer_softirq(void) |
| 2400 | { |
| 2401 | run_timer_base(BASE_LOCAL); |
| 2402 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { |
| 2403 | run_timer_base(BASE_GLOBAL); |
| 2404 | run_timer_base(BASE_DEF); |
| 2405 | |
| 2406 | if (is_timers_nohz_active()) |
| 2407 | tmigr_handle_remote(); |
| 2408 | } |
| 2409 | } |
| 2410 | |
| 2411 | /* |
| 2412 | * Called by the local, per-CPU timer interrupt on SMP. |
| 2413 | */ |
| 2414 | static void run_local_timers(void) |
| 2415 | { |
| 2416 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); |
| 2417 | |
| 2418 | hrtimer_run_queues(); |
| 2419 | |
| 2420 | for (int i = 0; i < NR_BASES; i++, base++) { |
| 2421 | /* |
| 2422 | * Raise the softirq only if required. |
| 2423 | * |
| 2424 | * timer_base::next_expiry can be written by a remote CPU while |
		 * holding the lock. If this write happens at the same time as
		 * the lockless local read, a sanity checker could complain about
| 2427 | * data corruption. |
| 2428 | * |
| 2429 | * There are two possible situations where |
| 2430 | * timer_base::next_expiry is written by a remote CPU: |
| 2431 | * |
| 2432 | * 1. Remote CPU expires global timers of this CPU and updates |
| 2433 | * timer_base::next_expiry of BASE_GLOBAL afterwards in |
| 2434 | * next_timer_interrupt() or timer_recalc_next_expiry(). The |
| 2435 | * worst outcome is a superfluous raise of the timer softirq |
| 2436 | * when the not yet updated value is read. |
| 2437 | * |
| 2438 | * 2. A new first pinned timer is enqueued by a remote CPU |
| 2439 | * and therefore timer_base::next_expiry of BASE_LOCAL is |
| 2440 | * updated. When this update is missed, this isn't a |
| 2441 | * problem, as an IPI is executed nevertheless when the CPU |
| 2442 | * was idle before. When the CPU wasn't idle but the update |
| 2443 | * is missed, then the timer would expire one jiffy late - |
| 2444 | * bad luck. |
| 2445 | * |
		 * Those unlikely corner cases, where the worst outcome is only
		 * a one-jiffy delay or a superfluous raise of the softirq, are
		 * cheaper than always doing the check while holding the lock.
		 *
		 * Possible remote writers use WRITE_ONCE(). The local reader
		 * therefore uses READ_ONCE().
| 2453 | */ |
| 2454 | if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || |
| 2455 | (i == BASE_DEF && tmigr_requires_handle_remote())) { |
			raise_timer_softirq(TIMER_SOFTIRQ);
| 2457 | return; |
| 2458 | } |
| 2459 | } |
| 2460 | } |
| 2461 | |
| 2462 | /* |
| 2463 | * Called from the timer interrupt handler to charge one tick to the current |
| 2464 | * process. user_tick is 1 if the tick is user time, 0 for system. |
| 2465 | */ |
| 2466 | void update_process_times(int user_tick) |
| 2467 | { |
| 2468 | struct task_struct *p = current; |
| 2469 | |
| 2470 | /* Note: this timer irq context must be accounted for as well. */ |
	account_process_tick(p, user_tick);
	run_local_timers();
	rcu_sched_clock_irq(user_tick);
| 2474 | #ifdef CONFIG_IRQ_WORK |
| 2475 | if (in_irq()) |
| 2476 | irq_work_tick(); |
| 2477 | #endif |
| 2478 | sched_tick(); |
| 2479 | if (IS_ENABLED(CONFIG_POSIX_TIMERS)) |
| 2480 | run_posix_cpu_timers(); |
| 2481 | } |
| 2482 | |
| 2483 | #ifdef CONFIG_HOTPLUG_CPU |
| 2484 | static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) |
| 2485 | { |
| 2486 | struct timer_list *timer; |
| 2487 | int cpu = new_base->cpu; |
| 2488 | |
	while (!hlist_empty(head)) {
		timer = hlist_entry(head->first, struct timer_list, entry);
		detach_timer(timer, false);
		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
		internal_add_timer(new_base, timer);
| 2494 | } |
| 2495 | } |
| 2496 | |
| 2497 | int timers_prepare_cpu(unsigned int cpu) |
| 2498 | { |
| 2499 | struct timer_base *base; |
| 2500 | int b; |
| 2501 | |
| 2502 | for (b = 0; b < NR_BASES; b++) { |
| 2503 | base = per_cpu_ptr(&timer_bases[b], cpu); |
| 2504 | base->clk = jiffies; |
| 2505 | base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; |
| 2506 | base->next_expiry_recalc = false; |
| 2507 | base->timers_pending = false; |
| 2508 | base->is_idle = false; |
| 2509 | } |
| 2510 | return 0; |
| 2511 | } |
| 2512 | |
| 2513 | int timers_dead_cpu(unsigned int cpu) |
| 2514 | { |
| 2515 | struct timer_base *old_base; |
| 2516 | struct timer_base *new_base; |
| 2517 | int b, i; |
| 2518 | |
| 2519 | for (b = 0; b < NR_BASES; b++) { |
| 2520 | old_base = per_cpu_ptr(&timer_bases[b], cpu); |
| 2521 | new_base = get_cpu_ptr(&timer_bases[b]); |
| 2522 | /* |
| 2523 | * The caller is globally serialized and nobody else |
		 * takes two locks at once, so deadlock is not possible.
| 2525 | */ |
| 2526 | raw_spin_lock_irq(&new_base->lock); |
| 2527 | raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); |
| 2528 | |
| 2529 | /* |
		 * The current CPU's base clock might be stale. Update it
| 2531 | * before moving the timers over. |
| 2532 | */ |
		forward_timer_base(new_base);
| 2534 | |
| 2535 | WARN_ON_ONCE(old_base->running_timer); |
| 2536 | old_base->running_timer = NULL; |
| 2537 | |
| 2538 | for (i = 0; i < WHEEL_SIZE; i++) |
			migrate_timer_list(new_base, old_base->vectors + i);
| 2540 | |
| 2541 | raw_spin_unlock(&old_base->lock); |
| 2542 | raw_spin_unlock_irq(&new_base->lock); |
| 2543 | put_cpu_ptr(&timer_bases); |
| 2544 | } |
| 2545 | return 0; |
| 2546 | } |
| 2547 | |
| 2548 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 2549 | |
| 2550 | static void __init init_timer_cpu(int cpu) |
| 2551 | { |
| 2552 | struct timer_base *base; |
| 2553 | int i; |
| 2554 | |
| 2555 | for (i = 0; i < NR_BASES; i++) { |
| 2556 | base = per_cpu_ptr(&timer_bases[i], cpu); |
| 2557 | base->cpu = cpu; |
| 2558 | raw_spin_lock_init(&base->lock); |
| 2559 | base->clk = jiffies; |
| 2560 | base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; |
| 2561 | timer_base_init_expiry_lock(base); |
| 2562 | } |
| 2563 | } |
| 2564 | |
| 2565 | static void __init init_timer_cpus(void) |
| 2566 | { |
| 2567 | int cpu; |
| 2568 | |
| 2569 | for_each_possible_cpu(cpu) |
| 2570 | init_timer_cpu(cpu); |
| 2571 | } |
| 2572 | |
| 2573 | void __init timers_init(void) |
| 2574 | { |
| 2575 | init_timer_cpus(); |
| 2576 | posix_cputimers_init_work(); |
	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
| 2578 | } |
| 2579 | |