tick-sched.c source code [linux/kernel/time/tick-sched.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
4	* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
5	* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
6	*
7	* NOHZ implementation for low and high resolution timers
8	*
9	* Started by: Thomas Gleixner and Ingo Molnar
10	*/
11	#include <linux/compiler.h>
12	#include <linux/cpu.h>
13	#include <linux/err.h>
14	#include <linux/hrtimer.h>
15	#include <linux/interrupt.h>
16	#include <linux/kernel_stat.h>
17	#include <linux/percpu.h>
18	#include <linux/nmi.h>
19	#include <linux/profile.h>
20	#include <linux/sched/signal.h>
21	#include <linux/sched/clock.h>
22	#include <linux/sched/stat.h>
23	#include <linux/sched/nohz.h>
24	#include <linux/sched/loadavg.h>
25	#include <linux/module.h>
26	#include <linux/irq_work.h>
27	#include <linux/posix-timers.h>
28	#include <linux/context_tracking.h>
29	#include <linux/mm.h>
30
31	#include <asm/irq_regs.h>
32
33	#include "tick-internal.h"
34
35	#include <trace/events/timer.h>
36
37	/*
38	* Per-CPU nohz control structure
39	*/
40	static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
41
42	struct tick_sched tick_get_tick_sched(int* cpu)
43	{
44	return &per_cpu(tick_cpu_sched, cpu);
45	}
46
47	/*
48	* The time when the last jiffy update happened. Write access must hold
49	* jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
50	* consistent view of jiffies and last_jiffies_update.
51	*/
52	static ktime_t last_jiffies_update;
53
54	/*
55	* Must be called with interrupts disabled !
56	*/
57	static void tick_do_update_jiffies64(ktime_t now)
58	{
59	unsigned long ticks = `1`;
60	ktime_t delta, nextp;
61
62	/*
63	* 64-bit can do a quick check without holding the jiffies lock and
64	* without looking at the sequence count. The smp_load_acquire()
65	* pairs with the update done later in this function.
66	*
67	* 32-bit cannot do that because the store of 'tick_next_period'
68	* consists of two 32-bit stores, and the first store could be
69	* moved by the CPU to a random point in the future.
70	*/
71	if (IS_ENABLED(CONFIG_64BIT)) {
72	if (ktime_before(cmp1: now, smp_load_acquire(&tick_next_period)))
73	return;
74	} else {
75	unsigned int seq;
76
77	/*
78	* Avoid contention on 'jiffies_lock' and protect the quick
79	* check with the sequence count.
80	*/
81	do {
82	seq = read_seqcount_begin(&jiffies_seq);
83	nextp = tick_next_period;
84	} while (read_seqcount_retry(&jiffies_seq, seq));
85
86	if (ktime_before(cmp1: now, cmp2: nextp))
87	return;
88	}
89
90	/ Quick check failed, i.e. update is required. /
91	raw_spin_lock(&jiffies_lock);
92	/*
93	* Re-evaluate with the lock held. Another CPU might have done the
94	* update already.
95	*/
96	if (ktime_before(cmp1: now, cmp2: tick_next_period)) {
97	raw_spin_unlock(&jiffies_lock);
98	return;
99	}
100
101	write_seqcount_begin(&jiffies_seq);
102
103	delta = ktime_sub(now, tick_next_period);
104	if (unlikely(delta >= TICK_NSEC)) {
105	/ Slow path for long idle sleep times /
106	s64 incr = TICK_NSEC;
107
108	ticks += ktime_divns(kt: delta, div: incr);
109
110	last_jiffies_update = ktime_add_ns(last_jiffies_update,
111	incr * ticks);
112	} else {
113	last_jiffies_update = ktime_add_ns(last_jiffies_update,
114	TICK_NSEC);
115	}
116
117	/ Advance jiffies to complete the 'jiffies_seq' protected job /
118	jiffies_64 += ticks;
119
120	/ Keep the tick_next_period variable up to date /
121	nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
122
123	if (IS_ENABLED(CONFIG_64BIT)) {
124	/*
125	* Pairs with smp_load_acquire() in the lockless quick
126	* check above, and ensures that the update to 'jiffies_64' is
127	* not reordered vs. the store to 'tick_next_period', neither
128	* by the compiler nor by the CPU.
129	*/
130	smp_store_release(&tick_next_period, nextp);
131	} else {
132	/*
133	* A plain store is good enough on 32-bit, as the quick check
134	* above is protected by the sequence count.
135	*/
136	tick_next_period = nextp;
137	}
138
139	/*
140	* Release the sequence count. calc_global_load() below is not
141	* protected by it, but 'jiffies_lock' needs to be held to prevent
142	* concurrent invocations.
143	*/
144	write_seqcount_end(&jiffies_seq);
145
146	calc_global_load();
147
148	raw_spin_unlock(&jiffies_lock);
149	update_wall_time();
150	}
151
152	/*
153	* Initialize and return retrieve the jiffies update.
154	*/
155	static ktime_t tick_init_jiffy_update(void)
156	{
157	ktime_t period;
158
159	raw_spin_lock(&jiffies_lock);
160	write_seqcount_begin(&jiffies_seq);
161
162	/ Have we started the jiffies update yet ? /
163	if (last_jiffies_update == `0`) {
164	u32 rem;
165
166	/*
167	* Ensure that the tick is aligned to a multiple of
168	* TICK_NSEC.
169	*/
170	div_u64_rem(dividend: tick_next_period, TICK_NSEC, remainder: &rem);
171	if (rem)
172	tick_next_period += TICK_NSEC - rem;
173
174	last_jiffies_update = tick_next_period;
175	}
176	period = last_jiffies_update;
177
178	write_seqcount_end(&jiffies_seq);
179	raw_spin_unlock(&jiffies_lock);
180
181	return period;
182	}
183
184	static inline int tick_sched_flag_test(struct tick_sched *ts,
185	unsigned long flag)
186	{
187	return !!(ts->flags & flag);
188	}
189
190	static inline void tick_sched_flag_set(struct tick_sched *ts,
191	unsigned long flag)
192	{
193	lockdep_assert_irqs_disabled();
194	ts->flags \|= flag;
195	}
196
197	static inline void tick_sched_flag_clear(struct tick_sched *ts,
198	unsigned long flag)
199	{
200	lockdep_assert_irqs_disabled();
201	ts->flags &= ~flag;
202	}
203
204	/*
205	* Allow only one non-timekeeper CPU at a time update jiffies from
206	* the timer tick.
207	*
208	* Returns true if update was run.
209	*/
210	static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now)
211	{
212	static atomic_t in_progress;
213	int inp;
214
215	inp = atomic_read(v: &in_progress);
216	if (inp \|\| !atomic_try_cmpxchg(v: &in_progress, old: &inp, new: `1`))
217	return false;
218
219	if (ts->last_tick_jiffies == jiffies)
220	tick_do_update_jiffies64(now);
221	atomic_set(v: &in_progress, i: `0`);
222	return true;
223	}
224
225	#define MAX_STALLED_JIFFIES 5
226
227	static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
228	{
229	int tick_cpu, cpu = smp_processor_id();
230
231	/*
232	* Check if the do_timer duty was dropped. We don't care about
233	* concurrency: This happens only when the CPU in charge went
234	* into a long sleep. If two CPUs happen to assign themselves to
235	* this duty, then the jiffies update is still serialized by
236	* 'jiffies_lock'.
237	*
238	* If nohz_full is enabled, this should not happen because the
239	* 'tick_do_timer_cpu' CPU never relinquishes.
240	*/
241	tick_cpu = READ_ONCE(tick_do_timer_cpu);
242
243	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
244	#ifdef CONFIG_NO_HZ_FULL
245	WARN_ON_ONCE(tick_nohz_full_running);
246	#endif
247	WRITE_ONCE(tick_do_timer_cpu, cpu);
248	tick_cpu = cpu;
249	}
250
251	/ Check if jiffies need an update /
252	if (tick_cpu == cpu)
253	tick_do_update_jiffies64(now);
254
255	/*
256	* If the jiffies update stalled for too long (timekeeper in stop_machine()
257	* or VMEXIT'ed for several msecs), force an update.
258	*/
259	if (ts->last_tick_jiffies != jiffies) {
260	ts->stalled_jiffies = `0`;
261	ts->last_tick_jiffies = READ_ONCE(jiffies);
262	} else {
263	if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
264	if (tick_limited_update_jiffies64(ts, now)) {
265	ts->stalled_jiffies = `0`;
266	ts->last_tick_jiffies = READ_ONCE(jiffies);
267	}
268	}
269	}
270
271	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
272	ts->got_idle_tick = `1`;
273	}
274
275	static void tick_sched_handle(struct tick_sched ts, struct* pt_regs *regs)
276	{
277	/*
278	* When we are idle and the tick is stopped, we have to touch
279	* the watchdog as we might not schedule for a really long
280	* time. This happens on completely idle SMP systems while
281	* waiting on the login prompt. We also increment the "start of
282	* idle" jiffy stamp so the idle accounting adjustment we do
283	* when we go busy again does not account too many ticks.
284	*/
285	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
286	tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
287	touch_softlockup_watchdog_sched();
288	if (is_idle_task(current))
289	ts->idle_jiffies++;
290	/*
291	* In case the current tick fired too early past its expected
292	* expiration, make sure we don't bypass the next clock reprogramming
293	* to the same deadline.
294	*/
295	ts->next_tick = `0`;
296	}
297
298	update_process_times(user: user_mode(regs));
299	profile_tick(CPU_PROFILING);
300	}
301
302	/*
303	* We rearm the timer until we get disabled by the idle code.
304	* Called with interrupts disabled.
305	*/
306	static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
307	{
308	struct tick_sched ts = container_of(timer, struct* tick_sched, sched_timer);
309	struct pt_regs *regs = get_irq_regs();
310	ktime_t now = ktime_get();
311
312	tick_sched_do_timer(ts, now);
313
314	/*
315	* Do not call when we are not in IRQ context and have
316	* no valid 'regs' pointer
317	*/
318	if (regs)
319	tick_sched_handle(ts, regs);
320	else
321	ts->next_tick = `0`;
322
323	/*
324	* In dynticks mode, tick reprogram is deferred:
325	* - to the idle task if in dynticks-idle
326	* - to IRQ exit if in full-dynticks.
327	*/
328	if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
329	return HRTIMER_NORESTART;
330
331	hrtimer_forward(timer, now, TICK_NSEC);
332
333	return HRTIMER_RESTART;
334	}
335
336	#ifdef CONFIG_NO_HZ_FULL
337	cpumask_var_t tick_nohz_full_mask;
338	EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
339	bool tick_nohz_full_running;
340	EXPORT_SYMBOL_GPL(tick_nohz_full_running);
341	static atomic_t tick_dep_mask;
342
343	static bool check_tick_dependency(atomic_t *dep)
344	{
345	int val = atomic_read(dep);
346
347	if (val & TICK_DEP_MASK_POSIX_TIMER) {
348	trace_tick_stop(`0`, TICK_DEP_MASK_POSIX_TIMER);
349	return true;
350	}
351
352	if (val & TICK_DEP_MASK_PERF_EVENTS) {
353	trace_tick_stop(`0`, TICK_DEP_MASK_PERF_EVENTS);
354	return true;
355	}
356
357	if (val & TICK_DEP_MASK_SCHED) {
358	trace_tick_stop(`0`, TICK_DEP_MASK_SCHED);
359	return true;
360	}
361
362	if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
363	trace_tick_stop(`0`, TICK_DEP_MASK_CLOCK_UNSTABLE);
364	return true;
365	}
366
367	if (val & TICK_DEP_MASK_RCU) {
368	trace_tick_stop(`0`, TICK_DEP_MASK_RCU);
369	return true;
370	}
371
372	if (val & TICK_DEP_MASK_RCU_EXP) {
373	trace_tick_stop(`0`, TICK_DEP_MASK_RCU_EXP);
374	return true;
375	}
376
377	return false;
378	}
379
380	static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
381	{
382	lockdep_assert_irqs_disabled();
383
384	if (unlikely(!cpu_online(cpu)))
385	return false;
386
387	if (check_tick_dependency(&tick_dep_mask))
388	return false;
389
390	if (check_tick_dependency(&ts->tick_dep_mask))
391	return false;
392
393	if (check_tick_dependency(&current->tick_dep_mask))
394	return false;
395
396	if (check_tick_dependency(&current->signal->tick_dep_mask))
397	return false;
398
399	return true;
400	}
401
402	static void nohz_full_kick_func(struct irq_work *work)
403	{
404	/ Empty, the tick restart happens on tick_nohz_irq_exit() /
405	}
406
407	static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
408	IRQ_WORK_INIT_HARD(nohz_full_kick_func);
409
410	/*
411	* Kick this CPU if it's full dynticks in order to force it to
412	* re-evaluate its dependency on the tick and restart it if necessary.
413	* This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
414	* is NMI safe.
415	*/
416	static void tick_nohz_full_kick(void)
417	{
418	if (!tick_nohz_full_cpu(smp_processor_id()))
419	return;
420
421	irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
422	}
423
424	/*
425	* Kick the CPU if it's full dynticks in order to force it to
426	* re-evaluate its dependency on the tick and restart it if necessary.
427	*/
428	void tick_nohz_full_kick_cpu(int cpu)
429	{
430	if (!tick_nohz_full_cpu(cpu))
431	return;
432
433	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
434	}
435
/*
 * Kick the CPU @tsk is on so that it notices a freshly set tick
 * dependency. Nothing is done when the task is not on a runqueue or its
 * CPU is offline.
 */
static void tick_nohz_kick_task(struct task_struct *tsk)
{
	int target;

	/*
	 * If the task is not running, run_posix_cpu_timers()
	 * has nothing to elapse, and an IPI can then be optimized out.
	 *
	 * activate_task()                      STORE p->tick_dep_mask
	 *   STORE p->on_rq
	 * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
	 *   LOCK rq->lock                      LOAD p->on_rq
	 *   smp_mb__after_spin_lock()
	 *   tick_nohz_task_switch()
	 *     LOAD p->tick_dep_mask
	 *
	 * XXX given a task picks up the dependency on schedule(), should we
	 * only care about tasks that are currently on the CPU instead of all
	 * that are on the runqueue?
	 *
	 * That is, does this want to be: task_on_cpu() / task_curr()?
	 */
	if (!sched_task_on_rq(tsk))
		return;

	/*
	 * If the task concurrently migrates to another CPU,
	 * we guarantee it sees the new tick dependency upon
	 * schedule.
	 *
	 * set_task_cpu(p, cpu);
	 *   STORE p->cpu = @cpu
	 * __schedule() (switch to task 'p')
	 *   LOCK rq->lock
	 *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
	 *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
	 *      LOAD p->tick_dep_mask           LOAD p->cpu
	 */
	target = task_cpu(tsk);

	preempt_disable();
	if (cpu_online(target))
		tick_nohz_full_kick_cpu(target);
	preempt_enable();
}
481
482	/*
483	* Kick all full dynticks CPUs in order to force these to re-evaluate
484	* their dependency on the tick and restart it if necessary.
485	*/
486	static void tick_nohz_full_kick_all(void)
487	{
488	int cpu;
489
490	if (!tick_nohz_full_running)
491	return;
492
493	preempt_disable();
494	for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
495	tick_nohz_full_kick_cpu(cpu);
496	preempt_enable();
497	}
498
499	static void tick_nohz_dep_set_all(atomic_t *dep,
500	enum tick_dep_bits bit)
501	{
502	int prev;
503
504	prev = atomic_fetch_or(BIT(bit), dep);
505	if (!prev)
506	tick_nohz_full_kick_all();
507	}
508
509	/*
510	* Set a global tick dependency. Used by perf events that rely on freq and
511	* unstable clocks.
512	*/
513	void tick_nohz_dep_set(enum tick_dep_bits bit)
514	{
515	tick_nohz_dep_set_all(&tick_dep_mask, bit);
516	}
517
518	void tick_nohz_dep_clear(enum tick_dep_bits bit)
519	{
520	atomic_andnot(BIT(bit), &tick_dep_mask);
521	}
522
523	/*
524	* Set per-CPU tick dependency. Used by scheduler and perf events in order to
525	* manage event-throttling.
526	*/
527	void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
528	{
529	int prev;
530	struct tick_sched *ts;
531
532	ts = per_cpu_ptr(&tick_cpu_sched, cpu);
533
534	prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
535	if (!prev) {
536	preempt_disable();
537	/ Perf needs local kick that is NMI safe /
538	if (cpu == smp_processor_id()) {
539	tick_nohz_full_kick();
540	} else {
541	/ Remote IRQ work not NMI-safe /
542	if (!WARN_ON_ONCE(in_nmi()))
543	tick_nohz_full_kick_cpu(cpu);
544	}
545	preempt_enable();
546	}
547	}
548	EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
549
550	void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
551	{
552	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
553
554	atomic_andnot(BIT(bit), &ts->tick_dep_mask);
555	}
556	EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
557
558	/*
559	* Set a per-task tick dependency. RCU needs this. Also posix CPU timers
560	* in order to elapse per task timers.
561	*/
562	void tick_nohz_dep_set_task(struct task_struct tsk, enum* tick_dep_bits bit)
563	{
564	if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
565	tick_nohz_kick_task(tsk);
566	}
567	EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
568
569	void tick_nohz_dep_clear_task(struct task_struct tsk, enum* tick_dep_bits bit)
570	{
571	atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
572	}
573	EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
574
575	/*
576	* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
577	* per process timers.
578	*/
579	void tick_nohz_dep_set_signal(struct task_struct *tsk,
580	enum tick_dep_bits bit)
581	{
582	int prev;
583	struct signal_struct *sig = tsk->signal;
584
585	prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
586	if (!prev) {
587	struct task_struct *t;
588
589	lockdep_assert_held(&tsk->sighand->siglock);
590	__for_each_thread(sig, t)
591	tick_nohz_kick_task(t);
592	}
593	}
594
595	void tick_nohz_dep_clear_signal(struct signal_struct sig, enum* tick_dep_bits bit)
596	{
597	atomic_andnot(BIT(bit), &sig->tick_dep_mask);
598	}
599
600	/*
601	* Re-evaluate the need for the tick as we switch the current task.
602	* It might need the tick due to per task/process properties:
603	* perf events, posix CPU timers, ...
604	*/
605	void __tick_nohz_task_switch(void)
606	{
607	struct tick_sched *ts;
608
609	if (!tick_nohz_full_cpu(smp_processor_id()))
610	return;
611
612	ts = this_cpu_ptr(&tick_cpu_sched);
613
614	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
615	if (atomic_read(&current->tick_dep_mask) \|\|
616	atomic_read(&current->signal->tick_dep_mask))
617	tick_nohz_full_kick();
618	}
619	}
620
621	/ Get the boot-time nohz CPU list from the kernel parameters. /
622	void __init tick_nohz_full_setup(cpumask_var_t cpumask)
623	{
624	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
625	cpumask_copy(tick_nohz_full_mask, cpumask);
626	tick_nohz_full_running = true;
627	}
628
629	bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
630	{
631	/*
632	* The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
633	* timers, workqueues, timekeeping, ...) on behalf of full dynticks
634	* CPUs. It must remain online when nohz full is enabled.
635	*/
636	if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu)
637	return false;
638	return true;
639	}
640
641	static int tick_nohz_cpu_down(unsigned int cpu)
642	{
643	return tick_nohz_cpu_hotpluggable(cpu) ? `0` : -EBUSY;
644	}
645
646	void __init tick_nohz_init(void)
647	{
648	int cpu, ret;
649
650	if (!tick_nohz_full_running)
651	return;
652
653	/*
654	* Full dynticks uses IRQ work to drive the tick rescheduling on safe
655	* locking contexts. But then we need IRQ work to raise its own
656	* interrupts to avoid circular dependency on the tick.
657	*/
658	if (!arch_irq_work_has_interrupt()) {
659	pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
660	cpumask_clear(tick_nohz_full_mask);
661	tick_nohz_full_running = false;
662	return;
663	}
664
665	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
666	!IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
667	cpu = smp_processor_id();
668
669	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
670	pr_warn("NO_HZ: Clearing %d from nohz_full range "
671	"for timekeeping\n", cpu);
672	cpumask_clear_cpu(cpu, tick_nohz_full_mask);
673	}
674	}
675
676	for_each_cpu(cpu, tick_nohz_full_mask)
677	ct_cpu_track_user(cpu);
678
679	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
680	"kernel/nohz:predown", NULL,
681	tick_nohz_cpu_down);
682	WARN_ON(ret < `0`);
683	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
684	cpumask_pr_args(tick_nohz_full_mask));
685	}
686	#endif /* #ifdef CONFIG_NO_HZ_FULL */
687
688	/*
689	* NOHZ - aka dynamic tick functionality
690	*/
691	#ifdef CONFIG_NO_HZ_COMMON
692	/*
693	* NO HZ enabled ?
694	*/
695	bool tick_nohz_enabled __read_mostly = true;
696	unsigned long tick_nohz_active __read_mostly;
697	/*
698	* Enable / Disable tickless mode
699	*/
700	static int __init setup_tick_nohz(char *str)
701	{
702	return (kstrtobool(s: str, res: &tick_nohz_enabled) == `0`);
703	}
704
705	__setup("nohz=", setup_tick_nohz);
706
707	bool tick_nohz_tick_stopped(void)
708	{
709	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
710
711	return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
712	}
713
714	bool tick_nohz_tick_stopped_cpu(int cpu)
715	{
716	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
717
718	return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
719	}
720
721	/**
722	* tick_nohz_update_jiffies - update jiffies when idle was interrupted
723	* @now: current ktime_t
724	*
725	* Called from interrupt entry when the CPU was idle
726	*
727	* In case the sched_tick was stopped on this CPU, we have to check if jiffies
728	* must be updated. Otherwise an interrupt handler could use a stale jiffy
729	* value. We do this unconditionally on any CPU, as we don't know whether the
730	* CPU, which has the update task assigned, is in a long sleep.
731	*/
732	static void tick_nohz_update_jiffies(ktime_t now)
733	{
734	unsigned long flags;
735
736	__this_cpu_write(tick_cpu_sched.idle_waketime, now);
737
738	local_irq_save(flags);
739	tick_do_update_jiffies64(now);
740	local_irq_restore(flags);
741
742	touch_softlockup_watchdog_sched();
743	}
744
745	static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
746	{
747	ktime_t delta;
748
749	if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
750	return;
751
752	delta = ktime_sub(now, ts->idle_entrytime);
753
754	write_seqcount_begin(&ts->idle_sleeptime_seq);
755	if (nr_iowait_cpu(smp_processor_id()) > `0`)
756	ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
757	else
758	ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
759
760	ts->idle_entrytime = now;
761	tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
762	write_seqcount_end(&ts->idle_sleeptime_seq);
763
764	sched_clock_idle_wakeup_event();
765	}
766
767	static void tick_nohz_start_idle(struct tick_sched *ts)
768	{
769	write_seqcount_begin(&ts->idle_sleeptime_seq);
770	ts->idle_entrytime = ktime_get();
771	tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
772	write_seqcount_end(&ts->idle_sleeptime_seq);
773
774	sched_clock_idle_sleep_event();
775	}
776
777	static u64 get_cpu_sleep_time_us(struct tick_sched ts, ktime_t sleeptime,
778	bool compute_delta, u64 *last_update_time)
779	{
780	ktime_t now, idle;
781	unsigned int seq;
782
783	if (!tick_nohz_active)
784	return -`1`;
785
786	now = ktime_get();
787	if (last_update_time)
788	*last_update_time = ktime_to_us(kt: now);
789
790	do {
791	seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
792
793	if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
794	ktime_t delta = ktime_sub(now, ts->idle_entrytime);
795
796	idle = ktime_add(*sleeptime, delta);
797	} else {
798	idle = *sleeptime;
799	}
800	} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
801
802	return ktime_to_us(kt: idle);
803
804	}
805
806	/**
807	* get_cpu_idle_time_us - get the total idle time of a CPU
808	* @cpu: CPU number to query
809	* @last_update_time: variable to store update time in. Do not update
810	* counters if NULL.
811	*
812	* Return the cumulative idle time (since boot) for a given
813	* CPU, in microseconds. Note that this is partially broken due to
814	* the counter of iowait tasks that can be remotely updated without
815	* any synchronization. Therefore it is possible to observe backward
816	* values within two consecutive reads.
817	*
818	* This time is measured via accounting rather than sampling,
819	* and is as accurate as ktime_get() is.
820	*
821	* Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
822	*/
823	u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
824	{
825	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
826
827	return get_cpu_sleep_time_us(ts, sleeptime: &ts->idle_sleeptime,
828	compute_delta: !nr_iowait_cpu(cpu), last_update_time);
829	}
830	EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
831
832	/**
833	* get_cpu_iowait_time_us - get the total iowait time of a CPU
834	* @cpu: CPU number to query
835	* @last_update_time: variable to store update time in. Do not update
836	* counters if NULL.
837	*
838	* Return the cumulative iowait time (since boot) for a given
839	* CPU, in microseconds. Note this is partially broken due to
840	* the counter of iowait tasks that can be remotely updated without
841	* any synchronization. Therefore it is possible to observe backward
842	* values within two consecutive reads.
843	*
844	* This time is measured via accounting rather than sampling,
845	* and is as accurate as ktime_get() is.
846	*
847	* Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
848	*/
849	u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
850	{
851	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
852
853	return get_cpu_sleep_time_us(ts, sleeptime: &ts->iowait_sleeptime,
854	compute_delta: nr_iowait_cpu(cpu), last_update_time);
855	}
856	EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
857
858	static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
859	{
860	hrtimer_cancel(timer: &ts->sched_timer);
861	hrtimer_set_expires(timer: &ts->sched_timer, time: ts->last_tick);
862
863	/ Forward the time to expire in the future /
864	hrtimer_forward(timer: &ts->sched_timer, now, TICK_NSEC);
865
866	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
867	hrtimer_start_expires(timer: &ts->sched_timer,
868	mode: HRTIMER_MODE_ABS_PINNED_HARD);
869	} else {
870	tick_program_event(expires: hrtimer_get_expires(timer: &ts->sched_timer), force: `1`);
871	}
872
873	/*
874	* Reset to make sure the next tick stop doesn't get fooled by past
875	* cached clock deadline.
876	*/
877	ts->next_tick = `0`;
878	}
879
880	static inline bool local_timer_softirq_pending(void)
881	{
882	return local_timers_pending() & BIT(TIMER_SOFTIRQ);
883	}
884
885	/*
886	* Read jiffies and the time when jiffies were updated last
887	*/
888	u64 get_jiffies_update(unsigned long *basej)
889	{
890	unsigned long basejiff;
891	unsigned int seq;
892	u64 basemono;
893
894	do {
895	seq = read_seqcount_begin(&jiffies_seq);
896	basemono = last_jiffies_update;
897	basejiff = jiffies;
898	} while (read_seqcount_retry(&jiffies_seq, seq));
899	*basej = basejiff;
900	return basemono;
901	}
902
903	/**
904	* tick_nohz_next_event() - return the clock monotonic based next event
905	* @ts: pointer to tick_sched struct
906	* @cpu: CPU number
907	*
908	* Return:
909	* *%0 - When the next event is a maximum of TICK_NSEC in the future
910	* and the tick is not stopped yet
911	* *%next_event - Next event based on clock monotonic
912	*/
913	static ktime_t tick_nohz_next_event(struct tick_sched ts, int* cpu)
914	{
915	u64 basemono, next_tick, delta, expires;
916	unsigned long basejiff;
917	int tick_cpu;
918
919	basemono = get_jiffies_update(basej: &basejiff);
920	ts->last_jiffies = basejiff;
921	ts->timer_expires_base = basemono;
922
923	/*
924	* Keep the periodic tick, when RCU, architecture or irq_work
925	* requests it.
926	* Aside of that, check whether the local timer softirq is
927	* pending. If so, its a bad idea to call get_next_timer_interrupt(),
928	* because there is an already expired timer, so it will request
929	* immediate expiry, which rearms the hardware timer with a
930	* minimal delta, which brings us back to this place
931	* immediately. Lather, rinse and repeat...
932	*/
933	if (rcu_needs_cpu() \|\| arch_needs_cpu() \|\|
934	irq_work_needs_cpu() \|\| local_timer_softirq_pending()) {
935	next_tick = basemono + TICK_NSEC;
936	} else {
937	/*
938	* Get the next pending timer. If high resolution
939	* timers are enabled this only takes the timer wheel
940	* timers into account. If high resolution timers are
941	* disabled this also looks at the next expiring
942	* hrtimer.
943	*/
944	next_tick = get_next_timer_interrupt(basej: basejiff, basem: basemono);
945	ts->next_timer = next_tick;
946	}
947
948	/ Make sure next_tick is never before basemono! /
949	if (WARN_ON_ONCE(basemono > next_tick))
950	next_tick = basemono;
951
952	/*
953	* If the tick is due in the next period, keep it ticking or
954	* force prod the timer.
955	*/
956	delta = next_tick - basemono;
957	if (delta <= (u64)TICK_NSEC) {
958	/*
959	* We've not stopped the tick yet, and there's a timer in the
960	* next period, so no point in stopping it either, bail.
961	*/
962	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
963	ts->timer_expires = `0`;
964	goto out;
965	}
966	}
967
968	/*
969	* If this CPU is the one which had the do_timer() duty last, we limit
970	* the sleep time to the timekeeping 'max_deferment' value.
971	* Otherwise we can sleep as long as we want.
972	*/
973	delta = timekeeping_max_deferment();
974	tick_cpu = READ_ONCE(tick_do_timer_cpu);
975	if (tick_cpu != cpu &&
976	(tick_cpu != TICK_DO_TIMER_NONE \|\| !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
977	delta = KTIME_MAX;
978
979	/ Calculate the next expiry time /
980	if (delta < (KTIME_MAX - basemono))
981	expires = basemono + delta;
982	else
983	expires = KTIME_MAX;
984
985	ts->timer_expires = min_t(u64, expires, next_tick);
986
987	out:
988	return ts->timer_expires;
989	}
990
991	static void tick_nohz_stop_tick(struct tick_sched ts, int* cpu)
992	{
993	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
994	unsigned long basejiff = ts->last_jiffies;
995	u64 basemono = ts->timer_expires_base;
996	bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
997	int tick_cpu;
998	u64 expires;
999
1000	/ Make sure we won't be trying to stop it twice in a row. /
1001	ts->timer_expires_base = `0`;
1002
1003	/*
1004	* Now the tick should be stopped definitely - so the timer base needs
1005	* to be marked idle as well to not miss a newly queued timer.
1006	*/
1007	expires = timer_base_try_to_set_idle(basej: basejiff, basem: basemono, idle: &timer_idle);
1008	if (expires > ts->timer_expires) {
1009	/*
1010	* This path could only happen when the first timer was removed
1011	* between calculating the possible sleep length and now (when
1012	* high resolution mode is not active, timer could also be a
1013	* hrtimer).
1014	*
1015	* We have to stick to the original calculated expiry value to
1016	* not stop the tick for too long with a shallow C-state (which
1017	* was programmed by cpuidle because of an early next expiration
1018	* value).
1019	*/
1020	expires = ts->timer_expires;
1021	}
1022
1023	/ If the timer base is not idle, retain the not yet stopped tick. /
1024	if (!timer_idle)
1025	return;
1026
1027	/*
1028	* If this CPU is the one which updates jiffies, then give up
1029	* the assignment and let it be taken by the CPU which runs
1030	* the tick timer next, which might be this CPU as well. If we
1031	* don't drop this here, the jiffies might be stale and
1032	* do_timer() never gets invoked. Keep track of the fact that it
1033	* was the one which had the do_timer() duty last.
1034	*/
1035	tick_cpu = READ_ONCE(tick_do_timer_cpu);
1036	if (tick_cpu == cpu) {
1037	WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
1038	tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
1039	} else if (tick_cpu != TICK_DO_TIMER_NONE) {
1040	tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
1041	}
1042
1043	/ Skip reprogram of event if it's not changed /
1044	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
1045	/ Sanity check: make sure clockevent is actually programmed /
1046	if (expires == KTIME_MAX \|\| ts->next_tick == hrtimer_get_expires(timer: &ts->sched_timer))
1047	return;
1048
1049	WARN_ONCE(`1`, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
1050	"timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
1051	dev->next_event, hrtimer_active(&ts->sched_timer),
1052	hrtimer_get_expires(&ts->sched_timer));
1053	}
1054
1055	/*
1056	* tick_nohz_stop_tick() can be called several times before
1057	* tick_nohz_restart_sched_tick() is called. This happens when
1058	* interrupts arrive which do not cause a reschedule. In the first
1059	* call we save the current tick time, so we can restart the
1060	* scheduler tick in tick_nohz_restart_sched_tick().
1061	*/
1062	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1063	calc_load_nohz_start();
1064	quiet_vmstat();
1065
1066	ts->last_tick = hrtimer_get_expires(timer: &ts->sched_timer);
1067	tick_sched_flag_set(ts, TS_FLAG_STOPPED);
1068	trace_tick_stop(success: `1`, TICK_DEP_MASK_NONE);
1069	}
1070
1071	ts->next_tick = expires;
1072
1073	/*
1074	* If the expiration time == KTIME_MAX, then we simply stop
1075	* the tick timer.
1076	*/
1077	if (unlikely(expires == KTIME_MAX)) {
1078	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
1079	hrtimer_cancel(timer: &ts->sched_timer);
1080	else
1081	tick_program_event(KTIME_MAX, force: `1`);
1082	return;
1083	}
1084
1085	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
1086	hrtimer_start(timer: &ts->sched_timer, tim: expires,
1087	mode: HRTIMER_MODE_ABS_PINNED_HARD);
1088	} else {
1089	hrtimer_set_expires(timer: &ts->sched_timer, time: expires);
1090	tick_program_event(expires, force: `1`);
1091	}
1092	}
1093
1094	static void tick_nohz_retain_tick(struct tick_sched *ts)
1095	{
1096	ts->timer_expires_base = `0`;
1097	}
1098
1099	#ifdef CONFIG_NO_HZ_FULL
/*
 * Stop the tick on @cpu when tick_nohz_next_event() reports a non-zero
 * next event; otherwise retain the tick.
 *
 * @ts:  tick_sched state passed through to the stop/retain helpers
 * @cpu: CPU number, as supplied by the caller via smp_processor_id()
 */
static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
{
	if (tick_nohz_next_event(ts, cpu))
		tick_nohz_stop_tick(ts, cpu);
	else
		tick_nohz_retain_tick(ts);
}
1107	#endif /* CONFIG_NO_HZ_FULL */
1108
1109	static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
1110	{
1111	/ Update jiffies first /
1112	tick_do_update_jiffies64(now);
1113
1114	/*
1115	* Clear the timer idle flag, so we avoid IPIs on remote queueing and
1116	* the clock forward checks in the enqueue path:
1117	*/
1118	timer_clear_idle();
1119
1120	calc_load_nohz_stop();
1121	touch_softlockup_watchdog_sched();
1122
1123	/ Cancel the scheduled timer and restore the tick: /
1124	tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
1125	tick_nohz_restart(ts, now);
1126	}
1127
1128	static void __tick_nohz_full_update_tick(struct tick_sched *ts,
1129	ktime_t now)
1130	{
1131	#ifdef CONFIG_NO_HZ_FULL
1132	int cpu = smp_processor_id();
1133
1134	if (can_stop_full_tick(cpu, ts))
1135	tick_nohz_full_stop_tick(ts, cpu);
1136	else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
1137	tick_nohz_restart_sched_tick(ts, now);
1138	#endif
1139	}
1140
1141	static void tick_nohz_full_update_tick(struct tick_sched *ts)
1142	{
1143	if (!tick_nohz_full_cpu(smp_processor_id()))
1144	return;
1145
1146	if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ))
1147	return;
1148
1149	__tick_nohz_full_update_tick(ts, now: ktime_get());
1150	}
1151
1152	/*
1153	* A pending softirq outside an IRQ (or softirq disabled section) context
1154	* should be waiting for ksoftirqd to handle it. Therefore we shouldn't
1155	* reach this code due to the need_resched() early check in can_stop_idle_tick().
1156	*
1157	* However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
1158	* cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
1159	* triggering the code below, since wakep_softirqd() is ignored.
1160	*
1161	*/
1162	static bool report_idle_softirq(void)
1163	{
1164	static int ratelimit;
1165	unsigned int pending = local_softirq_pending();
1166
1167	if (likely(!pending))
1168	return false;
1169
1170	/ Some softirqs claim to be safe against hotplug and ksoftirqd parking /
1171	if (!cpu_active(smp_processor_id())) {
1172	pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
1173	if (!pending)
1174	return false;
1175	}
1176
1177	/ On RT, softirq handling may be waiting on some lock /
1178	if (local_bh_blocked())
1179	return false;
1180
1181	if (ratelimit < `10`) {
1182	pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
1183	pending);
1184	ratelimit++;
1185	}
1186
1187	return true;
1188	}
1189
1190	static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
1191	{
1192	WARN_ON_ONCE(cpu_is_offline(cpu));
1193
1194	if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
1195	return false;
1196
1197	if (need_resched())
1198	return false;
1199
1200	if (unlikely(report_idle_softirq()))
1201	return false;
1202
1203	if (tick_nohz_full_enabled()) {
1204	int tick_cpu = READ_ONCE(tick_do_timer_cpu);
1205
1206	/*
1207	* Keep the tick alive to guarantee timekeeping progression
1208	* if there are full dynticks CPUs around
1209	*/
1210	if (tick_cpu == cpu)
1211	return false;
1212
1213	/ Should not happen for nohz-full /
1214	if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
1215	return false;
1216	}
1217
1218	return true;
1219	}
1220
1221	/**
1222	* tick_nohz_idle_stop_tick - stop the idle tick from the idle task
1223	*
1224	* When the next event is more than a tick into the future, stop the idle tick
1225	*/
1226	void tick_nohz_idle_stop_tick(void)
1227	{
1228	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1229	int cpu = smp_processor_id();
1230	ktime_t expires;
1231
1232	/*
1233	* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
1234	* tick timer expiration time is known already.
1235	*/
1236	if (ts->timer_expires_base)
1237	expires = ts->timer_expires;
1238	else if (can_stop_idle_tick(cpu, ts))
1239	expires = tick_nohz_next_event(ts, cpu);
1240	else
1241	return;
1242
1243	ts->idle_calls++;
1244
1245	if (expires > `0LL`) {
1246	int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
1247
1248	tick_nohz_stop_tick(ts, cpu);
1249
1250	ts->idle_sleeps++;
1251	ts->idle_expires = expires;
1252
1253	if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1254	ts->idle_jiffies = ts->last_jiffies;
1255	nohz_balance_enter_idle(cpu);
1256	}
1257	} else {
1258	tick_nohz_retain_tick(ts);
1259	}
1260	}
1261
1262	void tick_nohz_idle_retain_tick(void)
1263	{
1264	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
1265	}
1266
1267	/**
1268	* tick_nohz_idle_enter - prepare for entering idle on the current CPU
1269	*
1270	* Called when we start the idle loop.
1271	*/
1272	void tick_nohz_idle_enter(void)
1273	{
1274	struct tick_sched *ts;
1275
1276	lockdep_assert_irqs_enabled();
1277
1278	local_irq_disable();
1279
1280	ts = this_cpu_ptr(&tick_cpu_sched);
1281
1282	WARN_ON_ONCE(ts->timer_expires_base);
1283
1284	tick_sched_flag_set(ts, TS_FLAG_INIDLE);
1285	tick_nohz_start_idle(ts);
1286
1287	local_irq_enable();
1288	}
1289
1290	/**
1291	* tick_nohz_irq_exit - Notify the tick about IRQ exit
1292	*
1293	* A timer may have been added/modified/deleted either by the current IRQ,
1294	* or by another place using this IRQ as a notification. This IRQ may have
1295	* also updated the RCU callback list. These events may require a
1296	* re-evaluation of the next tick. Depending on the context:
1297	*
1298	* 1) If the CPU is idle and no resched is pending, just proceed with idle
1299	* time accounting. The next tick will be re-evaluated on the next idle
1300	* loop iteration.
1301	*
1302	* 2) If the CPU is nohz_full:
1303	*
1304	* 2.1) If there is any tick dependency, restart the tick if stopped.
1305	*
1306	* 2.2) If there is no tick dependency, (re-)evaluate the next tick and
1307	* stop/update it accordingly.
1308	*/
1309	void tick_nohz_irq_exit(void)
1310	{
1311	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1312
1313	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
1314	tick_nohz_start_idle(ts);
1315	else
1316	tick_nohz_full_update_tick(ts);
1317	}
1318
1319	/**
1320	* tick_nohz_idle_got_tick - Check whether or not the tick handler has run
1321	*
1322	* Return: %true if the tick handler has run, otherwise %false
1323	*/
1324	bool tick_nohz_idle_got_tick(void)
1325	{
1326	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1327
1328	if (ts->got_idle_tick) {
1329	ts->got_idle_tick = `0`;
1330	return true;
1331	}
1332	return false;
1333	}
1334
1335	/**
1336	* tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
1337	* or the tick, whichever expires first. Note that, if the tick has been
1338	* stopped, it returns the next hrtimer.
1339	*
1340	* Called from power state control code with interrupts disabled
1341	*
1342	* Return: the next expiration time
1343	*/
1344	ktime_t tick_nohz_get_next_hrtimer(void)
1345	{
1346	return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
1347	}
1348
1349	/**
1350	* tick_nohz_get_sleep_length - return the expected length of the current sleep
1351	* @delta_next: duration until the next event if the tick cannot be stopped
1352	*
1353	* Called from power state control code with interrupts disabled.
1354	*
1355	* The return value of this function and/or the value returned by it through the
1356	* @delta_next pointer can be negative which must be taken into account by its
1357	* callers.
1358	*
1359	* Return: the expected length of the current sleep
1360	*/
1361	ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
1362	{
1363	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
1364	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1365	int cpu = smp_processor_id();
1366	/*
1367	* The idle entry time is expected to be a sufficient approximation of
1368	* the current time at this point.
1369	*/
1370	ktime_t now = ts->idle_entrytime;
1371	ktime_t next_event;
1372
1373	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
1374
1375	*delta_next = ktime_sub(dev->next_event, now);
1376
1377	if (!can_stop_idle_tick(cpu, ts))
1378	return *delta_next;
1379
1380	next_event = tick_nohz_next_event(ts, cpu);
1381	if (!next_event)
1382	return *delta_next;
1383
1384	/*
1385	* If the next highres timer to expire is earlier than 'next_event', the
1386	* idle governor needs to know that.
1387	*/
1388	next_event = min_t(u64, next_event,
1389	hrtimer_next_event_without(&ts->sched_timer));
1390
1391	return ktime_sub(next_event, now);
1392	}
1393
1394	/**
1395	* tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
1396	* for a particular CPU.
1397	* @cpu: target CPU number
1398	*
1399	* Called from the schedutil frequency scaling governor in scheduler context.
1400	*
1401	* Return: the current idle calls counter value for @cpu
1402	*/
1403	unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
1404	{
1405	struct tick_sched *ts = tick_get_tick_sched(cpu);
1406
1407	return ts->idle_calls;
1408	}
1409
1410	static void tick_nohz_account_idle_time(struct tick_sched *ts,
1411	ktime_t now)
1412	{
1413	unsigned long ticks;
1414
1415	ts->idle_exittime = now;
1416
1417	if (vtime_accounting_enabled_this_cpu())
1418	return;
1419	/*
1420	* We stopped the tick in idle. update_process_times() would miss the
1421	* time we slept, as it does only a 1 tick accounting.
1422	* Enforce that this is accounted to idle !
1423	*/
1424	ticks = jiffies - ts->idle_jiffies;
1425	/*
1426	* We might be one off. Do not randomly account a huge number of ticks!
1427	*/
1428	if (ticks && ticks < LONG_MAX)
1429	account_idle_ticks(ticks);
1430	}
1431
1432	void tick_nohz_idle_restart_tick(void)
1433	{
1434	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1435
1436	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
1437	ktime_t now = ktime_get();
1438	tick_nohz_restart_sched_tick(ts, now);
1439	tick_nohz_account_idle_time(ts, now);
1440	}
1441	}
1442
1443	static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
1444	{
1445	if (tick_nohz_full_cpu(smp_processor_id()))
1446	__tick_nohz_full_update_tick(ts, now);
1447	else
1448	tick_nohz_restart_sched_tick(ts, now);
1449
1450	tick_nohz_account_idle_time(ts, now);
1451	}
1452
1453	/**
1454	* tick_nohz_idle_exit - Update the tick upon idle task exit
1455	*
1456	* When the idle task exits, update the tick depending on the
1457	* following situations:
1458	*
1459	* 1) If the CPU is not in nohz_full mode (most cases), then
1460	* restart the tick.
1461	*
1462	* 2) If the CPU is in nohz_full mode (corner case):
1463	* 2.1) If the tick can be kept stopped (no tick dependencies)
1464	* then re-evaluate the next tick and try to keep it stopped
1465	* as long as possible.
1466	* 2.2) If the tick has dependencies, restart the tick.
1467	*
1468	*/
1469	void tick_nohz_idle_exit(void)
1470	{
1471	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1472	bool idle_active, tick_stopped;
1473	ktime_t now;
1474
1475	local_irq_disable();
1476
1477	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
1478	WARN_ON_ONCE(ts->timer_expires_base);
1479
1480	tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
1481	idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
1482	tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
1483
1484	if (idle_active \|\| tick_stopped)
1485	now = ktime_get();
1486
1487	if (idle_active)
1488	tick_nohz_stop_idle(ts, now);
1489
1490	if (tick_stopped)
1491	tick_nohz_idle_update_tick(ts, now);
1492
1493	local_irq_enable();
1494	}
1495
1496	/*
1497	* In low-resolution mode, the tick handler must be implemented directly
1498	* at the clockevent level. hrtimer can't be used instead, because its
1499	* infrastructure actually relies on the tick itself as a backend in
1500	* low-resolution mode (see hrtimer_run_queues()).
1501	*/
1502	static void tick_nohz_lowres_handler(struct clock_event_device *dev)
1503	{
1504	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1505
1506	dev->next_event = KTIME_MAX;
1507
1508	if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
1509	tick_program_event(expires: hrtimer_get_expires(timer: &ts->sched_timer), force: `1`);
1510	}
1511
1512	static inline void tick_nohz_activate(struct tick_sched *ts)
1513	{
1514	if (!tick_nohz_enabled)
1515	return;
1516	tick_sched_flag_set(ts, TS_FLAG_NOHZ);
1517	/ One update is enough /
1518	if (!test_and_set_bit(nr: `0`, addr: &tick_nohz_active))
1519	timers_update_nohz();
1520	}
1521
1522	/**
1523	* tick_nohz_switch_to_nohz - switch to NOHZ mode
1524	*/
1525	static void tick_nohz_switch_to_nohz(void)
1526	{
1527	if (!tick_nohz_enabled)
1528	return;
1529
1530	if (tick_switch_to_oneshot(handler: tick_nohz_lowres_handler))
1531	return;
1532
1533	/*
1534	* Recycle the hrtimer in 'ts', so we can share the
1535	* highres code.
1536	*/
1537	tick_setup_sched_timer(hrtimer: false);
1538	}
1539
1540	static inline void tick_nohz_irq_enter(void)
1541	{
1542	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1543	ktime_t now;
1544
1545	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED \| TS_FLAG_IDLE_ACTIVE))
1546	return;
1547	now = ktime_get();
1548	if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
1549	tick_nohz_stop_idle(ts, now);
1550	/*
1551	* If all CPUs are idle we may need to update a stale jiffies value.
1552	* Note nohz_full is a special case: a timekeeper is guaranteed to stay
1553	* alive but it might be busy looping with interrupts disabled in some
1554	* rare case (typically stop machine). So we must make sure we have a
1555	* last resort.
1556	*/
1557	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
1558	tick_nohz_update_jiffies(now);
1559	}
1560
1561	#else
1562
/* !CONFIG_NO_HZ_COMMON: the NOHZ hooks below are empty stubs. */
static inline void tick_nohz_switch_to_nohz(void)
{
}

static inline void tick_nohz_irq_enter(void)
{
}

static inline void tick_nohz_activate(struct tick_sched *ts)
{
}
1566
1567	#endif /* CONFIG_NO_HZ_COMMON */
1568
/*
 * irq_enter() hook: tells the tick code that idle may have been
 * interrupted. The oneshot broadcast check runs first, followed by the
 * NOHZ IRQ-entry handling.
 */
void tick_irq_enter(void)
{
	tick_check_oneshot_broadcast_this_cpu();
	tick_nohz_irq_enter();
}
1577
1578	static int sched_skew_tick;
1579
1580	static int __init skew_tick(char *str)
1581	{
1582	get_option(str: &str, pint: &sched_skew_tick);
1583
1584	return `0`;
1585	}
1586	early_param("skew_tick", skew_tick);
1587
1588	/**
1589	* tick_setup_sched_timer - setup the tick emulation timer
1590	* @hrtimer: whether to use the hrtimer or not
1591	*/
1592	void tick_setup_sched_timer(bool hrtimer)
1593	{
1594	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1595
1596	/ Emulate tick processing via per-CPU hrtimers: /
1597	hrtimer_setup(timer: &ts->sched_timer, function: tick_nohz_handler, CLOCK_MONOTONIC, mode: HRTIMER_MODE_ABS_HARD);
1598
1599	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
1600	tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
1601
1602	/ Get the next period (per-CPU) /
1603	hrtimer_set_expires(timer: &ts->sched_timer, time: tick_init_jiffy_update());
1604
1605	/ Offset the tick to avert 'jiffies_lock' contention. /
1606	if (sched_skew_tick) {
1607	u64 offset = TICK_NSEC >> `1`;
1608	do_div(offset, num_possible_cpus());
1609	offset *= smp_processor_id();
1610	hrtimer_add_expires_ns(timer: &ts->sched_timer, ns: offset);
1611	}
1612
1613	hrtimer_forward_now(timer: &ts->sched_timer, TICK_NSEC);
1614	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
1615	hrtimer_start_expires(timer: &ts->sched_timer, mode: HRTIMER_MODE_ABS_PINNED_HARD);
1616	else
1617	tick_program_event(expires: hrtimer_get_expires(timer: &ts->sched_timer), force: `1`);
1618	tick_nohz_activate(ts);
1619	}
1620
1621	/*
1622	* Shut down the tick and make sure the CPU won't try to retake the timekeeping
1623	* duty before disabling IRQs in idle for the last time.
1624	*/
1625	void tick_sched_timer_dying(int cpu)
1626	{
1627	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
1628	ktime_t idle_sleeptime, iowait_sleeptime;
1629	unsigned long idle_calls, idle_sleeps;
1630
1631	/ This must happen before hrtimers are migrated! /
1632	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
1633	hrtimer_cancel(timer: &ts->sched_timer);
1634
1635	idle_sleeptime = ts->idle_sleeptime;
1636	iowait_sleeptime = ts->iowait_sleeptime;
1637	idle_calls = ts->idle_calls;
1638	idle_sleeps = ts->idle_sleeps;
1639	memset(ts, `0`, sizeof(*ts));
1640	ts->idle_sleeptime = idle_sleeptime;
1641	ts->iowait_sleeptime = iowait_sleeptime;
1642	ts->idle_calls = idle_calls;
1643	ts->idle_sleeps = idle_sleeps;
1644	}
1645
1646	/*
1647	* Async notification about clocksource changes
1648	*/
1649	void tick_clock_notify(void)
1650	{
1651	int cpu;
1652
1653	for_each_possible_cpu(cpu)
1654	set_bit(nr: `0`, addr: &per_cpu(tick_cpu_sched, cpu).check_clocks);
1655	}
1656
1657	/*
1658	* Async notification about clock event changes
1659	*/
1660	void tick_oneshot_notify(void)
1661	{
1662	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1663
1664	set_bit(nr: `0`, addr: &ts->check_clocks);
1665	}
1666
1667	/*
1668	* Check if a change happened, which makes oneshot possible.
1669	*
1670	* Called cyclically from the hrtimer softirq (driven by the timer
1671	* softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
1672	* mode, because high resolution timers are disabled (either compile
1673	* or runtime). Called with interrupts disabled.
1674	*/
1675	int tick_check_oneshot_change(int allow_nohz)
1676	{
1677	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1678
1679	if (!test_and_clear_bit(nr: `0`, addr: &ts->check_clocks))
1680	return `0`;
1681
1682	if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
1683	return `0`;
1684
1685	if (!timekeeping_valid_for_hres() \|\| !tick_is_oneshot_available())
1686	return `0`;
1687
1688	if (!allow_nohz)
1689	return `1`;
1690
1691	tick_nohz_switch_to_nohz();
1692	return `0`;
1693	}
1694

/* End of linux/kernel/time/tick-sched.c */