posix-cpu-timers.c source code [linux/kernel/time/posix-cpu-timers.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Implement CPU time clocks for the POSIX clock interface.
4	*/
5
6	#include <linux/sched/signal.h>
7	#include <linux/sched/cputime.h>
8	#include <linux/posix-timers.h>
9	#include <linux/errno.h>
10	#include <linux/math64.h>
11	#include <linux/uaccess.h>
12	#include <linux/kernel_stat.h>
13	#include <trace/events/timer.h>
14	#include <linux/tick.h>
15	#include <linux/workqueue.h>
16	#include <linux/compat.h>
17	#include <linux/sched/deadline.h>
18	#include <linux/task_work.h>
19
20	#include "posix-timers.h"
21
22	static void posix_cpu_timer_rearm(struct k_itimer *timer);
23
24	void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
25	{
26	posix_cputimers_init(pct);
27	if (cpu_limit != RLIM_INFINITY) {
28	pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
29	pct->timers_active = true;
30	}
31	}
32
33	/*
34	* Called after updating RLIMIT_CPU to run cpu timer and update
35	* tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
36	* necessary. Needs siglock protection since other code may update the
37	* expiration cache as well.
38	*
39	* Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
40	* we cannot lock_task_sighand. Cannot fail if task is current.
41	*/
42	int update_rlimit_cpu(struct task_struct task, unsigned* long rlim_new)
43	{
44	u64 nsecs = rlim_new * NSEC_PER_SEC;
45	unsigned long irq_fl;
46
47	if (!lock_task_sighand(task, flags: &irq_fl))
48	return -ESRCH;
49	set_process_cpu_timer(task, CPUCLOCK_PROF, newval: &nsecs, NULL);
50	unlock_task_sighand(task, flags: &irq_fl);
51	return `0`;
52	}
53
54	/*
55	* Functions for validating access to tasks.
56	*/
57	static struct pid pid_for_clock(const* clockid_t clock, bool gettime)
58	{
59	const bool thread = !!CPUCLOCK_PERTHREAD(clock);
60	const pid_t upid = CPUCLOCK_PID(clock);
61	struct pid *pid;
62
63	if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
64	return NULL;
65
66	/*
67	* If the encoded PID is 0, then the timer is targeted at current
68	* or the process to which current belongs.
69	*/
70	if (upid == `0`)
71	return thread ? task_pid(current) : task_tgid(current);
72
73	pid = find_vpid(nr: upid);
74	if (!pid)
75	return NULL;
76
77	if (thread) {
78	struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);
79	return (tsk && same_thread_group(p1: tsk, current)) ? pid : NULL;
80	}
81
82	/*
83	* For clock_gettime(PROCESS) allow finding the process by
84	* with the pid of the current task. The code needs the tgid
85	* of the process so that pid_task(pid, PIDTYPE_TGID) can be
86	* used to find the process.
87	*/
88	if (gettime && (pid == task_pid(current)))
89	return task_tgid(current);
90
91	/*
92	* For processes require that pid identifies a process.
93	*/
94	return pid_has_task(pid, type: PIDTYPE_TGID) ? pid : NULL;
95	}
96
97	static inline int validate_clock_permissions(const clockid_t clock)
98	{
99	int ret;
100
101	rcu_read_lock();
102	ret = pid_for_clock(clock, gettime: false) ? `0` : -EINVAL;
103	rcu_read_unlock();
104
105	return ret;
106	}
107
108	static inline enum pid_type clock_pid_type(const clockid_t clock)
109	{
110	return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID;
111	}
112
113	static inline struct task_struct cpu_timer_task_rcu(struct* k_itimer *timer)
114	{
115	return pid_task(pid: timer->it.cpu.pid, clock_pid_type(clock: timer->it_clock));
116	}
117
118	/*
119	* Update expiry time from increment, and increase overrun count,
120	* given the current clock sample.
121	*/
122	static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
123	{
124	u64 delta, incr, expires = timer->it.cpu.node.expires;
125	int i;
126
127	if (!timer->it_interval)
128	return expires;
129
130	if (now < expires)
131	return expires;
132
133	incr = timer->it_interval;
134	delta = now + incr - expires;
135
136	/ Don't use (incr2 < delta), incr2 might overflow. /
137	for (i = `0`; incr < delta - incr; i++)
138	incr = incr << `1`;
139
140	for (; i >= `0`; incr >>= `1`, i--) {
141	if (delta < incr)
142	continue;
143
144	timer->it.cpu.node.expires += incr;
145	timer->it_overrun += `1LL` << i;
146	delta -= incr;
147	}
148	return timer->it.cpu.node.expires;
149	}
150
151	/ Check whether all cache entries contain U64_MAX, i.e. eternal expiry time /
152	static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
153	{
154	return !(~pct->bases[CPUCLOCK_PROF].nextevt \|
155	~pct->bases[CPUCLOCK_VIRT].nextevt \|
156	~pct->bases[CPUCLOCK_SCHED].nextevt);
157	}
158
159	static int
160	posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
161	{
162	int error = validate_clock_permissions(clock: which_clock);
163
164	if (!error) {
165	tp->tv_sec = `0`;
166	tp->tv_nsec = ((NSEC_PER_SEC + HZ - `1`) / HZ);
167	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
168	/*
169	* If sched_clock is using a cycle counter, we
170	* don't have any idea of its true resolution
171	* exported, but it is much more than 1s/HZ.
172	*/
173	tp->tv_nsec = `1`;
174	}
175	}
176	return error;
177	}
178
179	static int
180	posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
181	{
182	int error = validate_clock_permissions(clock);
183
184	/*
185	* You can never reset a CPU clock, but we check for other errors
186	* in the call before failing with EPERM.
187	*/
188	return error ? : -EPERM;
189	}
190
191	/*
192	* Sample a per-thread clock for the given task. clkid is validated.
193	*/
194	static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
195	{
196	u64 utime, stime;
197
198	if (clkid == CPUCLOCK_SCHED)
199	return task_sched_runtime(task: p);
200
201	task_cputime(t: p, utime: &utime, stime: &stime);
202
203	switch (clkid) {
204	case CPUCLOCK_PROF:
205	return utime + stime;
206	case CPUCLOCK_VIRT:
207	return utime;
208	default:
209	WARN_ON_ONCE(`1`);
210	}
211	return `0`;
212	}
213
214	static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
215	{
216	samples[CPUCLOCK_PROF] = stime + utime;
217	samples[CPUCLOCK_VIRT] = utime;
218	samples[CPUCLOCK_SCHED] = rtime;
219	}
220
221	static void task_sample_cputime(struct task_struct p, u64 samples)
222	{
223	u64 stime, utime;
224
225	task_cputime(t: p, utime: &utime, stime: &stime);
226	store_samples(samples, stime, utime, rtime: p->se.sum_exec_runtime);
227	}
228
229	static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
230	u64 *samples)
231	{
232	u64 stime, utime, rtime;
233
234	utime = atomic64_read(v: &at->utime);
235	stime = atomic64_read(v: &at->stime);
236	rtime = atomic64_read(v: &at->sum_exec_runtime);
237	store_samples(samples, stime, utime, rtime);
238	}
239
240	/*
241	* Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
242	* to avoid race conditions with concurrent updates to cputime.
243	*/
244	static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
245	{
246	u64 curr_cputime = atomic64_read(v: cputime);
247
248	do {
249	if (sum_cputime <= curr_cputime)
250	return;
251	} while (!atomic64_try_cmpxchg(v: cputime, old: &curr_cputime, new: sum_cputime));
252	}
253
254	static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
255	struct task_cputime *sum)
256	{
257	__update_gt_cputime(cputime: &cputime_atomic->utime, sum_cputime: sum->utime);
258	__update_gt_cputime(cputime: &cputime_atomic->stime, sum_cputime: sum->stime);
259	__update_gt_cputime(cputime: &cputime_atomic->sum_exec_runtime, sum_cputime: sum->sum_exec_runtime);
260	}
261
262	/**
263	* thread_group_sample_cputime - Sample cputime for a given task
264	* @tsk: Task for which cputime needs to be started
265	* @samples: Storage for time samples
266	*
267	* Called from sys_getitimer() to calculate the expiry time of an active
268	* timer. That means group cputime accounting is already active. Called
269	* with task sighand lock held.
270	*
271	* Updates @times with an uptodate sample of the thread group cputimes.
272	*/
273	void thread_group_sample_cputime(struct task_struct tsk, u64 samples)
274	{
275	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
276	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
277
278	WARN_ON_ONCE(!pct->timers_active);
279
280	proc_sample_cputime_atomic(at: &cputimer->cputime_atomic, samples);
281	}
282
283	/**
284	* thread_group_start_cputime - Start cputime and return a sample
285	* @tsk: Task for which cputime needs to be started
286	* @samples: Storage for time samples
287	*
288	* The thread group cputime accounting is avoided when there are no posix
289	* CPU timers armed. Before starting a timer it's required to check whether
290	* the time accounting is active. If not, a full update of the atomic
291	* accounting store needs to be done and the accounting enabled.
292	*
293	* Updates @times with an uptodate sample of the thread group cputimes.
294	*/
295	static void thread_group_start_cputime(struct task_struct tsk, u64 samples)
296	{
297	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
298	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
299
300	lockdep_assert_task_sighand_held(task: tsk);
301
302	/ Check if cputimer isn't running. This is accessed without locking. /
303	if (!READ_ONCE(pct->timers_active)) {
304	struct task_cputime sum;
305
306	/*
307	* The POSIX timer interface allows for absolute time expiry
308	* values through the TIMER_ABSTIME flag, therefore we have
309	* to synchronize the timer to the clock every time we start it.
310	*/
311	thread_group_cputime(tsk, times: &sum);
312	update_gt_cputime(cputime_atomic: &cputimer->cputime_atomic, sum: &sum);
313
314	/*
315	* We're setting timers_active without a lock. Ensure this
316	* only gets written to in one operation. We set it after
317	* update_gt_cputime() as a small optimization, but
318	* barriers are not required because update_gt_cputime()
319	* can handle concurrent updates.
320	*/
321	WRITE_ONCE(pct->timers_active, true);
322	}
323	proc_sample_cputime_atomic(at: &cputimer->cputime_atomic, samples);
324	}
325
326	static void __thread_group_cputime(struct task_struct tsk, u64 samples)
327	{
328	struct task_cputime ct;
329
330	thread_group_cputime(tsk, times: &ct);
331	store_samples(samples, stime: ct.stime, utime: ct.utime, rtime: ct.sum_exec_runtime);
332	}
333
334	/*
335	* Sample a process (thread group) clock for the given task clkid. If the
336	* group's cputime accounting is already enabled, read the atomic
337	* store. Otherwise a full update is required. clkid is already validated.
338	*/
339	static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
340	bool start)
341	{
342	struct thread_group_cputimer *cputimer = &p->signal->cputimer;
343	struct posix_cputimers *pct = &p->signal->posix_cputimers;
344	u64 samples[CPUCLOCK_MAX];
345
346	if (!READ_ONCE(pct->timers_active)) {
347	if (start)
348	thread_group_start_cputime(tsk: p, samples);
349	else
350	__thread_group_cputime(tsk: p, samples);
351	} else {
352	proc_sample_cputime_atomic(at: &cputimer->cputime_atomic, samples);
353	}
354
355	return samples[clkid];
356	}
357
358	static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
359	{
360	const clockid_t clkid = CPUCLOCK_WHICH(clock);
361	struct task_struct *tsk;
362	u64 t;
363
364	rcu_read_lock();
365	tsk = pid_task(pid: pid_for_clock(clock, gettime: true), clock_pid_type(clock));
366	if (!tsk) {
367	rcu_read_unlock();
368	return -EINVAL;
369	}
370
371	if (CPUCLOCK_PERTHREAD(clock))
372	t = cpu_clock_sample(clkid, p: tsk);
373	else
374	t = cpu_clock_sample_group(clkid, p: tsk, start: false);
375	rcu_read_unlock();
376
377	*tp = ns_to_timespec64(nsec: t);
378	return `0`;
379	}
380
381	/*
382	* Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
383	* This is called from sys_timer_create() and do_cpu_nanosleep() with the
384	* new timer already all-zeros initialized.
385	*/
386	static int posix_cpu_timer_create(struct k_itimer *new_timer)
387	{
388	static struct lock_class_key posix_cpu_timers_key;
389	struct pid *pid;
390
391	rcu_read_lock();
392	pid = pid_for_clock(clock: new_timer->it_clock, gettime: false);
393	if (!pid) {
394	rcu_read_unlock();
395	return -EINVAL;
396	}
397
398	/*
399	* If posix timer expiry is handled in task work context then
400	* timer::it_lock can be taken without disabling interrupts as all
401	* other locking happens in task context. This requires a separate
402	* lock class key otherwise regular posix timer expiry would record
403	* the lock class being taken in interrupt context and generate a
404	* false positive warning.
405	*/
406	if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
407	lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
408
409	new_timer->kclock = &clock_posix_cpu;
410	timerqueue_init(node: &new_timer->it.cpu.node);
411	new_timer->it.cpu.pid = get_pid(pid);
412	rcu_read_unlock();
413	return `0`;
414	}
415
416	static struct posix_cputimer_base timer_base(struct* k_itimer *timer,
417	struct task_struct *tsk)
418	{
419	int clkidx = CPUCLOCK_WHICH(timer->it_clock);
420
421	if (CPUCLOCK_PERTHREAD(timer->it_clock))
422	return tsk->posix_cputimers.bases + clkidx;
423	else
424	return tsk->signal->posix_cputimers.bases + clkidx;
425	}
426
427	/*
428	* Force recalculating the base earliest expiration on the next tick.
429	* This will also re-evaluate the need to keep around the process wide
430	* cputime counter and tick dependency and eventually shut these down
431	* if necessary.
432	*/
433	static void trigger_base_recalc_expires(struct k_itimer *timer,
434	struct task_struct *tsk)
435	{
436	struct posix_cputimer_base *base = timer_base(timer, tsk);
437
438	base->nextevt = `0`;
439	}
440
441	/*
442	* Dequeue the timer and reset the base if it was its earliest expiration.
443	* It makes sure the next tick recalculates the base next expiration so we
444	* don't keep the costly process wide cputime counter around for a random
445	* amount of time, along with the tick dependency.
446	*
447	* If another timer gets queued between this and the next tick, its
448	* expiration will update the base next event if necessary on the next
449	* tick.
450	*/
451	static void disarm_timer(struct k_itimer timer, struct* task_struct *p)
452	{
453	struct cpu_timer *ctmr = &timer->it.cpu;
454	struct posix_cputimer_base *base;
455
456	if (!cpu_timer_dequeue(ctmr))
457	return;
458
459	base = timer_base(timer, tsk: p);
460	if (cpu_timer_getexpires(ctmr) == base->nextevt)
461	trigger_base_recalc_expires(timer, tsk: p);
462	}
463
464
465	/*
466	* Clean up a CPU-clock timer that is about to be destroyed.
467	* This is called from timer deletion with the timer already locked.
468	* If we return TIMER_RETRY, it's necessary to release the timer's lock
469	* and try again. (This happens when the timer is in the middle of firing.)
470	*/
471	static int posix_cpu_timer_del(struct k_itimer *timer)
472	{
473	struct cpu_timer *ctmr = &timer->it.cpu;
474	struct sighand_struct *sighand;
475	struct task_struct *p;
476	unsigned long flags;
477	int ret = `0`;
478
479	rcu_read_lock();
480	p = cpu_timer_task_rcu(timer);
481	if (!p)
482	goto out;
483
484	/*
485	* Protect against sighand release/switch in exit/exec and process/
486	* thread timer list entry concurrent read/writes.
487	*/
488	sighand = lock_task_sighand(task: p, flags: &flags);
489	if (unlikely(sighand == NULL)) {
490	/*
491	* This raced with the reaping of the task. The exit cleanup
492	* should have removed this timer from the timer queue.
493	*/
494	WARN_ON_ONCE(ctmr->head \|\| timerqueue_node_queued(&ctmr->node));
495	} else {
496	if (timer->it.cpu.firing)
497	ret = TIMER_RETRY;
498	else
499	disarm_timer(timer, p);
500
501	unlock_task_sighand(task: p, flags: &flags);
502	}
503
504	out:
505	rcu_read_unlock();
506	if (!ret)
507	put_pid(pid: ctmr->pid);
508
509	return ret;
510	}
511
512	static void cleanup_timerqueue(struct timerqueue_head *head)
513	{
514	struct timerqueue_node *node;
515	struct cpu_timer *ctmr;
516
517	while ((node = timerqueue_getnext(head))) {
518	timerqueue_del(head, node);
519	ctmr = container_of(node, struct cpu_timer, node);
520	ctmr->head = NULL;
521	}
522	}
523
524	/*
525	* Clean out CPU timers which are still armed when a thread exits. The
526	* timers are only removed from the list. No other updates are done. The
527	* corresponding posix timers are still accessible, but cannot be rearmed.
528	*
529	* This must be called with the siglock held.
530	*/
531	static void cleanup_timers(struct posix_cputimers *pct)
532	{
533	cleanup_timerqueue(head: &pct->bases[CPUCLOCK_PROF].tqhead);
534	cleanup_timerqueue(head: &pct->bases[CPUCLOCK_VIRT].tqhead);
535	cleanup_timerqueue(head: &pct->bases[CPUCLOCK_SCHED].tqhead);
536	}
537
538	/*
539	* These are both called with the siglock held, when the current thread
540	* is being reaped. When the final (leader) thread in the group is reaped,
541	* posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
542	*/
543	void posix_cpu_timers_exit(struct task_struct *tsk)
544	{
545	cleanup_timers(pct: &tsk->posix_cputimers);
546	}
547	void posix_cpu_timers_exit_group(struct task_struct *tsk)
548	{
549	cleanup_timers(pct: &tsk->signal->posix_cputimers);
550	}
551
552	/*
553	* Insert the timer on the appropriate list before any timers that
554	* expire later. This must be called with the sighand lock held.
555	*/
556	static void arm_timer(struct k_itimer timer, struct* task_struct *p)
557	{
558	struct posix_cputimer_base *base = timer_base(timer, tsk: p);
559	struct cpu_timer *ctmr = &timer->it.cpu;
560	u64 newexp = cpu_timer_getexpires(ctmr);
561
562	if (!cpu_timer_enqueue(head: &base->tqhead, ctmr))
563	return;
564
565	/*
566	* We are the new earliest-expiring POSIX 1.b timer, hence
567	* need to update expiration cache. Take into account that
568	* for process timers we share expiration cache with itimers
569	* and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
570	*/
571	if (newexp < base->nextevt)
572	base->nextevt = newexp;
573
574	if (CPUCLOCK_PERTHREAD(timer->it_clock))
575	tick_dep_set_task(tsk: p, bit: TICK_DEP_BIT_POSIX_TIMER);
576	else
577	tick_dep_set_signal(tsk: p, bit: TICK_DEP_BIT_POSIX_TIMER);
578	}
579
580	/*
581	* The timer is locked, fire it and arrange for its reload.
582	*/
583	static void cpu_timer_fire(struct k_itimer *timer)
584	{
585	struct cpu_timer *ctmr = &timer->it.cpu;
586
587	if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
588	/*
589	* User don't want any signal.
590	*/
591	cpu_timer_setexpires(ctmr, exp: `0`);
592	} else if (unlikely(timer->sigq == NULL)) {
593	/*
594	* This a special case for clock_nanosleep,
595	* not a normal timer from sys_timer_create.
596	*/
597	wake_up_process(tsk: timer->it_process);
598	cpu_timer_setexpires(ctmr, exp: `0`);
599	} else if (!timer->it_interval) {
600	/*
601	* One-shot timer. Clear it as soon as it's fired.
602	*/
603	posix_timer_event(timr: timer, si_private: `0`);
604	cpu_timer_setexpires(ctmr, exp: `0`);
605	} else if (posix_timer_event(timr: timer, si_private: ++timer->it_requeue_pending)) {
606	/*
607	* The signal did not get queued because the signal
608	* was ignored, so we won't get any callback to
609	* reload the timer. But we need to keep it
610	* ticking in case the signal is deliverable next time.
611	*/
612	posix_cpu_timer_rearm(timer);
613	++timer->it_requeue_pending;
614	}
615	}
616
617	/*
618	* Guts of sys_timer_settime for CPU timers.
619	* This is called with the timer locked and interrupts disabled.
620	* If we return TIMER_RETRY, it's necessary to release the timer's lock
621	* and try again. (This happens when the timer is in the middle of firing.)
622	*/
623	static int posix_cpu_timer_set(struct k_itimer timer, int* timer_flags,
624	struct itimerspec64 new, struct* itimerspec64 *old)
625	{
626	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
627	u64 old_expires, new_expires, old_incr, val;
628	struct cpu_timer *ctmr = &timer->it.cpu;
629	struct sighand_struct *sighand;
630	struct task_struct *p;
631	unsigned long flags;
632	int ret = `0`;
633
634	rcu_read_lock();
635	p = cpu_timer_task_rcu(timer);
636	if (!p) {
637	/*
638	* If p has just been reaped, we can no
639	* longer get any information about it at all.
640	*/
641	rcu_read_unlock();
642	return -ESRCH;
643	}
644
645	/*
646	* Use the to_ktime conversion because that clamps the maximum
647	* value to KTIME_MAX and avoid multiplication overflows.
648	*/
649	new_expires = ktime_to_ns(kt: timespec64_to_ktime(ts: new->it_value));
650
651	/*
652	* Protect against sighand release/switch in exit/exec and p->cpu_timers
653	* and p->signal->cpu_timers read/write in arm_timer()
654	*/
655	sighand = lock_task_sighand(task: p, flags: &flags);
656	/*
657	* If p has just been reaped, we can no
658	* longer get any information about it at all.
659	*/
660	if (unlikely(sighand == NULL)) {
661	rcu_read_unlock();
662	return -ESRCH;
663	}
664
665	/*
666	* Disarm any old timer after extracting its expiry time.
667	*/
668	old_incr = timer->it_interval;
669	old_expires = cpu_timer_getexpires(ctmr);
670
671	if (unlikely(timer->it.cpu.firing)) {
672	timer->it.cpu.firing = -`1`;
673	ret = TIMER_RETRY;
674	} else {
675	cpu_timer_dequeue(ctmr);
676	}
677
678	/*
679	* We need to sample the current value to convert the new
680	* value from to relative and absolute, and to convert the
681	* old value from absolute to relative. To set a process
682	* timer, we need a sample to balance the thread expiry
683	* times (in arm_timer). With an absolute time, we must
684	* check if it's already passed. In short, we need a sample.
685	*/
686	if (CPUCLOCK_PERTHREAD(timer->it_clock))
687	val = cpu_clock_sample(clkid, p);
688	else
689	val = cpu_clock_sample_group(clkid, p, start: true);
690
691	if (old) {
692	if (old_expires == `0`) {
693	old->it_value.tv_sec = `0`;
694	old->it_value.tv_nsec = `0`;
695	} else {
696	/*
697	* Update the timer in case it has overrun already.
698	* If it has, we'll report it as having overrun and
699	* with the next reloaded timer already ticking,
700	* though we are swallowing that pending
701	* notification here to install the new setting.
702	*/
703	u64 exp = bump_cpu_timer(timer, now: val);
704
705	if (val < exp) {
706	old_expires = exp - val;
707	old->it_value = ns_to_timespec64(nsec: old_expires);
708	} else {
709	old->it_value.tv_nsec = `1`;
710	old->it_value.tv_sec = `0`;
711	}
712	}
713	}
714
715	if (unlikely(ret)) {
716	/*
717	* We are colliding with the timer actually firing.
718	* Punt after filling in the timer's old value, and
719	* disable this firing since we are already reporting
720	* it as an overrun (thanks to bump_cpu_timer above).
721	*/
722	unlock_task_sighand(task: p, flags: &flags);
723	goto out;
724	}
725
726	if (new_expires != `0` && !(timer_flags & TIMER_ABSTIME)) {
727	new_expires += val;
728	}
729
730	/*
731	* Install the new expiry time (or zero).
732	* For a timer with no notification action, we don't actually
733	* arm the timer (we'll just fake it for timer_gettime).
734	*/
735	cpu_timer_setexpires(ctmr, exp: new_expires);
736	if (new_expires != `0` && val < new_expires) {
737	arm_timer(timer, p);
738	}
739
740	unlock_task_sighand(task: p, flags: &flags);
741	/*
742	* Install the new reload setting, and
743	* set up the signal and overrun bookkeeping.
744	*/
745	timer->it_interval = timespec64_to_ktime(ts: new->it_interval);
746
747	/*
748	* This acts as a modification timestamp for the timer,
749	* so any automatic reload attempt will punt on seeing
750	* that we have reset the timer manually.
751	*/
752	timer->it_requeue_pending = (timer->it_requeue_pending + `2`) &
753	~REQUEUE_PENDING;
754	timer->it_overrun_last = `0`;
755	timer->it_overrun = -`1`;
756
757	if (val >= new_expires) {
758	if (new_expires != `0`) {
759	/*
760	* The designated time already passed, so we notify
761	* immediately, even if the thread never runs to
762	* accumulate more time on this clock.
763	*/
764	cpu_timer_fire(timer);
765	}
766
767	/*
768	* Make sure we don't keep around the process wide cputime
769	* counter or the tick dependency if they are not necessary.
770	*/
771	sighand = lock_task_sighand(task: p, flags: &flags);
772	if (!sighand)
773	goto out;
774
775	if (!cpu_timer_queued(ctmr))
776	trigger_base_recalc_expires(timer, tsk: p);
777
778	unlock_task_sighand(task: p, flags: &flags);
779	}
780	out:
781	rcu_read_unlock();
782	if (old)
783	old->it_interval = ns_to_timespec64(nsec: old_incr);
784
785	return ret;
786	}
787
788	static void posix_cpu_timer_get(struct k_itimer timer, struct* itimerspec64 *itp)
789	{
790	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
791	struct cpu_timer *ctmr = &timer->it.cpu;
792	u64 now, expires = cpu_timer_getexpires(ctmr);
793	struct task_struct *p;
794
795	rcu_read_lock();
796	p = cpu_timer_task_rcu(timer);
797	if (!p)
798	goto out;
799
800	/*
801	* Easy part: convert the reload time.
802	*/
803	itp->it_interval = ktime_to_timespec64(timer->it_interval);
804
805	if (!expires)
806	goto out;
807
808	/*
809	* Sample the clock to take the difference with the expiry time.
810	*/
811	if (CPUCLOCK_PERTHREAD(timer->it_clock))
812	now = cpu_clock_sample(clkid, p);
813	else
814	now = cpu_clock_sample_group(clkid, p, start: false);
815
816	if (now < expires) {
817	itp->it_value = ns_to_timespec64(nsec: expires - now);
818	} else {
819	/*
820	* The timer should have expired already, but the firing
821	* hasn't taken place yet. Say it's just about to expire.
822	*/
823	itp->it_value.tv_nsec = `1`;
824	itp->it_value.tv_sec = `0`;
825	}
826	out:
827	rcu_read_unlock();
828	}
829
830	#define MAX_COLLECTED 20
831
832	static u64 collect_timerqueue(struct timerqueue_head *head,
833	struct list_head *firing, u64 now)
834	{
835	struct timerqueue_node *next;
836	int i = `0`;
837
838	while ((next = timerqueue_getnext(head))) {
839	struct cpu_timer *ctmr;
840	u64 expires;
841
842	ctmr = container_of(next, struct cpu_timer, node);
843	expires = cpu_timer_getexpires(ctmr);
844	/ Limit the number of timers to expire at once /
845	if (++i == MAX_COLLECTED \|\| now < expires)
846	return expires;
847
848	ctmr->firing = `1`;
849	/ See posix_cpu_timer_wait_running() /
850	rcu_assign_pointer(ctmr->handling, current);
851	cpu_timer_dequeue(ctmr);
852	list_add_tail(new: &ctmr->elist, head: firing);
853	}
854
855	return U64_MAX;
856	}
857
858	static void collect_posix_cputimers(struct posix_cputimers pct, u64 samples,
859	struct list_head *firing)
860	{
861	struct posix_cputimer_base *base = pct->bases;
862	int i;
863
864	for (i = `0`; i < CPUCLOCK_MAX; i++, base++) {
865	base->nextevt = collect_timerqueue(head: &base->tqhead, firing,
866	now: samples[i]);
867	}
868	}
869
870	static inline void check_dl_overrun(struct task_struct *tsk)
871	{
872	if (tsk->dl.dl_overrun) {
873	tsk->dl.dl_overrun = `0`;
874	send_signal_locked(SIGXCPU, SEND_SIG_PRIV, p: tsk, type: PIDTYPE_TGID);
875	}
876	}
877
878	static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
879	{
880	if (time < limit)
881	return false;
882
883	if (print_fatal_signals) {
884	pr_info("%s Watchdog Timeout (%s): %s[%d]\n",
885	rt ? "RT" : "CPU", hard ? "hard" : "soft",
886	current->comm, task_pid_nr(current));
887	}
888	send_signal_locked(sig: signo, SEND_SIG_PRIV, current, type: PIDTYPE_TGID);
889	return true;
890	}
891
892	/*
893	* Check for any per-thread CPU timers that have fired and move them off
894	* the tsk->cpu_timers[N] list onto the firing list. Here we update the
895	* tsk->it_*_expires values to reflect the remaining thread CPU timers.
896	*/
897	static void check_thread_timers(struct task_struct *tsk,
898	struct list_head *firing)
899	{
900	struct posix_cputimers *pct = &tsk->posix_cputimers;
901	u64 samples[CPUCLOCK_MAX];
902	unsigned long soft;
903
904	if (dl_task(p: tsk))
905	check_dl_overrun(tsk);
906
907	if (expiry_cache_is_inactive(pct))
908	return;
909
910	task_sample_cputime(p: tsk, samples);
911	collect_posix_cputimers(pct, samples, firing);
912
913	/*
914	* Check for the special case thread timers.
915	*/
916	soft = task_rlimit(task: tsk, RLIMIT_RTTIME);
917	if (soft != RLIM_INFINITY) {
918	/ Task RT timeout is accounted in jiffies. RTTIME is usec /
919	unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
920	unsigned long hard = task_rlimit_max(task: tsk, RLIMIT_RTTIME);
921
922	/ At the hard limit, send SIGKILL. No further action. /
923	if (hard != RLIM_INFINITY &&
924	check_rlimit(time: rttime, limit: hard, SIGKILL, rt: true, hard: true))
925	return;
926
927	/ At the soft limit, send a SIGXCPU every second /
928	if (check_rlimit(time: rttime, limit: soft, SIGXCPU, rt: true, hard: false)) {
929	soft += USEC_PER_SEC;
930	tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;
931	}
932	}
933
934	if (expiry_cache_is_inactive(pct))
935	tick_dep_clear_task(tsk, bit: TICK_DEP_BIT_POSIX_TIMER);
936	}
937
938	static inline void stop_process_timers(struct signal_struct *sig)
939	{
940	struct posix_cputimers *pct = &sig->posix_cputimers;
941
942	/ Turn off the active flag. This is done without locking. /
943	WRITE_ONCE(pct->timers_active, false);
944	tick_dep_clear_signal(signal: sig, bit: TICK_DEP_BIT_POSIX_TIMER);
945	}
946
947	static void check_cpu_itimer(struct task_struct tsk, struct* cpu_itimer *it,
948	u64 expires, u64 cur_time, int* signo)
949	{
950	if (!it->expires)
951	return;
952
953	if (cur_time >= it->expires) {
954	if (it->incr)
955	it->expires += it->incr;
956	else
957	it->expires = `0`;
958
959	trace_itimer_expire(which: signo == SIGPROF ?
960	ITIMER_PROF : ITIMER_VIRTUAL,
961	pid: task_tgid(task: tsk), now: cur_time);
962	send_signal_locked(sig: signo, SEND_SIG_PRIV, p: tsk, type: PIDTYPE_TGID);
963	}
964
965	if (it->expires && it->expires < *expires)
966	*expires = it->expires;
967	}
968
969	/*
970	* Check for any per-thread CPU timers that have fired and move them
971	* off the tsk->*_timers list onto the firing list. Per-thread timers
972	* have already been taken off.
973	*/
974	static void check_process_timers(struct task_struct *tsk,
975	struct list_head *firing)
976	{
977	struct signal_struct *const sig = tsk->signal;
978	struct posix_cputimers *pct = &sig->posix_cputimers;
979	u64 samples[CPUCLOCK_MAX];
980	unsigned long soft;
981
982	/*
983	* If there are no active process wide timers (POSIX 1.b, itimers,
984	* RLIMIT_CPU) nothing to check. Also skip the process wide timer
985	* processing when there is already another task handling them.
986	*/
987	if (!READ_ONCE(pct->timers_active) \|\| pct->expiry_active)
988	return;
989
990	/*
991	* Signify that a thread is checking for process timers.
992	* Write access to this field is protected by the sighand lock.
993	*/
994	pct->expiry_active = true;
995
996	/*
997	* Collect the current process totals. Group accounting is active
998	* so the sample can be taken directly.
999	*/
1000	proc_sample_cputime_atomic(at: &sig->cputimer.cputime_atomic, samples);
1001	collect_posix_cputimers(pct, samples, firing);
1002
1003	/*
1004	* Check for the special case process timers.
1005	*/
1006	check_cpu_itimer(tsk, it: &sig->it[CPUCLOCK_PROF],
1007	expires: &pct->bases[CPUCLOCK_PROF].nextevt,
1008	cur_time: samples[CPUCLOCK_PROF], SIGPROF);
1009	check_cpu_itimer(tsk, it: &sig->it[CPUCLOCK_VIRT],
1010	expires: &pct->bases[CPUCLOCK_VIRT].nextevt,
1011	cur_time: samples[CPUCLOCK_VIRT], SIGVTALRM);
1012
1013	soft = task_rlimit(task: tsk, RLIMIT_CPU);
1014	if (soft != RLIM_INFINITY) {
1015	/ RLIMIT_CPU is in seconds. Samples are nanoseconds /
1016	unsigned long hard = task_rlimit_max(task: tsk, RLIMIT_CPU);
1017	u64 ptime = samples[CPUCLOCK_PROF];
1018	u64 softns = (u64)soft * NSEC_PER_SEC;
1019	u64 hardns = (u64)hard * NSEC_PER_SEC;
1020
1021	/ At the hard limit, send SIGKILL. No further action. /
1022	if (hard != RLIM_INFINITY &&
1023	check_rlimit(time: ptime, limit: hardns, SIGKILL, rt: false, hard: true))
1024	return;
1025
1026	/ At the soft limit, send a SIGXCPU every second /
1027	if (check_rlimit(time: ptime, limit: softns, SIGXCPU, rt: false, hard: false)) {
1028	sig->rlim[RLIMIT_CPU].rlim_cur = soft + `1`;
1029	softns += NSEC_PER_SEC;
1030	}
1031
1032	/ Update the expiry cache /
1033	if (softns < pct->bases[CPUCLOCK_PROF].nextevt)
1034	pct->bases[CPUCLOCK_PROF].nextevt = softns;
1035	}
1036
1037	if (expiry_cache_is_inactive(pct))
1038	stop_process_timers(sig);
1039
1040	pct->expiry_active = false;
1041	}
1042
1043	/*
1044	* This is called from the signal code (via posixtimer_rearm)
1045	* when the last timer signal was delivered and we have to reload the timer.
1046	*/
1047	static void posix_cpu_timer_rearm(struct k_itimer *timer)
1048	{
1049	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
1050	struct task_struct *p;
1051	struct sighand_struct *sighand;
1052	unsigned long flags;
1053	u64 now;
1054
1055	rcu_read_lock();
1056	p = cpu_timer_task_rcu(timer);
1057	if (!p)
1058	goto out;
1059
1060	/ Protect timer list r/w in arm_timer() /
1061	sighand = lock_task_sighand(task: p, flags: &flags);
1062	if (unlikely(sighand == NULL))
1063	goto out;
1064
1065	/*
1066	* Fetch the current sample and update the timer's expiry time.
1067	*/
1068	if (CPUCLOCK_PERTHREAD(timer->it_clock))
1069	now = cpu_clock_sample(clkid, p);
1070	else
1071	now = cpu_clock_sample_group(clkid, p, start: true);
1072
1073	bump_cpu_timer(timer, now);
1074
1075	/*
1076	* Now re-arm for the new expiry time.
1077	*/
1078	arm_timer(timer, p);
1079	unlock_task_sighand(task: p, flags: &flags);
1080	out:
1081	rcu_read_unlock();
1082	}
1083
1084	/**
1085	* task_cputimers_expired - Check whether posix CPU timers are expired
1086	*
1087	* @samples: Array of current samples for the CPUCLOCK clocks
1088	* @pct: Pointer to a posix_cputimers container
1089	*
1090	* Returns true if any member of @samples is greater than the corresponding
1091	* member of @pct->bases[CLK].nextevt. False otherwise
1092	*/
1093	static inline bool
1094	task_cputimers_expired(const u64 samples, struct* posix_cputimers *pct)
1095	{
1096	int i;
1097
1098	for (i = `0`; i < CPUCLOCK_MAX; i++) {
1099	if (samples[i] >= pct->bases[i].nextevt)
1100	return true;
1101	}
1102	return false;
1103	}
1104
1105	/**
1106	* fastpath_timer_check - POSIX CPU timers fast path.
1107	*
1108	* @tsk: The task (thread) being checked.
1109	*
1110	* Check the task and thread group timers. If both are zero (there are no
1111	* timers set) return false. Otherwise snapshot the task and thread group
1112	* timers and compare them with the corresponding expiration times. Return
1113	* true if a timer has expired, else return false.
1114	*/
1115	static inline bool fastpath_timer_check(struct task_struct *tsk)
1116	{
1117	struct posix_cputimers *pct = &tsk->posix_cputimers;
1118	struct signal_struct *sig;
1119
1120	if (!expiry_cache_is_inactive(pct)) {
1121	u64 samples[CPUCLOCK_MAX];
1122
1123	task_sample_cputime(p: tsk, samples);
1124	if (task_cputimers_expired(samples, pct))
1125	return true;
1126	}
1127
1128	sig = tsk->signal;
1129	pct = &sig->posix_cputimers;
1130	/*
1131	* Check if thread group timers expired when timers are active and
1132	* no other thread in the group is already handling expiry for
1133	* thread group cputimers. These fields are read without the
1134	* sighand lock. However, this is fine because this is meant to be
1135	* a fastpath heuristic to determine whether we should try to
1136	* acquire the sighand lock to handle timer expiry.
1137	*
1138	* In the worst case scenario, if concurrently timers_active is set
1139	* or expiry_active is cleared, but the current thread doesn't see
1140	* the change yet, the timer checks are delayed until the next
1141	* thread in the group gets a scheduler interrupt to handle the
1142	* timer. This isn't an issue in practice because these types of
1143	* delays with signals actually getting sent are expected.
1144	*/
1145	if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) {
1146	u64 samples[CPUCLOCK_MAX];
1147
1148	proc_sample_cputime_atomic(at: &sig->cputimer.cputime_atomic,
1149	samples);
1150
1151	if (task_cputimers_expired(samples, pct))
1152	return true;
1153	}
1154
1155	if (dl_task(p: tsk) && tsk->dl.dl_overrun)
1156	return true;
1157
1158	return false;
1159	}
1160
1161	static void handle_posix_cpu_timers(struct task_struct *tsk);
1162
1163	#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
1164	static void posix_cpu_timers_work(struct callback_head *work)
1165	{
1166	struct posix_cputimers_work cw = container_of(work, typeof(cw), work);
1167
1168	mutex_lock(&cw->mutex);
1169	handle_posix_cpu_timers(current);
1170	mutex_unlock(lock: &cw->mutex);
1171	}
1172
1173	/*
1174	* Invoked from the posix-timer core when a cancel operation failed because
1175	* the timer is marked firing. The caller holds rcu_read_lock(), which
1176	* protects the timer and the task which is expiring it from being freed.
1177	*/
1178	static void posix_cpu_timer_wait_running(struct k_itimer *timr)
1179	{
1180	struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
1181
1182	/ Has the handling task completed expiry already? /
1183	if (!tsk)
1184	return;
1185
1186	/ Ensure that the task cannot go away /
1187	get_task_struct(t: tsk);
1188	/ Now drop the RCU protection so the mutex can be locked /
1189	rcu_read_unlock();
1190	/ Wait on the expiry mutex /
1191	mutex_lock(&tsk->posix_cputimers_work.mutex);
1192	/ Release it immediately again. /
1193	mutex_unlock(lock: &tsk->posix_cputimers_work.mutex);
1194	/ Drop the task reference. /
1195	put_task_struct(t: tsk);
1196	/ Relock RCU so the callsite is balanced /
1197	rcu_read_lock();
1198	}
1199
1200	static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
1201	{
1202	/ Ensure that timr->it.cpu.handling task cannot go away /
1203	rcu_read_lock();
1204	spin_unlock_irq(lock: &timr->it_lock);
1205	posix_cpu_timer_wait_running(timr);
1206	rcu_read_unlock();
1207	/ @timr is on stack and is valid /
1208	spin_lock_irq(lock: &timr->it_lock);
1209	}
1210
1211	/*
1212	* Clear existing posix CPU timers task work.
1213	*/
1214	void clear_posix_cputimers_work(struct task_struct *p)
1215	{
1216	/*
1217	* A copied work entry from the old task is not meaningful, clear it.
1218	* N.B. init_task_work will not do this.
1219	*/
1220	memset(&p->posix_cputimers_work.work, `0`,
1221	sizeof(p->posix_cputimers_work.work));
1222	init_task_work(twork: &p->posix_cputimers_work.work,
1223	func: posix_cpu_timers_work);
1224	mutex_init(&p->posix_cputimers_work.mutex);
1225	p->posix_cputimers_work.scheduled = false;
1226	}
1227
1228	/*
1229	* Initialize posix CPU timers task work in init task. Out of line to
1230	* keep the callback static and to avoid header recursion hell.
1231	*/
1232	void __init posix_cputimers_init_work(void)
1233	{
1234	clear_posix_cputimers_work(current);
1235	}
1236
1237	/*
1238	* Note: All operations on tsk->posix_cputimer_work.scheduled happen either
1239	* in hard interrupt context or in task context with interrupts
1240	* disabled. Aside of that the writer/reader interaction is always in the
1241	* context of the current task, which means they are strict per CPU.
1242	*/
1243	static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
1244	{
1245	return tsk->posix_cputimers_work.scheduled;
1246	}
1247
1248	static inline void __run_posix_cpu_timers(struct task_struct *tsk)
1249	{
1250	if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
1251	return;
1252
1253	/ Schedule task work to actually expire the timers /
1254	tsk->posix_cputimers_work.scheduled = true;
1255	task_work_add(task: tsk, twork: &tsk->posix_cputimers_work.work, mode: TWA_RESUME);
1256	}
1257
1258	static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
1259	unsigned long start)
1260	{
1261	bool ret = true;
1262
1263	/*
1264	* On !RT kernels interrupts are disabled while collecting expired
1265	* timers, so no tick can happen and the fast path check can be
1266	* reenabled without further checks.
1267	*/
1268	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
1269	tsk->posix_cputimers_work.scheduled = false;
1270	return true;
1271	}
1272
1273	/*
1274	* On RT enabled kernels ticks can happen while the expired timers
1275	* are collected under sighand lock. But any tick which observes
1276	* the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
1277	* checks. So reenabling the tick work has do be done carefully:
1278	*
1279	* Disable interrupts and run the fast path check if jiffies have
1280	* advanced since the collecting of expired timers started. If
1281	* jiffies have not advanced or the fast path check did not find
1282	* newly expired timers, reenable the fast path check in the timer
1283	* interrupt. If there are newly expired timers, return false and
1284	* let the collection loop repeat.
1285	*/
1286	local_irq_disable();
1287	if (start != jiffies && fastpath_timer_check(tsk))
1288	ret = false;
1289	else
1290	tsk->posix_cputimers_work.scheduled = false;
1291	local_irq_enable();
1292
1293	return ret;
1294	}
1295	#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
/*
 * Without task work support the timers are expired right here, bracketed
 * by the posix timer lockdep annotations.
 */
static inline void __run_posix_cpu_timers(struct task_struct *tsk)
{
	lockdep_posixtimer_enter();
	handle_posix_cpu_timers(tsk);
	lockdep_posixtimer_exit();
}
1302
/* Without task work there is nothing to block on: just relax the CPU. */
static void posix_cpu_timer_wait_running(struct k_itimer *timr)
{
	cpu_relax();
}
1307
1308	static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
1309	{
1310	spin_unlock_irq(&timr->it_lock);
1311	cpu_relax();
1312	spin_lock_irq(&timr->it_lock);
1313	}
1314
/* Stub for the !task work case: expiry work is never pending. */
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
	return false;
}
1319
/*
 * Stub for the !task work case: always reports success, so the
 * collection loop in handle_posix_cpu_timers() runs exactly once.
 */
static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
						unsigned long start)
{
	return true;
}
1325	#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
1326
1327	static void handle_posix_cpu_timers(struct task_struct *tsk)
1328	{
1329	struct k_itimer timer, next;
1330	unsigned long flags, start;
1331	LIST_HEAD(firing);
1332
1333	if (!lock_task_sighand(task: tsk, flags: &flags))
1334	return;
1335
1336	do {
1337	/*
1338	* On RT locking sighand lock does not disable interrupts,
1339	* so this needs to be careful vs. ticks. Store the current
1340	* jiffies value.
1341	*/
1342	start = READ_ONCE(jiffies);
1343	barrier();
1344
1345	/*
1346	* Here we take off tsk->signal->cpu_timers[N] and
1347	* tsk->cpu_timers[N] all the timers that are firing, and
1348	* put them on the firing list.
1349	*/
1350	check_thread_timers(tsk, firing: &firing);
1351
1352	check_process_timers(tsk, firing: &firing);
1353
1354	/*
1355	* The above timer checks have updated the expiry cache and
1356	* because nothing can have queued or modified timers after
1357	* sighand lock was taken above it is guaranteed to be
1358	* consistent. So the next timer interrupt fastpath check
1359	* will find valid data.
1360	*
1361	* If timer expiry runs in the timer interrupt context then
1362	* the loop is not relevant as timers will be directly
1363	* expired in interrupt context. The stub function below
1364	* returns always true which allows the compiler to
1365	* optimize the loop out.
1366	*
1367	* If timer expiry is deferred to task work context then
1368	* the following rules apply:
1369	*
1370	* - On !RT kernels no tick can have happened on this CPU
1371	* after sighand lock was acquired because interrupts are
1372	* disabled. So reenabling task work before dropping
1373	* sighand lock and reenabling interrupts is race free.
1374	*
1375	* - On RT kernels ticks might have happened but the tick
1376	* work ignored posix CPU timer handling because the
1377	* CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
1378	* must be done very carefully including a check whether
1379	* ticks have happened since the start of the timer
1380	* expiry checks. posix_cpu_timers_enable_work() takes
1381	* care of that and eventually lets the expiry checks
1382	* run again.
1383	*/
1384	} while (!posix_cpu_timers_enable_work(tsk, start));
1385
1386	/*
1387	* We must release sighand lock before taking any timer's lock.
1388	* There is a potential race with timer deletion here, as the
1389	* siglock now protects our private firing list. We have set
1390	* the firing flag in each timer, so that a deletion attempt
1391	* that gets the timer lock before we do will give it up and
1392	* spin until we've taken care of that timer below.
1393	*/
1394	unlock_task_sighand(task: tsk, flags: &flags);
1395
1396	/*
1397	* Now that all the timers on our list have the firing flag,
1398	* no one will touch their list entries but us. We'll take
1399	* each timer's lock before clearing its firing flag, so no
1400	* timer call will interfere.
1401	*/
1402	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
1403	int cpu_firing;
1404
1405	/*
1406	* spin_lock() is sufficient here even independent of the
1407	* expiry context. If expiry happens in hard interrupt
1408	* context it's obvious. For task work context it's safe
1409	* because all other operations on timer::it_lock happen in
1410	* task context (syscall or exit).
1411	*/
1412	spin_lock(lock: &timer->it_lock);
1413	list_del_init(entry: &timer->it.cpu.elist);
1414	cpu_firing = timer->it.cpu.firing;
1415	timer->it.cpu.firing = `0`;
1416	/*
1417	* The firing flag is -1 if we collided with a reset
1418	* of the timer, which already reported this
1419	* almost-firing as an overrun. So don't generate an event.
1420	*/
1421	if (likely(cpu_firing >= `0`))
1422	cpu_timer_fire(timer);
1423	/ See posix_cpu_timer_wait_running() /
1424	rcu_assign_pointer(timer->it.cpu.handling, NULL);
1425	spin_unlock(lock: &timer->it_lock);
1426	}
1427	}
1428
1429	/*
1430	* This is called from the timer interrupt handler. The irq handler has
1431	* already updated our counts. We need to check if any timers fire now.
1432	* Interrupts are disabled.
1433	*/
1434	void run_posix_cpu_timers(void)
1435	{
1436	struct task_struct *tsk = current;
1437
1438	lockdep_assert_irqs_disabled();
1439
1440	/*
1441	* If the actual expiry is deferred to task work context and the
1442	* work is already scheduled there is no point to do anything here.
1443	*/
1444	if (posix_cpu_timers_work_scheduled(tsk))
1445	return;
1446
1447	/*
1448	* The fast path checks that there are no expired thread or thread
1449	* group timers. If that's so, just return.
1450	*/
1451	if (!fastpath_timer_check(tsk))
1452	return;
1453
1454	__run_posix_cpu_timers(tsk);
1455	}
1456
1457	/*
1458	* Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1459	* The tsk->sighand->siglock must be held by the caller.
1460	*/
1461	void set_process_cpu_timer(struct task_struct tsk, unsigned* int clkid,
1462	u64 newval, u64 oldval)
1463	{
1464	u64 now, *nextevt;
1465
1466	if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED))
1467	return;
1468
1469	nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt;
1470	now = cpu_clock_sample_group(clkid, p: tsk, start: true);
1471
1472	if (oldval) {
1473	/*
1474	* We are setting itimer. The *oldval is absolute and we update
1475	* it to be relative, *newval argument is relative and we update
1476	* it to be absolute.
1477	*/
1478	if (*oldval) {
1479	if (*oldval <= now) {
1480	/ Just about to fire. /
1481	*oldval = TICK_NSEC;
1482	} else {
1483	*oldval -= now;
1484	}
1485	}
1486
1487	if (*newval)
1488	*newval += now;
1489	}
1490
1491	/*
1492	* Update expiration cache if this is the earliest timer. CPUCLOCK_PROF
1493	* expiry cache is also used by RLIMIT_CPU!.
1494	*/
1495	if (newval < nextevt)
1496	nextevt = newval;
1497
1498	tick_dep_set_signal(tsk, bit: TICK_DEP_BIT_POSIX_TIMER);
1499	}
1500
1501	static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1502	const struct timespec64 *rqtp)
1503	{
1504	struct itimerspec64 it;
1505	struct k_itimer timer;
1506	u64 expires;
1507	int error;
1508
1509	/*
1510	* Set up a temporary timer and then wait for it to go off.
1511	*/
1512	memset(&timer, `0`, sizeof timer);
1513	spin_lock_init(&timer.it_lock);
1514	timer.it_clock = which_clock;
1515	timer.it_overrun = -`1`;
1516	error = posix_cpu_timer_create(new_timer: &timer);
1517	timer.it_process = current;
1518
1519	if (!error) {
1520	static struct itimerspec64 zero_it;
1521	struct restart_block *restart;
1522
1523	memset(&it, `0`, sizeof(it));
1524	it.it_value = *rqtp;
1525
1526	spin_lock_irq(lock: &timer.it_lock);
1527	error = posix_cpu_timer_set(timer: &timer, timer_flags: flags, new: &it, NULL);
1528	if (error) {
1529	spin_unlock_irq(lock: &timer.it_lock);
1530	return error;
1531	}
1532
1533	while (!signal_pending(current)) {
1534	if (!cpu_timer_getexpires(ctmr: &timer.it.cpu)) {
1535	/*
1536	* Our timer fired and was reset, below
1537	* deletion can not fail.
1538	*/
1539	posix_cpu_timer_del(timer: &timer);
1540	spin_unlock_irq(lock: &timer.it_lock);
1541	return `0`;
1542	}
1543
1544	/*
1545	* Block until cpu_timer_fire (or a signal) wakes us.
1546	*/
1547	__set_current_state(TASK_INTERRUPTIBLE);
1548	spin_unlock_irq(lock: &timer.it_lock);
1549	schedule();
1550	spin_lock_irq(lock: &timer.it_lock);
1551	}
1552
1553	/*
1554	* We were interrupted by a signal.
1555	*/
1556	expires = cpu_timer_getexpires(ctmr: &timer.it.cpu);
1557	error = posix_cpu_timer_set(timer: &timer, timer_flags: `0`, new: &zero_it, old: &it);
1558	if (!error) {
1559	/ Timer is now unarmed, deletion can not fail. /
1560	posix_cpu_timer_del(timer: &timer);
1561	} else {
1562	while (error == TIMER_RETRY) {
1563	posix_cpu_timer_wait_running_nsleep(timr: &timer);
1564	error = posix_cpu_timer_del(timer: &timer);
1565	}
1566	}
1567
1568	spin_unlock_irq(lock: &timer.it_lock);
1569
1570	if ((it.it_value.tv_sec \| it.it_value.tv_nsec) == `0`) {
1571	/*
1572	* It actually did fire already.
1573	*/
1574	return `0`;
1575	}
1576
1577	error = -ERESTART_RESTARTBLOCK;
1578	/*
1579	* Report back to the user the time still remaining.
1580	*/
1581	restart = &current->restart_block;
1582	restart->nanosleep.expires = expires;
1583	if (restart->nanosleep.type != TT_NONE)
1584	error = nanosleep_copyout(restart, &it.it_value);
1585	}
1586
1587	return error;
1588	}
1589
1590	static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1591
1592	static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1593	const struct timespec64 *rqtp)
1594	{
1595	struct restart_block *restart_block = &current->restart_block;
1596	int error;
1597
1598	/*
1599	* Diagnose required errors first.
1600	*/
1601	if (CPUCLOCK_PERTHREAD(which_clock) &&
1602	(CPUCLOCK_PID(which_clock) == `0` \|\|
1603	CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
1604	return -EINVAL;
1605
1606	error = do_cpu_nanosleep(which_clock, flags, rqtp);
1607
1608	if (error == -ERESTART_RESTARTBLOCK) {
1609
1610	if (flags & TIMER_ABSTIME)
1611	return -ERESTARTNOHAND;
1612
1613	restart_block->nanosleep.clockid = which_clock;
1614	set_restart_fn(restart: restart_block, fn: posix_cpu_nsleep_restart);
1615	}
1616	return error;
1617	}
1618
1619	static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1620	{
1621	clockid_t which_clock = restart_block->nanosleep.clockid;
1622	struct timespec64 t;
1623
1624	t = ns_to_timespec64(nsec: restart_block->nanosleep.expires);
1625
1626	return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, rqtp: &t);
1627	}
1628
/*
 * Fixed clock ids used by the clock_process / clock_thread wrappers below:
 * the CPUCLOCK_SCHED clock, built with a pid argument of 0.
 */
#define PROCESS_CLOCK	make_process_cpuclock(0, CPUCLOCK_SCHED)
#define THREAD_CLOCK	make_thread_cpuclock(0, CPUCLOCK_SCHED)
1631
1632	static int process_cpu_clock_getres(const clockid_t which_clock,
1633	struct timespec64 *tp)
1634	{
1635	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1636	}
1637	static int process_cpu_clock_get(const clockid_t which_clock,
1638	struct timespec64 *tp)
1639	{
1640	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1641	}
1642	static int process_cpu_timer_create(struct k_itimer *timer)
1643	{
1644	timer->it_clock = PROCESS_CLOCK;
1645	return posix_cpu_timer_create(new_timer: timer);
1646	}
1647	static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1648	const struct timespec64 *rqtp)
1649	{
1650	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
1651	}
1652	static int thread_cpu_clock_getres(const clockid_t which_clock,
1653	struct timespec64 *tp)
1654	{
1655	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1656	}
1657	static int thread_cpu_clock_get(const clockid_t which_clock,
1658	struct timespec64 *tp)
1659	{
1660	return posix_cpu_clock_get(THREAD_CLOCK, tp);
1661	}
1662	static int thread_cpu_timer_create(struct k_itimer *timer)
1663	{
1664	timer->it_clock = THREAD_CLOCK;
1665	return posix_cpu_timer_create(new_timer: timer);
1666	}
1667
1668	const struct k_clock clock_posix_cpu = {
1669	.clock_getres = posix_cpu_clock_getres,
1670	.clock_set = posix_cpu_clock_set,
1671	.clock_get_timespec = posix_cpu_clock_get,
1672	.timer_create = posix_cpu_timer_create,
1673	.nsleep = posix_cpu_nsleep,
1674	.timer_set = posix_cpu_timer_set,
1675	.timer_del = posix_cpu_timer_del,
1676	.timer_get = posix_cpu_timer_get,
1677	.timer_rearm = posix_cpu_timer_rearm,
1678	.timer_wait_running = posix_cpu_timer_wait_running,
1679	};
1680
1681	const struct k_clock clock_process = {
1682	.clock_getres = process_cpu_clock_getres,
1683	.clock_get_timespec = process_cpu_clock_get,
1684	.timer_create = process_cpu_timer_create,
1685	.nsleep = process_cpu_nsleep,
1686	};
1687
1688	const struct k_clock clock_thread = {
1689	.clock_getres = thread_cpu_clock_getres,
1690	.clock_get_timespec = thread_cpu_clock_get,
1691	.timer_create = thread_cpu_timer_create,
1692	};
1693

source code of linux/kernel/time/posix-cpu-timers.c