exit.c source code [linux/kernel/exit.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/kernel/exit.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	#include <linux/mm.h>
9	#include <linux/slab.h>
10	#include <linux/sched/autogroup.h>
11	#include <linux/sched/mm.h>
12	#include <linux/sched/stat.h>
13	#include <linux/sched/task.h>
14	#include <linux/sched/task_stack.h>
15	#include <linux/sched/cputime.h>
16	#include <linux/interrupt.h>
17	#include <linux/module.h>
18	#include <linux/capability.h>
19	#include <linux/completion.h>
20	#include <linux/personality.h>
21	#include <linux/tty.h>
22	#include <linux/iocontext.h>
23	#include <linux/key.h>
24	#include <linux/cpu.h>
25	#include <linux/acct.h>
26	#include <linux/tsacct_kern.h>
27	#include <linux/file.h>
28	#include <linux/freezer.h>
29	#include <linux/binfmts.h>
30	#include <linux/nsproxy.h>
31	#include <linux/pid_namespace.h>
32	#include <linux/ptrace.h>
33	#include <linux/profile.h>
34	#include <linux/mount.h>
35	#include <linux/proc_fs.h>
36	#include <linux/kthread.h>
37	#include <linux/mempolicy.h>
38	#include <linux/taskstats_kern.h>
39	#include <linux/delayacct.h>
40	#include <linux/cgroup.h>
41	#include <linux/syscalls.h>
42	#include <linux/signal.h>
43	#include <linux/posix-timers.h>
44	#include <linux/cn_proc.h>
45	#include <linux/mutex.h>
46	#include <linux/futex.h>
47	#include <linux/pipe_fs_i.h>
48	#include <linux/audit.h> /* for audit_free() */
49	#include <linux/resource.h>
50	#include <linux/task_io_accounting_ops.h>
51	#include <linux/blkdev.h>
52	#include <linux/task_work.h>
53	#include <linux/fs_struct.h>
54	#include <linux/init_task.h>
55	#include <linux/perf_event.h>
56	#include <trace/events/sched.h>
57	#include <linux/hw_breakpoint.h>
58	#include <linux/oom.h>
59	#include <linux/writeback.h>
60	#include <linux/shm.h>
61	#include <linux/kcov.h>
62	#include <linux/kmsan.h>
63	#include <linux/random.h>
64	#include <linux/rcuwait.h>
65	#include <linux/compat.h>
66	#include <linux/io_uring.h>
67	#include <linux/kprobes.h>
68	#include <linux/rethook.h>
69	#include <linux/sysfs.h>
70	#include <linux/user_events.h>
71	#include <linux/uaccess.h>
72	#include <linux/pidfs.h>
73
74	#include <uapi/linux/wait.h>
75
76	#include <asm/unistd.h>
77	#include <asm/mmu_context.h>
78
79	#include "exit.h"
80
/*
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
 * overflowing 32-bit refcounts or the ldsem writer count.
 *
 * Read by make_task_dead(); a value of 0 disables the limit check there.
 */
static unsigned int oops_limit = 10000;

#ifdef CONFIG_SYSCTL
/* Single-entry table exposing oops_limit as a 0644 unsigned-int sysctl. */
static const struct ctl_table kern_exit_table[] = {
	{
		.procname       = "oops_limit",
		.data           = &oops_limit,
		.maxlen         = sizeof(oops_limit),
		.mode           = 0644,
		.proc_handler   = proc_douintvec,
	},
};

/* Late initcall: register kern_exit_table under the "kernel" sysctl path. */
static __init int kernel_exit_sysctls_init(void)
{
	register_sysctl_init("kernel", kern_exit_table);
	return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif
106
107	static atomic_t oops_count = ATOMIC_INIT(`0`);
108
109	#ifdef CONFIG_SYSFS
110	static ssize_t oops_count_show(struct kobject kobj, struct* kobj_attribute *attr,
111	char *page)
112	{
113	return sysfs_emit(buf: page, fmt: "%d\n", atomic_read(v: &oops_count));
114	}
115
116	static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
117
118	static __init int kernel_exit_sysfs_init(void)
119	{
120	sysfs_add_file_to_group(kobj: kernel_kobj, attr: &oops_count_attr.attr, NULL);
121	return `0`;
122	}
123	late_initcall(kernel_exit_sysfs_init);
124	#endif
125
126	/*
127	* For things release_task() would like to do after tasklist_lock is released.
128	*/
129	struct release_task_post {
130	struct pid *pids[PIDTYPE_MAX];
131	};
132
133	static void __unhash_process(struct release_task_post post, struct* task_struct *p,
134	bool group_dead)
135	{
136	struct pid *pid = task_pid(task: p);
137
138	nr_threads--;
139
140	detach_pid(pids: post->pids, task: p, PIDTYPE_PID);
141	wake_up_all(&pid->wait_pidfd);
142
143	if (group_dead) {
144	detach_pid(pids: post->pids, task: p, PIDTYPE_TGID);
145	detach_pid(pids: post->pids, task: p, PIDTYPE_PGID);
146	detach_pid(pids: post->pids, task: p, PIDTYPE_SID);
147
148	list_del_rcu(entry: &p->tasks);
149	list_del_init(entry: &p->sibling);
150	__this_cpu_dec(process_counts);
151	}
152	list_del_rcu(entry: &p->thread_node);
153	}
154
155	/*
156	* This function expects the tasklist_lock write-locked.
157	*/
158	static void __exit_signal(struct release_task_post post, struct* task_struct *tsk)
159	{
160	struct signal_struct *sig = tsk->signal;
161	bool group_dead = thread_group_leader(p: tsk);
162	struct sighand_struct *sighand;
163	struct tty_struct *tty;
164	u64 utime, stime;
165
166	sighand = rcu_dereference_check(tsk->sighand,
167	lockdep_tasklist_lock_is_held());
168	spin_lock(lock: &sighand->siglock);
169
170	#ifdef CONFIG_POSIX_TIMERS
171	posix_cpu_timers_exit(task: tsk);
172	if (group_dead)
173	posix_cpu_timers_exit_group(task: tsk);
174	#endif
175
176	if (group_dead) {
177	tty = sig->tty;
178	sig->tty = NULL;
179	} else {
180	/*
181	* If there is any task waiting for the group exit
182	* then notify it:
183	*/
184	if (sig->notify_count > `0` && !--sig->notify_count)
185	wake_up_process(tsk: sig->group_exec_task);
186
187	if (tsk == sig->curr_target)
188	sig->curr_target = next_thread(p: tsk);
189	}
190
191	/*
192	* Accumulate here the counters for all threads as they die. We could
193	* skip the group leader because it is the last user of signal_struct,
194	* but we want to avoid the race with thread_group_cputime() which can
195	* see the empty ->thread_head list.
196	*/
197	task_cputime(t: tsk, utime: &utime, stime: &stime);
198	write_seqlock(sl: &sig->stats_lock);
199	sig->utime += utime;
200	sig->stime += stime;
201	sig->gtime += task_gtime(t: tsk);
202	sig->min_flt += tsk->min_flt;
203	sig->maj_flt += tsk->maj_flt;
204	sig->nvcsw += tsk->nvcsw;
205	sig->nivcsw += tsk->nivcsw;
206	sig->inblock += task_io_get_inblock(p: tsk);
207	sig->oublock += task_io_get_oublock(p: tsk);
208	task_io_accounting_add(dst: &sig->ioac, src: &tsk->ioac);
209	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
210	sig->nr_threads--;
211	__unhash_process(post, p: tsk, group_dead);
212	write_sequnlock(sl: &sig->stats_lock);
213
214	tsk->sighand = NULL;
215	spin_unlock(lock: &sighand->siglock);
216
217	__cleanup_sighand(sighand);
218	if (group_dead)
219	tty_kref_put(tty);
220	}
221
222	static void delayed_put_task_struct(struct rcu_head *rhp)
223	{
224	struct task_struct tsk = container_of(rhp, struct* task_struct, rcu);
225
226	kprobe_flush_task(tsk);
227	rethook_flush_task(tk: tsk);
228	perf_event_delayed_put(task: tsk);
229	trace_sched_process_free(p: tsk);
230	put_task_struct(t: tsk);
231	}
232
233	void put_task_struct_rcu_user(struct task_struct *task)
234	{
235	if (refcount_dec_and_test(r: &task->rcu_users))
236	call_rcu(head: &task->rcu, func: delayed_put_task_struct);
237	}
238
239	void __weak release_thread(struct task_struct *dead_task)
240	{
241	}
242
243	void release_task(struct task_struct *p)
244	{
245	struct release_task_post post;
246	struct task_struct *leader;
247	struct pid *thread_pid;
248	int zap_leader;
249	repeat:
250	memset(&post, `0`, sizeof(post));
251
252	/ don't need to get the RCU readlock here - the process is dead and*
253	* can't be modifying its own credentials. But shut RCU-lockdep up */
254	rcu_read_lock();
255	dec_rlimit_ucounts(task_ucounts(p), type: UCOUNT_RLIMIT_NPROC, v: `1`);
256	rcu_read_unlock();
257
258	pidfs_exit(tsk: p);
259	cgroup_release(p);
260
261	/ Retrieve @thread_pid before __unhash_process() may set it to NULL. /
262	thread_pid = task_pid(task: p);
263
264	write_lock_irq(&tasklist_lock);
265	ptrace_release_task(task: p);
266	__exit_signal(post: &post, tsk: p);
267
268	/*
269	* If we are the last non-leader member of the thread
270	* group, and the leader is zombie, then notify the
271	* group leader's parent process. (if it wants notification.)
272	*/
273	zap_leader = `0`;
274	leader = p->group_leader;
275	if (leader != p && thread_group_empty(p: leader)
276	&& leader->exit_state == EXIT_ZOMBIE) {
277	/ for pidfs_exit() and do_notify_parent() /
278	if (leader->signal->flags & SIGNAL_GROUP_EXIT)
279	leader->exit_code = leader->signal->group_exit_code;
280	/*
281	* If we were the last child thread and the leader has
282	* exited already, and the leader's parent ignores SIGCHLD,
283	* then we are the one who should release the leader.
284	*/
285	zap_leader = do_notify_parent(leader, leader->exit_signal);
286	if (zap_leader)
287	leader->exit_state = EXIT_DEAD;
288	}
289
290	write_unlock_irq(&tasklist_lock);
291	/ @thread_pid can't go away until free_pids() below /
292	proc_flush_pid(thread_pid);
293	add_device_randomness(buf: &p->se.sum_exec_runtime,
294	len: sizeof(p->se.sum_exec_runtime));
295	free_pids(pids: post.pids);
296	release_thread(dead_task: p);
297	/*
298	* This task was already removed from the process/thread/pid lists
299	* and lock_task_sighand(p) can't succeed. Nobody else can touch
300	* ->pending or, if group dead, signal->shared_pending. We can call
301	* flush_sigqueue() lockless.
302	*/
303	flush_sigqueue(queue: &p->pending);
304	if (thread_group_leader(p))
305	flush_sigqueue(queue: &p->signal->shared_pending);
306
307	put_task_struct_rcu_user(task: p);
308
309	p = leader;
310	if (unlikely(zap_leader))
311	goto repeat;
312	}
313
314	int rcuwait_wake_up(struct rcuwait *w)
315	{
316	int ret = `0`;
317	struct task_struct *task;
318
319	rcu_read_lock();
320
321	/*
322	* Order condition vs @task, such that everything prior to the load
323	* of @task is visible. This is the condition as to why the user called
324	* rcuwait_wake() in the first place. Pairs with set_current_state()
325	* barrier (A) in rcuwait_wait_event().
326	*
327	* WAIT WAKE
328	* [S] tsk = current [S] cond = true
329	* MB (A) MB (B)
330	* [L] cond [L] tsk
331	*/
332	smp_mb(); / (B) /
333
334	task = rcu_dereference(w->task);
335	if (task)
336	ret = wake_up_process(tsk: task);
337	rcu_read_unlock();
338
339	return ret;
340	}
341	EXPORT_SYMBOL_GPL(rcuwait_wake_up);
342
343	/*
344	* Determine if a process group is "orphaned", according to the POSIX
345	* definition in 2.2.2.52. Orphaned process groups are not to be affected
346	* by terminal-generated stop signals. Newly orphaned process groups are
347	* to receive a SIGHUP and a SIGCONT.
348	*
349	* "I ask you, have you ever known what it is to be an orphan?"
350	*/
351	static int will_become_orphaned_pgrp(struct pid *pgrp,
352	struct task_struct *ignored_task)
353	{
354	struct task_struct *p;
355
356	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
357	if ((p == ignored_task) \|\|
358	(p->exit_state && thread_group_empty(p)) \|\|
359	is_global_init(tsk: p->real_parent))
360	continue;
361
362	if (task_pgrp(task: p->real_parent) != pgrp &&
363	task_session(task: p->real_parent) == task_session(task: p))
364	return `0`;
365	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
366
367	return `1`;
368	}
369
370	int is_current_pgrp_orphaned(void)
371	{
372	int retval;
373
374	read_lock(&tasklist_lock);
375	retval = will_become_orphaned_pgrp(pgrp: task_pgrp(current), NULL);
376	read_unlock(&tasklist_lock);
377
378	return retval;
379	}
380
381	static bool has_stopped_jobs(struct pid *pgrp)
382	{
383	struct task_struct *p;
384
385	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
386	if (p->signal->flags & SIGNAL_STOP_STOPPED)
387	return true;
388	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
389
390	return false;
391	}
392
393	/*
394	* Check to see if any process groups have become orphaned as
395	* a result of our exiting, and if they have any stopped jobs,
396	* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
397	*/
398	static void
399	kill_orphaned_pgrp(struct task_struct tsk, struct* task_struct *parent)
400	{
401	struct pid *pgrp = task_pgrp(task: tsk);
402	struct task_struct *ignored_task = tsk;
403
404	if (!parent)
405	/ exit: our father is in a different pgrp than*
406	* we are and we were the only connection outside.
407	*/
408	parent = tsk->real_parent;
409	else
410	/ reparent: our child is in a different pgrp than*
411	* we are, and it was the only connection outside.
412	*/
413	ignored_task = NULL;
414
415	if (task_pgrp(task: parent) != pgrp &&
416	task_session(task: parent) == task_session(task: tsk) &&
417	will_become_orphaned_pgrp(pgrp, ignored_task) &&
418	has_stopped_jobs(pgrp)) {
419	__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
420	__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
421	}
422	}
423
424	static void coredump_task_exit(struct task_struct *tsk,
425	struct core_state *core_state)
426	{
427	struct core_thread self;
428
429	self.task = tsk;
430	if (self.task->flags & PF_SIGNALED)
431	self.next = xchg(&core_state->dumper.next, &self);
432	else
433	self.task = NULL;
434	/*
435	* Implies mb(), the result of xchg() must be visible
436	* to core_state->dumper.
437	*/
438	if (atomic_dec_and_test(v: &core_state->nr_threads))
439	complete(&core_state->startup);
440
441	for (;;) {
442	set_current_state(TASK_IDLE\|TASK_FREEZABLE);
443	if (!self.task) / see coredump_finish() /
444	break;
445	schedule();
446	}
447	__set_current_state(TASK_RUNNING);
448	}
449
450	#ifdef CONFIG_MEMCG
451	/ drops tasklist_lock if succeeds /
452	static bool __try_to_set_owner(struct task_struct tsk, struct* mm_struct *mm)
453	{
454	bool ret = false;
455
456	task_lock(p: tsk);
457	if (likely(tsk->mm == mm)) {
458	/ tsk can't pass exit_mm/exec_mmap and exit /
459	read_unlock(&tasklist_lock);
460	WRITE_ONCE(mm->owner, tsk);
461	lru_gen_migrate_mm(mm);
462	ret = true;
463	}
464	task_unlock(p: tsk);
465	return ret;
466	}
467
468	static bool try_to_set_owner(struct task_struct g, struct* mm_struct *mm)
469	{
470	struct task_struct *t;
471
472	for_each_thread(g, t) {
473	struct mm_struct *t_mm = READ_ONCE(t->mm);
474	if (t_mm == mm) {
475	if (__try_to_set_owner(tsk: t, mm))
476	return true;
477	} else if (t_mm)
478	break;
479	}
480
481	return false;
482	}
483
484	/*
485	* A task is exiting. If it owned this mm, find a new owner for the mm.
486	*/
487	void mm_update_next_owner(struct mm_struct *mm)
488	{
489	struct task_struct g, p = current;
490
491	/*
492	* If the exiting or execing task is not the owner, it's
493	* someone else's problem.
494	*/
495	if (mm->owner != p)
496	return;
497	/*
498	* The current owner is exiting/execing and there are no other
499	* candidates. Do not leave the mm pointing to a possibly
500	* freed task structure.
501	*/
502	if (atomic_read(v: &mm->mm_users) <= `1`) {
503	WRITE_ONCE(mm->owner, NULL);
504	return;
505	}
506
507	read_lock(&tasklist_lock);
508	/*
509	* Search in the children
510	*/
511	list_for_each_entry(g, &p->children, sibling) {
512	if (try_to_set_owner(g, mm))
513	goto ret;
514	}
515	/*
516	* Search in the siblings
517	*/
518	list_for_each_entry(g, &p->real_parent->children, sibling) {
519	if (try_to_set_owner(g, mm))
520	goto ret;
521	}
522	/*
523	* Search through everything else, we should not get here often.
524	*/
525	for_each_process(g) {
526	if (atomic_read(v: &mm->mm_users) <= `1`)
527	break;
528	if (g->flags & PF_KTHREAD)
529	continue;
530	if (try_to_set_owner(g, mm))
531	goto ret;
532	}
533	read_unlock(&tasklist_lock);
534	/*
535	* We found no owner yet mm_users > 1: this implies that we are
536	* most likely racing with swapoff (try_to_unuse()) or /proc or
537	* ptrace or page migration (get_task_mm()). Mark owner as NULL.
538	*/
539	WRITE_ONCE(mm->owner, NULL);
540	ret:
541	return;
542
543	}
544	#endif /* CONFIG_MEMCG */
545
546	/*
547	* Turn us into a lazy TLB process if we
548	* aren't already..
549	*/
550	static void exit_mm(void)
551	{
552	struct mm_struct *mm = current->mm;
553
554	exit_mm_release(current, mm);
555	if (!mm)
556	return;
557	mmap_read_lock(mm);
558	mmgrab_lazy_tlb(mm);
559	BUG_ON(mm != current->active_mm);
560	/ more a memory barrier than a real lock /
561	task_lock(current);
562	/*
563	* When a thread stops operating on an address space, the loop
564	* in membarrier_private_expedited() may not observe that
565	* tsk->mm, and the loop in membarrier_global_expedited() may
566	* not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
567	* rq->membarrier_state, so those would not issue an IPI.
568	* Membarrier requires a memory barrier after accessing
569	* user-space memory, before clearing tsk->mm or the
570	* rq->membarrier_state.
571	*/
572	smp_mb__after_spinlock();
573	local_irq_disable();
574	current->mm = NULL;
575	membarrier_update_current_mm(NULL);
576	enter_lazy_tlb(mm, current);
577	local_irq_enable();
578	task_unlock(current);
579	mmap_read_unlock(mm);
580	mm_update_next_owner(mm);
581	mmput(mm);
582	if (test_thread_flag(TIF_MEMDIE))
583	exit_oom_victim();
584	}
585
586	static struct task_struct find_alive_thread(struct* task_struct *p)
587	{
588	struct task_struct *t;
589
590	for_each_thread(p, t) {
591	if (!(t->flags & PF_EXITING))
592	return t;
593	}
594	return NULL;
595	}
596
597	static struct task_struct find_child_reaper(struct* task_struct *father,
598	struct list_head *dead)
599	__releases(&tasklist_lock)
600	__acquires(&tasklist_lock)
601	{
602	struct pid_namespace *pid_ns = task_active_pid_ns(tsk: father);
603	struct task_struct *reaper = pid_ns->child_reaper;
604	struct task_struct p, n;
605
606	if (likely(reaper != father))
607	return reaper;
608
609	reaper = find_alive_thread(p: father);
610	if (reaper) {
611	pid_ns->child_reaper = reaper;
612	return reaper;
613	}
614
615	write_unlock_irq(&tasklist_lock);
616
617	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
618	list_del_init(entry: &p->ptrace_entry);
619	release_task(p);
620	}
621
622	zap_pid_ns_processes(pid_ns);
623	write_lock_irq(&tasklist_lock);
624
625	return father;
626	}
627
628	/*
629	* When we die, we re-parent all our children, and try to:
630	* 1. give them to another thread in our thread group, if such a member exists
631	* 2. give it to the first ancestor process which prctl'd itself as a
632	* child_subreaper for its children (like a service manager)
633	* 3. give it to the init process (PID 1) in our pid namespace
634	*/
635	static struct task_struct find_new_reaper(struct* task_struct *father,
636	struct task_struct *child_reaper)
637	{
638	struct task_struct thread, reaper;
639
640	thread = find_alive_thread(p: father);
641	if (thread)
642	return thread;
643
644	if (father->signal->has_child_subreaper) {
645	unsigned int ns_level = task_pid(task: father)->level;
646	/*
647	* Find the first ->is_child_subreaper ancestor in our pid_ns.
648	* We can't check reaper != child_reaper to ensure we do not
649	* cross the namespaces, the exiting parent could be injected
650	* by setns() + fork().
651	* We check pid->level, this is slightly more efficient than
652	* task_active_pid_ns(reaper) != task_active_pid_ns(father).
653	*/
654	for (reaper = father->real_parent;
655	task_pid(task: reaper)->level == ns_level;
656	reaper = reaper->real_parent) {
657	if (reaper == &init_task)
658	break;
659	if (!reaper->signal->is_child_subreaper)
660	continue;
661	thread = find_alive_thread(p: reaper);
662	if (thread)
663	return thread;
664	}
665	}
666
667	return child_reaper;
668	}
669
670	/*
671	* Any that need to be release_task'd are put on the @dead list.
672	*/
673	static void reparent_leader(struct task_struct father, struct* task_struct *p,
674	struct list_head *dead)
675	{
676	if (unlikely(p->exit_state == EXIT_DEAD))
677	return;
678
679	/ We don't want people slaying init. /
680	p->exit_signal = SIGCHLD;
681
682	/ If it has exited notify the new parent about this child's death. /
683	if (!p->ptrace &&
684	p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
685	if (do_notify_parent(p, p->exit_signal)) {
686	p->exit_state = EXIT_DEAD;
687	list_add(new: &p->ptrace_entry, head: dead);
688	}
689	}
690
691	kill_orphaned_pgrp(tsk: p, parent: father);
692	}
693
694	/*
695	* This does two things:
696	*
697	* A. Make init inherit all the child processes
698	* B. Check to see if any process groups have become orphaned
699	* as a result of our exiting, and if they have any stopped
700	* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
701	*/
702	static void forget_original_parent(struct task_struct *father,
703	struct list_head *dead)
704	{
705	struct task_struct p, t, *reaper;
706
707	if (unlikely(!list_empty(&father->ptraced)))
708	exit_ptrace(tracer: father, dead);
709
710	/ Can drop and reacquire tasklist_lock /
711	reaper = find_child_reaper(father, dead);
712	if (list_empty(head: &father->children))
713	return;
714
715	reaper = find_new_reaper(father, child_reaper: reaper);
716	list_for_each_entry(p, &father->children, sibling) {
717	for_each_thread(p, t) {
718	RCU_INIT_POINTER(t->real_parent, reaper);
719	BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
720	if (likely(!t->ptrace))
721	t->parent = t->real_parent;
722	if (t->pdeath_signal)
723	group_send_sig_info(sig: t->pdeath_signal,
724	SEND_SIG_NOINFO, p: t,
725	type: PIDTYPE_TGID);
726	}
727	/*
728	* If this is a threaded reparent there is no need to
729	* notify anyone anything has happened.
730	*/
731	if (!same_thread_group(p1: reaper, p2: father))
732	reparent_leader(father, p, dead);
733	}
734	list_splice_tail_init(list: &father->children, head: &reaper->children);
735	}
736
737	/*
738	* Send signals to all our closest relatives so that they know
739	* to properly mourn us..
740	*/
741	static void exit_notify(struct task_struct tsk, int* group_dead)
742	{
743	bool autoreap;
744	struct task_struct p, n;
745	LIST_HEAD(dead);
746
747	write_lock_irq(&tasklist_lock);
748	forget_original_parent(father: tsk, dead: &dead);
749
750	if (group_dead)
751	kill_orphaned_pgrp(tsk: tsk->group_leader, NULL);
752
753	tsk->exit_state = EXIT_ZOMBIE;
754
755	if (unlikely(tsk->ptrace)) {
756	int sig = thread_group_leader(p: tsk) &&
757	thread_group_empty(p: tsk) &&
758	!ptrace_reparented(child: tsk) ?
759	tsk->exit_signal : SIGCHLD;
760	autoreap = do_notify_parent(tsk, sig);
761	} else if (thread_group_leader(p: tsk)) {
762	autoreap = thread_group_empty(p: tsk) &&
763	do_notify_parent(tsk, tsk->exit_signal);
764	} else {
765	autoreap = true;
766	/ untraced sub-thread /
767	do_notify_pidfd(task: tsk);
768	}
769
770	if (autoreap) {
771	tsk->exit_state = EXIT_DEAD;
772	list_add(new: &tsk->ptrace_entry, head: &dead);
773	}
774
775	/ mt-exec, de_thread() is waiting for group leader /
776	if (unlikely(tsk->signal->notify_count < `0`))
777	wake_up_process(tsk: tsk->signal->group_exec_task);
778	write_unlock_irq(&tasklist_lock);
779
780	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
781	list_del_init(entry: &p->ptrace_entry);
782	release_task(p);
783	}
784	}
785
786	#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Return the number of bytes between end_of_stack(@p) and the first
 * non-zero word found when scanning away from it (direction depends on
 * CONFIG_STACK_GROWSUP). The first word is stepped over unconditionally.
 */
unsigned long stack_not_used(struct task_struct *p)
{
	unsigned long *n = end_of_stack(p);

	do {	/* Skip over canary */
# ifdef CONFIG_STACK_GROWSUP
		n--;
# else
		n++;
# endif
	} while (!*n);

# ifdef CONFIG_STACK_GROWSUP
	return (unsigned long)end_of_stack(p) - (unsigned long)n;
# else
	return (unsigned long)n - (unsigned long)end_of_stack(p);
# endif
}
805
/*
 * Count the maximum pages reached in kernel stacks
 *
 * @used_stack: bytes of stack used; bumps the vm event counter of the
 * smallest power-of-two bucket (1K .. 64K, else REST) that fits it.
 * Buckets larger than THREAD_SIZE are compiled out, and the whole body
 * is empty without CONFIG_VM_EVENT_COUNTERS.
 */
static inline void kstack_histogram(unsigned long used_stack)
{
#ifdef CONFIG_VM_EVENT_COUNTERS
	if (used_stack <= 1024)
		count_vm_event(KSTACK_1K);
#if THREAD_SIZE > 1024
	else if (used_stack <= 2048)
		count_vm_event(KSTACK_2K);
#endif
#if THREAD_SIZE > 2048
	else if (used_stack <= 4096)
		count_vm_event(KSTACK_4K);
#endif
#if THREAD_SIZE > 4096
	else if (used_stack <= 8192)
		count_vm_event(KSTACK_8K);
#endif
#if THREAD_SIZE > 8192
	else if (used_stack <= 16384)
		count_vm_event(KSTACK_16K);
#endif
#if THREAD_SIZE > 16384
	else if (used_stack <= 32768)
		count_vm_event(KSTACK_32K);
#endif
#if THREAD_SIZE > 32768
	else if (used_stack <= 65536)
		count_vm_event(KSTACK_64K);
#endif
#if THREAD_SIZE > 65536
	else
		count_vm_event(KSTACK_REST);
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS */
}
842
843	static void check_stack_usage(void)
844	{
845	static DEFINE_SPINLOCK(low_water_lock);
846	static int lowest_to_date = THREAD_SIZE;
847	unsigned long free;
848
849	free = stack_not_used(current);
850	kstack_histogram(THREAD_SIZE - free);
851
852	if (free >= lowest_to_date)
853	return;
854
855	spin_lock(lock: &low_water_lock);
856	if (free < lowest_to_date) {
857	pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
858	current->comm, task_pid_nr(current), free);
859	lowest_to_date = free;
860	}
861	spin_unlock(lock: &low_water_lock);
862	}
863	#else
/* No-op stand-in used when CONFIG_DEBUG_STACK_USAGE is not set. */
static inline void check_stack_usage(void) {}
865	#endif
866
867	static void synchronize_group_exit(struct task_struct tsk, long* code)
868	{
869	struct sighand_struct *sighand = tsk->sighand;
870	struct signal_struct *signal = tsk->signal;
871	struct core_state *core_state;
872
873	spin_lock_irq(lock: &sighand->siglock);
874	signal->quick_threads--;
875	if ((signal->quick_threads == `0`) &&
876	!(signal->flags & SIGNAL_GROUP_EXIT)) {
877	signal->flags = SIGNAL_GROUP_EXIT;
878	signal->group_exit_code = code;
879	signal->group_stop_count = `0`;
880	}
881	/*
882	* Serialize with any possible pending coredump.
883	* We must hold siglock around checking core_state
884	* and setting PF_POSTCOREDUMP. The core-inducing thread
885	* will increment ->nr_threads for each thread in the
886	* group without PF_POSTCOREDUMP set.
887	*/
888	tsk->flags \|= PF_POSTCOREDUMP;
889	core_state = signal->core_state;
890	spin_unlock_irq(lock: &sighand->siglock);
891
892	if (unlikely(core_state))
893	coredump_task_exit(tsk, core_state);
894	}
895
896	void __noreturn do_exit(long code)
897	{
898	struct task_struct *tsk = current;
899	int group_dead;
900
901	WARN_ON(irqs_disabled());
902	WARN_ON(tsk->plug);
903
904	kcov_task_exit(t: tsk);
905	kmsan_task_exit(task: tsk);
906
907	synchronize_group_exit(tsk, code);
908	ptrace_event(PTRACE_EVENT_EXIT, message: code);
909	user_events_exit(t: tsk);
910
911	io_uring_files_cancel();
912	exit_signals(tsk); / sets PF_EXITING /
913
914	seccomp_filter_release(tsk);
915
916	acct_update_integrals(tsk);
917	group_dead = atomic_dec_and_test(v: &tsk->signal->live);
918	if (group_dead) {
919	/*
920	* If the last thread of global init has exited, panic
921	* immediately to get a useable coredump.
922	*/
923	if (unlikely(is_global_init(tsk)))
924	panic(fmt: "Attempted to kill init! exitcode=0x%08x\n",
925	tsk->signal->group_exit_code ?: (int)code);
926
927	#ifdef CONFIG_POSIX_TIMERS
928	hrtimer_cancel(timer: &tsk->signal->real_timer);
929	exit_itimers(tsk);
930	#endif
931	if (tsk->mm)
932	setmax_mm_hiwater_rss(maxrss: &tsk->signal->maxrss, mm: tsk->mm);
933	}
934	acct_collect(exitcode: code, group_dead);
935	if (group_dead)
936	tty_audit_exit();
937	audit_free(task: tsk);
938
939	tsk->exit_code = code;
940	taskstats_exit(tsk, group_dead);
941	trace_sched_process_exit(p: tsk, group_dead);
942
943	exit_mm();
944
945	if (group_dead)
946	acct_process();
947
948	exit_sem(tsk);
949	exit_shm(task: tsk);
950	exit_files(tsk);
951	exit_fs(tsk);
952	if (group_dead)
953	disassociate_ctty(priv: `1`);
954	exit_task_namespaces(tsk);
955	exit_task_work(task: tsk);
956	exit_thread(tsk);
957
958	/*
959	* Flush inherited counters to the parent - before the parent
960	* gets woken up by child-exit notifications.
961	*
962	* because of cgroup mode, must be called before cgroup_exit()
963	*/
964	perf_event_exit_task(child: tsk);
965
966	sched_autogroup_exit_task(p: tsk);
967	cgroup_exit(p: tsk);
968
969	/*
970	* FIXME: do that only when needed, using sched_exit tracepoint
971	*/
972	flush_ptrace_hw_breakpoint(tsk);
973
974	exit_tasks_rcu_start();
975	exit_notify(tsk, group_dead);
976	proc_exit_connector(task: tsk);
977	mpol_put_task_policy(tsk);
978	#ifdef CONFIG_FUTEX
979	if (unlikely(current->pi_state_cache))
980	kfree(current->pi_state_cache);
981	#endif
982	/*
983	* Make sure we are holding no locks:
984	*/
985	debug_check_no_locks_held();
986
987	if (tsk->io_context)
988	exit_io_context(task: tsk);
989
990	if (tsk->splice_pipe)
991	free_pipe_info(tsk->splice_pipe);
992
993	if (tsk->task_frag.page)
994	put_page(page: tsk->task_frag.page);
995
996	exit_task_stack_account(tsk);
997
998	check_stack_usage();
999	preempt_disable();
1000	if (tsk->nr_dirtied)
1001	__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1002	exit_rcu();
1003	exit_tasks_rcu_finish();
1004
1005	lockdep_free_task(task: tsk);
1006	do_task_dead();
1007	}
1008
1009	void __noreturn make_task_dead(int signr)
1010	{
1011	/*
1012	* Take the task off the cpu after something catastrophic has
1013	* happened.
1014	*
1015	* We can get here from a kernel oops, sometimes with preemption off.
1016	* Start by checking for critical errors.
1017	* Then fix up important state like USER_DS and preemption.
1018	* Then do everything else.
1019	*/
1020	struct task_struct *tsk = current;
1021	unsigned int limit;
1022
1023	if (unlikely(in_interrupt()))
1024	panic(fmt: "Aiee, killing interrupt handler!");
1025	if (unlikely(!tsk->pid))
1026	panic(fmt: "Attempted to kill the idle task!");
1027
1028	if (unlikely(irqs_disabled())) {
1029	pr_info("note: %s[%d] exited with irqs disabled\n",
1030	current->comm, task_pid_nr(current));
1031	local_irq_enable();
1032	}
1033	if (unlikely(in_atomic())) {
1034	pr_info("note: %s[%d] exited with preempt_count %d\n",
1035	current->comm, task_pid_nr(current),
1036	preempt_count());
1037	preempt_count_set(PREEMPT_ENABLED);
1038	}
1039
1040	/*
1041	* Every time the system oopses, if the oops happens while a reference
1042	* to an object was held, the reference leaks.
1043	* If the oops doesn't also leak memory, repeated oopsing can cause
1044	* reference counters to wrap around (if they're not using refcount_t).
1045	* This means that repeated oopsing can make unexploitable-looking bugs
1046	* exploitable through repeated oopsing.
1047	* To make sure this can't happen, place an upper bound on how often the
1048	* kernel may oops without panic().
1049	*/
1050	limit = READ_ONCE(oops_limit);
1051	if (atomic_inc_return(v: &oops_count) >= limit && limit)
1052	panic(fmt: "Oopsed too often (kernel.oops_limit is %d)", limit);
1053
1054	/*
1055	* We're taking recursive faults here in make_task_dead. Safest is to just
1056	* leave this task alone and wait for reboot.
1057	*/
1058	if (unlikely(tsk->flags & PF_EXITING)) {
1059	pr_alert("Fixing recursive fault but reboot is needed!\n");
1060	futex_exit_recursive(tsk);
1061	tsk->exit_state = EXIT_DEAD;
1062	refcount_inc(r: &tsk->rcu_users);
1063	do_task_dead();
1064	}
1065
1066	do_exit(code: signr);
1067	}
1068
1069	SYSCALL_DEFINE1(exit, int, error_code)
1070	{
1071	do_exit(code: (error_code&`0xff`)<<`8`);
1072	}
1073
1074	/*
1075	* Take down every thread in the group. This is called by fatal signals
1076	* as well as by sys_exit_group (below).
1077	*/
1078	void __noreturn
1079	do_group_exit(int exit_code)
1080	{
1081	struct signal_struct *sig = current->signal;
1082
1083	if (sig->flags & SIGNAL_GROUP_EXIT)
1084	exit_code = sig->group_exit_code;
1085	else if (sig->group_exec_task)
1086	exit_code = `0`;
1087	else {
1088	struct sighand_struct *const sighand = current->sighand;
1089
1090	spin_lock_irq(lock: &sighand->siglock);
1091	if (sig->flags & SIGNAL_GROUP_EXIT)
1092	/ Another thread got here before we took the lock. /
1093	exit_code = sig->group_exit_code;
1094	else if (sig->group_exec_task)
1095	exit_code = `0`;
1096	else {
1097	sig->group_exit_code = exit_code;
1098	sig->flags = SIGNAL_GROUP_EXIT;
1099	zap_other_threads(current);
1100	}
1101	spin_unlock_irq(lock: &sighand->siglock);
1102	}
1103
1104	do_exit(code: exit_code);
1105	/ NOTREACHED /
1106	}
1107
1108	/*
1109	* this kills every thread in the thread group. Note that any externally
1110	* wait4()-ing process will get the correct exit code - even if this
1111	* thread is not the thread group leader.
1112	*/
1113	SYSCALL_DEFINE1(exit_group, int, error_code)
1114	{
1115	do_group_exit(exit_code: (error_code & `0xff`) << `8`);
1116	/ NOTREACHED /
1117	return `0`;
1118	}
1119
1120	static int eligible_pid(struct wait_opts wo, struct* task_struct *p)
1121	{
1122	return wo->wo_type == PIDTYPE_MAX \|\|
1123	task_pid_type(task: p, type: wo->wo_type) == wo->wo_pid;
1124	}
1125
1126	static int
1127	eligible_child(struct wait_opts wo, bool ptrace, struct* task_struct *p)
1128	{
1129	if (!eligible_pid(wo, p))
1130	return `0`;
1131
1132	/*
1133	* Wait for all children (clone and not) if __WALL is set or
1134	* if it is traced by us.
1135	*/
1136	if (ptrace \|\| (wo->wo_flags & __WALL))
1137	return `1`;
1138
1139	/*
1140	* Otherwise, wait for clone children only if __WCLONE is set;
1141	* otherwise, wait for non-clone children only.
1142	*
1143	* Note: a "clone" child here is one that reports to its parent
1144	* using a signal other than SIGCHLD, or a non-leader thread which
1145	* we can only see if it is traced by us.
1146	*/
1147	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1148	return `0`;
1149
1150	return `1`;
1151	}
1152
1153	/*
1154	* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1155	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1156	* the lock and this task is uninteresting. If we return nonzero, we have
1157	* released the lock and the system call should return.
1158	*/
1159	static int wait_task_zombie(struct wait_opts wo, struct* task_struct *p)
1160	{
1161	int state, status;
1162	pid_t pid = task_pid_vnr(tsk: p);
1163	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1164	struct waitid_info *infop;
1165
1166	if (!likely(wo->wo_flags & WEXITED))
1167	return `0`;
1168
1169	if (unlikely(wo->wo_flags & WNOWAIT)) {
1170	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1171	? p->signal->group_exit_code : p->exit_code;
1172	get_task_struct(t: p);
1173	read_unlock(&tasklist_lock);
1174	sched_annotate_sleep();
1175	if (wo->wo_rusage)
1176	getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1177	put_task_struct(t: p);
1178	goto out_info;
1179	}
1180	/*
1181	* Move the task's state to DEAD/TRACE, only one thread can do this.
1182	*/
1183	state = (ptrace_reparented(child: p) && thread_group_leader(p)) ?
1184	EXIT_TRACE : EXIT_DEAD;
1185	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1186	return `0`;
1187	/*
1188	* We own this thread, nobody else can reap it.
1189	*/
1190	read_unlock(&tasklist_lock);
1191	sched_annotate_sleep();
1192
1193	/*
1194	* Check thread_group_leader() to exclude the traced sub-threads.
1195	*/
1196	if (state == EXIT_DEAD && thread_group_leader(p)) {
1197	struct signal_struct *sig = p->signal;
1198	struct signal_struct *psig = current->signal;
1199	unsigned long maxrss;
1200	u64 tgutime, tgstime;
1201
1202	/*
1203	* The resource counters for the group leader are in its
1204	* own task_struct. Those for dead threads in the group
1205	* are in its signal_struct, as are those for the child
1206	* processes it has previously reaped. All these
1207	* accumulate in the parent's signal_struct c* fields.
1208	*
1209	* We don't bother to take a lock here to protect these
1210	* p->signal fields because the whole thread group is dead
1211	* and nobody can change them.
1212	*
1213	* psig->stats_lock also protects us from our sub-threads
1214	* which can reap other children at the same time.
1215	*
1216	* We use thread_group_cputime_adjusted() to get times for
1217	* the thread group, which consolidates times for all threads
1218	* in the group including the group leader.
1219	*/
1220	thread_group_cputime_adjusted(p, ut: &tgutime, st: &tgstime);
1221	write_seqlock_irq(sl: &psig->stats_lock);
1222	psig->cutime += tgutime + sig->cutime;
1223	psig->cstime += tgstime + sig->cstime;
1224	psig->cgtime += task_gtime(t: p) + sig->gtime + sig->cgtime;
1225	psig->cmin_flt +=
1226	p->min_flt + sig->min_flt + sig->cmin_flt;
1227	psig->cmaj_flt +=
1228	p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1229	psig->cnvcsw +=
1230	p->nvcsw + sig->nvcsw + sig->cnvcsw;
1231	psig->cnivcsw +=
1232	p->nivcsw + sig->nivcsw + sig->cnivcsw;
1233	psig->cinblock +=
1234	task_io_get_inblock(p) +
1235	sig->inblock + sig->cinblock;
1236	psig->coublock +=
1237	task_io_get_oublock(p) +
1238	sig->oublock + sig->coublock;
1239	maxrss = max(sig->maxrss, sig->cmaxrss);
1240	if (psig->cmaxrss < maxrss)
1241	psig->cmaxrss = maxrss;
1242	task_io_accounting_add(dst: &psig->ioac, src: &p->ioac);
1243	task_io_accounting_add(dst: &psig->ioac, src: &sig->ioac);
1244	write_sequnlock_irq(sl: &psig->stats_lock);
1245	}
1246
1247	if (wo->wo_rusage)
1248	getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1249	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1250	? p->signal->group_exit_code : p->exit_code;
1251	wo->wo_stat = status;
1252
1253	if (state == EXIT_TRACE) {
1254	write_lock_irq(&tasklist_lock);
1255	/ We dropped tasklist, ptracer could die and untrace /
1256	ptrace_unlink(child: p);
1257
1258	/ If parent wants a zombie, don't release it now /
1259	state = EXIT_ZOMBIE;
1260	if (do_notify_parent(p, p->exit_signal))
1261	state = EXIT_DEAD;
1262	p->exit_state = state;
1263	write_unlock_irq(&tasklist_lock);
1264	}
1265	if (state == EXIT_DEAD)
1266	release_task(p);
1267
1268	out_info:
1269	infop = wo->wo_info;
1270	if (infop) {
1271	if ((status & `0x7f`) == `0`) {
1272	infop->cause = CLD_EXITED;
1273	infop->status = status >> `8`;
1274	} else {
1275	infop->cause = (status & `0x80`) ? CLD_DUMPED : CLD_KILLED;
1276	infop->status = status & `0x7f`;
1277	}
1278	infop->pid = pid;
1279	infop->uid = uid;
1280	}
1281
1282	return pid;
1283	}
1284
1285	static int task_stopped_code(struct* task_struct *p, bool ptrace)
1286	{
1287	if (ptrace) {
1288	if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1289	return &p->exit_code;
1290	} else {
1291	if (p->signal->flags & SIGNAL_STOP_STOPPED)
1292	return &p->signal->group_exit_code;
1293	}
1294	return NULL;
1295	}
1296
1297	/**
1298	* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1299	* @wo: wait options
1300	* @ptrace: is the wait for ptrace
1301	* @p: task to wait for
1302	*
1303	* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1304	*
1305	* CONTEXT:
1306	* read_lock(&tasklist_lock), which is released if return value is
1307	* non-zero. Also, grabs and releases @p->sighand->siglock.
1308	*
1309	* RETURNS:
1310	* 0 if wait condition didn't exist and search for other wait conditions
1311	* should continue. Non-zero return, -errno on failure and @p's pid on
1312	* success, implies that tasklist_lock is released and wait condition
1313	* search should terminate.
1314	*/
1315	static int wait_task_stopped(struct wait_opts *wo,
1316	int ptrace, struct task_struct *p)
1317	{
1318	struct waitid_info *infop;
1319	int exit_code, *p_code, why;
1320	uid_t uid = `0`; / unneeded, required by compiler /
1321	pid_t pid;
1322
1323	/*
1324	* Traditionally we see ptrace'd stopped tasks regardless of options.
1325	*/
1326	if (!ptrace && !(wo->wo_flags & WUNTRACED))
1327	return `0`;
1328
1329	if (!task_stopped_code(p, ptrace))
1330	return `0`;
1331
1332	exit_code = `0`;
1333	spin_lock_irq(lock: &p->sighand->siglock);
1334
1335	p_code = task_stopped_code(p, ptrace);
1336	if (unlikely(!p_code))
1337	goto unlock_sig;
1338
1339	exit_code = *p_code;
1340	if (!exit_code)
1341	goto unlock_sig;
1342
1343	if (!unlikely(wo->wo_flags & WNOWAIT))
1344	*p_code = `0`;
1345
1346	uid = from_kuid_munged(current_user_ns(), task_uid(p));
1347	unlock_sig:
1348	spin_unlock_irq(lock: &p->sighand->siglock);
1349	if (!exit_code)
1350	return `0`;
1351
1352	/*
1353	* Now we are pretty sure this task is interesting.
1354	* Make sure it doesn't get reaped out from under us while we
1355	* give up the lock and then examine it below. We don't want to
1356	* keep holding onto the tasklist_lock while we call getrusage and
1357	* possibly take page faults for user memory.
1358	*/
1359	get_task_struct(t: p);
1360	pid = task_pid_vnr(tsk: p);
1361	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1362	read_unlock(&tasklist_lock);
1363	sched_annotate_sleep();
1364	if (wo->wo_rusage)
1365	getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1366	put_task_struct(t: p);
1367
1368	if (likely(!(wo->wo_flags & WNOWAIT)))
1369	wo->wo_stat = (exit_code << `8`) \| `0x7f`;
1370
1371	infop = wo->wo_info;
1372	if (infop) {
1373	infop->cause = why;
1374	infop->status = exit_code;
1375	infop->pid = pid;
1376	infop->uid = uid;
1377	}
1378	return pid;
1379	}
1380
1381	/*
1382	* Handle do_wait work for one task in a live, non-stopped state.
1383	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1384	* the lock and this task is uninteresting. If we return nonzero, we have
1385	* released the lock and the system call should return.
1386	*/
1387	static int wait_task_continued(struct wait_opts wo, struct* task_struct *p)
1388	{
1389	struct waitid_info *infop;
1390	pid_t pid;
1391	uid_t uid;
1392
1393	if (!unlikely(wo->wo_flags & WCONTINUED))
1394	return `0`;
1395
1396	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1397	return `0`;
1398
1399	spin_lock_irq(lock: &p->sighand->siglock);
1400	/ Re-check with the lock held. /
1401	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1402	spin_unlock_irq(lock: &p->sighand->siglock);
1403	return `0`;
1404	}
1405	if (!unlikely(wo->wo_flags & WNOWAIT))
1406	p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1407	uid = from_kuid_munged(current_user_ns(), task_uid(p));
1408	spin_unlock_irq(lock: &p->sighand->siglock);
1409
1410	pid = task_pid_vnr(tsk: p);
1411	get_task_struct(t: p);
1412	read_unlock(&tasklist_lock);
1413	sched_annotate_sleep();
1414	if (wo->wo_rusage)
1415	getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1416	put_task_struct(t: p);
1417
1418	infop = wo->wo_info;
1419	if (!infop) {
1420	wo->wo_stat = `0xffff`;
1421	} else {
1422	infop->cause = CLD_CONTINUED;
1423	infop->pid = pid;
1424	infop->uid = uid;
1425	infop->status = SIGCONT;
1426	}
1427	return pid;
1428	}
1429
1430	/*
1431	* Consider @p for a wait by @parent.
1432	*
1433	* -ECHILD should be in ->notask_error before the first call.
1434	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
1435	* Returns zero if the search for a child should continue;
1436	* then ->notask_error is 0 if @p is an eligible child,
1437	* or still -ECHILD.
1438	*/
1439	static int wait_consider_task(struct wait_opts wo, int* ptrace,
1440	struct task_struct *p)
1441	{
1442	/*
1443	* We can race with wait_task_zombie() from another thread.
1444	* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1445	* can't confuse the checks below.
1446	*/
1447	int exit_state = READ_ONCE(p->exit_state);
1448	int ret;
1449
1450	if (unlikely(exit_state == EXIT_DEAD))
1451	return `0`;
1452
1453	ret = eligible_child(wo, ptrace, p);
1454	if (!ret)
1455	return ret;
1456
1457	if (unlikely(exit_state == EXIT_TRACE)) {
1458	/*
1459	* ptrace == 0 means we are the natural parent. In this case
1460	* we should clear notask_error, debugger will notify us.
1461	*/
1462	if (likely(!ptrace))
1463	wo->notask_error = `0`;
1464	return `0`;
1465	}
1466
1467	if (likely(!ptrace) && unlikely(p->ptrace)) {
1468	/*
1469	* If it is traced by its real parent's group, just pretend
1470	* the caller is ptrace_do_wait() and reap this child if it
1471	* is zombie.
1472	*
1473	* This also hides group stop state from real parent; otherwise
1474	* a single stop can be reported twice as group and ptrace stop.
1475	* If a ptracer wants to distinguish these two events for its
1476	* own children it should create a separate process which takes
1477	* the role of real parent.
1478	*/
1479	if (!ptrace_reparented(child: p))
1480	ptrace = `1`;
1481	}
1482
1483	/ slay zombie? /
1484	if (exit_state == EXIT_ZOMBIE) {
1485	/ we don't reap group leaders with subthreads /
1486	if (!delay_group_leader(p)) {
1487	/*
1488	* A zombie ptracee is only visible to its ptracer.
1489	* Notification and reaping will be cascaded to the
1490	* real parent when the ptracer detaches.
1491	*/
1492	if (unlikely(ptrace) \|\| likely(!p->ptrace))
1493	return wait_task_zombie(wo, p);
1494	}
1495
1496	/*
1497	* Allow access to stopped/continued state via zombie by
1498	* falling through. Clearing of notask_error is complex.
1499	*
1500	* When !@ptrace:
1501	*
1502	* If WEXITED is set, notask_error should naturally be
1503	* cleared. If not, subset of WSTOPPED\|WCONTINUED is set,
1504	* so, if there are live subthreads, there are events to
1505	* wait for. If all subthreads are dead, it's still safe
1506	* to clear - this function will be called again in finite
1507	* amount time once all the subthreads are released and
1508	* will then return without clearing.
1509	*
1510	* When @ptrace:
1511	*
1512	* Stopped state is per-task and thus can't change once the
1513	* target task dies. Only continued and exited can happen.
1514	* Clear notask_error if WCONTINUED \| WEXITED.
1515	*/
1516	if (likely(!ptrace) \|\| (wo->wo_flags & (WCONTINUED \| WEXITED)))
1517	wo->notask_error = `0`;
1518	} else {
1519	/*
1520	* @p is alive and it's gonna stop, continue or exit, so
1521	* there always is something to wait for.
1522	*/
1523	wo->notask_error = `0`;
1524	}
1525
1526	/*
1527	* Wait for stopped. Depending on @ptrace, different stopped state
1528	* is used and the two don't interact with each other.
1529	*/
1530	ret = wait_task_stopped(wo, ptrace, p);
1531	if (ret)
1532	return ret;
1533
1534	/*
1535	* Wait for continued. There's only one continued state and the
1536	* ptracer can consume it which can confuse the real parent. Don't
1537	* use WCONTINUED from ptracer. You don't need or want it.
1538	*/
1539	return wait_task_continued(wo, p);
1540	}
1541
1542	/*
1543	* Do the work of do_wait() for one thread in the group, @tsk.
1544	*
1545	* -ECHILD should be in ->notask_error before the first call.
1546	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
1547	* Returns zero if the search for a child should continue; then
1548	* ->notask_error is 0 if there were any eligible children,
1549	* or still -ECHILD.
1550	*/
1551	static int do_wait_thread(struct wait_opts wo, struct* task_struct *tsk)
1552	{
1553	struct task_struct *p;
1554
1555	list_for_each_entry(p, &tsk->children, sibling) {
1556	int ret = wait_consider_task(wo, ptrace: `0`, p);
1557
1558	if (ret)
1559	return ret;
1560	}
1561
1562	return `0`;
1563	}
1564
1565	static int ptrace_do_wait(struct wait_opts wo, struct* task_struct *tsk)
1566	{
1567	struct task_struct *p;
1568
1569	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1570	int ret = wait_consider_task(wo, ptrace: `1`, p);
1571
1572	if (ret)
1573	return ret;
1574	}
1575
1576	return `0`;
1577	}
1578
1579	bool pid_child_should_wake(struct wait_opts wo, struct* task_struct *p)
1580	{
1581	if (!eligible_pid(wo, p))
1582	return false;
1583
1584	if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
1585	return false;
1586
1587	return true;
1588	}
1589
1590	static int child_wait_callback(wait_queue_entry_t wait, unsigned* mode,
1591	int sync, void *key)
1592	{
1593	struct wait_opts wo = container_of(wait, struct* wait_opts,
1594	child_wait);
1595	struct task_struct *p = key;
1596
1597	if (pid_child_should_wake(wo, p))
1598	return default_wake_function(wq_entry: wait, mode, flags: sync, key);
1599
1600	return `0`;
1601	}
1602
1603	void __wake_up_parent(struct task_struct p, struct* task_struct *parent)
1604	{
1605	__wake_up_sync_key(wq_head: &parent->signal->wait_chldexit,
1606	TASK_INTERRUPTIBLE, key: p);
1607	}
1608
1609	static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
1610	struct task_struct *target)
1611	{
1612	struct task_struct *parent =
1613	!ptrace ? target->real_parent : target->parent;
1614
1615	return current == parent \|\| (!(wo->wo_flags & __WNOTHREAD) &&
1616	same_thread_group(current, p2: parent));
1617	}
1618
1619	/*
1620	* Optimization for waiting on PIDTYPE_PID. No need to iterate through child
1621	* and tracee lists to find the target task.
1622	*/
1623	static int do_wait_pid(struct wait_opts *wo)
1624	{
1625	bool ptrace;
1626	struct task_struct *target;
1627	int retval;
1628
1629	ptrace = false;
1630	target = pid_task(pid: wo->wo_pid, PIDTYPE_TGID);
1631	if (target && is_effectively_child(wo, ptrace, target)) {
1632	retval = wait_consider_task(wo, ptrace, p: target);
1633	if (retval)
1634	return retval;
1635	}
1636
1637	ptrace = true;
1638	target = pid_task(pid: wo->wo_pid, PIDTYPE_PID);
1639	if (target && target->ptrace &&
1640	is_effectively_child(wo, ptrace, target)) {
1641	retval = wait_consider_task(wo, ptrace, p: target);
1642	if (retval)
1643	return retval;
1644	}
1645
1646	return `0`;
1647	}
1648
1649	long __do_wait(struct wait_opts *wo)
1650	{
1651	long retval;
1652
1653	/*
1654	* If there is nothing that can match our criteria, just get out.
1655	* We will clear ->notask_error to zero if we see any child that
1656	* might later match our criteria, even if we are not able to reap
1657	* it yet.
1658	*/
1659	wo->notask_error = -ECHILD;
1660	if ((wo->wo_type < PIDTYPE_MAX) &&
1661	(!wo->wo_pid \|\| !pid_has_task(pid: wo->wo_pid, type: wo->wo_type)))
1662	goto notask;
1663
1664	read_lock(&tasklist_lock);
1665
1666	if (wo->wo_type == PIDTYPE_PID) {
1667	retval = do_wait_pid(wo);
1668	if (retval)
1669	return retval;
1670	} else {
1671	struct task_struct *tsk = current;
1672
1673	do {
1674	retval = do_wait_thread(wo, tsk);
1675	if (retval)
1676	return retval;
1677
1678	retval = ptrace_do_wait(wo, tsk);
1679	if (retval)
1680	return retval;
1681
1682	if (wo->wo_flags & __WNOTHREAD)
1683	break;
1684	} while_each_thread(current, tsk);
1685	}
1686	read_unlock(&tasklist_lock);
1687
1688	notask:
1689	retval = wo->notask_error;
1690	if (!retval && !(wo->wo_flags & WNOHANG))
1691	return -ERESTARTSYS;
1692
1693	return retval;
1694	}
1695
1696	static long do_wait(struct wait_opts *wo)
1697	{
1698	int retval;
1699
1700	trace_sched_process_wait(pid: wo->wo_pid);
1701
1702	init_waitqueue_func_entry(wq_entry: &wo->child_wait, func: child_wait_callback);
1703	wo->child_wait.private = current;
1704	add_wait_queue(wq_head: &current->signal->wait_chldexit, wq_entry: &wo->child_wait);
1705
1706	do {
1707	set_current_state(TASK_INTERRUPTIBLE);
1708	retval = __do_wait(wo);
1709	if (retval != -ERESTARTSYS)
1710	break;
1711	if (signal_pending(current))
1712	break;
1713	schedule();
1714	} while (`1`);
1715
1716	__set_current_state(TASK_RUNNING);
1717	remove_wait_queue(wq_head: &current->signal->wait_chldexit, wq_entry: &wo->child_wait);
1718	return retval;
1719	}
1720
1721	int kernel_waitid_prepare(struct wait_opts wo, int* which, pid_t upid,
1722	struct waitid_info infop, int* options,
1723	struct rusage *ru)
1724	{
1725	unsigned int f_flags = `0`;
1726	struct pid *pid = NULL;
1727	enum pid_type type;
1728
1729	if (options & ~(WNOHANG\|WNOWAIT\|WEXITED\|WSTOPPED\|WCONTINUED\|
1730	__WNOTHREAD\|__WCLONE\|__WALL))
1731	return -EINVAL;
1732	if (!(options & (WEXITED\|WSTOPPED\|WCONTINUED)))
1733	return -EINVAL;
1734
1735	switch (which) {
1736	case P_ALL:
1737	type = PIDTYPE_MAX;
1738	break;
1739	case P_PID:
1740	type = PIDTYPE_PID;
1741	if (upid <= `0`)
1742	return -EINVAL;
1743
1744	pid = find_get_pid(nr: upid);
1745	break;
1746	case P_PGID:
1747	type = PIDTYPE_PGID;
1748	if (upid < `0`)
1749	return -EINVAL;
1750
1751	if (upid)
1752	pid = find_get_pid(nr: upid);
1753	else
1754	pid = get_task_pid(current, type: PIDTYPE_PGID);
1755	break;
1756	case P_PIDFD:
1757	type = PIDTYPE_PID;
1758	if (upid < `0`)
1759	return -EINVAL;
1760
1761	pid = pidfd_get_pid(fd: upid, flags: &f_flags);
1762	if (IS_ERR(ptr: pid))
1763	return PTR_ERR(ptr: pid);
1764
1765	break;
1766	default:
1767	return -EINVAL;
1768	}
1769
1770	wo->wo_type = type;
1771	wo->wo_pid = pid;
1772	wo->wo_flags = options;
1773	wo->wo_info = infop;
1774	wo->wo_rusage = ru;
1775	if (f_flags & O_NONBLOCK)
1776	wo->wo_flags \|= WNOHANG;
1777
1778	return `0`;
1779	}
1780
1781	static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1782	int options, struct rusage *ru)
1783	{
1784	struct wait_opts wo;
1785	long ret;
1786
1787	ret = kernel_waitid_prepare(wo: &wo, which, upid, infop, options, ru);
1788	if (ret)
1789	return ret;
1790
1791	ret = do_wait(wo: &wo);
1792	if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
1793	ret = -EAGAIN;
1794
1795	put_pid(pid: wo.wo_pid);
1796	return ret;
1797	}
1798
1799	SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1800	infop, int, options, struct rusage __user *, ru)
1801	{
1802	struct rusage r;
1803	struct waitid_info info = {.status = `0`};
1804	long err = kernel_waitid(which, upid, infop: &info, options, ru: ru ? &r : NULL);
1805	int signo = `0`;
1806
1807	if (err > `0`) {
1808	signo = SIGCHLD;
1809	err = `0`;
1810	if (ru && copy_to_user(to: ru, from: &r, n: sizeof(struct rusage)))
1811	return -EFAULT;
1812	}
1813	if (!infop)
1814	return err;
1815
1816	if (!user_write_access_begin(infop, sizeof(*infop)))
1817	return -EFAULT;
1818
1819	unsafe_put_user(signo, &infop->si_signo, Efault);
1820	unsafe_put_user(`0`, &infop->si_errno, Efault);
1821	unsafe_put_user(info.cause, &infop->si_code, Efault);
1822	unsafe_put_user(info.pid, &infop->si_pid, Efault);
1823	unsafe_put_user(info.uid, &infop->si_uid, Efault);
1824	unsafe_put_user(info.status, &infop->si_status, Efault);
1825	user_write_access_end();
1826	return err;
1827	Efault:
1828	user_write_access_end();
1829	return -EFAULT;
1830	}
1831
1832	long kernel_wait4(pid_t upid, int __user stat_addr, int* options,
1833	struct rusage *ru)
1834	{
1835	struct wait_opts wo;
1836	struct pid *pid = NULL;
1837	enum pid_type type;
1838	long ret;
1839
1840	if (options & ~(WNOHANG\|WUNTRACED\|WCONTINUED\|
1841	__WNOTHREAD\|__WCLONE\|__WALL))
1842	return -EINVAL;
1843
1844	/ -INT_MIN is not defined /
1845	if (upid == INT_MIN)
1846	return -ESRCH;
1847
1848	if (upid == -`1`)
1849	type = PIDTYPE_MAX;
1850	else if (upid < `0`) {
1851	type = PIDTYPE_PGID;
1852	pid = find_get_pid(nr: -upid);
1853	} else if (upid == `0`) {
1854	type = PIDTYPE_PGID;
1855	pid = get_task_pid(current, type: PIDTYPE_PGID);
1856	} else / upid > 0 / {
1857	type = PIDTYPE_PID;
1858	pid = find_get_pid(nr: upid);
1859	}
1860
1861	wo.wo_type = type;
1862	wo.wo_pid = pid;
1863	wo.wo_flags = options \| WEXITED;
1864	wo.wo_info = NULL;
1865	wo.wo_stat = `0`;
1866	wo.wo_rusage = ru;
1867	ret = do_wait(wo: &wo);
1868	put_pid(pid);
1869	if (ret > `0` && stat_addr && put_user(wo.wo_stat, stat_addr))
1870	ret = -EFAULT;
1871
1872	return ret;
1873	}
1874
1875	int kernel_wait(pid_t pid, int *stat)
1876	{
1877	struct wait_opts wo = {
1878	.wo_type = PIDTYPE_PID,
1879	.wo_pid = find_get_pid(nr: pid),
1880	.wo_flags = WEXITED,
1881	};
1882	int ret;
1883
1884	ret = do_wait(wo: &wo);
1885	if (ret > `0` && wo.wo_stat)
1886	*stat = wo.wo_stat;
1887	put_pid(pid: wo.wo_pid);
1888	return ret;
1889	}
1890
1891	SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1892	int, options, struct rusage __user *, ru)
1893	{
1894	struct rusage r;
1895	long err = kernel_wait4(upid, stat_addr, options, ru: ru ? &r : NULL);
1896
1897	if (err > `0`) {
1898	if (ru && copy_to_user(to: ru, from: &r, n: sizeof(struct rusage)))
1899	return -EFAULT;
1900	}
1901	return err;
1902	}
1903
1904	#ifdef __ARCH_WANT_SYS_WAITPID
1905
1906	/*
1907	* sys_waitpid() remains for compatibility. waitpid() should be
1908	* implemented by calling sys_wait4() from libc.a.
1909	*/
1910	SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user , stat_addr, int*, options)
1911	{
1912	return kernel_wait4(upid: pid, stat_addr, options, NULL);
1913	}
1914
1915	#endif
1916
1917	#ifdef CONFIG_COMPAT
1918	COMPAT_SYSCALL_DEFINE4(wait4,
1919	compat_pid_t, pid,
1920	compat_uint_t __user *, stat_addr,
1921	int, options,
1922	struct compat_rusage __user *, ru)
1923	{
1924	struct rusage r;
1925	long err = kernel_wait4(upid: pid, stat_addr, options, ru: ru ? &r : NULL);
1926	if (err > `0`) {
1927	if (ru && put_compat_rusage(&r, ru))
1928	return -EFAULT;
1929	}
1930	return err;
1931	}
1932
1933	COMPAT_SYSCALL_DEFINE5(waitid,
1934	int, which, compat_pid_t, pid,
1935	struct compat_siginfo __user , infop, int*, options,
1936	struct compat_rusage __user *, uru)
1937	{
1938	struct rusage ru;
1939	struct waitid_info info = {.status = `0`};
1940	long err = kernel_waitid(which, upid: pid, infop: &info, options, ru: uru ? &ru : NULL);
1941	int signo = `0`;
1942	if (err > `0`) {
1943	signo = SIGCHLD;
1944	err = `0`;
1945	if (uru) {
1946	/ kernel_waitid() overwrites everything in ru /
1947	if (COMPAT_USE_64BIT_TIME)
1948	err = copy_to_user(to: uru, from: &ru, n: sizeof(ru));
1949	else
1950	err = put_compat_rusage(&ru, uru);
1951	if (err)
1952	return -EFAULT;
1953	}
1954	}
1955
1956	if (!infop)
1957	return err;
1958
1959	if (!user_write_access_begin(infop, sizeof(*infop)))
1960	return -EFAULT;
1961
1962	unsafe_put_user(signo, &infop->si_signo, Efault);
1963	unsafe_put_user(`0`, &infop->si_errno, Efault);
1964	unsafe_put_user(info.cause, &infop->si_code, Efault);
1965	unsafe_put_user(info.pid, &infop->si_pid, Efault);
1966	unsafe_put_user(info.uid, &infop->si_uid, Efault);
1967	unsafe_put_user(info.status, &infop->si_status, Efault);
1968	user_write_access_end();
1969	return err;
1970	Efault:
1971	user_write_access_end();
1972	return -EFAULT;
1973	}
1974	#endif
1975
1976	/*
1977	* This needs to be __function_aligned as GCC implicitly makes any
1978	* implementation of abort() cold and drops alignment specified by
1979	* -falign-functions=N.
1980	*
1981	* See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
1982	*/
1983	__weak __function_aligned void abort(void)
1984	{
1985	BUG();
1986
1987	/ if that doesn't kill us, halt /
1988	panic(fmt: "Oops failed to kill thread");
1989	}
1990	EXPORT_SYMBOL(abort);
1991

source code of linux/kernel/exit.c