// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
#include <linux/uaccess.h>
#include <linux/pidfs.h>

#include <uapi/linux/wait.h>

#include <asm/unistd.h>
#include <asm/mmu_context.h>

#include "exit.h"

/*
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
 * overflowing 32-bit refcounts or the ldsem writer count.
 */
static unsigned int oops_limit = 10000;

#ifdef CONFIG_SYSCTL
static const struct ctl_table kern_exit_table[] = {
	{
		.procname	= "oops_limit",
		.data		= &oops_limit,
		.maxlen		= sizeof(oops_limit),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,
	},
};

static __init int kernel_exit_sysctls_init(void)
{
	register_sysctl_init("kernel", kern_exit_table);
	return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif

static atomic_t oops_count = ATOMIC_INIT(0);

#ifdef CONFIG_SYSFS
static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
			       char *page)
{
	return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
}

static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);

static __init int kernel_exit_sysfs_init(void)
{
	sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
	return 0;
}
late_initcall(kernel_exit_sysfs_init);
#endif

/*
 * For things release_task() would like to do *after* tasklist_lock is released.
 */
struct release_task_post {
	struct pid *pids[PIDTYPE_MAX];
};

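/*
 * Detach @p from the pid hashes and, for the last thread in the group, from
 * the global task and sibling lists.  Called with tasklist_lock write-locked
 * and @p->sighand->siglock held; the detached struct pid references are
 * stashed in @post so they can be freed after tasklist_lock is dropped.
 */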
static void __unhash_process(struct release_task_post *post, struct task_struct *p,
			     bool group_dead)
{
	struct pid *pid = task_pid(p);

	nr_threads--;

	detach_pid(post->pids, p, PIDTYPE_PID);
	wake_up_all(&pid->wait_pidfd);

	if (group_dead) {
		detach_pid(post->pids, p, PIDTYPE_TGID);
		detach_pid(post->pids, p, PIDTYPE_PGID);
		detach_pid(post->pids, p, PIDTYPE_SID);

		list_del_rcu(&p->tasks);
		list_del_init(&p->sibling);
		__this_cpu_dec(process_counts);
	}
	list_del_rcu(&p->thread_node);
}

/*
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	bool group_dead = thread_group_leader(tsk);
	struct sighand_struct *sighand;
	struct tty_struct *tty;
	u64 utime, stime;

	sighand = rcu_dereference_check(tsk->sighand,
					lockdep_tasklist_lock_is_held());
	spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
	posix_cpu_timers_exit(tsk);
	if (group_dead)
		posix_cpu_timers_exit_group(tsk);
#endif

	if (group_dead) {
		tty = sig->tty;
		sig->tty = NULL;
	} else {
		/*
		 * If there is any task waiting for the group exit
		 * then notify it:
		 */
		if (sig->notify_count > 0 && !--sig->notify_count)
			wake_up_process(sig->group_exec_task);

		if (tsk == sig->curr_target)
			sig->curr_target = next_thread(tsk);
	}

	/*
	 * Accumulate here the counters for all threads as they die. We could
	 * skip the group leader because it is the last user of signal_struct,
	 * but we want to avoid the race with thread_group_cputime() which can
	 * see the empty ->thread_head list.
	 */
	task_cputime(tsk, &utime, &stime);
	write_seqlock(&sig->stats_lock);
	sig->utime += utime;
	sig->stime += stime;
	sig->gtime += task_gtime(tsk);
	sig->min_flt += tsk->min_flt;
	sig->maj_flt += tsk->maj_flt;
	sig->nvcsw += tsk->nvcsw;
	sig->nivcsw += tsk->nivcsw;
	sig->inblock += task_io_get_inblock(tsk);
	sig->oublock += task_io_get_oublock(tsk);
	task_io_accounting_add(&sig->ioac, &tsk->ioac);
	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
	sig->nr_threads--;
	__unhash_process(post, tsk, group_dead);
	write_sequnlock(&sig->stats_lock);

	tsk->sighand = NULL;
	spin_unlock(&sighand->siglock);

	__cleanup_sighand(sighand);
	if (group_dead)
		tty_kref_put(tty);
}

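/*
 * Final RCU callback for a task: runs one grace period after the last
 * rcu_users reference has been dropped, flushes per-task kprobe/rethook
 * instances and delayed perf state, then drops the final task_struct
 * reference.
 */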
static void delayed_put_task_struct(struct rcu_head *rhp)
{
	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

	kprobe_flush_task(tsk);
	rethook_flush_task(tsk);
	perf_event_delayed_put(tsk);
	trace_sched_process_free(tsk);
	put_task_struct(tsk);
}

void put_task_struct_rcu_user(struct task_struct *task)
{
	if (refcount_dec_and_test(&task->rcu_users))
		call_rcu(&task->rcu, delayed_put_task_struct);
}

void __weak release_thread(struct task_struct *dead_task)
{
}

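/*
 * Reap a dead task: unhash it, notify/reap a zombie group leader if this was
 * the last non-leader thread, and drop the pid and task references.  Called
 * by the parent when it reaps the child, or on behalf of the exiting task
 * itself when it auto-reaps (see exit_notify()).
 */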
void release_task(struct task_struct *p)
{
	struct release_task_post post;
	struct task_struct *leader;
	struct pid *thread_pid;
	int zap_leader;
repeat:
	memset(&post, 0, sizeof(post));

	/* don't need to get the RCU readlock here - the process is dead and
	 * can't be modifying its own credentials. But shut RCU-lockdep up */
	rcu_read_lock();
	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
	rcu_read_unlock();

	pidfs_exit(p);
	cgroup_release(p);

	/* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
	thread_pid = task_pid(p);

	write_lock_irq(&tasklist_lock);
	ptrace_release_task(p);
	__exit_signal(&post, p);

	/*
	 * If we are the last non-leader member of the thread
	 * group, and the leader is zombie, then notify the
	 * group leader's parent process. (if it wants notification.)
	 */
	zap_leader = 0;
	leader = p->group_leader;
	if (leader != p && thread_group_empty(leader)
			&& leader->exit_state == EXIT_ZOMBIE) {
		/* for pidfs_exit() and do_notify_parent() */
		if (leader->signal->flags & SIGNAL_GROUP_EXIT)
			leader->exit_code = leader->signal->group_exit_code;
		/*
		 * If we were the last child thread and the leader has
		 * exited already, and the leader's parent ignores SIGCHLD,
		 * then we are the one who should release the leader.
		 */
		zap_leader = do_notify_parent(leader, leader->exit_signal);
		if (zap_leader)
			leader->exit_state = EXIT_DEAD;
	}

	write_unlock_irq(&tasklist_lock);
	/* @thread_pid can't go away until free_pids() below */
	proc_flush_pid(thread_pid);
	add_device_randomness(&p->se.sum_exec_runtime,
			      sizeof(p->se.sum_exec_runtime));
	free_pids(post.pids);
	release_thread(p);
	/*
	 * This task was already removed from the process/thread/pid lists
	 * and lock_task_sighand(p) can't succeed. Nobody else can touch
	 * ->pending or, if group dead, signal->shared_pending. We can call
	 * flush_sigqueue() lockless.
	 */
	flush_sigqueue(&p->pending);
	if (thread_group_leader(p))
		flush_sigqueue(&p->signal->shared_pending);

	put_task_struct_rcu_user(p);

	p = leader;
	if (unlikely(zap_leader))
		goto repeat;
}

int rcuwait_wake_up(struct rcuwait *w)
{
	int ret = 0;
	struct task_struct *task;

	rcu_read_lock();

	/*
	 * Order condition vs @task, such that everything prior to the load
	 * of @task is visible. This is the condition as to why the user called
	 * rcuwait_wake() in the first place. Pairs with set_current_state()
	 * barrier (A) in rcuwait_wait_event().
	 *
	 *    WAIT                WAKE
	 *    [S] tsk = current   [S] cond = true
	 *        MB (A)              MB (B)
	 *    [L] cond            [L] tsk
	 */
	smp_mb(); /* (B) */

	task = rcu_dereference(w->task);
	if (task)
		ret = wake_up_process(task);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
				     struct task_struct *ignored_task)
{
	struct task_struct *p;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		if ((p == ignored_task) ||
		    (p->exit_state && thread_group_empty(p)) ||
		    is_global_init(p->real_parent))
			continue;

		if (task_pgrp(p->real_parent) != pgrp &&
		    task_session(p->real_parent) == task_session(p))
			return 0;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return 1;
}

int is_current_pgrp_orphaned(void)
{
	int retval;

	read_lock(&tasklist_lock);
	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
	read_unlock(&tasklist_lock);

	return retval;
}

static bool has_stopped_jobs(struct pid *pgrp)
{
	struct task_struct *p;

	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		if (p->signal->flags & SIGNAL_STOP_STOPPED)
			return true;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

	return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
	struct pid *pgrp = task_pgrp(tsk);
	struct task_struct *ignored_task = tsk;

	if (!parent)
		/* exit: our father is in a different pgrp than
		 * we are and we were the only connection outside.
		 */
		parent = tsk->real_parent;
	else
		/* reparent: our child is in a different pgrp than
		 * we are, and it was the only connection outside.
		 */
		ignored_task = NULL;

	if (task_pgrp(parent) != pgrp &&
	    task_session(parent) == task_session(tsk) &&
	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
	    has_stopped_jobs(pgrp)) {
		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
	}
}

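/*
 * A thread that exits while its group is dumping core parks here: it joins
 * the dumper's thread list (unless it was not signalled for the dump), wakes
 * the core-inducing thread once the last participant has checked in, and
 * then sleeps until coredump_finish() clears self.task.
 */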
static void coredump_task_exit(struct task_struct *tsk,
			       struct core_state *core_state)
{
	struct core_thread self;

	self.task = tsk;
	if (self.task->flags & PF_SIGNALED)
		self.next = xchg(&core_state->dumper.next, &self);
	else
		self.task = NULL;
	/*
	 * Implies mb(), the result of xchg() must be visible
	 * to core_state->dumper.
	 */
	if (atomic_dec_and_test(&core_state->nr_threads))
		complete(&core_state->startup);

	for (;;) {
		set_current_state(TASK_IDLE|TASK_FREEZABLE);
		if (!self.task) /* see coredump_finish() */
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

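/*
 * With CONFIG_MEMCG, mm->owner is used to map an mm back to a task (and so
 * to its memory cgroup).  The helpers below pick a new owner when the
 * current one exits or execs while other users of the mm remain.
 */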
#ifdef CONFIG_MEMCG
/* drops tasklist_lock if succeeds */
static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm)
{
	bool ret = false;

	task_lock(tsk);
	if (likely(tsk->mm == mm)) {
		/* tsk can't pass exit_mm/exec_mmap and exit */
		read_unlock(&tasklist_lock);
		WRITE_ONCE(mm->owner, tsk);
		lru_gen_migrate_mm(mm);
		ret = true;
	}
	task_unlock(tsk);
	return ret;
}

static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm)
{
	struct task_struct *t;

	for_each_thread(g, t) {
		struct mm_struct *t_mm = READ_ONCE(t->mm);
		if (t_mm == mm) {
			if (__try_to_set_owner(t, mm))
				return true;
		} else if (t_mm)
			break;
	}

	return false;
}

/*
 * A task is exiting.  If it owned this mm, find a new owner for the mm.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
	struct task_struct *g, *p = current;

	/*
	 * If the exiting or execing task is not the owner, it's
	 * someone else's problem.
	 */
	if (mm->owner != p)
		return;
	/*
	 * The current owner is exiting/execing and there are no other
	 * candidates.  Do not leave the mm pointing to a possibly
	 * freed task structure.
	 */
	if (atomic_read(&mm->mm_users) <= 1) {
		WRITE_ONCE(mm->owner, NULL);
		return;
	}

	read_lock(&tasklist_lock);
	/*
	 * Search in the children
	 */
	list_for_each_entry(g, &p->children, sibling) {
		if (try_to_set_owner(g, mm))
			goto ret;
	}
	/*
	 * Search in the siblings
	 */
	list_for_each_entry(g, &p->real_parent->children, sibling) {
		if (try_to_set_owner(g, mm))
			goto ret;
	}
	/*
	 * Search through everything else, we should not get here often.
	 */
	for_each_process(g) {
		if (atomic_read(&mm->mm_users) <= 1)
			break;
		if (g->flags & PF_KTHREAD)
			continue;
		if (try_to_set_owner(g, mm))
			goto ret;
	}
	read_unlock(&tasklist_lock);
	/*
	 * We found no owner yet mm_users > 1: this implies that we are
	 * most likely racing with swapoff (try_to_unuse()) or /proc or
	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
	 */
	WRITE_ONCE(mm->owner, NULL);
ret:
	return;
}
#endif /* CONFIG_MEMCG */

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 */
static void exit_mm(void)
{
	struct mm_struct *mm = current->mm;

	exit_mm_release(current, mm);
	if (!mm)
		return;
	mmap_read_lock(mm);
	mmgrab_lazy_tlb(mm);
	BUG_ON(mm != current->active_mm);
	/* more a memory barrier than a real lock */
	task_lock(current);
	/*
	 * When a thread stops operating on an address space, the loop
	 * in membarrier_private_expedited() may not observe that
	 * tsk->mm, and the loop in membarrier_global_expedited() may
	 * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
	 * rq->membarrier_state, so those would not issue an IPI.
	 * Membarrier requires a memory barrier after accessing
	 * user-space memory, before clearing tsk->mm or the
	 * rq->membarrier_state.
	 */
	smp_mb__after_spinlock();
	local_irq_disable();
	current->mm = NULL;
	membarrier_update_current_mm(NULL);
	enter_lazy_tlb(mm, current);
	local_irq_enable();
	task_unlock(current);
	mmap_read_unlock(mm);
	mm_update_next_owner(mm);
	mmput(mm);
	if (test_thread_flag(TIF_MEMDIE))
		exit_oom_victim();
}

static struct task_struct *find_alive_thread(struct task_struct *p)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		if (!(t->flags & PF_EXITING))
			return t;
	}
	return NULL;
}

static struct task_struct *find_child_reaper(struct task_struct *father,
					     struct list_head *dead)
	__releases(&tasklist_lock)
	__acquires(&tasklist_lock)
{
	struct pid_namespace *pid_ns = task_active_pid_ns(father);
	struct task_struct *reaper = pid_ns->child_reaper;
	struct task_struct *p, *n;

	if (likely(reaper != father))
		return reaper;

	reaper = find_alive_thread(father);
	if (reaper) {
		pid_ns->child_reaper = reaper;
		return reaper;
	}

	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}

	zap_pid_ns_processes(pid_ns);
	write_lock_irq(&tasklist_lock);

	return father;
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give it to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 */
static struct task_struct *find_new_reaper(struct task_struct *father,
					   struct task_struct *child_reaper)
{
	struct task_struct *thread, *reaper;

	thread = find_alive_thread(father);
	if (thread)
		return thread;

	if (father->signal->has_child_subreaper) {
		unsigned int ns_level = task_pid(father)->level;
		/*
		 * Find the first ->is_child_subreaper ancestor in our pid_ns.
		 * We can't check reaper != child_reaper to ensure we do not
		 * cross the namespaces, the exiting parent could be injected
		 * by setns() + fork().
		 * We check pid->level, this is slightly more efficient than
		 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
		 */
		for (reaper = father->real_parent;
		     task_pid(reaper)->level == ns_level;
		     reaper = reaper->real_parent) {
			if (reaper == &init_task)
				break;
			if (!reaper->signal->is_child_subreaper)
				continue;
			thread = find_alive_thread(reaper);
			if (thread)
				return thread;
		}
	}

	return child_reaper;
}

/*
 * Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
			    struct list_head *dead)
{
	if (unlikely(p->exit_state == EXIT_DEAD))
		return;

	/* We don't want people slaying init. */
	p->exit_signal = SIGCHLD;

	/* If it has exited notify the new parent about this child's death. */
	if (!p->ptrace &&
	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
		if (do_notify_parent(p, p->exit_signal)) {
			p->exit_state = EXIT_DEAD;
			list_add(&p->ptrace_entry, dead);
		}
	}

	kill_orphaned_pgrp(p, father);
}

/*
 * This does two things:
 *
 * A.  Make init inherit all the child processes
 * B.  Check to see if any process groups have become orphaned
 *	as a result of our exiting, and if they have any stopped
 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void forget_original_parent(struct task_struct *father,
				   struct list_head *dead)
{
	struct task_struct *p, *t, *reaper;

	if (unlikely(!list_empty(&father->ptraced)))
		exit_ptrace(father, dead);

	/* Can drop and reacquire tasklist_lock */
	reaper = find_child_reaper(father, dead);
	if (list_empty(&father->children))
		return;

	reaper = find_new_reaper(father, reaper);
	list_for_each_entry(p, &father->children, sibling) {
		for_each_thread(p, t) {
			RCU_INIT_POINTER(t->real_parent, reaper);
			BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
			if (likely(!t->ptrace))
				t->parent = t->real_parent;
			if (t->pdeath_signal)
				group_send_sig_info(t->pdeath_signal,
						    SEND_SIG_NOINFO, t,
						    PIDTYPE_TGID);
		}
		/*
		 * If this is a threaded reparent there is no need to
		 * notify anyone anything has happened.
		 */
		if (!same_thread_group(reaper, father))
			reparent_leader(father, p, dead);
	}
	list_splice_tail_init(&father->children, &reaper->children);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;
	struct task_struct *p, *n;
	LIST_HEAD(dead);

	write_lock_irq(&tasklist_lock);
	forget_original_parent(tsk, &dead);

	if (group_dead)
		kill_orphaned_pgrp(tsk->group_leader, NULL);

	tsk->exit_state = EXIT_ZOMBIE;

	if (unlikely(tsk->ptrace)) {
		int sig = thread_group_leader(tsk) &&
				thread_group_empty(tsk) &&
				!ptrace_reparented(tsk) ?
			tsk->exit_signal : SIGCHLD;
		autoreap = do_notify_parent(tsk, sig);
	} else if (thread_group_leader(tsk)) {
		autoreap = thread_group_empty(tsk) &&
			   do_notify_parent(tsk, tsk->exit_signal);
	} else {
		autoreap = true;
		/* untraced sub-thread */
		do_notify_pidfd(tsk);
	}

	if (autoreap) {
		tsk->exit_state = EXIT_DEAD;
		list_add(&tsk->ptrace_entry, &dead);
	}

	/* mt-exec, de_thread() is waiting for group leader */
	if (unlikely(tsk->signal->notify_count < 0))
		wake_up_process(tsk->signal->group_exec_task);
	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}
}

#ifdef CONFIG_DEBUG_STACK_USAGE
unsigned long stack_not_used(struct task_struct *p)
{
	unsigned long *n = end_of_stack(p);

	do {	/* Skip over canary */
# ifdef CONFIG_STACK_GROWSUP
		n--;
# else
		n++;
# endif
	} while (!*n);

# ifdef CONFIG_STACK_GROWSUP
	return (unsigned long)end_of_stack(p) - (unsigned long)n;
# else
	return (unsigned long)n - (unsigned long)end_of_stack(p);
# endif
}

/* Count the maximum pages reached in kernel stacks */
static inline void kstack_histogram(unsigned long used_stack)
{
#ifdef CONFIG_VM_EVENT_COUNTERS
	if (used_stack <= 1024)
		count_vm_event(KSTACK_1K);
#if THREAD_SIZE > 1024
	else if (used_stack <= 2048)
		count_vm_event(KSTACK_2K);
#endif
#if THREAD_SIZE > 2048
	else if (used_stack <= 4096)
		count_vm_event(KSTACK_4K);
#endif
#if THREAD_SIZE > 4096
	else if (used_stack <= 8192)
		count_vm_event(KSTACK_8K);
#endif
#if THREAD_SIZE > 8192
	else if (used_stack <= 16384)
		count_vm_event(KSTACK_16K);
#endif
#if THREAD_SIZE > 16384
	else if (used_stack <= 32768)
		count_vm_event(KSTACK_32K);
#endif
#if THREAD_SIZE > 32768
	else if (used_stack <= 65536)
		count_vm_event(KSTACK_64K);
#endif
#if THREAD_SIZE > 65536
	else
		count_vm_event(KSTACK_REST);
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS */
}

static void check_stack_usage(void)
{
	static DEFINE_SPINLOCK(low_water_lock);
	static int lowest_to_date = THREAD_SIZE;
	unsigned long free;

	free = stack_not_used(current);
	kstack_histogram(THREAD_SIZE - free);

	if (free >= lowest_to_date)
		return;

	spin_lock(&low_water_lock);
	if (free < lowest_to_date) {
		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
			current->comm, task_pid_nr(current), free);
		lowest_to_date = free;
	}
	spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif

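/*
 * Record this thread's exit in the signal_struct and, if a core dump is in
 * progress, park in coredump_task_exit() until the dump completes.  Called
 * early in do_exit(), before exit_signals() sets PF_EXITING.
 */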
static void synchronize_group_exit(struct task_struct *tsk, long code)
{
	struct sighand_struct *sighand = tsk->sighand;
	struct signal_struct *signal = tsk->signal;
	struct core_state *core_state;

	spin_lock_irq(&sighand->siglock);
	signal->quick_threads--;
	if ((signal->quick_threads == 0) &&
	    !(signal->flags & SIGNAL_GROUP_EXIT)) {
		signal->flags = SIGNAL_GROUP_EXIT;
		signal->group_exit_code = code;
		signal->group_stop_count = 0;
	}
	/*
	 * Serialize with any possible pending coredump.
	 * We must hold siglock around checking core_state
	 * and setting PF_POSTCOREDUMP.  The core-inducing thread
	 * will increment ->nr_threads for each thread in the
	 * group without PF_POSTCOREDUMP set.
	 */
	tsk->flags |= PF_POSTCOREDUMP;
	core_state = signal->core_state;
	spin_unlock_irq(&sighand->siglock);

	if (unlikely(core_state))
		coredump_task_exit(tsk, core_state);
}

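/*
 * Terminate the current thread.  @code follows the wait(2) status
 * convention: the low byte carries a terminating signal number, the next
 * byte the exit status (see sys_exit() and do_group_exit() below).  Releases
 * all per-task resources, notifies the parent via exit_notify() and never
 * returns.
 */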
void __noreturn do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;

	WARN_ON(irqs_disabled());
	WARN_ON(tsk->plug);

	kcov_task_exit(tsk);
	kmsan_task_exit(tsk);

	synchronize_group_exit(tsk, code);
	ptrace_event(PTRACE_EVENT_EXIT, code);
	user_events_exit(tsk);

	io_uring_files_cancel();
	exit_signals(tsk);  /* sets PF_EXITING */

	seccomp_filter_release(tsk);

	acct_update_integrals(tsk);
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead) {
		/*
		 * If the last thread of global init has exited, panic
		 * immediately to get a useable coredump.
		 */
		if (unlikely(is_global_init(tsk)))
			panic("Attempted to kill init! exitcode=0x%08x\n",
				tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk);
#endif
		if (tsk->mm)
			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
	}
	acct_collect(code, group_dead);
	if (group_dead)
		tty_audit_exit();
	audit_free(tsk);

	tsk->exit_code = code;
	taskstats_exit(tsk, group_dead);
	trace_sched_process_exit(tsk, group_dead);

	exit_mm();

	if (group_dead)
		acct_process();

	exit_sem(tsk);
	exit_shm(tsk);
	exit_files(tsk);
	exit_fs(tsk);
	if (group_dead)
		disassociate_ctty(1);
	exit_task_namespaces(tsk);
	exit_task_work(tsk);
	exit_thread(tsk);

	/*
	 * Flush inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 *
	 * because of cgroup mode, must be called before cgroup_exit()
	 */
	perf_event_exit_task(tsk);

	sched_autogroup_exit_task(tsk);
	cgroup_exit(tsk);

	/*
	 * FIXME: do that only when needed, using sched_exit tracepoint
	 */
	flush_ptrace_hw_breakpoint(tsk);

	exit_tasks_rcu_start();
	exit_notify(tsk, group_dead);
	proc_exit_connector(tsk);
	mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
#endif
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held();

	if (tsk->io_context)
		exit_io_context(tsk);

	if (tsk->splice_pipe)
		free_pipe_info(tsk->splice_pipe);

	if (tsk->task_frag.page)
		put_page(tsk->task_frag.page);

	exit_task_stack_account(tsk);

	check_stack_usage();
	preempt_disable();
	if (tsk->nr_dirtied)
		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
	exit_rcu();
	exit_tasks_rcu_finish();

	lockdep_free_task(tsk);
	do_task_dead();
}

void __noreturn make_task_dead(int signr)
{
	/*
	 * Take the task off the cpu after something catastrophic has
	 * happened.
	 *
	 * We can get here from a kernel oops, sometimes with preemption off.
	 * Start by checking for critical errors.
	 * Then fix up important state like USER_DS and preemption.
	 * Then do everything else.
	 */
	struct task_struct *tsk = current;
	unsigned int limit;

	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");

	if (unlikely(irqs_disabled())) {
		pr_info("note: %s[%d] exited with irqs disabled\n",
			current->comm, task_pid_nr(current));
		local_irq_enable();
	}
	if (unlikely(in_atomic())) {
		pr_info("note: %s[%d] exited with preempt_count %d\n",
			current->comm, task_pid_nr(current),
			preempt_count());
		preempt_count_set(PREEMPT_ENABLED);
	}

	/*
	 * Every time the system oopses, if the oops happens while a reference
	 * to an object was held, the reference leaks.
	 * If the oops doesn't also leak memory, repeated oopsing can cause
	 * reference counters to wrap around (if they're not using refcount_t).
	 * This means that repeated oopsing can make unexploitable-looking bugs
	 * exploitable through repeated oopsing.
	 * To make sure this can't happen, place an upper bound on how often the
	 * kernel may oops without panic().
	 */
	limit = READ_ONCE(oops_limit);
	if (atomic_inc_return(&oops_count) >= limit && limit)
		panic("Oopsed too often (kernel.oops_limit is %d)", limit);

	/*
	 * We're taking recursive faults here in make_task_dead. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	if (unlikely(tsk->flags & PF_EXITING)) {
		pr_alert("Fixing recursive fault but reboot is needed!\n");
		futex_exit_recursive(tsk);
		tsk->exit_state = EXIT_DEAD;
		refcount_inc(&tsk->rcu_users);
		do_task_dead();
	}

	do_exit(signr);
}

SYSCALL_DEFINE1(exit, int, error_code)
{
	do_exit((error_code&0xff)<<8);
}

/*
 * Take down every thread in the group.  This is called by fatal signals
 * as well as by sys_exit_group (below).
 */
void __noreturn
do_group_exit(int exit_code)
{
	struct signal_struct *sig = current->signal;

	if (sig->flags & SIGNAL_GROUP_EXIT)
		exit_code = sig->group_exit_code;
	else if (sig->group_exec_task)
		exit_code = 0;
	else {
		struct sighand_struct *const sighand = current->sighand;

		spin_lock_irq(&sighand->siglock);
		if (sig->flags & SIGNAL_GROUP_EXIT)
			/* Another thread got here before we took the lock.  */
			exit_code = sig->group_exit_code;
		else if (sig->group_exec_task)
			exit_code = 0;
		else {
			sig->group_exit_code = exit_code;
			sig->flags = SIGNAL_GROUP_EXIT;
			zap_other_threads(current);
		}
		spin_unlock_irq(&sighand->siglock);
	}

	do_exit(exit_code);
	/* NOTREACHED */
}

/*
 * this kills every thread in the thread group. Note that any externally
 * wait4()-ing process will get the correct exit code - even if this
 * thread is not the thread group leader.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
	do_group_exit((error_code & 0xff) << 8);
	/* NOTREACHED */
	return 0;
}

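/*
 * The remainder of this file implements the wait*() family.  A struct
 * wait_opts describes what the caller is waiting for; the helpers below walk
 * the children/ptraced lists and report exited, stopped or continued tasks.
 * The classic wo_stat encoding is used: exit status in bits 8-15,
 * terminating signal in bits 0-6 with 0x80 flagging a core dump, 0x7f in the
 * low byte for a stop and 0xffff for a continue.
 */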
1120static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1121{
1122 return wo->wo_type == PIDTYPE_MAX ||
1123 task_pid_type(task: p, type: wo->wo_type) == wo->wo_pid;
1124}
1125
1126static int
1127eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
1128{
1129 if (!eligible_pid(wo, p))
1130 return 0;
1131
1132 /*
1133 * Wait for all children (clone and not) if __WALL is set or
1134 * if it is traced by us.
1135 */
1136 if (ptrace || (wo->wo_flags & __WALL))
1137 return 1;
1138
1139 /*
1140 * Otherwise, wait for clone children *only* if __WCLONE is set;
1141 * otherwise, wait for non-clone children *only*.
1142 *
1143 * Note: a "clone" child here is one that reports to its parent
1144 * using a signal other than SIGCHLD, or a non-leader thread which
1145 * we can only see if it is traced by us.
1146 */
1147 if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1148 return 0;
1149
1150 return 1;
1151}
1152
1153/*
1154 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1155 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1156 * the lock and this task is uninteresting. If we return nonzero, we have
1157 * released the lock and the system call should return.
1158 */
1159static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1160{
1161 int state, status;
1162 pid_t pid = task_pid_vnr(tsk: p);
1163 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1164 struct waitid_info *infop;
1165
1166 if (!likely(wo->wo_flags & WEXITED))
1167 return 0;
1168
1169 if (unlikely(wo->wo_flags & WNOWAIT)) {
1170 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1171 ? p->signal->group_exit_code : p->exit_code;
1172 get_task_struct(t: p);
1173 read_unlock(&tasklist_lock);
1174 sched_annotate_sleep();
1175 if (wo->wo_rusage)
1176 getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1177 put_task_struct(t: p);
1178 goto out_info;
1179 }
1180 /*
1181 * Move the task's state to DEAD/TRACE, only one thread can do this.
1182 */
1183 state = (ptrace_reparented(child: p) && thread_group_leader(p)) ?
1184 EXIT_TRACE : EXIT_DEAD;
1185 if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1186 return 0;
1187 /*
1188 * We own this thread, nobody else can reap it.
1189 */
1190 read_unlock(&tasklist_lock);
1191 sched_annotate_sleep();
1192
1193 /*
1194 * Check thread_group_leader() to exclude the traced sub-threads.
1195 */
1196 if (state == EXIT_DEAD && thread_group_leader(p)) {
1197 struct signal_struct *sig = p->signal;
1198 struct signal_struct *psig = current->signal;
1199 unsigned long maxrss;
1200 u64 tgutime, tgstime;
1201
1202 /*
1203 * The resource counters for the group leader are in its
1204 * own task_struct. Those for dead threads in the group
1205 * are in its signal_struct, as are those for the child
1206 * processes it has previously reaped. All these
1207 * accumulate in the parent's signal_struct c* fields.
1208 *
1209 * We don't bother to take a lock here to protect these
1210 * p->signal fields because the whole thread group is dead
1211 * and nobody can change them.
1212 *
1213 * psig->stats_lock also protects us from our sub-threads
1214 * which can reap other children at the same time.
1215 *
1216 * We use thread_group_cputime_adjusted() to get times for
1217 * the thread group, which consolidates times for all threads
1218 * in the group including the group leader.
1219 */
1220 thread_group_cputime_adjusted(p, ut: &tgutime, st: &tgstime);
1221 write_seqlock_irq(sl: &psig->stats_lock);
1222 psig->cutime += tgutime + sig->cutime;
1223 psig->cstime += tgstime + sig->cstime;
1224 psig->cgtime += task_gtime(t: p) + sig->gtime + sig->cgtime;
1225 psig->cmin_flt +=
1226 p->min_flt + sig->min_flt + sig->cmin_flt;
1227 psig->cmaj_flt +=
1228 p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1229 psig->cnvcsw +=
1230 p->nvcsw + sig->nvcsw + sig->cnvcsw;
1231 psig->cnivcsw +=
1232 p->nivcsw + sig->nivcsw + sig->cnivcsw;
1233 psig->cinblock +=
1234 task_io_get_inblock(p) +
1235 sig->inblock + sig->cinblock;
1236 psig->coublock +=
1237 task_io_get_oublock(p) +
1238 sig->oublock + sig->coublock;
1239 maxrss = max(sig->maxrss, sig->cmaxrss);
1240 if (psig->cmaxrss < maxrss)
1241 psig->cmaxrss = maxrss;
1242 task_io_accounting_add(dst: &psig->ioac, src: &p->ioac);
1243 task_io_accounting_add(dst: &psig->ioac, src: &sig->ioac);
1244 write_sequnlock_irq(sl: &psig->stats_lock);
1245 }
1246
1247 if (wo->wo_rusage)
1248 getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1249 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1250 ? p->signal->group_exit_code : p->exit_code;
1251 wo->wo_stat = status;
1252
1253 if (state == EXIT_TRACE) {
1254 write_lock_irq(&tasklist_lock);
1255 /* We dropped tasklist, ptracer could die and untrace */
1256 ptrace_unlink(child: p);
1257
1258 /* If parent wants a zombie, don't release it now */
1259 state = EXIT_ZOMBIE;
1260 if (do_notify_parent(p, p->exit_signal))
1261 state = EXIT_DEAD;
1262 p->exit_state = state;
1263 write_unlock_irq(&tasklist_lock);
1264 }
1265 if (state == EXIT_DEAD)
1266 release_task(p);
1267
1268out_info:
1269 infop = wo->wo_info;
1270 if (infop) {
1271 if ((status & 0x7f) == 0) {
1272 infop->cause = CLD_EXITED;
1273 infop->status = status >> 8;
1274 } else {
1275 infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1276 infop->status = status & 0x7f;
1277 }
1278 infop->pid = pid;
1279 infop->uid = uid;
1280 }
1281
1282 return pid;
1283}
1284
1285static int *task_stopped_code(struct task_struct *p, bool ptrace)
1286{
1287 if (ptrace) {
1288 if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1289 return &p->exit_code;
1290 } else {
1291 if (p->signal->flags & SIGNAL_STOP_STOPPED)
1292 return &p->signal->group_exit_code;
1293 }
1294 return NULL;
1295}
1296
1297/**
1298 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1299 * @wo: wait options
1300 * @ptrace: is the wait for ptrace
1301 * @p: task to wait for
1302 *
1303 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1304 *
1305 * CONTEXT:
1306 * read_lock(&tasklist_lock), which is released if return value is
1307 * non-zero. Also, grabs and releases @p->sighand->siglock.
1308 *
1309 * RETURNS:
1310 * 0 if wait condition didn't exist and search for other wait conditions
1311 * should continue. Non-zero return, -errno on failure and @p's pid on
1312 * success, implies that tasklist_lock is released and wait condition
1313 * search should terminate.
1314 */
1315static int wait_task_stopped(struct wait_opts *wo,
1316 int ptrace, struct task_struct *p)
1317{
1318 struct waitid_info *infop;
1319 int exit_code, *p_code, why;
1320 uid_t uid = 0; /* unneeded, required by compiler */
1321 pid_t pid;
1322
1323 /*
1324 * Traditionally we see ptrace'd stopped tasks regardless of options.
1325 */
1326 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1327 return 0;
1328
1329 if (!task_stopped_code(p, ptrace))
1330 return 0;
1331
1332 exit_code = 0;
1333 spin_lock_irq(lock: &p->sighand->siglock);
1334
1335 p_code = task_stopped_code(p, ptrace);
1336 if (unlikely(!p_code))
1337 goto unlock_sig;
1338
1339 exit_code = *p_code;
1340 if (!exit_code)
1341 goto unlock_sig;
1342
1343 if (!unlikely(wo->wo_flags & WNOWAIT))
1344 *p_code = 0;
1345
1346 uid = from_kuid_munged(current_user_ns(), task_uid(p));
1347unlock_sig:
1348 spin_unlock_irq(lock: &p->sighand->siglock);
1349 if (!exit_code)
1350 return 0;
1351
1352 /*
1353 * Now we are pretty sure this task is interesting.
1354 * Make sure it doesn't get reaped out from under us while we
1355 * give up the lock and then examine it below. We don't want to
1356 * keep holding onto the tasklist_lock while we call getrusage and
1357 * possibly take page faults for user memory.
1358 */
1359 get_task_struct(t: p);
1360 pid = task_pid_vnr(tsk: p);
1361 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1362 read_unlock(&tasklist_lock);
1363 sched_annotate_sleep();
1364 if (wo->wo_rusage)
1365 getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1366 put_task_struct(t: p);
1367
1368 if (likely(!(wo->wo_flags & WNOWAIT)))
1369 wo->wo_stat = (exit_code << 8) | 0x7f;
1370
1371 infop = wo->wo_info;
1372 if (infop) {
1373 infop->cause = why;
1374 infop->status = exit_code;
1375 infop->pid = pid;
1376 infop->uid = uid;
1377 }
1378 return pid;
1379}
1380
1381/*
1382 * Handle do_wait work for one task in a live, non-stopped state.
1383 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1384 * the lock and this task is uninteresting. If we return nonzero, we have
1385 * released the lock and the system call should return.
1386 */
1387static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1388{
1389 struct waitid_info *infop;
1390 pid_t pid;
1391 uid_t uid;
1392
1393 if (!unlikely(wo->wo_flags & WCONTINUED))
1394 return 0;
1395
1396 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1397 return 0;
1398
1399 spin_lock_irq(lock: &p->sighand->siglock);
1400 /* Re-check with the lock held. */
1401 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1402 spin_unlock_irq(lock: &p->sighand->siglock);
1403 return 0;
1404 }
1405 if (!unlikely(wo->wo_flags & WNOWAIT))
1406 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1407 uid = from_kuid_munged(current_user_ns(), task_uid(p));
1408 spin_unlock_irq(lock: &p->sighand->siglock);
1409
1410 pid = task_pid_vnr(tsk: p);
1411 get_task_struct(t: p);
1412 read_unlock(&tasklist_lock);
1413 sched_annotate_sleep();
1414 if (wo->wo_rusage)
1415 getrusage(p, RUSAGE_BOTH, ru: wo->wo_rusage);
1416 put_task_struct(t: p);
1417
1418 infop = wo->wo_info;
1419 if (!infop) {
1420 wo->wo_stat = 0xffff;
1421 } else {
1422 infop->cause = CLD_CONTINUED;
1423 infop->pid = pid;
1424 infop->uid = uid;
1425 infop->status = SIGCONT;
1426 }
1427 return pid;
1428}
1429
1430/*
1431 * Consider @p for a wait by @parent.
1432 *
1433 * -ECHILD should be in ->notask_error before the first call.
1434 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1435 * Returns zero if the search for a child should continue;
1436 * then ->notask_error is 0 if @p is an eligible child,
1437 * or still -ECHILD.
1438 */
1439static int wait_consider_task(struct wait_opts *wo, int ptrace,
1440 struct task_struct *p)
1441{
1442 /*
1443 * We can race with wait_task_zombie() from another thread.
1444 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1445 * can't confuse the checks below.
1446 */
1447 int exit_state = READ_ONCE(p->exit_state);
1448 int ret;
1449
1450 if (unlikely(exit_state == EXIT_DEAD))
1451 return 0;
1452
1453 ret = eligible_child(wo, ptrace, p);
1454 if (!ret)
1455 return ret;
1456
1457 if (unlikely(exit_state == EXIT_TRACE)) {
1458 /*
1459 * ptrace == 0 means we are the natural parent. In this case
1460 * we should clear notask_error, debugger will notify us.
1461 */
1462 if (likely(!ptrace))
1463 wo->notask_error = 0;
1464 return 0;
1465 }
1466
1467 if (likely(!ptrace) && unlikely(p->ptrace)) {
1468 /*
1469 * If it is traced by its real parent's group, just pretend
1470 * the caller is ptrace_do_wait() and reap this child if it
1471 * is zombie.
1472 *
1473 * This also hides group stop state from real parent; otherwise
1474 * a single stop can be reported twice as group and ptrace stop.
1475 * If a ptracer wants to distinguish these two events for its
1476 * own children it should create a separate process which takes
1477 * the role of real parent.
1478 */
1479 if (!ptrace_reparented(child: p))
1480 ptrace = 1;
1481 }
1482
1483 /* slay zombie? */
1484 if (exit_state == EXIT_ZOMBIE) {
1485 /* we don't reap group leaders with subthreads */
1486 if (!delay_group_leader(p)) {
1487 /*
1488 * A zombie ptracee is only visible to its ptracer.
1489 * Notification and reaping will be cascaded to the
1490 * real parent when the ptracer detaches.
1491 */
1492 if (unlikely(ptrace) || likely(!p->ptrace))
1493 return wait_task_zombie(wo, p);
1494 }
1495
1496 /*
1497 * Allow access to stopped/continued state via zombie by
1498 * falling through. Clearing of notask_error is complex.
1499 *
1500 * When !@ptrace:
1501 *
1502 * If WEXITED is set, notask_error should naturally be
1503 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1504 * so, if there are live subthreads, there are events to
1505 * wait for. If all subthreads are dead, it's still safe
1506 * to clear - this function will be called again in finite
1507 * amount time once all the subthreads are released and
1508 * will then return without clearing.
1509 *
1510 * When @ptrace:
1511 *
1512 * Stopped state is per-task and thus can't change once the
1513 * target task dies. Only continued and exited can happen.
1514 * Clear notask_error if WCONTINUED | WEXITED.
1515 */
1516 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1517 wo->notask_error = 0;
1518 } else {
1519 /*
1520 * @p is alive and it's gonna stop, continue or exit, so
1521 * there always is something to wait for.
1522 */
1523 wo->notask_error = 0;
1524 }
1525
1526 /*
1527 * Wait for stopped. Depending on @ptrace, different stopped state
1528 * is used and the two don't interact with each other.
1529 */
1530 ret = wait_task_stopped(wo, ptrace, p);
1531 if (ret)
1532 return ret;
1533
1534 /*
1535 * Wait for continued. There's only one continued state and the
1536 * ptracer can consume it which can confuse the real parent. Don't
1537 * use WCONTINUED from ptracer. You don't need or want it.
1538 */
1539 return wait_task_continued(wo, p);
1540}
1541
1542/*
1543 * Do the work of do_wait() for one thread in the group, @tsk.
1544 *
1545 * -ECHILD should be in ->notask_error before the first call.
1546 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1547 * Returns zero if the search for a child should continue; then
1548 * ->notask_error is 0 if there were any eligible children,
1549 * or still -ECHILD.
1550 */
1551static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1552{
1553 struct task_struct *p;
1554
1555 list_for_each_entry(p, &tsk->children, sibling) {
1556 int ret = wait_consider_task(wo, ptrace: 0, p);
1557
1558 if (ret)
1559 return ret;
1560 }
1561
1562 return 0;
1563}
1564
1565static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1566{
1567 struct task_struct *p;
1568
1569 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1570 int ret = wait_consider_task(wo, ptrace: 1, p);
1571
1572 if (ret)
1573 return ret;
1574 }
1575
1576 return 0;
1577}
1578
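/*
 * Called from the wakeup path (see child_wait_callback() below) to filter
 * wakeups: only waiters whose pid/type criteria match the child, and - with
 * __WNOTHREAD - only the actual parent thread, are woken.
 */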
1579bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
1580{
1581 if (!eligible_pid(wo, p))
1582 return false;
1583
1584 if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
1585 return false;
1586
1587 return true;
1588}
1589
1590static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1591 int sync, void *key)
1592{
1593 struct wait_opts *wo = container_of(wait, struct wait_opts,
1594 child_wait);
1595 struct task_struct *p = key;
1596
1597 if (pid_child_should_wake(wo, p))
1598 return default_wake_function(wq_entry: wait, mode, flags: sync, key);
1599
1600 return 0;
1601}
1602
1603void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1604{
1605 __wake_up_sync_key(wq_head: &parent->signal->wait_chldexit,
1606 TASK_INTERRUPTIBLE, key: p);
1607}
1608
1609static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
1610 struct task_struct *target)
1611{
1612 struct task_struct *parent =
1613 !ptrace ? target->real_parent : target->parent;
1614
1615 return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
1616 same_thread_group(current, p2: parent));
1617}
1618
1619/*
1620 * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
1621 * and tracee lists to find the target task.
1622 */
1623static int do_wait_pid(struct wait_opts *wo)
1624{
1625 bool ptrace;
1626 struct task_struct *target;
1627 int retval;
1628
1629 ptrace = false;
1630 target = pid_task(pid: wo->wo_pid, PIDTYPE_TGID);
1631 if (target && is_effectively_child(wo, ptrace, target)) {
1632 retval = wait_consider_task(wo, ptrace, p: target);
1633 if (retval)
1634 return retval;
1635 }
1636
1637 ptrace = true;
1638 target = pid_task(pid: wo->wo_pid, PIDTYPE_PID);
1639 if (target && target->ptrace &&
1640 is_effectively_child(wo, ptrace, target)) {
1641 retval = wait_consider_task(wo, ptrace, p: target);
1642 if (retval)
1643 return retval;
1644 }
1645
1646 return 0;
1647}
1648
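/*
 * Core of the wait loop: scan once for a matching child.  Returns the
 * reaped/reported pid on success, -ECHILD if there is nothing to wait for,
 * 0 if WNOHANG was set and no child is ready yet, or -ERESTARTSYS to tell
 * the caller to sleep and retry.
 */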
1649long __do_wait(struct wait_opts *wo)
1650{
1651 long retval;
1652
1653 /*
1654 * If there is nothing that can match our criteria, just get out.
1655 * We will clear ->notask_error to zero if we see any child that
1656 * might later match our criteria, even if we are not able to reap
1657 * it yet.
1658 */
1659 wo->notask_error = -ECHILD;
1660 if ((wo->wo_type < PIDTYPE_MAX) &&
1661 (!wo->wo_pid || !pid_has_task(pid: wo->wo_pid, type: wo->wo_type)))
1662 goto notask;
1663
1664 read_lock(&tasklist_lock);
1665
1666 if (wo->wo_type == PIDTYPE_PID) {
1667 retval = do_wait_pid(wo);
1668 if (retval)
1669 return retval;
1670 } else {
1671 struct task_struct *tsk = current;
1672
1673 do {
1674 retval = do_wait_thread(wo, tsk);
1675 if (retval)
1676 return retval;
1677
1678 retval = ptrace_do_wait(wo, tsk);
1679 if (retval)
1680 return retval;
1681
1682 if (wo->wo_flags & __WNOTHREAD)
1683 break;
1684 } while_each_thread(current, tsk);
1685 }
1686 read_unlock(&tasklist_lock);
1687
1688notask:
1689 retval = wo->notask_error;
1690 if (!retval && !(wo->wo_flags & WNOHANG))
1691 return -ERESTARTSYS;
1692
1693 return retval;
1694}
1695
1696static long do_wait(struct wait_opts *wo)
1697{
1698 int retval;
1699
1700 trace_sched_process_wait(pid: wo->wo_pid);
1701
1702 init_waitqueue_func_entry(wq_entry: &wo->child_wait, func: child_wait_callback);
1703 wo->child_wait.private = current;
1704 add_wait_queue(wq_head: &current->signal->wait_chldexit, wq_entry: &wo->child_wait);
1705
1706 do {
1707 set_current_state(TASK_INTERRUPTIBLE);
1708 retval = __do_wait(wo);
1709 if (retval != -ERESTARTSYS)
1710 break;
1711 if (signal_pending(current))
1712 break;
1713 schedule();
1714 } while (1);
1715
1716 __set_current_state(TASK_RUNNING);
1717 remove_wait_queue(wq_head: &current->signal->wait_chldexit, wq_entry: &wo->child_wait);
1718 return retval;
1719}
1720
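/*
 * Validate waitid()-style arguments and fill in @wo: resolves @which/@upid
 * into a referenced struct pid (the caller must put it) and, for P_PIDFD,
 * maps an O_NONBLOCK pidfd to WNOHANG.
 */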
1721int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
1722 struct waitid_info *infop, int options,
1723 struct rusage *ru)
1724{
1725 unsigned int f_flags = 0;
1726 struct pid *pid = NULL;
1727 enum pid_type type;
1728
1729 if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1730 __WNOTHREAD|__WCLONE|__WALL))
1731 return -EINVAL;
1732 if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1733 return -EINVAL;
1734
1735 switch (which) {
1736 case P_ALL:
1737 type = PIDTYPE_MAX;
1738 break;
1739 case P_PID:
1740 type = PIDTYPE_PID;
1741 if (upid <= 0)
1742 return -EINVAL;
1743
1744 pid = find_get_pid(nr: upid);
1745 break;
1746 case P_PGID:
1747 type = PIDTYPE_PGID;
1748 if (upid < 0)
1749 return -EINVAL;
1750
1751 if (upid)
1752 pid = find_get_pid(nr: upid);
1753 else
1754 pid = get_task_pid(current, type: PIDTYPE_PGID);
1755 break;
1756 case P_PIDFD:
1757 type = PIDTYPE_PID;
1758 if (upid < 0)
1759 return -EINVAL;
1760
1761 pid = pidfd_get_pid(fd: upid, flags: &f_flags);
1762 if (IS_ERR(ptr: pid))
1763 return PTR_ERR(ptr: pid);
1764
1765 break;
1766 default:
1767 return -EINVAL;
1768 }
1769
1770 wo->wo_type = type;
1771 wo->wo_pid = pid;
1772 wo->wo_flags = options;
1773 wo->wo_info = infop;
1774 wo->wo_rusage = ru;
1775 if (f_flags & O_NONBLOCK)
1776 wo->wo_flags |= WNOHANG;
1777
1778 return 0;
1779}
1780
1781static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1782 int options, struct rusage *ru)
1783{
1784 struct wait_opts wo;
1785 long ret;
1786
1787 ret = kernel_waitid_prepare(wo: &wo, which, upid, infop, options, ru);
1788 if (ret)
1789 return ret;
1790
1791 ret = do_wait(wo: &wo);
1792 if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
1793 ret = -EAGAIN;
1794
1795 put_pid(pid: wo.wo_pid);
1796 return ret;
1797}
1798
1799SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1800 infop, int, options, struct rusage __user *, ru)
1801{
1802 struct rusage r;
1803 struct waitid_info info = {.status = 0};
1804 long err = kernel_waitid(which, upid, infop: &info, options, ru: ru ? &r : NULL);
1805 int signo = 0;
1806
1807 if (err > 0) {
1808 signo = SIGCHLD;
1809 err = 0;
1810 if (ru && copy_to_user(to: ru, from: &r, n: sizeof(struct rusage)))
1811 return -EFAULT;
1812 }
1813 if (!infop)
1814 return err;
1815
1816 if (!user_write_access_begin(infop, sizeof(*infop)))
1817 return -EFAULT;
1818
1819 unsafe_put_user(signo, &infop->si_signo, Efault);
1820 unsafe_put_user(0, &infop->si_errno, Efault);
1821 unsafe_put_user(info.cause, &infop->si_code, Efault);
1822 unsafe_put_user(info.pid, &infop->si_pid, Efault);
1823 unsafe_put_user(info.uid, &infop->si_uid, Efault);
1824 unsafe_put_user(info.status, &infop->si_status, Efault);
1825 user_write_access_end();
1826 return err;
1827Efault:
1828 user_write_access_end();
1829 return -EFAULT;
1830}
1831
1832long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1833 struct rusage *ru)
1834{
1835 struct wait_opts wo;
1836 struct pid *pid = NULL;
1837 enum pid_type type;
1838 long ret;
1839
1840 if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1841 __WNOTHREAD|__WCLONE|__WALL))
1842 return -EINVAL;
1843
1844 /* -INT_MIN is not defined */
1845 if (upid == INT_MIN)
1846 return -ESRCH;
1847
1848 if (upid == -1)
1849 type = PIDTYPE_MAX;
1850 else if (upid < 0) {
1851 type = PIDTYPE_PGID;
1852 pid = find_get_pid(nr: -upid);
1853 } else if (upid == 0) {
1854 type = PIDTYPE_PGID;
1855 pid = get_task_pid(current, type: PIDTYPE_PGID);
1856 } else /* upid > 0 */ {
1857 type = PIDTYPE_PID;
1858 pid = find_get_pid(nr: upid);
1859 }
1860
1861 wo.wo_type = type;
1862 wo.wo_pid = pid;
1863 wo.wo_flags = options | WEXITED;
1864 wo.wo_info = NULL;
1865 wo.wo_stat = 0;
1866 wo.wo_rusage = ru;
1867 ret = do_wait(wo: &wo);
1868 put_pid(pid);
1869 if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1870 ret = -EFAULT;
1871
1872 return ret;
1873}
1874
1875int kernel_wait(pid_t pid, int *stat)
1876{
1877 struct wait_opts wo = {
1878 .wo_type = PIDTYPE_PID,
1879 .wo_pid = find_get_pid(nr: pid),
1880 .wo_flags = WEXITED,
1881 };
1882 int ret;
1883
1884 ret = do_wait(wo: &wo);
1885 if (ret > 0 && wo.wo_stat)
1886 *stat = wo.wo_stat;
1887 put_pid(pid: wo.wo_pid);
1888 return ret;
1889}
1890
1891SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1892 int, options, struct rusage __user *, ru)
1893{
1894 struct rusage r;
1895 long err = kernel_wait4(upid, stat_addr, options, ru: ru ? &r : NULL);
1896
1897 if (err > 0) {
1898 if (ru && copy_to_user(to: ru, from: &r, n: sizeof(struct rusage)))
1899 return -EFAULT;
1900 }
1901 return err;
1902}
1903
1904#ifdef __ARCH_WANT_SYS_WAITPID
1905
1906/*
1907 * sys_waitpid() remains for compatibility. waitpid() should be
1908 * implemented by calling sys_wait4() from libc.a.
1909 */
1910SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1911{
1912 return kernel_wait4(upid: pid, stat_addr, options, NULL);
1913}
1914
1915#endif
1916
1917#ifdef CONFIG_COMPAT
1918COMPAT_SYSCALL_DEFINE4(wait4,
1919 compat_pid_t, pid,
1920 compat_uint_t __user *, stat_addr,
1921 int, options,
1922 struct compat_rusage __user *, ru)
1923{
1924 struct rusage r;
1925 long err = kernel_wait4(upid: pid, stat_addr, options, ru: ru ? &r : NULL);
1926 if (err > 0) {
1927 if (ru && put_compat_rusage(&r, ru))
1928 return -EFAULT;
1929 }
1930 return err;
1931}
1932
1933COMPAT_SYSCALL_DEFINE5(waitid,
1934 int, which, compat_pid_t, pid,
1935 struct compat_siginfo __user *, infop, int, options,
1936 struct compat_rusage __user *, uru)
1937{
1938 struct rusage ru;
1939 struct waitid_info info = {.status = 0};
1940 long err = kernel_waitid(which, upid: pid, infop: &info, options, ru: uru ? &ru : NULL);
1941 int signo = 0;
1942 if (err > 0) {
1943 signo = SIGCHLD;
1944 err = 0;
1945 if (uru) {
1946 /* kernel_waitid() overwrites everything in ru */
1947 if (COMPAT_USE_64BIT_TIME)
1948 err = copy_to_user(to: uru, from: &ru, n: sizeof(ru));
1949 else
1950 err = put_compat_rusage(&ru, uru);
1951 if (err)
1952 return -EFAULT;
1953 }
1954 }
1955
1956 if (!infop)
1957 return err;
1958
1959 if (!user_write_access_begin(infop, sizeof(*infop)))
1960 return -EFAULT;
1961
1962 unsafe_put_user(signo, &infop->si_signo, Efault);
1963 unsafe_put_user(0, &infop->si_errno, Efault);
1964 unsafe_put_user(info.cause, &infop->si_code, Efault);
1965 unsafe_put_user(info.pid, &infop->si_pid, Efault);
1966 unsafe_put_user(info.uid, &infop->si_uid, Efault);
1967 unsafe_put_user(info.status, &infop->si_status, Efault);
1968 user_write_access_end();
1969 return err;
1970Efault:
1971 user_write_access_end();
1972 return -EFAULT;
1973}
1974#endif
1975
/*
 * This needs to be __function_aligned as GCC implicitly makes any
 * implementation of abort() cold and drops alignment specified by
 * -falign-functions=N.
 *
 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
 */
__weak __function_aligned void abort(void)
{
	BUG();

	/* if that doesn't kill us, halt */
	panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);
