fork.c source code [linux/kernel/fork.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/kernel/fork.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	/*
9	* 'fork.c' contains the help-routines for the 'fork' system call
10	* (see also entry.S and others).
11	* Fork is rather simple, once you get the hang of it, but the memory
12	* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
13	*/
14
15	#include <linux/anon_inodes.h>
16	#include <linux/slab.h>
17	#include <linux/sched/autogroup.h>
18	#include <linux/sched/mm.h>
19	#include <linux/sched/user.h>
20	#include <linux/sched/numa_balancing.h>
21	#include <linux/sched/stat.h>
22	#include <linux/sched/task.h>
23	#include <linux/sched/task_stack.h>
24	#include <linux/sched/cputime.h>
25	#include <linux/sched/ext.h>
26	#include <linux/seq_file.h>
27	#include <linux/rtmutex.h>
28	#include <linux/init.h>
29	#include <linux/unistd.h>
30	#include <linux/module.h>
31	#include <linux/vmalloc.h>
32	#include <linux/completion.h>
33	#include <linux/personality.h>
34	#include <linux/mempolicy.h>
35	#include <linux/sem.h>
36	#include <linux/file.h>
37	#include <linux/fdtable.h>
38	#include <linux/iocontext.h>
39	#include <linux/key.h>
40	#include <linux/kmsan.h>
41	#include <linux/binfmts.h>
42	#include <linux/mman.h>
43	#include <linux/mmu_notifier.h>
44	#include <linux/fs.h>
45	#include <linux/mm.h>
46	#include <linux/mm_inline.h>
47	#include <linux/memblock.h>
48	#include <linux/nsproxy.h>
49	#include <linux/capability.h>
50	#include <linux/cpu.h>
51	#include <linux/cgroup.h>
52	#include <linux/security.h>
53	#include <linux/hugetlb.h>
54	#include <linux/seccomp.h>
55	#include <linux/swap.h>
56	#include <linux/syscalls.h>
57	#include <linux/syscall_user_dispatch.h>
58	#include <linux/jiffies.h>
59	#include <linux/futex.h>
60	#include <linux/compat.h>
61	#include <linux/kthread.h>
62	#include <linux/task_io_accounting_ops.h>
63	#include <linux/rcupdate.h>
64	#include <linux/ptrace.h>
65	#include <linux/mount.h>
66	#include <linux/audit.h>
67	#include <linux/memcontrol.h>
68	#include <linux/ftrace.h>
69	#include <linux/proc_fs.h>
70	#include <linux/profile.h>
71	#include <linux/rmap.h>
72	#include <linux/ksm.h>
73	#include <linux/acct.h>
74	#include <linux/userfaultfd_k.h>
75	#include <linux/tsacct_kern.h>
76	#include <linux/cn_proc.h>
77	#include <linux/freezer.h>
78	#include <linux/delayacct.h>
79	#include <linux/taskstats_kern.h>
80	#include <linux/tty.h>
81	#include <linux/fs_struct.h>
82	#include <linux/magic.h>
83	#include <linux/perf_event.h>
84	#include <linux/posix-timers.h>
85	#include <linux/user-return-notifier.h>
86	#include <linux/oom.h>
87	#include <linux/khugepaged.h>
88	#include <linux/signalfd.h>
89	#include <linux/uprobes.h>
90	#include <linux/aio.h>
91	#include <linux/compiler.h>
92	#include <linux/sysctl.h>
93	#include <linux/kcov.h>
94	#include <linux/livepatch.h>
95	#include <linux/thread_info.h>
96	#include <linux/stackleak.h>
97	#include <linux/kasan.h>
98	#include <linux/scs.h>
99	#include <linux/io_uring.h>
100	#include <linux/bpf.h>
101	#include <linux/stackprotector.h>
102	#include <linux/user_events.h>
103	#include <linux/iommu.h>
104	#include <linux/rseq.h>
105	#include <uapi/linux/pidfd.h>
106	#include <linux/pidfs.h>
107	#include <linux/tick.h>
108
109	#include <asm/pgalloc.h>
110	#include <linux/uaccess.h>
111	#include <asm/mmu_context.h>
112	#include <asm/cacheflush.h>
113	#include <asm/tlbflush.h>
114
/* For dup_mmap(). */
116	#include "../mm/internal.h"
117
118	#include <trace/events/sched.h>
119
120	#define CREATE_TRACE_POINTS
121	#include <trace/events/task.h>
122
123	#include <kunit/visibility.h>
124
/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count.. */

static int max_threads;		/* tunable limit on nr_threads */
142
143	#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
144
145	static const char * const resident_page_types[] = {
146	NAMED_ARRAY_INDEX(MM_FILEPAGES),
147	NAMED_ARRAY_INDEX(MM_ANONPAGES),
148	NAMED_ARRAY_INDEX(MM_SWAPENTS),
149	NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
150	};
151
152	DEFINE_PER_CPU(unsigned long, process_counts) = `0`;
153
154	__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); / outer /
155
#ifdef CONFIG_PROVE_RCU
/* Report to lockdep users whether tasklist_lock is currently held. */
int lockdep_tasklist_lock_is_held(void)
{
	return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */
163
164	int nr_processes(void)
165	{
166	int cpu;
167	int total = `0`;
168
169	for_each_possible_cpu(cpu)
170	total += per_cpu(process_counts, cpu);
171
172	return total;
173	}
174
175	void __weak arch_release_task_struct(struct task_struct *tsk)
176	{
177	}
178
179	static struct kmem_cache *task_struct_cachep;
180
181	static inline struct task_struct alloc_task_struct_node(int* node)
182	{
183	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
184	}
185
186	static inline void free_task_struct(struct task_struct *tsk)
187	{
188	kmem_cache_free(s: task_struct_cachep, objp: tsk);
189	}
190
191	/*
192	* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
193	* kmemcache based allocator.
194	*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
196
197	# ifdef CONFIG_VMAP_STACK
198	/*
199	* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
200	* flush. Try to minimize the number of calls by caching stacks.
201	*/
202	#define NR_CACHED_STACKS 2
203	static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
204
205	struct vm_stack {
206	struct rcu_head rcu;
207	struct vm_struct *stack_vm_area;
208	};
209
210	static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
211	{
212	unsigned int i;
213
214	for (i = `0`; i < NR_CACHED_STACKS; i++) {
215	struct vm_struct *tmp = NULL;
216
217	if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm))
218	return true;
219	}
220	return false;
221	}
222
223	static void thread_stack_free_rcu(struct rcu_head *rh)
224	{
225	struct vm_stack vm_stack = container_of(rh, struct* vm_stack, rcu);
226
227	if (try_release_thread_stack_to_cache(vm: vm_stack->stack_vm_area))
228	return;
229
230	vfree(addr: vm_stack);
231	}
232
233	static void thread_stack_delayed_free(struct task_struct *tsk)
234	{
235	struct vm_stack *vm_stack = tsk->stack;
236
237	vm_stack->stack_vm_area = tsk->stack_vm_area;
238	call_rcu(head: &vm_stack->rcu, func: thread_stack_free_rcu);
239	}
240
241	static int free_vm_stack_cache(unsigned int cpu)
242	{
243	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
244	int i;
245
246	for (i = `0`; i < NR_CACHED_STACKS; i++) {
247	struct vm_struct *vm_stack = cached_vm_stacks[i];
248
249	if (!vm_stack)
250	continue;
251
252	vfree(addr: vm_stack->addr);
253	cached_vm_stacks[i] = NULL;
254	}
255
256	return `0`;
257	}
258
259	static int memcg_charge_kernel_stack(struct vm_struct *vm)
260	{
261	int i;
262	int ret;
263	int nr_charged = `0`;
264
265	BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
266
267	for (i = `0`; i < THREAD_SIZE / PAGE_SIZE; i++) {
268	ret = memcg_kmem_charge_page(page: vm->pages[i], GFP_KERNEL, order: `0`);
269	if (ret)
270	goto err;
271	nr_charged++;
272	}
273	return `0`;
274	err:
275	for (i = `0`; i < nr_charged; i++)
276	memcg_kmem_uncharge_page(page: vm->pages[i], order: `0`);
277	return ret;
278	}
279
280	static int alloc_thread_stack_node(struct task_struct tsk, int* node)
281	{
282	struct vm_struct *vm;
283	void *stack;
284	int i;
285
286	for (i = `0`; i < NR_CACHED_STACKS; i++) {
287	struct vm_struct *s;
288
289	s = this_cpu_xchg(cached_stacks[i], NULL);
290
291	if (!s)
292	continue;
293
294	/ Reset stack metadata. /
295	kasan_unpoison_range(addr: s->addr, THREAD_SIZE);
296
297	stack = kasan_reset_tag(addr: s->addr);
298
299	/ Clear stale pointers from reused stack. /
300	memset(stack, `0`, THREAD_SIZE);
301
302	if (memcg_charge_kernel_stack(vm: s)) {
303	vfree(addr: s->addr);
304	return -ENOMEM;
305	}
306
307	tsk->stack_vm_area = s;
308	tsk->stack = stack;
309	return `0`;
310	}
311
312	/*
313	* Allocated stacks are cached and later reused by new threads,
314	* so memcg accounting is performed manually on assigning/releasing
315	* stacks to tasks. Drop __GFP_ACCOUNT.
316	*/
317	stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
318	THREADINFO_GFP & ~__GFP_ACCOUNT,
319	node, __builtin_return_address(`0`));
320	if (!stack)
321	return -ENOMEM;
322
323	vm = find_vm_area(addr: stack);
324	if (memcg_charge_kernel_stack(vm)) {
325	vfree(addr: stack);
326	return -ENOMEM;
327	}
328	/*
329	* We can't call find_vm_area() in interrupt context, and
330	* free_thread_stack() can be called in interrupt context,
331	* so cache the vm_struct.
332	*/
333	tsk->stack_vm_area = vm;
334	stack = kasan_reset_tag(addr: stack);
335	tsk->stack = stack;
336	return `0`;
337	}
338
339	static void free_thread_stack(struct task_struct *tsk)
340	{
341	if (!try_release_thread_stack_to_cache(vm: tsk->stack_vm_area))
342	thread_stack_delayed_free(tsk);
343
344	tsk->stack = NULL;
345	tsk->stack_vm_area = NULL;
346	}
347
348	# else /* !CONFIG_VMAP_STACK */
349
350	static void thread_stack_free_rcu(struct rcu_head *rh)
351	{
352	__free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
353	}
354
355	static void thread_stack_delayed_free(struct task_struct *tsk)
356	{
357	struct rcu_head *rh = tsk->stack;
358
359	call_rcu(rh, thread_stack_free_rcu);
360	}
361
362	static int alloc_thread_stack_node(struct task_struct tsk, int* node)
363	{
364	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
365	THREAD_SIZE_ORDER);
366
367	if (likely(page)) {
368	tsk->stack = kasan_reset_tag(page_address(page));
369	return `0`;
370	}
371	return -ENOMEM;
372	}
373
374	static void free_thread_stack(struct task_struct *tsk)
375	{
376	thread_stack_delayed_free(tsk);
377	tsk->stack = NULL;
378	}
379
380	# endif /* CONFIG_VMAP_STACK */
# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
382
/* Slab cache for thread stacks; created by thread_stack_cache_init(). */
static struct kmem_cache *thread_stack_cache;

/*
 * RCU callback: @rh sits at the start of the stack object, so it is the
 * pointer handed back to the slab cache.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
	kmem_cache_free(thread_stack_cache, rh);
}
389
390	static void thread_stack_delayed_free(struct task_struct *tsk)
391	{
392	struct rcu_head *rh = tsk->stack;
393
394	call_rcu(rh, thread_stack_free_rcu);
395	}
396
397	static int alloc_thread_stack_node(struct task_struct tsk, int* node)
398	{
399	unsigned long *stack;
400	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
401	stack = kasan_reset_tag(stack);
402	tsk->stack = stack;
403	return stack ? `0` : -ENOMEM;
404	}
405
406	static void free_thread_stack(struct task_struct *tsk)
407	{
408	thread_stack_delayed_free(tsk);
409	tsk->stack = NULL;
410	}
411
412	void thread_stack_cache_init(void)
413	{
414	thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
415	THREAD_SIZE, THREAD_SIZE, `0`, `0`,
416	THREAD_SIZE, NULL);
417	BUG_ON(thread_stack_cache == NULL);
418	}
419
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
421
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
436
437	static void account_kernel_stack(struct task_struct tsk, int* account)
438	{
439	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
440	struct vm_struct *vm = task_stack_vm_area(t: tsk);
441	int i;
442
443	for (i = `0`; i < THREAD_SIZE / PAGE_SIZE; i++)
444	mod_lruvec_page_state(page: vm->pages[i], idx: NR_KERNEL_STACK_KB,
445	val: account * (PAGE_SIZE / `1024`));
446	} else {
447	void *stack = task_stack_page(task: tsk);
448
449	/ All stack pages are in the same node. /
450	mod_lruvec_kmem_state(p: stack, idx: NR_KERNEL_STACK_KB,
451	val: account * (THREAD_SIZE / `1024`));
452	}
453	}
454
455	void exit_task_stack_account(struct task_struct *tsk)
456	{
457	account_kernel_stack(tsk, account: -`1`);
458
459	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
460	struct vm_struct *vm;
461	int i;
462
463	vm = task_stack_vm_area(t: tsk);
464	for (i = `0`; i < THREAD_SIZE / PAGE_SIZE; i++)
465	memcg_kmem_uncharge_page(page: vm->pages[i], order: `0`);
466	}
467	}
468
469	static void release_task_stack(struct task_struct *tsk)
470	{
471	if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
472	return; / Better to leak the stack than to free prematurely /
473
474	free_thread_stack(tsk);
475	}
476
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Drop one reference on @tsk's stack; the stack is released when
 * stack_refcount reaches zero.
 */
void put_task_stack(struct task_struct *tsk)
{
	if (refcount_dec_and_test(&tsk->stack_refcount))
		release_task_stack(tsk);
}
#endif
484
485	void free_task(struct task_struct *tsk)
486	{
487	#ifdef CONFIG_SECCOMP
488	WARN_ON_ONCE(tsk->seccomp.filter);
489	#endif
490	release_user_cpus_ptr(p: tsk);
491	scs_release(tsk);
492
493	#ifndef CONFIG_THREAD_INFO_IN_TASK
494	/*
495	* The task is finally done with both the stack and thread_info,
496	* so free both.
497	*/
498	release_task_stack(tsk);
499	#else
500	/*
501	* If the task had a separate stack allocation, it should be gone
502	* by now.
503	*/
504	WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != `0`);
505	#endif
506	rt_mutex_debug_task_free(tsk);
507	ftrace_graph_exit_task(t: tsk);
508	arch_release_task_struct(tsk);
509	if (tsk->flags & PF_KTHREAD)
510	free_kthread_struct(k: tsk);
511	bpf_task_storage_free(task: tsk);
512	free_task_struct(tsk);
513	}
514	EXPORT_SYMBOL(free_task);
515
516	void dup_mm_exe_file(struct mm_struct mm, struct* mm_struct *oldmm)
517	{
518	struct file *exe_file;
519
520	exe_file = get_mm_exe_file(mm: oldmm);
521	RCU_INIT_POINTER(mm->exe_file, exe_file);
522	/*
523	* We depend on the oldmm having properly denied write access to the
524	* exe_file already.
525	*/
526	if (exe_file && exe_file_deny_write_access(exe_file))
527	pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
528	}
529
#ifdef CONFIG_MMU
/* Allocate the page global directory for @mm; 0 on success, -ENOMEM on failure. */
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

/* Free the page global directory of @mm. */
static inline void mm_free_pgd(struct mm_struct *mm)
{
	pgd_free(mm, mm->pgd);
}
#else
/* Without an MMU there is no pgd: allocation trivially succeeds. */
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
547
#ifdef CONFIG_MM_ID
static DEFINE_IDA(mm_ida);

/*
 * Allocate an ID in [MM_ID_MIN, MM_ID_MAX] from mm_ida and store it in
 * mm->mm_id. Returns 0 on success or the negative error from
 * ida_alloc_range().
 */
static inline int mm_alloc_id(struct mm_struct *mm)
{
	int ret;

	ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);
	if (ret < 0)
		return ret;
	mm->mm_id = ret;
	return 0;
}

/*
 * Release the ID held by @mm and reset mm->mm_id to MM_ID_DUMMY.
 * A dummy ID is ignored; an out-of-range ID warns once and is not freed.
 */
static inline void mm_free_id(struct mm_struct *mm)
{
	const mm_id_t id = mm->mm_id;

	mm->mm_id = MM_ID_DUMMY;
	if (id == MM_ID_DUMMY)
		return;
	if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
		return;
	ida_free(&mm_ida, id);
}
#else /* !CONFIG_MM_ID */
static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
static inline void mm_free_id(struct mm_struct *mm) {}
#endif /* CONFIG_MM_ID */
577
578	static void check_mm(struct mm_struct *mm)
579	{
580	int i;
581
582	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
583	"Please make sure 'struct resident_page_types[]' is updated as well");
584
585	for (i = `0`; i < NR_MM_COUNTERS; i++) {
586	long x = percpu_counter_sum(fbc: &mm->rss_stat[i]);
587
588	if (unlikely(x))
589	pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
590	mm, resident_page_types[i], x);
591	}
592
593	if (mm_pgtables_bytes(mm))
594	pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
595	mm_pgtables_bytes(mm));
596
597	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
598	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
599	#endif
600	}
601
/* Allocate / free an mm_struct from the mm_cachep slab cache. */
#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
604
605	static void do_check_lazy_tlb(void *arg)
606	{
607	struct mm_struct *mm = arg;
608
609	WARN_ON_ONCE(current->active_mm == mm);
610	}
611
612	static void do_shoot_lazy_tlb(void *arg)
613	{
614	struct mm_struct *mm = arg;
615
616	if (current->active_mm == mm) {
617	WARN_ON_ONCE(current->mm);
618	current->active_mm = &init_mm;
619	switch_mm(prev: mm, next: &init_mm, current);
620	}
621	}
622
623	static void cleanup_lazy_tlbs(struct mm_struct *mm)
624	{
625	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
626	/*
627	* In this case, lazy tlb mms are refounted and would not reach
628	* __mmdrop until all CPUs have switched away and mmdrop()ed.
629	*/
630	return;
631	}
632
633	/*
634	* Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
635	* requires lazy mm users to switch to another mm when the refcount
636	* drops to zero, before the mm is freed. This requires IPIs here to
637	* switch kernel threads to init_mm.
638	*
639	* archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
640	* switch with the final userspace teardown TLB flush which leaves the
641	* mm lazy on this CPU but no others, reducing the need for additional
642	* IPIs here. There are cases where a final IPI is still required here,
643	* such as the final mmdrop being performed on a different CPU than the
644	* one exiting, or kernel threads using the mm when userspace exits.
645	*
646	* IPI overheads have not found to be expensive, but they could be
647	* reduced in a number of possible ways, for example (roughly
648	* increasing order of complexity):
649	* - The last lazy reference created by exit_mm() could instead switch
650	* to init_mm, however it's probable this will run on the same CPU
651	* immediately afterwards, so this may not reduce IPIs much.
652	* - A batch of mms requiring IPIs could be gathered and freed at once.
653	* - CPUs store active_mm where it can be remotely checked without a
654	* lock, to filter out false-positives in the cpumask.
655	* - After mm_users or mm_count reaches zero, switching away from the
656	* mm could clear mm_cpumask to reduce some IPIs, perhaps together
657	* with some batching or delaying of the final IPIs.
658	* - A delayed freeing and RCU-like quiescing sequence based on mm
659	* switching to avoid IPIs completely.
660	*/
661	on_each_cpu_mask(mask: mm_cpumask(mm), func: do_shoot_lazy_tlb, info: (void *)mm, wait: `1`);
662	if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
663	on_each_cpu(func: do_check_lazy_tlb, info: (void *)mm, wait: `1`);
664	}
665
666	/*
667	* Called when the last reference to the mm
668	* is dropped: either by a lazy thread or by
669	* mmput. Free the page directory and the mm.
670	*/
671	void __mmdrop(struct mm_struct *mm)
672	{
673	BUG_ON(mm == &init_mm);
674	WARN_ON_ONCE(mm == current->mm);
675
676	/ Ensure no CPUs are using this as their lazy tlb mm /
677	cleanup_lazy_tlbs(mm);
678
679	WARN_ON_ONCE(mm == current->active_mm);
680	mm_free_pgd(mm);
681	mm_free_id(mm);
682	destroy_context(mm);
683	mmu_notifier_subscriptions_destroy(mm);
684	check_mm(mm);
685	put_user_ns(ns: mm->user_ns);
686	mm_pasid_drop(mm);
687	mm_destroy_cid(mm);
688	percpu_counter_destroy_many(fbc: mm->rss_stat, nr_counters: NR_MM_COUNTERS);
689
690	free_mm(mm);
691	}
692	EXPORT_SYMBOL_GPL(__mmdrop);
693
694	static void mmdrop_async_fn(struct work_struct *work)
695	{
696	struct mm_struct *mm;
697
698	mm = container_of(work, struct mm_struct, async_put_work);
699	__mmdrop(mm);
700	}
701
702	static void mmdrop_async(struct mm_struct *mm)
703	{
704	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
705	INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
706	schedule_work(work: &mm->async_put_work);
707	}
708	}
709
710	static inline void free_signal_struct(struct signal_struct *sig)
711	{
712	taskstats_tgid_free(sig);
713	sched_autogroup_exit(sig);
714	/*
715	* __mmdrop is not safe to call from softirq context on x86 due to
716	* pgd_dtor so postpone it to the async context
717	*/
718	if (sig->oom_mm)
719	mmdrop_async(mm: sig->oom_mm);
720	kmem_cache_free(s: signal_cachep, objp: sig);
721	}
722
723	static inline void put_signal_struct(struct signal_struct *sig)
724	{
725	if (refcount_dec_and_test(r: &sig->sigcnt))
726	free_signal_struct(sig);
727	}
728
729	void __put_task_struct(struct task_struct *tsk)
730	{
731	WARN_ON(!tsk->exit_state);
732	WARN_ON(refcount_read(&tsk->usage));
733	WARN_ON(tsk == current);
734
735	sched_ext_free(p: tsk);
736	io_uring_free(tsk);
737	cgroup_free(p: tsk);
738	task_numa_free(p: tsk, final: true);
739	security_task_free(task: tsk);
740	exit_creds(tsk);
741	delayacct_tsk_free(tsk);
742	put_signal_struct(sig: tsk->signal);
743	sched_core_free(tsk);
744	free_task(tsk);
745	}
746	EXPORT_SYMBOL_GPL(__put_task_struct);
747
748	void __put_task_struct_rcu_cb(struct rcu_head *rhp)
749	{
750	struct task_struct task = container_of(rhp, struct* task_struct, rcu);
751
752	__put_task_struct(task);
753	}
754	EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
755
756	void __init __weak arch_task_cache_init(void) { }
757
758	/*
759	* set_max_threads
760	*/
761	static void __init set_max_threads(unsigned int max_threads_suggested)
762	{
763	u64 threads;
764	unsigned long nr_pages = memblock_estimated_nr_free_pages();
765
766	/*
767	* The number of threads shall be limited such that the thread
768	* structures may only consume a small part of the available memory.
769	*/
770	if (fls64(x: nr_pages) + fls64(PAGE_SIZE) > `64`)
771	threads = MAX_THREADS;
772	else
773	threads = div64_u64(dividend: (u64) nr_pages * (u64) PAGE_SIZE,
774	divisor: (u64) THREAD_SIZE * `8UL`);
775
776	if (threads > max_threads_suggested)
777	threads = max_threads_suggested;
778
779	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
780	}
781
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif
786
787	static void __init task_struct_whitelist(unsigned long offset, unsigned* long *size)
788	{
789	/ Fetch thread_struct whitelist for the architecture. /
790	arch_thread_struct_whitelist(offset, size);
791
792	/*
793	* Handle zero-sized whitelist or empty thread_struct, otherwise
794	* adjust offset to position of thread_struct in task_struct.
795	*/
796	if (unlikely(*size == `0`))
797	*offset = `0`;
798	else
799	offset += offsetof(struct* task_struct, thread);
800	}
801
802	void __init fork_init(void)
803	{
804	int i;
805	#ifndef ARCH_MIN_TASKALIGN
806	#define ARCH_MIN_TASKALIGN 0
807	#endif
808	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
809	unsigned long useroffset, usersize;
810
811	/ create a slab on which task_structs can be allocated /
812	task_struct_whitelist(offset: &useroffset, size: &usersize);
813	task_struct_cachep = kmem_cache_create_usercopy(name: "task_struct",
814	size: arch_task_struct_size, align,
815	SLAB_PANIC\|SLAB_ACCOUNT,
816	useroffset, usersize, NULL);
817
818	/ do the arch specific task caches init /
819	arch_task_cache_init();
820
821	set_max_threads(MAX_THREADS);
822
823	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/`2`;
824	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/`2`;
825	init_task.signal->rlim[RLIMIT_SIGPENDING] =
826	init_task.signal->rlim[RLIMIT_NPROC];
827
828	for (i = `0`; i < UCOUNT_COUNTS; i++)
829	init_user_ns.ucount_max[i] = max_threads/`2`;
830
831	set_userns_rlimit_max(ns: &init_user_ns, type: UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
832	set_userns_rlimit_max(ns: &init_user_ns, type: UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
833	set_userns_rlimit_max(ns: &init_user_ns, type: UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
834	set_userns_rlimit_max(ns: &init_user_ns, type: UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
835
836	#ifdef CONFIG_VMAP_STACK
837	cpuhp_setup_state(state: CPUHP_BP_PREPARE_DYN, name: "fork:vm_stack_cache",
838	NULL, teardown: free_vm_stack_cache);
839	#endif
840
841	scs_init();
842
843	lockdep_init_task(task: &init_task);
844	uprobes_init();
845	}
846
847	int __weak arch_dup_task_struct(struct task_struct *dst,
848	struct task_struct *src)
849	{
850	dst = src;
851	return `0`;
852	}
853
854	void set_task_stack_end_magic(struct task_struct *tsk)
855	{
856	unsigned long *stackend;
857
858	stackend = end_of_stack(task: tsk);
859	stackend = STACK_END_MAGIC; /* for overflow detection /
860	}
861
862	static struct task_struct dup_task_struct(struct* task_struct orig, int* node)
863	{
864	struct task_struct *tsk;
865	int err;
866
867	if (node == NUMA_NO_NODE)
868	node = tsk_fork_get_node(tsk: orig);
869	tsk = alloc_task_struct_node(node);
870	if (!tsk)
871	return NULL;
872
873	err = arch_dup_task_struct(dst: tsk, src: orig);
874	if (err)
875	goto free_tsk;
876
877	err = alloc_thread_stack_node(tsk, node);
878	if (err)
879	goto free_tsk;
880
881	#ifdef CONFIG_THREAD_INFO_IN_TASK
882	refcount_set(r: &tsk->stack_refcount, n: `1`);
883	#endif
884	account_kernel_stack(tsk, account: `1`);
885
886	err = scs_prepare(tsk, node);
887	if (err)
888	goto free_stack;
889
890	#ifdef CONFIG_SECCOMP
891	/*
892	* We must handle setting up seccomp filters once we're under
893	* the sighand lock in case orig has changed between now and
894	* then. Until then, filter must be NULL to avoid messing up
895	* the usage counts on the error path calling free_task.
896	*/
897	tsk->seccomp.filter = NULL;
898	#endif
899
900	setup_thread_stack(tsk, orig);
901	clear_user_return_notifier(p: tsk);
902	clear_tsk_need_resched(tsk);
903	set_task_stack_end_magic(tsk);
904	clear_syscall_work_syscall_user_dispatch(tsk);
905
906	#ifdef CONFIG_STACKPROTECTOR
907	tsk->stack_canary = get_random_canary();
908	#endif
909	if (orig->cpus_ptr == &orig->cpus_mask)
910	tsk->cpus_ptr = &tsk->cpus_mask;
911	dup_user_cpus_ptr(dst: tsk, src: orig, node);
912
913	/*
914	* One for the user space visible state that goes away when reaped.
915	* One for the scheduler.
916	*/
917	refcount_set(r: &tsk->rcu_users, n: `2`);
918	/ One for the rcu users /
919	refcount_set(r: &tsk->usage, n: `1`);
920	#ifdef CONFIG_BLK_DEV_IO_TRACE
921	tsk->btrace_seq = `0`;
922	#endif
923	tsk->splice_pipe = NULL;
924	tsk->task_frag.page = NULL;
925	tsk->wake_q.next = NULL;
926	tsk->worker_private = NULL;
927
928	kcov_task_init(t: tsk);
929	kmsan_task_create(task: tsk);
930	kmap_local_fork(tsk);
931
932	#ifdef CONFIG_FAULT_INJECTION
933	tsk->fail_nth = `0`;
934	#endif
935
936	#ifdef CONFIG_BLK_CGROUP
937	tsk->throttle_disk = NULL;
938	tsk->use_memdelay = `0`;
939	#endif
940
941	#ifdef CONFIG_ARCH_HAS_CPU_PASID
942	tsk->pasid_activated = `0`;
943	#endif
944
945	#ifdef CONFIG_MEMCG
946	tsk->active_memcg = NULL;
947	#endif
948
949	#ifdef CONFIG_X86_BUS_LOCK_DETECT
950	tsk->reported_split_lock = `0`;
951	#endif
952
953	#ifdef CONFIG_SCHED_MM_CID
954	tsk->mm_cid = -`1`;
955	tsk->last_mm_cid = -`1`;
956	tsk->mm_cid_active = `0`;
957	tsk->migrate_from_cpu = -`1`;
958	#endif
959	return tsk;
960
961	free_stack:
962	exit_task_stack_account(tsk);
963	free_thread_stack(tsk);
964	free_tsk:
965	free_task_struct(tsk);
966	return NULL;
967	}
968
969	__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
970
971	static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
972
973	static int __init coredump_filter_setup(char *s)
974	{
975	default_dump_filter =
976	(simple_strtoul(s, NULL, `0`) << MMF_DUMP_FILTER_SHIFT) &
977	MMF_DUMP_FILTER_MASK;
978	return `1`;
979	}
980
981	__setup("coredump_filter=", coredump_filter_setup);
982
983	#include <linux/init_task.h>
984
/* Initialise the AIO fields of @mm (no-op without CONFIG_AIO). */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
	spin_lock_init(&mm->ioctx_lock);
	mm->ioctx_table = NULL;
#endif
}
992
993	static __always_inline void mm_clear_owner(struct mm_struct *mm,
994	struct task_struct *p)
995	{
996	#ifdef CONFIG_MEMCG
997	if (mm->owner == p)
998	WRITE_ONCE(mm->owner, NULL);
999	#endif
1000	}
1001
/* Record @p as the owner of @mm (no-op without CONFIG_MEMCG). */
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	mm->owner = p;
#endif
}
1008
/* Reset the uprobes XOL area pointer of @mm (no-op without CONFIG_UPROBES). */
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
	mm->uprobes_state.xol_area = NULL;
#endif
}
1015
1016	static void mmap_init_lock(struct mm_struct *mm)
1017	{
1018	init_rwsem(&mm->mmap_lock);
1019	mm_lock_seqcount_init(mm);
1020	#ifdef CONFIG_PER_VMA_LOCK
1021	rcuwait_init(w: &mm->vma_writer_wait);
1022	#endif
1023	}
1024
1025	static struct mm_struct mm_init(struct* mm_struct mm, struct* task_struct *p,
1026	struct user_namespace *user_ns)
1027	{
1028	mt_init_flags(mt: &mm->mm_mt, MM_MT_FLAGS);
1029	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
1030	atomic_set(v: &mm->mm_users, i: `1`);
1031	atomic_set(v: &mm->mm_count, i: `1`);
1032	seqcount_init(&mm->write_protect_seq);
1033	mmap_init_lock(mm);
1034	INIT_LIST_HEAD(list: &mm->mmlist);
1035	mm_pgtables_bytes_init(mm);
1036	mm->map_count = `0`;
1037	mm->locked_vm = `0`;
1038	atomic64_set(v: &mm->pinned_vm, i: `0`);
1039	memset(&mm->rss_stat, `0`, sizeof(mm->rss_stat));
1040	spin_lock_init(&mm->page_table_lock);
1041	spin_lock_init(&mm->arg_lock);
1042	mm_init_cpumask(mm);
1043	mm_init_aio(mm);
1044	mm_init_owner(mm, p);
1045	mm_pasid_init(mm);
1046	RCU_INIT_POINTER(mm->exe_file, NULL);
1047	mmu_notifier_subscriptions_init(mm);
1048	init_tlb_flush_pending(mm);
1049	futex_mm_init(mm);
1050	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
1051	mm->pmd_huge_pte = NULL;
1052	#endif
1053	mm_init_uprobes_state(mm);
1054	hugetlb_count_init(mm);
1055
1056	if (current->mm) {
1057	mm->flags = mmf_init_flags(current->mm->flags);
1058	mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
1059	} else {
1060	mm->flags = default_dump_filter;
1061	mm->def_flags = `0`;
1062	}
1063
1064	if (mm_alloc_pgd(mm))
1065	goto fail_nopgd;
1066
1067	if (mm_alloc_id(mm))
1068	goto fail_noid;
1069
1070	if (init_new_context(tsk: p, mm))
1071	goto fail_nocontext;
1072
1073	if (mm_alloc_cid(mm, p))
1074	goto fail_cid;
1075
1076	if (percpu_counter_init_many(mm->rss_stat, `0`, GFP_KERNEL_ACCOUNT,
1077	NR_MM_COUNTERS))
1078	goto fail_pcpu;
1079
1080	mm->user_ns = get_user_ns(ns: user_ns);
1081	lru_gen_init_mm(mm);
1082	return mm;
1083
1084	fail_pcpu:
1085	mm_destroy_cid(mm);
1086	fail_cid:
1087	destroy_context(mm);
1088	fail_nocontext:
1089	mm_free_id(mm);
1090	fail_noid:
1091	mm_free_pgd(mm);
1092	fail_nopgd:
1093	free_mm(mm);
1094	return NULL;
1095	}
1096
1097	/*
1098	* Allocate and initialize an mm_struct.
1099	*/
1100	struct mm_struct mm_alloc(void*)
1101	{
1102	struct mm_struct *mm;
1103
1104	mm = allocate_mm();
1105	if (!mm)
1106	return NULL;
1107
1108	memset(mm, `0`, sizeof(*mm));
1109	return mm_init(mm, current, current_user_ns());
1110	}
1111	EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
1112
1113	static inline void __mmput(struct mm_struct *mm)
1114	{
1115	VM_BUG_ON(atomic_read(&mm->mm_users));
1116
1117	uprobe_clear_state(mm);
1118	exit_aio(mm);
1119	ksm_exit(mm);
1120	khugepaged_exit(mm); / must run before exit_mmap /
1121	exit_mmap(mm);
1122	mm_put_huge_zero_folio(mm);
1123	set_mm_exe_file(mm, NULL);
1124	if (!list_empty(head: &mm->mmlist)) {
1125	spin_lock(lock: &mmlist_lock);
1126	list_del(entry: &mm->mmlist);
1127	spin_unlock(lock: &mmlist_lock);
1128	}
1129	if (mm->binfmt)
1130	module_put(module: mm->binfmt->module);
1131	lru_gen_del_mm(mm);
1132	futex_hash_free(mm);
1133	mmdrop(mm);
1134	}
1135
1136	/*
1137	* Decrement the use count and release all resources for an mm.
1138	*/
1139	void mmput(struct mm_struct *mm)
1140	{
1141	might_sleep();
1142
1143	if (atomic_dec_and_test(v: &mm->mm_users))
1144	__mmput(mm);
1145	}
1146	EXPORT_SYMBOL_GPL(mmput);
1147
#ifdef CONFIG_MMU
/* Work callback: run __mmput() on the mm that embeds @work. */
static void mmput_async_fn(struct work_struct *work)
{
	struct mm_struct *mm = container_of(work, struct mm_struct,
					    async_put_work);

	__mmput(mm);
}

/*
 * Like mmput(), except that when the last user reference is dropped the
 * teardown is queued with schedule_work() instead of being run directly
 * by the caller.
 */
void mmput_async(struct mm_struct *mm)
{
	if (atomic_dec_and_test(&mm->mm_users)) {
		INIT_WORK(&mm->async_put_work, mmput_async_fn);
		schedule_work(&mm->async_put_work);
	}
}
EXPORT_SYMBOL_GPL(mmput_async);
#endif
1166
1167	/**
1168	* set_mm_exe_file - change a reference to the mm's executable file
1169	* @mm: The mm to change.
1170	* @new_exe_file: The new file to use.
1171	*
1172	* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1173	*
1174	* Main users are mmput() and sys_execve(). Callers prevent concurrent
1175	* invocations: in mmput() nobody alive left, in execve it happens before
1176	* the new mm is made visible to anyone.
1177	*
1178	* Can only fail if new_exe_file != NULL.
1179	*/
1180	int set_mm_exe_file(struct mm_struct mm, struct* file *new_exe_file)
1181	{
1182	struct file *old_exe_file;
1183
1184	/*
1185	* It is safe to dereference the exe_file without RCU as
1186	* this function is only called if nobody else can access
1187	* this mm -- see comment above for justification.
1188	*/
1189	old_exe_file = rcu_dereference_raw(mm->exe_file);
1190
1191	if (new_exe_file) {
1192	/*
1193	* We expect the caller (i.e., sys_execve) to already denied
1194	* write access, so this is unlikely to fail.
1195	*/
1196	if (unlikely(exe_file_deny_write_access(new_exe_file)))
1197	return -EACCES;
1198	get_file(f: new_exe_file);
1199	}
1200	rcu_assign_pointer(mm->exe_file, new_exe_file);
1201	if (old_exe_file) {
1202	exe_file_allow_write_access(exe_file: old_exe_file);
1203	fput(old_exe_file);
1204	}
1205	return `0`;
1206	}
1207
1208	/**
1209	* replace_mm_exe_file - replace a reference to the mm's executable file
1210	* @mm: The mm to change.
1211	* @new_exe_file: The new file to use.
1212	*
1213	* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1214	*
1215	* Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
1216	*/
1217	int replace_mm_exe_file(struct mm_struct mm, struct* file *new_exe_file)
1218	{
1219	struct vm_area_struct *vma;
1220	struct file *old_exe_file;
1221	int ret = `0`;
1222
1223	/ Forbid mm->exe_file change if old file still mapped. /
1224	old_exe_file = get_mm_exe_file(mm);
1225	if (old_exe_file) {
1226	VMA_ITERATOR(vmi, mm, `0`);
1227	mmap_read_lock(mm);
1228	for_each_vma(vmi, vma) {
1229	if (!vma->vm_file)
1230	continue;
1231	if (path_equal(path1: &vma->vm_file->f_path,
1232	path2: &old_exe_file->f_path)) {
1233	ret = -EBUSY;
1234	break;
1235	}
1236	}
1237	mmap_read_unlock(mm);
1238	fput(old_exe_file);
1239	if (ret)
1240	return ret;
1241	}
1242
1243	ret = exe_file_deny_write_access(exe_file: new_exe_file);
1244	if (ret)
1245	return -EACCES;
1246	get_file(f: new_exe_file);
1247
1248	/ set the new file /
1249	mmap_write_lock(mm);
1250	old_exe_file = rcu_dereference_raw(mm->exe_file);
1251	rcu_assign_pointer(mm->exe_file, new_exe_file);
1252	mmap_write_unlock(mm);
1253
1254	if (old_exe_file) {
1255	exe_file_allow_write_access(exe_file: old_exe_file);
1256	fput(old_exe_file);
1257	}
1258	return `0`;
1259	}
1260
1261	/**
1262	* get_mm_exe_file - acquire a reference to the mm's executable file
1263	* @mm: The mm of interest.
1264	*
1265	* Returns %NULL if mm has no associated executable file.
1266	* User must release file via fput().
1267	*/
1268	struct file get_mm_exe_file(struct* mm_struct *mm)
1269	{
1270	struct file *exe_file;
1271
1272	rcu_read_lock();
1273	exe_file = get_file_rcu(f: &mm->exe_file);
1274	rcu_read_unlock();
1275	return exe_file;
1276	}
1277
1278	/**
1279	* get_task_exe_file - acquire a reference to the task's executable file
1280	* @task: The task.
1281	*
1282	* Returns %NULL if task's mm (if any) has no associated executable file or
1283	* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
1284	* User must release file via fput().
1285	*/
1286	struct file get_task_exe_file(struct* task_struct *task)
1287	{
1288	struct file *exe_file = NULL;
1289	struct mm_struct *mm;
1290
1291	if (task->flags & PF_KTHREAD)
1292	return NULL;
1293
1294	task_lock(p: task);
1295	mm = task->mm;
1296	if (mm)
1297	exe_file = get_mm_exe_file(mm);
1298	task_unlock(p: task);
1299	return exe_file;
1300	}
1301
1302	/**
1303	* get_task_mm - acquire a reference to the task's mm
1304	* @task: The task.
1305	*
1306	* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
1307	* this kernel workthread has transiently adopted a user mm with use_mm,
1308	* to do its AIO) is not set and if so returns a reference to it, after
1309	* bumping up the use count. User must release the mm via mmput()
1310	* after use. Typically used by /proc and ptrace.
1311	*/
1312	struct mm_struct get_task_mm(struct* task_struct *task)
1313	{
1314	struct mm_struct *mm;
1315
1316	if (task->flags & PF_KTHREAD)
1317	return NULL;
1318
1319	task_lock(p: task);
1320	mm = task->mm;
1321	if (mm)
1322	mmget(mm);
1323	task_unlock(p: task);
1324	return mm;
1325	}
1326	EXPORT_SYMBOL_GPL(get_task_mm);
1327
1328	static bool may_access_mm(struct mm_struct mm, struct* task_struct task, unsigned* int mode)
1329	{
1330	if (mm == current->mm)
1331	return true;
1332	if (ptrace_may_access(task, mode))
1333	return true;
1334	if ((mode & PTRACE_MODE_READ) && perfmon_capable())
1335	return true;
1336	return false;
1337	}
1338
1339	struct mm_struct mm_access(struct* task_struct task, unsigned* int mode)
1340	{
1341	struct mm_struct *mm;
1342	int err;
1343
1344	err = down_read_killable(sem: &task->signal->exec_update_lock);
1345	if (err)
1346	return ERR_PTR(error: err);
1347
1348	mm = get_task_mm(task);
1349	if (!mm) {
1350	mm = ERR_PTR(error: -ESRCH);
1351	} else if (!may_access_mm(mm, task, mode)) {
1352	mmput(mm);
1353	mm = ERR_PTR(error: -EACCES);
1354	}
1355	up_read(sem: &task->signal->exec_update_lock);
1356
1357	return mm;
1358	}
1359
1360	static void complete_vfork_done(struct task_struct *tsk)
1361	{
1362	struct completion *vfork;
1363
1364	task_lock(p: tsk);
1365	vfork = tsk->vfork_done;
1366	if (likely(vfork)) {
1367	tsk->vfork_done = NULL;
1368	complete(vfork);
1369	}
1370	task_unlock(p: tsk);
1371	}
1372
1373	static int wait_for_vfork_done(struct task_struct *child,
1374	struct completion *vfork)
1375	{
1376	unsigned int state = TASK_KILLABLE\|TASK_FREEZABLE;
1377	int killed;
1378
1379	cgroup_enter_frozen();
1380	killed = wait_for_completion_state(x: vfork, state);
1381	cgroup_leave_frozen(always_leave: false);
1382
1383	if (killed) {
1384	task_lock(p: child);
1385	child->vfork_done = NULL;
1386	task_unlock(p: child);
1387	}
1388
1389	put_task_struct(t: child);
1390	return killed;
1391	}
1392
1393	/ Please note the differences between mmput and mm_release.*
1394	* mmput is called whenever we stop holding onto a mm_struct,
1395	* error success whatever.
1396	*
1397	* mm_release is called after a mm_struct has been removed
1398	* from the current process.
1399	*
1400	* This difference is important for error handling, when we
1401	* only half set up a mm_struct for a new process and need to restore
1402	* the old one. Because we mmput the new mm_struct before
1403	* restoring the old one. . .
1404	* Eric Biederman 10 January 1998
1405	*/
1406	static void mm_release(struct task_struct tsk, struct* mm_struct *mm)
1407	{
1408	uprobe_free_utask(t: tsk);
1409
1410	/ Get rid of any cached register state /
1411	deactivate_mm(tsk, mm);
1412
1413	/*
1414	* Signal userspace if we're not exiting with a core dump
1415	* because we want to leave the value intact for debugging
1416	* purposes.
1417	*/
1418	if (tsk->clear_child_tid) {
1419	if (atomic_read(v: &mm->mm_users) > `1`) {
1420	/*
1421	* We don't check the error code - if userspace has
1422	* not set up a proper pointer then tough luck.
1423	*/
1424	put_user(`0`, tsk->clear_child_tid);
1425	do_futex(uaddr: tsk->clear_child_tid, FUTEX_WAKE,
1426	val: `1`, NULL, NULL, val2: `0`, val3: `0`);
1427	}
1428	tsk->clear_child_tid = NULL;
1429	}
1430
1431	/*
1432	* All done, finally we can wake up parent and return this mm to him.
1433	* Also kthread_stop() uses this completion for synchronization.
1434	*/
1435	if (tsk->vfork_done)
1436	complete_vfork_done(tsk);
1437	}
1438
/* Release @mm from @tsk: run futex_exit_release(), then mm_release(). */
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	futex_exit_release(tsk);
	mm_release(tsk, mm);
}
1444
/* Release @mm from @tsk: run futex_exec_release(), then mm_release(). */
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	futex_exec_release(tsk);
	mm_release(tsk, mm);
}
1450
1451	/**
1452	* dup_mm() - duplicates an existing mm structure
1453	* @tsk: the task_struct with which the new mm will be associated.
1454	* @oldmm: the mm to duplicate.
1455	*
1456	* Allocates a new mm structure and duplicates the provided @oldmm structure
1457	* content into it.
1458	*
1459	* Return: the duplicated mm or NULL on failure.
1460	*/
1461	static struct mm_struct dup_mm(struct* task_struct *tsk,
1462	struct mm_struct *oldmm)
1463	{
1464	struct mm_struct *mm;
1465	int err;
1466
1467	mm = allocate_mm();
1468	if (!mm)
1469	goto fail_nomem;
1470
1471	memcpy(mm, oldmm, sizeof(*mm));
1472
1473	if (!mm_init(mm, p: tsk, user_ns: mm->user_ns))
1474	goto fail_nomem;
1475
1476	uprobe_start_dup_mmap();
1477	err = dup_mmap(mm, oldmm);
1478	if (err)
1479	goto free_pt;
1480	uprobe_end_dup_mmap();
1481
1482	mm->hiwater_rss = get_mm_rss(mm);
1483	mm->hiwater_vm = mm->total_vm;
1484
1485	if (mm->binfmt && !try_module_get(module: mm->binfmt->module))
1486	goto free_pt;
1487
1488	return mm;
1489
1490	free_pt:
1491	/ don't put binfmt in mmput, we haven't got module yet /
1492	mm->binfmt = NULL;
1493	mm_init_owner(mm, NULL);
1494	mmput(mm);
1495	if (err)
1496	uprobe_end_dup_mmap();
1497
1498	fail_nomem:
1499	return NULL;
1500	}
1501
1502	static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1503	{
1504	struct mm_struct mm, oldmm;
1505
1506	tsk->min_flt = tsk->maj_flt = `0`;
1507	tsk->nvcsw = tsk->nivcsw = `0`;
1508	#ifdef CONFIG_DETECT_HUNG_TASK
1509	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1510	tsk->last_switch_time = `0`;
1511	#endif
1512
1513	tsk->mm = NULL;
1514	tsk->active_mm = NULL;
1515
1516	/*
1517	* Are we cloning a kernel thread?
1518	*
1519	* We need to steal a active VM for that..
1520	*/
1521	oldmm = current->mm;
1522	if (!oldmm)
1523	return `0`;
1524
1525	if (clone_flags & CLONE_VM) {
1526	mmget(mm: oldmm);
1527	mm = oldmm;
1528	} else {
1529	mm = dup_mm(tsk, current->mm);
1530	if (!mm)
1531	return -ENOMEM;
1532	}
1533
1534	tsk->mm = mm;
1535	tsk->active_mm = mm;
1536	sched_mm_cid_fork(t: tsk);
1537	return `0`;
1538	}
1539
1540	static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1541	{
1542	struct fs_struct *fs = current->fs;
1543	if (clone_flags & CLONE_FS) {
1544	/ tsk->fs is already what we want /
1545	spin_lock(lock: &fs->lock);
1546	/ "users" and "in_exec" locked for check_unsafe_exec() /
1547	if (fs->in_exec) {
1548	spin_unlock(lock: &fs->lock);
1549	return -EAGAIN;
1550	}
1551	fs->users++;
1552	spin_unlock(lock: &fs->lock);
1553	return `0`;
1554	}
1555	tsk->fs = copy_fs_struct(fs);
1556	if (!tsk->fs)
1557	return -ENOMEM;
1558	return `0`;
1559	}
1560
1561	static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
1562	int no_files)
1563	{
1564	struct files_struct oldf, newf;
1565
1566	/*
1567	* A background process may not have any files ...
1568	*/
1569	oldf = current->files;
1570	if (!oldf)
1571	return `0`;
1572
1573	if (no_files) {
1574	tsk->files = NULL;
1575	return `0`;
1576	}
1577
1578	if (clone_flags & CLONE_FILES) {
1579	atomic_inc(v: &oldf->count);
1580	return `0`;
1581	}
1582
1583	newf = dup_fd(oldf, NULL);
1584	if (IS_ERR(ptr: newf))
1585	return PTR_ERR(ptr: newf);
1586
1587	tsk->files = newf;
1588	return `0`;
1589	}
1590
1591	static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1592	{
1593	struct sighand_struct *sig;
1594
1595	if (clone_flags & CLONE_SIGHAND) {
1596	refcount_inc(r: &current->sighand->count);
1597	return `0`;
1598	}
1599	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1600	RCU_INIT_POINTER(tsk->sighand, sig);
1601	if (!sig)
1602	return -ENOMEM;
1603
1604	refcount_set(r: &sig->count, n: `1`);
1605	spin_lock_irq(lock: &current->sighand->siglock);
1606	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
1607	spin_unlock_irq(lock: &current->sighand->siglock);
1608
1609	/ Reset all signal handler not set to SIG_IGN to SIG_DFL. /
1610	if (clone_flags & CLONE_CLEAR_SIGHAND)
1611	flush_signal_handlers(tsk, force_default: `0`);
1612
1613	return `0`;
1614	}
1615
1616	void __cleanup_sighand(struct sighand_struct *sighand)
1617	{
1618	if (refcount_dec_and_test(r: &sighand->count)) {
1619	signalfd_cleanup(sighand);
1620	/*
1621	* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
1622	* without an RCU grace period, see __lock_task_sighand().
1623	*/
1624	kmem_cache_free(s: sighand_cachep, objp: sighand);
1625	}
1626	}
1627
1628	/*
1629	* Initialize POSIX timer handling for a thread group.
1630	*/
1631	static void posix_cpu_timers_init_group(struct signal_struct *sig)
1632	{
1633	struct posix_cputimers *pct = &sig->posix_cputimers;
1634	unsigned long cpu_limit;
1635
1636	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1637	posix_cputimers_group_init(pct, cpu_limit);
1638	}
1639
1640	static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1641	{
1642	struct signal_struct *sig;
1643
1644	if (clone_flags & CLONE_THREAD)
1645	return `0`;
1646
1647	sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
1648	tsk->signal = sig;
1649	if (!sig)
1650	return -ENOMEM;
1651
1652	sig->nr_threads = `1`;
1653	sig->quick_threads = `1`;
1654	atomic_set(v: &sig->live, i: `1`);
1655	refcount_set(r: &sig->sigcnt, n: `1`);
1656
1657	/ list_add(thread_node, thread_head) without INIT_LIST_HEAD() /
1658	sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1659	tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1660
1661	init_waitqueue_head(&sig->wait_chldexit);
1662	sig->curr_target = tsk;
1663	init_sigpending(sig: &sig->shared_pending);
1664	INIT_HLIST_HEAD(&sig->multiprocess);
1665	seqlock_init(&sig->stats_lock);
1666	prev_cputime_init(prev: &sig->prev_cputime);
1667
1668	#ifdef CONFIG_POSIX_TIMERS
1669	INIT_HLIST_HEAD(&sig->posix_timers);
1670	INIT_HLIST_HEAD(&sig->ignored_posix_timers);
1671	hrtimer_setup(timer: &sig->real_timer, function: it_real_fn, CLOCK_MONOTONIC, mode: HRTIMER_MODE_REL);
1672	#endif
1673
1674	task_lock(current->group_leader);
1675	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
1676	task_unlock(current->group_leader);
1677
1678	posix_cpu_timers_init_group(sig);
1679
1680	tty_audit_fork(sig);
1681	sched_autogroup_fork(sig);
1682
1683	sig->oom_score_adj = current->signal->oom_score_adj;
1684	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1685
1686	mutex_init(&sig->cred_guard_mutex);
1687	init_rwsem(&sig->exec_update_lock);
1688
1689	return `0`;
1690	}
1691
/*
 * Copy the current task's seccomp state to the new task @p.
 * Compiles to an empty function without CONFIG_SECCOMP.
 */
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
	/*
	 * Must be called with sighand->lock held, which is common to
	 * all threads in the group. Holding cred_guard_mutex is not
	 * needed because this new task is not yet running and cannot
	 * be racing exec.
	 */
	assert_spin_locked(&current->sighand->siglock);

	/* Ref-count the new filter user, and assign it. */
	get_seccomp_filter(current);
	p->seccomp = current->seccomp;

	/*
	 * Explicitly enable no_new_privs here in case it got set
	 * between the task_struct being duplicated and holding the
	 * sighand lock. The seccomp state and nnp must be in sync.
	 */
	if (task_no_new_privs(current))
		task_set_no_new_privs(p);

	/*
	 * If the parent gained a seccomp mode after the thread flags
	 * were copied but before we held the sighand lock, we have
	 * to manually enable the seccomp thread flag here.
	 */
	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
		set_task_syscall_work(p, SECCOMP);
#endif
}
1724
1725	SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1726	{
1727	current->clear_child_tid = tidptr;
1728
1729	return task_pid_vnr(current);
1730	}
1731
1732	static void rt_mutex_init_task(struct task_struct *p)
1733	{
1734	raw_spin_lock_init(&p->pi_lock);
1735	#ifdef CONFIG_RT_MUTEXES
1736	p->pi_waiters = RB_ROOT_CACHED;
1737	p->pi_top_task = NULL;
1738	p->pi_blocked_on = NULL;
1739	#endif
1740	}
1741
1742	static inline void init_task_pid_links(struct task_struct *task)
1743	{
1744	enum pid_type type;
1745
1746	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
1747	INIT_HLIST_NODE(h: &task->pid_links[type]);
1748	}
1749
1750	static inline void
1751	init_task_pid(struct task_struct task, enum* pid_type type, struct pid *pid)
1752	{
1753	if (type == PIDTYPE_PID)
1754	task->thread_pid = pid;
1755	else
1756	task->signal->pids[type] = pid;
1757	}
1758
/*
 * Reset the per-task RCU state of the new task @p. Each group of fields
 * is only touched when the corresponding RCU flavor is configured.
 */
static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
	p->rcu_read_lock_nesting = 0;
	p->rcu_read_unlock_special.s = 0;
	p->rcu_blocked_node = NULL;
	INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
	p->rcu_tasks_holdout = false;
	INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
	p->rcu_tasks_idle_cpu = -1;
	INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
	p->trc_reader_nesting = 0;
	p->trc_reader_special.s = 0;
	INIT_LIST_HEAD(&p->trc_holdout_list);
	INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
1780
1781	/**
1782	* pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
1783	* @pid: the struct pid for which to create a pidfd
1784	* @flags: flags of the new @pidfd
1785	* @ret_file: return the new pidfs file
1786	*
1787	* Allocate a new file that stashes @pid and reserve a new pidfd number in the
1788	* caller's file descriptor table. The pidfd is reserved but not installed yet.
1789	*
1790	* The helper verifies that @pid is still in use, without PIDFD_THREAD the
1791	* task identified by @pid must be a thread-group leader.
1792	*
1793	* If this function returns successfully the caller is responsible to either
1794	* call fd_install() passing the returned pidfd and pidfd file as arguments in
1795	* order to install the pidfd into its file descriptor table or they must use
1796	* put_unused_fd() and fput() on the returned pidfd and pidfd file
1797	* respectively.
1798	*
1799	* This function is useful when a pidfd must already be reserved but there
1800	* might still be points of failure afterwards and the caller wants to ensure
1801	* that no pidfd is leaked into its file descriptor table.
1802	*
1803	* Return: On success, a reserved pidfd is returned from the function and a new
1804	* pidfd file is returned in the last argument to the function. On
1805	* error, a negative error code is returned from the function and the
1806	* last argument remains unchanged.
1807	*/
1808	int pidfd_prepare(struct pid pid, unsigned* int flags, struct file **ret_file)
1809	{
1810	struct file *pidfs_file;
1811
1812	/*
1813	* PIDFD_STALE is only allowed to be passed if the caller knows
1814	* that @pid is already registered in pidfs and thus
1815	* PIDFD_INFO_EXIT information is guaranteed to be available.
1816	*/
1817	if (!(flags & PIDFD_STALE)) {
1818	/*
1819	* While holding the pidfd waitqueue lock removing the
1820	* task linkage for the thread-group leader pid
1821	* (PIDTYPE_TGID) isn't possible. Thus, if there's still
1822	* task linkage for PIDTYPE_PID not having thread-group
1823	* leader linkage for the pid means it wasn't a
1824	* thread-group leader in the first place.
1825	*/
1826	guard(spinlock_irq)(l: &pid->wait_pidfd.lock);
1827
1828	/ Task has already been reaped. /
1829	if (!pid_has_task(pid, type: PIDTYPE_PID))
1830	return -ESRCH;
1831	/*
1832	* If this struct pid isn't used as a thread-group
1833	* leader but the caller requested to create a
1834	* thread-group leader pidfd then report ENOENT.
1835	*/
1836	if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, type: PIDTYPE_TGID))
1837	return -ENOENT;
1838	}
1839
1840	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
1841	if (pidfd < `0`)
1842	return pidfd;
1843
1844	pidfs_file = pidfs_alloc_file(pid, flags: flags \| O_RDWR);
1845	if (IS_ERR(ptr: pidfs_file))
1846	return PTR_ERR(ptr: pidfs_file);
1847
1848	*ret_file = pidfs_file;
1849	return take_fd(pidfd);
1850	}
1851
1852	static void __delayed_free_task(struct rcu_head *rhp)
1853	{
1854	struct task_struct tsk = container_of(rhp, struct* task_struct, rcu);
1855
1856	free_task(tsk);
1857	}
1858
1859	static __always_inline void delayed_free_task(struct task_struct *tsk)
1860	{
1861	if (IS_ENABLED(CONFIG_MEMCG))
1862	call_rcu(head: &tsk->rcu, func: __delayed_free_task);
1863	else
1864	free_task(tsk);
1865	}
1866
1867	static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
1868	{
1869	/ Skip if kernel thread /
1870	if (!tsk->mm)
1871	return;
1872
1873	/ Skip if spawning a thread or using vfork /
1874	if ((clone_flags & (CLONE_VM \| CLONE_THREAD \| CLONE_VFORK)) != CLONE_VM)
1875	return;
1876
1877	/ We need to synchronize with __set_oom_adj /
1878	mutex_lock(&oom_adj_mutex);
1879	set_bit(MMF_MULTIPROCESS, addr: &tsk->mm->flags);
1880	/ Update the values in case they were changed after copy_signal /
1881	tsk->signal->oom_score_adj = current->signal->oom_score_adj;
1882	tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
1883	mutex_unlock(lock: &oom_adj_mutex);
1884	}
1885
#ifdef CONFIG_RV
/* Start the new task @p with all of its per-task RV monitors switched off. */
static void rv_task_fork(struct task_struct *p)
{
	int i;

	for (i = 0; i < RV_PER_TASK_MONITORS; i++)
		p->rv[i].da_mon.monitoring = false;
}
#else
/* No-op without CONFIG_RV. */
#define rv_task_fork(p) do {} while (0)
#endif
1897
1898	static bool need_futex_hash_allocate_default(u64 clone_flags)
1899	{
1900	if ((clone_flags & (CLONE_THREAD \| CLONE_VM)) != (CLONE_THREAD \| CLONE_VM))
1901	return false;
1902	return true;
1903	}
1904
1905	/*
1906	* This creates a new process as a copy of the old one,
1907	* but does not actually start it yet.
1908	*
1909	* It copies the registers, and all the appropriate
1910	* parts of the process environment (as per the clone
1911	* flags). The actual kick-off is left to the caller.
1912	*/
1913	__latent_entropy struct task_struct *copy_process(
1914	struct pid *pid,
1915	int trace,
1916	int node,
1917	struct kernel_clone_args *args)
1918	{
1919	int pidfd = -`1`, retval;
1920	struct task_struct *p;
1921	struct multiprocess_signals delayed;
1922	struct file *pidfile = NULL;
1923	const u64 clone_flags = args->flags;
1924	struct nsproxy *nsp = current->nsproxy;
1925
1926	/*
1927	* Don't allow sharing the root directory with processes in a different
1928	* namespace
1929	*/
1930	if ((clone_flags & (CLONE_NEWNS\|CLONE_FS)) == (CLONE_NEWNS\|CLONE_FS))
1931	return ERR_PTR(error: -EINVAL);
1932
1933	if ((clone_flags & (CLONE_NEWUSER\|CLONE_FS)) == (CLONE_NEWUSER\|CLONE_FS))
1934	return ERR_PTR(error: -EINVAL);
1935
1936	/*
1937	* Thread groups must share signals as well, and detached threads
1938	* can only be started up within the thread group.
1939	*/
1940	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
1941	return ERR_PTR(error: -EINVAL);
1942
1943	/*
1944	* Shared signal handlers imply shared VM. By way of the above,
1945	* thread groups also imply shared VM. Blocking this case allows
1946	* for various simplifications in other code.
1947	*/
1948	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
1949	return ERR_PTR(error: -EINVAL);
1950
1951	/*
1952	* Siblings of global init remain as zombies on exit since they are
1953	* not reaped by their parent (swapper). To solve this and to avoid
1954	* multi-rooted process trees, prevent global and container-inits
1955	* from creating siblings.
1956	*/
1957	if ((clone_flags & CLONE_PARENT) &&
1958	current->signal->flags & SIGNAL_UNKILLABLE)
1959	return ERR_PTR(error: -EINVAL);
1960
1961	/*
1962	* If the new process will be in a different pid or user namespace
1963	* do not allow it to share a thread group with the forking task.
1964	*/
1965	if (clone_flags & CLONE_THREAD) {
1966	if ((clone_flags & (CLONE_NEWUSER \| CLONE_NEWPID)) \|\|
1967	(task_active_pid_ns(current) != nsp->pid_ns_for_children))
1968	return ERR_PTR(error: -EINVAL);
1969	}
1970
1971	if (clone_flags & CLONE_PIDFD) {
1972	/*
1973	* - CLONE_DETACHED is blocked so that we can potentially
1974	* reuse it later for CLONE_PIDFD.
1975	*/
1976	if (clone_flags & CLONE_DETACHED)
1977	return ERR_PTR(error: -EINVAL);
1978	}
1979
1980	/*
1981	* Force any signals received before this point to be delivered
1982	* before the fork happens. Collect up signals sent to multiple
1983	* processes that happen during the fork and delay them so that
1984	* they appear to happen after the fork.
1985	*/
1986	sigemptyset(set: &delayed.signal);
1987	INIT_HLIST_NODE(h: &delayed.node);
1988
1989	spin_lock_irq(lock: &current->sighand->siglock);
1990	if (!(clone_flags & CLONE_THREAD))
1991	hlist_add_head(n: &delayed.node, h: &current->signal->multiprocess);
1992	recalc_sigpending();
1993	spin_unlock_irq(lock: &current->sighand->siglock);
1994	retval = -ERESTARTNOINTR;
1995	if (task_sigpending(current))
1996	goto fork_out;
1997
1998	retval = -ENOMEM;
1999	p = dup_task_struct(current, node);
2000	if (!p)
2001	goto fork_out;
2002	p->flags &= ~PF_KTHREAD;
2003	if (args->kthread)
2004	p->flags \|= PF_KTHREAD;
2005	if (args->user_worker) {
2006	/*
2007	* Mark us a user worker, and block any signal that isn't
2008	* fatal or STOP
2009	*/
2010	p->flags \|= PF_USER_WORKER;
2011	siginitsetinv(set: &p->blocked, sigmask(SIGKILL)\|sigmask(SIGSTOP));
2012	}
2013	if (args->io_thread)
2014	p->flags \|= PF_IO_WORKER;
2015
2016	if (args->name)
2017	strscpy_pad(p->comm, args->name, sizeof(p->comm));
2018
2019	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
2020	/*
2021	* Clear TID on mm_release()?
2022	*/
2023	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
2024
2025	ftrace_graph_init_task(t: p);
2026
2027	rt_mutex_init_task(p);
2028
2029	lockdep_assert_irqs_enabled();
2030	#ifdef CONFIG_PROVE_LOCKING
2031	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
2032	#endif
2033	retval = copy_creds(p, clone_flags);
2034	if (retval < `0`)
2035	goto bad_fork_free;
2036
2037	retval = -EAGAIN;
2038	if (is_rlimit_overlimit(task_ucounts(p), type: UCOUNT_RLIMIT_NPROC, max: rlimit(RLIMIT_NPROC))) {
2039	if (p->real_cred->user != INIT_USER &&
2040	!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
2041	goto bad_fork_cleanup_count;
2042	}
2043	current->flags &= ~PF_NPROC_EXCEEDED;
2044
2045	/*
2046	* If multiple threads are within copy_process(), then this check
2047	* triggers too late. This doesn't hurt, the check is only there
2048	* to stop root fork bombs.
2049	*/
2050	retval = -EAGAIN;
2051	if (data_race(nr_threads >= max_threads))
2052	goto bad_fork_cleanup_count;
2053
2054	delayacct_tsk_init(tsk: p); / Must remain after dup_task_struct() /
2055	p->flags &= ~(PF_SUPERPRIV \| PF_WQ_WORKER \| PF_IDLE \| PF_NO_SETAFFINITY);
2056	p->flags \|= PF_FORKNOEXEC;
2057	INIT_LIST_HEAD(list: &p->children);
2058	INIT_LIST_HEAD(list: &p->sibling);
2059	rcu_copy_process(p);
2060	p->vfork_done = NULL;
2061	spin_lock_init(&p->alloc_lock);
2062
2063	init_sigpending(sig: &p->pending);
2064
2065	p->utime = p->stime = p->gtime = `0`;
2066	#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
2067	p->utimescaled = p->stimescaled = `0`;
2068	#endif
2069	prev_cputime_init(prev: &p->prev_cputime);
2070
2071	#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
2072	seqcount_init(&p->vtime.seqcount);
2073	p->vtime.starttime = `0`;
2074	p->vtime.state = VTIME_INACTIVE;
2075	#endif
2076
2077	#ifdef CONFIG_IO_URING
2078	p->io_uring = NULL;
2079	#endif
2080
2081	p->default_timer_slack_ns = current->timer_slack_ns;
2082
2083	#ifdef CONFIG_PSI
2084	p->psi_flags = `0`;
2085	#endif
2086
2087	task_io_accounting_init(ioac: &p->ioac);
2088	acct_clear_integrals(tsk: p);
2089
2090	posix_cputimers_init(pct: &p->posix_cputimers);
2091	tick_dep_init_task(tsk: p);
2092
2093	p->io_context = NULL;
2094	audit_set_context(task: p, NULL);
2095	cgroup_fork(p);
2096	if (args->kthread) {
2097	if (!set_kthread_struct(p))
2098	goto bad_fork_cleanup_delayacct;
2099	}
2100	#ifdef CONFIG_NUMA
2101	p->mempolicy = mpol_dup(pol: p->mempolicy);
2102	if (IS_ERR(ptr: p->mempolicy)) {
2103	retval = PTR_ERR(ptr: p->mempolicy);
2104	p->mempolicy = NULL;
2105	goto bad_fork_cleanup_delayacct;
2106	}
2107	#endif
2108	#ifdef CONFIG_CPUSETS
2109	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
2110	seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
2111	#endif
2112	#ifdef CONFIG_TRACE_IRQFLAGS
2113	memset(&p->irqtrace, `0`, sizeof(p->irqtrace));
2114	p->irqtrace.hardirq_disable_ip = _THIS_IP_;
2115	p->irqtrace.softirq_enable_ip = _THIS_IP_;
2116	p->softirqs_enabled = `1`;
2117	p->softirq_context = `0`;
2118	#endif
2119
2120	p->pagefault_disabled = `0`;
2121
2122	#ifdef CONFIG_LOCKDEP
2123	lockdep_init_task(task: p);
2124	#endif
2125
2126	#ifdef CONFIG_DEBUG_MUTEXES
2127	p->blocked_on = NULL; / not blocked yet /
2128	#endif
2129	#ifdef CONFIG_BCACHE
2130	p->sequential_io = `0`;
2131	p->sequential_io_avg = `0`;
2132	#endif
2133	#ifdef CONFIG_BPF_SYSCALL
2134	RCU_INIT_POINTER(p->bpf_storage, NULL);
2135	p->bpf_ctx = NULL;
2136	#endif
2137
2138	/ Perform scheduler related setup. Assign this task to a CPU. /
2139	retval = sched_fork(clone_flags, p);
2140	if (retval)
2141	goto bad_fork_cleanup_policy;
2142
2143	retval = perf_event_init_task(child: p, clone_flags);
2144	if (retval)
2145	goto bad_fork_sched_cancel_fork;
2146	retval = audit_alloc(task: p);
2147	if (retval)
2148	goto bad_fork_cleanup_perf;
2149	/ copy all the process information /
2150	shm_init_task(p);
2151	retval = security_task_alloc(task: p, clone_flags);
2152	if (retval)
2153	goto bad_fork_cleanup_audit;
2154	retval = copy_semundo(clone_flags, tsk: p);
2155	if (retval)
2156	goto bad_fork_cleanup_security;
2157	retval = copy_files(clone_flags, tsk: p, no_files: args->no_files);
2158	if (retval)
2159	goto bad_fork_cleanup_semundo;
2160	retval = copy_fs(clone_flags, tsk: p);
2161	if (retval)
2162	goto bad_fork_cleanup_files;
2163	retval = copy_sighand(clone_flags, tsk: p);
2164	if (retval)
2165	goto bad_fork_cleanup_fs;
2166	retval = copy_signal(clone_flags, tsk: p);
2167	if (retval)
2168	goto bad_fork_cleanup_sighand;
2169	retval = copy_mm(clone_flags, tsk: p);
2170	if (retval)
2171	goto bad_fork_cleanup_signal;
2172	retval = copy_namespaces(flags: clone_flags, tsk: p);
2173	if (retval)
2174	goto bad_fork_cleanup_mm;
2175	retval = copy_io(clone_flags, tsk: p);
2176	if (retval)
2177	goto bad_fork_cleanup_namespaces;
2178	retval = copy_thread(p, args);
2179	if (retval)
2180	goto bad_fork_cleanup_io;
2181
2182	stackleak_task_init(t: p);
2183
2184	if (pid != &init_struct_pid) {
2185	pid = alloc_pid(ns: p->nsproxy->pid_ns_for_children, set_tid: args->set_tid,
2186	set_tid_size: args->set_tid_size);
2187	if (IS_ERR(ptr: pid)) {
2188	retval = PTR_ERR(ptr: pid);
2189	goto bad_fork_cleanup_thread;
2190	}
2191	}
2192
2193	/*
2194	* This has to happen after we've potentially unshared the file
2195	* descriptor table (so that the pidfd doesn't leak into the child
2196	* if the fd table isn't shared).
2197	*/
2198	if (clone_flags & CLONE_PIDFD) {
2199	int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : `0`;
2200
2201	/*
2202	* Note that no task has been attached to @pid yet indicate
2203	* that via CLONE_PIDFD.
2204	*/
2205	retval = pidfd_prepare(pid, flags: flags \| PIDFD_STALE, ret_file: &pidfile);
2206	if (retval < `0`)
2207	goto bad_fork_free_pid;
2208	pidfd = retval;
2209
2210	retval = put_user(pidfd, args->pidfd);
2211	if (retval)
2212	goto bad_fork_put_pidfd;
2213	}
2214
2215	#ifdef CONFIG_BLOCK
2216	p->plug = NULL;
2217	#endif
2218	futex_init_task(tsk: p);
2219
2220	/*
2221	* sigaltstack should be cleared when sharing the same VM
2222	*/
2223	if ((clone_flags & (CLONE_VM\|CLONE_VFORK)) == CLONE_VM)
2224	sas_ss_reset(p);
2225
2226	/*
2227	* Syscall tracing and stepping should be turned off in the
2228	* child regardless of CLONE_PTRACE.
2229	*/
2230	user_disable_single_step(p);
2231	clear_task_syscall_work(p, SYSCALL_TRACE);
2232	#if defined(CONFIG_GENERIC_ENTRY) \|\| defined(TIF_SYSCALL_EMU)
2233	clear_task_syscall_work(p, SYSCALL_EMU);
2234	#endif
2235	clear_tsk_latency_tracing(p);
2236
2237	/ ok, now we should be set up.. /
2238	p->pid = pid_nr(pid);
2239	if (clone_flags & CLONE_THREAD) {
2240	p->group_leader = current->group_leader;
2241	p->tgid = current->tgid;
2242	} else {
2243	p->group_leader = p;
2244	p->tgid = p->pid;
2245	}
2246
2247	p->nr_dirtied = `0`;
2248	p->nr_dirtied_pause = `128` >> (PAGE_SHIFT - `10`);
2249	p->dirty_paused_when = `0`;
2250
2251	p->pdeath_signal = `0`;
2252	p->task_works = NULL;
2253	clear_posix_cputimers_work(p);
2254
2255	#ifdef CONFIG_KRETPROBES
2256	p->kretprobe_instances.first = NULL;
2257	#endif
2258	#ifdef CONFIG_RETHOOK
2259	p->rethooks.first = NULL;
2260	#endif
2261
2262	/*
2263	* Ensure that the cgroup subsystem policies allow the new process to be
2264	* forked. It should be noted that the new process's css_set can be changed
2265	* between here and cgroup_post_fork() if an organisation operation is in
2266	* progress.
2267	*/
2268	retval = cgroup_can_fork(p, kargs: args);
2269	if (retval)
2270	goto bad_fork_put_pidfd;
2271
2272	/*
2273	* Now that the cgroups are pinned, re-clone the parent cgroup and put
2274	* the new task on the correct runqueue. All this before the task
2275	* becomes visible.
2276	*
2277	* This isn't part of ->can_fork() because while the re-cloning is
2278	* cgroup specific, it unconditionally needs to place the task on a
2279	* runqueue.
2280	*/
2281	retval = sched_cgroup_fork(p, kargs: args);
2282	if (retval)
2283	goto bad_fork_cancel_cgroup;
2284
2285	/*
2286	* Allocate a default futex hash for the user process once the first
2287	* thread spawns.
2288	*/
2289	if (need_futex_hash_allocate_default(clone_flags)) {
2290	retval = futex_hash_allocate_default();
2291	if (retval)
2292	goto bad_fork_core_free;
2293	/*
2294	* If we fail beyond this point we don't free the allocated
2295	* futex hash map. We assume that another thread will be created
2296	* and makes use of it. The hash map will be freed once the main
2297	* thread terminates.
2298	*/
2299	}
2300	/*
2301	* From this point on we must avoid any synchronous user-space
2302	* communication until we take the tasklist-lock. In particular, we do
2303	* not want user-space to be able to predict the process start-time by
2304	* stalling fork(2) after we recorded the start_time but before it is
2305	* visible to the system.
2306	*/
2307
2308	p->start_time = ktime_get_ns();
2309	p->start_boottime = ktime_get_boottime_ns();
2310
2311	/*
2312	* Make it visible to the rest of the system, but dont wake it up yet.
2313	* Need tasklist lock for parent etc handling!
2314	*/
2315	write_lock_irq(&tasklist_lock);
2316
2317	/ CLONE_PARENT re-uses the old parent /
2318	if (clone_flags & (CLONE_PARENT\|CLONE_THREAD)) {
2319	p->real_parent = current->real_parent;
2320	p->parent_exec_id = current->parent_exec_id;
2321	if (clone_flags & CLONE_THREAD)
2322	p->exit_signal = -`1`;
2323	else
2324	p->exit_signal = current->group_leader->exit_signal;
2325	} else {
2326	p->real_parent = current;
2327	p->parent_exec_id = current->self_exec_id;
2328	p->exit_signal = args->exit_signal;
2329	}
2330
2331	klp_copy_process(child: p);
2332
2333	sched_core_fork(p);
2334
2335	spin_lock(lock: &current->sighand->siglock);
2336
2337	rv_task_fork(p);
2338
2339	rseq_fork(t: p, clone_flags);
2340
2341	/ Don't start children in a dying pid namespace /
2342	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
2343	retval = -ENOMEM;
2344	goto bad_fork_core_free;
2345	}
2346
2347	/ Let kill terminate clone/fork in the middle /
2348	if (fatal_signal_pending(current)) {
2349	retval = -EINTR;
2350	goto bad_fork_core_free;
2351	}
2352
2353	/ No more failure paths after this point. /
2354
2355	/*
2356	* Copy seccomp details explicitly here, in case they were changed
2357	* before holding sighand lock.
2358	*/
2359	copy_seccomp(p);
2360
2361	init_task_pid_links(task: p);
2362	if (likely(p->pid)) {
2363	ptrace_init_task(child: p, ptrace: (clone_flags & CLONE_PTRACE) \|\| trace);
2364
2365	init_task_pid(task: p, type: PIDTYPE_PID, pid);
2366	if (thread_group_leader(p)) {
2367	init_task_pid(task: p, type: PIDTYPE_TGID, pid);
2368	init_task_pid(task: p, type: PIDTYPE_PGID, pid: task_pgrp(current));
2369	init_task_pid(task: p, type: PIDTYPE_SID, pid: task_session(current));
2370
2371	if (is_child_reaper(pid)) {
2372	ns_of_pid(pid)->child_reaper = p;
2373	p->signal->flags \|= SIGNAL_UNKILLABLE;
2374	}
2375	p->signal->shared_pending.signal = delayed.signal;
2376	p->signal->tty = tty_kref_get(current->signal->tty);
2377	/*
2378	* Inherit has_child_subreaper flag under the same
2379	* tasklist_lock with adding child to the process tree
2380	* for propagate_has_child_subreaper optimization.
2381	*/
2382	p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper \|\|
2383	p->real_parent->signal->is_child_subreaper;
2384	list_add_tail(new: &p->sibling, head: &p->real_parent->children);
2385	list_add_tail_rcu(new: &p->tasks, head: &init_task.tasks);
2386	attach_pid(task: p, PIDTYPE_TGID);
2387	attach_pid(task: p, PIDTYPE_PGID);
2388	attach_pid(task: p, PIDTYPE_SID);
2389	__this_cpu_inc(process_counts);
2390	} else {
2391	current->signal->nr_threads++;
2392	current->signal->quick_threads++;
2393	atomic_inc(v: &current->signal->live);
2394	refcount_inc(r: &current->signal->sigcnt);
2395	task_join_group_stop(task: p);
2396	list_add_tail_rcu(new: &p->thread_node,
2397	head: &p->signal->thread_head);
2398	}
2399	attach_pid(task: p, PIDTYPE_PID);
2400	nr_threads++;
2401	}
2402	total_forks++;
2403	hlist_del_init(n: &delayed.node);
2404	spin_unlock(lock: &current->sighand->siglock);
2405	syscall_tracepoint_update(p);
2406	write_unlock_irq(&tasklist_lock);
2407
2408	if (pidfile)
2409	fd_install(fd: pidfd, file: pidfile);
2410
2411	proc_fork_connector(task: p);
2412	sched_post_fork(p);
2413	cgroup_post_fork(p, kargs: args);
2414	perf_event_fork(tsk: p);
2415
2416	trace_task_newtask(task: p, clone_flags);
2417	uprobe_copy_process(t: p, flags: clone_flags);
2418	user_events_fork(t: p, clone_flags);
2419
2420	copy_oom_score_adj(clone_flags, tsk: p);
2421
2422	return p;
2423
2424	bad_fork_core_free:
2425	sched_core_free(tsk: p);
2426	spin_unlock(lock: &current->sighand->siglock);
2427	write_unlock_irq(&tasklist_lock);
2428	bad_fork_cancel_cgroup:
2429	cgroup_cancel_fork(p, kargs: args);
2430	bad_fork_put_pidfd:
2431	if (clone_flags & CLONE_PIDFD) {
2432	fput(pidfile);
2433	put_unused_fd(fd: pidfd);
2434	}
2435	bad_fork_free_pid:
2436	if (pid != &init_struct_pid)
2437	free_pid(pid);
2438	bad_fork_cleanup_thread:
2439	exit_thread(tsk: p);
2440	bad_fork_cleanup_io:
2441	if (p->io_context)
2442	exit_io_context(task: p);
2443	bad_fork_cleanup_namespaces:
2444	exit_task_namespaces(tsk: p);
2445	bad_fork_cleanup_mm:
2446	if (p->mm) {
2447	mm_clear_owner(mm: p->mm, p);
2448	mmput(p->mm);
2449	}
2450	bad_fork_cleanup_signal:
2451	if (!(clone_flags & CLONE_THREAD))
2452	free_signal_struct(sig: p->signal);
2453	bad_fork_cleanup_sighand:
2454	__cleanup_sighand(sighand: p->sighand);
2455	bad_fork_cleanup_fs:
2456	exit_fs(p); / blocking /
2457	bad_fork_cleanup_files:
2458	exit_files(p); / blocking /
2459	bad_fork_cleanup_semundo:
2460	exit_sem(tsk: p);
2461	bad_fork_cleanup_security:
2462	security_task_free(task: p);
2463	bad_fork_cleanup_audit:
2464	audit_free(task: p);
2465	bad_fork_cleanup_perf:
2466	perf_event_free_task(task: p);
2467	bad_fork_sched_cancel_fork:
2468	sched_cancel_fork(p);
2469	bad_fork_cleanup_policy:
2470	lockdep_free_task(task: p);
2471	#ifdef CONFIG_NUMA
2472	mpol_put(pol: p->mempolicy);
2473	#endif
2474	bad_fork_cleanup_delayacct:
2475	delayacct_tsk_free(tsk: p);
2476	bad_fork_cleanup_count:
2477	dec_rlimit_ucounts(task_ucounts(p), type: UCOUNT_RLIMIT_NPROC, v: `1`);
2478	exit_creds(p);
2479	bad_fork_free:
2480	WRITE_ONCE(p->__state, TASK_DEAD);
2481	exit_task_stack_account(tsk: p);
2482	put_task_stack(tsk: p);
2483	delayed_free_task(tsk: p);
2484	fork_out:
2485	spin_lock_irq(lock: &current->sighand->siglock);
2486	hlist_del_init(n: &delayed.node);
2487	spin_unlock_irq(lock: &current->sighand->siglock);
2488	return ERR_PTR(error: retval);
2489	}
2490
2491	static inline void init_idle_pids(struct task_struct *idle)
2492	{
2493	enum pid_type type;
2494
2495	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2496	INIT_HLIST_NODE(h: &idle->pid_links[type]); / not really needed /
2497	init_task_pid(task: idle, type, pid: &init_struct_pid);
2498	}
2499	}
2500
/*
 * Placeholder thread function handed to copy_process() by fork_idle().
 * It ignores @dummy and always returns 0.
 */
static int idle_dummy(void *dummy)
{
	/* This function is never called */
	return 0;
}
2506
2507	struct task_struct * __init fork_idle(int cpu)
2508	{
2509	struct task_struct *task;
2510	struct kernel_clone_args args = {
2511	.flags = CLONE_VM,
2512	.fn = &idle_dummy,
2513	.fn_arg = NULL,
2514	.kthread = `1`,
2515	.idle = `1`,
2516	};
2517
2518	task = copy_process(pid: &init_struct_pid, trace: `0`, cpu_to_node(cpu), args: &args);
2519	if (!IS_ERR(ptr: task)) {
2520	init_idle_pids(idle: task);
2521	init_idle(idle: task, cpu);
2522	}
2523
2524	return task;
2525	}
2526
2527	/*
2528	* This is like kernel_clone(), but shaved down and tailored to just
2529	* creating io_uring workers. It returns a created task, or an error pointer.
2530	* The returned task is inactive, and the caller must fire it up through
2531	* wake_up_new_task(p). All signals are blocked in the created task.
2532	*/
2533	struct task_struct create_io_thread(int* (fn)(void* ), void* arg, int* node)
2534	{
2535	unsigned long flags = CLONE_FS\|CLONE_FILES\|CLONE_SIGHAND\|CLONE_THREAD\|
2536	CLONE_IO;
2537	struct kernel_clone_args args = {
2538	.flags = ((lower_32_bits(flags) \| CLONE_VM \|
2539	CLONE_UNTRACED) & ~CSIGNAL),
2540	.exit_signal = (lower_32_bits(flags) & CSIGNAL),
2541	.fn = fn,
2542	.fn_arg = arg,
2543	.io_thread = `1`,
2544	.user_worker = `1`,
2545	};
2546
2547	return copy_process(NULL, trace: `0`, node, args: &args);
2548	}
2549
2550	/*
2551	* Ok, this is the main fork-routine.
2552	*
2553	* It copies the process, and if successful kick-starts
2554	* it and waits for it to finish using the VM if required.
2555	*
2556	* args->exit_signal is expected to be checked for sanity by the caller.
2557	*/
2558	pid_t kernel_clone(struct kernel_clone_args *args)
2559	{
2560	u64 clone_flags = args->flags;
2561	struct completion vfork;
2562	struct pid *pid;
2563	struct task_struct *p;
2564	int trace = `0`;
2565	pid_t nr;
2566
2567	/*
2568	* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
2569	* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
2570	* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
2571	* field in struct clone_args and it still doesn't make sense to have
2572	* them both point at the same memory location. Performing this check
2573	* here has the advantage that we don't need to have a separate helper
2574	* to check for legacy clone().
2575	*/
2576	if ((clone_flags & CLONE_PIDFD) &&
2577	(clone_flags & CLONE_PARENT_SETTID) &&
2578	(args->pidfd == args->parent_tid))
2579	return -EINVAL;
2580
2581	/*
2582	* Determine whether and which event to report to ptracer. When
2583	* called from kernel_thread or CLONE_UNTRACED is explicitly
2584	* requested, no event is reported; otherwise, report if the event
2585	* for the type of forking is enabled.
2586	*/
2587	if (!(clone_flags & CLONE_UNTRACED)) {
2588	if (clone_flags & CLONE_VFORK)
2589	trace = PTRACE_EVENT_VFORK;
2590	else if (args->exit_signal != SIGCHLD)
2591	trace = PTRACE_EVENT_CLONE;
2592	else
2593	trace = PTRACE_EVENT_FORK;
2594
2595	if (likely(!ptrace_event_enabled(current, trace)))
2596	trace = `0`;
2597	}
2598
2599	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2600	add_latent_entropy();
2601
2602	if (IS_ERR(ptr: p))
2603	return PTR_ERR(ptr: p);
2604
2605	/*
2606	* Do this prior waking up the new thread - the thread pointer
2607	* might get invalid after that point, if the thread exits quickly.
2608	*/
2609	trace_sched_process_fork(current, child: p);
2610
2611	pid = get_task_pid(task: p, type: PIDTYPE_PID);
2612	nr = pid_vnr(pid);
2613
2614	if (clone_flags & CLONE_PARENT_SETTID)
2615	put_user(nr, args->parent_tid);
2616
2617	if (clone_flags & CLONE_VFORK) {
2618	p->vfork_done = &vfork;
2619	init_completion(x: &vfork);
2620	get_task_struct(t: p);
2621	}
2622
2623	if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
2624	/ lock the task to synchronize with memcg migration /
2625	task_lock(p);
2626	lru_gen_add_mm(mm: p->mm);
2627	task_unlock(p);
2628	}
2629
2630	wake_up_new_task(tsk: p);
2631
2632	/ forking complete and child started to run, tell ptracer /
2633	if (unlikely(trace))
2634	ptrace_event_pid(event: trace, pid);
2635
2636	if (clone_flags & CLONE_VFORK) {
2637	if (!wait_for_vfork_done(child: p, vfork: &vfork))
2638	ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
2639	}
2640
2641	put_pid(pid);
2642	return nr;
2643	}
2644
2645	/*
2646	* Create a kernel thread.
2647	*/
2648	pid_t kernel_thread(int (fn)(void* ), void* arg, const* char *name,
2649	unsigned long flags)
2650	{
2651	struct kernel_clone_args args = {
2652	.flags = ((lower_32_bits(flags) \| CLONE_VM \|
2653	CLONE_UNTRACED) & ~CSIGNAL),
2654	.exit_signal = (lower_32_bits(flags) & CSIGNAL),
2655	.fn = fn,
2656	.fn_arg = arg,
2657	.name = name,
2658	.kthread = `1`,
2659	};
2660
2661	return kernel_clone(args: &args);
2662	}
2663
2664	/*
2665	* Create a user mode thread.
2666	*/
2667	pid_t user_mode_thread(int (fn)(void* ), void* arg, unsigned* long flags)
2668	{
2669	struct kernel_clone_args args = {
2670	.flags = ((lower_32_bits(flags) \| CLONE_VM \|
2671	CLONE_UNTRACED) & ~CSIGNAL),
2672	.exit_signal = (lower_32_bits(flags) & CSIGNAL),
2673	.fn = fn,
2674	.fn_arg = arg,
2675	};
2676
2677	return kernel_clone(args: &args);
2678	}
2679
#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork(2): kernel_clone() with no clone flags and SIGCHLD as the exit
 * signal. Without CONFIG_MMU the call always fails with -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	struct kernel_clone_args args = {
		.exit_signal = SIGCHLD,
	};

	return kernel_clone(&args);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}
#endif
2695
#ifdef __ARCH_WANT_SYS_VFORK
/*
 * vfork(2): kernel_clone() with CLONE_VFORK | CLONE_VM and SIGCHLD as the
 * exit signal.
 */
SYSCALL_DEFINE0(vfork)
{
	struct kernel_clone_args args = {
		.flags		= CLONE_VFORK | CLONE_VM,
		.exit_signal	= SIGCHLD,
	};

	return kernel_clone(&args);
}
#endif
2707
#ifdef __ARCH_WANT_SYS_CLONE
/*
 * Legacy clone(2). The argument order depends on which CONFIG_CLONE_BACKWARDS*
 * option is set; the body is shared by all variants.
 */
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 unsigned long, tls,
		 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
		int, stack_size,
		int __user *, parent_tidptr,
		int __user *, child_tidptr,
		unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#endif
{
	struct kernel_clone_args args = {
		/* The CSIGNAL bits of clone_flags carry the exit signal. */
		.flags		= (lower_32_bits(clone_flags) & ~CSIGNAL),
		/* .pidfd and .parent_tid both use parent_tidptr here. */
		.pidfd		= parent_tidptr,
		.child_tid	= child_tidptr,
		.parent_tid	= parent_tidptr,
		.exit_signal	= (lower_32_bits(clone_flags) & CSIGNAL),
		.stack		= newsp,
		.tls		= tls,
	};

	return kernel_clone(&args);
}
#endif
2745
2746	noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2747	struct clone_args __user *uargs,
2748	size_t usize)
2749	{
2750	int err;
2751	struct clone_args args;
2752	pid_t *kset_tid = kargs->set_tid;
2753
2754	BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
2755	CLONE_ARGS_SIZE_VER0);
2756	BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
2757	CLONE_ARGS_SIZE_VER1);
2758	BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
2759	CLONE_ARGS_SIZE_VER2);
2760	BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
2761
2762	if (unlikely(usize > PAGE_SIZE))
2763	return -E2BIG;
2764	if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
2765	return -EINVAL;
2766
2767	err = copy_struct_from_user(dst: &args, ksize: sizeof(args), src: uargs, usize);
2768	if (err)
2769	return err;
2770
2771	if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2772	return -EINVAL;
2773
2774	if (unlikely(!args.set_tid && args.set_tid_size > `0`))
2775	return -EINVAL;
2776
2777	if (unlikely(args.set_tid && args.set_tid_size == `0`))
2778	return -EINVAL;
2779
2780	/*
2781	* Verify that higher 32bits of exit_signal are unset and that
2782	* it is a valid signal
2783	*/
2784	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) \|\|
2785	!valid_signal(args.exit_signal)))
2786	return -EINVAL;
2787
2788	if ((args.flags & CLONE_INTO_CGROUP) &&
2789	(args.cgroup > INT_MAX \|\| usize < CLONE_ARGS_SIZE_VER2))
2790	return -EINVAL;
2791
2792	kargs = (struct* kernel_clone_args){
2793	.flags = args.flags,
2794	.pidfd = u64_to_user_ptr(args.pidfd),
2795	.child_tid = u64_to_user_ptr(args.child_tid),
2796	.parent_tid = u64_to_user_ptr(args.parent_tid),
2797	.exit_signal = args.exit_signal,
2798	.stack = args.stack,
2799	.stack_size = args.stack_size,
2800	.tls = args.tls,
2801	.set_tid_size = args.set_tid_size,
2802	.cgroup = args.cgroup,
2803	};
2804
2805	if (args.set_tid &&
2806	copy_from_user(to: kset_tid, u64_to_user_ptr(args.set_tid),
2807	n: (kargs->set_tid_size * sizeof(pid_t))))
2808	return -EFAULT;
2809
2810	kargs->set_tid = kset_tid;
2811
2812	return `0`;
2813	}
2814
2815	/**
2816	* clone3_stack_valid - check and prepare stack
2817	* @kargs: kernel clone args
2818	*
2819	* Verify that the stack arguments userspace gave us are sane.
2820	* In addition, set the stack direction for userspace since it's easy for us to
2821	* determine.
2822	*/
2823	static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
2824	{
2825	if (kargs->stack == `0`) {
2826	if (kargs->stack_size > `0`)
2827	return false;
2828	} else {
2829	if (kargs->stack_size == `0`)
2830	return false;
2831
2832	if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
2833	return false;
2834
2835	#if !defined(CONFIG_STACK_GROWSUP)
2836	kargs->stack += kargs->stack_size;
2837	#endif
2838	}
2839
2840	return true;
2841	}
2842
2843	static bool clone3_args_valid(struct kernel_clone_args *kargs)
2844	{
2845	/ Verify that no unknown flags are passed along. /
2846	if (kargs->flags &
2847	~(CLONE_LEGACY_FLAGS \| CLONE_CLEAR_SIGHAND \| CLONE_INTO_CGROUP))
2848	return false;
2849
2850	/*
2851	* - make the CLONE_DETACHED bit reusable for clone3
2852	* - make the CSIGNAL bits reusable for clone3
2853	*/
2854	if (kargs->flags & (CLONE_DETACHED \| (CSIGNAL & (~CLONE_NEWTIME))))
2855	return false;
2856
2857	if ((kargs->flags & (CLONE_SIGHAND \| CLONE_CLEAR_SIGHAND)) ==
2858	(CLONE_SIGHAND \| CLONE_CLEAR_SIGHAND))
2859	return false;
2860
2861	if ((kargs->flags & (CLONE_THREAD \| CLONE_PARENT)) &&
2862	kargs->exit_signal)
2863	return false;
2864
2865	if (!clone3_stack_valid(kargs))
2866	return false;
2867
2868	return true;
2869	}
2870
2871	/**
2872	* sys_clone3 - create a new process with specific properties
2873	* @uargs: argument structure
2874	* @size: size of @uargs
2875	*
2876	* clone3() is the extensible successor to clone()/clone2().
2877	* It takes a struct as argument that is versioned by its size.
2878	*
2879	* Return: On success, a positive PID for the child process.
2880	* On error, a negative errno number.
2881	*/
2882	SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2883	{
2884	int err;
2885
2886	struct kernel_clone_args kargs;
2887	pid_t set_tid[MAX_PID_NS_LEVEL];
2888
2889	#ifdef __ARCH_BROKEN_SYS_CLONE3
2890	#warning clone3() entry point is missing, please fix
2891	return -ENOSYS;
2892	#endif
2893
2894	kargs.set_tid = set_tid;
2895
2896	err = copy_clone_args_from_user(kargs: &kargs, uargs, usize: size);
2897	if (err)
2898	return err;
2899
2900	if (!clone3_args_valid(kargs: &kargs))
2901	return -EINVAL;
2902
2903	return kernel_clone(args: &kargs);
2904	}
2905
2906	void walk_process_tree(struct task_struct top, proc_visitor visitor, void* *data)
2907	{
2908	struct task_struct leader, parent, *child;
2909	int res;
2910
2911	read_lock(&tasklist_lock);
2912	leader = top = top->group_leader;
2913	down:
2914	for_each_thread(leader, parent) {
2915	list_for_each_entry(child, &parent->children, sibling) {
2916	res = visitor(child, data);
2917	if (res) {
2918	if (res < `0`)
2919	goto out;
2920	leader = child;
2921	goto down;
2922	}
2923	up:
2924	;
2925	}
2926	}
2927
2928	if (leader != top) {
2929	child = leader;
2930	parent = child->real_parent;
2931	leader = parent->group_leader;
2932	goto up;
2933	}
2934	out:
2935	read_unlock(&tasklist_lock);
2936	}
2937
/*
 * Fallback value used when ARCH_MIN_MMSTRUCT_ALIGN has not been defined
 * elsewhere; mm_cache_init() passes it as the alignment argument when
 * creating the "mm_struct" cache.
 */
2938	#ifndef ARCH_MIN_MMSTRUCT_ALIGN
2939	#define ARCH_MIN_MMSTRUCT_ALIGN 0
2940	#endif
2941
2942	static void sighand_ctor(void *data)
2943	{
2944	struct sighand_struct *sighand = data;
2945
2946	spin_lock_init(&sighand->siglock);
2947	init_waitqueue_head(&sighand->signalfd_wqh);
2948	}
2949
2950	void __init mm_cache_init(void)
2951	{
2952	unsigned int mm_size;
2953
2954	/*
2955	* The mm_cpumask is located at the end of mm_struct, and is
2956	* dynamically sized based on the maximum CPU number this system
2957	* can have, taking hotplug into account (nr_cpu_ids).
2958	*/
2959	mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
2960
2961	mm_cachep = kmem_cache_create_usercopy(name: "mm_struct",
2962	size: mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
2963	SLAB_HWCACHE_ALIGN\|SLAB_PANIC\|SLAB_ACCOUNT,
2964	offsetof(struct mm_struct, saved_auxv),
2965	sizeof_field(struct mm_struct, saved_auxv),
2966	NULL);
2967	}
2968
2969	void __init proc_caches_init(void)
2970	{
2971	sighand_cachep = kmem_cache_create("sighand_cache",
2972	sizeof(struct sighand_struct), `0`,
2973	SLAB_HWCACHE_ALIGN\|SLAB_PANIC\|SLAB_TYPESAFE_BY_RCU\|
2974	SLAB_ACCOUNT, sighand_ctor);
2975	signal_cachep = kmem_cache_create("signal_cache",
2976	sizeof(struct signal_struct), `0`,
2977	SLAB_HWCACHE_ALIGN\|SLAB_PANIC\|SLAB_ACCOUNT,
2978	NULL);
2979	files_cachep = kmem_cache_create("files_cache",
2980	sizeof(struct files_struct), `0`,
2981	SLAB_HWCACHE_ALIGN\|SLAB_PANIC\|SLAB_ACCOUNT,
2982	NULL);
2983	fs_cachep = kmem_cache_create("fs_cache",
2984	sizeof(struct fs_struct), `0`,
2985	SLAB_HWCACHE_ALIGN\|SLAB_PANIC\|SLAB_ACCOUNT,
2986	NULL);
2987	mmap_init();
2988	nsproxy_cache_init();
2989	}
2990
2991	/*
2992	* Check constraints on flags passed to the unshare system call.
2993	*/
2994	static int check_unshare_flags(unsigned long unshare_flags)
2995	{
2996	if (unshare_flags & ~(CLONE_THREAD\|CLONE_FS\|CLONE_NEWNS\|CLONE_SIGHAND\|
2997	CLONE_VM\|CLONE_FILES\|CLONE_SYSVSEM\|
2998	CLONE_NEWUTS\|CLONE_NEWIPC\|CLONE_NEWNET\|
2999	CLONE_NEWUSER\|CLONE_NEWPID\|CLONE_NEWCGROUP\|
3000	CLONE_NEWTIME))
3001	return -EINVAL;
3002	/*
3003	* Not implemented, but pretend it works if there is nothing
3004	* to unshare. Note that unsharing the address space or the
3005	* signal handlers also need to unshare the signal queues (aka
3006	* CLONE_THREAD).
3007	*/
3008	if (unshare_flags & (CLONE_THREAD \| CLONE_SIGHAND \| CLONE_VM)) {
3009	if (!thread_group_empty(current))
3010	return -EINVAL;
3011	}
3012	if (unshare_flags & (CLONE_SIGHAND \| CLONE_VM)) {
3013	if (refcount_read(r: &current->sighand->count) > `1`)
3014	return -EINVAL;
3015	}
3016	if (unshare_flags & CLONE_VM) {
3017	if (!current_is_single_threaded())
3018	return -EINVAL;
3019	}
3020
3021	return `0`;
3022	}
3023
3024	/*
3025	* Unshare the filesystem structure if it is being shared
3026	*/
3027	static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
3028	{
3029	struct fs_struct *fs = current->fs;
3030
3031	if (!(unshare_flags & CLONE_FS) \|\| !fs)
3032	return `0`;
3033
3034	/ don't need lock here; in the worst case we'll do useless copy /
3035	if (fs->users == `1`)
3036	return `0`;
3037
3038	*new_fsp = copy_fs_struct(fs);
3039	if (!*new_fsp)
3040	return -ENOMEM;
3041
3042	return `0`;
3043	}
3044
3045	/*
3046	* Unshare file descriptor table if it is being shared
3047	*/
3048	static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
3049	{
3050	struct files_struct *fd = current->files;
3051
3052	if ((unshare_flags & CLONE_FILES) &&
3053	(fd && atomic_read(v: &fd->count) > `1`)) {
3054	fd = dup_fd(fd, NULL);
3055	if (IS_ERR(ptr: fd))
3056	return PTR_ERR(ptr: fd);
3057	*new_fdp = fd;
3058	}
3059
3060	return `0`;
3061	}
3062
3063	/*
3064	* unshare allows a process to 'unshare' part of the process
3065	* context which was originally shared using clone. copy_*
3066	* functions used by kernel_clone() cannot be used here directly
3067	* because they modify an inactive task_struct that is being
3068	* constructed. Here we are modifying the current, active,
3069	* task_struct.
3070	*/
3071	int ksys_unshare(unsigned long unshare_flags)
3072	{
3073	struct fs_struct fs, new_fs = NULL;
3074	struct files_struct *new_fd = NULL;
3075	struct cred *new_cred = NULL;
3076	struct nsproxy *new_nsproxy = NULL;
3077	int do_sysvsem = `0`;
3078	int err;
3079
3080	/*
3081	* If unsharing a user namespace must also unshare the thread group
3082	* and unshare the filesystem root and working directories.
3083	*/
3084	if (unshare_flags & CLONE_NEWUSER)
3085	unshare_flags \|= CLONE_THREAD \| CLONE_FS;
3086	/*
3087	* If unsharing vm, must also unshare signal handlers.
3088	*/
3089	if (unshare_flags & CLONE_VM)
3090	unshare_flags \|= CLONE_SIGHAND;
3091	/*
3092	* If unsharing a signal handlers, must also unshare the signal queues.
3093	*/
3094	if (unshare_flags & CLONE_SIGHAND)
3095	unshare_flags \|= CLONE_THREAD;
3096	/*
3097	* If unsharing namespace, must also unshare filesystem information.
3098	*/
3099	if (unshare_flags & CLONE_NEWNS)
3100	unshare_flags \|= CLONE_FS;
3101
3102	err = check_unshare_flags(unshare_flags);
3103	if (err)
3104	goto bad_unshare_out;
3105	/*
3106	* CLONE_NEWIPC must also detach from the undolist: after switching
3107	* to a new ipc namespace, the semaphore arrays from the old
3108	* namespace are unreachable.
3109	*/
3110	if (unshare_flags & (CLONE_NEWIPC\|CLONE_SYSVSEM))
3111	do_sysvsem = `1`;
3112	err = unshare_fs(unshare_flags, new_fsp: &new_fs);
3113	if (err)
3114	goto bad_unshare_out;
3115	err = unshare_fd(unshare_flags, new_fdp: &new_fd);
3116	if (err)
3117	goto bad_unshare_cleanup_fs;
3118	err = unshare_userns(unshare_flags, new_cred: &new_cred);
3119	if (err)
3120	goto bad_unshare_cleanup_fd;
3121	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
3122	new_cred, new_fs);
3123	if (err)
3124	goto bad_unshare_cleanup_cred;
3125
3126	if (new_cred) {
3127	err = set_cred_ucounts(new_cred);
3128	if (err)
3129	goto bad_unshare_cleanup_cred;
3130	}
3131
3132	if (new_fs \|\| new_fd \|\| do_sysvsem \|\| new_cred \|\| new_nsproxy) {
3133	if (do_sysvsem) {
3134	/*
3135	* CLONE_SYSVSEM is equivalent to sys_exit().
3136	*/
3137	exit_sem(current);
3138	}
3139	if (unshare_flags & CLONE_NEWIPC) {
3140	/ Orphan segments in old ns (see sem above). /
3141	exit_shm(current);
3142	shm_init_task(current);
3143	}
3144
3145	if (new_nsproxy)
3146	switch_task_namespaces(current, new: new_nsproxy);
3147
3148	task_lock(current);
3149
3150	if (new_fs) {
3151	fs = current->fs;
3152	spin_lock(lock: &fs->lock);
3153	current->fs = new_fs;
3154	if (--fs->users)
3155	new_fs = NULL;
3156	else
3157	new_fs = fs;
3158	spin_unlock(lock: &fs->lock);
3159	}
3160
3161	if (new_fd)
3162	swap(current->files, new_fd);
3163
3164	task_unlock(current);
3165
3166	if (new_cred) {
3167	/ Install the new user namespace /
3168	commit_creds(new_cred);
3169	new_cred = NULL;
3170	}
3171	}
3172
3173	perf_event_namespaces(current);
3174
3175	bad_unshare_cleanup_cred:
3176	if (new_cred)
3177	put_cred(cred: new_cred);
3178	bad_unshare_cleanup_fd:
3179	if (new_fd)
3180	put_files_struct(fs: new_fd);
3181
3182	bad_unshare_cleanup_fs:
3183	if (new_fs)
3184	free_fs_struct(new_fs);
3185
3186	bad_unshare_out:
3187	return err;
3188	}
3189
3190	SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
3191	{
3192	return ksys_unshare(unshare_flags);
3193	}
3194
3195	/*
3196	* Helper to unshare the files of the current task.
3197	* We don't want to expose copy_files internals to
3198	* the exec layer of the kernel.
3199	*/
3200
3201	int unshare_files(void)
3202	{
3203	struct task_struct *task = current;
3204	struct files_struct old, copy = NULL;
3205	int error;
3206
3207	error = unshare_fd(CLONE_FILES, new_fdp: &copy);
3208	if (error \|\| !copy)
3209	return error;
3210
3211	old = task->files;
3212	task_lock(p: task);
3213	task->files = copy;
3214	task_unlock(p: task);
3215	put_files_struct(fs: old);
3216	return `0`;
3217	}
3218
3219	int sysctl_max_threads(const struct ctl_table table, int* write,
3220	void buffer, size_t lenp, loff_t *ppos)
3221	{
3222	struct ctl_table t;
3223	int ret;
3224	int threads = max_threads;
3225	int min = `1`;
3226	int max = MAX_THREADS;
3227
3228	t = *table;
3229	t.data = &threads;
3230	t.extra1 = &min;
3231	t.extra2 = &max;
3232
3233	ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3234	if (ret \|\| !write)
3235	return ret;
3236
3237	max_threads = threads;
3238
3239	return `0`;
3240	}
3241

source code of linux/kernel/fork.c