exec.c source code [linux/fs/exec.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/exec.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	/*
9	* #!-checking implemented by tytso.
10	*/
11	/*
12	* Demand-loading implemented 01.12.91 - no need to read anything but
13	* the header into memory. The inode of the executable is put into
14	* "current->executable", and page faults do the actual loading. Clean.
15	*
16	* Once more I can proudly say that linux stood up to being changed: it
17	* was less than 2 hours work to get demand-loading completely implemented.
18	*
19	* Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
20	* current->executable is only used by the procfs. This allows a dispatch
21	* table to check for several different types of binary formats. We keep
22	* trying until we recognize the file or we run out of supported binary
23	* formats.
24	*/
25
26	#include <linux/kernel_read_file.h>
27	#include <linux/slab.h>
28	#include <linux/file.h>
29	#include <linux/fdtable.h>
30	#include <linux/mm.h>
31	#include <linux/stat.h>
32	#include <linux/fcntl.h>
33	#include <linux/swap.h>
34	#include <linux/string.h>
35	#include <linux/init.h>
36	#include <linux/sched/mm.h>
37	#include <linux/sched/coredump.h>
38	#include <linux/sched/signal.h>
39	#include <linux/sched/numa_balancing.h>
40	#include <linux/sched/task.h>
41	#include <linux/pagemap.h>
42	#include <linux/perf_event.h>
43	#include <linux/highmem.h>
44	#include <linux/spinlock.h>
45	#include <linux/key.h>
46	#include <linux/personality.h>
47	#include <linux/binfmts.h>
48	#include <linux/utsname.h>
49	#include <linux/pid_namespace.h>
50	#include <linux/module.h>
51	#include <linux/namei.h>
52	#include <linux/mount.h>
53	#include <linux/security.h>
54	#include <linux/syscalls.h>
55	#include <linux/tsacct_kern.h>
56	#include <linux/cn_proc.h>
57	#include <linux/audit.h>
58	#include <linux/kmod.h>
59	#include <linux/fsnotify.h>
60	#include <linux/fs_struct.h>
61	#include <linux/oom.h>
62	#include <linux/compat.h>
63	#include <linux/vmalloc.h>
64	#include <linux/io_uring.h>
65	#include <linux/syscall_user_dispatch.h>
66	#include <linux/coredump.h>
67	#include <linux/time_namespace.h>
68	#include <linux/user_events.h>
69	#include <linux/rseq.h>
70	#include <linux/ksm.h>
71
72	#include <linux/uaccess.h>
73	#include <asm/mmu_context.h>
74	#include <asm/tlb.h>
75
76	#include <trace/events/task.h>
77	#include "internal.h"
78
79	#include <trace/events/sched.h>
80
81	/ For vma exec functions. /
82	#include "../mm/internal.h"
83
84	static int bprm_creds_from_file(struct linux_binprm *bprm);
85
86	int suid_dumpable = `0`;
87
88	static LIST_HEAD(formats);
89	static DEFINE_RWLOCK(binfmt_lock);
90
91	void __register_binfmt(struct linux_binfmt * fmt, int insert)
92	{
93	write_lock(&binfmt_lock);
94	insert ? list_add(new: &fmt->lh, head: &formats) :
95	list_add_tail(new: &fmt->lh, head: &formats);
96	write_unlock(&binfmt_lock);
97	}
98
99	EXPORT_SYMBOL(__register_binfmt);
100
101	void unregister_binfmt(struct linux_binfmt * fmt)
102	{
103	write_lock(&binfmt_lock);
104	list_del(entry: &fmt->lh);
105	write_unlock(&binfmt_lock);
106	}
107
108	EXPORT_SYMBOL(unregister_binfmt);
109
110	static inline void put_binfmt(struct linux_binfmt * fmt)
111	{
112	module_put(module: fmt->module);
113	}
114
115	bool path_noexec(const struct path *path)
116	{
117	return (path->mnt->mnt_flags & MNT_NOEXEC) \|\|
118	(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
119	}
120
121	#ifdef CONFIG_MMU
122	/*
123	* The nascent bprm->mm is not visible until exec_mmap() but it can
124	* use a lot of memory, account these pages in current->mm temporary
125	* for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
126	* change the counter back via acct_arg_size(0).
127	*/
128	static void acct_arg_size(struct linux_binprm bprm, unsigned* long pages)
129	{
130	struct mm_struct *mm = current->mm;
131	long diff = (long)(pages - bprm->vma_pages);
132
133	if (!mm \|\| !diff)
134	return;
135
136	bprm->vma_pages = pages;
137	add_mm_counter(mm, member: MM_ANONPAGES, value: diff);
138	}
139
140	static struct page get_arg_page(struct* linux_binprm bprm, unsigned* long pos,
141	int write)
142	{
143	struct page *page;
144	struct vm_area_struct *vma = bprm->vma;
145	struct mm_struct *mm = bprm->mm;
146	int ret;
147
148	/*
149	* Avoid relying on expanding the stack down in GUP (which
150	* does not work for STACK_GROWSUP anyway), and just do it
151	* ahead of time.
152	*/
153	if (!mmap_read_lock_maybe_expand(mm, vma, addr: pos, write))
154	return NULL;
155
156	/*
157	* We are doing an exec(). 'current' is the process
158	* doing the exec and 'mm' is the new process's mm.
159	*/
160	ret = get_user_pages_remote(mm, start: pos, nr_pages: `1`,
161	gup_flags: write ? FOLL_WRITE : `0`,
162	pages: &page, NULL);
163	mmap_read_unlock(mm);
164	if (ret <= `0`)
165	return NULL;
166
167	if (write)
168	acct_arg_size(bprm, pages: vma_pages(vma));
169
170	return page;
171	}
172
173	static void put_arg_page(struct page *page)
174	{
175	put_page(page);
176	}
177
178	static void free_arg_pages(struct linux_binprm *bprm)
179	{
180	}
181
182	static void flush_arg_page(struct linux_binprm bprm, unsigned* long pos,
183	struct page *page)
184	{
185	flush_cache_page(vma: bprm->vma, vmaddr: pos, page_to_pfn(page));
186	}
187
188	static bool valid_arg_len(struct linux_binprm bprm, long* len)
189	{
190	return len <= MAX_ARG_STRLEN;
191	}
192
193	#else
194
195	static inline void acct_arg_size(struct linux_binprm bprm, unsigned* long pages)
196	{
197	}
198
199	static struct page get_arg_page(struct* linux_binprm bprm, unsigned* long pos,
200	int write)
201	{
202	struct page *page;
203
204	page = bprm->page[pos / PAGE_SIZE];
205	if (!page && write) {
206	page = alloc_page(GFP_HIGHUSER\|__GFP_ZERO);
207	if (!page)
208	return NULL;
209	bprm->page[pos / PAGE_SIZE] = page;
210	}
211
212	return page;
213	}
214
215	static void put_arg_page(struct page *page)
216	{
217	}
218
219	static void free_arg_page(struct linux_binprm bprm, int* i)
220	{
221	if (bprm->page[i]) {
222	__free_page(bprm->page[i]);
223	bprm->page[i] = NULL;
224	}
225	}
226
227	static void free_arg_pages(struct linux_binprm *bprm)
228	{
229	int i;
230
231	for (i = `0`; i < MAX_ARG_PAGES; i++)
232	free_arg_page(bprm, i);
233	}
234
235	static void flush_arg_page(struct linux_binprm bprm, unsigned* long pos,
236	struct page *page)
237	{
238	}
239
240	static bool valid_arg_len(struct linux_binprm bprm, long* len)
241	{
242	return len <= bprm->p;
243	}
244
245	#endif /* CONFIG_MMU */
246
247	/*
248	* Create a new mm_struct and populate it with a temporary stack
249	* vm_area_struct. We don't have enough context at this point to set the stack
250	* flags, permissions, and offset, so we use temporary values. We'll update
251	* them later in setup_arg_pages().
252	*/
253	static int bprm_mm_init(struct linux_binprm *bprm)
254	{
255	int err;
256	struct mm_struct *mm = NULL;
257
258	bprm->mm = mm = mm_alloc();
259	err = -ENOMEM;
260	if (!mm)
261	goto err;
262
263	/ Save current stack limit for all calculations made during exec. /
264	task_lock(current->group_leader);
265	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
266	task_unlock(current->group_leader);
267
268	#ifndef CONFIG_MMU
269	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
270	#else
271	err = create_init_stack_vma(mm: bprm->mm, vmap: &bprm->vma, top_mem_p: &bprm->p);
272	if (err)
273	goto err;
274	#endif
275
276	return `0`;
277
278	err:
279	if (mm) {
280	bprm->mm = NULL;
281	mmdrop(mm);
282	}
283
284	return err;
285	}
286
287	struct user_arg_ptr {
288	#ifdef CONFIG_COMPAT
289	bool is_compat;
290	#endif
291	union {
292	const char __user *const __user *native;
293	#ifdef CONFIG_COMPAT
294	const compat_uptr_t __user *compat;
295	#endif
296	} ptr;
297	};
298
299	static const char __user get_user_arg_ptr(struct* user_arg_ptr argv, int nr)
300	{
301	const char __user *native;
302
303	#ifdef CONFIG_COMPAT
304	if (unlikely(argv.is_compat)) {
305	compat_uptr_t compat;
306
307	if (get_user(compat, argv.ptr.compat + nr))
308	return ERR_PTR(error: -EFAULT);
309
310	return compat_ptr(uptr: compat);
311	}
312	#endif
313
314	if (get_user(native, argv.ptr.native + nr))
315	return ERR_PTR(error: -EFAULT);
316
317	return native;
318	}
319
320	/*
321	* count() counts the number of strings in array ARGV.
322	*/
323	static int count(struct user_arg_ptr argv, int max)
324	{
325	int i = `0`;
326
327	if (argv.ptr.native != NULL) {
328	for (;;) {
329	const char __user *p = get_user_arg_ptr(argv, nr: i);
330
331	if (!p)
332	break;
333
334	if (IS_ERR(ptr: p))
335	return -EFAULT;
336
337	if (i >= max)
338	return -E2BIG;
339	++i;
340
341	if (fatal_signal_pending(current))
342	return -ERESTARTNOHAND;
343	cond_resched();
344	}
345	}
346	return i;
347	}
348
349	static int count_strings_kernel(const char *const *argv)
350	{
351	int i;
352
353	if (!argv)
354	return `0`;
355
356	for (i = `0`; argv[i]; ++i) {
357	if (i >= MAX_ARG_STRINGS)
358	return -E2BIG;
359	if (fatal_signal_pending(current))
360	return -ERESTARTNOHAND;
361	cond_resched();
362	}
363	return i;
364	}
365
366	static inline int bprm_set_stack_limit(struct linux_binprm *bprm,
367	unsigned long limit)
368	{
369	#ifdef CONFIG_MMU
370	/ Avoid a pathological bprm->p. /
371	if (bprm->p < limit)
372	return -E2BIG;
373	bprm->argmin = bprm->p - limit;
374	#endif
375	return `0`;
376	}
377	static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm)
378	{
379	#ifdef CONFIG_MMU
380	return bprm->p < bprm->argmin;
381	#else
382	return false;
383	#endif
384	}
385
386	/*
387	* Calculate bprm->argmin from:
388	* - _STK_LIM
389	* - ARG_MAX
390	* - bprm->rlim_stack.rlim_cur
391	* - bprm->argc
392	* - bprm->envc
393	* - bprm->p
394	*/
395	static int bprm_stack_limits(struct linux_binprm *bprm)
396	{
397	unsigned long limit, ptr_size;
398
399	/*
400	* Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
401	* (whichever is smaller) for the argv+env strings.
402	* This ensures that:
403	* - the remaining binfmt code will not run out of stack space,
404	* - the program will have a reasonable amount of stack left
405	* to work from.
406	*/
407	limit = _STK_LIM / `4` * `3`;
408	limit = min(limit, bprm->rlim_stack.rlim_cur / `4`);
409	/*
410	* We've historically supported up to 32 pages (ARG_MAX)
411	* of argument strings even with small stacks
412	*/
413	limit = max_t(unsigned long, limit, ARG_MAX);
414	/ Reject totally pathological counts. /
415	if (bprm->argc < `0` \|\| bprm->envc < `0`)
416	return -E2BIG;
417	/*
418	* We must account for the size of all the argv and envp pointers to
419	* the argv and envp strings, since they will also take up space in
420	* the stack. They aren't stored until much later when we can't
421	* signal to the parent that the child has run out of stack space.
422	* Instead, calculate it here so it's possible to fail gracefully.
423	*
424	* In the case of argc = 0, make sure there is space for adding a
425	* empty string (which will bump argc to 1), to ensure confused
426	* userspace programs don't start processing from argv[1], thinking
427	* argc can never be 0, to keep them from walking envp by accident.
428	* See do_execveat_common().
429	*/
430	if (check_add_overflow(max(bprm->argc, `1`), bprm->envc, &ptr_size) \|\|
431	check_mul_overflow(ptr_size, sizeof(void *), &ptr_size))
432	return -E2BIG;
433	if (limit <= ptr_size)
434	return -E2BIG;
435	limit -= ptr_size;
436
437	return bprm_set_stack_limit(bprm, limit);
438	}
439
440	/*
441	* 'copy_strings()' copies argument/environment strings from the old
442	* processes's memory to the new process's stack. The call to get_user_pages()
443	* ensures the destination page is created and not swapped out.
444	*/
445	static int copy_strings(int argc, struct user_arg_ptr argv,
446	struct linux_binprm *bprm)
447	{
448	struct page *kmapped_page = NULL;
449	char *kaddr = NULL;
450	unsigned long kpos = `0`;
451	int ret;
452
453	while (argc-- > `0`) {
454	const char __user *str;
455	int len;
456	unsigned long pos;
457
458	ret = -EFAULT;
459	str = get_user_arg_ptr(argv, nr: argc);
460	if (IS_ERR(ptr: str))
461	goto out;
462
463	len = strnlen_user(str, MAX_ARG_STRLEN);
464	if (!len)
465	goto out;
466
467	ret = -E2BIG;
468	if (!valid_arg_len(bprm, len))
469	goto out;
470
471	/ We're going to work our way backwards. /
472	pos = bprm->p;
473	str += len;
474	bprm->p -= len;
475	if (bprm_hit_stack_limit(bprm))
476	goto out;
477
478	while (len > `0`) {
479	int offset, bytes_to_copy;
480
481	if (fatal_signal_pending(current)) {
482	ret = -ERESTARTNOHAND;
483	goto out;
484	}
485	cond_resched();
486
487	offset = pos % PAGE_SIZE;
488	if (offset == `0`)
489	offset = PAGE_SIZE;
490
491	bytes_to_copy = offset;
492	if (bytes_to_copy > len)
493	bytes_to_copy = len;
494
495	offset -= bytes_to_copy;
496	pos -= bytes_to_copy;
497	str -= bytes_to_copy;
498	len -= bytes_to_copy;
499
500	if (!kmapped_page \|\| kpos != (pos & PAGE_MASK)) {
501	struct page *page;
502
503	page = get_arg_page(bprm, pos, write: `1`);
504	if (!page) {
505	ret = -E2BIG;
506	goto out;
507	}
508
509	if (kmapped_page) {
510	flush_dcache_page(page: kmapped_page);
511	kunmap_local(kaddr);
512	put_arg_page(page: kmapped_page);
513	}
514	kmapped_page = page;
515	kaddr = kmap_local_page(page: kmapped_page);
516	kpos = pos & PAGE_MASK;
517	flush_arg_page(bprm, pos: kpos, page: kmapped_page);
518	}
519	if (copy_from_user(to: kaddr+offset, from: str, n: bytes_to_copy)) {
520	ret = -EFAULT;
521	goto out;
522	}
523	}
524	}
525	ret = `0`;
526	out:
527	if (kmapped_page) {
528	flush_dcache_page(page: kmapped_page);
529	kunmap_local(kaddr);
530	put_arg_page(page: kmapped_page);
531	}
532	return ret;
533	}
534
535	/*
536	* Copy and argument/environment string from the kernel to the processes stack.
537	*/
538	int copy_string_kernel(const char arg, struct* linux_binprm *bprm)
539	{
540	int len = strnlen(p: arg, MAX_ARG_STRLEN) + `1` / terminating NUL /;
541	unsigned long pos = bprm->p;
542
543	if (len == `0`)
544	return -EFAULT;
545	if (!valid_arg_len(bprm, len))
546	return -E2BIG;
547
548	/ We're going to work our way backwards. /
549	arg += len;
550	bprm->p -= len;
551	if (bprm_hit_stack_limit(bprm))
552	return -E2BIG;
553
554	while (len > `0`) {
555	unsigned int bytes_to_copy = min_t(unsigned int, len,
556	min_not_zero(offset_in_page(pos), PAGE_SIZE));
557	struct page *page;
558
559	pos -= bytes_to_copy;
560	arg -= bytes_to_copy;
561	len -= bytes_to_copy;
562
563	page = get_arg_page(bprm, pos, write: `1`);
564	if (!page)
565	return -E2BIG;
566	flush_arg_page(bprm, pos: pos & PAGE_MASK, page);
567	memcpy_to_page(page, offset_in_page(pos), from: arg, len: bytes_to_copy);
568	put_arg_page(page);
569	}
570
571	return `0`;
572	}
573	EXPORT_SYMBOL(copy_string_kernel);
574
575	static int copy_strings_kernel(int argc, const char *const *argv,
576	struct linux_binprm *bprm)
577	{
578	while (argc-- > `0`) {
579	int ret = copy_string_kernel(argv[argc], bprm);
580	if (ret < `0`)
581	return ret;
582	if (fatal_signal_pending(current))
583	return -ERESTARTNOHAND;
584	cond_resched();
585	}
586	return `0`;
587	}
588
589	#ifdef CONFIG_MMU
590
591	/*
592	* Finalizes the stack vm_area_struct. The flags and permissions are updated,
593	* the stack is optionally relocated, and some extra space is added.
594	*/
595	int setup_arg_pages(struct linux_binprm *bprm,
596	unsigned long stack_top,
597	int executable_stack)
598	{
599	unsigned long ret;
600	unsigned long stack_shift;
601	struct mm_struct *mm = current->mm;
602	struct vm_area_struct *vma = bprm->vma;
603	struct vm_area_struct *prev = NULL;
604	unsigned long vm_flags;
605	unsigned long stack_base;
606	unsigned long stack_size;
607	unsigned long stack_expand;
608	unsigned long rlim_stack;
609	struct mmu_gather tlb;
610	struct vma_iterator vmi;
611
612	#ifdef CONFIG_STACK_GROWSUP
613	/ Limit stack size /
614	stack_base = bprm->rlim_stack.rlim_max;
615
616	stack_base = calc_max_stack_size(stack_base);
617
618	/ Add space for stack randomization. /
619	if (current->flags & PF_RANDOMIZE)
620	stack_base += (STACK_RND_MASK << PAGE_SHIFT);
621
622	/ Make sure we didn't let the argument array grow too large. /
623	if (vma->vm_end - vma->vm_start > stack_base)
624	return -ENOMEM;
625
626	stack_base = PAGE_ALIGN(stack_top - stack_base);
627
628	stack_shift = vma->vm_start - stack_base;
629	mm->arg_start = bprm->p - stack_shift;
630	bprm->p = vma->vm_end - stack_shift;
631	#else
632	stack_top = arch_align_stack(sp: stack_top);
633	stack_top = PAGE_ALIGN(stack_top);
634
635	if (unlikely(stack_top < mmap_min_addr) \|\|
636	unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
637	return -ENOMEM;
638
639	stack_shift = vma->vm_end - stack_top;
640
641	bprm->p -= stack_shift;
642	mm->arg_start = bprm->p;
643	#endif
644
645	bprm->exec -= stack_shift;
646
647	if (mmap_write_lock_killable(mm))
648	return -EINTR;
649
650	vm_flags = VM_STACK_FLAGS;
651
652	/*
653	* Adjust stack execute permissions; explicitly enable for
654	* EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
655	* (arch default) otherwise.
656	*/
657	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
658	vm_flags \|= VM_EXEC;
659	else if (executable_stack == EXSTACK_DISABLE_X)
660	vm_flags &= ~VM_EXEC;
661	vm_flags \|= mm->def_flags;
662	vm_flags \|= VM_STACK_INCOMPLETE_SETUP;
663
664	vma_iter_init(vmi: &vmi, mm, addr: vma->vm_start);
665
666	tlb_gather_mmu(tlb: &tlb, mm);
667	ret = mprotect_fixup(vmi: &vmi, tlb: &tlb, vma, pprev: &prev, start: vma->vm_start, end: vma->vm_end,
668	newflags: vm_flags);
669	tlb_finish_mmu(tlb: &tlb);
670
671	if (ret)
672	goto out_unlock;
673	BUG_ON(prev != vma);
674
675	if (unlikely(vm_flags & VM_EXEC)) {
676	pr_warn_once("process '%pD4' started with executable stack\n",
677	bprm->file);
678	}
679
680	/ Move stack pages down in memory. /
681	if (stack_shift) {
682	/*
683	* During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
684	* the binfmt code determines where the new stack should reside, we shift it to
685	* its final location.
686	*/
687	ret = relocate_vma_down(vma, shift: stack_shift);
688	if (ret)
689	goto out_unlock;
690	}
691
692	/ mprotect_fixup is overkill to remove the temporary stack flags /
693	vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP);
694
695	stack_expand = `131072UL`; / randomly 324k (or 264k) pages /
696	stack_size = vma->vm_end - vma->vm_start;
697	/*
698	* Align this down to a page boundary as expand_stack
699	* will align it up.
700	*/
701	rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
702
703	stack_expand = min(rlim_stack, stack_size + stack_expand);
704
705	#ifdef CONFIG_STACK_GROWSUP
706	stack_base = vma->vm_start + stack_expand;
707	#else
708	stack_base = vma->vm_end - stack_expand;
709	#endif
710	current->mm->start_stack = bprm->p;
711	ret = expand_stack_locked(vma, address: stack_base);
712	if (ret)
713	ret = -EFAULT;
714
715	out_unlock:
716	mmap_write_unlock(mm);
717	return ret;
718	}
719	EXPORT_SYMBOL(setup_arg_pages);
720
721	#else
722
723	/*
724	* Transfer the program arguments and environment from the holding pages
725	* onto the stack. The provided stack pointer is adjusted accordingly.
726	*/
727	int transfer_args_to_stack(struct linux_binprm *bprm,
728	unsigned long *sp_location)
729	{
730	unsigned long index, stop, sp;
731	int ret = `0`;
732
733	stop = bprm->p >> PAGE_SHIFT;
734	sp = *sp_location;
735
736	for (index = MAX_ARG_PAGES - `1`; index >= stop; index--) {
737	unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : `0`;
738	char *src = kmap_local_page(bprm->page[index]) + offset;
739	sp -= PAGE_SIZE - offset;
740	if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != `0`)
741	ret = -EFAULT;
742	kunmap_local(src);
743	if (ret)
744	goto out;
745	}
746
747	bprm->exec += sp_location - MAX_ARG_PAGES PAGE_SIZE;
748	*sp_location = sp;
749
750	out:
751	return ret;
752	}
753	EXPORT_SYMBOL(transfer_args_to_stack);
754
755	#endif /* CONFIG_MMU */
756
757	/*
758	* On success, caller must call do_close_execat() on the returned
759	* struct file to close it.
760	*/
761	static struct file do_open_execat(int* fd, struct filename name, int* flags)
762	{
763	int err;
764	struct file *file __free(fput) = NULL;
765	struct open_flags open_exec_flags = {
766	.open_flag = O_LARGEFILE \| O_RDONLY \| __FMODE_EXEC,
767	.acc_mode = MAY_EXEC,
768	.intent = LOOKUP_OPEN,
769	.lookup_flags = LOOKUP_FOLLOW,
770	};
771
772	if ((flags &
773	~(AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH \| AT_EXECVE_CHECK)) != `0`)
774	return ERR_PTR(error: -EINVAL);
775	if (flags & AT_SYMLINK_NOFOLLOW)
776	open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
777	if (flags & AT_EMPTY_PATH)
778	open_exec_flags.lookup_flags \|= LOOKUP_EMPTY;
779
780	file = do_filp_open(dfd: fd, pathname: name, op: &open_exec_flags);
781	if (IS_ERR(ptr: file))
782	return file;
783
784	/*
785	* In the past the regular type check was here. It moved to may_open() in
786	* 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
787	* an invariant that all non-regular files error out before we get here.
788	*/
789	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) \|\|
790	path_noexec(path: &file->f_path))
791	return ERR_PTR(error: -EACCES);
792
793	err = exe_file_deny_write_access(exe_file: file);
794	if (err)
795	return ERR_PTR(error: err);
796
797	return no_free_ptr(file);
798	}
799
800	/**
801	* open_exec - Open a path name for execution
802	*
803	* @name: path name to open with the intent of executing it.
804	*
805	* Returns ERR_PTR on failure or allocated struct file on success.
806	*
807	* As this is a wrapper for the internal do_open_execat(), callers
808	* must call exe_file_allow_write_access() before fput() on release. Also see
809	* do_close_execat().
810	*/
811	struct file open_exec(const* char *name)
812	{
813	struct filename *filename = getname_kernel(name);
814	struct file *f = ERR_CAST(ptr: filename);
815
816	if (!IS_ERR(ptr: filename)) {
817	f = do_open_execat(AT_FDCWD, name: filename, flags: `0`);
818	putname(name: filename);
819	}
820	return f;
821	}
822	EXPORT_SYMBOL(open_exec);
823
824	#if defined(CONFIG_BINFMT_FLAT) \|\| defined(CONFIG_BINFMT_ELF_FDPIC)
825	ssize_t read_code(struct file file, unsigned* long addr, loff_t pos, size_t len)
826	{
827	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
828	if (res > `0`)
829	flush_icache_user_range(addr, addr + len);
830	return res;
831	}
832	EXPORT_SYMBOL(read_code);
833	#endif
834
835	/*
836	* Maps the mm_struct mm into the current task struct.
837	* On success, this function returns with exec_update_lock
838	* held for writing.
839	*/
840	static int exec_mmap(struct mm_struct *mm)
841	{
842	struct task_struct *tsk;
843	struct mm_struct old_mm, active_mm;
844	int ret;
845
846	/ Notify parent that we're no longer interested in the old VM /
847	tsk = current;
848	old_mm = current->mm;
849	exec_mm_release(tsk, old_mm);
850
851	ret = down_write_killable(sem: &tsk->signal->exec_update_lock);
852	if (ret)
853	return ret;
854
855	if (old_mm) {
856	/*
857	* If there is a pending fatal signal perhaps a signal
858	* whose default action is to create a coredump get
859	* out and die instead of going through with the exec.
860	*/
861	ret = mmap_read_lock_killable(mm: old_mm);
862	if (ret) {
863	up_write(sem: &tsk->signal->exec_update_lock);
864	return ret;
865	}
866	}
867
868	task_lock(p: tsk);
869	membarrier_exec_mmap(mm);
870
871	local_irq_disable();
872	active_mm = tsk->active_mm;
873	tsk->active_mm = mm;
874	tsk->mm = mm;
875	mm_init_cid(mm, p: tsk);
876	/*
877	* This prevents preemption while active_mm is being loaded and
878	* it and mm are being updated, which could cause problems for
879	* lazy tlb mm refcounting when these are updated by context
880	* switches. Not all architectures can handle irqs off over
881	* activate_mm yet.
882	*/
883	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
884	local_irq_enable();
885	activate_mm(active_mm, mm);
886	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
887	local_irq_enable();
888	lru_gen_add_mm(mm);
889	task_unlock(p: tsk);
890	lru_gen_use_mm(mm);
891	if (old_mm) {
892	mmap_read_unlock(mm: old_mm);
893	BUG_ON(active_mm != old_mm);
894	setmax_mm_hiwater_rss(maxrss: &tsk->signal->maxrss, mm: old_mm);
895	mm_update_next_owner(mm: old_mm);
896	mmput(old_mm);
897	return `0`;
898	}
899	mmdrop_lazy_tlb(mm: active_mm);
900	return `0`;
901	}
902
903	static int de_thread(struct task_struct *tsk)
904	{
905	struct signal_struct *sig = tsk->signal;
906	struct sighand_struct *oldsighand = tsk->sighand;
907	spinlock_t *lock = &oldsighand->siglock;
908
909	if (thread_group_empty(p: tsk))
910	goto no_thread_group;
911
912	/*
913	* Kill all other threads in the thread group.
914	*/
915	spin_lock_irq(lock);
916	if ((sig->flags & SIGNAL_GROUP_EXIT) \|\| sig->group_exec_task) {
917	/*
918	* Another group action in progress, just
919	* return so that the signal is processed.
920	*/
921	spin_unlock_irq(lock);
922	return -EAGAIN;
923	}
924
925	sig->group_exec_task = tsk;
926	sig->notify_count = zap_other_threads(p: tsk);
927	if (!thread_group_leader(p: tsk))
928	sig->notify_count--;
929
930	while (sig->notify_count) {
931	__set_current_state(TASK_KILLABLE);
932	spin_unlock_irq(lock);
933	schedule();
934	if (__fatal_signal_pending(p: tsk))
935	goto killed;
936	spin_lock_irq(lock);
937	}
938	spin_unlock_irq(lock);
939
940	/*
941	* At this point all other threads have exited, all we have to
942	* do is to wait for the thread group leader to become inactive,
943	* and to assume its PID:
944	*/
945	if (!thread_group_leader(p: tsk)) {
946	struct task_struct *leader = tsk->group_leader;
947
948	for (;;) {
949	cgroup_threadgroup_change_begin(tsk);
950	write_lock_irq(&tasklist_lock);
951	/*
952	* Do this under tasklist_lock to ensure that
953	* exit_notify() can't miss ->group_exec_task
954	*/
955	sig->notify_count = -`1`;
956	if (likely(leader->exit_state))
957	break;
958	__set_current_state(TASK_KILLABLE);
959	write_unlock_irq(&tasklist_lock);
960	cgroup_threadgroup_change_end(tsk);
961	schedule();
962	if (__fatal_signal_pending(p: tsk))
963	goto killed;
964	}
965
966	/*
967	* The only record we have of the real-time age of a
968	* process, regardless of execs it's done, is start_time.
969	* All the past CPU time is accumulated in signal_struct
970	* from sister threads now dead. But in this non-leader
971	* exec, nothing survives from the original leader thread,
972	* whose birth marks the true age of this process now.
973	* When we take on its identity by switching to its PID, we
974	* also take its birthdate (always earlier than our own).
975	*/
976	tsk->start_time = leader->start_time;
977	tsk->start_boottime = leader->start_boottime;
978
979	BUG_ON(!same_thread_group(leader, tsk));
980	/*
981	* An exec() starts a new thread group with the
982	* TGID of the previous thread group. Rehash the
983	* two threads with a switched PID, and release
984	* the former thread group leader:
985	*/
986
987	/ Become a process group leader with the old leader's pid.*
988	* The old leader becomes a thread of the this thread group.
989	*/
990	exchange_tids(task: tsk, old: leader);
991	transfer_pid(old: leader, new: tsk, PIDTYPE_TGID);
992	transfer_pid(old: leader, new: tsk, PIDTYPE_PGID);
993	transfer_pid(old: leader, new: tsk, PIDTYPE_SID);
994
995	list_replace_rcu(old: &leader->tasks, new: &tsk->tasks);
996	list_replace_init(old: &leader->sibling, new: &tsk->sibling);
997
998	tsk->group_leader = tsk;
999	leader->group_leader = tsk;
1000
1001	tsk->exit_signal = SIGCHLD;
1002	leader->exit_signal = -`1`;
1003
1004	BUG_ON(leader->exit_state != EXIT_ZOMBIE);
1005	leader->exit_state = EXIT_DEAD;
1006	/*
1007	* We are going to release_task()->ptrace_unlink() silently,
1008	* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
1009	* the tracer won't block again waiting for this thread.
1010	*/
1011	if (unlikely(leader->ptrace))
1012	__wake_up_parent(p: leader, parent: leader->parent);
1013	write_unlock_irq(&tasklist_lock);
1014	cgroup_threadgroup_change_end(tsk);
1015
1016	release_task(p: leader);
1017	}
1018
1019	sig->group_exec_task = NULL;
1020	sig->notify_count = `0`;
1021
1022	no_thread_group:
1023	/ we have changed execution domain /
1024	tsk->exit_signal = SIGCHLD;
1025
1026	BUG_ON(!thread_group_leader(tsk));
1027	return `0`;
1028
1029	killed:
1030	/ protects against exit_notify() and __exit_signal() /
1031	read_lock(&tasklist_lock);
1032	sig->group_exec_task = NULL;
1033	sig->notify_count = `0`;
1034	read_unlock(&tasklist_lock);
1035	return -EAGAIN;
1036	}
1037
1038
1039	/*
1040	* This function makes sure the current process has its own signal table,
1041	* so that flush_signal_handlers can later reset the handlers without
1042	* disturbing other processes. (Other processes might share the signal
1043	* table via the CLONE_SIGHAND option to clone().)
1044	*/
1045	static int unshare_sighand(struct task_struct *me)
1046	{
1047	struct sighand_struct *oldsighand = me->sighand;
1048
1049	if (refcount_read(r: &oldsighand->count) != `1`) {
1050	struct sighand_struct *newsighand;
1051	/*
1052	* This ->sighand is shared with the CLONE_SIGHAND
1053	* but not CLONE_THREAD task, switch to the new one.
1054	*/
1055	newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1056	if (!newsighand)
1057	return -ENOMEM;
1058
1059	refcount_set(r: &newsighand->count, n: `1`);
1060
1061	write_lock_irq(&tasklist_lock);
1062	spin_lock(lock: &oldsighand->siglock);
1063	memcpy(newsighand->action, oldsighand->action,
1064	sizeof(newsighand->action));
1065	rcu_assign_pointer(me->sighand, newsighand);
1066	spin_unlock(lock: &oldsighand->siglock);
1067	write_unlock_irq(&tasklist_lock);
1068
1069	__cleanup_sighand(oldsighand);
1070	}
1071	return `0`;
1072	}
1073
1074	/*
1075	* This is unlocked -- the string will always be NUL-terminated, but
1076	* may show overlapping contents if racing concurrent reads.
1077	*/
1078	void __set_task_comm(struct task_struct tsk, const* char *buf, bool exec)
1079	{
1080	size_t len = min(strlen(buf), sizeof(tsk->comm) - `1`);
1081
1082	trace_task_rename(task: tsk, comm: buf);
1083	memcpy(tsk->comm, buf, len);
1084	memset(&tsk->comm[len], `0`, sizeof(tsk->comm) - len);
1085	perf_event_comm(tsk, exec);
1086	}
1087
1088	/*
1089	* Calling this is the point of no return. None of the failures will be
1090	* seen by userspace since either the process is already taking a fatal
1091	* signal (via de_thread() or coredump), or will have SEGV raised
1092	* (after exec_mmap()) by search_binary_handler (see below).
1093	*/
1094	int begin_new_exec(struct linux_binprm * bprm)
1095	{
1096	struct task_struct *me = current;
1097	int retval;
1098
1099	/ Once we are committed compute the creds /
1100	retval = bprm_creds_from_file(bprm);
1101	if (retval)
1102	return retval;
1103
1104	/*
1105	* This tracepoint marks the point before flushing the old exec where
1106	* the current task is still unchanged, but errors are fatal (point of
1107	* no return). The later "sched_process_exec" tracepoint is called after
1108	* the current task has successfully switched to the new exec.
1109	*/
1110	trace_sched_prepare_exec(current, bprm);
1111
1112	/*
1113	* Ensure all future errors are fatal.
1114	*/
1115	bprm->point_of_no_return = true;
1116
1117	/ Make this the only thread in the thread group /
1118	retval = de_thread(tsk: me);
1119	if (retval)
1120	goto out;
1121	/ see the comment in check_unsafe_exec() /
1122	current->fs->in_exec = `0`;
1123	/*
1124	* Cancel any io_uring activity across execve
1125	*/
1126	io_uring_task_cancel();
1127
1128	/ Ensure the files table is not shared. /
1129	retval = unshare_files();
1130	if (retval)
1131	goto out;
1132
1133	/*
1134	* Must be called _before_ exec_mmap() as bprm->mm is
1135	* not visible until then. Doing it here also ensures
1136	* we don't race against replace_mm_exe_file().
1137	*/
1138	retval = set_mm_exe_file(mm: bprm->mm, new_exe_file: bprm->file);
1139	if (retval)
1140	goto out;
1141
1142	/ If the binary is not readable then enforce mm->dumpable=0 /
1143	would_dump(bprm, bprm->file);
1144	if (bprm->have_execfd)
1145	would_dump(bprm, bprm->executable);
1146
1147	/*
1148	* Release all of the old mmap stuff
1149	*/
1150	acct_arg_size(bprm, pages: `0`);
1151	retval = exec_mmap(mm: bprm->mm);
1152	if (retval)
1153	goto out;
1154
1155	bprm->mm = NULL;
1156
1157	retval = exec_task_namespaces();
1158	if (retval)
1159	goto out_unlock;
1160
1161	#ifdef CONFIG_POSIX_TIMERS
1162	spin_lock_irq(lock: &me->sighand->siglock);
1163	posix_cpu_timers_exit(task: me);
1164	spin_unlock_irq(lock: &me->sighand->siglock);
1165	exit_itimers(me);
1166	flush_itimer_signals();
1167	#endif
1168
1169	/*
1170	* Make the signal table private.
1171	*/
1172	retval = unshare_sighand(me);
1173	if (retval)
1174	goto out_unlock;
1175
1176	me->flags &= ~(PF_RANDOMIZE \| PF_FORKNOEXEC \|
1177	PF_NOFREEZE \| PF_NO_SETAFFINITY);
1178	flush_thread();
1179	me->personality &= ~bprm->per_clear;
1180
1181	clear_syscall_work_syscall_user_dispatch(me);
1182
1183	/*
1184	* We have to apply CLOEXEC before we change whether the process is
1185	* dumpable (in setup_new_exec) to avoid a race with a process in userspace
1186	* trying to access the should-be-closed file descriptors of a process
1187	* undergoing exec(2).
1188	*/
1189	do_close_on_exec(me->files);
1190
1191	if (bprm->secureexec) {
1192	/ Make sure parent cannot signal privileged process. /
1193	me->pdeath_signal = `0`;
1194
1195	/*
1196	* For secureexec, reset the stack limit to sane default to
1197	* avoid bad behavior from the prior rlimits. This has to
1198	* happen before arch_pick_mmap_layout(), which examines
1199	* RLIMIT_STACK, but after the point of no return to avoid
1200	* needing to clean up the change on failure.
1201	*/
1202	if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1203	bprm->rlim_stack.rlim_cur = _STK_LIM;
1204	}
1205
1206	me->sas_ss_sp = me->sas_ss_size = `0`;
1207
1208	/*
1209	* Figure out dumpability. Note that this checking only of current
1210	* is wrong, but userspace depends on it. This should be testing
1211	* bprm->secureexec instead.
1212	*/
1213	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP \|\|
1214	!(uid_eq(current_euid(), current_uid()) &&
1215	gid_eq(current_egid(), current_gid())))
1216	set_dumpable(current->mm, value: suid_dumpable);
1217	else
1218	set_dumpable(current->mm, SUID_DUMP_USER);
1219
1220	perf_event_exec();
1221
1222	/*
1223	* If the original filename was empty, alloc_bprm() made up a path
1224	* that will probably not be useful to admins running ps or similar.
1225	* Let's fix it up to be something reasonable.
1226	*/
1227	if (bprm->comm_from_dentry) {
1228	/*
1229	* Hold RCU lock to keep the name from being freed behind our back.
1230	* Use acquire semantics to make sure the terminating NUL from
1231	* __d_alloc() is seen.
1232	*
1233	* Note, we're deliberately sloppy here. We don't need to care about
1234	* detecting a concurrent rename and just want a terminated name.
1235	*/
1236	rcu_read_lock();
1237	__set_task_comm(tsk: me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name),
1238	exec: true);
1239	rcu_read_unlock();
1240	} else {
1241	__set_task_comm(tsk: me, buf: kbasename(path: bprm->filename), exec: true);
1242	}
1243
1244	/ An exec changes our domain. We are no longer part of the thread*
1245	group /*
1246	WRITE_ONCE(me->self_exec_id, me->self_exec_id + `1`);
1247	flush_signal_handlers(me, force_default: `0`);
1248
1249	retval = set_cred_ucounts(bprm->cred);
1250	if (retval < `0`)
1251	goto out_unlock;
1252
1253	/*
1254	* install the new credentials for this executable
1255	*/
1256	security_bprm_committing_creds(bprm);
1257
1258	commit_creds(bprm->cred);
1259	bprm->cred = NULL;
1260
1261	/*
1262	* Disable monitoring for regular users
1263	* when executing setuid binaries. Must
1264	* wait until new credentials are committed
1265	* by commit_creds() above
1266	*/
1267	if (get_dumpable(mm: me->mm) != SUID_DUMP_USER)
1268	perf_event_exit_task(child: me);
1269	/*
1270	* cred_guard_mutex must be held at least to this point to prevent
1271	* ptrace_attach() from altering our determination of the task's
1272	* credentials; any time after this it may be unlocked.
1273	*/
1274	security_bprm_committed_creds(bprm);
1275
1276	/ Pass the opened binary to the interpreter. /
1277	if (bprm->have_execfd) {
1278	retval = get_unused_fd_flags(flags: `0`);
1279	if (retval < `0`)
1280	goto out_unlock;
1281	fd_install(fd: retval, file: bprm->executable);
1282	bprm->executable = NULL;
1283	bprm->execfd = retval;
1284	}
1285	return `0`;
1286
1287	out_unlock:
1288	up_write(sem: &me->signal->exec_update_lock);
1289	if (!bprm->cred)
1290	mutex_unlock(lock: &me->signal->cred_guard_mutex);
1291
1292	out:
1293	return retval;
1294	}
1295	EXPORT_SYMBOL(begin_new_exec);
1296
1297	void would_dump(struct linux_binprm bprm, struct* file *file)
1298	{
1299	struct inode *inode = file_inode(f: file);
1300	struct mnt_idmap *idmap = file_mnt_idmap(file);
1301	if (inode_permission(idmap, inode, MAY_READ) < `0`) {
1302	struct user_namespace old, user_ns;
1303	bprm->interp_flags \|= BINPRM_FLAGS_ENFORCE_NONDUMP;
1304
1305	/ Ensure mm->user_ns contains the executable /
1306	user_ns = old = bprm->mm->user_ns;
1307	while ((user_ns != &init_user_ns) &&
1308	!privileged_wrt_inode_uidgid(ns: user_ns, idmap, inode))
1309	user_ns = user_ns->parent;
1310
1311	if (old != user_ns) {
1312	bprm->mm->user_ns = get_user_ns(ns: user_ns);
1313	put_user_ns(ns: old);
1314	}
1315	}
1316	}
1317	EXPORT_SYMBOL(would_dump);
1318
1319	void setup_new_exec(struct linux_binprm * bprm)
1320	{
1321	/ Setup things that can depend upon the personality /
1322	struct task_struct *me = current;
1323
1324	arch_pick_mmap_layout(mm: me->mm, rlim_stack: &bprm->rlim_stack);
1325
1326	arch_setup_new_exec();
1327
1328	/ Set the new mm task size. We have to do that late because it may*
1329	* depend on TIF_32BIT which is only updated in flush_thread() on
1330	* some architectures like powerpc
1331	*/
1332	me->mm->task_size = TASK_SIZE;
1333	up_write(sem: &me->signal->exec_update_lock);
1334	mutex_unlock(lock: &me->signal->cred_guard_mutex);
1335	}
1336	EXPORT_SYMBOL(setup_new_exec);
1337
1338	/ Runs immediately before start_thread() takes over. /
1339	void finalize_exec(struct linux_binprm *bprm)
1340	{
1341	/ Store any stack rlimit changes before starting thread. /
1342	task_lock(current->group_leader);
1343	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
1344	task_unlock(current->group_leader);
1345	}
1346	EXPORT_SYMBOL(finalize_exec);
1347
1348	/*
1349	* Prepare credentials and lock ->cred_guard_mutex.
1350	* setup_new_exec() commits the new creds and drops the lock.
1351	* Or, if exec fails before, free_bprm() should release ->cred
1352	* and unlock.
1353	*/
1354	static int prepare_bprm_creds(struct linux_binprm *bprm)
1355	{
1356	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1357	return -ERESTARTNOINTR;
1358
1359	bprm->cred = prepare_exec_creds();
1360	if (likely(bprm->cred))
1361	return `0`;
1362
1363	mutex_unlock(lock: &current->signal->cred_guard_mutex);
1364	return -ENOMEM;
1365	}
1366
1367	/ Matches do_open_execat() /
1368	static void do_close_execat(struct file *file)
1369	{
1370	if (!file)
1371	return;
1372	exe_file_allow_write_access(exe_file: file);
1373	fput(file);
1374	}
1375
1376	static void free_bprm(struct linux_binprm *bprm)
1377	{
1378	if (bprm->mm) {
1379	acct_arg_size(bprm, pages: `0`);
1380	mmput(bprm->mm);
1381	}
1382	free_arg_pages(bprm);
1383	if (bprm->cred) {
1384	/ in case exec fails before de_thread() succeeds /
1385	current->fs->in_exec = `0`;
1386	mutex_unlock(lock: &current->signal->cred_guard_mutex);
1387	abort_creds(bprm->cred);
1388	}
1389	do_close_execat(file: bprm->file);
1390	if (bprm->executable)
1391	fput(bprm->executable);
1392	/ If a binfmt changed the interp, free it. /
1393	if (bprm->interp != bprm->filename)
1394	kfree(objp: bprm->interp);
1395	kfree(objp: bprm->fdpath);
1396	kfree(objp: bprm);
1397	}
1398
1399	static struct linux_binprm alloc_bprm(int* fd, struct filename filename, int* flags)
1400	{
1401	struct linux_binprm *bprm;
1402	struct file *file;
1403	int retval = -ENOMEM;
1404
1405	file = do_open_execat(fd, name: filename, flags);
1406	if (IS_ERR(ptr: file))
1407	return ERR_CAST(ptr: file);
1408
1409	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1410	if (!bprm) {
1411	do_close_execat(file);
1412	return ERR_PTR(error: -ENOMEM);
1413	}
1414
1415	bprm->file = file;
1416
1417	if (fd == AT_FDCWD \|\| filename->name[`0`] == `'/'`) {
1418	bprm->filename = filename->name;
1419	} else {
1420	if (filename->name[`0`] == `'\0'`) {
1421	bprm->fdpath = kasprintf(GFP_KERNEL, fmt: "/dev/fd/%d", fd);
1422	bprm->comm_from_dentry = `1`;
1423	} else {
1424	bprm->fdpath = kasprintf(GFP_KERNEL, fmt: "/dev/fd/%d/%s",
1425	fd, filename->name);
1426	}
1427	if (!bprm->fdpath)
1428	goto out_free;
1429
1430	/*
1431	* Record that a name derived from an O_CLOEXEC fd will be
1432	* inaccessible after exec. This allows the code in exec to
1433	* choose to fail when the executable is not mmaped into the
1434	* interpreter and an open file descriptor is not passed to
1435	* the interpreter. This makes for a better user experience
1436	* than having the interpreter start and then immediately fail
1437	* when it finds the executable is inaccessible.
1438	*/
1439	if (get_close_on_exec(fd))
1440	bprm->interp_flags \|= BINPRM_FLAGS_PATH_INACCESSIBLE;
1441
1442	bprm->filename = bprm->fdpath;
1443	}
1444	bprm->interp = bprm->filename;
1445
1446	/*
1447	* At this point, security_file_open() has already been called (with
1448	* __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will
1449	* stop just after the security_bprm_creds_for_exec() call in
1450	* bprm_execve(). Indeed, the kernel should not try to parse the
1451	* content of the file with exec_binprm() nor change the calling
1452	* thread, which means that the following security functions will not
1453	* be called:
1454	* - security_bprm_check()
1455	* - security_bprm_creds_from_file()
1456	* - security_bprm_committing_creds()
1457	* - security_bprm_committed_creds()
1458	*/
1459	bprm->is_check = !!(flags & AT_EXECVE_CHECK);
1460
1461	retval = bprm_mm_init(bprm);
1462	if (!retval)
1463	return bprm;
1464
1465	out_free:
1466	free_bprm(bprm);
1467	return ERR_PTR(error: retval);
1468	}
1469
1470	int bprm_change_interp(const char interp, struct* linux_binprm *bprm)
1471	{
1472	/ If a binfmt changed the interp, free it first. /
1473	if (bprm->interp != bprm->filename)
1474	kfree(objp: bprm->interp);
1475	bprm->interp = kstrdup(s: interp, GFP_KERNEL);
1476	if (!bprm->interp)
1477	return -ENOMEM;
1478	return `0`;
1479	}
1480	EXPORT_SYMBOL(bprm_change_interp);
1481
1482	/*
1483	* determine how safe it is to execute the proposed program
1484	* - the caller must hold ->cred_guard_mutex to protect against
1485	* PTRACE_ATTACH or seccomp thread-sync
1486	*/
1487	static void check_unsafe_exec(struct linux_binprm *bprm)
1488	{
1489	struct task_struct p = current, t;
1490	unsigned n_fs;
1491
1492	if (p->ptrace)
1493	bprm->unsafe \|= LSM_UNSAFE_PTRACE;
1494
1495	/*
1496	* This isn't strictly necessary, but it makes it harder for LSMs to
1497	* mess up.
1498	*/
1499	if (task_no_new_privs(current))
1500	bprm->unsafe \|= LSM_UNSAFE_NO_NEW_PRIVS;
1501
1502	/*
1503	* If another task is sharing our fs, we cannot safely
1504	* suid exec because the differently privileged task
1505	* will be able to manipulate the current directory, etc.
1506	* It would be nice to force an unshare instead...
1507	*
1508	* Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS)
1509	* from another sub-thread until de_thread() succeeds, this
1510	* state is protected by cred_guard_mutex we hold.
1511	*/
1512	n_fs = `1`;
1513	spin_lock(lock: &p->fs->lock);
1514	rcu_read_lock();
1515	for_other_threads(p, t) {
1516	if (t->fs == p->fs)
1517	n_fs++;
1518	}
1519	rcu_read_unlock();
1520
1521	/ "users" and "in_exec" locked for copy_fs() /
1522	if (p->fs->users > n_fs)
1523	bprm->unsafe \|= LSM_UNSAFE_SHARE;
1524	else
1525	p->fs->in_exec = `1`;
1526	spin_unlock(lock: &p->fs->lock);
1527	}
1528
1529	static void bprm_fill_uid(struct linux_binprm bprm, struct* file *file)
1530	{
1531	/ Handle suid and sgid on files /
1532	struct mnt_idmap *idmap;
1533	struct inode *inode = file_inode(f: file);
1534	unsigned int mode;
1535	vfsuid_t vfsuid;
1536	vfsgid_t vfsgid;
1537	int err;
1538
1539	if (!mnt_may_suid(mnt: file->f_path.mnt))
1540	return;
1541
1542	if (task_no_new_privs(current))
1543	return;
1544
1545	mode = READ_ONCE(inode->i_mode);
1546	if (!(mode & (S_ISUID\|S_ISGID)))
1547	return;
1548
1549	idmap = file_mnt_idmap(file);
1550
1551	/ Be careful if suid/sgid is set /
1552	inode_lock(inode);
1553
1554	/ Atomically reload and check mode/uid/gid now that lock held. /
1555	mode = inode->i_mode;
1556	vfsuid = i_uid_into_vfsuid(idmap, inode);
1557	vfsgid = i_gid_into_vfsgid(idmap, inode);
1558	err = inode_permission(idmap, inode, MAY_EXEC);
1559	inode_unlock(inode);
1560
1561	/ Did the exec bit vanish out from under us? Give up. /
1562	if (err)
1563	return;
1564
1565	/ We ignore suid/sgid if there are no mappings for them in the ns /
1566	if (!vfsuid_has_mapping(userns: bprm->cred->user_ns, vfsuid) \|\|
1567	!vfsgid_has_mapping(userns: bprm->cred->user_ns, vfsgid))
1568	return;
1569
1570	if (mode & S_ISUID) {
1571	bprm->per_clear \|= PER_CLEAR_ON_SETID;
1572	bprm->cred->euid = vfsuid_into_kuid(vfsuid);
1573	}
1574
1575	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP)) {
1576	bprm->per_clear \|= PER_CLEAR_ON_SETID;
1577	bprm->cred->egid = vfsgid_into_kgid(vfsgid);
1578	}
1579	}
1580
1581	/*
1582	* Compute brpm->cred based upon the final binary.
1583	*/
1584	static int bprm_creds_from_file(struct linux_binprm *bprm)
1585	{
1586	/ Compute creds based on which file? /
1587	struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
1588
1589	bprm_fill_uid(bprm, file);
1590	return security_bprm_creds_from_file(bprm, file);
1591	}
1592
1593	/*
1594	* Fill the binprm structure from the inode.
1595	* Read the first BINPRM_BUF_SIZE bytes
1596	*
1597	* This may be called multiple times for binary chains (scripts for example).
1598	*/
1599	static int prepare_binprm(struct linux_binprm *bprm)
1600	{
1601	loff_t pos = `0`;
1602
1603	memset(bprm->buf, `0`, BINPRM_BUF_SIZE);
1604	return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
1605	}
1606
1607	/*
1608	* Arguments are '\0' separated strings found at the location bprm->p
1609	* points to; chop off the first by relocating brpm->p to right after
1610	* the first '\0' encountered.
1611	*/
1612	int remove_arg_zero(struct linux_binprm *bprm)
1613	{
1614	unsigned long offset;
1615	char *kaddr;
1616	struct page *page;
1617
1618	if (!bprm->argc)
1619	return `0`;
1620
1621	do {
1622	offset = bprm->p & ~PAGE_MASK;
1623	page = get_arg_page(bprm, pos: bprm->p, write: `0`);
1624	if (!page)
1625	return -EFAULT;
1626	kaddr = kmap_local_page(page);
1627
1628	for (; offset < PAGE_SIZE && kaddr[offset];
1629	offset++, bprm->p++)
1630	;
1631
1632	kunmap_local(kaddr);
1633	put_arg_page(page);
1634	} while (offset == PAGE_SIZE);
1635
1636	bprm->p++;
1637	bprm->argc--;
1638
1639	return `0`;
1640	}
1641	EXPORT_SYMBOL(remove_arg_zero);
1642
1643	/*
1644	* cycle the list of binary formats handler, until one recognizes the image
1645	*/
1646	static int search_binary_handler(struct linux_binprm *bprm)
1647	{
1648	struct linux_binfmt *fmt;
1649	int retval;
1650
1651	retval = prepare_binprm(bprm);
1652	if (retval < `0`)
1653	return retval;
1654
1655	retval = security_bprm_check(bprm);
1656	if (retval)
1657	return retval;
1658
1659	read_lock(&binfmt_lock);
1660	list_for_each_entry(fmt, &formats, lh) {
1661	if (!try_module_get(module: fmt->module))
1662	continue;
1663	read_unlock(&binfmt_lock);
1664
1665	retval = fmt->load_binary(bprm);
1666
1667	read_lock(&binfmt_lock);
1668	put_binfmt(fmt);
1669	if (bprm->point_of_no_return \|\| (retval != -ENOEXEC)) {
1670	read_unlock(&binfmt_lock);
1671	return retval;
1672	}
1673	}
1674	read_unlock(&binfmt_lock);
1675
1676	return -ENOEXEC;
1677	}
1678
1679	/ binfmt handlers will call back into begin_new_exec() on success. /
1680	static int exec_binprm(struct linux_binprm *bprm)
1681	{
1682	pid_t old_pid, old_vpid;
1683	int ret, depth;
1684
1685	/ Need to fetch pid before load_binary changes it /
1686	old_pid = current->pid;
1687	rcu_read_lock();
1688	old_vpid = task_pid_nr_ns(current, ns: task_active_pid_ns(current->parent));
1689	rcu_read_unlock();
1690
1691	/ This allows 4 levels of binfmt rewrites before failing hard. /
1692	for (depth = `0`;; depth++) {
1693	struct file *exec;
1694	if (depth > `5`)
1695	return -ELOOP;
1696
1697	ret = search_binary_handler(bprm);
1698	if (ret < `0`)
1699	return ret;
1700	if (!bprm->interpreter)
1701	break;
1702
1703	exec = bprm->file;
1704	bprm->file = bprm->interpreter;
1705	bprm->interpreter = NULL;
1706
1707	exe_file_allow_write_access(exe_file: exec);
1708	if (unlikely(bprm->have_execfd)) {
1709	if (bprm->executable) {
1710	fput(exec);
1711	return -ENOEXEC;
1712	}
1713	bprm->executable = exec;
1714	} else
1715	fput(exec);
1716	}
1717
1718	audit_bprm(bprm);
1719	trace_sched_process_exec(current, old_pid, bprm);
1720	ptrace_event(PTRACE_EVENT_EXEC, message: old_vpid);
1721	proc_exec_connector(current);
1722	return `0`;
1723	}
1724
1725	static int bprm_execve(struct linux_binprm *bprm)
1726	{
1727	int retval;
1728
1729	retval = prepare_bprm_creds(bprm);
1730	if (retval)
1731	return retval;
1732
1733	/*
1734	* Check for unsafe execution states before exec_binprm(), which
1735	* will call back into begin_new_exec(), into bprm_creds_from_file(),
1736	* where setuid-ness is evaluated.
1737	*/
1738	check_unsafe_exec(bprm);
1739	current->in_execve = `1`;
1740	sched_mm_cid_before_execve(current);
1741
1742	sched_exec();
1743
1744	/ Set the unchanging part of bprm->cred /
1745	retval = security_bprm_creds_for_exec(bprm);
1746	if (retval \|\| bprm->is_check)
1747	goto out;
1748
1749	retval = exec_binprm(bprm);
1750	if (retval < `0`)
1751	goto out;
1752
1753	sched_mm_cid_after_execve(current);
1754	rseq_execve(current);
1755	/ execve succeeded /
1756	current->in_execve = `0`;
1757	user_events_execve(current);
1758	acct_update_integrals(current);
1759	task_numa_free(current, final: false);
1760	return retval;
1761
1762	out:
1763	/*
1764	* If past the point of no return ensure the code never
1765	* returns to the userspace process. Use an existing fatal
1766	* signal if present otherwise terminate the process with
1767	* SIGSEGV.
1768	*/
1769	if (bprm->point_of_no_return && !fatal_signal_pending(current))
1770	force_fatal_sig(SIGSEGV);
1771
1772	sched_mm_cid_after_execve(current);
1773	rseq_set_notify_resume(current);
1774	current->in_execve = `0`;
1775
1776	return retval;
1777	}
1778
1779	static int do_execveat_common(int fd, struct filename *filename,
1780	struct user_arg_ptr argv,
1781	struct user_arg_ptr envp,
1782	int flags)
1783	{
1784	struct linux_binprm *bprm;
1785	int retval;
1786
1787	if (IS_ERR(ptr: filename))
1788	return PTR_ERR(ptr: filename);
1789
1790	/*
1791	* We move the actual failure in case of RLIMIT_NPROC excess from
1792	* set*uid() to execve() because too many poorly written programs
1793	* don't check setuid() return code. Here we additionally recheck
1794	* whether NPROC limit is still exceeded.
1795	*/
1796	if ((current->flags & PF_NPROC_EXCEEDED) &&
1797	is_rlimit_overlimit(current_ucounts(), type: UCOUNT_RLIMIT_NPROC, max: rlimit(RLIMIT_NPROC))) {
1798	retval = -EAGAIN;
1799	goto out_ret;
1800	}
1801
1802	/ We're below the limit (still or again), so we don't want to make*
1803	* further execve() calls fail. */
1804	current->flags &= ~PF_NPROC_EXCEEDED;
1805
1806	bprm = alloc_bprm(fd, filename, flags);
1807	if (IS_ERR(ptr: bprm)) {
1808	retval = PTR_ERR(ptr: bprm);
1809	goto out_ret;
1810	}
1811
1812	retval = count(argv, MAX_ARG_STRINGS);
1813	if (retval < `0`)
1814	goto out_free;
1815	bprm->argc = retval;
1816
1817	retval = count(argv: envp, MAX_ARG_STRINGS);
1818	if (retval < `0`)
1819	goto out_free;
1820	bprm->envc = retval;
1821
1822	retval = bprm_stack_limits(bprm);
1823	if (retval < `0`)
1824	goto out_free;
1825
1826	retval = copy_string_kernel(bprm->filename, bprm);
1827	if (retval < `0`)
1828	goto out_free;
1829	bprm->exec = bprm->p;
1830
1831	retval = copy_strings(argc: bprm->envc, argv: envp, bprm);
1832	if (retval < `0`)
1833	goto out_free;
1834
1835	retval = copy_strings(argc: bprm->argc, argv, bprm);
1836	if (retval < `0`)
1837	goto out_free;
1838
1839	/*
1840	* When argv is empty, add an empty string ("") as argv[0] to
1841	* ensure confused userspace programs that start processing
1842	* from argv[1] won't end up walking envp. See also
1843	* bprm_stack_limits().
1844	*/
1845	if (bprm->argc == `0`) {
1846	retval = copy_string_kernel("", bprm);
1847	if (retval < `0`)
1848	goto out_free;
1849	bprm->argc = `1`;
1850
1851	pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
1852	current->comm, bprm->filename);
1853	}
1854
1855	retval = bprm_execve(bprm);
1856	out_free:
1857	free_bprm(bprm);
1858
1859	out_ret:
1860	putname(name: filename);
1861	return retval;
1862	}
1863
1864	int kernel_execve(const char *kernel_filename,
1865	const char *const argv, const* char *const *envp)
1866	{
1867	struct filename *filename;
1868	struct linux_binprm *bprm;
1869	int fd = AT_FDCWD;
1870	int retval;
1871
1872	/ It is non-sense for kernel threads to call execve /
1873	if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
1874	return -EINVAL;
1875
1876	filename = getname_kernel(kernel_filename);
1877	if (IS_ERR(ptr: filename))
1878	return PTR_ERR(ptr: filename);
1879
1880	bprm = alloc_bprm(fd, filename, flags: `0`);
1881	if (IS_ERR(ptr: bprm)) {
1882	retval = PTR_ERR(ptr: bprm);
1883	goto out_ret;
1884	}
1885
1886	retval = count_strings_kernel(argv);
1887	if (WARN_ON_ONCE(retval == `0`))
1888	retval = -EINVAL;
1889	if (retval < `0`)
1890	goto out_free;
1891	bprm->argc = retval;
1892
1893	retval = count_strings_kernel(argv: envp);
1894	if (retval < `0`)
1895	goto out_free;
1896	bprm->envc = retval;
1897
1898	retval = bprm_stack_limits(bprm);
1899	if (retval < `0`)
1900	goto out_free;
1901
1902	retval = copy_string_kernel(bprm->filename, bprm);
1903	if (retval < `0`)
1904	goto out_free;
1905	bprm->exec = bprm->p;
1906
1907	retval = copy_strings_kernel(argc: bprm->envc, argv: envp, bprm);
1908	if (retval < `0`)
1909	goto out_free;
1910
1911	retval = copy_strings_kernel(argc: bprm->argc, argv, bprm);
1912	if (retval < `0`)
1913	goto out_free;
1914
1915	retval = bprm_execve(bprm);
1916	out_free:
1917	free_bprm(bprm);
1918	out_ret:
1919	putname(name: filename);
1920	return retval;
1921	}
1922
1923	static int do_execve(struct filename *filename,
1924	const char __user *const __user *__argv,
1925	const char __user *const __user *__envp)
1926	{
1927	struct user_arg_ptr argv = { .ptr.native = __argv };
1928	struct user_arg_ptr envp = { .ptr.native = __envp };
1929	return do_execveat_common(AT_FDCWD, filename, argv, envp, flags: `0`);
1930	}
1931
1932	static int do_execveat(int fd, struct filename *filename,
1933	const char __user *const __user *__argv,
1934	const char __user *const __user *__envp,
1935	int flags)
1936	{
1937	struct user_arg_ptr argv = { .ptr.native = __argv };
1938	struct user_arg_ptr envp = { .ptr.native = __envp };
1939
1940	return do_execveat_common(fd, filename, argv, envp, flags);
1941	}
1942
1943	#ifdef CONFIG_COMPAT
1944	static int compat_do_execve(struct filename *filename,
1945	const compat_uptr_t __user *__argv,
1946	const compat_uptr_t __user *__envp)
1947	{
1948	struct user_arg_ptr argv = {
1949	.is_compat = true,
1950	.ptr.compat = __argv,
1951	};
1952	struct user_arg_ptr envp = {
1953	.is_compat = true,
1954	.ptr.compat = __envp,
1955	};
1956	return do_execveat_common(AT_FDCWD, filename, argv, envp, flags: `0`);
1957	}
1958
1959	static int compat_do_execveat(int fd, struct filename *filename,
1960	const compat_uptr_t __user *__argv,
1961	const compat_uptr_t __user *__envp,
1962	int flags)
1963	{
1964	struct user_arg_ptr argv = {
1965	.is_compat = true,
1966	.ptr.compat = __argv,
1967	};
1968	struct user_arg_ptr envp = {
1969	.is_compat = true,
1970	.ptr.compat = __envp,
1971	};
1972	return do_execveat_common(fd, filename, argv, envp, flags);
1973	}
1974	#endif
1975
1976	void set_binfmt(struct linux_binfmt *new)
1977	{
1978	struct mm_struct *mm = current->mm;
1979
1980	if (mm->binfmt)
1981	module_put(module: mm->binfmt->module);
1982
1983	mm->binfmt = new;
1984	if (new)
1985	__module_get(module: new->module);
1986	}
1987	EXPORT_SYMBOL(set_binfmt);
1988
1989	/*
1990	* set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1991	*/
1992	void set_dumpable(struct mm_struct mm, int* value)
1993	{
1994	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
1995	return;
1996
1997	set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
1998	}
1999
2000	SYSCALL_DEFINE3(execve,
2001	const char __user *, filename,
2002	const char __user *const __user *, argv,
2003	const char __user *const __user *, envp)
2004	{
2005	return do_execve(filename: getname(name: filename), argv: argv, envp: envp);
2006	}
2007
2008	SYSCALL_DEFINE5(execveat,
2009	int, fd, const char __user *, filename,
2010	const char __user *const __user *, argv,
2011	const char __user *const __user *, envp,
2012	int, flags)
2013	{
2014	return do_execveat(fd,
2015	filename: getname_uflags(filename, flags),
2016	argv: argv, envp: envp, flags);
2017	}
2018
2019	#ifdef CONFIG_COMPAT
2020	COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
2021	const compat_uptr_t __user *, argv,
2022	const compat_uptr_t __user *, envp)
2023	{
2024	return compat_do_execve(filename: getname(name: filename), argv: argv, envp: envp);
2025	}
2026
2027	COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
2028	const char __user *, filename,
2029	const compat_uptr_t __user *, argv,
2030	const compat_uptr_t __user *, envp,
2031	int, flags)
2032	{
2033	return compat_do_execveat(fd,
2034	filename: getname_uflags(filename, flags),
2035	argv: argv, envp: envp, flags);
2036	}
2037	#endif
2038
2039	#ifdef CONFIG_SYSCTL
2040
2041	static int proc_dointvec_minmax_coredump(const struct ctl_table table, int* write,
2042	void buffer, size_t lenp, loff_t *ppos)
2043	{
2044	int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2045
2046	if (!error)
2047	validate_coredump_safety();
2048	return error;
2049	}
2050
2051	static const struct ctl_table fs_exec_sysctls[] = {
2052	{
2053	.procname = "suid_dumpable",
2054	.data = &suid_dumpable,
2055	.maxlen = sizeof(int),
2056	.mode = `0644`,
2057	.proc_handler = proc_dointvec_minmax_coredump,
2058	.extra1 = SYSCTL_ZERO,
2059	.extra2 = SYSCTL_TWO,
2060	},
2061	};
2062
2063	static int __init init_fs_exec_sysctls(void)
2064	{
2065	register_sysctl_init("fs", fs_exec_sysctls);
2066	return `0`;
2067	}
2068
2069	fs_initcall(init_fs_exec_sysctls);
2070	#endif /* CONFIG_SYSCTL */
2071
2072	#ifdef CONFIG_EXEC_KUNIT_TEST
2073	#include "tests/exec_kunit.c"
2074	#endif
2075

source code of linux/fs/exec.c