namei.c source code [linux/fs/namei.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/fs/namei.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	/*
9	* Some corrections by tytso.
10	*/
11
/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */
17
18	#include <linux/init.h>
19	#include <linux/export.h>
20	#include <linux/slab.h>
21	#include <linux/wordpart.h>
22	#include <linux/fs.h>
23	#include <linux/filelock.h>
24	#include <linux/namei.h>
25	#include <linux/pagemap.h>
26	#include <linux/sched/mm.h>
27	#include <linux/fsnotify.h>
28	#include <linux/personality.h>
29	#include <linux/security.h>
30	#include <linux/syscalls.h>
31	#include <linux/mount.h>
32	#include <linux/audit.h>
33	#include <linux/capability.h>
34	#include <linux/file.h>
35	#include <linux/fcntl.h>
36	#include <linux/device_cgroup.h>
37	#include <linux/fs_struct.h>
38	#include <linux/posix_acl.h>
39	#include <linux/hash.h>
40	#include <linux/bitops.h>
41	#include <linux/init_task.h>
42	#include <linux/uaccess.h>
43
44	#include "internal.h"
45	#include "mount.h"
46
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now
 * replaced with a single function lookup_dentry() that can handle all
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */
74
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */
91
/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */
97
/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
112	/*
113	* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
114	* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
115	* any extra contention...
116	*/
117
/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
125
/*
 * Number of pathname bytes (including the NUL terminator) available when
 * struct filename is embedded at the start of a PATH_MAX-sized buffer and
 * the name is stored in its trailing iname[] member.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
127
128	static inline void initname(struct filename name, const* char __user *uptr)
129	{
130	name->uptr = uptr;
131	name->aname = NULL;
132	atomic_set(v: &name->refcnt, i: `1`);
133	}
134
135	struct filename *
136	getname_flags(const char __user filename, int* flags)
137	{
138	struct filename *result;
139	char *kname;
140	int len;
141
142	result = audit_reusename(name: filename);
143	if (result)
144	return result;
145
146	result = __getname();
147	if (unlikely(!result))
148	return ERR_PTR(error: -ENOMEM);
149
150	/*
151	* First, try to embed the struct filename inside the names_cache
152	* allocation
153	*/
154	kname = (char *)result->iname;
155	result->name = kname;
156
157	len = strncpy_from_user(dst: kname, src: filename, EMBEDDED_NAME_MAX);
158	/*
159	* Handle both empty path and copy failure in one go.
160	*/
161	if (unlikely(len <= `0`)) {
162	if (unlikely(len < `0`)) {
163	__putname(result);
164	return ERR_PTR(error: len);
165	}
166
167	/ The empty path is special. /
168	if (!(flags & LOOKUP_EMPTY)) {
169	__putname(result);
170	return ERR_PTR(error: -ENOENT);
171	}
172	}
173
174	/*
175	* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
176	* separate struct filename so we can dedicate the entire
177	* names_cache allocation for the pathname, and re-do the copy from
178	* userland.
179	*/
180	if (unlikely(len == EMBEDDED_NAME_MAX)) {
181	const size_t size = offsetof(struct filename, iname[`1`]);
182	kname = (char *)result;
183
184	/*
185	* size is chosen that way we to guarantee that
186	* result->iname[0] is within the same object and that
187	* kname can't be equal to result->iname, no matter what.
188	*/
189	result = kzalloc(size, GFP_KERNEL);
190	if (unlikely(!result)) {
191	__putname(kname);
192	return ERR_PTR(error: -ENOMEM);
193	}
194	result->name = kname;
195	len = strncpy_from_user(dst: kname, src: filename, PATH_MAX);
196	if (unlikely(len < `0`)) {
197	__putname(kname);
198	kfree(objp: result);
199	return ERR_PTR(error: len);
200	}
201	/ The empty path is special. /
202	if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) {
203	__putname(kname);
204	kfree(objp: result);
205	return ERR_PTR(error: -ENOENT);
206	}
207	if (unlikely(len == PATH_MAX)) {
208	__putname(kname);
209	kfree(objp: result);
210	return ERR_PTR(error: -ENAMETOOLONG);
211	}
212	}
213	initname(name: result, uptr: filename);
214	audit_getname(name: result);
215	return result;
216	}
217
218	struct filename getname_uflags(const* char __user filename, int* uflags)
219	{
220	int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : `0`;
221
222	return getname_flags(filename, flags);
223	}
224
225	struct filename __getname_maybe_null(const* char __user *pathname)
226	{
227	struct filename *name;
228	char c;
229
230	/ try to save on allocations; loss on um, though /
231	if (get_user(c, pathname))
232	return ERR_PTR(error: -EFAULT);
233	if (!c)
234	return NULL;
235
236	name = getname_flags(filename: pathname, LOOKUP_EMPTY);
237	if (!IS_ERR(ptr: name) && !(name->name[`0`])) {
238	putname(name);
239	name = NULL;
240	}
241	return name;
242	}
243
244	struct filename getname_kernel(const* char * filename)
245	{
246	struct filename *result;
247	int len = strlen(filename) + `1`;
248
249	result = __getname();
250	if (unlikely(!result))
251	return ERR_PTR(error: -ENOMEM);
252
253	if (len <= EMBEDDED_NAME_MAX) {
254	result->name = (char *)result->iname;
255	} else if (len <= PATH_MAX) {
256	const size_t size = offsetof(struct filename, iname[`1`]);
257	struct filename *tmp;
258
259	tmp = kmalloc(size, GFP_KERNEL);
260	if (unlikely(!tmp)) {
261	__putname(result);
262	return ERR_PTR(error: -ENOMEM);
263	}
264	tmp->name = (char *)result;
265	result = tmp;
266	} else {
267	__putname(result);
268	return ERR_PTR(error: -ENAMETOOLONG);
269	}
270	memcpy((char *)result->name, filename, len);
271	initname(name: result, NULL);
272	audit_getname(name: result);
273	return result;
274	}
275	EXPORT_SYMBOL(getname_kernel);
276
277	void putname(struct filename *name)
278	{
279	int refcnt;
280
281	if (IS_ERR_OR_NULL(ptr: name))
282	return;
283
284	refcnt = atomic_read(v: &name->refcnt);
285	if (refcnt != `1`) {
286	if (WARN_ON_ONCE(!refcnt))
287	return;
288
289	if (!atomic_dec_and_test(v: &name->refcnt))
290	return;
291	}
292
293	if (name->name != name->iname) {
294	__putname(name->name);
295	kfree(objp: name);
296	} else
297	__putname(name);
298	}
299	EXPORT_SYMBOL(putname);
300
301	/**
302	* check_acl - perform ACL permission checking
303	* @idmap: idmap of the mount the inode was found from
304	* @inode: inode to check permissions on
305	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
306	*
307	* This function performs the ACL permission checking. Since this function
308	* retrieve POSIX acls it needs to know whether it is called from a blocking or
309	* non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
310	*
311	* If the inode has been found through an idmapped mount the idmap of
312	* the vfsmount must be passed through @idmap. This function will then take
313	* care to map the inode according to @idmap before checking permissions.
314	* On non-idmapped mounts or if permission checking is to be performed on the
315	* raw inode simply pass @nop_mnt_idmap.
316	*/
317	static int check_acl(struct mnt_idmap *idmap,
318	struct inode inode, int* mask)
319	{
320	#ifdef CONFIG_FS_POSIX_ACL
321	struct posix_acl *acl;
322
323	if (mask & MAY_NOT_BLOCK) {
324	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
325	if (!acl)
326	return -EAGAIN;
327	/ no ->get_inode_acl() calls in RCU mode... /
328	if (is_uncached_acl(acl))
329	return -ECHILD;
330	return posix_acl_permission(idmap, inode, acl, mask);
331	}
332
333	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
334	if (IS_ERR(ptr: acl))
335	return PTR_ERR(ptr: acl);
336	if (acl) {
337	int error = posix_acl_permission(idmap, inode, acl, mask);
338	posix_acl_release(acl);
339	return error;
340	}
341	#endif
342
343	return -EAGAIN;
344	}
345
346	/*
347	* Very quick optimistic "we know we have no ACL's" check.
348	*
349	* Note that this is purely for ACL_TYPE_ACCESS, and purely
350	* for the "we have cached that there are no ACLs" case.
351	*
352	* If this returns true, we know there are no ACLs. But if
353	* it returns false, we might still not have ACLs (it could
354	* be the is_uncached_acl() case).
355	*/
356	static inline bool no_acl_inode(struct inode *inode)
357	{
358	#ifdef CONFIG_FS_POSIX_ACL
359	return likely(!READ_ONCE(inode->i_acl));
360	#else
361	return true;
362	#endif
363	}
364
365	/**
366	* acl_permission_check - perform basic UNIX permission checking
367	* @idmap: idmap of the mount the inode was found from
368	* @inode: inode to check permissions on
369	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
370	*
371	* This function performs the basic UNIX permission checking. Since this
372	* function may retrieve POSIX acls it needs to know whether it is called from a
373	* blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
374	*
375	* If the inode has been found through an idmapped mount the idmap of
376	* the vfsmount must be passed through @idmap. This function will then take
377	* care to map the inode according to @idmap before checking permissions.
378	* On non-idmapped mounts or if permission checking is to be performed on the
379	* raw inode simply pass @nop_mnt_idmap.
380	*/
381	static int acl_permission_check(struct mnt_idmap *idmap,
382	struct inode inode, int* mask)
383	{
384	unsigned int mode = inode->i_mode;
385	vfsuid_t vfsuid;
386
387	/*
388	* Common cheap case: everybody has the requested
389	* rights, and there are no ACLs to check. No need
390	* to do any owner/group checks in that case.
391	*
392	* - 'mask&7' is the requested permission bit set
393	* - multiplying by 0111 spreads them out to all of ugo
394	* - '& ~mode' looks for missing inode permission bits
395	* - the '!' is for "no missing permissions"
396	*
397	* After that, we just need to check that there are no
398	* ACL's on the inode - do the 'IS_POSIXACL()' check last
399	* because it will dereference the ->i_sb pointer and we
400	* want to avoid that if at all possible.
401	*/
402	if (!((mask & `7`) * `0111` & ~mode)) {
403	if (no_acl_inode(inode))
404	return `0`;
405	if (!IS_POSIXACL(inode))
406	return `0`;
407	}
408
409	/ Are we the owner? If so, ACL's don't matter /
410	vfsuid = i_uid_into_vfsuid(idmap, inode);
411	if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
412	mask &= `7`;
413	mode >>= `6`;
414	return (mask & ~mode) ? -EACCES : `0`;
415	}
416
417	/ Do we have ACL's? /
418	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
419	int error = check_acl(idmap, inode, mask);
420	if (error != -EAGAIN)
421	return error;
422	}
423
424	/ Only RWX matters for group/other mode bits /
425	mask &= `7`;
426
427	/*
428	* Are the group permissions different from
429	* the other permissions in the bits we care
430	* about? Need to check group ownership if so.
431	*/
432	if (mask & (mode ^ (mode >> `3`))) {
433	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
434	if (vfsgid_in_group_p(vfsgid))
435	mode >>= `3`;
436	}
437
438	/ Bits in 'mode' clear that we require? /
439	return (mask & ~mode) ? -EACCES : `0`;
440	}
441
442	/**
443	* generic_permission - check for access rights on a Posix-like filesystem
444	* @idmap: idmap of the mount the inode was found from
445	* @inode: inode to check access rights for
446	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
447	* %MAY_NOT_BLOCK ...)
448	*
449	* Used to check for read/write/execute permissions on a file.
450	* We use "fsuid" for this, letting us set arbitrary permissions
451	* for filesystem access without changing the "normal" uids which
452	* are used for other things.
453	*
454	* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
455	* request cannot be satisfied (eg. requires blocking or too much complexity).
456	* It would then be called again in ref-walk mode.
457	*
458	* If the inode has been found through an idmapped mount the idmap of
459	* the vfsmount must be passed through @idmap. This function will then take
460	* care to map the inode according to @idmap before checking permissions.
461	* On non-idmapped mounts or if permission checking is to be performed on the
462	* raw inode simply pass @nop_mnt_idmap.
463	*/
464	int generic_permission(struct mnt_idmap idmap, struct* inode *inode,
465	int mask)
466	{
467	int ret;
468
469	/*
470	* Do the basic permission checks.
471	*/
472	ret = acl_permission_check(idmap, inode, mask);
473	if (ret != -EACCES)
474	return ret;
475
476	if (S_ISDIR(inode->i_mode)) {
477	/ DACs are overridable for directories /
478	if (!(mask & MAY_WRITE))
479	if (capable_wrt_inode_uidgid(idmap, inode,
480	CAP_DAC_READ_SEARCH))
481	return `0`;
482	if (capable_wrt_inode_uidgid(idmap, inode,
483	CAP_DAC_OVERRIDE))
484	return `0`;
485	return -EACCES;
486	}
487
488	/*
489	* Searching includes executable on directories, else just read.
490	*/
491	mask &= MAY_READ \| MAY_WRITE \| MAY_EXEC;
492	if (mask == MAY_READ)
493	if (capable_wrt_inode_uidgid(idmap, inode,
494	CAP_DAC_READ_SEARCH))
495	return `0`;
496	/*
497	* Read/write DACs are always overridable.
498	* Executable DACs are overridable when there is
499	* at least one exec bit set.
500	*/
501	if (!(mask & MAY_EXEC) \|\| (inode->i_mode & S_IXUGO))
502	if (capable_wrt_inode_uidgid(idmap, inode,
503	CAP_DAC_OVERRIDE))
504	return `0`;
505
506	return -EACCES;
507	}
508	EXPORT_SYMBOL(generic_permission);
509
510	/**
511	* do_inode_permission - UNIX permission checking
512	* @idmap: idmap of the mount the inode was found from
513	* @inode: inode to check permissions on
514	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
515	*
516	* We _really_ want to just do "generic_permission()" without
517	* even looking at the inode->i_op values. So we keep a cache
518	* flag in inode->i_opflags, that says "this has not special
519	* permission function, use the fast case".
520	*/
521	static inline int do_inode_permission(struct mnt_idmap *idmap,
522	struct inode inode, int* mask)
523	{
524	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
525	if (likely(inode->i_op->permission))
526	return inode->i_op->permission(idmap, inode, mask);
527
528	/ This gets set once for the inode lifetime /
529	spin_lock(lock: &inode->i_lock);
530	inode->i_opflags \|= IOP_FASTPERM;
531	spin_unlock(lock: &inode->i_lock);
532	}
533	return generic_permission(idmap, inode, mask);
534	}
535
536	/**
537	* sb_permission - Check superblock-level permissions
538	* @sb: Superblock of inode to check permission on
539	* @inode: Inode to check permission on
540	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
541	*
542	* Separate out file-system wide checks from inode-specific permission checks.
543	*/
544	static int sb_permission(struct super_block sb, struct* inode inode, int* mask)
545	{
546	if (unlikely(mask & MAY_WRITE)) {
547	umode_t mode = inode->i_mode;
548
549	/ Nobody gets write access to a read-only fs. /
550	if (sb_rdonly(sb) && (S_ISREG(mode) \|\| S_ISDIR(mode) \|\| S_ISLNK(mode)))
551	return -EROFS;
552	}
553	return `0`;
554	}
555
556	/**
557	* inode_permission - Check for access rights to a given inode
558	* @idmap: idmap of the mount the inode was found from
559	* @inode: Inode to check permission on
560	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
561	*
562	* Check for read/write/execute permissions on an inode. We use fs[ug]id for
563	* this, letting us set arbitrary permissions for filesystem access without
564	* changing the "normal" UIDs which are used for other things.
565	*
566	* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
567	*/
568	int inode_permission(struct mnt_idmap *idmap,
569	struct inode inode, int* mask)
570	{
571	int retval;
572
573	retval = sb_permission(sb: inode->i_sb, inode, mask);
574	if (unlikely(retval))
575	return retval;
576
577	if (unlikely(mask & MAY_WRITE)) {
578	/*
579	* Nobody gets write access to an immutable file.
580	*/
581	if (unlikely(IS_IMMUTABLE(inode)))
582	return -EPERM;
583
584	/*
585	* Updating mtime will likely cause i_uid and i_gid to be
586	* written back improperly if their true value is unknown
587	* to the vfs.
588	*/
589	if (unlikely(HAS_UNMAPPED_ID(idmap, inode)))
590	return -EACCES;
591	}
592
593	retval = do_inode_permission(idmap, inode, mask);
594	if (unlikely(retval))
595	return retval;
596
597	retval = devcgroup_inode_permission(inode, mask);
598	if (unlikely(retval))
599	return retval;
600
601	return security_inode_permission(inode, mask);
602	}
603	EXPORT_SYMBOL(inode_permission);
604
605	/**
606	* path_get - get a reference to a path
607	* @path: path to get the reference to
608	*
609	* Given a path increment the reference count to the dentry and the vfsmount.
610	*/
611	void path_get(const struct path *path)
612	{
613	mntget(mnt: path->mnt);
614	dget(dentry: path->dentry);
615	}
616	EXPORT_SYMBOL(path_get);
617
618	/**
619	* path_put - put a reference to a path
620	* @path: path to put the reference to
621	*
622	* Given a path decrement the reference count to the dentry and the vfsmount.
623	*/
624	void path_put(const struct path *path)
625	{
626	dput(path->dentry);
627	mntput(mnt: path->mnt);
628	}
629	EXPORT_SYMBOL(path_put);
630
631	#define EMBEDDED_LEVELS 2
632	struct nameidata {
633	struct path path;
634	struct qstr last;
635	struct path root;
636	struct inode inode; /* path.dentry.d_inode /
637	unsigned int flags, state;
638	unsigned seq, next_seq, m_seq, r_seq;
639	int last_type;
640	unsigned depth;
641	int total_link_count;
642	struct saved {
643	struct path link;
644	struct delayed_call done;
645	const char *name;
646	unsigned seq;
647	} *stack, internal[EMBEDDED_LEVELS];
648	struct filename *name;
649	const char *pathname;
650	struct nameidata *saved;
651	unsigned root_seq;
652	int dfd;
653	vfsuid_t dir_vfsuid;
654	umode_t dir_mode;
655	} __randomize_layout;
656
/* Bits for nameidata->state. */
#define ND_ROOT_PRESET 1	/* ->root was supplied by the caller */
#define ND_ROOT_GRABBED 2	/* a reference on ->root must be dropped */
#define ND_JUMPED 4		/* the walk jumped (to root or via a link) */
660
661	static void __set_nameidata(struct nameidata p, int* dfd, struct filename *name)
662	{
663	struct nameidata *old = current->nameidata;
664	p->stack = p->internal;
665	p->depth = `0`;
666	p->dfd = dfd;
667	p->name = name;
668	p->pathname = likely(name) ? name->name : "";
669	p->path.mnt = NULL;
670	p->path.dentry = NULL;
671	p->total_link_count = old ? old->total_link_count : `0`;
672	p->saved = old;
673	current->nameidata = p;
674	}
675
676	static inline void set_nameidata(struct nameidata p, int* dfd, struct filename *name,
677	const struct path *root)
678	{
679	__set_nameidata(p, dfd, name);
680	p->state = `0`;
681	if (unlikely(root)) {
682	p->state = ND_ROOT_PRESET;
683	p->root = *root;
684	}
685	}
686
687	static void restore_nameidata(void)
688	{
689	struct nameidata now = current->nameidata, old = now->saved;
690
691	current->nameidata = old;
692	if (old)
693	old->total_link_count = now->total_link_count;
694	if (now->stack != now->internal)
695	kfree(objp: now->stack);
696	}
697
698	static bool nd_alloc_stack(struct nameidata *nd)
699	{
700	struct saved *p;
701
702	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
703	nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
704	if (unlikely(!p))
705	return false;
706	memcpy(p, nd->internal, sizeof(nd->internal));
707	nd->stack = p;
708	return true;
709	}
710
711	/**
712	* path_connected - Verify that a dentry is below mnt.mnt_root
713	* @mnt: The mountpoint to check.
714	* @dentry: The dentry to check.
715	*
716	* Rename can sometimes move a file or directory outside of a bind
717	* mount, path_connected allows those cases to be detected.
718	*/
719	static bool path_connected(struct vfsmount mnt, struct* dentry *dentry)
720	{
721	struct super_block *sb = mnt->mnt_sb;
722
723	/ Bind mounts can have disconnected paths /
724	if (mnt->mnt_root == sb->s_root)
725	return true;
726
727	return is_subdir(dentry, mnt->mnt_root);
728	}
729
730	static void drop_links(struct nameidata *nd)
731	{
732	int i = nd->depth;
733	while (i--) {
734	struct saved *last = nd->stack + i;
735	do_delayed_call(call: &last->done);
736	clear_delayed_call(call: &last->done);
737	}
738	}
739
740	static void leave_rcu(struct nameidata *nd)
741	{
742	nd->flags &= ~LOOKUP_RCU;
743	nd->seq = nd->next_seq = `0`;
744	rcu_read_unlock();
745	}
746
747	static void terminate_walk(struct nameidata *nd)
748	{
749	drop_links(nd);
750	if (!(nd->flags & LOOKUP_RCU)) {
751	int i;
752	path_put(&nd->path);
753	for (i = `0`; i < nd->depth; i++)
754	path_put(&nd->stack[i].link);
755	if (nd->state & ND_ROOT_GRABBED) {
756	path_put(&nd->root);
757	nd->state &= ~ND_ROOT_GRABBED;
758	}
759	} else {
760	leave_rcu(nd);
761	}
762	nd->depth = `0`;
763	nd->path.mnt = NULL;
764	nd->path.dentry = NULL;
765	}
766
767	/ path_put is needed afterwards regardless of success or failure /
768	static bool __legitimize_path(struct path path, unsigned* seq, unsigned mseq)
769	{
770	int res = __legitimize_mnt(path->mnt, mseq);
771	if (unlikely(res)) {
772	if (res > `0`)
773	path->mnt = NULL;
774	path->dentry = NULL;
775	return false;
776	}
777	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
778	path->dentry = NULL;
779	return false;
780	}
781	return !read_seqcount_retry(&path->dentry->d_seq, seq);
782	}
783
784	static inline bool legitimize_path(struct nameidata *nd,
785	struct path path, unsigned* seq)
786	{
787	return __legitimize_path(path, seq, mseq: nd->m_seq);
788	}
789
790	static bool legitimize_links(struct nameidata *nd)
791	{
792	int i;
793	if (unlikely(nd->flags & LOOKUP_CACHED)) {
794	drop_links(nd);
795	nd->depth = `0`;
796	return false;
797	}
798	for (i = `0`; i < nd->depth; i++) {
799	struct saved *last = nd->stack + i;
800	if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
801	drop_links(nd);
802	nd->depth = i + `1`;
803	return false;
804	}
805	}
806	return true;
807	}
808
809	static bool legitimize_root(struct nameidata *nd)
810	{
811	/ Nothing to do if nd->root is zero or is managed by the VFS user. /
812	if (!nd->root.mnt \|\| (nd->state & ND_ROOT_PRESET))
813	return true;
814	nd->state \|= ND_ROOT_GRABBED;
815	return legitimize_path(nd, path: &nd->root, seq: nd->root_seq);
816	}
817
818	/*
819	* Path walking has 2 modes, rcu-walk and ref-walk (see
820	* Documentation/filesystems/path-lookup.txt). In situations when we can't
821	* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
822	* normal reference counts on dentries and vfsmounts to transition to ref-walk
823	* mode. Refcounts are grabbed at the last known good point before rcu-walk
824	* got stuck, so ref-walk may continue from there. If this is not successful
825	* (eg. a seqcount has changed), then failure is returned and it's up to caller
826	* to restart the path walk from the beginning in ref-walk mode.
827	*/
828
829	/**
830	* try_to_unlazy - try to switch to ref-walk mode.
831	* @nd: nameidata pathwalk data
832	* Returns: true on success, false on failure
833	*
834	* try_to_unlazy attempts to legitimize the current nd->path and nd->root
835	* for ref-walk mode.
836	* Must be called from rcu-walk context.
837	* Nothing should touch nameidata between try_to_unlazy() failure and
838	* terminate_walk().
839	*/
840	static bool try_to_unlazy(struct nameidata *nd)
841	{
842	struct dentry *parent = nd->path.dentry;
843
844	BUG_ON(!(nd->flags & LOOKUP_RCU));
845
846	if (unlikely(!legitimize_links(nd)))
847	goto out1;
848	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
849	goto out;
850	if (unlikely(!legitimize_root(nd)))
851	goto out;
852	leave_rcu(nd);
853	BUG_ON(nd->inode != parent->d_inode);
854	return true;
855
856	out1:
857	nd->path.mnt = NULL;
858	nd->path.dentry = NULL;
859	out:
860	leave_rcu(nd);
861	return false;
862	}
863
864	/**
865	* try_to_unlazy_next - try to switch to ref-walk mode.
866	* @nd: nameidata pathwalk data
867	* @dentry: next dentry to step into
868	* Returns: true on success, false on failure
869	*
870	* Similar to try_to_unlazy(), but here we have the next dentry already
871	* picked by rcu-walk and want to legitimize that in addition to the current
872	* nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
873	* Nothing should touch nameidata between try_to_unlazy_next() failure and
874	* terminate_walk().
875	*/
876	static bool try_to_unlazy_next(struct nameidata nd, struct* dentry *dentry)
877	{
878	int res;
879	BUG_ON(!(nd->flags & LOOKUP_RCU));
880
881	if (unlikely(!legitimize_links(nd)))
882	goto out2;
883	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
884	if (unlikely(res)) {
885	if (res > `0`)
886	goto out2;
887	goto out1;
888	}
889	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
890	goto out1;
891
892	/*
893	* We need to move both the parent and the dentry from the RCU domain
894	* to be properly refcounted. And the sequence number in the dentry
895	* validates both dentry counters, since we checked the sequence
896	* number of the parent after we got the child sequence number. So we
897	* know the parent must still be valid if the child sequence number is
898	*/
899	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
900	goto out;
901	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
902	goto out_dput;
903	/*
904	* Sequence counts matched. Now make sure that the root is
905	* still valid and get it if required.
906	*/
907	if (unlikely(!legitimize_root(nd)))
908	goto out_dput;
909	leave_rcu(nd);
910	return true;
911
912	out2:
913	nd->path.mnt = NULL;
914	out1:
915	nd->path.dentry = NULL;
916	out:
917	leave_rcu(nd);
918	return false;
919	out_dput:
920	leave_rcu(nd);
921	dput(dentry);
922	return false;
923	}
924
925	static inline int d_revalidate(struct inode dir, const* struct qstr *name,
926	struct dentry dentry, unsigned* int flags)
927	{
928	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
929	return dentry->d_op->d_revalidate(dir, name, dentry, flags);
930	else
931	return `1`;
932	}
933
934	/**
935	* complete_walk - successful completion of path walk
936	* @nd: pointer nameidata
937	*
938	* If we had been in RCU mode, drop out of it and legitimize nd->path.
939	* Revalidate the final result, unless we'd already done that during
940	* the path walk or the filesystem doesn't ask for it. Return 0 on
941	* success, -error on failure. In case of failure caller does not
942	* need to drop nd->path.
943	*/
944	static int complete_walk(struct nameidata *nd)
945	{
946	struct dentry *dentry = nd->path.dentry;
947	int status;
948
949	if (nd->flags & LOOKUP_RCU) {
950	/*
951	* We don't want to zero nd->root for scoped-lookups or
952	* externally-managed nd->root.
953	*/
954	if (!(nd->state & ND_ROOT_PRESET))
955	if (!(nd->flags & LOOKUP_IS_SCOPED))
956	nd->root.mnt = NULL;
957	nd->flags &= ~LOOKUP_CACHED;
958	if (!try_to_unlazy(nd))
959	return -ECHILD;
960	}
961
962	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
963	/*
964	* While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
965	* ever step outside the root during lookup" and should already
966	* be guaranteed by the rest of namei, we want to avoid a namei
967	* BUG resulting in userspace being given a path that was not
968	* scoped within the root at some point during the lookup.
969	*
970	* So, do a final sanity-check to make sure that in the
971	* worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
972	* we won't silently return an fd completely outside of the
973	* requested root to userspace.
974	*
975	* Userspace could move the path outside the root after this
976	* check, but as discussed elsewhere this is not a concern (the
977	* resolved file was inside the root at some point).
978	*/
979	if (!path_is_under(&nd->path, &nd->root))
980	return -EXDEV;
981	}
982
983	if (likely(!(nd->state & ND_JUMPED)))
984	return `0`;
985
986	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
987	return `0`;
988
989	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
990	if (status > `0`)
991	return `0`;
992
993	if (!status)
994	status = -ESTALE;
995
996	return status;
997	}
998
999	static int set_root(struct nameidata *nd)
1000	{
1001	struct fs_struct *fs = current->fs;
1002
1003	/*
1004	* Jumping to the real root in a scoped-lookup is a BUG in namei, but we
1005	* still have to ensure it doesn't happen because it will cause a breakout
1006	* from the dirfd.
1007	*/
1008	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
1009	return -ENOTRECOVERABLE;
1010
1011	if (nd->flags & LOOKUP_RCU) {
1012	unsigned seq;
1013
1014	do {
1015	seq = read_seqcount_begin(&fs->seq);
1016	nd->root = fs->root;
1017	nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
1018	} while (read_seqcount_retry(&fs->seq, seq));
1019	} else {
1020	get_fs_root(fs, root: &nd->root);
1021	nd->state \|= ND_ROOT_GRABBED;
1022	}
1023	return `0`;
1024	}
1025
1026	static int nd_jump_root(struct nameidata *nd)
1027	{
1028	if (unlikely(nd->flags & LOOKUP_BENEATH))
1029	return -EXDEV;
1030	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
1031	/ Absolute path arguments to path_init() are allowed. /
1032	if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
1033	return -EXDEV;
1034	}
1035	if (!nd->root.mnt) {
1036	int error = set_root(nd);
1037	if (error)
1038	return error;
1039	}
1040	if (nd->flags & LOOKUP_RCU) {
1041	struct dentry *d;
1042	nd->path = nd->root;
1043	d = nd->path.dentry;
1044	nd->inode = d->d_inode;
1045	nd->seq = nd->root_seq;
1046	if (read_seqcount_retry(&d->d_seq, nd->seq))
1047	return -ECHILD;
1048	} else {
1049	path_put(&nd->path);
1050	nd->path = nd->root;
1051	path_get(&nd->path);
1052	nd->inode = nd->path.dentry->d_inode;
1053	}
1054	nd->state \|= ND_JUMPED;
1055	return `0`;
1056	}
1057
1058	/*
1059	* Helper to directly jump to a known parsed path from ->get_link,
1060	* caller must have taken a reference to path beforehand.
1061	*/
1062	int nd_jump_link(const struct path *path)
1063	{
1064	int error = -ELOOP;
1065	struct nameidata *nd = current->nameidata;
1066
1067	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
1068	goto err;
1069
1070	error = -EXDEV;
1071	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
1072	if (nd->path.mnt != path->mnt)
1073	goto err;
1074	}
1075	/ Not currently safe for scoped-lookups. /
1076	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
1077	goto err;
1078
1079	path_put(&nd->path);
1080	nd->path = *path;
1081	nd->inode = nd->path.dentry->d_inode;
1082	nd->state \|= ND_JUMPED;
1083	return `0`;
1084
1085	err:
1086	path_put(path);
1087	return error;
1088	}
1089
1090	static inline void put_link(struct nameidata *nd)
1091	{
1092	struct saved *last = nd->stack + --nd->depth;
1093	do_delayed_call(call: &last->done);
1094	if (!(nd->flags & LOOKUP_RCU))
1095	path_put(&last->link);
1096	}
1097
1098	static int sysctl_protected_symlinks __read_mostly;
1099	static int sysctl_protected_hardlinks __read_mostly;
1100	static int sysctl_protected_fifos __read_mostly;
1101	static int sysctl_protected_regular __read_mostly;
1102
1103	#ifdef CONFIG_SYSCTL
1104	static const struct ctl_table namei_sysctls[] = {
1105	{
1106	.procname = "protected_symlinks",
1107	.data = &sysctl_protected_symlinks,
1108	.maxlen = sizeof(int),
1109	.mode = `0644`,
1110	.proc_handler = proc_dointvec_minmax,
1111	.extra1 = SYSCTL_ZERO,
1112	.extra2 = SYSCTL_ONE,
1113	},
1114	{
1115	.procname = "protected_hardlinks",
1116	.data = &sysctl_protected_hardlinks,
1117	.maxlen = sizeof(int),
1118	.mode = `0644`,
1119	.proc_handler = proc_dointvec_minmax,
1120	.extra1 = SYSCTL_ZERO,
1121	.extra2 = SYSCTL_ONE,
1122	},
1123	{
1124	.procname = "protected_fifos",
1125	.data = &sysctl_protected_fifos,
1126	.maxlen = sizeof(int),
1127	.mode = `0644`,
1128	.proc_handler = proc_dointvec_minmax,
1129	.extra1 = SYSCTL_ZERO,
1130	.extra2 = SYSCTL_TWO,
1131	},
1132	{
1133	.procname = "protected_regular",
1134	.data = &sysctl_protected_regular,
1135	.maxlen = sizeof(int),
1136	.mode = `0644`,
1137	.proc_handler = proc_dointvec_minmax,
1138	.extra1 = SYSCTL_ZERO,
1139	.extra2 = SYSCTL_TWO,
1140	},
1141	};
1142
1143	static int __init init_fs_namei_sysctls(void)
1144	{
1145	register_sysctl_init("fs", namei_sysctls);
1146	return `0`;
1147	}
1148	fs_initcall(init_fs_namei_sysctls);
1149
1150	#endif /* CONFIG_SYSCTL */
1151
1152	/**
1153	* may_follow_link - Check symlink following for unsafe situations
1154	* @nd: nameidata pathwalk data
1155	* @inode: Used for idmapping.
1156	*
1157	* In the case of the sysctl_protected_symlinks sysctl being enabled,
1158	* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1159	* in a sticky world-writable directory. This is to protect privileged
1160	* processes from failing races against path names that may change out
1161	* from under them by way of other users creating malicious symlinks.
1162	* It will permit symlinks to be followed only when outside a sticky
1163	* world-writable directory, or when the uid of the symlink and follower
1164	* match, or when the directory owner matches the symlink's owner.
1165	*
1166	* Returns 0 if following the symlink is allowed, -ve on error.
1167	*/
1168	static inline int may_follow_link(struct nameidata nd, const* struct inode *inode)
1169	{
1170	struct mnt_idmap *idmap;
1171	vfsuid_t vfsuid;
1172
1173	if (!sysctl_protected_symlinks)
1174	return `0`;
1175
1176	idmap = mnt_idmap(mnt: nd->path.mnt);
1177	vfsuid = i_uid_into_vfsuid(idmap, inode);
1178	/ Allowed if owner and follower match. /
1179	if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
1180	return `0`;
1181
1182	/ Allowed if parent directory not sticky and world-writable. /
1183	if ((nd->dir_mode & (S_ISVTX\|S_IWOTH)) != (S_ISVTX\|S_IWOTH))
1184	return `0`;
1185
1186	/ Allowed if parent directory and link owner match. /
1187	if (vfsuid_valid(uid: nd->dir_vfsuid) && vfsuid_eq(left: nd->dir_vfsuid, right: vfsuid))
1188	return `0`;
1189
1190	if (nd->flags & LOOKUP_RCU)
1191	return -ECHILD;
1192
1193	audit_inode(name: nd->name, dentry: nd->stack[`0`].link.dentry, aflags: `0`);
1194	audit_log_path_denied(AUDIT_ANOM_LINK, operation: "follow_link");
1195	return -EACCES;
1196	}
1197
1198	/**
1199	* safe_hardlink_source - Check for safe hardlink conditions
1200	* @idmap: idmap of the mount the inode was found from
1201	* @inode: the source inode to hardlink from
1202	*
1203	* Return false if at least one of the following conditions:
1204	* - inode is not a regular file
1205	* - inode is setuid
1206	* - inode is setgid and group-exec
1207	* - access failure for read and write
1208	*
1209	* Otherwise returns true.
1210	*/
1211	static bool safe_hardlink_source(struct mnt_idmap *idmap,
1212	struct inode *inode)
1213	{
1214	umode_t mode = inode->i_mode;
1215
1216	/ Special files should not get pinned to the filesystem. /
1217	if (!S_ISREG(mode))
1218	return false;
1219
1220	/ Setuid files should not get pinned to the filesystem. /
1221	if (mode & S_ISUID)
1222	return false;
1223
1224	/ Executable setgid files should not get pinned to the filesystem. /
1225	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP))
1226	return false;
1227
1228	/ Hardlinking to unreadable or unwritable sources is dangerous. /
1229	if (inode_permission(idmap, inode, MAY_READ \| MAY_WRITE))
1230	return false;
1231
1232	return true;
1233	}
1234
1235	/**
1236	* may_linkat - Check permissions for creating a hardlink
1237	* @idmap: idmap of the mount the inode was found from
1238	* @link: the source to hardlink from
1239	*
1240	* Block hardlink when all of:
1241	* - sysctl_protected_hardlinks enabled
1242	* - fsuid does not match inode
1243	* - hardlink source is unsafe (see safe_hardlink_source() above)
1244	* - not CAP_FOWNER in a namespace with the inode owner uid mapped
1245	*
1246	* If the inode has been found through an idmapped mount the idmap of
1247	* the vfsmount must be passed through @idmap. This function will then take
1248	* care to map the inode according to @idmap before checking permissions.
1249	* On non-idmapped mounts or if permission checking is to be performed on the
1250	* raw inode simply pass @nop_mnt_idmap.
1251	*
1252	* Returns 0 if successful, -ve on error.
1253	*/
1254	int may_linkat(struct mnt_idmap idmap, const* struct path *link)
1255	{
1256	struct inode *inode = link->dentry->d_inode;
1257
1258	/ Inode writeback is not safe when the uid or gid are invalid. /
1259	if (!vfsuid_valid(uid: i_uid_into_vfsuid(idmap, inode)) \|\|
1260	!vfsgid_valid(gid: i_gid_into_vfsgid(idmap, inode)))
1261	return -EOVERFLOW;
1262
1263	if (!sysctl_protected_hardlinks)
1264	return `0`;
1265
1266	/ Source inode owner (or CAP_FOWNER) can hardlink all they like,*
1267	* otherwise, it must be a safe source.
1268	*/
1269	if (safe_hardlink_source(idmap, inode) \|\|
1270	inode_owner_or_capable(idmap, inode))
1271	return `0`;
1272
1273	audit_log_path_denied(AUDIT_ANOM_LINK, operation: "linkat");
1274	return -EPERM;
1275	}
1276
1277	/**
1278	* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1279	* should be allowed, or not, on files that already
1280	* exist.
1281	* @idmap: idmap of the mount the inode was found from
1282	* @nd: nameidata pathwalk data
1283	* @inode: the inode of the file to open
1284	*
1285	* Block an O_CREAT open of a FIFO (or a regular file) when:
1286	* - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1287	* - the file already exists
1288	* - we are in a sticky directory
1289	* - we don't own the file
1290	* - the owner of the directory doesn't own the file
1291	* - the directory is world writable
1292	* If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1293	* the directory doesn't have to be world writable: being group writable will
1294	* be enough.
1295	*
1296	* If the inode has been found through an idmapped mount the idmap of
1297	* the vfsmount must be passed through @idmap. This function will then take
1298	* care to map the inode according to @idmap before checking permissions.
1299	* On non-idmapped mounts or if permission checking is to be performed on the
1300	* raw inode simply pass @nop_mnt_idmap.
1301	*
1302	* Returns 0 if the open is allowed, -ve on error.
1303	*/
1304	static int may_create_in_sticky(struct mnt_idmap idmap, struct* nameidata *nd,
1305	struct inode *const inode)
1306	{
1307	umode_t dir_mode = nd->dir_mode;
1308	vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid;
1309
1310	if (likely(!(dir_mode & S_ISVTX)))
1311	return `0`;
1312
1313	if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
1314	return `0`;
1315
1316	if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
1317	return `0`;
1318
1319	i_vfsuid = i_uid_into_vfsuid(idmap, inode);
1320
1321	if (vfsuid_eq(left: i_vfsuid, right: dir_vfsuid))
1322	return `0`;
1323
1324	if (vfsuid_eq_kuid(vfsuid: i_vfsuid, current_fsuid()))
1325	return `0`;
1326
1327	if (likely(dir_mode & `0002`)) {
1328	audit_log_path_denied(AUDIT_ANOM_CREAT, operation: "sticky_create");
1329	return -EACCES;
1330	}
1331
1332	if (dir_mode & `0020`) {
1333	if (sysctl_protected_fifos >= `2` && S_ISFIFO(inode->i_mode)) {
1334	audit_log_path_denied(AUDIT_ANOM_CREAT,
1335	operation: "sticky_create_fifo");
1336	return -EACCES;
1337	}
1338
1339	if (sysctl_protected_regular >= `2` && S_ISREG(inode->i_mode)) {
1340	audit_log_path_denied(AUDIT_ANOM_CREAT,
1341	operation: "sticky_create_regular");
1342	return -EACCES;
1343	}
1344	}
1345
1346	return `0`;
1347	}
1348
1349	/*
1350	* follow_up - Find the mountpoint of path's vfsmount
1351	*
1352	* Given a path, find the mountpoint of its source file system.
1353	* Replace @path with the path of the mountpoint in the parent mount.
1354	* Up is towards /.
1355	*
1356	* Return 1 if we went up a level and 0 if we were already at the
1357	* root.
1358	*/
1359	int follow_up(struct path *path)
1360	{
1361	struct mount *mnt = real_mount(mnt: path->mnt);
1362	struct mount *parent;
1363	struct dentry *mountpoint;
1364
1365	read_seqlock_excl(sl: &mount_lock);
1366	parent = mnt->mnt_parent;
1367	if (parent == mnt) {
1368	read_sequnlock_excl(sl: &mount_lock);
1369	return `0`;
1370	}
1371	mntget(mnt: &parent->mnt);
1372	mountpoint = dget(dentry: mnt->mnt_mountpoint);
1373	read_sequnlock_excl(sl: &mount_lock);
1374	dput(path->dentry);
1375	path->dentry = mountpoint;
1376	mntput(mnt: path->mnt);
1377	path->mnt = &parent->mnt;
1378	return `1`;
1379	}
1380	EXPORT_SYMBOL(follow_up);
1381
1382	static bool choose_mountpoint_rcu(struct mount m, const* struct path *root,
1383	struct path path, unsigned* *seqp)
1384	{
1385	while (mnt_has_parent(mnt: m)) {
1386	struct dentry *mountpoint = m->mnt_mountpoint;
1387
1388	m = m->mnt_parent;
1389	if (unlikely(root->dentry == mountpoint &&
1390	root->mnt == &m->mnt))
1391	break;
1392	if (mountpoint != m->mnt.mnt_root) {
1393	path->mnt = &m->mnt;
1394	path->dentry = mountpoint;
1395	*seqp = read_seqcount_begin(&mountpoint->d_seq);
1396	return true;
1397	}
1398	}
1399	return false;
1400	}
1401
1402	static bool choose_mountpoint(struct mount m, const* struct path *root,
1403	struct path *path)
1404	{
1405	bool found;
1406
1407	rcu_read_lock();
1408	while (`1`) {
1409	unsigned seq, mseq = read_seqbegin(sl: &mount_lock);
1410
1411	found = choose_mountpoint_rcu(m, root, path, seqp: &seq);
1412	if (unlikely(!found)) {
1413	if (!read_seqretry(sl: &mount_lock, start: mseq))
1414	break;
1415	} else {
1416	if (likely(__legitimize_path(path, seq, mseq)))
1417	break;
1418	rcu_read_unlock();
1419	path_put(path);
1420	rcu_read_lock();
1421	}
1422	}
1423	rcu_read_unlock();
1424	return found;
1425	}
1426
1427	/*
1428	* Perform an automount
1429	* - return -EISDIR to tell follow_managed() to stop and return the path we
1430	* were called with.
1431	*/
1432	static int follow_automount(struct path path, int* count, unsigned* lookup_flags)
1433	{
1434	struct dentry *dentry = path->dentry;
1435
1436	/ We don't want to mount if someone's just doing a stat -*
1437	* unless they're stat'ing a directory and appended a '/' to
1438	* the name.
1439	*
1440	* We do, however, want to mount if someone wants to open or
1441	* create a file of any type under the mountpoint, wants to
1442	* traverse through the mountpoint or wants to open the
1443	* mounted directory. Also, autofs may mark negative dentries
1444	* as being automount points. These will need the attentions
1445	* of the daemon to instantiate them before they can be used.
1446	*/
1447	if (!(lookup_flags & (LOOKUP_PARENT \| LOOKUP_DIRECTORY \|
1448	LOOKUP_OPEN \| LOOKUP_CREATE \| LOOKUP_AUTOMOUNT)) &&
1449	dentry->d_inode)
1450	return -EISDIR;
1451
1452	if (count && (*count)++ >= MAXSYMLINKS)
1453	return -ELOOP;
1454
1455	return finish_automount(dentry->d_op->d_automount(path), path);
1456	}
1457
1458	/*
1459	* mount traversal - out-of-line part. One note on ->d_flags accesses -
1460	* dentries are pinned but not locked here, so negative dentry can go
1461	* positive right under us. Use of smp_load_acquire() provides a barrier
1462	* sufficient for ->d_inode and ->d_flags consistency.
1463	*/
1464	static int __traverse_mounts(struct path path, unsigned* flags, bool *jumped,
1465	int count, unsigned* lookup_flags)
1466	{
1467	struct vfsmount *mnt = path->mnt;
1468	bool need_mntput = false;
1469	int ret = `0`;
1470
1471	while (flags & DCACHE_MANAGED_DENTRY) {
1472	/ Allow the filesystem to manage the transit without i_mutex*
1473	* being held. */
1474	if (flags & DCACHE_MANAGE_TRANSIT) {
1475	ret = path->dentry->d_op->d_manage(path, false);
1476	flags = smp_load_acquire(&path->dentry->d_flags);
1477	if (ret < `0`)
1478	break;
1479	}
1480
1481	if (flags & DCACHE_MOUNTED) { // something's mounted on it..
1482	struct vfsmount *mounted = lookup_mnt(path);
1483	if (mounted) { // ... in our namespace
1484	dput(path->dentry);
1485	if (need_mntput)
1486	mntput(mnt: path->mnt);
1487	path->mnt = mounted;
1488	path->dentry = dget(dentry: mounted->mnt_root);
1489	// here we know it's positive
1490	flags = path->dentry->d_flags;
1491	need_mntput = true;
1492	continue;
1493	}
1494	}
1495
1496	if (!(flags & DCACHE_NEED_AUTOMOUNT))
1497	break;
1498
1499	// uncovered automount point
1500	ret = follow_automount(path, count, lookup_flags);
1501	flags = smp_load_acquire(&path->dentry->d_flags);
1502	if (ret < `0`)
1503	break;
1504	}
1505
1506	if (ret == -EISDIR)
1507	ret = `0`;
1508	// possible if you race with several mount --move
1509	if (need_mntput && path->mnt == mnt)
1510	mntput(mnt: path->mnt);
1511	if (!ret && unlikely(d_flags_negative(flags)))
1512	ret = -ENOENT;
1513	*jumped = need_mntput;
1514	return ret;
1515	}
1516
1517	static inline int traverse_mounts(struct path path, bool jumped,
1518	int count, unsigned* lookup_flags)
1519	{
1520	unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1521
1522	/ fastpath /
1523	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1524	*jumped = false;
1525	if (unlikely(d_flags_negative(flags)))
1526	return -ENOENT;
1527	return `0`;
1528	}
1529	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1530	}
1531
1532	int follow_down_one(struct path *path)
1533	{
1534	struct vfsmount *mounted;
1535
1536	mounted = lookup_mnt(path);
1537	if (mounted) {
1538	dput(path->dentry);
1539	mntput(mnt: path->mnt);
1540	path->mnt = mounted;
1541	path->dentry = dget(dentry: mounted->mnt_root);
1542	return `1`;
1543	}
1544	return `0`;
1545	}
1546	EXPORT_SYMBOL(follow_down_one);
1547
1548	/*
1549	* Follow down to the covering mount currently visible to userspace. At each
1550	* point, the filesystem owning that dentry may be queried as to whether the
1551	* caller is permitted to proceed or not.
1552	*/
1553	int follow_down(struct path path, unsigned* int flags)
1554	{
1555	struct vfsmount *mnt = path->mnt;
1556	bool jumped;
1557	int ret = traverse_mounts(path, jumped: &jumped, NULL, lookup_flags: flags);
1558
1559	if (path->mnt != mnt)
1560	mntput(mnt);
1561	return ret;
1562	}
1563	EXPORT_SYMBOL(follow_down);
1564
1565	/*
1566	* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
1567	* we meet a managed dentry that would need blocking.
1568	*/
1569	static bool __follow_mount_rcu(struct nameidata nd, struct* path *path)
1570	{
1571	struct dentry *dentry = path->dentry;
1572	unsigned int flags = dentry->d_flags;
1573
1574	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1575	return true;
1576
1577	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1578	return false;
1579
1580	for (;;) {
1581	/*
1582	* Don't forget we might have a non-mountpoint managed dentry
1583	* that wants to block transit.
1584	*/
1585	if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1586	int res = dentry->d_op->d_manage(path, true);
1587	if (res)
1588	return res == -EISDIR;
1589	flags = dentry->d_flags;
1590	}
1591
1592	if (flags & DCACHE_MOUNTED) {
1593	struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1594	if (mounted) {
1595	path->mnt = &mounted->mnt;
1596	dentry = path->dentry = mounted->mnt.mnt_root;
1597	nd->state \|= ND_JUMPED;
1598	nd->next_seq = read_seqcount_begin(&dentry->d_seq);
1599	flags = dentry->d_flags;
1600	// makes sure that non-RCU pathwalk could reach
1601	// this state.
1602	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1603	return false;
1604	continue;
1605	}
1606	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1607	return false;
1608	}
1609	return !(flags & DCACHE_NEED_AUTOMOUNT);
1610	}
1611	}
1612
1613	static inline int handle_mounts(struct nameidata nd, struct* dentry *dentry,
1614	struct path *path)
1615	{
1616	bool jumped;
1617	int ret;
1618
1619	path->mnt = nd->path.mnt;
1620	path->dentry = dentry;
1621	if (nd->flags & LOOKUP_RCU) {
1622	unsigned int seq = nd->next_seq;
1623	if (likely(__follow_mount_rcu(nd, path)))
1624	return `0`;
1625	// path and nd->next_seq might've been clobbered*
1626	path->mnt = nd->path.mnt;
1627	path->dentry = dentry;
1628	nd->next_seq = seq;
1629	if (!try_to_unlazy_next(nd, dentry))
1630	return -ECHILD;
1631	}
1632	ret = traverse_mounts(path, jumped: &jumped, count: &nd->total_link_count, lookup_flags: nd->flags);
1633	if (jumped) {
1634	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1635	ret = -EXDEV;
1636	else
1637	nd->state \|= ND_JUMPED;
1638	}
1639	if (unlikely(ret)) {
1640	dput(path->dentry);
1641	if (path->mnt != nd->path.mnt)
1642	mntput(mnt: path->mnt);
1643	}
1644	return ret;
1645	}
1646
1647	/*
1648	* This looks up the name in dcache and possibly revalidates the found dentry.
1649	* NULL is returned if the dentry does not exist in the cache.
1650	*/
1651	static struct dentry lookup_dcache(const* struct qstr *name,
1652	struct dentry *dir,
1653	unsigned int flags)
1654	{
1655	struct dentry *dentry = d_lookup(dir, name);
1656	if (dentry) {
1657	int error = d_revalidate(dir: dir->d_inode, name, dentry, flags);
1658	if (unlikely(error <= `0`)) {
1659	if (!error)
1660	d_invalidate(dentry);
1661	dput(dentry);
1662	return ERR_PTR(error);
1663	}
1664	}
1665	return dentry;
1666	}
1667
1668	static struct dentry lookup_one_qstr_excl_raw(const* struct qstr *name,
1669	struct dentry *base,
1670	unsigned int flags)
1671	{
1672	struct dentry *dentry;
1673	struct dentry *old;
1674	struct inode *dir;
1675
1676	dentry = lookup_dcache(name, dir: base, flags);
1677	if (dentry)
1678	return dentry;
1679
1680	/ Don't create child dentry for a dead directory. /
1681	dir = base->d_inode;
1682	if (unlikely(IS_DEADDIR(dir)))
1683	return ERR_PTR(error: -ENOENT);
1684
1685	dentry = d_alloc(base, name);
1686	if (unlikely(!dentry))
1687	return ERR_PTR(error: -ENOMEM);
1688
1689	old = dir->i_op->lookup(dir, dentry, flags);
1690	if (unlikely(old)) {
1691	dput(dentry);
1692	dentry = old;
1693	}
1694	return dentry;
1695	}
1696
1697	/*
1698	* Parent directory has inode locked exclusive. This is one
1699	* and only case when ->lookup() gets called on non in-lookup
1700	* dentries - as the matter of fact, this only gets called
1701	* when directory is guaranteed to have no in-lookup children
1702	* at all.
1703	* Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
1704	* Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
1705	*/
1706	struct dentry lookup_one_qstr_excl(const* struct qstr *name,
1707	struct dentry base, unsigned* int flags)
1708	{
1709	struct dentry *dentry;
1710
1711	dentry = lookup_one_qstr_excl_raw(name, base, flags);
1712	if (IS_ERR(ptr: dentry))
1713	return dentry;
1714	if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
1715	dput(dentry);
1716	return ERR_PTR(error: -ENOENT);
1717	}
1718	if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
1719	dput(dentry);
1720	return ERR_PTR(error: -EEXIST);
1721	}
1722	return dentry;
1723	}
1724	EXPORT_SYMBOL(lookup_one_qstr_excl);
1725
1726	/**
1727	* lookup_fast - do fast lockless (but racy) lookup of a dentry
1728	* @nd: current nameidata
1729	*
1730	* Do a fast, but racy lookup in the dcache for the given dentry, and
1731	* revalidate it. Returns a valid dentry pointer or NULL if one wasn't
1732	* found. On error, an ERR_PTR will be returned.
1733	*
1734	* If this function returns a valid dentry and the walk is no longer
1735	* lazy, the dentry will carry a reference that must later be put. If
1736	* RCU mode is still in force, then this is not the case and the dentry
1737	* must be legitimized before use. If this returns NULL, then the walk
1738	* will no longer be in RCU mode.
1739	*/
1740	static struct dentry lookup_fast(struct* nameidata *nd)
1741	{
1742	struct dentry dentry, parent = nd->path.dentry;
1743	int status = `1`;
1744
1745	/*
1746	* Rename seqlock is not required here because in the off chance
1747	* of a false negative due to a concurrent rename, the caller is
1748	* going to fall back to non-racy lookup.
1749	*/
1750	if (nd->flags & LOOKUP_RCU) {
1751	dentry = __d_lookup_rcu(parent, name: &nd->last, seq: &nd->next_seq);
1752	if (unlikely(!dentry)) {
1753	if (!try_to_unlazy(nd))
1754	return ERR_PTR(error: -ECHILD);
1755	return NULL;
1756	}
1757
1758	/*
1759	* This sequence count validates that the parent had no
1760	* changes while we did the lookup of the dentry above.
1761	*/
1762	if (read_seqcount_retry(&parent->d_seq, nd->seq))
1763	return ERR_PTR(error: -ECHILD);
1764
1765	status = d_revalidate(dir: nd->inode, name: &nd->last, dentry, flags: nd->flags);
1766	if (likely(status > `0`))
1767	return dentry;
1768	if (!try_to_unlazy_next(nd, dentry))
1769	return ERR_PTR(error: -ECHILD);
1770	if (status == -ECHILD)
1771	/ we'd been told to redo it in non-rcu mode /
1772	status = d_revalidate(dir: nd->inode, name: &nd->last,
1773	dentry, flags: nd->flags);
1774	} else {
1775	dentry = __d_lookup(parent, &nd->last);
1776	if (unlikely(!dentry))
1777	return NULL;
1778	status = d_revalidate(dir: nd->inode, name: &nd->last, dentry, flags: nd->flags);
1779	}
1780	if (unlikely(status <= `0`)) {
1781	if (!status)
1782	d_invalidate(dentry);
1783	dput(dentry);
1784	return ERR_PTR(error: status);
1785	}
1786	return dentry;
1787	}
1788
1789	/ Fast lookup failed, do it the slow way /
1790	static struct dentry __lookup_slow(const* struct qstr *name,
1791	struct dentry *dir,
1792	unsigned int flags)
1793	{
1794	struct dentry dentry, old;
1795	struct inode *inode = dir->d_inode;
1796	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1797
1798	/ Don't go there if it's already dead /
1799	if (unlikely(IS_DEADDIR(inode)))
1800	return ERR_PTR(error: -ENOENT);
1801	again:
1802	dentry = d_alloc_parallel(dir, name, &wq);
1803	if (IS_ERR(ptr: dentry))
1804	return dentry;
1805	if (unlikely(!d_in_lookup(dentry))) {
1806	int error = d_revalidate(dir: inode, name, dentry, flags);
1807	if (unlikely(error <= `0`)) {
1808	if (!error) {
1809	d_invalidate(dentry);
1810	dput(dentry);
1811	goto again;
1812	}
1813	dput(dentry);
1814	dentry = ERR_PTR(error);
1815	}
1816	} else {
1817	old = inode->i_op->lookup(inode, dentry, flags);
1818	d_lookup_done(dentry);
1819	if (unlikely(old)) {
1820	dput(dentry);
1821	dentry = old;
1822	}
1823	}
1824	return dentry;
1825	}
1826
1827	static struct dentry lookup_slow(const* struct qstr *name,
1828	struct dentry *dir,
1829	unsigned int flags)
1830	{
1831	struct inode *inode = dir->d_inode;
1832	struct dentry *res;
1833	inode_lock_shared(inode);
1834	res = __lookup_slow(name, dir, flags);
1835	inode_unlock_shared(inode);
1836	return res;
1837	}
1838
1839	static inline int may_lookup(struct mnt_idmap *idmap,
1840	struct nameidata *restrict nd)
1841	{
1842	int err, mask;
1843
1844	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : `0`;
1845	err = inode_permission(idmap, nd->inode, mask \| MAY_EXEC);
1846	if (likely(!err))
1847	return `0`;
1848
1849	// If we failed, and we weren't in LOOKUP_RCU, it's final
1850	if (!(nd->flags & LOOKUP_RCU))
1851	return err;
1852
1853	// Drop out of RCU mode to make sure it wasn't transient
1854	if (!try_to_unlazy(nd))
1855	return -ECHILD; // redo it all non-lazy
1856
1857	if (err != -ECHILD) // hard error
1858	return err;
1859
1860	return inode_permission(idmap, nd->inode, MAY_EXEC);
1861	}
1862
1863	static int reserve_stack(struct nameidata nd, struct* path *link)
1864	{
1865	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
1866	return -ELOOP;
1867
1868	if (likely(nd->depth != EMBEDDED_LEVELS))
1869	return `0`;
1870	if (likely(nd->stack != nd->internal))
1871	return `0`;
1872	if (likely(nd_alloc_stack(nd)))
1873	return `0`;
1874
1875	if (nd->flags & LOOKUP_RCU) {
1876	// we need to grab link before we do unlazy. And we can't skip
1877	// unlazy even if we fail to grab the link - cleanup needs it
1878	bool grabbed_link = legitimize_path(nd, path: link, seq: nd->next_seq);
1879
1880	if (!try_to_unlazy(nd) \|\| !grabbed_link)
1881	return -ECHILD;
1882
1883	if (nd_alloc_stack(nd))
1884	return `0`;
1885	}
1886	return -ENOMEM;
1887	}
1888
/* Bit flags passed as the "flags" argument of pick_link()/step_into(). */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1890
1891	static const char pick_link(struct* nameidata nd, struct* path *link,
1892	struct inode inode, int* flags)
1893	{
1894	struct saved *last;
1895	const char *res;
1896	int error = reserve_stack(nd, link);
1897
1898	if (unlikely(error)) {
1899	if (!(nd->flags & LOOKUP_RCU))
1900	path_put(link);
1901	return ERR_PTR(error);
1902	}
1903	last = nd->stack + nd->depth++;
1904	last->link = *link;
1905	clear_delayed_call(call: &last->done);
1906	last->seq = nd->next_seq;
1907
1908	if (flags & WALK_TRAILING) {
1909	error = may_follow_link(nd, inode);
1910	if (unlikely(error))
1911	return ERR_PTR(error);
1912	}
1913
1914	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) \|\|
1915	unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1916	return ERR_PTR(error: -ELOOP);
1917
1918	if (unlikely(atime_needs_update(&last->link, inode))) {
1919	if (nd->flags & LOOKUP_RCU) {
1920	if (!try_to_unlazy(nd))
1921	return ERR_PTR(error: -ECHILD);
1922	}
1923	touch_atime(&last->link);
1924	cond_resched();
1925	}
1926
1927	error = security_inode_follow_link(dentry: link->dentry, inode,
1928	rcu: nd->flags & LOOKUP_RCU);
1929	if (unlikely(error))
1930	return ERR_PTR(error);
1931
1932	res = READ_ONCE(inode->i_link);
1933	if (!res) {
1934	const char * (get)(struct* dentry , struct* inode *,
1935	struct delayed_call *);
1936	get = inode->i_op->get_link;
1937	if (nd->flags & LOOKUP_RCU) {
1938	res = get(NULL, inode, &last->done);
1939	if (res == ERR_PTR(error: -ECHILD) && try_to_unlazy(nd))
1940	res = get(link->dentry, inode, &last->done);
1941	} else {
1942	res = get(link->dentry, inode, &last->done);
1943	}
1944	if (!res)
1945	goto all_done;
1946	if (IS_ERR(ptr: res))
1947	return res;
1948	}
1949	if (*res == `'/'`) {
1950	error = nd_jump_root(nd);
1951	if (unlikely(error))
1952	return ERR_PTR(error);
1953	while (unlikely(*++res == `'/'`))
1954	;
1955	}
1956	if (*res)
1957	return res;
1958	all_done: // pure jump
1959	put_link(nd);
1960	return NULL;
1961	}
1962
1963	/*
1964	* Do we need to follow links? We _really_ want to be able
1965	* to do this check without having to look at inode->i_op,
1966	* so we keep a cache of "no, this doesn't need follow_link"
1967	* for the common case.
1968	*
1969	* NOTE: dentry must be what nd->next_seq had been sampled from.
1970	*/
1971	static const char step_into(struct* nameidata nd, int* flags,
1972	struct dentry *dentry)
1973	{
1974	struct path path;
1975	struct inode *inode;
1976	int err = handle_mounts(nd, dentry, path: &path);
1977
1978	if (err < `0`)
1979	return ERR_PTR(error: err);
1980	inode = path.dentry->d_inode;
1981	if (likely(!d_is_symlink(path.dentry)) \|\|
1982	((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) \|\|
1983	(flags & WALK_NOFOLLOW)) {
1984	/ not a symlink or should not follow /
1985	if (nd->flags & LOOKUP_RCU) {
1986	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
1987	return ERR_PTR(error: -ECHILD);
1988	if (unlikely(!inode))
1989	return ERR_PTR(error: -ENOENT);
1990	} else {
1991	dput(nd->path.dentry);
1992	if (nd->path.mnt != path.mnt)
1993	mntput(mnt: nd->path.mnt);
1994	}
1995	nd->path = path;
1996	nd->inode = inode;
1997	nd->seq = nd->next_seq;
1998	return NULL;
1999	}
2000	if (nd->flags & LOOKUP_RCU) {
2001	/ make sure that d_is_symlink above matches inode /
2002	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
2003	return ERR_PTR(error: -ECHILD);
2004	} else {
2005	if (path.mnt == nd->path.mnt)
2006	mntget(mnt: path.mnt);
2007	}
2008	return pick_link(nd, link: &path, inode, flags);
2009	}
2010
2011	static struct dentry follow_dotdot_rcu(struct* nameidata *nd)
2012	{
2013	struct dentry parent, old;
2014
2015	if (path_equal(path1: &nd->path, path2: &nd->root))
2016	goto in_root;
2017	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
2018	struct path path;
2019	unsigned seq;
2020	if (!choose_mountpoint_rcu(m: real_mount(mnt: nd->path.mnt),
2021	root: &nd->root, path: &path, seqp: &seq))
2022	goto in_root;
2023	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
2024	return ERR_PTR(error: -ECHILD);
2025	nd->path = path;
2026	nd->inode = path.dentry->d_inode;
2027	nd->seq = seq;
2028	// makes sure that non-RCU pathwalk could reach this state
2029	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
2030	return ERR_PTR(error: -ECHILD);
2031	/ we know that mountpoint was pinned /
2032	}
2033	old = nd->path.dentry;
2034	parent = old->d_parent;
2035	nd->next_seq = read_seqcount_begin(&parent->d_seq);
2036	// makes sure that non-RCU pathwalk could reach this state
2037	if (read_seqcount_retry(&old->d_seq, nd->seq))
2038	return ERR_PTR(error: -ECHILD);
2039	if (unlikely(!path_connected(nd->path.mnt, parent)))
2040	return ERR_PTR(error: -ECHILD);
2041	return parent;
2042	in_root:
2043	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
2044	return ERR_PTR(error: -ECHILD);
2045	if (unlikely(nd->flags & LOOKUP_BENEATH))
2046	return ERR_PTR(error: -ECHILD);
2047	nd->next_seq = nd->seq;
2048	return nd->path.dentry;
2049	}
2050
2051	static struct dentry follow_dotdot(struct* nameidata *nd)
2052	{
2053	struct dentry *parent;
2054
2055	if (path_equal(path1: &nd->path, path2: &nd->root))
2056	goto in_root;
2057	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
2058	struct path path;
2059
2060	if (!choose_mountpoint(m: real_mount(mnt: nd->path.mnt),
2061	root: &nd->root, path: &path))
2062	goto in_root;
2063	path_put(&nd->path);
2064	nd->path = path;
2065	nd->inode = path.dentry->d_inode;
2066	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
2067	return ERR_PTR(error: -EXDEV);
2068	}
2069	/ rare case of legitimate dget_parent()... /
2070	parent = dget_parent(dentry: nd->path.dentry);
2071	if (unlikely(!path_connected(nd->path.mnt, parent))) {
2072	dput(parent);
2073	return ERR_PTR(error: -ENOENT);
2074	}
2075	return parent;
2076
2077	in_root:
2078	if (unlikely(nd->flags & LOOKUP_BENEATH))
2079	return ERR_PTR(error: -EXDEV);
2080	return dget(dentry: nd->path.dentry);
2081	}
2082
2083	static const char handle_dots(struct* nameidata nd, int* type)
2084	{
2085	if (type == LAST_DOTDOT) {
2086	const char *error = NULL;
2087	struct dentry *parent;
2088
2089	if (!nd->root.mnt) {
2090	error = ERR_PTR(error: set_root(nd));
2091	if (error)
2092	return error;
2093	}
2094	if (nd->flags & LOOKUP_RCU)
2095	parent = follow_dotdot_rcu(nd);
2096	else
2097	parent = follow_dotdot(nd);
2098	if (IS_ERR(ptr: parent))
2099	return ERR_CAST(ptr: parent);
2100	error = step_into(nd, flags: WALK_NOFOLLOW, dentry: parent);
2101	if (unlikely(error))
2102	return error;
2103
2104	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
2105	/*
2106	* If there was a racing rename or mount along our
2107	* path, then we can't be sure that ".." hasn't jumped
2108	* above nd->root (and so userspace should retry or use
2109	* some fallback).
2110	*/
2111	smp_rmb();
2112	if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
2113	return ERR_PTR(error: -EAGAIN);
2114	if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
2115	return ERR_PTR(error: -EAGAIN);
2116	}
2117	}
2118	return NULL;
2119	}
2120
2121	static const char walk_component(struct* nameidata nd, int* flags)
2122	{
2123	struct dentry *dentry;
2124	/*
2125	* "." and ".." are special - ".." especially so because it has
2126	* to be able to know about the current root directory and
2127	* parent relationships.
2128	*/
2129	if (unlikely(nd->last_type != LAST_NORM)) {
2130	if (!(flags & WALK_MORE) && nd->depth)
2131	put_link(nd);
2132	return handle_dots(nd, type: nd->last_type);
2133	}
2134	dentry = lookup_fast(nd);
2135	if (IS_ERR(ptr: dentry))
2136	return ERR_CAST(ptr: dentry);
2137	if (unlikely(!dentry)) {
2138	dentry = lookup_slow(name: &nd->last, dir: nd->path.dentry, flags: nd->flags);
2139	if (IS_ERR(ptr: dentry))
2140	return ERR_CAST(ptr: dentry);
2141	}
2142	if (!(flags & WALK_MORE) && nd->depth)
2143	put_link(nd);
2144	return step_into(nd, flags, dentry);
2145	}
2146
/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
2164	#ifdef CONFIG_DCACHE_WORD_ACCESS
2165
2166	#include <asm/word-at-a-time.h>
2167
#ifdef HASH_MIX

/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */

#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)

/*
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	/* Keep the upper 32 bits of the 64-bit product. */
	return y >> 32;
}

#else	/* 32-bit case */

/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)

static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}

#endif
2245
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 *
 * @salt: pointer whose value seeds the hash state
 * @name: bytes to hash (need not be NUL-terminated)
 * @len:  number of bytes of @name to hash
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		/* Partial trailing word: mask it below instead of mixing. */
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2272
2273	/ Return the "hash_len" (hash and length) of a null-terminated string /
2274	u64 hashlen_string(const void salt, const* char *name)
2275	{
2276	unsigned long a = `0`, x = `0`, y = (unsigned long)salt;
2277	unsigned long adata, mask, len;
2278	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2279
2280	len = `0`;
2281	goto inside;
2282
2283	do {
2284	HASH_MIX(x, y, a);
2285	len += sizeof(unsigned long);
2286	inside:
2287	a = load_unaligned_zeropad(addr: name+len);
2288	} while (!has_zero(a, bits: &adata, c: &constants));
2289
2290	adata = prep_zero_mask(a, bits: adata, c: &constants);
2291	mask = create_zero_mask(adata);
2292	x ^= a & zero_bytemask(bits: mask);
2293
2294	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2295	}
2296	EXPORT_SYMBOL(hashlen_string);
2297
2298	/*
2299	* Calculate the length and hash of the path component, and
2300	* return the length as the result.
2301	*/
2302	static inline const char hash_name(struct* nameidata *nd,
2303	const char *name,
2304	unsigned long *lastword)
2305	{
2306	unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
2307	unsigned long adata, bdata, mask, len;
2308	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2309
2310	/*
2311	* The first iteration is special, because it can result in
2312	* '.' and '..' and has no mixing other than the final fold.
2313	*/
2314	a = load_unaligned_zeropad(addr: name);
2315	b = a ^ REPEAT_BYTE(`'/'`);
2316	if (has_zero(a, bits: &adata, c: &constants) \| has_zero(a: b, bits: &bdata, c: &constants)) {
2317	adata = prep_zero_mask(a, bits: adata, c: &constants);
2318	bdata = prep_zero_mask(a: b, bits: bdata, c: &constants);
2319	mask = create_zero_mask(adata \| bdata);
2320	a &= zero_bytemask(bits: mask);
2321	*lastword = a;
2322	len = find_zero(mask);
2323	nd->last.hash = fold_hash(x: a, y);
2324	nd->last.len = len;
2325	return name + len;
2326	}
2327
2328	len = `0`;
2329	x = `0`;
2330	do {
2331	HASH_MIX(x, y, a);
2332	len += sizeof(unsigned long);
2333	a = load_unaligned_zeropad(addr: name+len);
2334	b = a ^ REPEAT_BYTE(`'/'`);
2335	} while (!(has_zero(a, bits: &adata, c: &constants) \| has_zero(a: b, bits: &bdata, c: &constants)));
2336
2337	adata = prep_zero_mask(a, bits: adata, c: &constants);
2338	bdata = prep_zero_mask(a: b, bits: bdata, c: &constants);
2339	mask = create_zero_mask(adata \| bdata);
2340	a &= zero_bytemask(bits: mask);
2341	x ^= a;
2342	len += find_zero(mask);
2343	lastword = `0`; // Multi-word components cannot be DOT or DOTDOT*
2344
2345	nd->last.hash = fold_hash(x, y);
2346	nd->last.len = len;
2347	return name + len;
2348	}
2349
/*
 * Note that the 'last' word is always zero-masked, but
 * was loaded as a possibly big-endian word.
 *
 * On big-endian the first byte of the name lands in the most
 * significant byte, hence the shifts.
 */
#ifdef __BIG_ENDIAN
  #define LAST_WORD_IS_DOT	(0x2eul << (BITS_PER_LONG-8))
  #define LAST_WORD_IS_DOTDOT	(0x2e2eul << (BITS_PER_LONG-16))
#endif
2358
2359	#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2360
/*
 * Return the hash of a string of known length.
 *
 * @salt: value handed to init_name_hash() to seed the hash
 * @name: bytes to hash (need not be NUL-terminated)
 * @len:  number of bytes of @name to hash
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	/* Bytes are fed as unsigned char so high-bit characters hash the same everywhere. */
	while (len--)
		hash = partial_name_hash((unsigned char)*name++, hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2370
2371	/ Return the "hash_len" (hash and length) of a null-terminated string /
2372	u64 hashlen_string(const void salt, const* char *name)
2373	{
2374	unsigned long hash = init_name_hash(salt);
2375	unsigned long len = `0`, c;
2376
2377	c = (unsigned char)*name;
2378	while (c) {
2379	len++;
2380	hash = partial_name_hash(c, hash);
2381	c = (unsigned char)name[len];
2382	}
2383	return hashlen_create(end_name_hash(hash), len);
2384	}
2385	EXPORT_SYMBOL(hashlen_string);
2386
2387	/*
2388	* We know there's a real path component here of at least
2389	* one character.
2390	*/
2391	static inline const char hash_name(struct* nameidata nd, const* char name, unsigned* long *lastword)
2392	{
2393	unsigned long hash = init_name_hash(nd->path.dentry);
2394	unsigned long len = `0`, c, last = `0`;
2395
2396	c = (unsigned char)*name;
2397	do {
2398	last = (last << `8`) + c;
2399	len++;
2400	hash = partial_name_hash(c, hash);
2401	c = (unsigned char)name[len];
2402	} while (c && c != `'/'`);
2403
2404	// This is reliable for DOT or DOTDOT, since the component
2405	// cannot contain NUL characters - top bits being zero means
2406	// we cannot have had any other pathnames.
2407	*lastword = last;
2408	nd->last.hash = end_name_hash(hash);
2409	nd->last.len = len;
2410	return name + len;
2411	}
2412
2413	#endif
2414
/*
 * Default encodings of "." (0x2e) and ".." (0x2e2e) as returned through
 * hash_name()'s lastword, used unless defined differently above.
 */
#ifndef LAST_WORD_IS_DOT
  #define LAST_WORD_IS_DOT	0x2e
  #define LAST_WORD_IS_DOTDOT	0x2e2e
#endif
2419
2420	/*
2421	* Name resolution.
2422	* This is the basic name resolution function, turning a pathname into
2423	* the final dentry. We expect 'base' to be positive and a directory.
2424	*
2425	* Returns 0 and nd will have valid dentry and mnt on success.
2426	* Returns error and drops reference to input namei data on failure.
2427	*/
2428	static int link_path_walk(const char name, struct* nameidata *nd)
2429	{
2430	int depth = `0`; // depth <= nd->depth
2431	int err;
2432
2433	nd->last_type = LAST_ROOT;
2434	nd->flags \|= LOOKUP_PARENT;
2435	if (IS_ERR(ptr: name))
2436	return PTR_ERR(ptr: name);
2437	if (*name == `'/'`) {
2438	do {
2439	name++;
2440	} while (unlikely(*name == `'/'`));
2441	}
2442	if (unlikely(!*name)) {
2443	nd->dir_mode = `0`; // short-circuit the 'hardening' idiocy
2444	return `0`;
2445	}
2446
2447	/ At this point we know we have a real path component. /
2448	for(;;) {
2449	struct mnt_idmap *idmap;
2450	const char *link;
2451	unsigned long lastword;
2452
2453	idmap = mnt_idmap(mnt: nd->path.mnt);
2454	err = may_lookup(idmap, nd);
2455	if (unlikely(err))
2456	return err;
2457
2458	nd->last.name = name;
2459	name = hash_name(nd, name, lastword: &lastword);
2460
2461	switch(lastword) {
2462	case LAST_WORD_IS_DOTDOT:
2463	nd->last_type = LAST_DOTDOT;
2464	nd->state \|= ND_JUMPED;
2465	break;
2466
2467	case LAST_WORD_IS_DOT:
2468	nd->last_type = LAST_DOT;
2469	break;
2470
2471	default:
2472	nd->last_type = LAST_NORM;
2473	nd->state &= ~ND_JUMPED;
2474
2475	struct dentry *parent = nd->path.dentry;
2476	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2477	err = parent->d_op->d_hash(parent, &nd->last);
2478	if (err < `0`)
2479	return err;
2480	}
2481	}
2482
2483	if (!*name)
2484	goto OK;
2485	/*
2486	* If it wasn't NUL, we know it was '/'. Skip that
2487	* slash, and continue until no more slashes.
2488	*/
2489	do {
2490	name++;
2491	} while (unlikely(*name == `'/'`));
2492	if (unlikely(!*name)) {
2493	OK:
2494	/ pathname or trailing symlink, done /
2495	if (!depth) {
2496	nd->dir_vfsuid = i_uid_into_vfsuid(idmap, inode: nd->inode);
2497	nd->dir_mode = nd->inode->i_mode;
2498	nd->flags &= ~LOOKUP_PARENT;
2499	return `0`;
2500	}
2501	/ last component of nested symlink /
2502	name = nd->stack[--depth].name;
2503	link = walk_component(nd, flags: `0`);
2504	} else {
2505	/ not the last component /
2506	link = walk_component(nd, flags: WALK_MORE);
2507	}
2508	if (unlikely(link)) {
2509	if (IS_ERR(ptr: link))
2510	return PTR_ERR(ptr: link);
2511	/ a symlink to follow /
2512	nd->stack[depth++].name = name;
2513	name = link;
2514	continue;
2515	}
2516	if (unlikely(!d_can_lookup(nd->path.dentry))) {
2517	if (nd->flags & LOOKUP_RCU) {
2518	if (!try_to_unlazy(nd))
2519	return -ECHILD;
2520	}
2521	return -ENOTDIR;
2522	}
2523	}
2524	}
2525
2526	/ must be paired with terminate_walk() /
2527	static const char path_init(struct* nameidata nd, unsigned* flags)
2528	{
2529	int error;
2530	const char *s = nd->pathname;
2531
2532	/ LOOKUP_CACHED requires RCU, ask caller to retry /
2533	if ((flags & (LOOKUP_RCU \| LOOKUP_CACHED)) == LOOKUP_CACHED)
2534	return ERR_PTR(error: -EAGAIN);
2535
2536	if (!*s)
2537	flags &= ~LOOKUP_RCU;
2538	if (flags & LOOKUP_RCU)
2539	rcu_read_lock();
2540	else
2541	nd->seq = nd->next_seq = `0`;
2542
2543	nd->flags = flags;
2544	nd->state \|= ND_JUMPED;
2545
2546	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
2547	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
2548	smp_rmb();
2549
2550	if (nd->state & ND_ROOT_PRESET) {
2551	struct dentry *root = nd->root.dentry;
2552	struct inode *inode = root->d_inode;
2553	if (*s && unlikely(!d_can_lookup(root)))
2554	return ERR_PTR(error: -ENOTDIR);
2555	nd->path = nd->root;
2556	nd->inode = inode;
2557	if (flags & LOOKUP_RCU) {
2558	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2559	nd->root_seq = nd->seq;
2560	} else {
2561	path_get(&nd->path);
2562	}
2563	return s;
2564	}
2565
2566	nd->root.mnt = NULL;
2567
2568	/ Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). /
2569	if (*s == `'/'` && !(flags & LOOKUP_IN_ROOT)) {
2570	error = nd_jump_root(nd);
2571	if (unlikely(error))
2572	return ERR_PTR(error);
2573	return s;
2574	}
2575
2576	/ Relative pathname -- get the starting-point it is relative to. /
2577	if (nd->dfd == AT_FDCWD) {
2578	if (flags & LOOKUP_RCU) {
2579	struct fs_struct *fs = current->fs;
2580	unsigned seq;
2581
2582	do {
2583	seq = read_seqcount_begin(&fs->seq);
2584	nd->path = fs->pwd;
2585	nd->inode = nd->path.dentry->d_inode;
2586	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2587	} while (read_seqcount_retry(&fs->seq, seq));
2588	} else {
2589	get_fs_pwd(current->fs, pwd: &nd->path);
2590	nd->inode = nd->path.dentry->d_inode;
2591	}
2592	} else {
2593	/ Caller must check execute permissions on the starting path component /
2594	CLASS(fd_raw, f)(fd: nd->dfd);
2595	struct dentry *dentry;
2596
2597	if (fd_empty(f))
2598	return ERR_PTR(error: -EBADF);
2599
2600	if (flags & LOOKUP_LINKAT_EMPTY) {
2601	if (fd_file(f)->f_cred != current_cred() &&
2602	!ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
2603	return ERR_PTR(error: -ENOENT);
2604	}
2605
2606	dentry = fd_file(f)->f_path.dentry;
2607
2608	if (*s && unlikely(!d_can_lookup(dentry)))
2609	return ERR_PTR(error: -ENOTDIR);
2610
2611	nd->path = fd_file(f)->f_path;
2612	if (flags & LOOKUP_RCU) {
2613	nd->inode = nd->path.dentry->d_inode;
2614	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2615	} else {
2616	path_get(&nd->path);
2617	nd->inode = nd->path.dentry->d_inode;
2618	}
2619	}
2620
2621	/ For scoped-lookups we need to set the root to the dirfd as well. /
2622	if (flags & LOOKUP_IS_SCOPED) {
2623	nd->root = nd->path;
2624	if (flags & LOOKUP_RCU) {
2625	nd->root_seq = nd->seq;
2626	} else {
2627	path_get(&nd->root);
2628	nd->state \|= ND_ROOT_GRABBED;
2629	}
2630	}
2631	return s;
2632	}
2633
2634	static inline const char lookup_last(struct* nameidata *nd)
2635	{
2636	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2637	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
2638
2639	return walk_component(nd, flags: WALK_TRAILING);
2640	}
2641
2642	static int handle_lookup_down(struct nameidata *nd)
2643	{
2644	if (!(nd->flags & LOOKUP_RCU))
2645	dget(dentry: nd->path.dentry);
2646	nd->next_seq = nd->seq;
2647	return PTR_ERR(ptr: step_into(nd, flags: WALK_NOFOLLOW, dentry: nd->path.dentry));
2648	}
2649
2650	/ Returns 0 and nd will be valid on success; Returns error, otherwise. /
2651	static int path_lookupat(struct nameidata nd, unsigned* flags, struct path *path)
2652	{
2653	const char *s = path_init(nd, flags);
2654	int err;
2655
2656	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(ptr: s)) {
2657	err = handle_lookup_down(nd);
2658	if (unlikely(err < `0`))
2659	s = ERR_PTR(error: err);
2660	}
2661
2662	while (!(err = link_path_walk(name: s, nd)) &&
2663	(s = lookup_last(nd)) != NULL)
2664	;
2665	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
2666	err = handle_lookup_down(nd);
2667	nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
2668	}
2669	if (!err)
2670	err = complete_walk(nd);
2671
2672	if (!err && nd->flags & LOOKUP_DIRECTORY)
2673	if (!d_can_lookup(dentry: nd->path.dentry))
2674	err = -ENOTDIR;
2675	if (!err) {
2676	*path = nd->path;
2677	nd->path.mnt = NULL;
2678	nd->path.dentry = NULL;
2679	}
2680	terminate_walk(nd);
2681	return err;
2682	}
2683
2684	int filename_lookup(int dfd, struct filename name, unsigned* flags,
2685	struct path path, struct* path *root)
2686	{
2687	int retval;
2688	struct nameidata nd;
2689	if (IS_ERR(ptr: name))
2690	return PTR_ERR(ptr: name);
2691	set_nameidata(p: &nd, dfd, name, root);
2692	retval = path_lookupat(nd: &nd, flags: flags \| LOOKUP_RCU, path);
2693	if (unlikely(retval == -ECHILD))
2694	retval = path_lookupat(nd: &nd, flags, path);
2695	if (unlikely(retval == -ESTALE))
2696	retval = path_lookupat(nd: &nd, flags: flags \| LOOKUP_REVAL, path);
2697
2698	if (likely(!retval))
2699	audit_inode(name, dentry: path->dentry,
2700	aflags: flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : `0`);
2701	restore_nameidata();
2702	return retval;
2703	}
2704
2705	/ Returns 0 and nd will be valid on success; Returns error, otherwise. /
2706	static int path_parentat(struct nameidata nd, unsigned* flags,
2707	struct path *parent)
2708	{
2709	const char *s = path_init(nd, flags);
2710	int err = link_path_walk(name: s, nd);
2711	if (!err)
2712	err = complete_walk(nd);
2713	if (!err) {
2714	*parent = nd->path;
2715	nd->path.mnt = NULL;
2716	nd->path.dentry = NULL;
2717	}
2718	terminate_walk(nd);
2719	return err;
2720	}
2721
2722	/ Note: this does not consume "name" /
2723	static int __filename_parentat(int dfd, struct filename *name,
2724	unsigned int flags, struct path *parent,
2725	struct qstr last, int* *type,
2726	const struct path *root)
2727	{
2728	int retval;
2729	struct nameidata nd;
2730
2731	if (IS_ERR(ptr: name))
2732	return PTR_ERR(ptr: name);
2733	set_nameidata(p: &nd, dfd, name, root);
2734	retval = path_parentat(nd: &nd, flags: flags \| LOOKUP_RCU, parent);
2735	if (unlikely(retval == -ECHILD))
2736	retval = path_parentat(nd: &nd, flags, parent);
2737	if (unlikely(retval == -ESTALE))
2738	retval = path_parentat(nd: &nd, flags: flags \| LOOKUP_REVAL, parent);
2739	if (likely(!retval)) {
2740	*last = nd.last;
2741	*type = nd.last_type;
2742	audit_inode(name, dentry: parent->dentry, AUDIT_INODE_PARENT);
2743	}
2744	restore_nameidata();
2745	return retval;
2746	}
2747
2748	static int filename_parentat(int dfd, struct filename *name,
2749	unsigned int flags, struct path *parent,
2750	struct qstr last, int* *type)
2751	{
2752	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
2753	}
2754
2755	/ does lookup, returns the object with parent locked /
2756	static struct dentry __kern_path_locked(int* dfd, struct filename name, struct* path *path)
2757	{
2758	struct path parent_path __free(path_put) = {};
2759	struct dentry *d;
2760	struct qstr last;
2761	int type, error;
2762
2763	error = filename_parentat(dfd, name, flags: `0`, parent: &parent_path, last: &last, type: &type);
2764	if (error)
2765	return ERR_PTR(error);
2766	if (unlikely(type != LAST_NORM))
2767	return ERR_PTR(error: -EINVAL);
2768	inode_lock_nested(inode: parent_path.dentry->d_inode, subclass: I_MUTEX_PARENT);
2769	d = lookup_one_qstr_excl(&last, parent_path.dentry, `0`);
2770	if (IS_ERR(ptr: d)) {
2771	inode_unlock(inode: parent_path.dentry->d_inode);
2772	return d;
2773	}
2774	path->dentry = no_free_ptr(parent_path.dentry);
2775	path->mnt = no_free_ptr(parent_path.mnt);
2776	return d;
2777	}
2778
2779	struct dentry kern_path_locked_negative(const* char name, struct* path *path)
2780	{
2781	struct path parent_path __free(path_put) = {};
2782	struct filename *filename __free(putname) = getname_kernel(name);
2783	struct dentry *d;
2784	struct qstr last;
2785	int type, error;
2786
2787	error = filename_parentat(AT_FDCWD, name: filename, flags: `0`, parent: &parent_path, last: &last, type: &type);
2788	if (error)
2789	return ERR_PTR(error);
2790	if (unlikely(type != LAST_NORM))
2791	return ERR_PTR(error: -EINVAL);
2792	inode_lock_nested(inode: parent_path.dentry->d_inode, subclass: I_MUTEX_PARENT);
2793	d = lookup_one_qstr_excl_raw(name: &last, base: parent_path.dentry, flags: `0`);
2794	if (IS_ERR(ptr: d)) {
2795	inode_unlock(inode: parent_path.dentry->d_inode);
2796	return d;
2797	}
2798	path->dentry = no_free_ptr(parent_path.dentry);
2799	path->mnt = no_free_ptr(parent_path.mnt);
2800	return d;
2801	}
2802
2803	struct dentry kern_path_locked(const* char name, struct* path *path)
2804	{
2805	struct filename *filename = getname_kernel(name);
2806	struct dentry *res = __kern_path_locked(AT_FDCWD, name: filename, path);
2807
2808	putname(filename);
2809	return res;
2810	}
2811
2812	struct dentry user_path_locked_at(int* dfd, const char __user name, struct* path *path)
2813	{
2814	struct filename *filename = getname(name);
2815	struct dentry *res = __kern_path_locked(dfd, name: filename, path);
2816
2817	putname(filename);
2818	return res;
2819	}
2820	EXPORT_SYMBOL(user_path_locked_at);
2821
2822	int kern_path(const char name, unsigned* int flags, struct path *path)
2823	{
2824	struct filename *filename = getname_kernel(name);
2825	int ret = filename_lookup(AT_FDCWD, name: filename, flags, path, NULL);
2826
2827	putname(filename);
2828	return ret;
2829
2830	}
2831	EXPORT_SYMBOL(kern_path);
2832
2833	/**
2834	* vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
2835	* @filename: filename structure
2836	* @flags: lookup flags
2837	* @parent: pointer to struct path to fill
2838	* @last: last component
2839	* @type: type of the last component
2840	* @root: pointer to struct path of the base directory
2841	*/
2842	int vfs_path_parent_lookup(struct filename filename, unsigned* int flags,
2843	struct path parent, struct* qstr last, int* *type,
2844	const struct path *root)
2845	{
2846	return __filename_parentat(AT_FDCWD, name: filename, flags, parent, last,
2847	type, root);
2848	}
2849	EXPORT_SYMBOL(vfs_path_parent_lookup);
2850
2851	/**
2852	* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2853	* @dentry: pointer to dentry of the base directory
2854	* @mnt: pointer to vfs mount of the base directory
2855	* @name: pointer to file name
2856	* @flags: lookup flags
2857	* @path: pointer to struct path to fill
2858	*/
2859	int vfs_path_lookup(struct dentry dentry, struct* vfsmount *mnt,
2860	const char name, unsigned* int flags,
2861	struct path *path)
2862	{
2863	struct filename *filename;
2864	struct path root = {.mnt = mnt, .dentry = dentry};
2865	int ret;
2866
2867	filename = getname_kernel(name);
2868	/ the first argument of filename_lookup() is ignored with root /
2869	ret = filename_lookup(AT_FDCWD, name: filename, flags, path, root: &root);
2870	putname(filename);
2871	return ret;
2872	}
2873	EXPORT_SYMBOL(vfs_path_lookup);
2874
2875	static int lookup_noperm_common(struct qstr qname, struct* dentry *base)
2876	{
2877	const char *name = qname->name;
2878	u32 len = qname->len;
2879
2880	qname->hash = full_name_hash(base, name, len);
2881	if (!len)
2882	return -EACCES;
2883
2884	if (is_dot_dotdot(name, len))
2885	return -EACCES;
2886
2887	while (len--) {
2888	unsigned int c = (const* unsigned char *)name++;
2889	if (c == `'/'` \|\| c == `'\0'`)
2890	return -EACCES;
2891	}
2892	/*
2893	* See if the low-level filesystem might want
2894	* to use its own hash..
2895	*/
2896	if (base->d_flags & DCACHE_OP_HASH) {
2897	int err = base->d_op->d_hash(base, qname);
2898	if (err < `0`)
2899	return err;
2900	}
2901	return `0`;
2902	}
2903
2904	static int lookup_one_common(struct mnt_idmap *idmap,
2905	struct qstr qname, struct* dentry *base)
2906	{
2907	int err;
2908	err = lookup_noperm_common(qname, base);
2909	if (err < `0`)
2910	return err;
2911	return inode_permission(idmap, base->d_inode, MAY_EXEC);
2912	}
2913
/**
 * try_lookup_noperm - filesystem helper to lookup single pathname component
 * @name:	qstr storing pathname component to lookup
 * @base:	base directory to lookup from
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.  It does no permission checking.
 *
 * No locks need be held - only a counted reference to @base is needed.
 *
 * Return: the result of lookup_dcache(), or an ERR_PTR() if @name fails
 * validation.
 */
struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base)
{
	int err;

	err = lookup_noperm_common(name, base);
	if (err)
		return ERR_PTR(err);

	return lookup_dcache(name, base, 0);
}
EXPORT_SYMBOL(try_lookup_noperm);
2939
2940	/**
2941	* lookup_noperm - filesystem helper to lookup single pathname component
2942	* @name: qstr storing pathname component to lookup
2943	* @base: base directory to lookup from
2944	*
2945	* Note that this routine is purely a helper for filesystem usage and should
2946	* not be called by generic code. It does no permission checking.
2947	*
2948	* The caller must hold base->i_mutex.
2949	*/
2950	struct dentry lookup_noperm(struct* qstr name, struct* dentry *base)
2951	{
2952	struct dentry *dentry;
2953	int err;
2954
2955	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2956
2957	err = lookup_noperm_common(qname: name, base);
2958	if (err)
2959	return ERR_PTR(error: err);
2960
2961	dentry = lookup_dcache(name, dir: base, flags: `0`);
2962	return dentry ? dentry : __lookup_slow(name, dir: base, flags: `0`);
2963	}
2964	EXPORT_SYMBOL(lookup_noperm);
2965
2966	/**
2967	* lookup_one - lookup single pathname component
2968	* @idmap: idmap of the mount the lookup is performed from
2969	* @name: qstr holding pathname component to lookup
2970	* @base: base directory to lookup from
2971	*
2972	* This can be used for in-kernel filesystem clients such as file servers.
2973	*
2974	* The caller must hold base->i_mutex.
2975	*/
2976	struct dentry lookup_one(struct* mnt_idmap idmap, struct* qstr *name,
2977	struct dentry *base)
2978	{
2979	struct dentry *dentry;
2980	int err;
2981
2982	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2983
2984	err = lookup_one_common(idmap, qname: name, base);
2985	if (err)
2986	return ERR_PTR(error: err);
2987
2988	dentry = lookup_dcache(name, dir: base, flags: `0`);
2989	return dentry ? dentry : __lookup_slow(name, dir: base, flags: `0`);
2990	}
2991	EXPORT_SYMBOL(lookup_one);
2992
/**
 * lookup_one_unlocked - lookup single pathname component
 * @idmap:	idmap of the mount the lookup is performed from
 * @name:	qstr holding pathname component to lookup
 * @base:	base directory to lookup from
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * Unlike lookup_one, it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 *
 * Return: the dentry from the dcache or, failing that, from lookup_slow();
 * an ERR_PTR() if validation or the permission check fails.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
				   struct dentry *base)
{
	int err;
	struct dentry *ret;

	err = lookup_one_common(idmap, name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow(name, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_one_unlocked);
3020
3021	/**
3022	* lookup_one_positive_unlocked - lookup single pathname component
3023	* @idmap: idmap of the mount the lookup is performed from
3024	* @name: qstr holding pathname component to lookup
3025	* @base: base directory to lookup from
3026	*
3027	* This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
3028	* known positive or ERR_PTR(). This is what most of the users want.
3029	*
3030	* Note that pinned negative with unlocked parent _can_ become positive at any
3031	* time, so callers of lookup_one_unlocked() need to be very careful; pinned
3032	* positives have >d_inode stable, so this one avoids such problems.
3033	*
3034	* This can be used for in-kernel filesystem clients such as file servers.
3035	*
3036	* The helper should be called without i_rwsem held.
3037	*/
3038	struct dentry lookup_one_positive_unlocked(struct* mnt_idmap *idmap,
3039	struct qstr *name,
3040	struct dentry *base)
3041	{
3042	struct dentry *ret = lookup_one_unlocked(idmap, name, base);
3043
3044	if (!IS_ERR(ptr: ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
3045	dput(ret);
3046	ret = ERR_PTR(error: -ENOENT);
3047	}
3048	return ret;
3049	}
3050	EXPORT_SYMBOL(lookup_one_positive_unlocked);
3051
/**
 * lookup_noperm_unlocked - filesystem helper to lookup single pathname component
 * @name:	pathname component to lookup
 * @base:	base directory to lookup from
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.  It does no permission checking.
 *
 * Unlike lookup_noperm, it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 *
 * Return: the dentry from try_lookup_noperm() (which may be an ERR_PTR())
 * or, when that returned NULL, the result of lookup_slow().
 */
struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base)
{
	struct dentry *ret;

	ret = try_lookup_noperm(name, base);
	if (!ret)
		ret = lookup_slow(name, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_noperm_unlocked);
3073
3074	/*
3075	* Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT)
3076	* on negatives. Returns known positive or ERR_PTR(); that's what
3077	* most of the users want. Note that pinned negative with unlocked parent
3078	* _can_ become positive at any time, so callers of lookup_noperm_unlocked()
3079	* need to be very careful; pinned positives have ->d_inode stable, so
3080	* this one avoids such problems.
3081	*/
3082	struct dentry lookup_noperm_positive_unlocked(struct* qstr *name,
3083	struct dentry *base)
3084	{
3085	struct dentry *ret;
3086
3087	ret = lookup_noperm_unlocked(name, base);
3088	if (!IS_ERR(ptr: ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
3089	dput(ret);
3090	ret = ERR_PTR(error: -ENOENT);
3091	}
3092	return ret;
3093	}
3094	EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
3095
#ifdef CONFIG_UNIX98_PTYS
/*
 * path_pts - redirect @path to the "pts" entry next to it
 * @path: path to update in place
 *
 * Returns 0 on success, in which case @path->dentry has been replaced by
 * the "pts" child of the original parent and follow_down() has been applied.
 * Returns -ENOENT if the parent is not connected to @path->mnt (then @path
 * is left untouched) or if no "pts" child is found in the dcache (then
 * @path->dentry is left pointing at the parent).
 */
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	/* Hand the reference on parent over to path. */
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	/* path now holds child; the parent reference is no longer needed. */
	path->dentry = child;
	dput(parent);
	follow_down(path, 0);
	return 0;
}
#endif
3122
3123	int user_path_at(int dfd, const char __user name, unsigned* flags,
3124	struct path *path)
3125	{
3126	struct filename *filename = getname_flags(filename: name, flags);
3127	int ret = filename_lookup(dfd, name: filename, flags, path, NULL);
3128
3129	putname(filename);
3130	return ret;
3131	}
3132	EXPORT_SYMBOL(user_path_at);
3133
3134	int __check_sticky(struct mnt_idmap idmap, struct* inode *dir,
3135	struct inode *inode)
3136	{
3137	kuid_t fsuid = current_fsuid();
3138
3139	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode), kuid: fsuid))
3140	return `0`;
3141	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode: dir), kuid: fsuid))
3142	return `0`;
3143	return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
3144	}
3145	EXPORT_SYMBOL(__check_sticky);
3146
3147	/*
3148	* Check whether we can remove a link victim from directory dir, check
3149	* whether the type of victim is right.
3150	* 1. We can't do it if dir is read-only (done in permission())
3151	* 2. We should have write and exec permissions on dir
3152	* 3. We can't remove anything from append-only dir
3153	* 4. We can't do anything with immutable dir (done in permission())
3154	* 5. If the sticky bit on dir is set we should either
3155	* a. be owner of dir, or
3156	* b. be owner of victim, or
3157	* c. have CAP_FOWNER capability
3158	* 6. If the victim is append-only or immutable we can't do antyhing with
3159	* links pointing to it.
3160	* 7. If the victim has an unknown uid or gid we can't change the inode.
3161	* 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
3162	* 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
3163	* 10. We can't remove a root or mountpoint.
3164	* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
3165	* nfs_async_unlink().
3166	*/
3167	static int may_delete(struct mnt_idmap idmap, struct* inode *dir,
3168	struct dentry *victim, bool isdir)
3169	{
3170	struct inode *inode = d_backing_inode(upper: victim);
3171	int error;
3172
3173	if (d_is_negative(dentry: victim))
3174	return -ENOENT;
3175	BUG_ON(!inode);
3176
3177	BUG_ON(victim->d_parent->d_inode != dir);
3178
3179	/ Inode writeback is not safe when the uid or gid are invalid. /
3180	if (!vfsuid_valid(uid: i_uid_into_vfsuid(idmap, inode)) \|\|
3181	!vfsgid_valid(gid: i_gid_into_vfsgid(idmap, inode)))
3182	return -EOVERFLOW;
3183
3184	audit_inode_child(parent: dir, dentry: victim, AUDIT_TYPE_CHILD_DELETE);
3185
3186	error = inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3187	if (error)
3188	return error;
3189	if (IS_APPEND(dir))
3190	return -EPERM;
3191
3192	if (check_sticky(idmap, dir, inode) \|\| IS_APPEND(inode) \|\|
3193	IS_IMMUTABLE(inode) \|\| IS_SWAPFILE(inode) \|\|
3194	HAS_UNMAPPED_ID(idmap, inode))
3195	return -EPERM;
3196	if (isdir) {
3197	if (!d_is_dir(dentry: victim))
3198	return -ENOTDIR;
3199	if (IS_ROOT(victim))
3200	return -EBUSY;
3201	} else if (d_is_dir(dentry: victim))
3202	return -EISDIR;
3203	if (IS_DEADDIR(dir))
3204	return -ENOENT;
3205	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
3206	return -EBUSY;
3207	return `0`;
3208	}
3209
3210	/ Check whether we can create an object with dentry child in directory*
3211	* dir.
3212	* 1. We can't do it if child already exists (open has special treatment for
3213	* this case, but since we are inlined it's OK)
3214	* 2. We can't do it if dir is read-only (done in permission())
3215	* 3. We can't do it if the fs can't represent the fsuid or fsgid.
3216	* 4. We should have write and exec permissions on dir
3217	* 5. We can't do it if dir is immutable (done in permission())
3218	*/
3219	static inline int may_create(struct mnt_idmap *idmap,
3220	struct inode dir, struct* dentry *child)
3221	{
3222	audit_inode_child(parent: dir, dentry: child, AUDIT_TYPE_CHILD_CREATE);
3223	if (child->d_inode)
3224	return -EEXIST;
3225	if (IS_DEADDIR(dir))
3226	return -ENOENT;
3227	if (!fsuidgid_has_mapping(sb: dir->i_sb, idmap))
3228	return -EOVERFLOW;
3229
3230	return inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3231	}
3232
3233	// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
3234	static struct dentry lock_two_directories(struct* dentry p1, struct* dentry *p2)
3235	{
3236	struct dentry p = p1, q = p2, *r;
3237
3238	while ((r = p->d_parent) != p2 && r != p)
3239	p = r;
3240	if (r == p2) {
3241	// p is a child of p2 and an ancestor of p1 or p1 itself
3242	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3243	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT2);
3244	return p;
3245	}
3246	// p is the root of connected component that contains p1
3247	// p2 does not occur on the path from p to p1
3248	while ((r = q->d_parent) != p1 && r != p && r != q)
3249	q = r;
3250	if (r == p1) {
3251	// q is a child of p1 and an ancestor of p2 or p2 itself
3252	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3253	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT2);
3254	return q;
3255	} else if (likely(r == p)) {
3256	// both p2 and p1 are descendents of p
3257	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3258	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT2);
3259	return NULL;
3260	} else { // no common ancestor at the time we'd been called
3261	mutex_unlock(lock: &p1->d_sb->s_vfs_rename_mutex);
3262	return ERR_PTR(error: -EXDEV);
3263	}
3264	}
3265
3266	/*
3267	* p1 and p2 should be directories on the same fs.
3268	*/
3269	struct dentry lock_rename(struct* dentry p1, struct* dentry *p2)
3270	{
3271	if (p1 == p2) {
3272	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3273	return NULL;
3274	}
3275
3276	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
3277	return lock_two_directories(p1, p2);
3278	}
3279	EXPORT_SYMBOL(lock_rename);
3280
3281	/*
3282	* c1 and p2 should be on the same fs.
3283	*/
3284	struct dentry lock_rename_child(struct* dentry c1, struct* dentry *p2)
3285	{
3286	if (READ_ONCE(c1->d_parent) == p2) {
3287	/*
3288	* hopefully won't need to touch ->s_vfs_rename_mutex at all.
3289	*/
3290	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3291	/*
3292	* now that p2 is locked, nobody can move in or out of it,
3293	* so the test below is safe.
3294	*/
3295	if (likely(c1->d_parent == p2))
3296	return NULL;
3297
3298	/*
3299	* c1 got moved out of p2 while we'd been taking locks;
3300	* unlock and fall back to slow case.
3301	*/
3302	inode_unlock(inode: p2->d_inode);
3303	}
3304
3305	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
3306	/*
3307	* nobody can move out of any directories on this fs.
3308	*/
3309	if (likely(c1->d_parent != p2))
3310	return lock_two_directories(p1: c1->d_parent, p2);
3311
3312	/*
3313	* c1 got moved into p2 while we were taking locks;
3314	* we need p2 locked and ->s_vfs_rename_mutex unlocked,
3315	* for consistency with lock_rename().
3316	*/
3317	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3318	mutex_unlock(lock: &c1->d_sb->s_vfs_rename_mutex);
3319	return NULL;
3320	}
3321	EXPORT_SYMBOL(lock_rename_child);
3322
3323	void unlock_rename(struct dentry p1, struct* dentry *p2)
3324	{
3325	inode_unlock(inode: p1->d_inode);
3326	if (p1 != p2) {
3327	inode_unlock(inode: p2->d_inode);
3328	mutex_unlock(lock: &p1->d_sb->s_vfs_rename_mutex);
3329	}
3330	}
3331	EXPORT_SYMBOL(unlock_rename);
3332
3333	/**
3334	* vfs_prepare_mode - prepare the mode to be used for a new inode
3335	* @idmap: idmap of the mount the inode was found from
3336	* @dir: parent directory of the new inode
3337	* @mode: mode of the new inode
3338	* @mask_perms: allowed permission by the vfs
3339	* @type: type of file to be created
3340	*
3341	* This helper consolidates and enforces vfs restrictions on the @mode of a new
3342	* object to be created.
3343	*
3344	* Umask stripping depends on whether the filesystem supports POSIX ACLs (see
3345	* the kernel documentation for mode_strip_umask()). Moving umask stripping
3346	* after setgid stripping allows the same ordering for both non-POSIX ACL and
3347	* POSIX ACL supporting filesystems.
3348	*
3349	* Note that it's currently valid for @type to be 0 if a directory is created.
3350	* Filesystems raise that flag individually and we need to check whether each
3351	* filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
3352	* non-zero type.
3353	*
3354	* Returns: mode to be passed to the filesystem
3355	*/
3356	static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
3357	const struct inode *dir, umode_t mode,
3358	umode_t mask_perms, umode_t type)
3359	{
3360	mode = mode_strip_sgid(idmap, dir, mode);
3361	mode = mode_strip_umask(dir, mode);
3362
3363	/*
3364	* Apply the vfs mandated allowed permission mask and set the type of
3365	* file to be created before we call into the filesystem.
3366	*/
3367	mode &= (mask_perms & ~S_IFMT);
3368	mode \|= (type & S_IFMT);
3369
3370	return mode;
3371	}
3372
3373	/**
3374	* vfs_create - create new file
3375	* @idmap: idmap of the mount the inode was found from
3376	* @dir: inode of the parent directory
3377	* @dentry: dentry of the child file
3378	* @mode: mode of the child file
3379	* @want_excl: whether the file must not yet exist
3380	*
3381	* Create a new file.
3382	*
3383	* If the inode has been found through an idmapped mount the idmap of
3384	* the vfsmount must be passed through @idmap. This function will then take
3385	* care to map the inode according to @idmap before checking permissions.
3386	* On non-idmapped mounts or if permission checking is to be performed on the
3387	* raw inode simply pass @nop_mnt_idmap.
3388	*/
3389	int vfs_create(struct mnt_idmap idmap, struct* inode *dir,
3390	struct dentry *dentry, umode_t mode, bool want_excl)
3391	{
3392	int error;
3393
3394	error = may_create(idmap, dir, child: dentry);
3395	if (error)
3396	return error;
3397
3398	if (!dir->i_op->create)
3399	return -EACCES; / shouldn't it be ENOSYS? /
3400
3401	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
3402	error = security_inode_create(dir, dentry, mode);
3403	if (error)
3404	return error;
3405	error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
3406	if (!error)
3407	fsnotify_create(dir, dentry);
3408	return error;
3409	}
3410	EXPORT_SYMBOL(vfs_create);
3411
3412	int vfs_mkobj(struct dentry *dentry, umode_t mode,
3413	int (f)(struct* dentry , umode_t, void* *),
3414	void *arg)
3415	{
3416	struct inode *dir = dentry->d_parent->d_inode;
3417	int error = may_create(idmap: &nop_mnt_idmap, dir, child: dentry);
3418	if (error)
3419	return error;
3420
3421	mode &= S_IALLUGO;
3422	mode \|= S_IFREG;
3423	error = security_inode_create(dir, dentry, mode);
3424	if (error)
3425	return error;
3426	error = f(dentry, mode, arg);
3427	if (!error)
3428	fsnotify_create(dir, dentry);
3429	return error;
3430	}
3431	EXPORT_SYMBOL(vfs_mkobj);
3432
3433	bool may_open_dev(const struct path *path)
3434	{
3435	return !(path->mnt->mnt_flags & MNT_NODEV) &&
3436	!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
3437	}
3438
3439	static int may_open(struct mnt_idmap idmap, const* struct path *path,
3440	int acc_mode, int flag)
3441	{
3442	struct dentry *dentry = path->dentry;
3443	struct inode *inode = dentry->d_inode;
3444	int error;
3445
3446	if (!inode)
3447	return -ENOENT;
3448
3449	switch (inode->i_mode & S_IFMT) {
3450	case S_IFLNK:
3451	return -ELOOP;
3452	case S_IFDIR:
3453	if (acc_mode & MAY_WRITE)
3454	return -EISDIR;
3455	if (acc_mode & MAY_EXEC)
3456	return -EACCES;
3457	break;
3458	case S_IFBLK:
3459	case S_IFCHR:
3460	if (!may_open_dev(path))
3461	return -EACCES;
3462	fallthrough;
3463	case S_IFIFO:
3464	case S_IFSOCK:
3465	if (acc_mode & MAY_EXEC)
3466	return -EACCES;
3467	flag &= ~O_TRUNC;
3468	break;
3469	case S_IFREG:
3470	if ((acc_mode & MAY_EXEC) && path_noexec(path))
3471	return -EACCES;
3472	break;
3473	default:
3474	VFS_BUG_ON_INODE(`1`, inode);
3475	}
3476
3477	error = inode_permission(idmap, inode, MAY_OPEN \| acc_mode);
3478	if (error)
3479	return error;
3480
3481	/*
3482	* An append-only file must be opened in append mode for writing.
3483	*/
3484	if (IS_APPEND(inode)) {
3485	if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
3486	return -EPERM;
3487	if (flag & O_TRUNC)
3488	return -EPERM;
3489	}
3490
3491	/ O_NOATIME can only be set by the owner or superuser /
3492	if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
3493	return -EPERM;
3494
3495	return `0`;
3496	}
3497
3498	static int handle_truncate(struct mnt_idmap idmap, struct* file *filp)
3499	{
3500	const struct path *path = &filp->f_path;
3501	struct inode *inode = path->dentry->d_inode;
3502	int error = get_write_access(inode);
3503	if (error)
3504	return error;
3505
3506	error = security_file_truncate(file: filp);
3507	if (!error) {
3508	error = do_truncate(idmap, path->dentry, start: `0`,
3509	ATTR_MTIME\|ATTR_CTIME\|ATTR_OPEN,
3510	filp);
3511	}
3512	put_write_access(inode);
3513	return error;
3514	}
3515
/*
 * Convert open flags for use by the namei code: an access mode with both
 * low bits set (value 3) is turned into 2 by decrementing the flag word;
 * every other value is returned unchanged.  All bits above the access
 * mode are preserved in both cases.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}
3522
3523	static int may_o_create(struct mnt_idmap *idmap,
3524	const struct path dir, struct* dentry *dentry,
3525	umode_t mode)
3526	{
3527	int error = security_path_mknod(dir, dentry, mode, dev: `0`);
3528	if (error)
3529	return error;
3530
3531	if (!fsuidgid_has_mapping(sb: dir->dentry->d_sb, idmap))
3532	return -EOVERFLOW;
3533
3534	error = inode_permission(idmap, dir->dentry->d_inode,
3535	MAY_WRITE \| MAY_EXEC);
3536	if (error)
3537	return error;
3538
3539	return security_inode_create(dir: dir->dentry->d_inode, dentry, mode);
3540	}
3541
3542	/*
3543	* Attempt to atomically look up, create and open a file from a negative
3544	* dentry.
3545	*
3546	* Returns 0 if successful. The file will have been created and attached to
3547	* @file by the filesystem calling finish_open().
3548	*
3549	* If the file was looked up only or didn't need creating, FMODE_OPENED won't
3550	* be set. The caller will need to perform the open themselves. @path will
3551	* have been updated to point to the new dentry. This may be negative.
3552	*
3553	* Returns an error code otherwise.
3554	*/
3555	static struct dentry atomic_open(struct* nameidata nd, struct* dentry *dentry,
3556	struct file *file,
3557	int open_flag, umode_t mode)
3558	{
3559	struct dentry *const DENTRY_NOT_SET = (void *) -`1UL`;
3560	struct inode *dir = nd->path.dentry->d_inode;
3561	int error;
3562
3563	if (nd->flags & LOOKUP_DIRECTORY)
3564	open_flag \|= O_DIRECTORY;
3565
3566	file->f_path.dentry = DENTRY_NOT_SET;
3567	file->f_path.mnt = nd->path.mnt;
3568	error = dir->i_op->atomic_open(dir, dentry, file,
3569	open_to_namei_flags(flag: open_flag), mode);
3570	d_lookup_done(dentry);
3571	if (!error) {
3572	if (file->f_mode & FMODE_OPENED) {
3573	if (unlikely(dentry != file->f_path.dentry)) {
3574	dput(dentry);
3575	dentry = dget(dentry: file->f_path.dentry);
3576	}
3577	} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3578	error = -EIO;
3579	} else {
3580	if (file->f_path.dentry) {
3581	dput(dentry);
3582	dentry = file->f_path.dentry;
3583	}
3584	if (unlikely(d_is_negative(dentry)))
3585	error = -ENOENT;
3586	}
3587	}
3588	if (error) {
3589	dput(dentry);
3590	dentry = ERR_PTR(error);
3591	}
3592	return dentry;
3593	}
3594
3595	/*
3596	* Look up and maybe create and open the last component.
3597	*
3598	* Must be called with parent locked (exclusive in O_CREAT case).
3599	*
3600	* Returns 0 on success, that is, if
3601	* the file was successfully atomically created (if necessary) and opened, or
3602	* the file was not completely opened at this time, though lookups and
3603	* creations were performed.
3604	* These case are distinguished by presence of FMODE_OPENED on file->f_mode.
3605	* In the latter case dentry returned in @path might be negative if O_CREAT
3606	* hadn't been specified.
3607	*
3608	* An error code is returned on failure.
3609	*/
3610	static struct dentry lookup_open(struct* nameidata nd, struct* file *file,
3611	const struct open_flags *op,
3612	bool got_write)
3613	{
3614	struct mnt_idmap *idmap;
3615	struct dentry *dir = nd->path.dentry;
3616	struct inode *dir_inode = dir->d_inode;
3617	int open_flag = op->open_flag;
3618	struct dentry *dentry;
3619	int error, create_error = `0`;
3620	umode_t mode = op->mode;
3621	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
3622
3623	if (unlikely(IS_DEADDIR(dir_inode)))
3624	return ERR_PTR(error: -ENOENT);
3625
3626	file->f_mode &= ~FMODE_CREATED;
3627	dentry = d_lookup(dir, &nd->last);
3628	for (;;) {
3629	if (!dentry) {
3630	dentry = d_alloc_parallel(dir, &nd->last, &wq);
3631	if (IS_ERR(ptr: dentry))
3632	return dentry;
3633	}
3634	if (d_in_lookup(dentry))
3635	break;
3636
3637	error = d_revalidate(dir: dir_inode, name: &nd->last, dentry, flags: nd->flags);
3638	if (likely(error > `0`))
3639	break;
3640	if (error)
3641	goto out_dput;
3642	d_invalidate(dentry);
3643	dput(dentry);
3644	dentry = NULL;
3645	}
3646	if (dentry->d_inode) {
3647	/ Cached positive dentry: will open in f_op->open /
3648	return dentry;
3649	}
3650
3651	if (open_flag & O_CREAT)
3652	audit_inode(name: nd->name, dentry: dir, AUDIT_INODE_PARENT);
3653
3654	/*
3655	* Checking write permission is tricky, bacuse we don't know if we are
3656	* going to actually need it: O_CREAT opens should work as long as the
3657	* file exists. But checking existence breaks atomicity. The trick is
3658	* to check access and if not granted clear O_CREAT from the flags.
3659	*
3660	* Another problem is returing the "right" error value (e.g. for an
3661	* O_EXCL open we want to return EEXIST not EROFS).
3662	*/
3663	if (unlikely(!got_write))
3664	open_flag &= ~O_TRUNC;
3665	idmap = mnt_idmap(mnt: nd->path.mnt);
3666	if (open_flag & O_CREAT) {
3667	if (open_flag & O_EXCL)
3668	open_flag &= ~O_TRUNC;
3669	mode = vfs_prepare_mode(idmap, dir: dir->d_inode, mode, mask_perms: mode, type: mode);
3670	if (likely(got_write))
3671	create_error = may_o_create(idmap, dir: &nd->path,
3672	dentry, mode);
3673	else
3674	create_error = -EROFS;
3675	}
3676	if (create_error)
3677	open_flag &= ~O_CREAT;
3678	if (dir_inode->i_op->atomic_open) {
3679	dentry = atomic_open(nd, dentry, file, open_flag, mode);
3680	if (unlikely(create_error) && dentry == ERR_PTR(error: -ENOENT))
3681	dentry = ERR_PTR(error: create_error);
3682	return dentry;
3683	}
3684
3685	if (d_in_lookup(dentry)) {
3686	struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
3687	nd->flags);
3688	d_lookup_done(dentry);
3689	if (unlikely(res)) {
3690	if (IS_ERR(ptr: res)) {
3691	error = PTR_ERR(ptr: res);
3692	goto out_dput;
3693	}
3694	dput(dentry);
3695	dentry = res;
3696	}
3697	}
3698
3699	/ Negative dentry, just create the file /
3700	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3701	file->f_mode \|= FMODE_CREATED;
3702	audit_inode_child(parent: dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
3703	if (!dir_inode->i_op->create) {
3704	error = -EACCES;
3705	goto out_dput;
3706	}
3707
3708	error = dir_inode->i_op->create(idmap, dir_inode, dentry,
3709	mode, open_flag & O_EXCL);
3710	if (error)
3711	goto out_dput;
3712	}
3713	if (unlikely(create_error) && !dentry->d_inode) {
3714	error = create_error;
3715	goto out_dput;
3716	}
3717	return dentry;
3718
3719	out_dput:
3720	dput(dentry);
3721	return ERR_PTR(error);
3722	}
3723
3724	static inline bool trailing_slashes(struct nameidata *nd)
3725	{
3726	return (bool)nd->last.name[nd->last.len];
3727	}
3728
3729	static struct dentry lookup_fast_for_open(struct* nameidata nd, int* open_flag)
3730	{
3731	struct dentry *dentry;
3732
3733	if (open_flag & O_CREAT) {
3734	if (trailing_slashes(nd))
3735	return ERR_PTR(error: -EISDIR);
3736
3737	/ Don't bother on an O_EXCL create /
3738	if (open_flag & O_EXCL)
3739	return NULL;
3740	}
3741
3742	if (trailing_slashes(nd))
3743	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
3744
3745	dentry = lookup_fast(nd);
3746	if (IS_ERR_OR_NULL(ptr: dentry))
3747	return dentry;
3748
3749	if (open_flag & O_CREAT) {
3750	/ Discard negative dentries. Need inode_lock to do the create /
3751	if (!dentry->d_inode) {
3752	if (!(nd->flags & LOOKUP_RCU))
3753	dput(dentry);
3754	dentry = NULL;
3755	}
3756	}
3757	return dentry;
3758	}
3759
3760	static const char open_last_lookups(struct* nameidata *nd,
3761	struct file file, const* struct open_flags *op)
3762	{
3763	struct dentry *dir = nd->path.dentry;
3764	int open_flag = op->open_flag;
3765	bool got_write = false;
3766	struct dentry *dentry;
3767	const char *res;
3768
3769	nd->flags \|= op->intent;
3770
3771	if (nd->last_type != LAST_NORM) {
3772	if (nd->depth)
3773	put_link(nd);
3774	return handle_dots(nd, type: nd->last_type);
3775	}
3776
3777	/ We _can_ be in RCU mode here /
3778	dentry = lookup_fast_for_open(nd, open_flag);
3779	if (IS_ERR(ptr: dentry))
3780	return ERR_CAST(ptr: dentry);
3781
3782	if (likely(dentry))
3783	goto finish_lookup;
3784
3785	if (!(open_flag & O_CREAT)) {
3786	if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
3787	return ERR_PTR(error: -ECHILD);
3788	} else {
3789	if (nd->flags & LOOKUP_RCU) {
3790	if (!try_to_unlazy(nd))
3791	return ERR_PTR(error: -ECHILD);
3792	}
3793	}
3794
3795	if (open_flag & (O_CREAT \| O_TRUNC \| O_WRONLY \| O_RDWR)) {
3796	got_write = !mnt_want_write(mnt: nd->path.mnt);
3797	/*
3798	* do _not_ fail yet - we might not need that or fail with
3799	* a different error; let lookup_open() decide; we'll be
3800	* dropping this one anyway.
3801	*/
3802	}
3803	if (open_flag & O_CREAT)
3804	inode_lock(inode: dir->d_inode);
3805	else
3806	inode_lock_shared(inode: dir->d_inode);
3807	dentry = lookup_open(nd, file, op, got_write);
3808	if (!IS_ERR(ptr: dentry)) {
3809	if (file->f_mode & FMODE_CREATED)
3810	fsnotify_create(dir: dir->d_inode, dentry);
3811	if (file->f_mode & FMODE_OPENED)
3812	fsnotify_open(file);
3813	}
3814	if (open_flag & O_CREAT)
3815	inode_unlock(inode: dir->d_inode);
3816	else
3817	inode_unlock_shared(inode: dir->d_inode);
3818
3819	if (got_write)
3820	mnt_drop_write(mnt: nd->path.mnt);
3821
3822	if (IS_ERR(ptr: dentry))
3823	return ERR_CAST(ptr: dentry);
3824
3825	if (file->f_mode & (FMODE_OPENED \| FMODE_CREATED)) {
3826	dput(nd->path.dentry);
3827	nd->path.dentry = dentry;
3828	return NULL;
3829	}
3830
3831	finish_lookup:
3832	if (nd->depth)
3833	put_link(nd);
3834	res = step_into(nd, flags: WALK_TRAILING, dentry);
3835	if (unlikely(res))
3836	nd->flags &= ~(LOOKUP_OPEN\|LOOKUP_CREATE\|LOOKUP_EXCL);
3837	return res;
3838	}
3839
3840	/*
3841	* Handle the last step of open()
3842	*/
3843	static int do_open(struct nameidata *nd,
3844	struct file file, const* struct open_flags *op)
3845	{
3846	struct mnt_idmap *idmap;
3847	int open_flag = op->open_flag;
3848	bool do_truncate;
3849	int acc_mode;
3850	int error;
3851
3852	if (!(file->f_mode & (FMODE_OPENED \| FMODE_CREATED))) {
3853	error = complete_walk(nd);
3854	if (error)
3855	return error;
3856	}
3857	if (!(file->f_mode & FMODE_CREATED))
3858	audit_inode(name: nd->name, dentry: nd->path.dentry, aflags: `0`);
3859	idmap = mnt_idmap(mnt: nd->path.mnt);
3860	if (open_flag & O_CREAT) {
3861	if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
3862	return -EEXIST;
3863	if (d_is_dir(dentry: nd->path.dentry))
3864	return -EISDIR;
3865	error = may_create_in_sticky(idmap, nd,
3866	inode: d_backing_inode(upper: nd->path.dentry));
3867	if (unlikely(error))
3868	return error;
3869	}
3870	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(dentry: nd->path.dentry))
3871	return -ENOTDIR;
3872
3873	do_truncate = false;
3874	acc_mode = op->acc_mode;
3875	if (file->f_mode & FMODE_CREATED) {
3876	/ Don't check for write permission, don't truncate /
3877	open_flag &= ~O_TRUNC;
3878	acc_mode = `0`;
3879	} else if (d_is_reg(dentry: nd->path.dentry) && open_flag & O_TRUNC) {
3880	error = mnt_want_write(mnt: nd->path.mnt);
3881	if (error)
3882	return error;
3883	do_truncate = true;
3884	}
3885	error = may_open(idmap, path: &nd->path, acc_mode, flag: open_flag);
3886	if (!error && !(file->f_mode & FMODE_OPENED))
3887	error = vfs_open(&nd->path, file);
3888	if (!error)
3889	error = security_file_post_open(file, mask: op->acc_mode);
3890	if (!error && do_truncate)
3891	error = handle_truncate(idmap, filp: file);
3892	if (unlikely(error > `0`)) {
3893	WARN_ON(`1`);
3894	error = -EINVAL;
3895	}
3896	if (do_truncate)
3897	mnt_drop_write(mnt: nd->path.mnt);
3898	return error;
3899	}
3900
3901	/**
3902	* vfs_tmpfile - create tmpfile
3903	* @idmap: idmap of the mount the inode was found from
3904	* @parentpath: pointer to the path of the base directory
3905	* @file: file descriptor of the new tmpfile
3906	* @mode: mode of the new tmpfile
3907	*
3908	* Create a temporary file.
3909	*
3910	* If the inode has been found through an idmapped mount the idmap of
3911	* the vfsmount must be passed through @idmap. This function will then take
3912	* care to map the inode according to @idmap before checking permissions.
3913	* On non-idmapped mounts or if permission checking is to be performed on the
3914	* raw inode simply pass @nop_mnt_idmap.
3915	*/
3916	int vfs_tmpfile(struct mnt_idmap *idmap,
3917	const struct path *parentpath,
3918	struct file *file, umode_t mode)
3919	{
3920	struct dentry *child;
3921	struct inode *dir = d_inode(dentry: parentpath->dentry);
3922	struct inode *inode;
3923	int error;
3924	int open_flag = file->f_flags;
3925
3926	/ we want directory to be writable /
3927	error = inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3928	if (error)
3929	return error;
3930	if (!dir->i_op->tmpfile)
3931	return -EOPNOTSUPP;
3932	child = d_alloc(parentpath->dentry, &slash_name);
3933	if (unlikely(!child))
3934	return -ENOMEM;
3935	file->f_path.mnt = parentpath->mnt;
3936	file->f_path.dentry = child;
3937	mode = vfs_prepare_mode(idmap, dir, mode, mask_perms: mode, type: mode);
3938	error = dir->i_op->tmpfile(idmap, dir, file, mode);
3939	dput(child);
3940	if (file->f_mode & FMODE_OPENED)
3941	fsnotify_open(file);
3942	if (error)
3943	return error;
3944	/ Don't check for other permissions, the inode was just created /
3945	error = may_open(idmap, path: &file->f_path, acc_mode: `0`, flag: file->f_flags);
3946	if (error)
3947	return error;
3948	inode = file_inode(f: file);
3949	if (!(open_flag & O_EXCL)) {
3950	spin_lock(lock: &inode->i_lock);
3951	inode->i_state \|= I_LINKABLE;
3952	spin_unlock(lock: &inode->i_lock);
3953	}
3954	security_inode_post_create_tmpfile(idmap, inode);
3955	return `0`;
3956	}
3957
3958	/**
3959	* kernel_tmpfile_open - open a tmpfile for kernel internal use
3960	* @idmap: idmap of the mount the inode was found from
3961	* @parentpath: path of the base directory
3962	* @mode: mode of the new tmpfile
3963	* @open_flag: flags
3964	* @cred: credentials for open
3965	*
3966	* Create and open a temporary file. The file is not accounted in nr_files,
3967	* hence this is only for kernel internal use, and must not be installed into
3968	* file tables or such.
3969	*/
3970	struct file kernel_tmpfile_open(struct* mnt_idmap *idmap,
3971	const struct path *parentpath,
3972	umode_t mode, int open_flag,
3973	const struct cred *cred)
3974	{
3975	struct file *file;
3976	int error;
3977
3978	file = alloc_empty_file_noaccount(flags: open_flag, cred);
3979	if (IS_ERR(ptr: file))
3980	return file;
3981
3982	error = vfs_tmpfile(idmap, parentpath, file, mode);
3983	if (error) {
3984	fput(file);
3985	file = ERR_PTR(error);
3986	}
3987	return file;
3988	}
3989	EXPORT_SYMBOL(kernel_tmpfile_open);
3990
3991	static int do_tmpfile(struct nameidata nd, unsigned* flags,
3992	const struct open_flags *op,
3993	struct file *file)
3994	{
3995	struct path path;
3996	int error = path_lookupat(nd, flags: flags \| LOOKUP_DIRECTORY, path: &path);
3997
3998	if (unlikely(error))
3999	return error;
4000	error = mnt_want_write(mnt: path.mnt);
4001	if (unlikely(error))
4002	goto out;
4003	error = vfs_tmpfile(idmap: mnt_idmap(mnt: path.mnt), parentpath: &path, file, mode: op->mode);
4004	if (error)
4005	goto out2;
4006	audit_inode(name: nd->name, dentry: file->f_path.dentry, aflags: `0`);
4007	out2:
4008	mnt_drop_write(mnt: path.mnt);
4009	out:
4010	path_put(&path);
4011	return error;
4012	}
4013
4014	static int do_o_path(struct nameidata nd, unsigned* flags, struct file *file)
4015	{
4016	struct path path;
4017	int error = path_lookupat(nd, flags, path: &path);
4018	if (!error) {
4019	audit_inode(name: nd->name, dentry: path.dentry, aflags: `0`);
4020	error = vfs_open(&path, file);
4021	path_put(&path);
4022	}
4023	return error;
4024	}
4025
4026	static struct file path_openat(struct* nameidata *nd,
4027	const struct open_flags op, unsigned* flags)
4028	{
4029	struct file *file;
4030	int error;
4031
4032	file = alloc_empty_file(flags: op->open_flag, current_cred());
4033	if (IS_ERR(ptr: file))
4034	return file;
4035
4036	if (unlikely(file->f_flags & __O_TMPFILE)) {
4037	error = do_tmpfile(nd, flags, op, file);
4038	} else if (unlikely(file->f_flags & O_PATH)) {
4039	error = do_o_path(nd, flags, file);
4040	} else {
4041	const char *s = path_init(nd, flags);
4042	while (!(error = link_path_walk(name: s, nd)) &&
4043	(s = open_last_lookups(nd, file, op)) != NULL)
4044	;
4045	if (!error)
4046	error = do_open(nd, file, op);
4047	terminate_walk(nd);
4048	}
4049	if (likely(!error)) {
4050	if (likely(file->f_mode & FMODE_OPENED))
4051	return file;
4052	WARN_ON(`1`);
4053	error = -EINVAL;
4054	}
4055	fput_close(file);
4056	if (error == -EOPENSTALE) {
4057	if (flags & LOOKUP_RCU)
4058	error = -ECHILD;
4059	else
4060	error = -ESTALE;
4061	}
4062	return ERR_PTR(error);
4063	}
4064
4065	struct file do_filp_open(int* dfd, struct filename *pathname,
4066	const struct open_flags *op)
4067	{
4068	struct nameidata nd;
4069	int flags = op->lookup_flags;
4070	struct file *filp;
4071
4072	set_nameidata(p: &nd, dfd, name: pathname, NULL);
4073	filp = path_openat(nd: &nd, op, flags: flags \| LOOKUP_RCU);
4074	if (unlikely(filp == ERR_PTR(-ECHILD)))
4075	filp = path_openat(nd: &nd, op, flags);
4076	if (unlikely(filp == ERR_PTR(-ESTALE)))
4077	filp = path_openat(nd: &nd, op, flags: flags \| LOOKUP_REVAL);
4078	restore_nameidata();
4079	return filp;
4080	}
4081
4082	struct file do_file_open_root(const* struct path *root,
4083	const char name, const* struct open_flags *op)
4084	{
4085	struct nameidata nd;
4086	struct file *file;
4087	struct filename *filename;
4088	int flags = op->lookup_flags;
4089
4090	if (d_is_symlink(dentry: root->dentry) && op->intent & LOOKUP_OPEN)
4091	return ERR_PTR(error: -ELOOP);
4092
4093	filename = getname_kernel(name);
4094	if (IS_ERR(ptr: filename))
4095	return ERR_CAST(ptr: filename);
4096
4097	set_nameidata(p: &nd, dfd: -`1`, name: filename, root);
4098	file = path_openat(nd: &nd, op, flags: flags \| LOOKUP_RCU);
4099	if (unlikely(file == ERR_PTR(-ECHILD)))
4100	file = path_openat(nd: &nd, op, flags);
4101	if (unlikely(file == ERR_PTR(-ESTALE)))
4102	file = path_openat(nd: &nd, op, flags: flags \| LOOKUP_REVAL);
4103	restore_nameidata();
4104	putname(filename);
4105	return file;
4106	}
4107
4108	static struct dentry filename_create(int* dfd, struct filename *name,
4109	struct path path, unsigned* int lookup_flags)
4110	{
4111	struct dentry *dentry = ERR_PTR(error: -EEXIST);
4112	struct qstr last;
4113	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
4114	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
4115	unsigned int create_flags = LOOKUP_CREATE \| LOOKUP_EXCL;
4116	int type;
4117	int err2;
4118	int error;
4119
4120	error = filename_parentat(dfd, name, flags: reval_flag, parent: path, last: &last, type: &type);
4121	if (error)
4122	return ERR_PTR(error);
4123
4124	/*
4125	* Yucky last component or no last component at all?
4126	* (foo/., foo/.., /////)
4127	*/
4128	if (unlikely(type != LAST_NORM))
4129	goto out;
4130
4131	/ don't fail immediately if it's r/o, at least try to report other errors /
4132	err2 = mnt_want_write(mnt: path->mnt);
4133	/*
4134	* Do the final lookup. Suppress 'create' if there is a trailing
4135	* '/', and a directory wasn't requested.
4136	*/
4137	if (last.name[last.len] && !want_dir)
4138	create_flags &= ~LOOKUP_CREATE;
4139	inode_lock_nested(inode: path->dentry->d_inode, subclass: I_MUTEX_PARENT);
4140	dentry = lookup_one_qstr_excl(&last, path->dentry,
4141	reval_flag \| create_flags);
4142	if (IS_ERR(ptr: dentry))
4143	goto unlock;
4144
4145	if (unlikely(err2)) {
4146	error = err2;
4147	goto fail;
4148	}
4149	return dentry;
4150	fail:
4151	dput(dentry);
4152	dentry = ERR_PTR(error);
4153	unlock:
4154	inode_unlock(inode: path->dentry->d_inode);
4155	if (!err2)
4156	mnt_drop_write(mnt: path->mnt);
4157	out:
4158	path_put(path);
4159	return dentry;
4160	}
4161
/*
 * kern_path_create - filename_create() for a kernel-space pathname
 * @dfd: directory file descriptor, passed through
 * @pathname: kernel-space pathname
 * @path: filled in by filename_create()
 * @lookup_flags: passed through to filename_create()
 *
 * Converts @pathname with getname_kernel(), calls filename_create() and
 * releases the struct filename again.  Returns whatever
 * filename_create() returned.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *filename = getname_kernel(pathname);
	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);

	putname(filename);
	return res;
}
EXPORT_SYMBOL(kern_path_create);
4172
4173	void done_path_create(struct path path, struct* dentry *dentry)
4174	{
4175	if (!IS_ERR(ptr: dentry))
4176	dput(dentry);
4177	inode_unlock(inode: path->dentry->d_inode);
4178	mnt_drop_write(mnt: path->mnt);
4179	path_put(path);
4180	}
4181	EXPORT_SYMBOL(done_path_create);
4182
4183	inline struct dentry user_path_create(int* dfd, const char __user *pathname,
4184	struct path path, unsigned* int lookup_flags)
4185	{
4186	struct filename *filename = getname(name: pathname);
4187	struct dentry *res = filename_create(dfd, name: filename, path, lookup_flags);
4188
4189	putname(filename);
4190	return res;
4191	}
4192	EXPORT_SYMBOL(user_path_create);
4193
4194	/**
4195	* vfs_mknod - create device node or file
4196	* @idmap: idmap of the mount the inode was found from
4197	* @dir: inode of the parent directory
4198	* @dentry: dentry of the child device node
4199	* @mode: mode of the child device node
4200	* @dev: device number of device to create
4201	*
4202	* Create a device node or file.
4203	*
4204	* If the inode has been found through an idmapped mount the idmap of
4205	* the vfsmount must be passed through @idmap. This function will then take
4206	* care to map the inode according to @idmap before checking permissions.
4207	* On non-idmapped mounts or if permission checking is to be performed on the
4208	* raw inode simply pass @nop_mnt_idmap.
4209	*/
4210	int vfs_mknod(struct mnt_idmap idmap, struct* inode *dir,
4211	struct dentry *dentry, umode_t mode, dev_t dev)
4212	{
4213	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
4214	int error = may_create(idmap, dir, child: dentry);
4215
4216	if (error)
4217	return error;
4218
4219	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) && !is_whiteout &&
4220	!capable(CAP_MKNOD))
4221	return -EPERM;
4222
4223	if (!dir->i_op->mknod)
4224	return -EPERM;
4225
4226	mode = vfs_prepare_mode(idmap, dir, mode, mask_perms: mode, type: mode);
4227	error = devcgroup_inode_mknod(mode, dev);
4228	if (error)
4229	return error;
4230
4231	error = security_inode_mknod(dir, dentry, mode, dev);
4232	if (error)
4233	return error;
4234
4235	error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
4236	if (!error)
4237	fsnotify_create(dir, dentry);
4238	return error;
4239	}
4240	EXPORT_SYMBOL(vfs_mknod);
4241
4242	static int may_mknod(umode_t mode)
4243	{
4244	switch (mode & S_IFMT) {
4245	case S_IFREG:
4246	case S_IFCHR:
4247	case S_IFBLK:
4248	case S_IFIFO:
4249	case S_IFSOCK:
4250	case `0`: / zero mode translates to S_IFREG /
4251	return `0`;
4252	case S_IFDIR:
4253	return -EPERM;
4254	default:
4255	return -EINVAL;
4256	}
4257	}
4258
4259	static int do_mknodat(int dfd, struct filename *name, umode_t mode,
4260	unsigned int dev)
4261	{
4262	struct mnt_idmap *idmap;
4263	struct dentry *dentry;
4264	struct path path;
4265	int error;
4266	unsigned int lookup_flags = `0`;
4267
4268	error = may_mknod(mode);
4269	if (error)
4270	goto out1;
4271	retry:
4272	dentry = filename_create(dfd, name, path: &path, lookup_flags);
4273	error = PTR_ERR(ptr: dentry);
4274	if (IS_ERR(ptr: dentry))
4275	goto out1;
4276
4277	error = security_path_mknod(dir: &path, dentry,
4278	mode: mode_strip_umask(dir: path.dentry->d_inode, mode), dev);
4279	if (error)
4280	goto out2;
4281
4282	idmap = mnt_idmap(mnt: path.mnt);
4283	switch (mode & S_IFMT) {
4284	case `0`: case S_IFREG:
4285	error = vfs_create(idmap, path.dentry->d_inode,
4286	dentry, mode, true);
4287	if (!error)
4288	security_path_post_mknod(idmap, dentry);
4289	break;
4290	case S_IFCHR: case S_IFBLK:
4291	error = vfs_mknod(idmap, path.dentry->d_inode,
4292	dentry, mode, new_decode_dev(dev));
4293	break;
4294	case S_IFIFO: case S_IFSOCK:
4295	error = vfs_mknod(idmap, path.dentry->d_inode,
4296	dentry, mode, `0`);
4297	break;
4298	}
4299	out2:
4300	done_path_create(&path, dentry);
4301	if (retry_estale(error, flags: lookup_flags)) {
4302	lookup_flags \|= LOOKUP_REVAL;
4303	goto retry;
4304	}
4305	out1:
4306	putname(name);
4307	return error;
4308	}
4309
4310	SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
4311	unsigned int, dev)
4312	{
4313	return do_mknodat(dfd, name: getname(name: filename), mode, dev);
4314	}
4315
4316	SYSCALL_DEFINE3(mknod, const char __user , filename, umode_t, mode, unsigned*, dev)
4317	{
4318	return do_mknodat(AT_FDCWD, name: getname(name: filename), mode, dev);
4319	}
4320
4321	/**
4322	* vfs_mkdir - create directory returning correct dentry if possible
4323	* @idmap: idmap of the mount the inode was found from
4324	* @dir: inode of the parent directory
4325	* @dentry: dentry of the child directory
4326	* @mode: mode of the child directory
4327	*
4328	* Create a directory.
4329	*
4330	* If the inode has been found through an idmapped mount the idmap of
4331	* the vfsmount must be passed through @idmap. This function will then take
4332	* care to map the inode according to @idmap before checking permissions.
4333	* On non-idmapped mounts or if permission checking is to be performed on the
4334	* raw inode simply pass @nop_mnt_idmap.
4335	*
4336	* In the event that the filesystem does not use the *@dentry but leaves it
4337	* negative or unhashes it and possibly splices a different one returning it,
4338	* the original dentry is dput() and the alternate is returned.
4339	*
4340	* In case of an error the dentry is dput() and an ERR_PTR() is returned.
4341	*/
4342	struct dentry vfs_mkdir(struct* mnt_idmap idmap, struct* inode *dir,
4343	struct dentry *dentry, umode_t mode)
4344	{
4345	int error;
4346	unsigned max_links = dir->i_sb->s_max_links;
4347	struct dentry *de;
4348
4349	error = may_create(idmap, dir, child: dentry);
4350	if (error)
4351	goto err;
4352
4353	error = -EPERM;
4354	if (!dir->i_op->mkdir)
4355	goto err;
4356
4357	mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO \| S_ISVTX, type: `0`);
4358	error = security_inode_mkdir(dir, dentry, mode);
4359	if (error)
4360	goto err;
4361
4362	error = -EMLINK;
4363	if (max_links && dir->i_nlink >= max_links)
4364	goto err;
4365
4366	de = dir->i_op->mkdir(idmap, dir, dentry, mode);
4367	error = PTR_ERR(ptr: de);
4368	if (IS_ERR(ptr: de))
4369	goto err;
4370	if (de) {
4371	dput(dentry);
4372	dentry = de;
4373	}
4374	fsnotify_mkdir(dir, dentry);
4375	return dentry;
4376
4377	err:
4378	dput(dentry);
4379	return ERR_PTR(error);
4380	}
4381	EXPORT_SYMBOL(vfs_mkdir);
4382
4383	int do_mkdirat(int dfd, struct filename *name, umode_t mode)
4384	{
4385	struct dentry *dentry;
4386	struct path path;
4387	int error;
4388	unsigned int lookup_flags = LOOKUP_DIRECTORY;
4389
4390	retry:
4391	dentry = filename_create(dfd, name, path: &path, lookup_flags);
4392	error = PTR_ERR(ptr: dentry);
4393	if (IS_ERR(ptr: dentry))
4394	goto out_putname;
4395
4396	error = security_path_mkdir(dir: &path, dentry,
4397	mode: mode_strip_umask(dir: path.dentry->d_inode, mode));
4398	if (!error) {
4399	dentry = vfs_mkdir(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4400	dentry, mode);
4401	if (IS_ERR(ptr: dentry))
4402	error = PTR_ERR(ptr: dentry);
4403	}
4404	done_path_create(&path, dentry);
4405	if (retry_estale(error, flags: lookup_flags)) {
4406	lookup_flags \|= LOOKUP_REVAL;
4407	goto retry;
4408	}
4409	out_putname:
4410	putname(name);
4411	return error;
4412	}
4413
4414	SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
4415	{
4416	return do_mkdirat(dfd, name: getname(name: pathname), mode);
4417	}
4418
4419	SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
4420	{
4421	return do_mkdirat(AT_FDCWD, name: getname(name: pathname), mode);
4422	}
4423
4424	/**
4425	* vfs_rmdir - remove directory
4426	* @idmap: idmap of the mount the inode was found from
4427	* @dir: inode of the parent directory
4428	* @dentry: dentry of the child directory
4429	*
4430	* Remove a directory.
4431	*
4432	* If the inode has been found through an idmapped mount the idmap of
4433	* the vfsmount must be passed through @idmap. This function will then take
4434	* care to map the inode according to @idmap before checking permissions.
4435	* On non-idmapped mounts or if permission checking is to be performed on the
4436	* raw inode simply pass @nop_mnt_idmap.
4437	*/
4438	int vfs_rmdir(struct mnt_idmap idmap, struct* inode *dir,
4439	struct dentry *dentry)
4440	{
4441	int error = may_delete(idmap, dir, victim: dentry, isdir: `1`);
4442
4443	if (error)
4444	return error;
4445
4446	if (!dir->i_op->rmdir)
4447	return -EPERM;
4448
4449	dget(dentry);
4450	inode_lock(inode: dentry->d_inode);
4451
4452	error = -EBUSY;
4453	if (is_local_mountpoint(dentry) \|\|
4454	(dentry->d_inode->i_flags & S_KERNEL_FILE))
4455	goto out;
4456
4457	error = security_inode_rmdir(dir, dentry);
4458	if (error)
4459	goto out;
4460
4461	error = dir->i_op->rmdir(dir, dentry);
4462	if (error)
4463	goto out;
4464
4465	shrink_dcache_parent(dentry);
4466	dentry->d_inode->i_flags \|= S_DEAD;
4467	dont_mount(dentry);
4468	detach_mounts(dentry);
4469
4470	out:
4471	inode_unlock(inode: dentry->d_inode);
4472	dput(dentry);
4473	if (!error)
4474	d_delete_notify(dir, dentry);
4475	return error;
4476	}
4477	EXPORT_SYMBOL(vfs_rmdir);
4478
4479	int do_rmdir(int dfd, struct filename *name)
4480	{
4481	int error;
4482	struct dentry *dentry;
4483	struct path path;
4484	struct qstr last;
4485	int type;
4486	unsigned int lookup_flags = `0`;
4487	retry:
4488	error = filename_parentat(dfd, name, flags: lookup_flags, parent: &path, last: &last, type: &type);
4489	if (error)
4490	goto exit1;
4491
4492	switch (type) {
4493	case LAST_DOTDOT:
4494	error = -ENOTEMPTY;
4495	goto exit2;
4496	case LAST_DOT:
4497	error = -EINVAL;
4498	goto exit2;
4499	case LAST_ROOT:
4500	error = -EBUSY;
4501	goto exit2;
4502	}
4503
4504	error = mnt_want_write(mnt: path.mnt);
4505	if (error)
4506	goto exit2;
4507
4508	inode_lock_nested(inode: path.dentry->d_inode, subclass: I_MUTEX_PARENT);
4509	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
4510	error = PTR_ERR(ptr: dentry);
4511	if (IS_ERR(ptr: dentry))
4512	goto exit3;
4513	error = security_path_rmdir(dir: &path, dentry);
4514	if (error)
4515	goto exit4;
4516	error = vfs_rmdir(mnt_idmap(mnt: path.mnt), path.dentry->d_inode, dentry);
4517	exit4:
4518	dput(dentry);
4519	exit3:
4520	inode_unlock(inode: path.dentry->d_inode);
4521	mnt_drop_write(mnt: path.mnt);
4522	exit2:
4523	path_put(&path);
4524	if (retry_estale(error, flags: lookup_flags)) {
4525	lookup_flags \|= LOOKUP_REVAL;
4526	goto retry;
4527	}
4528	exit1:
4529	putname(name);
4530	return error;
4531	}
4532
4533	SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
4534	{
4535	return do_rmdir(AT_FDCWD, name: getname(name: pathname));
4536	}
4537
4538	/**
4539	* vfs_unlink - unlink a filesystem object
4540	* @idmap: idmap of the mount the inode was found from
4541	* @dir: parent directory
4542	* @dentry: victim
4543	* @delegated_inode: returns victim inode, if the inode is delegated.
4544	*
4545	* The caller must hold dir->i_mutex.
4546	*
4547	* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
4548	* return a reference to the inode in delegated_inode. The caller
4549	* should then break the delegation on that inode and retry. Because
4550	* breaking a delegation may take a long time, the caller should drop
4551	* dir->i_mutex before doing so.
4552	*
4553	* Alternatively, a caller may pass NULL for delegated_inode. This may
4554	* be appropriate for callers that expect the underlying filesystem not
4555	* to be NFS exported.
4556	*
4557	* If the inode has been found through an idmapped mount the idmap of
4558	* the vfsmount must be passed through @idmap. This function will then take
4559	* care to map the inode according to @idmap before checking permissions.
4560	* On non-idmapped mounts or if permission checking is to be performed on the
4561	* raw inode simply pass @nop_mnt_idmap.
4562	*/
4563	int vfs_unlink(struct mnt_idmap idmap, struct* inode *dir,
4564	struct dentry dentry, struct* inode **delegated_inode)
4565	{
4566	struct inode *target = dentry->d_inode;
4567	int error = may_delete(idmap, dir, victim: dentry, isdir: `0`);
4568
4569	if (error)
4570	return error;
4571
4572	if (!dir->i_op->unlink)
4573	return -EPERM;
4574
4575	inode_lock(inode: target);
4576	if (IS_SWAPFILE(target))
4577	error = -EPERM;
4578	else if (is_local_mountpoint(dentry))
4579	error = -EBUSY;
4580	else {
4581	error = security_inode_unlink(dir, dentry);
4582	if (!error) {
4583	error = try_break_deleg(inode: target, delegated_inode);
4584	if (error)
4585	goto out;
4586	error = dir->i_op->unlink(dir, dentry);
4587	if (!error) {
4588	dont_mount(dentry);
4589	detach_mounts(dentry);
4590	}
4591	}
4592	}
4593	out:
4594	inode_unlock(inode: target);
4595
4596	/ We don't d_delete() NFS sillyrenamed files--they still exist. /
4597	if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
4598	fsnotify_unlink(dir, dentry);
4599	} else if (!error) {
4600	fsnotify_link_count(inode: target);
4601	d_delete_notify(dir, dentry);
4602	}
4603
4604	return error;
4605	}
4606	EXPORT_SYMBOL(vfs_unlink);
4607
4608	/*
4609	* Make sure that the actual truncation of the file will occur outside its
4610	* directory's i_mutex. Truncate can take a long time if there is a lot of
4611	* writeout happening, and we don't want to prevent access to the directory
4612	* while waiting on the I/O.
4613	*/
4614	int do_unlinkat(int dfd, struct filename *name)
4615	{
4616	int error;
4617	struct dentry *dentry;
4618	struct path path;
4619	struct qstr last;
4620	int type;
4621	struct inode *inode = NULL;
4622	struct inode *delegated_inode = NULL;
4623	unsigned int lookup_flags = `0`;
4624	retry:
4625	error = filename_parentat(dfd, name, flags: lookup_flags, parent: &path, last: &last, type: &type);
4626	if (error)
4627	goto exit1;
4628
4629	error = -EISDIR;
4630	if (type != LAST_NORM)
4631	goto exit2;
4632
4633	error = mnt_want_write(mnt: path.mnt);
4634	if (error)
4635	goto exit2;
4636	retry_deleg:
4637	inode_lock_nested(inode: path.dentry->d_inode, subclass: I_MUTEX_PARENT);
4638	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
4639	error = PTR_ERR(ptr: dentry);
4640	if (!IS_ERR(ptr: dentry)) {
4641
4642	/ Why not before? Because we want correct error value /
4643	if (last.name[last.len])
4644	goto slashes;
4645	inode = dentry->d_inode;
4646	ihold(inode);
4647	error = security_path_unlink(dir: &path, dentry);
4648	if (error)
4649	goto exit3;
4650	error = vfs_unlink(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4651	dentry, &delegated_inode);
4652	exit3:
4653	dput(dentry);
4654	}
4655	inode_unlock(inode: path.dentry->d_inode);
4656	if (inode)
4657	iput(inode); / truncate the inode here /
4658	inode = NULL;
4659	if (delegated_inode) {
4660	error = break_deleg_wait(delegated_inode: &delegated_inode);
4661	if (!error)
4662	goto retry_deleg;
4663	}
4664	mnt_drop_write(mnt: path.mnt);
4665	exit2:
4666	path_put(&path);
4667	if (retry_estale(error, flags: lookup_flags)) {
4668	lookup_flags \|= LOOKUP_REVAL;
4669	inode = NULL;
4670	goto retry;
4671	}
4672	exit1:
4673	putname(name);
4674	return error;
4675
4676	slashes:
4677	if (d_is_dir(dentry))
4678	error = -EISDIR;
4679	else
4680	error = -ENOTDIR;
4681	goto exit3;
4682	}
4683
4684	SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user , pathname, int*, flag)
4685	{
4686	if ((flag & ~AT_REMOVEDIR) != `0`)
4687	return -EINVAL;
4688
4689	if (flag & AT_REMOVEDIR)
4690	return do_rmdir(dfd, name: getname(name: pathname));
4691	return do_unlinkat(dfd, name: getname(name: pathname));
4692	}
4693
4694	SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4695	{
4696	return do_unlinkat(AT_FDCWD, name: getname(name: pathname));
4697	}
4698
4699	/**
4700	* vfs_symlink - create symlink
4701	* @idmap: idmap of the mount the inode was found from
4702	* @dir: inode of the parent directory
4703	* @dentry: dentry of the child symlink file
4704	* @oldname: name of the file to link to
4705	*
4706	* Create a symlink.
4707	*
4708	* If the inode has been found through an idmapped mount the idmap of
4709	* the vfsmount must be passed through @idmap. This function will then take
4710	* care to map the inode according to @idmap before checking permissions.
4711	* On non-idmapped mounts or if permission checking is to be performed on the
4712	* raw inode simply pass @nop_mnt_idmap.
4713	*/
4714	int vfs_symlink(struct mnt_idmap idmap, struct* inode *dir,
4715	struct dentry dentry, const* char *oldname)
4716	{
4717	int error;
4718
4719	error = may_create(idmap, dir, child: dentry);
4720	if (error)
4721	return error;
4722
4723	if (!dir->i_op->symlink)
4724	return -EPERM;
4725
4726	error = security_inode_symlink(dir, dentry, old_name: oldname);
4727	if (error)
4728	return error;
4729
4730	error = dir->i_op->symlink(idmap, dir, dentry, oldname);
4731	if (!error)
4732	fsnotify_create(dir, dentry);
4733	return error;
4734	}
4735	EXPORT_SYMBOL(vfs_symlink);
4736
4737	int do_symlinkat(struct filename from, int* newdfd, struct filename *to)
4738	{
4739	int error;
4740	struct dentry *dentry;
4741	struct path path;
4742	unsigned int lookup_flags = `0`;
4743
4744	if (IS_ERR(ptr: from)) {
4745	error = PTR_ERR(ptr: from);
4746	goto out_putnames;
4747	}
4748	retry:
4749	dentry = filename_create(dfd: newdfd, name: to, path: &path, lookup_flags);
4750	error = PTR_ERR(ptr: dentry);
4751	if (IS_ERR(ptr: dentry))
4752	goto out_putnames;
4753
4754	error = security_path_symlink(dir: &path, dentry, old_name: from->name);
4755	if (!error)
4756	error = vfs_symlink(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4757	dentry, from->name);
4758	done_path_create(&path, dentry);
4759	if (retry_estale(error, flags: lookup_flags)) {
4760	lookup_flags \|= LOOKUP_REVAL;
4761	goto retry;
4762	}
4763	out_putnames:
4764	putname(to);
4765	putname(from);
4766	return error;
4767	}
4768
4769	SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4770	int, newdfd, const char __user *, newname)
4771	{
4772	return do_symlinkat(from: getname(name: oldname), newdfd, to: getname(name: newname));
4773	}
4774
4775	SYSCALL_DEFINE2(symlink, const char __user , oldname, const* char __user *, newname)
4776	{
4777	return do_symlinkat(from: getname(name: oldname), AT_FDCWD, to: getname(name: newname));
4778	}
4779
4780	/**
4781	* vfs_link - create a new link
4782	* @old_dentry: object to be linked
4783	* @idmap: idmap of the mount
4784	* @dir: new parent
4785	* @new_dentry: where to create the new link
4786	* @delegated_inode: returns inode needing a delegation break
4787	*
4788	* The caller must hold dir->i_mutex
4789	*
4790	* If vfs_link discovers a delegation on the to-be-linked file in need
4791	* of breaking, it will return -EWOULDBLOCK and return a reference to the
4792	* inode in delegated_inode. The caller should then break the delegation
4793	* and retry. Because breaking a delegation may take a long time, the
4794	* caller should drop the i_mutex before doing so.
4795	*
4796	* Alternatively, a caller may pass NULL for delegated_inode. This may
4797	* be appropriate for callers that expect the underlying filesystem not
4798	* to be NFS exported.
4799	*
4800	* If the inode has been found through an idmapped mount the idmap of
4801	* the vfsmount must be passed through @idmap. This function will then take
4802	* care to map the inode according to @idmap before checking permissions.
4803	* On non-idmapped mounts or if permission checking is to be performed on the
4804	* raw inode simply pass @nop_mnt_idmap.
4805	*/
4806	int vfs_link(struct dentry old_dentry, struct* mnt_idmap *idmap,
4807	struct inode dir, struct* dentry *new_dentry,
4808	struct inode **delegated_inode)
4809	{
4810	struct inode *inode = old_dentry->d_inode;
4811	unsigned max_links = dir->i_sb->s_max_links;
4812	int error;
4813
4814	if (!inode)
4815	return -ENOENT;
4816
4817	error = may_create(idmap, dir, child: new_dentry);
4818	if (error)
4819	return error;
4820
4821	if (dir->i_sb != inode->i_sb)
4822	return -EXDEV;
4823
4824	/*
4825	* A link to an append-only or immutable file cannot be created.
4826	*/
4827	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
4828	return -EPERM;
4829	/*
4830	* Updating the link count will likely cause i_uid and i_gid to
4831	* be writen back improperly if their true value is unknown to
4832	* the vfs.
4833	*/
4834	if (HAS_UNMAPPED_ID(idmap, inode))
4835	return -EPERM;
4836	if (!dir->i_op->link)
4837	return -EPERM;
4838	if (S_ISDIR(inode->i_mode))
4839	return -EPERM;
4840
4841	error = security_inode_link(old_dentry, dir, new_dentry);
4842	if (error)
4843	return error;
4844
4845	inode_lock(inode);
4846	/ Make sure we don't allow creating hardlink to an unlinked file /
4847	if (inode->i_nlink == `0` && !(inode->i_state & I_LINKABLE))
4848	error = -ENOENT;
4849	else if (max_links && inode->i_nlink >= max_links)
4850	error = -EMLINK;
4851	else {
4852	error = try_break_deleg(inode, delegated_inode);
4853	if (!error)
4854	error = dir->i_op->link(old_dentry, dir, new_dentry);
4855	}
4856
4857	if (!error && (inode->i_state & I_LINKABLE)) {
4858	spin_lock(lock: &inode->i_lock);
4859	inode->i_state &= ~I_LINKABLE;
4860	spin_unlock(lock: &inode->i_lock);
4861	}
4862	inode_unlock(inode);
4863	if (!error)
4864	fsnotify_link(dir, inode, new_dentry);
4865	return error;
4866	}
4867	EXPORT_SYMBOL(vfs_link);
4868
4869	/*
4870	* Hardlinks are often used in delicate situations. We avoid
4871	* security-related surprises by not following symlinks on the
4872	* newname. --KAB
4873	*
4874	* We don't follow them on the oldname either to be compatible
4875	* with linux 2.0, and to avoid hard-linking to directories
4876	* and other special files. --ADM
4877	*/
4878	int do_linkat(int olddfd, struct filename old, int* newdfd,
4879	struct filename new, int* flags)
4880	{
4881	struct mnt_idmap *idmap;
4882	struct dentry *new_dentry;
4883	struct path old_path, new_path;
4884	struct inode *delegated_inode = NULL;
4885	int how = `0`;
4886	int error;
4887
4888	if ((flags & ~(AT_SYMLINK_FOLLOW \| AT_EMPTY_PATH)) != `0`) {
4889	error = -EINVAL;
4890	goto out_putnames;
4891	}
4892	/*
4893	* To use null names we require CAP_DAC_READ_SEARCH or
4894	* that the open-time creds of the dfd matches current.
4895	* This ensures that not everyone will be able to create
4896	* a hardlink using the passed file descriptor.
4897	*/
4898	if (flags & AT_EMPTY_PATH)
4899	how \|= LOOKUP_LINKAT_EMPTY;
4900
4901	if (flags & AT_SYMLINK_FOLLOW)
4902	how \|= LOOKUP_FOLLOW;
4903	retry:
4904	error = filename_lookup(dfd: olddfd, name: old, flags: how, path: &old_path, NULL);
4905	if (error)
4906	goto out_putnames;
4907
4908	new_dentry = filename_create(dfd: newdfd, name: new, path: &new_path,
4909	lookup_flags: (how & LOOKUP_REVAL));
4910	error = PTR_ERR(ptr: new_dentry);
4911	if (IS_ERR(ptr: new_dentry))
4912	goto out_putpath;
4913
4914	error = -EXDEV;
4915	if (old_path.mnt != new_path.mnt)
4916	goto out_dput;
4917	idmap = mnt_idmap(mnt: new_path.mnt);
4918	error = may_linkat(idmap, link: &old_path);
4919	if (unlikely(error))
4920	goto out_dput;
4921	error = security_path_link(old_dentry: old_path.dentry, new_dir: &new_path, new_dentry);
4922	if (error)
4923	goto out_dput;
4924	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
4925	new_dentry, &delegated_inode);
4926	out_dput:
4927	done_path_create(&new_path, new_dentry);
4928	if (delegated_inode) {
4929	error = break_deleg_wait(delegated_inode: &delegated_inode);
4930	if (!error) {
4931	path_put(&old_path);
4932	goto retry;
4933	}
4934	}
4935	if (retry_estale(error, flags: how)) {
4936	path_put(&old_path);
4937	how \|= LOOKUP_REVAL;
4938	goto retry;
4939	}
4940	out_putpath:
4941	path_put(&old_path);
4942	out_putnames:
4943	putname(old);
4944	putname(new);
4945
4946	return error;
4947	}
4948
4949	SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
4950	int, newdfd, const char __user , newname, int*, flags)
4951	{
4952	return do_linkat(olddfd, old: getname_uflags(filename: oldname, uflags: flags),
4953	newdfd, new: getname(name: newname), flags);
4954	}
4955
4956	SYSCALL_DEFINE2(link, const char __user , oldname, const* char __user *, newname)
4957	{
4958	return do_linkat(AT_FDCWD, old: getname(name: oldname), AT_FDCWD, new: getname(name: newname), flags: `0`);
4959	}
4960
4961	/**
4962	* vfs_rename - rename a filesystem object
4963	* @rd: pointer to &struct renamedata info
4964	*
4965	* The caller must hold multiple mutexes--see lock_rename().
4966	*
4967	* If vfs_rename discovers a delegation in need of breaking at either
4968	* the source or destination, it will return -EWOULDBLOCK and return a
4969	* reference to the inode in delegated_inode. The caller should then
4970	* break the delegation and retry. Because breaking a delegation may
4971	* take a long time, the caller should drop all locks before doing
4972	* so.
4973	*
4974	* Alternatively, a caller may pass NULL for delegated_inode. This may
4975	* be appropriate for callers that expect the underlying filesystem not
4976	* to be NFS exported.
4977	*
4978	* The worst of all namespace operations - renaming directory. "Perverted"
4979	* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4980	* Problems:
4981	*
4982	* a) we can get into loop creation.
4983	* b) race potential - two innocent renames can create a loop together.
4984	* That's where 4.4BSD screws up. Current fix: serialization on
4985	* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4986	* story.
4987	* c) we may have to lock up to _four_ objects - parents and victim (if it exists),
4988	* and source (if it's a non-directory or a subdirectory that moves to
4989	* different parent).
4990	* And that - after we got ->i_mutex on parents (until then we don't know
4991	* whether the target exists). Solution: try to be smart with locking
4992	* order for inodes. We rely on the fact that tree topology may change
4993	* only under ->s_vfs_rename_mutex _and_ that parent of the object we
4994	* move will be locked. Thus we can rank directories by the tree
4995	* (ancestors first) and rank all non-directories after them.
4996	* That works since everybody except rename does "lock parent, lookup,
4997	* lock child" and rename is under ->s_vfs_rename_mutex.
4998	* HOWEVER, it relies on the assumption that any object with ->lookup()
4999	* has no more than 1 dentry. If "hybrid" objects will ever appear,
5000	* we'd better make sure that there's no link(2) for them.
5001	* d) conversion from fhandle to dentry may come in the wrong moment - when
5002	* we are removing the target. Solution: we will have to grab ->i_mutex
5003	* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
5004	* ->i_mutex on parents, which works but leads to some truly excessive
5005	* locking].
5006	*/
5007	int vfs_rename(struct renamedata *rd)
5008	{
5009	int error;
5010	struct inode old_dir = rd->old_dir, new_dir = rd->new_dir;
5011	struct dentry *old_dentry = rd->old_dentry;
5012	struct dentry *new_dentry = rd->new_dentry;
5013	struct inode **delegated_inode = rd->delegated_inode;
5014	unsigned int flags = rd->flags;
5015	bool is_dir = d_is_dir(dentry: old_dentry);
5016	struct inode *source = old_dentry->d_inode;
5017	struct inode *target = new_dentry->d_inode;
5018	bool new_is_dir = false;
5019	unsigned max_links = new_dir->i_sb->s_max_links;
5020	struct name_snapshot old_name;
5021	bool lock_old_subdir, lock_new_subdir;
5022
5023	if (source == target)
5024	return `0`;
5025
5026	error = may_delete(idmap: rd->old_mnt_idmap, dir: old_dir, victim: old_dentry, isdir: is_dir);
5027	if (error)
5028	return error;
5029
5030	if (!target) {
5031	error = may_create(idmap: rd->new_mnt_idmap, dir: new_dir, child: new_dentry);
5032	} else {
5033	new_is_dir = d_is_dir(dentry: new_dentry);
5034
5035	if (!(flags & RENAME_EXCHANGE))
5036	error = may_delete(idmap: rd->new_mnt_idmap, dir: new_dir,
5037	victim: new_dentry, isdir: is_dir);
5038	else
5039	error = may_delete(idmap: rd->new_mnt_idmap, dir: new_dir,
5040	victim: new_dentry, isdir: new_is_dir);
5041	}
5042	if (error)
5043	return error;
5044
5045	if (!old_dir->i_op->rename)
5046	return -EPERM;
5047
5048	/*
5049	* If we are going to change the parent - check write permissions,
5050	* we'll need to flip '..'.
5051	*/
5052	if (new_dir != old_dir) {
5053	if (is_dir) {
5054	error = inode_permission(rd->old_mnt_idmap, source,
5055	MAY_WRITE);
5056	if (error)
5057	return error;
5058	}
5059	if ((flags & RENAME_EXCHANGE) && new_is_dir) {
5060	error = inode_permission(rd->new_mnt_idmap, target,
5061	MAY_WRITE);
5062	if (error)
5063	return error;
5064	}
5065	}
5066
5067	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
5068	flags);
5069	if (error)
5070	return error;
5071
5072	take_dentry_name_snapshot(&old_name, old_dentry);
5073	dget(dentry: new_dentry);
5074	/*
5075	* Lock children.
5076	* The source subdirectory needs to be locked on cross-directory
5077	* rename or cross-directory exchange since its parent changes.
5078	* The target subdirectory needs to be locked on cross-directory
5079	* exchange due to parent change and on any rename due to becoming
5080	* a victim.
5081	* Non-directories need locking in all cases (for NFS reasons);
5082	* they get locked after any subdirectories (in inode address order).
5083	*
5084	* NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
5085	* NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
5086	*/
5087	lock_old_subdir = new_dir != old_dir;
5088	lock_new_subdir = new_dir != old_dir \|\| !(flags & RENAME_EXCHANGE);
5089	if (is_dir) {
5090	if (lock_old_subdir)
5091	inode_lock_nested(inode: source, subclass: I_MUTEX_CHILD);
5092	if (target && (!new_is_dir \|\| lock_new_subdir))
5093	inode_lock(inode: target);
5094	} else if (new_is_dir) {
5095	if (lock_new_subdir)
5096	inode_lock_nested(inode: target, subclass: I_MUTEX_CHILD);
5097	inode_lock(inode: source);
5098	} else {
5099	lock_two_nondirectories(source, target);
5100	}
5101
5102	error = -EPERM;
5103	if (IS_SWAPFILE(source) \|\| (target && IS_SWAPFILE(target)))
5104	goto out;
5105
5106	error = -EBUSY;
5107	if (is_local_mountpoint(dentry: old_dentry) \|\| is_local_mountpoint(dentry: new_dentry))
5108	goto out;
5109
5110	if (max_links && new_dir != old_dir) {
5111	error = -EMLINK;
5112	if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
5113	goto out;
5114	if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
5115	old_dir->i_nlink >= max_links)
5116	goto out;
5117	}
5118	if (!is_dir) {
5119	error = try_break_deleg(inode: source, delegated_inode);
5120	if (error)
5121	goto out;
5122	}
5123	if (target && !new_is_dir) {
5124	error = try_break_deleg(inode: target, delegated_inode);
5125	if (error)
5126	goto out;
5127	}
5128	error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
5129	new_dir, new_dentry, flags);
5130	if (error)
5131	goto out;
5132
5133	if (!(flags & RENAME_EXCHANGE) && target) {
5134	if (is_dir) {
5135	shrink_dcache_parent(new_dentry);
5136	target->i_flags \|= S_DEAD;
5137	}
5138	dont_mount(dentry: new_dentry);
5139	detach_mounts(dentry: new_dentry);
5140	}
5141	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
5142	if (!(flags & RENAME_EXCHANGE))
5143	d_move(old_dentry, new_dentry);
5144	else
5145	d_exchange(old_dentry, new_dentry);
5146	}
5147	out:
5148	if (!is_dir \|\| lock_old_subdir)
5149	inode_unlock(inode: source);
5150	if (target && (!new_is_dir \|\| lock_new_subdir))
5151	inode_unlock(inode: target);
5152	dput(new_dentry);
5153	if (!error) {
5154	fsnotify_move(old_dir, new_dir, old_name: &old_name.name, isdir: is_dir,
5155	target: !(flags & RENAME_EXCHANGE) ? target : NULL, moved: old_dentry);
5156	if (flags & RENAME_EXCHANGE) {
5157	fsnotify_move(old_dir: new_dir, new_dir: old_dir, old_name: &old_dentry->d_name,
5158	isdir: new_is_dir, NULL, moved: new_dentry);
5159	}
5160	}
5161	release_dentry_name_snapshot(&old_name);
5162
5163	return error;
5164	}
5165	EXPORT_SYMBOL(vfs_rename);
5166
5167	int do_renameat2(int olddfd, struct filename from, int* newdfd,
5168	struct filename to, unsigned* int flags)
5169	{
5170	struct renamedata rd;
5171	struct dentry old_dentry, new_dentry;
5172	struct dentry *trap;
5173	struct path old_path, new_path;
5174	struct qstr old_last, new_last;
5175	int old_type, new_type;
5176	struct inode *delegated_inode = NULL;
5177	unsigned int lookup_flags = `0`, target_flags =
5178	LOOKUP_RENAME_TARGET \| LOOKUP_CREATE;
5179	bool should_retry = false;
5180	int error = -EINVAL;
5181
5182	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
5183	goto put_names;
5184
5185	if ((flags & (RENAME_NOREPLACE \| RENAME_WHITEOUT)) &&
5186	(flags & RENAME_EXCHANGE))
5187	goto put_names;
5188
5189	if (flags & RENAME_EXCHANGE)
5190	target_flags = `0`;
5191	if (flags & RENAME_NOREPLACE)
5192	target_flags \|= LOOKUP_EXCL;
5193
5194	retry:
5195	error = filename_parentat(dfd: olddfd, name: from, flags: lookup_flags, parent: &old_path,
5196	last: &old_last, type: &old_type);
5197	if (error)
5198	goto put_names;
5199
5200	error = filename_parentat(dfd: newdfd, name: to, flags: lookup_flags, parent: &new_path, last: &new_last,
5201	type: &new_type);
5202	if (error)
5203	goto exit1;
5204
5205	error = -EXDEV;
5206	if (old_path.mnt != new_path.mnt)
5207	goto exit2;
5208
5209	error = -EBUSY;
5210	if (old_type != LAST_NORM)
5211	goto exit2;
5212
5213	if (flags & RENAME_NOREPLACE)
5214	error = -EEXIST;
5215	if (new_type != LAST_NORM)
5216	goto exit2;
5217
5218	error = mnt_want_write(mnt: old_path.mnt);
5219	if (error)
5220	goto exit2;
5221
5222	retry_deleg:
5223	trap = lock_rename(new_path.dentry, old_path.dentry);
5224	if (IS_ERR(ptr: trap)) {
5225	error = PTR_ERR(ptr: trap);
5226	goto exit_lock_rename;
5227	}
5228
5229	old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
5230	lookup_flags);
5231	error = PTR_ERR(ptr: old_dentry);
5232	if (IS_ERR(ptr: old_dentry))
5233	goto exit3;
5234	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
5235	lookup_flags \| target_flags);
5236	error = PTR_ERR(ptr: new_dentry);
5237	if (IS_ERR(ptr: new_dentry))
5238	goto exit4;
5239	if (flags & RENAME_EXCHANGE) {
5240	if (!d_is_dir(dentry: new_dentry)) {
5241	error = -ENOTDIR;
5242	if (new_last.name[new_last.len])
5243	goto exit5;
5244	}
5245	}
5246	/ unless the source is a directory trailing slashes give -ENOTDIR /
5247	if (!d_is_dir(dentry: old_dentry)) {
5248	error = -ENOTDIR;
5249	if (old_last.name[old_last.len])
5250	goto exit5;
5251	if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
5252	goto exit5;
5253	}
5254	/ source should not be ancestor of target /
5255	error = -EINVAL;
5256	if (old_dentry == trap)
5257	goto exit5;
5258	/ target should not be an ancestor of source /
5259	if (!(flags & RENAME_EXCHANGE))
5260	error = -ENOTEMPTY;
5261	if (new_dentry == trap)
5262	goto exit5;
5263
5264	error = security_path_rename(old_dir: &old_path, old_dentry,
5265	new_dir: &new_path, new_dentry, flags);
5266	if (error)
5267	goto exit5;
5268
5269	rd.old_dir = old_path.dentry->d_inode;
5270	rd.old_dentry = old_dentry;
5271	rd.old_mnt_idmap = mnt_idmap(mnt: old_path.mnt);
5272	rd.new_dir = new_path.dentry->d_inode;
5273	rd.new_dentry = new_dentry;
5274	rd.new_mnt_idmap = mnt_idmap(mnt: new_path.mnt);
5275	rd.delegated_inode = &delegated_inode;
5276	rd.flags = flags;
5277	error = vfs_rename(&rd);
5278	exit5:
5279	dput(new_dentry);
5280	exit4:
5281	dput(old_dentry);
5282	exit3:
5283	unlock_rename(new_path.dentry, old_path.dentry);
5284	exit_lock_rename:
5285	if (delegated_inode) {
5286	error = break_deleg_wait(delegated_inode: &delegated_inode);
5287	if (!error)
5288	goto retry_deleg;
5289	}
5290	mnt_drop_write(mnt: old_path.mnt);
5291	exit2:
5292	if (retry_estale(error, flags: lookup_flags))
5293	should_retry = true;
5294	path_put(&new_path);
5295	exit1:
5296	path_put(&old_path);
5297	if (should_retry) {
5298	should_retry = false;
5299	lookup_flags \|= LOOKUP_REVAL;
5300	goto retry;
5301	}
5302	put_names:
5303	putname(from);
5304	putname(to);
5305	return error;
5306	}
5307
5308	SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
5309	int, newdfd, const char __user , newname, unsigned* int, flags)
5310	{
5311	return do_renameat2(olddfd, from: getname(name: oldname), newdfd, to: getname(name: newname),
5312	flags);
5313	}
5314
5315	SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
5316	int, newdfd, const char __user *, newname)
5317	{
5318	return do_renameat2(olddfd, from: getname(name: oldname), newdfd, to: getname(name: newname),
5319	flags: `0`);
5320	}
5321
5322	SYSCALL_DEFINE2(rename, const char __user , oldname, const* char __user *, newname)
5323	{
5324	return do_renameat2(AT_FDCWD, from: getname(name: oldname), AT_FDCWD,
5325	to: getname(name: newname), flags: `0`);
5326	}
5327
5328	int readlink_copy(char __user buffer, int* buflen, const char link, int* linklen)
5329	{
5330	int copylen;
5331
5332	copylen = linklen;
5333	if (unlikely(copylen > (unsigned) buflen))
5334	copylen = buflen;
5335	if (copy_to_user(to: buffer, from: link, n: copylen))
5336	copylen = -EFAULT;
5337	return copylen;
5338	}
5339
5340	/**
5341	* vfs_readlink - copy symlink body into userspace buffer
5342	* @dentry: dentry on which to get symbolic link
5343	* @buffer: user memory pointer
5344	* @buflen: size of buffer
5345	*
5346	* Does not touch atime. That's up to the caller if necessary
5347	*
5348	* Does not call security hook.
5349	*/
5350	int vfs_readlink(struct dentry dentry, char* __user buffer, int* buflen)
5351	{
5352	struct inode *inode = d_inode(dentry);
5353	DEFINE_DELAYED_CALL(done);
5354	const char *link;
5355	int res;
5356
5357	if (inode->i_opflags & IOP_CACHED_LINK)
5358	return readlink_copy(buffer, buflen, link: inode->i_link, linklen: inode->i_linklen);
5359
5360	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
5361	if (unlikely(inode->i_op->readlink))
5362	return inode->i_op->readlink(dentry, buffer, buflen);
5363
5364	if (!d_is_symlink(dentry))
5365	return -EINVAL;
5366
5367	spin_lock(lock: &inode->i_lock);
5368	inode->i_opflags \|= IOP_DEFAULT_READLINK;
5369	spin_unlock(lock: &inode->i_lock);
5370	}
5371
5372	link = READ_ONCE(inode->i_link);
5373	if (!link) {
5374	link = inode->i_op->get_link(dentry, inode, &done);
5375	if (IS_ERR(ptr: link))
5376	return PTR_ERR(ptr: link);
5377	}
5378	res = readlink_copy(buffer, buflen, link, strlen(link));
5379	do_delayed_call(call: &done);
5380	return res;
5381	}
5382	EXPORT_SYMBOL(vfs_readlink);
5383
5384	/**
5385	* vfs_get_link - get symlink body
5386	* @dentry: dentry on which to get symbolic link
5387	* @done: caller needs to free returned data with this
5388	*
5389	* Calls security hook and i_op->get_link() on the supplied inode.
5390	*
5391	* It does not touch atime. That's up to the caller if necessary.
5392	*
5393	* Does not work on "special" symlinks like /proc/$$/fd/N
5394	*/
5395	const char vfs_get_link(struct* dentry dentry, struct* delayed_call *done)
5396	{
5397	const char *res = ERR_PTR(error: -EINVAL);
5398	struct inode *inode = d_inode(dentry);
5399
5400	if (d_is_symlink(dentry)) {
5401	res = ERR_PTR(error: security_inode_readlink(dentry));
5402	if (!res)
5403	res = inode->i_op->get_link(dentry, inode, done);
5404	}
5405	return res;
5406	}
5407	EXPORT_SYMBOL(vfs_get_link);
5408
5409	/ get the link contents into pagecache /
5410	static char __page_get_link(struct* dentry dentry, struct* inode *inode,
5411	struct delayed_call *callback)
5412	{
5413	struct folio *folio;
5414	struct address_space *mapping = inode->i_mapping;
5415
5416	if (!dentry) {
5417	folio = filemap_get_folio(mapping, index: `0`);
5418	if (IS_ERR(ptr: folio))
5419	return ERR_PTR(error: -ECHILD);
5420	if (!folio_test_uptodate(folio)) {
5421	folio_put(folio);
5422	return ERR_PTR(error: -ECHILD);
5423	}
5424	} else {
5425	folio = read_mapping_folio(mapping, index: `0`, NULL);
5426	if (IS_ERR(ptr: folio))
5427	return ERR_CAST(ptr: folio);
5428	}
5429	set_delayed_call(call: callback, fn: page_put_link, arg: folio);
5430	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
5431	return folio_address(folio);
5432	}
5433
/*
 * page_get_link_raw - exported wrapper around __page_get_link().
 *
 * Returns the symlink data as found in the page cache, without the
 * NUL-termination step that page_get_link() applies.
 */
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
			      struct delayed_call *callback)
{
	return __page_get_link(dentry, inode, callback);
}
EXPORT_SYMBOL_GPL(page_get_link_raw);
5440
5441	/**
5442	* page_get_link() - An implementation of the get_link inode_operation.
5443	* @dentry: The directory entry which is the symlink.
5444	* @inode: The inode for the symlink.
5445	* @callback: Used to drop the reference to the symlink.
5446	*
5447	* Filesystems which store their symlinks in the page cache should use
5448	* this to implement the get_link() member of their inode_operations.
5449	*
5450	* Return: A pointer to the NUL-terminated symlink.
5451	*/
5452	const char page_get_link(struct* dentry dentry, struct* inode *inode,
5453	struct delayed_call *callback)
5454	{
5455	char *kaddr = __page_get_link(dentry, inode, callback);
5456
5457	if (!IS_ERR(ptr: kaddr))
5458	nd_terminate_link(name: kaddr, len: inode->i_size, PAGE_SIZE - `1`);
5459	return kaddr;
5460	}
5461	EXPORT_SYMBOL(page_get_link);
5462
/**
 * page_put_link() - Drop the reference to the symlink.
 * @arg: The folio which contains the symlink.
 *
 * This is used internally by page_get_link().  It is exported for use
 * by filesystems which need to implement a variant of page_get_link()
 * themselves.  Despite the apparent symmetry, filesystems which use
 * page_get_link() do not need to call page_put_link().
 *
 * The argument, while it has a void pointer type, must be a pointer to
 * the folio which was retrieved from the page cache.  The delayed_call
 * infrastructure is used to drop the reference count once the caller
 * is done with the symlink.
 */
void page_put_link(void *arg)
{
	folio_put(arg);
}
EXPORT_SYMBOL(page_put_link);
5482
5483	int page_readlink(struct dentry dentry, char* __user buffer, int* buflen)
5484	{
5485	const char *link;
5486	int res;
5487
5488	DEFINE_DELAYED_CALL(done);
5489	link = page_get_link(dentry, d_inode(dentry), &done);
5490	res = PTR_ERR(ptr: link);
5491	if (!IS_ERR(ptr: link))
5492	res = readlink_copy(buffer, buflen, link, strlen(link));
5493	do_delayed_call(call: &done);
5494	return res;
5495	}
5496	EXPORT_SYMBOL(page_readlink);
5497
5498	int page_symlink(struct inode inode, const* char symname, int* len)
5499	{
5500	struct address_space *mapping = inode->i_mapping;
5501	const struct address_space_operations *aops = mapping->a_ops;
5502	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
5503	struct folio *folio;
5504	void *fsdata = NULL;
5505	int err;
5506	unsigned int flags;
5507
5508	retry:
5509	if (nofs)
5510	flags = memalloc_nofs_save();
5511	err = aops->write_begin(NULL, mapping, `0`, len-`1`, &folio, &fsdata);
5512	if (nofs)
5513	memalloc_nofs_restore(flags);
5514	if (err)
5515	goto fail;
5516
5517	memcpy(folio_address(folio), symname, len - `1`);
5518
5519	err = aops->write_end(NULL, mapping, `0`, len - `1`, len - `1`,
5520	folio, fsdata);
5521	if (err < `0`)
5522	goto fail;
5523	if (err < len-`1`)
5524	goto retry;
5525
5526	mark_inode_dirty(inode);
5527	return `0`;
5528	fail:
5529	return err;
5530	}
5531	EXPORT_SYMBOL(page_symlink);
5532
5533	const struct inode_operations page_symlink_inode_operations = {
5534	.get_link = page_get_link,
5535	};
5536	EXPORT_SYMBOL(page_symlink_inode_operations);
5537

Provided by KDAB

Definitions

initname
getname_flags
getname_uflags
__getname_maybe_null
getname_kernel
putname
check_acl
no_acl_inode
acl_permission_check
generic_permission
do_inode_permission
sb_permission
inode_permission
path_get
path_put
nameidata
saved
__set_nameidata
set_nameidata
restore_nameidata
nd_alloc_stack
path_connected
drop_links
leave_rcu
terminate_walk
__legitimize_path
legitimize_path
legitimize_links
legitimize_root
try_to_unlazy
try_to_unlazy_next
d_revalidate
complete_walk
set_root
nd_jump_root
nd_jump_link
put_link
sysctl_protected_symlinks
sysctl_protected_hardlinks
sysctl_protected_fifos
sysctl_protected_regular
namei_sysctls
init_fs_namei_sysctls
may_follow_link
safe_hardlink_source
may_linkat
may_create_in_sticky
follow_up
choose_mountpoint_rcu
choose_mountpoint
follow_automount
__traverse_mounts
traverse_mounts
follow_down_one
follow_down
__follow_mount_rcu
handle_mounts
lookup_dcache
lookup_one_qstr_excl_raw
lookup_one_qstr_excl
lookup_fast
__lookup_slow
lookup_slow
may_lookup
reserve_stack
pick_link
step_into
follow_dotdot_rcu
follow_dotdot
handle_dots
walk_component
fold_hash
full_name_hash
hashlen_string
hash_name
link_path_walk
path_init
lookup_last
handle_lookup_down
path_lookupat
filename_lookup
path_parentat
__filename_parentat
filename_parentat
__kern_path_locked
kern_path_locked_negative
kern_path_locked
user_path_locked_at
kern_path
vfs_path_parent_lookup
vfs_path_lookup
lookup_noperm_common
lookup_one_common
try_lookup_noperm
lookup_noperm
lookup_one
lookup_one_unlocked
lookup_one_positive_unlocked
lookup_noperm_unlocked
lookup_noperm_positive_unlocked
path_pts
user_path_at
__check_sticky
may_delete
may_create
lock_two_directories
lock_rename
lock_rename_child
unlock_rename
vfs_prepare_mode
vfs_create
vfs_mkobj
may_open_dev
may_open
handle_truncate
open_to_namei_flags
may_o_create
atomic_open
lookup_open
trailing_slashes
lookup_fast_for_open
open_last_lookups
do_open
vfs_tmpfile
kernel_tmpfile_open
do_tmpfile
do_o_path
path_openat
do_filp_open
do_file_open_root
filename_create
kern_path_create
done_path_create
user_path_create
vfs_mknod
may_mknod
do_mknodat
vfs_mkdir
do_mkdirat
vfs_rmdir
do_rmdir
vfs_unlink
do_unlinkat
vfs_symlink
do_symlinkat
vfs_link
do_linkat
vfs_rename
do_renameat2
readlink_copy
vfs_readlink
vfs_get_link
__page_get_link
page_get_link_raw
page_get_link
page_put_link
page_readlink
page_symlink

Improve your Profiling and Debugging skills

Find out more

Definitions

source code of linux/fs/namei.c