namespace.c source code [linux/fs/namespace.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/namespace.c
4	*
5	* (C) Copyright Al Viro 2000, 2001
6	*
7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
8	* Heavily rewritten.
9	*/
10
11	#include <linux/syscalls.h>
12	#include <linux/export.h>
13	#include <linux/capability.h>
14	#include <linux/mnt_namespace.h>
15	#include <linux/user_namespace.h>
16	#include <linux/namei.h>
17	#include <linux/security.h>
18	#include <linux/cred.h>
19	#include <linux/idr.h>
20	#include <linux/init.h> /* init_rootfs */
21	#include <linux/fs_struct.h> /* get_fs_root et.al. */
22	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23	#include <linux/file.h>
24	#include <linux/uaccess.h>
25	#include <linux/proc_ns.h>
26	#include <linux/magic.h>
27	#include <linux/memblock.h>
28	#include <linux/proc_fs.h>
29	#include <linux/task_work.h>
30	#include <linux/sched/task.h>
31	#include <uapi/linux/mount.h>
32	#include <linux/fs_context.h>
33	#include <linux/shmem_fs.h>
34	#include <linux/mnt_idmapping.h>
35	#include <linux/pidfs.h>
36
37	#include "pnode.h"
38	#include "internal.h"
39
40	/ Maximum number of mounts in a mount namespace /
41	static unsigned int sysctl_mount_max __read_mostly = `100000`;
42
43	static unsigned int m_hash_mask __ro_after_init;
44	static unsigned int m_hash_shift __ro_after_init;
45	static unsigned int mp_hash_mask __ro_after_init;
46	static unsigned int mp_hash_shift __ro_after_init;
47
48	static __initdata unsigned long mhash_entries;
49	static int __init set_mhash_entries(char *str)
50	{
51	if (!str)
52	return `0`;
53	mhash_entries = simple_strtoul(str, &str, `0`);
54	return `1`;
55	}
56	__setup("mhash_entries=", set_mhash_entries);
57
58	static __initdata unsigned long mphash_entries;
59	static int __init set_mphash_entries(char *str)
60	{
61	if (!str)
62	return `0`;
63	mphash_entries = simple_strtoul(str, &str, `0`);
64	return `1`;
65	}
66	__setup("mphash_entries=", set_mphash_entries);
67
68	static u64 event;
69	static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
70	static DEFINE_IDA(mnt_group_ida);
71
72	/ Don't allow confusion with old 32bit mount ID /
73	#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
74	static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
75
76	static struct hlist_head *mount_hashtable __ro_after_init;
77	static struct hlist_head *mountpoint_hashtable __ro_after_init;
78	static struct kmem_cache *mnt_cache __ro_after_init;
79	static DECLARE_RWSEM(namespace_sem);
80	static HLIST_HEAD(unmounted); / protected by namespace_sem /
81	static LIST_HEAD(ex_mountpoints); / protected by namespace_sem /
82	static DEFINE_SEQLOCK(mnt_ns_tree_lock);
83
84	#ifdef CONFIG_FSNOTIFY
85	LIST_HEAD(notify_list); / protected by namespace_sem /
86	#endif
87	static struct rb_root mnt_ns_tree = RB_ROOT; / protected by mnt_ns_tree_lock /
88	static LIST_HEAD(mnt_ns_list); / protected by mnt_ns_tree_lock /
89
/* Bit flags stored in mount_kattr::kflags. */
enum mount_kattr_flags_t {
	MOUNT_KATTR_RECURSE		= (1 << 0),
	MOUNT_KATTR_IDMAP_REPLACE	= (1 << 1),
};
94
95	struct mount_kattr {
96	unsigned int attr_set;
97	unsigned int attr_clr;
98	unsigned int propagation;
99	unsigned int lookup_flags;
100	enum mount_kattr_flags_t kflags;
101	struct user_namespace *mnt_userns;
102	struct mnt_idmap *mnt_idmap;
103	};
104
105	/ /sys/fs /
106	struct kobject *fs_kobj __ro_after_init;
107	EXPORT_SYMBOL_GPL(fs_kobj);
108
109	/*
110	* vfsmount lock may be taken for read to prevent changes to the
111	* vfsmount hash, ie. during mountpoint lookups or walking back
112	* up the tree.
113	*
114	* It should be taken for write in all cases where the vfsmount
115	* tree or hash is modified or when a vfsmount structure is modified.
116	*/
117	__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
118
119	static inline struct mnt_namespace node_to_mnt_ns(const* struct rb_node *node)
120	{
121	if (!node)
122	return NULL;
123	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
124	}
125
126	static int mnt_ns_cmp(struct rb_node a, const* struct rb_node *b)
127	{
128	struct mnt_namespace *ns_a = node_to_mnt_ns(node: a);
129	struct mnt_namespace *ns_b = node_to_mnt_ns(node: b);
130	u64 seq_a = ns_a->seq;
131	u64 seq_b = ns_b->seq;
132
133	if (seq_a < seq_b)
134	return -`1`;
135	if (seq_a > seq_b)
136	return `1`;
137	return `0`;
138	}
139
140	static inline void mnt_ns_tree_write_lock(void)
141	{
142	write_seqlock(sl: &mnt_ns_tree_lock);
143	}
144
145	static inline void mnt_ns_tree_write_unlock(void)
146	{
147	write_sequnlock(sl: &mnt_ns_tree_lock);
148	}
149
150	static void mnt_ns_tree_add(struct mnt_namespace *ns)
151	{
152	struct rb_node node, prev;
153
154	mnt_ns_tree_write_lock();
155	node = rb_find_add_rcu(node: &ns->mnt_ns_tree_node, tree: &mnt_ns_tree, cmp: mnt_ns_cmp);
156	/*
157	* If there's no previous entry simply add it after the
158	* head and if there is add it after the previous entry.
159	*/
160	prev = rb_prev(&ns->mnt_ns_tree_node);
161	if (!prev)
162	list_add_rcu(new: &ns->mnt_ns_list, head: &mnt_ns_list);
163	else
164	list_add_rcu(new: &ns->mnt_ns_list, head: &node_to_mnt_ns(node: prev)->mnt_ns_list);
165	mnt_ns_tree_write_unlock();
166
167	WARN_ON_ONCE(node);
168	}
169
170	static void mnt_ns_release(struct mnt_namespace *ns)
171	{
172	/ keep alive for {list,stat}mount() /
173	if (refcount_dec_and_test(r: &ns->passive)) {
174	fsnotify_mntns_delete(mntns: ns);
175	put_user_ns(ns: ns->user_ns);
176	kfree(objp: ns);
177	}
178	}
179	DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
180
181	static void mnt_ns_release_rcu(struct rcu_head *rcu)
182	{
183	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
184	}
185
186	static void mnt_ns_tree_remove(struct mnt_namespace *ns)
187	{
188	/ remove from global mount namespace list /
189	if (!is_anon_ns(ns)) {
190	mnt_ns_tree_write_lock();
191	rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
192	list_bidir_del_rcu(entry: &ns->mnt_ns_list);
193	mnt_ns_tree_write_unlock();
194	}
195
196	call_rcu(head: &ns->mnt_ns_rcu, func: mnt_ns_release_rcu);
197	}
198
199	static int mnt_ns_find(const void key, const* struct rb_node *node)
200	{
201	const u64 mnt_ns_id = (u64 )key;
202	const struct mnt_namespace *ns = node_to_mnt_ns(node);
203
204	if (mnt_ns_id < ns->seq)
205	return -`1`;
206	if (mnt_ns_id > ns->seq)
207	return `1`;
208	return `0`;
209	}
210
211	/*
212	* Lookup a mount namespace by id and take a passive reference count. Taking a
213	* passive reference means the mount namespace can be emptied if e.g., the last
214	* task holding an active reference exits. To access the mounts of the
215	* namespace the @namespace_sem must first be acquired. If the namespace has
216	* already shut down before acquiring @namespace_sem, {list,stat}mount() will
217	* see that the mount rbtree of the namespace is empty.
218	*
219	* Note the lookup is lockless protected by a sequence counter. We only
220	* need to guard against false negatives as false positives aren't
221	* possible. So if we didn't find a mount namespace and the sequence
222	* counter has changed we need to retry. If the sequence counter is
223	* still the same we know the search actually failed.
224	*/
225	static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
226	{
227	struct mnt_namespace *ns;
228	struct rb_node *node;
229	unsigned int seq;
230
231	guard(rcu)();
232	do {
233	seq = read_seqbegin(sl: &mnt_ns_tree_lock);
234	node = rb_find_rcu(key: &mnt_ns_id, tree: &mnt_ns_tree, cmp: mnt_ns_find);
235	if (node)
236	break;
237	} while (read_seqretry(sl: &mnt_ns_tree_lock, start: seq));
238
239	if (!node)
240	return NULL;
241
242	/*
243	* The last reference count is put with RCU delay so we can
244	* unconditonally acquire a reference here.
245	*/
246	ns = node_to_mnt_ns(node);
247	refcount_inc(r: &ns->passive);
248	return ns;
249	}
250
251	static inline void lock_mount_hash(void)
252	{
253	write_seqlock(sl: &mount_lock);
254	}
255
256	static inline void unlock_mount_hash(void)
257	{
258	write_sequnlock(sl: &mount_lock);
259	}
260
261	static inline struct hlist_head m_hash(struct* vfsmount mnt, struct* dentry *dentry)
262	{
263	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
264	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
265	tmp = tmp + (tmp >> m_hash_shift);
266	return &mount_hashtable[tmp & m_hash_mask];
267	}
268
269	static inline struct hlist_head mp_hash(struct* dentry *dentry)
270	{
271	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
272	tmp = tmp + (tmp >> mp_hash_shift);
273	return &mountpoint_hashtable[tmp & mp_hash_mask];
274	}
275
276	static int mnt_alloc_id(struct mount *mnt)
277	{
278	int res;
279
280	xa_lock(&mnt_id_xa);
281	res = __xa_alloc(&mnt_id_xa, id: &mnt->mnt_id, entry: mnt, XA_LIMIT(`1`, INT_MAX), GFP_KERNEL);
282	if (!res)
283	mnt->mnt_id_unique = ++mnt_id_ctr;
284	xa_unlock(&mnt_id_xa);
285	return res;
286	}
287
288	static void mnt_free_id(struct mount *mnt)
289	{
290	xa_erase(&mnt_id_xa, index: mnt->mnt_id);
291	}
292
293	/*
294	* Allocate a new peer group ID
295	*/
296	static int mnt_alloc_group_id(struct mount *mnt)
297	{
298	int res = ida_alloc_min(ida: &mnt_group_ida, min: `1`, GFP_KERNEL);
299
300	if (res < `0`)
301	return res;
302	mnt->mnt_group_id = res;
303	return `0`;
304	}
305
306	/*
307	* Release a peer group ID
308	*/
309	void mnt_release_group_id(struct mount *mnt)
310	{
311	ida_free(&mnt_group_ida, id: mnt->mnt_group_id);
312	mnt->mnt_group_id = `0`;
313	}
314
315	/*
316	* vfsmount lock must be held for read
317	*/
318	static inline void mnt_add_count(struct mount mnt, int* n)
319	{
320	#ifdef CONFIG_SMP
321	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
322	#else
323	preempt_disable();
324	mnt->mnt_count += n;
325	preempt_enable();
326	#endif
327	}
328
329	/*
330	* vfsmount lock must be held for write
331	*/
332	int mnt_get_count(struct mount *mnt)
333	{
334	#ifdef CONFIG_SMP
335	int count = `0`;
336	int cpu;
337
338	for_each_possible_cpu(cpu) {
339	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
340	}
341
342	return count;
343	#else
344	return mnt->mnt_count;
345	#endif
346	}
347
348	static struct mount alloc_vfsmnt(const* char *name)
349	{
350	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
351	if (mnt) {
352	int err;
353
354	err = mnt_alloc_id(mnt);
355	if (err)
356	goto out_free_cache;
357
358	if (name)
359	mnt->mnt_devname = kstrdup_const(s: name,
360	GFP_KERNEL_ACCOUNT);
361	else
362	mnt->mnt_devname = "none";
363	if (!mnt->mnt_devname)
364	goto out_free_id;
365
366	#ifdef CONFIG_SMP
367	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
368	if (!mnt->mnt_pcp)
369	goto out_free_devname;
370
371	this_cpu_add(mnt->mnt_pcp->mnt_count, `1`);
372	#else
373	mnt->mnt_count = `1`;
374	mnt->mnt_writers = `0`;
375	#endif
376
377	INIT_HLIST_NODE(h: &mnt->mnt_hash);
378	INIT_LIST_HEAD(list: &mnt->mnt_child);
379	INIT_LIST_HEAD(list: &mnt->mnt_mounts);
380	INIT_LIST_HEAD(list: &mnt->mnt_list);
381	INIT_LIST_HEAD(list: &mnt->mnt_expire);
382	INIT_LIST_HEAD(list: &mnt->mnt_share);
383	INIT_LIST_HEAD(list: &mnt->mnt_slave_list);
384	INIT_LIST_HEAD(list: &mnt->mnt_slave);
385	INIT_HLIST_NODE(h: &mnt->mnt_mp_list);
386	INIT_LIST_HEAD(list: &mnt->mnt_umounting);
387	INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
388	RB_CLEAR_NODE(&mnt->mnt_node);
389	mnt->mnt.mnt_idmap = &nop_mnt_idmap;
390	}
391	return mnt;
392
393	#ifdef CONFIG_SMP
394	out_free_devname:
395	kfree_const(x: mnt->mnt_devname);
396	#endif
397	out_free_id:
398	mnt_free_id(mnt);
399	out_free_cache:
400	kmem_cache_free(s: mnt_cache, objp: mnt);
401	return NULL;
402	}
403
404	/*
405	* Most r/o checks on a fs are for operations that take
406	* discrete amounts of time, like a write() or unlink().
407	* We must keep track of when those operations start
408	* (for permission checks) and when they end, so that
409	* we can determine when writes are able to occur to
410	* a filesystem.
411	*/
412	/*
413	* __mnt_is_readonly: check whether a mount is read-only
414	* @mnt: the mount to check for its write status
415	*
416	* This shouldn't be used directly ouside of the VFS.
417	* It does not guarantee that the filesystem will stay
418	* r/w, just that it is right now. This can not and
419	* should not be used in place of IS_RDONLY(inode).
420	* mnt_want/drop_write() will _keep_ the filesystem
421	* r/w.
422	*/
423	bool __mnt_is_readonly(struct vfsmount *mnt)
424	{
425	return (mnt->mnt_flags & MNT_READONLY) \|\| sb_rdonly(sb: mnt->mnt_sb);
426	}
427	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
428
429	static inline void mnt_inc_writers(struct mount *mnt)
430	{
431	#ifdef CONFIG_SMP
432	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
433	#else
434	mnt->mnt_writers++;
435	#endif
436	}
437
438	static inline void mnt_dec_writers(struct mount *mnt)
439	{
440	#ifdef CONFIG_SMP
441	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
442	#else
443	mnt->mnt_writers--;
444	#endif
445	}
446
447	static unsigned int mnt_get_writers(struct mount *mnt)
448	{
449	#ifdef CONFIG_SMP
450	unsigned int count = `0`;
451	int cpu;
452
453	for_each_possible_cpu(cpu) {
454	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
455	}
456
457	return count;
458	#else
459	return mnt->mnt_writers;
460	#endif
461	}
462
463	static int mnt_is_readonly(struct vfsmount *mnt)
464	{
465	if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
466	return `1`;
467	/*
468	* The barrier pairs with the barrier in sb_start_ro_state_change()
469	* making sure if we don't see s_readonly_remount set yet, we also will
470	* not see any superblock / mount flag changes done by remount.
471	* It also pairs with the barrier in sb_end_ro_state_change()
472	* assuring that if we see s_readonly_remount already cleared, we will
473	* see the values of superblock / mount flags updated by remount.
474	*/
475	smp_rmb();
476	return __mnt_is_readonly(mnt);
477	}
478
479	/*
480	* Most r/o & frozen checks on a fs are for operations that take discrete
481	* amounts of time, like a write() or unlink(). We must keep track of when
482	* those operations start (for permission checks) and when they end, so that we
483	* can determine when writes are able to occur to a filesystem.
484	*/
485	/**
486	* mnt_get_write_access - get write access to a mount without freeze protection
487	* @m: the mount on which to take a write
488	*
489	* This tells the low-level filesystem that a write is about to be performed to
490	* it, and makes sure that writes are allowed (mnt it read-write) before
491	* returning success. This operation does not protect against filesystem being
492	* frozen. When the write operation is finished, mnt_put_write_access() must be
493	* called. This is effectively a refcount.
494	*/
495	int mnt_get_write_access(struct vfsmount *m)
496	{
497	struct mount *mnt = real_mount(mnt: m);
498	int ret = `0`;
499
500	preempt_disable();
501	mnt_inc_writers(mnt);
502	/*
503	* The store to mnt_inc_writers must be visible before we pass
504	* MNT_WRITE_HOLD loop below, so that the slowpath can see our
505	* incremented count after it has set MNT_WRITE_HOLD.
506	*/
507	smp_mb();
508	might_lock(&mount_lock.lock);
509	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
510	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
511	cpu_relax();
512	} else {
513	/*
514	* This prevents priority inversion, if the task
515	* setting MNT_WRITE_HOLD got preempted on a remote
516	* CPU, and it prevents life lock if the task setting
517	* MNT_WRITE_HOLD has a lower priority and is bound to
518	* the same CPU as the task that is spinning here.
519	*/
520	preempt_enable();
521	lock_mount_hash();
522	unlock_mount_hash();
523	preempt_disable();
524	}
525	}
526	/*
527	* The barrier pairs with the barrier sb_start_ro_state_change() making
528	* sure that if we see MNT_WRITE_HOLD cleared, we will also see
529	* s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
530	* mnt_is_readonly() and bail in case we are racing with remount
531	* read-only.
532	*/
533	smp_rmb();
534	if (mnt_is_readonly(mnt: m)) {
535	mnt_dec_writers(mnt);
536	ret = -EROFS;
537	}
538	preempt_enable();
539
540	return ret;
541	}
542	EXPORT_SYMBOL_GPL(mnt_get_write_access);
543
544	/**
545	* mnt_want_write - get write access to a mount
546	* @m: the mount on which to take a write
547	*
548	* This tells the low-level filesystem that a write is about to be performed to
549	* it, and makes sure that writes are allowed (mount is read-write, filesystem
550	* is not frozen) before returning success. When the write operation is
551	* finished, mnt_drop_write() must be called. This is effectively a refcount.
552	*/
553	int mnt_want_write(struct vfsmount *m)
554	{
555	int ret;
556
557	sb_start_write(sb: m->mnt_sb);
558	ret = mnt_get_write_access(m);
559	if (ret)
560	sb_end_write(sb: m->mnt_sb);
561	return ret;
562	}
563	EXPORT_SYMBOL_GPL(mnt_want_write);
564
565	/**
566	* mnt_get_write_access_file - get write access to a file's mount
567	* @file: the file who's mount on which to take a write
568	*
569	* This is like mnt_get_write_access, but if @file is already open for write it
570	* skips incrementing mnt_writers (since the open file already has a reference)
571	* and instead only does the check for emergency r/o remounts. This must be
572	* paired with mnt_put_write_access_file.
573	*/
574	int mnt_get_write_access_file(struct file *file)
575	{
576	if (file->f_mode & FMODE_WRITER) {
577	/*
578	* Superblock may have become readonly while there are still
579	* writable fd's, e.g. due to a fs error with errors=remount-ro
580	*/
581	if (__mnt_is_readonly(file->f_path.mnt))
582	return -EROFS;
583	return `0`;
584	}
585	return mnt_get_write_access(file->f_path.mnt);
586	}
587
588	/**
589	* mnt_want_write_file - get write access to a file's mount
590	* @file: the file who's mount on which to take a write
591	*
592	* This is like mnt_want_write, but if the file is already open for writing it
593	* skips incrementing mnt_writers (since the open file already has a reference)
594	* and instead only does the freeze protection and the check for emergency r/o
595	* remounts. This must be paired with mnt_drop_write_file.
596	*/
597	int mnt_want_write_file(struct file *file)
598	{
599	int ret;
600
601	sb_start_write(sb: file_inode(f: file)->i_sb);
602	ret = mnt_get_write_access_file(file);
603	if (ret)
604	sb_end_write(sb: file_inode(f: file)->i_sb);
605	return ret;
606	}
607	EXPORT_SYMBOL_GPL(mnt_want_write_file);
608
/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * mnt_get_write_access() call above.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_put_write_access);
624
625	/**
626	* mnt_drop_write - give up write access to a mount
627	* @mnt: the mount on which to give up write access
628	*
629	* Tells the low-level filesystem that we are done performing writes to it and
630	* also allows filesystem to be frozen again. Must be matched with
631	* mnt_want_write() call above.
632	*/
633	void mnt_drop_write(struct vfsmount *mnt)
634	{
635	mnt_put_write_access(mnt);
636	sb_end_write(sb: mnt->mnt_sb);
637	}
638	EXPORT_SYMBOL_GPL(mnt_drop_write);
639
640	void mnt_put_write_access_file(struct file *file)
641	{
642	if (!(file->f_mode & FMODE_WRITER))
643	mnt_put_write_access(file->f_path.mnt);
644	}
645
646	void mnt_drop_write_file(struct file *file)
647	{
648	mnt_put_write_access_file(file);
649	sb_end_write(sb: file_inode(f: file)->i_sb);
650	}
651	EXPORT_SYMBOL(mnt_drop_write_file);
652
653	/**
654	* mnt_hold_writers - prevent write access to the given mount
655	* @mnt: mnt to prevent write access to
656	*
657	* Prevents write access to @mnt if there are no active writers for @mnt.
658	* This function needs to be called and return successfully before changing
659	* properties of @mnt that need to remain stable for callers with write access
660	* to @mnt.
661	*
662	* After this functions has been called successfully callers must pair it with
663	* a call to mnt_unhold_writers() in order to stop preventing write access to
664	* @mnt.
665	*
666	* Context: This function expects lock_mount_hash() to be held serializing
667	* setting MNT_WRITE_HOLD.
668	* Return: On success 0 is returned.
669	* On error, -EBUSY is returned.
670	*/
671	static inline int mnt_hold_writers(struct mount *mnt)
672	{
673	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
674	/*
675	* After storing MNT_WRITE_HOLD, we'll read the counters. This store
676	* should be visible before we do.
677	*/
678	smp_mb();
679
680	/*
681	* With writers on hold, if this value is zero, then there are
682	* definitely no active writers (although held writers may subsequently
683	* increment the count, they'll have to wait, and decrement it after
684	* seeing MNT_READONLY).
685	*
686	* It is OK to have counter incremented on one CPU and decremented on
687	* another: the sum will add up correctly. The danger would be when we
688	* sum up each counter, if we read a counter before it is incremented,
689	* but then read another CPU's count which it has been subsequently
690	* decremented from -- we would see more decrements than we should.
691	* MNT_WRITE_HOLD protects against this scenario, because
692	* mnt_want_write first increments count, then smp_mb, then spins on
693	* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
694	* we're counting up here.
695	*/
696	if (mnt_get_writers(mnt) > `0`)
697	return -EBUSY;
698
699	return `0`;
700	}
701
702	/**
703	* mnt_unhold_writers - stop preventing write access to the given mount
704	* @mnt: mnt to stop preventing write access to
705	*
706	* Stop preventing write access to @mnt allowing callers to gain write access
707	* to @mnt again.
708	*
709	* This function can only be called after a successful call to
710	* mnt_hold_writers().
711	*
712	* Context: This function expects lock_mount_hash() to be held.
713	*/
714	static inline void mnt_unhold_writers(struct mount *mnt)
715	{
716	/*
717	* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
718	* that become unheld will see MNT_READONLY.
719	*/
720	smp_wmb();
721	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
722	}
723
724	static int mnt_make_readonly(struct mount *mnt)
725	{
726	int ret;
727
728	ret = mnt_hold_writers(mnt);
729	if (!ret)
730	mnt->mnt.mnt_flags \|= MNT_READONLY;
731	mnt_unhold_writers(mnt);
732	return ret;
733	}
734
735	int sb_prepare_remount_readonly(struct super_block *sb)
736	{
737	struct mount *mnt;
738	int err = `0`;
739
740	/ Racy optimization. Recheck the counter under MNT_WRITE_HOLD /
741	if (atomic_long_read(v: &sb->s_remove_count))
742	return -EBUSY;
743
744	lock_mount_hash();
745	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
746	if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
747	err = mnt_hold_writers(mnt);
748	if (err)
749	break;
750	}
751	}
752	if (!err && atomic_long_read(v: &sb->s_remove_count))
753	err = -EBUSY;
754
755	if (!err)
756	sb_start_ro_state_change(sb);
757	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
758	if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
759	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
760	}
761	unlock_mount_hash();
762
763	return err;
764	}
765
766	static void free_vfsmnt(struct mount *mnt)
767	{
768	mnt_idmap_put(idmap: mnt_idmap(mnt: &mnt->mnt));
769	kfree_const(x: mnt->mnt_devname);
770	#ifdef CONFIG_SMP
771	free_percpu(pdata: mnt->mnt_pcp);
772	#endif
773	kmem_cache_free(s: mnt_cache, objp: mnt);
774	}
775
776	static void delayed_free_vfsmnt(struct rcu_head *head)
777	{
778	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
779	}
780
781	/ call under rcu_read_lock /
782	int __legitimize_mnt(struct vfsmount bastard, unsigned* seq)
783	{
784	struct mount *mnt;
785	if (read_seqretry(sl: &mount_lock, start: seq))
786	return `1`;
787	if (bastard == NULL)
788	return `0`;
789	mnt = real_mount(mnt: bastard);
790	mnt_add_count(mnt, n: `1`);
791	smp_mb(); // see mntput_no_expire() and do_umount()
792	if (likely(!read_seqretry(&mount_lock, seq)))
793	return `0`;
794	lock_mount_hash();
795	if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT \| MNT_DOOMED))) {
796	mnt_add_count(mnt, n: -`1`);
797	unlock_mount_hash();
798	return `1`;
799	}
800	unlock_mount_hash();
801	/ caller will mntput() /
802	return -`1`;
803	}
804
805	/ call under rcu_read_lock /
806	static bool legitimize_mnt(struct vfsmount bastard, unsigned* seq)
807	{
808	int res = __legitimize_mnt(bastard, seq);
809	if (likely(!res))
810	return true;
811	if (unlikely(res < `0`)) {
812	rcu_read_unlock();
813	mntput(mnt: bastard);
814	rcu_read_lock();
815	}
816	return false;
817	}
818
819	/**
820	* __lookup_mnt - find first child mount
821	* @mnt: parent mount
822	* @dentry: mountpoint
823	*
824	* If @mnt has a child mount @c mounted @dentry find and return it.
825	*
826	* Note that the child mount @c need not be unique. There are cases
827	* where shadow mounts are created. For example, during mount
828	* propagation when a source mount @mnt whose root got overmounted by a
829	* mount @o after path lookup but before @namespace_sem could be
830	* acquired gets copied and propagated. So @mnt gets copied including
831	* @o. When @mnt is propagated to a destination mount @d that already
832	* has another mount @n mounted at the same mountpoint then the source
833	* mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
834	* @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
835	* on @dentry.
836	*
837	* Return: The first child of @mnt mounted @dentry or NULL.
838	*/
839	struct mount __lookup_mnt(struct* vfsmount mnt, struct* dentry *dentry)
840	{
841	struct hlist_head *head = m_hash(mnt, dentry);
842	struct mount *p;
843
844	hlist_for_each_entry_rcu(p, head, mnt_hash)
845	if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
846	return p;
847	return NULL;
848	}
849
850	/*
851	* lookup_mnt - Return the first child mount mounted at path
852	*
853	* "First" means first mounted chronologically. If you create the
854	* following mounts:
855	*
856	* mount /dev/sda1 /mnt
857	* mount /dev/sda2 /mnt
858	* mount /dev/sda3 /mnt
859	*
860	* Then lookup_mnt() on the base /mnt dentry in the root mount will
861	* return successively the root dentry and vfsmount of /dev/sda1, then
862	* /dev/sda2, then /dev/sda3, then NULL.
863	*
864	* lookup_mnt takes a reference to the found vfsmount.
865	*/
866	struct vfsmount lookup_mnt(const* struct path *path)
867	{
868	struct mount *child_mnt;
869	struct vfsmount *m;
870	unsigned seq;
871
872	rcu_read_lock();
873	do {
874	seq = read_seqbegin(sl: &mount_lock);
875	child_mnt = __lookup_mnt(mnt: path->mnt, dentry: path->dentry);
876	m = child_mnt ? &child_mnt->mnt : NULL;
877	} while (!legitimize_mnt(bastard: m, seq));
878	rcu_read_unlock();
879	return m;
880	}
881
882	/*
883	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
884	* current mount namespace.
885	*
886	* The common case is dentries are not mountpoints at all and that
887	* test is handled inline. For the slow case when we are actually
888	* dealing with a mountpoint of some kind, walk through all of the
889	* mounts in the current mount namespace and test to see if the dentry
890	* is a mountpoint.
891	*
892	* The mount_hashtable is not usable in the context because we
893	* need to identify all mounts that may be in the current mount
894	* namespace not just a mount that happens to have some specified
895	* parent mount.
896	*/
897	bool __is_local_mountpoint(struct dentry *dentry)
898	{
899	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
900	struct mount mnt, n;
901	bool is_covered = false;
902
903	down_read(sem: &namespace_sem);
904	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
905	is_covered = (mnt->mnt_mountpoint == dentry);
906	if (is_covered)
907	break;
908	}
909	up_read(sem: &namespace_sem);
910
911	return is_covered;
912	}
913
914	static struct mountpoint lookup_mountpoint(struct* dentry *dentry)
915	{
916	struct hlist_head *chain = mp_hash(dentry);
917	struct mountpoint *mp;
918
919	hlist_for_each_entry(mp, chain, m_hash) {
920	if (mp->m_dentry == dentry) {
921	mp->m_count++;
922	return mp;
923	}
924	}
925	return NULL;
926	}
927
928	static struct mountpoint get_mountpoint(struct* dentry *dentry)
929	{
930	struct mountpoint mp, new = NULL;
931	int ret;
932
933	if (d_mountpoint(dentry)) {
934	/ might be worth a WARN_ON() /
935	if (d_unlinked(dentry))
936	return ERR_PTR(error: -ENOENT);
937	mountpoint:
938	read_seqlock_excl(sl: &mount_lock);
939	mp = lookup_mountpoint(dentry);
940	read_sequnlock_excl(sl: &mount_lock);
941	if (mp)
942	goto done;
943	}
944
945	if (!new)
946	new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
947	if (!new)
948	return ERR_PTR(error: -ENOMEM);
949
950
951	/ Exactly one processes may set d_mounted /
952	ret = d_set_mounted(dentry);
953
954	/ Someone else set d_mounted? /
955	if (ret == -EBUSY)
956	goto mountpoint;
957
958	/ The dentry is not available as a mountpoint? /
959	mp = ERR_PTR(error: ret);
960	if (ret)
961	goto done;
962
963	/ Add the new mountpoint to the hash table /
964	read_seqlock_excl(sl: &mount_lock);
965	new->m_dentry = dget(dentry);
966	new->m_count = `1`;
967	hlist_add_head(n: &new->m_hash, h: mp_hash(dentry));
968	INIT_HLIST_HEAD(&new->m_list);
969	read_sequnlock_excl(sl: &mount_lock);
970
971	mp = new;
972	new = NULL;
973	done:
974	kfree(objp: new);
975	return mp;
976	}
977
978	/*
979	* vfsmount lock must be held. Additionally, the caller is responsible
980	* for serializing calls for given disposal list.
981	*/
982	static void __put_mountpoint(struct mountpoint mp, struct* list_head *list)
983	{
984	if (!--mp->m_count) {
985	struct dentry *dentry = mp->m_dentry;
986	BUG_ON(!hlist_empty(&mp->m_list));
987	spin_lock(lock: &dentry->d_lock);
988	dentry->d_flags &= ~DCACHE_MOUNTED;
989	spin_unlock(lock: &dentry->d_lock);
990	dput_to_list(dentry, list);
991	hlist_del(n: &mp->m_hash);
992	kfree(objp: mp);
993	}
994	}
995
996	/ called with namespace_lock and vfsmount lock /
997	static void put_mountpoint(struct mountpoint *mp)
998	{
999	__put_mountpoint(mp, list: &ex_mountpoints);
1000	}
1001
1002	static inline int check_mnt(struct mount *mnt)
1003	{
1004	return mnt->mnt_ns == current->nsproxy->mnt_ns;
1005	}
1006
1007	static inline bool check_anonymous_mnt(struct mount *mnt)
1008	{
1009	u64 seq;
1010
1011	if (!is_anon_ns(ns: mnt->mnt_ns))
1012	return false;
1013
1014	seq = mnt->mnt_ns->seq_origin;
1015	return !seq \|\| (seq == current->nsproxy->mnt_ns->seq);
1016	}
1017
1018	/*
1019	* vfsmount lock must be held for write
1020	*/
1021	static void touch_mnt_namespace(struct mnt_namespace *ns)
1022	{
1023	if (ns) {
1024	ns->event = ++event;
1025	wake_up_interruptible(&ns->poll);
1026	}
1027	}
1028
1029	/*
1030	* vfsmount lock must be held for write
1031	*/
1032	static void __touch_mnt_namespace(struct mnt_namespace *ns)
1033	{
1034	if (ns && ns->event != event) {
1035	ns->event = event;
1036	wake_up_interruptible(&ns->poll);
1037	}
1038	}
1039
1040	/*
1041	* vfsmount lock must be held for write
1042	*/
1043	static struct mountpoint unhash_mnt(struct* mount *mnt)
1044	{
1045	struct mountpoint *mp;
1046	mnt->mnt_parent = mnt;
1047	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1048	list_del_init(entry: &mnt->mnt_child);
1049	hlist_del_init_rcu(n: &mnt->mnt_hash);
1050	hlist_del_init(n: &mnt->mnt_mp_list);
1051	mp = mnt->mnt_mp;
1052	mnt->mnt_mp = NULL;
1053	return mp;
1054	}
1055
/*
 * vfsmount lock must be held for write
 *
 * Detach @mnt from its parent and drop the count on its mountpoint.
 */
static void umount_mnt(struct mount *mnt)
{
	put_mountpoint(unhash_mnt(mnt));
}
1063
1064	/*
1065	* vfsmount lock must be held for write
1066	*/
1067	void mnt_set_mountpoint(struct mount *mnt,
1068	struct mountpoint *mp,
1069	struct mount *child_mnt)
1070	{
1071	mp->m_count++;
1072	mnt_add_count(mnt, n: `1`); / essentially, that's mntget /
1073	child_mnt->mnt_mountpoint = mp->m_dentry;
1074	child_mnt->mnt_parent = mnt;
1075	child_mnt->mnt_mp = mp;
1076	hlist_add_head(n: &child_mnt->mnt_mp_list, h: &mp->m_list);
1077	}
1078
1079	/**
1080	* mnt_set_mountpoint_beneath - mount a mount beneath another one
1081	*
1082	* @new_parent: the source mount
1083	* @top_mnt: the mount beneath which @new_parent is mounted
1084	* @new_mp: the new mountpoint of @top_mnt on @new_parent
1085	*
1086	* Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
1087	* parent @top_mnt->mnt_parent and mount it on top of @new_parent at
1088	* @new_mp. And mount @new_parent on the old parent and old
1089	* mountpoint of @top_mnt.
1090	*
1091	* Context: This function expects namespace_lock() and lock_mount_hash()
1092	* to have been acquired in that order.
1093	*/
1094	static void mnt_set_mountpoint_beneath(struct mount *new_parent,
1095	struct mount *top_mnt,
1096	struct mountpoint *new_mp)
1097	{
1098	struct mount *old_top_parent = top_mnt->mnt_parent;
1099	struct mountpoint *old_top_mp = top_mnt->mnt_mp;
1100
1101	mnt_set_mountpoint(mnt: old_top_parent, mp: old_top_mp, child_mnt: new_parent);
1102	mnt_change_mountpoint(parent: new_parent, mp: new_mp, mnt: top_mnt);
1103	}
1104
1105
1106	static void __attach_mnt(struct mount mnt, struct* mount *parent)
1107	{
1108	hlist_add_head_rcu(n: &mnt->mnt_hash,
1109	h: m_hash(mnt: &parent->mnt, dentry: mnt->mnt_mountpoint));
1110	list_add_tail(new: &mnt->mnt_child, head: &parent->mnt_mounts);
1111	}
1112
1113	/**
1114	* attach_mnt - mount a mount, attach to @mount_hashtable and parent's
1115	* list of child mounts
1116	* @parent: the parent
1117	* @mnt: the new mount
1118	* @mp: the new mountpoint
1119	* @beneath: whether to mount @mnt beneath or on top of @parent
1120	*
1121	* If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
1122	* to @parent's child mount list and to @mount_hashtable.
1123	*
1124	* If @beneath is true, remove @mnt from its current parent and
1125	* mountpoint and mount it on @mp on @parent, and mount @parent on the
1126	* old parent and old mountpoint of @mnt. Finally, attach @parent to
1127	* @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
1128	*
1129	* Note, when __attach_mnt() is called @mnt->mnt_parent already points
1130	* to the correct parent.
1131	*
1132	* Context: This function expects namespace_lock() and lock_mount_hash()
1133	* to have been acquired in that order.
1134	*/
1135	static void attach_mnt(struct mount mnt, struct* mount *parent,
1136	struct mountpoint *mp, bool beneath)
1137	{
1138	if (beneath)
1139	mnt_set_mountpoint_beneath(new_parent: mnt, top_mnt: parent, new_mp: mp);
1140	else
1141	mnt_set_mountpoint(mnt: parent, mp, child_mnt: mnt);
1142	/*
1143	* Note, @mnt->mnt_parent has to be used. If @mnt was mounted
1144	* beneath @parent then @mnt will need to be attached to
1145	* @parent's old parent, not @parent. IOW, @mnt->mnt_parent
1146	* isn't the same mount as @parent.
1147	*/
1148	__attach_mnt(mnt, parent: mnt->mnt_parent);
1149	}
1150
1151	void mnt_change_mountpoint(struct mount parent, struct* mountpoint mp, struct* mount *mnt)
1152	{
1153	struct mountpoint *old_mp = mnt->mnt_mp;
1154	struct mount *old_parent = mnt->mnt_parent;
1155
1156	list_del_init(entry: &mnt->mnt_child);
1157	hlist_del_init(n: &mnt->mnt_mp_list);
1158	hlist_del_init_rcu(n: &mnt->mnt_hash);
1159
1160	attach_mnt(mnt, parent, mp, beneath: false);
1161
1162	put_mountpoint(mp: old_mp);
1163	mnt_add_count(mnt: old_parent, n: -`1`);
1164	}
1165
1166	static inline struct mount node_to_mount(struct* rb_node *node)
1167	{
1168	return node ? rb_entry(node, struct mount, mnt_node) : NULL;
1169	}
1170
1171	static void mnt_add_to_ns(struct mnt_namespace ns, struct* mount *mnt)
1172	{
1173	struct rb_node **link = &ns->mounts.rb_node;
1174	struct rb_node *parent = NULL;
1175	bool mnt_first_node = true, mnt_last_node = true;
1176
1177	WARN_ON(mnt_ns_attached(mnt));
1178	mnt->mnt_ns = ns;
1179	while (*link) {
1180	parent = *link;
1181	if (mnt->mnt_id_unique < node_to_mount(node: parent)->mnt_id_unique) {
1182	link = &parent->rb_left;
1183	mnt_last_node = false;
1184	} else {
1185	link = &parent->rb_right;
1186	mnt_first_node = false;
1187	}
1188	}
1189
1190	if (mnt_last_node)
1191	ns->mnt_last_node = &mnt->mnt_node;
1192	if (mnt_first_node)
1193	ns->mnt_first_node = &mnt->mnt_node;
1194	rb_link_node(node: &mnt->mnt_node, parent, rb_link: link);
1195	rb_insert_color(&mnt->mnt_node, &ns->mounts);
1196
1197	mnt_notify_add(m: mnt);
1198	}
1199
1200	/*
1201	* vfsmount lock must be held for write
1202	*/
1203	static void commit_tree(struct mount *mnt)
1204	{
1205	struct mount *parent = mnt->mnt_parent;
1206	struct mount *m;
1207	LIST_HEAD(head);
1208	struct mnt_namespace *n = parent->mnt_ns;
1209
1210	BUG_ON(parent == mnt);
1211
1212	list_add_tail(new: &head, head: &mnt->mnt_list);
1213	while (!list_empty(head: &head)) {
1214	m = list_first_entry(&head, typeof(*m), mnt_list);
1215	list_del(entry: &m->mnt_list);
1216
1217	mnt_add_to_ns(ns: n, mnt: m);
1218	}
1219	n->nr_mounts += n->pending_mounts;
1220	n->pending_mounts = `0`;
1221
1222	__attach_mnt(mnt, parent);
1223	touch_mnt_namespace(ns: n);
1224	}
1225
1226	static struct mount next_mnt(struct* mount p, struct* mount *root)
1227	{
1228	struct list_head *next = p->mnt_mounts.next;
1229	if (next == &p->mnt_mounts) {
1230	while (`1`) {
1231	if (p == root)
1232	return NULL;
1233	next = p->mnt_child.next;
1234	if (next != &p->mnt_parent->mnt_mounts)
1235	break;
1236	p = p->mnt_parent;
1237	}
1238	}
1239	return list_entry(next, struct mount, mnt_child);
1240	}
1241
1242	static struct mount skip_mnt_tree(struct* mount *p)
1243	{
1244	struct list_head *prev = p->mnt_mounts.prev;
1245	while (prev != &p->mnt_mounts) {
1246	p = list_entry(prev, struct mount, mnt_child);
1247	prev = p->mnt_mounts.prev;
1248	}
1249	return p;
1250	}
1251
1252	/**
1253	* vfs_create_mount - Create a mount for a configured superblock
1254	* @fc: The configuration context with the superblock attached
1255	*
1256	* Create a mount to an already configured superblock. If necessary, the
1257	* caller should invoke vfs_get_tree() before calling this.
1258	*
1259	* Note that this does not attach the mount to anything.
1260	*/
1261	struct vfsmount vfs_create_mount(struct* fs_context *fc)
1262	{
1263	struct mount *mnt;
1264
1265	if (!fc->root)
1266	return ERR_PTR(error: -EINVAL);
1267
1268	mnt = alloc_vfsmnt(name: fc->source);
1269	if (!mnt)
1270	return ERR_PTR(error: -ENOMEM);
1271
1272	if (fc->sb_flags & SB_KERNMOUNT)
1273	mnt->mnt.mnt_flags = MNT_INTERNAL;
1274
1275	atomic_inc(v: &fc->root->d_sb->s_active);
1276	mnt->mnt.mnt_sb = fc->root->d_sb;
1277	mnt->mnt.mnt_root = dget(dentry: fc->root);
1278	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1279	mnt->mnt_parent = mnt;
1280
1281	lock_mount_hash();
1282	list_add_tail(new: &mnt->mnt_instance, head: &mnt->mnt.mnt_sb->s_mounts);
1283	unlock_mount_hash();
1284	return &mnt->mnt;
1285	}
1286	EXPORT_SYMBOL(vfs_create_mount);
1287
1288	struct vfsmount fc_mount(struct* fs_context *fc)
1289	{
1290	int err = vfs_get_tree(fc);
1291	if (!err) {
1292	up_write(sem: &fc->root->d_sb->s_umount);
1293	return vfs_create_mount(fc);
1294	}
1295	return ERR_PTR(error: err);
1296	}
1297	EXPORT_SYMBOL(fc_mount);
1298
1299	struct vfsmount vfs_kern_mount(struct* file_system_type *type,
1300	int flags, const char *name,
1301	void *data)
1302	{
1303	struct fs_context *fc;
1304	struct vfsmount *mnt;
1305	int ret = `0`;
1306
1307	if (!type)
1308	return ERR_PTR(error: -EINVAL);
1309
1310	fc = fs_context_for_mount(fs_type: type, sb_flags: flags);
1311	if (IS_ERR(ptr: fc))
1312	return ERR_CAST(ptr: fc);
1313
1314	if (name)
1315	ret = vfs_parse_fs_string(fc, key: "source",
1316	value: name, strlen(name));
1317	if (!ret)
1318	ret = parse_monolithic_mount_data(fc, data);
1319	if (!ret)
1320	mnt = fc_mount(fc);
1321	else
1322	mnt = ERR_PTR(error: ret);
1323
1324	put_fs_context(fc);
1325	return mnt;
1326	}
1327	EXPORT_SYMBOL_GPL(vfs_kern_mount);
1328
1329	static struct mount clone_mnt(struct* mount old, struct* dentry *root,
1330	int flag)
1331	{
1332	struct super_block *sb = old->mnt.mnt_sb;
1333	struct mount *mnt;
1334	int err;
1335
1336	mnt = alloc_vfsmnt(name: old->mnt_devname);
1337	if (!mnt)
1338	return ERR_PTR(error: -ENOMEM);
1339
1340	if (flag & (CL_SLAVE \| CL_PRIVATE \| CL_SHARED_TO_SLAVE))
1341	mnt->mnt_group_id = `0`; / not a peer of original /
1342	else
1343	mnt->mnt_group_id = old->mnt_group_id;
1344
1345	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
1346	err = mnt_alloc_group_id(mnt);
1347	if (err)
1348	goto out_free;
1349	}
1350
1351	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
1352	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD\|MNT_MARKED\|MNT_INTERNAL);
1353
1354	atomic_inc(v: &sb->s_active);
1355	mnt->mnt.mnt_idmap = mnt_idmap_get(idmap: mnt_idmap(mnt: &old->mnt));
1356
1357	mnt->mnt.mnt_sb = sb;
1358	mnt->mnt.mnt_root = dget(dentry: root);
1359	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1360	mnt->mnt_parent = mnt;
1361	lock_mount_hash();
1362	list_add_tail(new: &mnt->mnt_instance, head: &sb->s_mounts);
1363	unlock_mount_hash();
1364
1365	if ((flag & CL_SLAVE) \|\|
1366	((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
1367	list_add(new: &mnt->mnt_slave, head: &old->mnt_slave_list);
1368	mnt->mnt_master = old;
1369	CLEAR_MNT_SHARED(mnt);
1370	} else if (!(flag & CL_PRIVATE)) {
1371	if ((flag & CL_MAKE_SHARED) \|\| IS_MNT_SHARED(old))
1372	list_add(new: &mnt->mnt_share, head: &old->mnt_share);
1373	if (IS_MNT_SLAVE(old))
1374	list_add(new: &mnt->mnt_slave, head: &old->mnt_slave);
1375	mnt->mnt_master = old->mnt_master;
1376	} else {
1377	CLEAR_MNT_SHARED(mnt);
1378	}
1379	if (flag & CL_MAKE_SHARED)
1380	set_mnt_shared(mnt);
1381
1382	/ stick the duplicate mount on the same expiry list*
1383	* as the original if that was on one */
1384	if (flag & CL_EXPIRE) {
1385	if (!list_empty(head: &old->mnt_expire))
1386	list_add(new: &mnt->mnt_expire, head: &old->mnt_expire);
1387	}
1388
1389	return mnt;
1390
1391	out_free:
1392	mnt_free_id(mnt);
1393	free_vfsmnt(mnt);
1394	return ERR_PTR(error: err);
1395	}
1396
1397	static void cleanup_mnt(struct mount *mnt)
1398	{
1399	struct hlist_node *p;
1400	struct mount *m;
1401	/*
1402	* The warning here probably indicates that somebody messed
1403	* up a mnt_want/drop_write() pair. If this happens, the
1404	* filesystem was probably unable to make r/w->r/o transitions.
1405	* The locking used to deal with mnt_count decrement provides barriers,
1406	* so mnt_get_writers() below is safe.
1407	*/
1408	WARN_ON(mnt_get_writers(mnt));
1409	if (unlikely(mnt->mnt_pins.first))
1410	mnt_pin_kill(m: mnt);
1411	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
1412	hlist_del(n: &m->mnt_umount);
1413	mntput(mnt: &m->mnt);
1414	}
1415	fsnotify_vfsmount_delete(mnt: &mnt->mnt);
1416	dput(mnt->mnt.mnt_root);
1417	deactivate_super(sb: mnt->mnt.mnt_sb);
1418	mnt_free_id(mnt);
1419	call_rcu(head: &mnt->mnt_rcu, func: delayed_free_vfsmnt);
1420	}
1421
1422	static void __cleanup_mnt(struct rcu_head *head)
1423	{
1424	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1425	}
1426
1427	static LLIST_HEAD(delayed_mntput_list);
1428	static void delayed_mntput(struct work_struct *unused)
1429	{
1430	struct llist_node *node = llist_del_all(head: &delayed_mntput_list);
1431	struct mount m, t;
1432
1433	llist_for_each_entry_safe(m, t, node, mnt_llist)
1434	cleanup_mnt(mnt: m);
1435	}
1436	static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1437
1438	static void mntput_no_expire(struct mount *mnt)
1439	{
1440	LIST_HEAD(list);
1441	int count;
1442
1443	rcu_read_lock();
1444	if (likely(READ_ONCE(mnt->mnt_ns))) {
1445	/*
1446	* Since we don't do lock_mount_hash() here,
1447	* ->mnt_ns can change under us. However, if it's
1448	* non-NULL, then there's a reference that won't
1449	* be dropped until after an RCU delay done after
1450	* turning ->mnt_ns NULL. So if we observe it
1451	* non-NULL under rcu_read_lock(), the reference
1452	* we are dropping is not the final one.
1453	*/
1454	mnt_add_count(mnt, n: -`1`);
1455	rcu_read_unlock();
1456	return;
1457	}
1458	lock_mount_hash();
1459	/*
1460	* make sure that if __legitimize_mnt() has not seen us grab
1461	* mount_lock, we'll see their refcount increment here.
1462	*/
1463	smp_mb();
1464	mnt_add_count(mnt, n: -`1`);
1465	count = mnt_get_count(mnt);
1466	if (count != `0`) {
1467	WARN_ON(count < `0`);
1468	rcu_read_unlock();
1469	unlock_mount_hash();
1470	return;
1471	}
1472	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1473	rcu_read_unlock();
1474	unlock_mount_hash();
1475	return;
1476	}
1477	mnt->mnt.mnt_flags \|= MNT_DOOMED;
1478	rcu_read_unlock();
1479
1480	list_del(entry: &mnt->mnt_instance);
1481
1482	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1483	struct mount p, tmp;
1484	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1485	__put_mountpoint(mp: unhash_mnt(mnt: p), list: &list);
1486	hlist_add_head(n: &p->mnt_umount, h: &mnt->mnt_stuck_children);
1487	}
1488	}
1489	unlock_mount_hash();
1490	shrink_dentry_list(&list);
1491
1492	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1493	struct task_struct *task = current;
1494	if (likely(!(task->flags & PF_KTHREAD))) {
1495	init_task_work(twork: &mnt->mnt_rcu, func: __cleanup_mnt);
1496	if (!task_work_add(task, twork: &mnt->mnt_rcu, mode: TWA_RESUME))
1497	return;
1498	}
1499	if (llist_add(new: &mnt->mnt_llist, head: &delayed_mntput_list))
1500	schedule_delayed_work(dwork: &delayed_mntput_work, delay: `1`);
1501	return;
1502	}
1503	cleanup_mnt(mnt);
1504	}
1505
1506	void mntput(struct vfsmount *mnt)
1507	{
1508	if (mnt) {
1509	struct mount *m = real_mount(mnt);
1510	/ avoid cacheline pingpong /
1511	if (unlikely(m->mnt_expiry_mark))
1512	WRITE_ONCE(m->mnt_expiry_mark, `0`);
1513	mntput_no_expire(mnt: m);
1514	}
1515	}
1516	EXPORT_SYMBOL(mntput);
1517
/*
 * Take a reference on @mnt (NULL is allowed and ignored).
 * Returns @mnt so that calls can be chained.
 */
struct vfsmount *mntget(struct vfsmount *mnt)
{
	if (mnt)
		mnt_add_count(real_mount(mnt), 1);
	return mnt;
}
EXPORT_SYMBOL(mntget);
1525
1526	/*
1527	* Make a mount point inaccessible to new lookups.
1528	* Because there may still be current users, the caller MUST WAIT
1529	* for an RCU grace period before destroying the mount point.
1530	*/
1531	void mnt_make_shortterm(struct vfsmount *mnt)
1532	{
1533	if (mnt)
1534	real_mount(mnt)->mnt_ns = NULL;
1535	}
1536
1537	/**
1538	* path_is_mountpoint() - Check if path is a mount in the current namespace.
1539	* @path: path to check
1540	*
1541	* d_mountpoint() can only be used reliably to establish if a dentry is
1542	* not mounted in any namespace and that common case is handled inline.
1543	* d_mountpoint() isn't aware of the possibility there may be multiple
1544	* mounts using a given dentry in a different namespace. This function
1545	* checks if the passed in path is a mountpoint rather than the dentry
1546	* alone.
1547	*/
1548	bool path_is_mountpoint(const struct path *path)
1549	{
1550	unsigned seq;
1551	bool res;
1552
1553	if (!d_mountpoint(dentry: path->dentry))
1554	return false;
1555
1556	rcu_read_lock();
1557	do {
1558	seq = read_seqbegin(sl: &mount_lock);
1559	res = __path_is_mountpoint(path);
1560	} while (read_seqretry(sl: &mount_lock, start: seq));
1561	rcu_read_unlock();
1562
1563	return res;
1564	}
1565	EXPORT_SYMBOL(path_is_mountpoint);
1566
1567	struct vfsmount mnt_clone_internal(const* struct path *path)
1568	{
1569	struct mount *p;
1570	p = clone_mnt(old: real_mount(mnt: path->mnt), root: path->dentry, CL_PRIVATE);
1571	if (IS_ERR(ptr: p))
1572	return ERR_CAST(ptr: p);
1573	p->mnt.mnt_flags \|= MNT_INTERNAL;
1574	return &p->mnt;
1575	}
1576
1577	/*
1578	* Returns the mount which either has the specified mnt_id, or has the next
1579	* smallest id afer the specified one.
1580	*/
1581	static struct mount mnt_find_id_at(struct* mnt_namespace *ns, u64 mnt_id)
1582	{
1583	struct rb_node *node = ns->mounts.rb_node;
1584	struct mount *ret = NULL;
1585
1586	while (node) {
1587	struct mount *m = node_to_mount(node);
1588
1589	if (mnt_id <= m->mnt_id_unique) {
1590	ret = node_to_mount(node);
1591	if (mnt_id == m->mnt_id_unique)
1592	break;
1593	node = node->rb_left;
1594	} else {
1595	node = node->rb_right;
1596	}
1597	}
1598	return ret;
1599	}
1600
1601	/*
1602	* Returns the mount which either has the specified mnt_id, or has the next
1603	* greater id before the specified one.
1604	*/
1605	static struct mount mnt_find_id_at_reverse(struct* mnt_namespace *ns, u64 mnt_id)
1606	{
1607	struct rb_node *node = ns->mounts.rb_node;
1608	struct mount *ret = NULL;
1609
1610	while (node) {
1611	struct mount *m = node_to_mount(node);
1612
1613	if (mnt_id >= m->mnt_id_unique) {
1614	ret = node_to_mount(node);
1615	if (mnt_id == m->mnt_id_unique)
1616	break;
1617	node = node->rb_right;
1618	} else {
1619	node = node->rb_left;
1620	}
1621	}
1622	return ret;
1623	}
1624
#ifdef CONFIG_PROC_FS

/* iterator; we want it to have access to namespace_sem, thus here... */

/*
 * seq_file ->start: take namespace_sem for reading (released in
 * m_stop()) and position on the first mount whose unique id is >= *pos.
 */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_mounts *p = m->private;

	down_read(&namespace_sem);

	return mnt_find_id_at(p->ns, *pos);
}

/*
 * seq_file ->next: step to the rbtree successor of @v.  *pos is set to
 * the successor's unique id, or merely incremented when there is none.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct mount *next = NULL, *mnt = v;
	struct rb_node *node = rb_next(&mnt->mnt_node);

	++*pos;
	if (node) {
		next = node_to_mount(node);
		*pos = next->mnt_id_unique;
	}
	return next;
}

/* seq_file ->stop: drop the namespace_sem taken by m_start(). */
static void m_stop(struct seq_file *m, void *v)
{
	up_read(&namespace_sem);
}

/* seq_file ->show: delegate to the ->show of the proc_mounts instance. */
static int m_show(struct seq_file *m, void *v)
{
	struct proc_mounts *p = m->private;
	struct mount *r = v;

	return p->show(m, &r->mnt);
}

const struct seq_operations mounts_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= m_show,
};

#endif  /* CONFIG_PROC_FS */
1670
/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 *
 * Every mount in the tree is allowed two references; the tree is
 * considered busy when the summed reference counts exceed that.
 *
 * Return: 0 if the tree is busy, 1 otherwise.
 */
int may_umount_tree(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int actual_refs = 0;
	int minimum_refs = 0;
	struct mount *p;
	BUG_ON(!m);

	/* write lock needed for mnt_get_count */
	lock_mount_hash();
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		actual_refs += mnt_get_count(p);
		minimum_refs += 2;
	}
	unlock_mount_hash();

	if (actual_refs > minimum_refs)
		return 0;

	return 1;
}

EXPORT_SYMBOL(may_umount_tree);
1702
1703	/**
1704	* may_umount - check if a mount point is busy
1705	* @mnt: root of mount
1706	*
1707	* This is called to check if a mount point has any
1708	* open files, pwds, chroots or sub mounts. If the
1709	* mount has sub mounts this will return busy
1710	* regardless of whether the sub mounts are busy.
1711	*
1712	* Doesn't take quota and stuff into account. IOW, in some cases it will
1713	* give false negatives. The main reason why it's here is that we need
1714	* a non-destructive way to look for easily umountable filesystems.
1715	*/
1716	int may_umount(struct vfsmount *mnt)
1717	{
1718	int ret = `1`;
1719	down_read(sem: &namespace_sem);
1720	lock_mount_hash();
1721	if (propagate_mount_busy(real_mount(mnt), `2`))
1722	ret = `0`;
1723	unlock_mount_hash();
1724	up_read(sem: &namespace_sem);
1725	return ret;
1726	}
1727
1728	EXPORT_SYMBOL(may_umount);
1729
1730	#ifdef CONFIG_FSNOTIFY
1731	static void mnt_notify(struct mount *p)
1732	{
1733	if (!p->prev_ns && p->mnt_ns) {
1734	fsnotify_mnt_attach(ns: p->mnt_ns, mnt: &p->mnt);
1735	} else if (p->prev_ns && !p->mnt_ns) {
1736	fsnotify_mnt_detach(ns: p->prev_ns, mnt: &p->mnt);
1737	} else if (p->prev_ns == p->mnt_ns) {
1738	fsnotify_mnt_move(ns: p->mnt_ns, mnt: &p->mnt);
1739	} else {
1740	fsnotify_mnt_detach(ns: p->prev_ns, mnt: &p->mnt);
1741	fsnotify_mnt_attach(ns: p->mnt_ns, mnt: &p->mnt);
1742	}
1743	p->prev_ns = p->mnt_ns;
1744	}
1745
1746	static void notify_mnt_list(void)
1747	{
1748	struct mount m, tmp;
1749	/*
1750	* Notify about mounts that were added/reparented/detached/remain
1751	* connected after unmount.
1752	*/
1753	list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
1754	mnt_notify(p: m);
1755	list_del_init(entry: &m->to_notify);
1756	}
1757	}
1758
1759	static bool need_notify_mnt_list(void)
1760	{
1761	return !list_empty(head: &notify_list);
1762	}
1763	#else
1764	static void notify_mnt_list(void)
1765	{
1766	}
1767
1768	static bool need_notify_mnt_list(void)
1769	{
1770	return false;
1771	}
1772	#endif
1773
1774	static void namespace_unlock(void)
1775	{
1776	struct hlist_head head;
1777	struct hlist_node *p;
1778	struct mount *m;
1779	LIST_HEAD(list);
1780
1781	hlist_move_list(old: &unmounted, new: &head);
1782	list_splice_init(list: &ex_mountpoints, head: &list);
1783
1784	if (need_notify_mnt_list()) {
1785	/*
1786	* No point blocking out concurrent readers while notifications
1787	* are sent. This will also allow statmount()/listmount() to run
1788	* concurrently.
1789	*/
1790	downgrade_write(sem: &namespace_sem);
1791	notify_mnt_list();
1792	up_read(sem: &namespace_sem);
1793	} else {
1794	up_write(sem: &namespace_sem);
1795	}
1796
1797	shrink_dentry_list(&list);
1798
1799	if (likely(hlist_empty(&head)))
1800	return;
1801
1802	synchronize_rcu_expedited();
1803
1804	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
1805	hlist_del(n: &m->mnt_umount);
1806	mntput(&m->mnt);
1807	}
1808	}
1809
1810	static inline void namespace_lock(void)
1811	{
1812	down_write(sem: &namespace_sem);
1813	}
1814
1815	DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
1816
/* Bit flags selecting how umount_tree() behaves; may be OR-ed together. */
enum umount_tree_flags {
	UMOUNT_SYNC = 1,	/* synchronous umount: always disconnect */
	UMOUNT_PROPAGATE = 2,	/* propagate the umount to other mounts */
	UMOUNT_CONNECTED = 4,	/* ask for mounts to be left connected */
};
1822
1823	static bool disconnect_mount(struct mount mnt, enum* umount_tree_flags how)
1824	{
1825	/ Leaving mounts connected is only valid for lazy umounts /
1826	if (how & UMOUNT_SYNC)
1827	return true;
1828
1829	/ A mount without a parent has nothing to be connected to /
1830	if (!mnt_has_parent(mnt))
1831	return true;
1832
1833	/ Because the reference counting rules change when mounts are*
1834	* unmounted and connected, umounted mounts may not be
1835	* connected to mounted mounts.
1836	*/
1837	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1838	return true;
1839
1840	/ Has it been requested that the mount remain connected? /
1841	if (how & UMOUNT_CONNECTED)
1842	return false;
1843
1844	/ Is the mount locked such that it needs to remain connected? /
1845	if (IS_MNT_LOCKED(mnt))
1846	return false;
1847
1848	/ By default disconnect the mount /
1849	return true;
1850	}
1851
1852	/*
1853	* mount_lock must be held
1854	* namespace_sem must be held for write
1855	*/
1856	static void umount_tree(struct mount mnt, enum* umount_tree_flags how)
1857	{
1858	LIST_HEAD(tmp_list);
1859	struct mount *p;
1860
1861	if (how & UMOUNT_PROPAGATE)
1862	propagate_mount_unlock(mnt);
1863
1864	/ Gather the mounts to umount /
1865	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
1866	p->mnt.mnt_flags \|= MNT_UMOUNT;
1867	if (mnt_ns_attached(mnt: p))
1868	move_from_ns(mnt: p, dt_list: &tmp_list);
1869	else
1870	list_move(list: &p->mnt_list, head: &tmp_list);
1871	}
1872
1873	/ Hide the mounts from mnt_mounts /
1874	list_for_each_entry(p, &tmp_list, mnt_list) {
1875	list_del_init(entry: &p->mnt_child);
1876	}
1877
1878	/ Add propagated mounts to the tmp_list /
1879	if (how & UMOUNT_PROPAGATE)
1880	propagate_umount(&tmp_list);
1881
1882	while (!list_empty(head: &tmp_list)) {
1883	struct mnt_namespace *ns;
1884	bool disconnect;
1885	p = list_first_entry(&tmp_list, struct mount, mnt_list);
1886	list_del_init(entry: &p->mnt_expire);
1887	list_del_init(entry: &p->mnt_list);
1888	ns = p->mnt_ns;
1889	if (ns) {
1890	ns->nr_mounts--;
1891	__touch_mnt_namespace(ns);
1892	}
1893	p->mnt_ns = NULL;
1894	if (how & UMOUNT_SYNC)
1895	p->mnt.mnt_flags \|= MNT_SYNC_UMOUNT;
1896
1897	disconnect = disconnect_mount(mnt: p, how);
1898	if (mnt_has_parent(mnt: p)) {
1899	mnt_add_count(mnt: p->mnt_parent, n: -`1`);
1900	if (!disconnect) {
1901	/ Don't forget about p /
1902	list_add_tail(new: &p->mnt_child, head: &p->mnt_parent->mnt_mounts);
1903	} else {
1904	umount_mnt(mnt: p);
1905	}
1906	}
1907	change_mnt_propagation(p, MS_PRIVATE);
1908	if (disconnect)
1909	hlist_add_head(n: &p->mnt_umount, h: &unmounted);
1910
1911	/*
1912	* At this point p->mnt_ns is NULL, notification will be queued
1913	* only if
1914	*
1915	* - p->prev_ns is non-NULL and
1916	* - p->prev_ns->n_fsnotify_marks is non-NULL
1917	*
1918	* This will preclude queuing the mount if this is a cleanup
1919	* after a failed copy_tree() or destruction of an anonymous
1920	* namespace, etc.
1921	*/
1922	mnt_notify_add(m: p);
1923	}
1924	}
1925
1926	static void shrink_submounts(struct mount *mnt);
1927
1928	static int do_umount_root(struct super_block *sb)
1929	{
1930	int ret = `0`;
1931
1932	down_write(sem: &sb->s_umount);
1933	if (!sb_rdonly(sb)) {
1934	struct fs_context *fc;
1935
1936	fc = fs_context_for_reconfigure(dentry: sb->s_root, SB_RDONLY,
1937	SB_RDONLY);
1938	if (IS_ERR(ptr: fc)) {
1939	ret = PTR_ERR(ptr: fc);
1940	} else {
1941	ret = parse_monolithic_mount_data(fc, NULL);
1942	if (!ret)
1943	ret = reconfigure_super(fc);
1944	put_fs_context(fc);
1945	}
1946	}
1947	up_write(sem: &sb->s_umount);
1948	return ret;
1949	}
1950
1951	static int do_umount(struct mount mnt, int* flags)
1952	{
1953	struct super_block *sb = mnt->mnt.mnt_sb;
1954	int retval;
1955
1956	retval = security_sb_umount(mnt: &mnt->mnt, flags);
1957	if (retval)
1958	return retval;
1959
1960	/*
1961	* Allow userspace to request a mountpoint be expired rather than
1962	* unmounting unconditionally. Unmount only happens if:
1963	* (1) the mark is already set (the mark is cleared by mntput())
1964	* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1965	*/
1966	if (flags & MNT_EXPIRE) {
1967	if (&mnt->mnt == current->fs->root.mnt \|\|
1968	flags & (MNT_FORCE \| MNT_DETACH))
1969	return -EINVAL;
1970
1971	/*
1972	* probably don't strictly need the lock here if we examined
1973	* all race cases, but it's a slowpath.
1974	*/
1975	lock_mount_hash();
1976	if (mnt_get_count(mnt) != `2`) {
1977	unlock_mount_hash();
1978	return -EBUSY;
1979	}
1980	unlock_mount_hash();
1981
1982	if (!xchg(&mnt->mnt_expiry_mark, `1`))
1983	return -EAGAIN;
1984	}
1985
1986	/*
1987	* If we may have to abort operations to get out of this
1988	* mount, and they will themselves hold resources we must
1989	* allow the fs to do things. In the Unix tradition of
1990	* 'Gee thats tricky lets do it in userspace' the umount_begin
1991	* might fail to complete on the first run through as other tasks
1992	* must return, and the like. Thats for the mount program to worry
1993	* about for the moment.
1994	*/
1995
1996	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1997	sb->s_op->umount_begin(sb);
1998	}
1999
2000	/*
2001	* No sense to grab the lock for this test, but test itself looks
2002	* somewhat bogus. Suggestions for better replacement?
2003	* Ho-hum... In principle, we might treat that as umount + switch
2004	* to rootfs. GC would eventually take care of the old vfsmount.
2005	* Actually it makes sense, especially if rootfs would contain a
2006	* /reboot - static binary that would close all descriptors and
2007	* call reboot(9). Then init(8) could umount root and exec /reboot.
2008	*/
2009	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
2010	/*
2011	* Special case for "unmounting" root ...
2012	* we just try to remount it readonly.
2013	*/
2014	if (!ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN))
2015	return -EPERM;
2016	return do_umount_root(sb);
2017	}
2018
2019	namespace_lock();
2020	lock_mount_hash();
2021
2022	/ Recheck MNT_LOCKED with the locks held /
2023	retval = -EINVAL;
2024	if (mnt->mnt.mnt_flags & MNT_LOCKED)
2025	goto out;
2026
2027	event++;
2028	if (flags & MNT_DETACH) {
2029	if (mnt_ns_attached(mnt) \|\| !list_empty(head: &mnt->mnt_list))
2030	umount_tree(mnt, how: UMOUNT_PROPAGATE);
2031	retval = `0`;
2032	} else {
2033	smp_mb(); // paired with __legitimize_mnt()
2034	shrink_submounts(mnt);
2035	retval = -EBUSY;
2036	if (!propagate_mount_busy(mnt, `2`)) {
2037	if (mnt_ns_attached(mnt) \|\| !list_empty(head: &mnt->mnt_list))
2038	umount_tree(mnt, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
2039	retval = `0`;
2040	}
2041	}
2042	out:
2043	unlock_mount_hash();
2044	namespace_unlock();
2045	return retval;
2046	}
2047
2048	/*
2049	* __detach_mounts - lazily unmount all mounts on the specified dentry
2050	*
2051	* During unlink, rmdir, and d_drop it is possible to loose the path
2052	* to an existing mountpoint, and wind up leaking the mount.
2053	* detach_mounts allows lazily unmounting those mounts instead of
2054	* leaking them.
2055	*
2056	* The caller may hold dentry->d_inode->i_mutex.
2057	*/
2058	void __detach_mounts(struct dentry *dentry)
2059	{
2060	struct mountpoint *mp;
2061	struct mount *mnt;
2062
2063	namespace_lock();
2064	lock_mount_hash();
2065	mp = lookup_mountpoint(dentry);
2066	if (!mp)
2067	goto out_unlock;
2068
2069	event++;
2070	while (!hlist_empty(h: &mp->m_list)) {
2071	mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
2072	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
2073	umount_mnt(mnt);
2074	hlist_add_head(n: &mnt->mnt_umount, h: &unmounted);
2075	}
2076	else umount_tree(mnt, how: UMOUNT_CONNECTED);
2077	}
2078	put_mountpoint(mp);
2079	out_unlock:
2080	unlock_mount_hash();
2081	namespace_unlock();
2082	}
2083
2084	/*
2085	* Is the caller allowed to modify his namespace?
2086	*/
2087	bool may_mount(void)
2088	{
2089	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
2090	}
2091
/* Warn, once per boot, that the "mand" mount option is deprecated and ignored. */
static void warn_mandlock(void)
{
	pr_warn_once("=======================================================\n"
		     "WARNING: The mand mount option has been deprecated and\n"
		     "         is ignored by this kernel. Remove the mand\n"
		     "         option from the mount to silence this warning.\n"
		     "=======================================================\n");
}
2100
2101	static int can_umount(const struct path path, int* flags)
2102	{
2103	struct mount *mnt = real_mount(mnt: path->mnt);
2104	struct super_block *sb = path->dentry->d_sb;
2105
2106	if (!may_mount())
2107	return -EPERM;
2108	if (!path_mounted(path))
2109	return -EINVAL;
2110	if (!check_mnt(mnt))
2111	return -EINVAL;
2112	if (mnt->mnt.mnt_flags & MNT_LOCKED) / Check optimistically /
2113	return -EINVAL;
2114	if (flags & MNT_FORCE && !ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN))
2115	return -EPERM;
2116	return `0`;
2117	}
2118
2119	// caller is responsible for flags being sane
2120	int path_umount(struct path path, int* flags)
2121	{
2122	struct mount *mnt = real_mount(mnt: path->mnt);
2123	int ret;
2124
2125	ret = can_umount(path, flags);
2126	if (!ret)
2127	ret = do_umount(mnt, flags);
2128
2129	/ we mustn't call path_put() as that would clear mnt_expiry_mark /
2130	dput(path->dentry);
2131	mntput_no_expire(mnt);
2132	return ret;
2133	}
2134
2135	static int ksys_umount(char __user name, int* flags)
2136	{
2137	int lookup_flags = LOOKUP_MOUNTPOINT;
2138	struct path path;
2139	int ret;
2140
2141	// basic validity checks done first
2142	if (flags & ~(MNT_FORCE \| MNT_DETACH \| MNT_EXPIRE \| UMOUNT_NOFOLLOW))
2143	return -EINVAL;
2144
2145	if (!(flags & UMOUNT_NOFOLLOW))
2146	lookup_flags \|= LOOKUP_FOLLOW;
2147	ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
2148	if (ret)
2149	return ret;
2150	return path_umount(path: &path, flags);
2151	}
2152
2153	SYSCALL_DEFINE2(umount, char __user , name, int*, flags)
2154	{
2155	return ksys_umount(name, flags);
2156	}
2157
#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *	The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
	return ksys_umount(name, 0);
}

#endif
2169
2170	static bool is_mnt_ns_file(struct dentry *dentry)
2171	{
2172	struct ns_common *ns;
2173
2174	/ Is this a proxy for a mount namespace? /
2175	if (dentry->d_op != &ns_dentry_operations)
2176	return false;
2177
2178	ns = d_inode(dentry)->i_private;
2179
2180	return ns->ops == &mntns_operations;
2181	}
2182
2183	struct ns_common from_mnt_ns(struct* mnt_namespace *mnt)
2184	{
2185	return &mnt->ns;
2186	}
2187
2188	struct mnt_namespace get_sequential_mnt_ns(struct* mnt_namespace *mntns, bool previous)
2189	{
2190	guard(rcu)();
2191
2192	for (;;) {
2193	struct list_head *list;
2194
2195	if (previous)
2196	list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
2197	else
2198	list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
2199	if (list_is_head(list, head: &mnt_ns_list))
2200	return ERR_PTR(error: -ENOENT);
2201
2202	mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
2203
2204	/*
2205	* The last passive reference count is put with RCU
2206	* delay so accessing the mount namespace is not just
2207	* safe but all relevant members are still valid.
2208	*/
2209	if (!ns_capable_noaudit(ns: mntns->user_ns, CAP_SYS_ADMIN))
2210	continue;
2211
2212	/*
2213	* We need an active reference count as we're persisting
2214	* the mount namespace and it might already be on its
2215	* deathbed.
2216	*/
2217	if (!refcount_inc_not_zero(r: &mntns->ns.count))
2218	continue;
2219
2220	return mntns;
2221	}
2222	}
2223
2224	struct mnt_namespace mnt_ns_from_dentry(struct* dentry *dentry)
2225	{
2226	if (!is_mnt_ns_file(dentry))
2227	return NULL;
2228
2229	return to_mnt_ns(get_proc_ns(dentry->d_inode));
2230	}
2231
2232	static bool mnt_ns_loop(struct dentry *dentry)
2233	{
2234	/ Could bind mounting the mount namespace inode cause a*
2235	* mount namespace loop?
2236	*/
2237	struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
2238
2239	if (!mnt_ns)
2240	return false;
2241
2242	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
2243	}
2244
2245	struct mount copy_tree(struct* mount src_root, struct* dentry *dentry,
2246	int flag)
2247	{
2248	struct mount res, src_parent, src_root_child, src_mnt,
2249	dst_parent, dst_mnt;
2250
2251	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
2252	return ERR_PTR(error: -EINVAL);
2253
2254	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
2255	return ERR_PTR(error: -EINVAL);
2256
2257	res = dst_mnt = clone_mnt(old: src_root, root: dentry, flag);
2258	if (IS_ERR(ptr: dst_mnt))
2259	return dst_mnt;
2260
2261	src_parent = src_root;
2262	dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
2263
2264	list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
2265	if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
2266	continue;
2267
2268	for (src_mnt = src_root_child; src_mnt;
2269	src_mnt = next_mnt(p: src_mnt, root: src_root_child)) {
2270	if (!(flag & CL_COPY_UNBINDABLE) &&
2271	IS_MNT_UNBINDABLE(src_mnt)) {
2272	if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
2273	/ Both unbindable and locked. /
2274	dst_mnt = ERR_PTR(error: -EPERM);
2275	goto out;
2276	} else {
2277	src_mnt = skip_mnt_tree(p: src_mnt);
2278	continue;
2279	}
2280	}
2281	if (!(flag & CL_COPY_MNT_NS_FILE) &&
2282	is_mnt_ns_file(dentry: src_mnt->mnt.mnt_root)) {
2283	src_mnt = skip_mnt_tree(p: src_mnt);
2284	continue;
2285	}
2286	while (src_parent != src_mnt->mnt_parent) {
2287	src_parent = src_parent->mnt_parent;
2288	dst_mnt = dst_mnt->mnt_parent;
2289	}
2290
2291	src_parent = src_mnt;
2292	dst_parent = dst_mnt;
2293	dst_mnt = clone_mnt(old: src_mnt, root: src_mnt->mnt.mnt_root, flag);
2294	if (IS_ERR(ptr: dst_mnt))
2295	goto out;
2296	lock_mount_hash();
2297	list_add_tail(new: &dst_mnt->mnt_list, head: &res->mnt_list);
2298	attach_mnt(mnt: dst_mnt, parent: dst_parent, mp: src_parent->mnt_mp, beneath: false);
2299	unlock_mount_hash();
2300	}
2301	}
2302	return res;
2303
2304	out:
2305	if (res) {
2306	lock_mount_hash();
2307	umount_tree(mnt: res, how: UMOUNT_SYNC);
2308	unlock_mount_hash();
2309	}
2310	return dst_mnt;
2311	}
2312
2313	/ Caller should check returned pointer for errors /
2314
2315	struct vfsmount collect_mounts(const* struct path *path)
2316	{
2317	struct mount *tree;
2318	namespace_lock();
2319	if (!check_mnt(mnt: real_mount(mnt: path->mnt)))
2320	tree = ERR_PTR(error: -EINVAL);
2321	else
2322	tree = copy_tree(src_root: real_mount(mnt: path->mnt), dentry: path->dentry,
2323	CL_COPY_ALL \| CL_PRIVATE);
2324	namespace_unlock();
2325	if (IS_ERR(ptr: tree))
2326	return ERR_CAST(ptr: tree);
2327	return &tree->mnt;
2328	}
2329
2330	static void free_mnt_ns(struct mnt_namespace *);
2331	static struct mnt_namespace alloc_mnt_ns(struct* user_namespace *, bool);
2332
2333	static inline bool must_dissolve(struct mnt_namespace *mnt_ns)
2334	{
2335	/*
2336	* This mount belonged to an anonymous mount namespace
2337	* but was moved to a non-anonymous mount namespace and
2338	* then unmounted.
2339	*/
2340	if (unlikely(!mnt_ns))
2341	return false;
2342
2343	/*
2344	* This mount belongs to a non-anonymous mount namespace
2345	* and we know that such a mount can never transition to
2346	* an anonymous mount namespace again.
2347	*/
2348	if (!is_anon_ns(ns: mnt_ns)) {
2349	/*
2350	* A detached mount either belongs to an anonymous mount
2351	* namespace or a non-anonymous mount namespace. It
2352	* should never belong to something purely internal.
2353	*/
2354	VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL);
2355	return false;
2356	}
2357
2358	return true;
2359	}
2360
2361	void dissolve_on_fput(struct vfsmount *mnt)
2362	{
2363	struct mnt_namespace *ns;
2364	struct mount *m = real_mount(mnt);
2365
2366	scoped_guard(rcu) {
2367	if (!must_dissolve(READ_ONCE(m->mnt_ns)))
2368	return;
2369	}
2370
2371	scoped_guard(namespace_lock, &namespace_sem) {
2372	ns = m->mnt_ns;
2373	if (!must_dissolve(mnt_ns: ns))
2374	return;
2375
2376	/*
2377	* After must_dissolve() we know that this is a detached
2378	* mount in an anonymous mount namespace.
2379	*
2380	* Now when mnt_has_parent() reports that this mount
2381	* tree has a parent, we know that this anonymous mount
2382	* tree has been moved to another anonymous mount
2383	* namespace.
2384	*
2385	* So when closing this file we cannot unmount the mount
2386	* tree. This will be done when the file referring to
2387	* the root of the anonymous mount namespace will be
2388	* closed (It could already be closed but it would sync
2389	* on @namespace_sem and wait for us to finish.).
2390	*/
2391	if (mnt_has_parent(mnt: m))
2392	return;
2393
2394	lock_mount_hash();
2395	umount_tree(mnt: m, how: UMOUNT_CONNECTED);
2396	unlock_mount_hash();
2397	}
2398
2399	/ Make sure we notice when we leak mounts. /
2400	VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
2401	free_mnt_ns(ns);
2402	}
2403
/*
 * Unmount the tree rooted at @mnt with umount_tree(), taking
 * namespace_lock() and the mount hash lock around the call.
 */
void drop_collected_mounts(struct vfsmount *mnt)
{
	namespace_lock();
	lock_mount_hash();
	umount_tree(real_mount(mnt), 0);
	unlock_mount_hash();
	namespace_unlock();
}
2412
2413	static bool __has_locked_children(struct mount mnt, struct* dentry *dentry)
2414	{
2415	struct mount *child;
2416
2417	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2418	if (!is_subdir(child->mnt_mountpoint, dentry))
2419	continue;
2420
2421	if (child->mnt.mnt_flags & MNT_LOCKED)
2422	return true;
2423	}
2424	return false;
2425	}
2426
2427	bool has_locked_children(struct mount mnt, struct* dentry *dentry)
2428	{
2429	bool res;
2430
2431	read_seqlock_excl(sl: &mount_lock);
2432	res = __has_locked_children(mnt, dentry);
2433	read_sequnlock_excl(sl: &mount_lock);
2434	return res;
2435	}
2436
2437	/*
2438	* Check that there aren't references to earlier/same mount namespaces in the
2439	* specified subtree. Such references can act as pins for mount namespaces
2440	* that aren't checked by the mount-cycle checking code, thereby allowing
2441	* cycles to be made.
2442	*/
2443	static bool check_for_nsfs_mounts(struct mount *subtree)
2444	{
2445	struct mount *p;
2446	bool ret = false;
2447
2448	lock_mount_hash();
2449	for (p = subtree; p; p = next_mnt(p, root: subtree))
2450	if (mnt_ns_loop(dentry: p->mnt.mnt_root))
2451	goto out;
2452
2453	ret = true;
2454	out:
2455	unlock_mount_hash();
2456	return ret;
2457	}
2458
2459	/**
2460	* clone_private_mount - create a private clone of a path
2461	* @path: path to clone
2462	*
2463	* This creates a new vfsmount, which will be the clone of @path. The new mount
2464	* will not be attached anywhere in the namespace and will be private (i.e.
2465	* changes to the originating mount won't be propagated into this).
2466	*
2467	* This assumes caller has called or done the equivalent of may_mount().
2468	*
2469	* Release with mntput().
2470	*/
2471	struct vfsmount clone_private_mount(const* struct path *path)
2472	{
2473	struct mount *old_mnt = real_mount(mnt: path->mnt);
2474	struct mount *new_mnt;
2475
2476	guard(rwsem_read)(T: &namespace_sem);
2477
2478	if (IS_MNT_UNBINDABLE(old_mnt))
2479	return ERR_PTR(error: -EINVAL);
2480
2481	/*
2482	* Make sure the source mount is acceptable.
2483	* Anything mounted in our mount namespace is allowed.
2484	* Otherwise, it must be the root of an anonymous mount
2485	* namespace, and we need to make sure no namespace
2486	* loops get created.
2487	*/
2488	if (!check_mnt(mnt: old_mnt)) {
2489	if (!is_mounted(mnt: &old_mnt->mnt) \|\|
2490	!is_anon_ns(ns: old_mnt->mnt_ns) \|\|
2491	mnt_has_parent(mnt: old_mnt))
2492	return ERR_PTR(error: -EINVAL);
2493
2494	if (!check_for_nsfs_mounts(subtree: old_mnt))
2495	return ERR_PTR(error: -EINVAL);
2496	}
2497
2498	if (!ns_capable(ns: old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
2499	return ERR_PTR(error: -EPERM);
2500
2501	if (__has_locked_children(mnt: old_mnt, dentry: path->dentry))
2502	return ERR_PTR(error: -EINVAL);
2503
2504	new_mnt = clone_mnt(old: old_mnt, root: path->dentry, CL_PRIVATE);
2505	if (IS_ERR(ptr: new_mnt))
2506	return ERR_PTR(error: -EINVAL);
2507
2508	/ Longterm mount to be removed by kern_unmount() /*
2509	new_mnt->mnt_ns = MNT_NS_INTERNAL;
2510	return &new_mnt->mnt;
2511	}
2512	EXPORT_SYMBOL_GPL(clone_private_mount);
2513
2514	int iterate_mounts(int (f)(struct* vfsmount , void* ), void* *arg,
2515	struct vfsmount *root)
2516	{
2517	struct mount *mnt;
2518	int res = f(root, arg);
2519	if (res)
2520	return res;
2521	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
2522	res = f(&mnt->mnt, arg);
2523	if (res)
2524	return res;
2525	}
2526	return `0`;
2527	}
2528
2529	static void lock_mnt_tree(struct mount *mnt)
2530	{
2531	struct mount *p;
2532
2533	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
2534	int flags = p->mnt.mnt_flags;
2535	/ Don't allow unprivileged users to change mount flags /
2536	flags \|= MNT_LOCK_ATIME;
2537
2538	if (flags & MNT_READONLY)
2539	flags \|= MNT_LOCK_READONLY;
2540
2541	if (flags & MNT_NODEV)
2542	flags \|= MNT_LOCK_NODEV;
2543
2544	if (flags & MNT_NOSUID)
2545	flags \|= MNT_LOCK_NOSUID;
2546
2547	if (flags & MNT_NOEXEC)
2548	flags \|= MNT_LOCK_NOEXEC;
2549	/ Don't allow unprivileged users to reveal what is under a mount /
2550	if (list_empty(head: &p->mnt_expire))
2551	flags \|= MNT_LOCKED;
2552	p->mnt.mnt_flags = flags;
2553	}
2554	}
2555
2556	static void cleanup_group_ids(struct mount mnt, struct* mount *end)
2557	{
2558	struct mount *p;
2559
2560	for (p = mnt; p != end; p = next_mnt(p, root: mnt)) {
2561	if (p->mnt_group_id && !IS_MNT_SHARED(p))
2562	mnt_release_group_id(mnt: p);
2563	}
2564	}
2565
2566	static int invent_group_ids(struct mount *mnt, bool recurse)
2567	{
2568	struct mount *p;
2569
2570	for (p = mnt; p; p = recurse ? next_mnt(p, root: mnt) : NULL) {
2571	if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
2572	int err = mnt_alloc_group_id(mnt: p);
2573	if (err) {
2574	cleanup_group_ids(mnt, end: p);
2575	return err;
2576	}
2577	}
2578	}
2579
2580	return `0`;
2581	}
2582
2583	int count_mounts(struct mnt_namespace ns, struct* mount *mnt)
2584	{
2585	unsigned int max = READ_ONCE(sysctl_mount_max);
2586	unsigned int mounts = `0`;
2587	struct mount *p;
2588
2589	if (ns->nr_mounts >= max)
2590	return -ENOSPC;
2591	max -= ns->nr_mounts;
2592	if (ns->pending_mounts >= max)
2593	return -ENOSPC;
2594	max -= ns->pending_mounts;
2595
2596	for (p = mnt; p; p = next_mnt(p, root: mnt))
2597	mounts++;
2598
2599	if (mounts > max)
2600	return -ENOSPC;
2601
2602	ns->pending_mounts += mounts;
2603	return `0`;
2604	}
2605
2606	enum mnt_tree_flags_t {
2607	MNT_TREE_MOVE = BIT(`0`),
2608	MNT_TREE_BENEATH = BIT(`1`),
2609	MNT_TREE_PROPAGATION = BIT(`2`),
2610	};
2611
2612	/**
2613	* attach_recursive_mnt - attach a source mount tree
2614	* @source_mnt: mount tree to be attached
2615	* @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
2616	* @dest_mp: the mountpoint @source_mnt will be mounted at
2617	* @flags: modify how @source_mnt is supposed to be attached
2618	*
2619	* NOTE: in the table below explains the semantics when a source mount
2620	* of a given type is attached to a destination mount of a given type.
2621	* ---------------------------------------------------------------------------
2622	* \| BIND MOUNT OPERATION \|
2623	* \|**************************************************************************
2624	* \| source-->\| shared \| private \| slave \| unbindable \|
2625	* \| dest \| \| \| \| \|
2626	* \| \| \| \| \| \| \|
2627	* \| v \| \| \| \| \|
2628	* \|**************************************************************************
2629	* \| shared \| shared (++) \| shared (+) \| shared(+++)\| invalid \|
2630	* \| \| \| \| \| \|
2631	* \|non-shared\| shared (+) \| private \| slave (*) \| invalid \|
2632	* ***************************************************************************
2633	* A bind operation clones the source mount and mounts the clone on the
2634	* destination mount.
2635	*
2636	* (++) the cloned mount is propagated to all the mounts in the propagation
2637	* tree of the destination mount and the cloned mount is added to
2638	* the peer group of the source mount.
2639	* (+) the cloned mount is created under the destination mount and is marked
2640	* as shared. The cloned mount is added to the peer group of the source
2641	* mount.
2642	* (+++) the mount is propagated to all the mounts in the propagation tree
2643	* of the destination mount and the cloned mount is made slave
2644	* of the same master as that of the source mount. The cloned mount
2645	* is marked as 'shared and slave'.
2646	* (*) the cloned mount is made a slave of the same master as that of the
2647	* source mount.
2648	*
2649	* ---------------------------------------------------------------------------
2650	* \| MOVE MOUNT OPERATION \|
2651	* \|**************************************************************************
2652	* \| source-->\| shared \| private \| slave \| unbindable \|
2653	* \| dest \| \| \| \| \|
2654	* \| \| \| \| \| \| \|
2655	* \| v \| \| \| \| \|
2656	* \|**************************************************************************
2657	* \| shared \| shared (+) \| shared (+) \| shared(+++) \| invalid \|
2658	* \| \| \| \| \| \|
2659	* \|non-shared\| shared (+) \| private \| slave () \| unbindable \|
2660	* ***************************************************************************
2661	*
2662	* (+) the mount is moved to the destination. And is then propagated to
2663	* all the mounts in the propagation tree of the destination mount.
2664	* (+*) the mount is moved to the destination.
2665	* (+++) the mount is moved to the destination and is then propagated to
2666	* all the mounts belonging to the destination mount's propagation tree.
2667	* the mount is marked as 'shared and slave'.
2668	* (*) the mount continues to be a slave at the new location.
2669	*
2670	* if the source mount is a tree, the operations explained above is
2671	* applied to each mount in the tree.
2672	* Must be called without spinlocks held, since this function can sleep
2673	* in allocations.
2674	*
2675	* Context: The function expects namespace_lock() to be held.
2676	* Return: If @source_mnt was successfully attached 0 is returned.
2677	* Otherwise a negative error code is returned.
2678	*/
2679	static int attach_recursive_mnt(struct mount *source_mnt,
2680	struct mount *top_mnt,
2681	struct mountpoint *dest_mp,
2682	enum mnt_tree_flags_t flags)
2683	{
2684	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2685	HLIST_HEAD(tree_list);
2686	struct mnt_namespace *ns = top_mnt->mnt_ns;
2687	struct mountpoint *smp;
2688	struct mount child, dest_mnt, *p;
2689	struct hlist_node *n;
2690	int err = `0`;
2691	bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
2692
2693	/*
2694	* Preallocate a mountpoint in case the new mounts need to be
2695	* mounted beneath mounts on the same mountpoint.
2696	*/
2697	smp = get_mountpoint(dentry: source_mnt->mnt.mnt_root);
2698	if (IS_ERR(ptr: smp))
2699	return PTR_ERR(ptr: smp);
2700
2701	/ Is there space to add these mounts to the mount namespace? /
2702	if (!moving) {
2703	err = count_mounts(ns, mnt: source_mnt);
2704	if (err)
2705	goto out;
2706	}
2707
2708	if (beneath)
2709	dest_mnt = top_mnt->mnt_parent;
2710	else
2711	dest_mnt = top_mnt;
2712
2713	if (IS_MNT_SHARED(dest_mnt)) {
2714	err = invent_group_ids(mnt: source_mnt, recurse: true);
2715	if (err)
2716	goto out;
2717	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
2718	}
2719	lock_mount_hash();
2720	if (err)
2721	goto out_cleanup_ids;
2722
2723	if (IS_MNT_SHARED(dest_mnt)) {
2724	for (p = source_mnt; p; p = next_mnt(p, root: source_mnt))
2725	set_mnt_shared(p);
2726	}
2727
2728	if (moving) {
2729	if (beneath)
2730	dest_mp = smp;
2731	unhash_mnt(mnt: source_mnt);
2732	attach_mnt(mnt: source_mnt, parent: top_mnt, mp: dest_mp, beneath);
2733	mnt_notify_add(m: source_mnt);
2734	touch_mnt_namespace(ns: source_mnt->mnt_ns);
2735	} else {
2736	if (source_mnt->mnt_ns) {
2737	LIST_HEAD(head);
2738
2739	/ move from anon - the caller will destroy /
2740	for (p = source_mnt; p; p = next_mnt(p, root: source_mnt))
2741	move_from_ns(mnt: p, dt_list: &head);
2742	list_del_init(entry: &head);
2743	}
2744	if (beneath)
2745	mnt_set_mountpoint_beneath(new_parent: source_mnt, top_mnt, new_mp: smp);
2746	else
2747	mnt_set_mountpoint(mnt: dest_mnt, mp: dest_mp, child_mnt: source_mnt);
2748	commit_tree(mnt: source_mnt);
2749	}
2750
2751	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2752	struct mount *q;
2753	hlist_del_init(n: &child->mnt_hash);
2754	q = __lookup_mnt(mnt: &child->mnt_parent->mnt,
2755	dentry: child->mnt_mountpoint);
2756	if (q)
2757	mnt_change_mountpoint(parent: child, mp: smp, mnt: q);
2758	/ Notice when we are propagating across user namespaces /
2759	if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2760	lock_mnt_tree(mnt: child);
2761	child->mnt.mnt_flags &= ~MNT_LOCKED;
2762	commit_tree(mnt: child);
2763	}
2764	put_mountpoint(mp: smp);
2765	unlock_mount_hash();
2766
2767	return `0`;
2768
2769	out_cleanup_ids:
2770	while (!hlist_empty(h: &tree_list)) {
2771	child = hlist_entry(tree_list.first, struct mount, mnt_hash);
2772	child->mnt_parent->mnt_ns->pending_mounts = `0`;
2773	umount_tree(mnt: child, how: UMOUNT_SYNC);
2774	}
2775	unlock_mount_hash();
2776	cleanup_group_ids(mnt: source_mnt, NULL);
2777	out:
2778	ns->pending_mounts = `0`;
2779
2780	read_seqlock_excl(sl: &mount_lock);
2781	put_mountpoint(mp: smp);
2782	read_sequnlock_excl(sl: &mount_lock);
2783
2784	return err;
2785	}
2786
2787	/**
2788	* do_lock_mount - lock mount and mountpoint
2789	* @path: target path
2790	* @beneath: whether the intention is to mount beneath @path
2791	*
2792	* Follow the mount stack on @path until the top mount @mnt is found. If
2793	* the initial @path->{mnt,dentry} is a mountpoint lookup the first
2794	* mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
2795	* until nothing is stacked on top of it anymore.
2796	*
2797	* Acquire the inode_lock() on the top mount's ->mnt_root to protect
2798	* against concurrent removal of the new mountpoint from another mount
2799	* namespace.
2800	*
2801	* If @beneath is requested, acquire inode_lock() on @mnt's mountpoint
2802	* @mp on @mnt->mnt_parent must be acquired. This protects against a
2803	* concurrent unlink of @mp->mnt_dentry from another mount namespace
2804	* where @mnt doesn't have a child mount mounted @mp. A concurrent
2805	* removal of @mnt->mnt_root doesn't matter as nothing will be mounted
2806	* on top of it for @beneath.
2807	*
2808	* In addition, @beneath needs to make sure that @mnt hasn't been
2809	* unmounted or moved from its current mountpoint in between dropping
2810	* @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
2811	* being unmounted would be detected later by e.g., calling
2812	* check_mnt(mnt) in the function it's called from. For the @beneath
2813	* case however, it's useful to detect it directly in do_lock_mount().
2814	* If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
2815	* to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
2816	* point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
2817	*
2818	* Return: Either the target mountpoint on the top mount or the top
2819	* mount's mountpoint.
2820	*/
2821	static struct mountpoint do_lock_mount(struct* path *path, bool beneath)
2822	{
2823	struct vfsmount *mnt = path->mnt;
2824	struct dentry *dentry;
2825	struct mountpoint *mp = ERR_PTR(error: -ENOENT);
2826	struct path under = {};
2827
2828	for (;;) {
2829	struct mount *m = real_mount(mnt);
2830
2831	if (beneath) {
2832	path_put(&under);
2833	read_seqlock_excl(sl: &mount_lock);
2834	under.mnt = mntget(&m->mnt_parent->mnt);
2835	under.dentry = dget(dentry: m->mnt_mountpoint);
2836	read_sequnlock_excl(sl: &mount_lock);
2837	dentry = under.dentry;
2838	} else {
2839	dentry = path->dentry;
2840	}
2841
2842	inode_lock(inode: dentry->d_inode);
2843	namespace_lock();
2844
2845	if (unlikely(cant_mount(dentry) \|\| !is_mounted(mnt)))
2846	break; // not to be mounted on
2847
2848	if (beneath && unlikely(m->mnt_mountpoint != dentry \|\|
2849	&m->mnt_parent->mnt != under.mnt)) {
2850	namespace_unlock();
2851	inode_unlock(inode: dentry->d_inode);
2852	continue; // got moved
2853	}
2854
2855	mnt = lookup_mnt(path);
2856	if (unlikely(mnt)) {
2857	namespace_unlock();
2858	inode_unlock(inode: dentry->d_inode);
2859	path_put(path);
2860	path->mnt = mnt;
2861	path->dentry = dget(dentry: mnt->mnt_root);
2862	continue; // got overmounted
2863	}
2864	mp = get_mountpoint(dentry);
2865	if (IS_ERR(ptr: mp))
2866	break;
2867	if (beneath) {
2868	/*
2869	* @under duplicates the references that will stay
2870	* at least until namespace_unlock(), so the path_put()
2871	* below is safe (and OK to do under namespace_lock -
2872	* we are not dropping the final references here).
2873	*/
2874	path_put(&under);
2875	}
2876	return mp;
2877	}
2878	namespace_unlock();
2879	inode_unlock(inode: dentry->d_inode);
2880	if (beneath)
2881	path_put(&under);
2882	return mp;
2883	}
2884
2885	static inline struct mountpoint lock_mount(struct* path *path)
2886	{
2887	return do_lock_mount(path, beneath: false);
2888	}
2889
2890	static void unlock_mount(struct mountpoint *where)
2891	{
2892	inode_unlock(inode: where->m_dentry->d_inode);
2893	read_seqlock_excl(sl: &mount_lock);
2894	put_mountpoint(mp: where);
2895	read_sequnlock_excl(sl: &mount_lock);
2896	namespace_unlock();
2897	}
2898
2899	static int graft_tree(struct mount mnt, struct* mount p, struct* mountpoint *mp)
2900	{
2901	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2902	return -EINVAL;
2903
2904	if (d_is_dir(dentry: mp->m_dentry) !=
2905	d_is_dir(dentry: mnt->mnt.mnt_root))
2906	return -ENOTDIR;
2907
2908	return attach_recursive_mnt(source_mnt: mnt, top_mnt: p, dest_mp: mp, flags: `0`);
2909	}
2910
2911	/*
2912	* Sanity check the flags to change_mnt_propagation.
2913	*/
2914
2915	static int flags_to_propagation_type(int ms_flags)
2916	{
2917	int type = ms_flags & ~(MS_REC \| MS_SILENT);
2918
2919	/ Fail if any non-propagation flags are set /
2920	if (type & ~(MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
2921	return `0`;
2922	/ Only one propagation flag should be set /
2923	if (!is_power_of_2(n: type))
2924	return `0`;
2925	return type;
2926	}
2927
2928	/*
2929	* recursively change the type of the mountpoint.
2930	*/
2931	static int do_change_type(struct path path, int* ms_flags)
2932	{
2933	struct mount *m;
2934	struct mount *mnt = real_mount(mnt: path->mnt);
2935	int recurse = ms_flags & MS_REC;
2936	int type;
2937	int err = `0`;
2938
2939	if (!path_mounted(path))
2940	return -EINVAL;
2941
2942	type = flags_to_propagation_type(ms_flags);
2943	if (!type)
2944	return -EINVAL;
2945
2946	namespace_lock();
2947	if (!check_mnt(mnt)) {
2948	err = -EINVAL;
2949	goto out_unlock;
2950	}
2951	if (type == MS_SHARED) {
2952	err = invent_group_ids(mnt, recurse);
2953	if (err)
2954	goto out_unlock;
2955	}
2956
2957	lock_mount_hash();
2958	for (m = mnt; m; m = (recurse ? next_mnt(p: m, root: mnt) : NULL))
2959	change_mnt_propagation(m, type);
2960	unlock_mount_hash();
2961
2962	out_unlock:
2963	namespace_unlock();
2964	return err;
2965	}
2966
2967	/ may_copy_tree() - check if a mount tree can be copied*
2968	* @path: path to the mount tree to be copied
2969	*
2970	* This helper checks if the caller may copy the mount tree starting
2971	* from @path->mnt. The caller may copy the mount tree under the
2972	* following circumstances:
2973	*
2974	* (1) The caller is located in the mount namespace of the mount tree.
2975	* This also implies that the mount does not belong to an anonymous
2976	* mount namespace.
2977	* (2) The caller tries to copy an nfs mount referring to a mount
2978	* namespace, i.e., the caller is trying to copy a mount namespace
2979	* entry from nsfs.
2980	* (3) The caller tries to copy a pidfs mount referring to a pidfd.
2981	* (4) The caller is trying to copy a mount tree that belongs to an
2982	* anonymous mount namespace.
2983	*
2984	* For that to be safe, this helper enforces that the origin mount
2985	* namespace the anonymous mount namespace was created from is the
2986	* same as the caller's mount namespace by comparing the sequence
2987	* numbers.
2988	*
2989	* This is not strictly necessary. The current semantics of the new
2990	* mount api enforce that the caller must be located in the same
2991	* mount namespace as the mount tree it interacts with. Using the
2992	* origin sequence number preserves these semantics even for
2993	* anonymous mount namespaces. However, one could envision extending
2994	* the api to directly operate across mount namespace if needed.
2995	*
2996	* The ownership of a non-anonymous mount namespace such as the
2997	* caller's cannot change.
2998	* => We know that the caller's mount namespace is stable.
2999	*
3000	* If the origin sequence number of the anonymous mount namespace is
3001	* the same as the sequence number of the caller's mount namespace.
3002	* => The owning namespaces are the same.
3003	*
3004	* ==> The earlier capability check on the owning namespace of the
3005	* caller's mount namespace ensures that the caller has the
3006	* ability to copy the mount tree.
3007	*
3008	* Returns true if the mount tree can be copied, false otherwise.
3009	*/
3010	static inline bool may_copy_tree(struct path *path)
3011	{
3012	struct mount *mnt = real_mount(mnt: path->mnt);
3013	const struct dentry_operations *d_op;
3014
3015	if (check_mnt(mnt))
3016	return true;
3017
3018	d_op = path->dentry->d_op;
3019	if (d_op == &ns_dentry_operations)
3020	return true;
3021
3022	if (d_op == &pidfs_dentry_operations)
3023	return true;
3024
3025	if (!is_mounted(mnt: path->mnt))
3026	return false;
3027
3028	return check_anonymous_mnt(mnt);
3029	}
3030
3031
3032	static struct mount __do_loopback(struct* path old_path, int* recurse)
3033	{
3034	struct mount mnt = ERR_PTR(error: -EINVAL), old = real_mount(mnt: old_path->mnt);
3035
3036	if (IS_MNT_UNBINDABLE(old))
3037	return mnt;
3038
3039	if (!may_copy_tree(path: old_path))
3040	return mnt;
3041
3042	if (!recurse && __has_locked_children(mnt: old, dentry: old_path->dentry))
3043	return mnt;
3044
3045	if (recurse)
3046	mnt = copy_tree(src_root: old, dentry: old_path->dentry, CL_COPY_MNT_NS_FILE);
3047	else
3048	mnt = clone_mnt(old, root: old_path->dentry, flag: `0`);
3049
3050	if (!IS_ERR(ptr: mnt))
3051	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
3052
3053	return mnt;
3054	}
3055
3056	/*
3057	* do loopback mount.
3058	*/
3059	static int do_loopback(struct path path, const* char *old_name,
3060	int recurse)
3061	{
3062	struct path old_path;
3063	struct mount mnt = NULL, parent;
3064	struct mountpoint *mp;
3065	int err;
3066	if (!old_name \|\| !*old_name)
3067	return -EINVAL;
3068	err = kern_path(old_name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &old_path);
3069	if (err)
3070	return err;
3071
3072	err = -EINVAL;
3073	if (mnt_ns_loop(dentry: old_path.dentry))
3074	goto out;
3075
3076	mp = lock_mount(path);
3077	if (IS_ERR(ptr: mp)) {
3078	err = PTR_ERR(ptr: mp);
3079	goto out;
3080	}
3081
3082	parent = real_mount(mnt: path->mnt);
3083	if (!check_mnt(mnt: parent))
3084	goto out2;
3085
3086	mnt = __do_loopback(old_path: &old_path, recurse);
3087	if (IS_ERR(ptr: mnt)) {
3088	err = PTR_ERR(ptr: mnt);
3089	goto out2;
3090	}
3091
3092	err = graft_tree(mnt, p: parent, mp);
3093	if (err) {
3094	lock_mount_hash();
3095	umount_tree(mnt, how: UMOUNT_SYNC);
3096	unlock_mount_hash();
3097	}
3098	out2:
3099	unlock_mount(where: mp);
3100	out:
3101	path_put(&old_path);
3102	return err;
3103	}
3104
3105	static struct file open_detached_copy(struct* path *path, bool recursive)
3106	{
3107	struct mnt_namespace ns, mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
3108	struct user_namespace *user_ns = mnt_ns->user_ns;
3109	struct mount mnt, p;
3110	struct file *file;
3111
3112	ns = alloc_mnt_ns(user_ns, true);
3113	if (IS_ERR(ptr: ns))
3114	return ERR_CAST(ptr: ns);
3115
3116	namespace_lock();
3117
3118	/*
3119	* Record the sequence number of the source mount namespace.
3120	* This needs to hold namespace_sem to ensure that the mount
3121	* doesn't get attached.
3122	*/
3123	if (is_mounted(mnt: path->mnt)) {
3124	src_mnt_ns = real_mount(mnt: path->mnt)->mnt_ns;
3125	if (is_anon_ns(ns: src_mnt_ns))
3126	ns->seq_origin = src_mnt_ns->seq_origin;
3127	else
3128	ns->seq_origin = src_mnt_ns->seq;
3129	}
3130
3131	mnt = __do_loopback(old_path: path, recurse: recursive);
3132	if (IS_ERR(ptr: mnt)) {
3133	namespace_unlock();
3134	free_mnt_ns(ns);
3135	return ERR_CAST(ptr: mnt);
3136	}
3137
3138	lock_mount_hash();
3139	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
3140	mnt_add_to_ns(ns, mnt: p);
3141	ns->nr_mounts++;
3142	}
3143	ns->root = mnt;
3144	mntget(&mnt->mnt);
3145	unlock_mount_hash();
3146	namespace_unlock();
3147
3148	mntput(path->mnt);
3149	path->mnt = &mnt->mnt;
3150	file = dentry_open(path, O_PATH, current_cred());
3151	if (IS_ERR(ptr: file))
3152	dissolve_on_fput(mnt: path->mnt);
3153	else
3154	file->f_mode \|= FMODE_NEED_UNMOUNT;
3155	return file;
3156	}
3157
3158	static struct file vfs_open_tree(int* dfd, const char __user filename, unsigned* int flags)
3159	{
3160	int ret;
3161	struct path path __free(path_put) = {};
3162	int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
3163	bool detached = flags & OPEN_TREE_CLONE;
3164
3165	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
3166
3167	if (flags & ~(AT_EMPTY_PATH \| AT_NO_AUTOMOUNT \| AT_RECURSIVE \|
3168	AT_SYMLINK_NOFOLLOW \| OPEN_TREE_CLONE \|
3169	OPEN_TREE_CLOEXEC))
3170	return ERR_PTR(error: -EINVAL);
3171
3172	if ((flags & (AT_RECURSIVE \| OPEN_TREE_CLONE)) == AT_RECURSIVE)
3173	return ERR_PTR(error: -EINVAL);
3174
3175	if (flags & AT_NO_AUTOMOUNT)
3176	lookup_flags &= ~LOOKUP_AUTOMOUNT;
3177	if (flags & AT_SYMLINK_NOFOLLOW)
3178	lookup_flags &= ~LOOKUP_FOLLOW;
3179	if (flags & AT_EMPTY_PATH)
3180	lookup_flags \|= LOOKUP_EMPTY;
3181
3182	if (detached && !may_mount())
3183	return ERR_PTR(error: -EPERM);
3184
3185	ret = user_path_at(dfd, filename, lookup_flags, &path);
3186	if (unlikely(ret))
3187	return ERR_PTR(error: ret);
3188
3189	if (detached)
3190	return open_detached_copy(path: &path, recursive: flags & AT_RECURSIVE);
3191
3192	return dentry_open(path: &path, O_PATH, current_cred());
3193	}
3194
3195	SYSCALL_DEFINE3(open_tree, int, dfd, const char __user , filename, unsigned*, flags)
3196	{
3197	int fd;
3198	struct file *file __free(fput) = NULL;
3199
3200	file = vfs_open_tree(dfd, filename, flags);
3201	if (IS_ERR(ptr: file))
3202	return PTR_ERR(ptr: file);
3203
3204	fd = get_unused_fd_flags(flags: flags & O_CLOEXEC);
3205	if (fd < `0`)
3206	return fd;
3207
3208	fd_install(fd, no_free_ptr(file));
3209	return fd;
3210	}
3211
3212	/*
3213	* Don't allow locked mount flags to be cleared.
3214	*
3215	* No locks need to be held here while testing the various MNT_LOCK
3216	* flags because those flags can never be cleared once they are set.
3217	*/
3218	static bool can_change_locked_flags(struct mount mnt, unsigned* int mnt_flags)
3219	{
3220	unsigned int fl = mnt->mnt.mnt_flags;
3221
3222	if ((fl & MNT_LOCK_READONLY) &&
3223	!(mnt_flags & MNT_READONLY))
3224	return false;
3225
3226	if ((fl & MNT_LOCK_NODEV) &&
3227	!(mnt_flags & MNT_NODEV))
3228	return false;
3229
3230	if ((fl & MNT_LOCK_NOSUID) &&
3231	!(mnt_flags & MNT_NOSUID))
3232	return false;
3233
3234	if ((fl & MNT_LOCK_NOEXEC) &&
3235	!(mnt_flags & MNT_NOEXEC))
3236	return false;
3237
3238	if ((fl & MNT_LOCK_ATIME) &&
3239	((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
3240	return false;
3241
3242	return true;
3243	}
3244
3245	static int change_mount_ro_state(struct mount mnt, unsigned* int mnt_flags)
3246	{
3247	bool readonly_request = (mnt_flags & MNT_READONLY);
3248
3249	if (readonly_request == __mnt_is_readonly(&mnt->mnt))
3250	return `0`;
3251
3252	if (readonly_request)
3253	return mnt_make_readonly(mnt);
3254
3255	mnt->mnt.mnt_flags &= ~MNT_READONLY;
3256	return `0`;
3257	}
3258
3259	static void set_mount_attributes(struct mount mnt, unsigned* int mnt_flags)
3260	{
3261	mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
3262	mnt->mnt.mnt_flags = mnt_flags;
3263	touch_mnt_namespace(ns: mnt->mnt_ns);
3264	}
3265
3266	static void mnt_warn_timestamp_expiry(struct path mountpoint, struct* vfsmount *mnt)
3267	{
3268	struct super_block *sb = mnt->mnt_sb;
3269
3270	if (!__mnt_is_readonly(mnt) &&
3271	(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
3272	(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
3273	char buf, mntpath;
3274
3275	buf = (char *)__get_free_page(GFP_KERNEL);
3276	if (buf)
3277	mntpath = d_path(mountpoint, buf, PAGE_SIZE);
3278	else
3279	mntpath = ERR_PTR(error: -ENOMEM);
3280	if (IS_ERR(ptr: mntpath))
3281	mntpath = "(unknown)";
3282
3283	pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
3284	sb->s_type->name,
3285	is_mounted(mnt) ? "remounted" : "mounted",
3286	mntpath, &sb->s_time_max,
3287	(unsigned long long)sb->s_time_max);
3288
3289	sb->s_iflags \|= SB_I_TS_EXPIRY_WARNED;
3290	if (buf)
3291	free_page((unsigned long)buf);
3292	}
3293	}
3294
3295	/*
3296	* Handle reconfiguration of the mountpoint only without alteration of the
3297	* superblock it refers to. This is triggered by specifying MS_REMOUNT\|MS_BIND
3298	* to mount(2).
3299	*/
3300	static int do_reconfigure_mnt(struct path path, unsigned* int mnt_flags)
3301	{
3302	struct super_block *sb = path->mnt->mnt_sb;
3303	struct mount *mnt = real_mount(mnt: path->mnt);
3304	int ret;
3305
3306	if (!check_mnt(mnt))
3307	return -EINVAL;
3308
3309	if (!path_mounted(path))
3310	return -EINVAL;
3311
3312	if (!can_change_locked_flags(mnt, mnt_flags))
3313	return -EPERM;
3314
3315	/*
3316	* We're only checking whether the superblock is read-only not
3317	* changing it, so only take down_read(&sb->s_umount).
3318	*/
3319	down_read(sem: &sb->s_umount);
3320	lock_mount_hash();
3321	ret = change_mount_ro_state(mnt, mnt_flags);
3322	if (ret == `0`)
3323	set_mount_attributes(mnt, mnt_flags);
3324	unlock_mount_hash();
3325	up_read(sem: &sb->s_umount);
3326
3327	mnt_warn_timestamp_expiry(mountpoint: path, mnt: &mnt->mnt);
3328
3329	return ret;
3330	}
3331
3332	/*
3333	* change filesystem flags. dir should be a physical root of filesystem.
3334	* If you've mounted a non-root directory somewhere and want to do remount
3335	* on it - tough luck.
3336	*/
3337	static int do_remount(struct path path, int* ms_flags, int sb_flags,
3338	int mnt_flags, void *data)
3339	{
3340	int err;
3341	struct super_block *sb = path->mnt->mnt_sb;
3342	struct mount *mnt = real_mount(mnt: path->mnt);
3343	struct fs_context *fc;
3344
3345	if (!check_mnt(mnt))
3346	return -EINVAL;
3347
3348	if (!path_mounted(path))
3349	return -EINVAL;
3350
3351	if (!can_change_locked_flags(mnt, mnt_flags))
3352	return -EPERM;
3353
3354	fc = fs_context_for_reconfigure(dentry: path->dentry, sb_flags, MS_RMT_MASK);
3355	if (IS_ERR(ptr: fc))
3356	return PTR_ERR(ptr: fc);
3357
3358	/*
3359	* Indicate to the filesystem that the remount request is coming
3360	* from the legacy mount system call.
3361	*/
3362	fc->oldapi = true;
3363
3364	err = parse_monolithic_mount_data(fc, data);
3365	if (!err) {
3366	down_write(sem: &sb->s_umount);
3367	err = -EPERM;
3368	if (ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN)) {
3369	err = reconfigure_super(fc);
3370	if (!err) {
3371	lock_mount_hash();
3372	set_mount_attributes(mnt, mnt_flags);
3373	unlock_mount_hash();
3374	}
3375	}
3376	up_write(sem: &sb->s_umount);
3377	}
3378
3379	mnt_warn_timestamp_expiry(mountpoint: path, mnt: &mnt->mnt);
3380
3381	put_fs_context(fc);
3382	return err;
3383	}
3384
/*
 * Walk the mount tree rooted at @mnt with next_mnt() and return 1 if any
 * mount in it is unbindable, 0 otherwise.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p;
	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}
3394
3395	static int do_set_group(struct path from_path, struct* path *to_path)
3396	{
3397	struct mount from, to;
3398	int err;
3399
3400	from = real_mount(mnt: from_path->mnt);
3401	to = real_mount(mnt: to_path->mnt);
3402
3403	namespace_lock();
3404
3405	err = -EINVAL;
3406	/ To and From must be mounted /
3407	if (!is_mounted(mnt: &from->mnt))
3408	goto out;
3409	if (!is_mounted(mnt: &to->mnt))
3410	goto out;
3411
3412	err = -EPERM;
3413	/ We should be allowed to modify mount namespaces of both mounts /
3414	if (!ns_capable(ns: from->mnt_ns->user_ns, CAP_SYS_ADMIN))
3415	goto out;
3416	if (!ns_capable(ns: to->mnt_ns->user_ns, CAP_SYS_ADMIN))
3417	goto out;
3418
3419	err = -EINVAL;
3420	/ To and From paths should be mount roots /
3421	if (!path_mounted(path: from_path))
3422	goto out;
3423	if (!path_mounted(path: to_path))
3424	goto out;
3425
3426	/ Setting sharing groups is only allowed across same superblock /
3427	if (from->mnt.mnt_sb != to->mnt.mnt_sb)
3428	goto out;
3429
3430	/ From mount root should be wider than To mount root /
3431	if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
3432	goto out;
3433
3434	/ From mount should not have locked children in place of To's root /
3435	if (__has_locked_children(mnt: from, dentry: to->mnt.mnt_root))
3436	goto out;
3437
3438	/ Setting sharing groups is only allowed on private mounts /
3439	if (IS_MNT_SHARED(to) \|\| IS_MNT_SLAVE(to))
3440	goto out;
3441
3442	/ From should not be private /
3443	if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
3444	goto out;
3445
3446	if (IS_MNT_SLAVE(from)) {
3447	struct mount *m = from->mnt_master;
3448
3449	list_add(new: &to->mnt_slave, head: &from->mnt_slave);
3450	to->mnt_master = m;
3451	}
3452
3453	if (IS_MNT_SHARED(from)) {
3454	to->mnt_group_id = from->mnt_group_id;
3455	list_add(new: &to->mnt_share, head: &from->mnt_share);
3456	lock_mount_hash();
3457	set_mnt_shared(to);
3458	unlock_mount_hash();
3459	}
3460
3461	err = `0`;
3462	out:
3463	namespace_unlock();
3464	return err;
3465	}
3466
3467	/**
3468	* path_overmounted - check if path is overmounted
3469	* @path: path to check
3470	*
3471	* Check if path is overmounted, i.e., if there's a mount on top of
3472	* @path->mnt with @path->dentry as mountpoint.
3473	*
3474	* Context: namespace_sem must be held at least shared.
3475	* MUST NOT be called under lock_mount_hash() (there one should just
3476	* call __lookup_mnt() and check if it returns NULL).
3477	* Return: If path is overmounted true is returned, false if not.
3478	*/
3479	static inline bool path_overmounted(const struct path *path)
3480	{
3481	unsigned seq = read_seqbegin(sl: &mount_lock);
3482	bool no_child;
3483
3484	rcu_read_lock();
3485	no_child = !__lookup_mnt(mnt: path->mnt, dentry: path->dentry);
3486	rcu_read_unlock();
3487	if (need_seqretry(lock: &mount_lock, seq)) {
3488	read_seqlock_excl(sl: &mount_lock);
3489	no_child = !__lookup_mnt(mnt: path->mnt, dentry: path->dentry);
3490	read_sequnlock_excl(sl: &mount_lock);
3491	}
3492	return unlikely(!no_child);
3493	}
3494
3495	/**
3496	* can_move_mount_beneath - check that we can mount beneath the top mount
3497	* @from: mount to mount beneath
3498	* @to: mount under which to mount
3499	* @mp: mountpoint of @to
3500	*
3501	* - Make sure that @to->dentry is actually the root of a mount under
3502	* which we can mount another mount.
3503	* - Make sure that nothing can be mounted beneath the caller's current
3504	* root or the rootfs of the namespace.
3505	* - Make sure that the caller can unmount the topmost mount ensuring
3506	* that the caller could reveal the underlying mountpoint.
3507	* - Ensure that nothing has been mounted on top of @from before we
3508	* grabbed @namespace_sem to avoid creating pointless shadow mounts.
3509	* - Prevent mounting beneath a mount if the propagation relationship
3510	* between the source mount, parent mount, and top mount would lead to
3511	* nonsensical mount trees.
3512	*
3513	* Context: This function expects namespace_lock() to be held.
3514	* Return: On success 0, and on error a negative error code is returned.
3515	*/
3516	static int can_move_mount_beneath(const struct path *from,
3517	const struct path *to,
3518	const struct mountpoint *mp)
3519	{
3520	struct mount *mnt_from = real_mount(mnt: from->mnt),
3521	*mnt_to = real_mount(mnt: to->mnt),
3522	*parent_mnt_to = mnt_to->mnt_parent;
3523
3524	if (!mnt_has_parent(mnt: mnt_to))
3525	return -EINVAL;
3526
3527	if (!path_mounted(path: to))
3528	return -EINVAL;
3529
3530	if (IS_MNT_LOCKED(mnt_to))
3531	return -EINVAL;
3532
3533	/ Avoid creating shadow mounts during mount propagation. /
3534	if (path_overmounted(path: from))
3535	return -EINVAL;
3536
3537	/*
3538	* Mounting beneath the rootfs only makes sense when the
3539	* semantics of pivot_root(".", ".") are used.
3540	*/
3541	if (&mnt_to->mnt == current->fs->root.mnt)
3542	return -EINVAL;
3543	if (parent_mnt_to == current->nsproxy->mnt_ns->root)
3544	return -EINVAL;
3545
3546	for (struct mount *p = mnt_from; mnt_has_parent(mnt: p); p = p->mnt_parent)
3547	if (p == mnt_to)
3548	return -EINVAL;
3549
3550	/*
3551	* If the parent mount propagates to the child mount this would
3552	* mean mounting @mnt_from on @mnt_to->mnt_parent and then
3553	* propagating a copy @c of @mnt_from on top of @mnt_to. This
3554	* defeats the whole purpose of mounting beneath another mount.
3555	*/
3556	if (propagation_would_overmount(from: parent_mnt_to, to: mnt_to, mp))
3557	return -EINVAL;
3558
3559	/*
3560	* If @mnt_to->mnt_parent propagates to @mnt_from this would
3561	* mean propagating a copy @c of @mnt_from on top of @mnt_from.
3562	* Afterwards @mnt_from would be mounted on top of
3563	* @mnt_to->mnt_parent and @mnt_to would be unmounted from
3564	* @mnt->mnt_parent and remounted on @mnt_from. But since @c is
3565	* already mounted on @mnt_from, @mnt_to would ultimately be
3566	* remounted on top of @c. Afterwards, @mnt_from would be
3567	* covered by a copy @c of @mnt_from and @c would be covered by
3568	* @mnt_from itself. This defeats the whole purpose of mounting
3569	* @mnt_from beneath @mnt_to.
3570	*/
3571	if (check_mnt(mnt: mnt_from) &&
3572	propagation_would_overmount(from: parent_mnt_to, to: mnt_from, mp))
3573	return -EINVAL;
3574
3575	return `0`;
3576	}
3577
3578	/ may_use_mount() - check if a mount tree can be used*
3579	* @mnt: vfsmount to be used
3580	*
3581	* This helper checks if the caller may use the mount tree starting
3582	* from @path->mnt. The caller may use the mount tree under the
3583	* following circumstances:
3584	*
3585	* (1) The caller is located in the mount namespace of the mount tree.
3586	* This also implies that the mount does not belong to an anonymous
3587	* mount namespace.
3588	* (2) The caller is trying to use a mount tree that belongs to an
3589	* anonymous mount namespace.
3590	*
3591	* For that to be safe, this helper enforces that the origin mount
3592	* namespace the anonymous mount namespace was created from is the
3593	* same as the caller's mount namespace by comparing the sequence
3594	* numbers.
3595	*
3596	* The ownership of a non-anonymous mount namespace such as the
3597	* caller's cannot change.
3598	* => We know that the caller's mount namespace is stable.
3599	*
3600	* If the origin sequence number of the anonymous mount namespace is
3601	* the same as the sequence number of the caller's mount namespace.
3602	* => The owning namespaces are the same.
3603	*
3604	* ==> The earlier capability check on the owning namespace of the
3605	* caller's mount namespace ensures that the caller has the
3606	* ability to use the mount tree.
3607	*
3608	* Returns true if the mount tree can be used, false otherwise.
3609	*/
3610	static inline bool may_use_mount(struct mount *mnt)
3611	{
3612	if (check_mnt(mnt))
3613	return true;
3614
3615	/*
3616	* Make sure that noone unmounted the target path or somehow
3617	* managed to get their hands on something purely kernel
3618	* internal.
3619	*/
3620	if (!is_mounted(mnt: &mnt->mnt))
3621	return false;
3622
3623	return check_anonymous_mnt(mnt);
3624	}
3625
3626	static int do_move_mount(struct path *old_path,
3627	struct path new_path, enum* mnt_tree_flags_t flags)
3628	{
3629	struct mnt_namespace *ns;
3630	struct mount *p;
3631	struct mount *old;
3632	struct mount *parent;
3633	struct mountpoint mp, old_mp;
3634	int err;
3635	bool attached, beneath = flags & MNT_TREE_BENEATH;
3636
3637	mp = do_lock_mount(path: new_path, beneath);
3638	if (IS_ERR(ptr: mp))
3639	return PTR_ERR(ptr: mp);
3640
3641	old = real_mount(mnt: old_path->mnt);
3642	p = real_mount(mnt: new_path->mnt);
3643	parent = old->mnt_parent;
3644	attached = mnt_has_parent(mnt: old);
3645	if (attached)
3646	flags \|= MNT_TREE_MOVE;
3647	old_mp = old->mnt_mp;
3648	ns = old->mnt_ns;
3649
3650	err = -EINVAL;
3651	/ The thing moved must be mounted... /
3652	if (!is_mounted(mnt: &old->mnt))
3653	goto out;
3654
3655	if (check_mnt(mnt: old)) {
3656	/ if the source is in our namespace... /
3657	/ ... it should be detachable from parent /
3658	if (!mnt_has_parent(mnt: old) \|\| IS_MNT_LOCKED(old))
3659	goto out;
3660	/ ... and the target should be in our namespace /
3661	if (!check_mnt(mnt: p))
3662	goto out;
3663	} else {
3664	/*
3665	* otherwise the source must be the root of some anon namespace.
3666	* AV: check for mount being root of an anon namespace is worth
3667	* an inlined predicate...
3668	*/
3669	if (!is_anon_ns(ns) \|\| mnt_has_parent(mnt: old))
3670	goto out;
3671	/*
3672	* Bail out early if the target is within the same namespace -
3673	* subsequent checks would've rejected that, but they lose
3674	* some corner cases if we check it early.
3675	*/
3676	if (ns == p->mnt_ns)
3677	goto out;
3678	/*
3679	* Target should be either in our namespace or in an acceptable
3680	* anon namespace, sensu check_anonymous_mnt().
3681	*/
3682	if (!may_use_mount(mnt: p))
3683	goto out;
3684	}
3685
3686	if (!path_mounted(path: old_path))
3687	goto out;
3688
3689	if (d_is_dir(dentry: new_path->dentry) !=
3690	d_is_dir(dentry: old_path->dentry))
3691	goto out;
3692	/*
3693	* Don't move a mount residing in a shared parent.
3694	*/
3695	if (attached && IS_MNT_SHARED(parent))
3696	goto out;
3697
3698	if (beneath) {
3699	err = can_move_mount_beneath(from: old_path, to: new_path, mp);
3700	if (err)
3701	goto out;
3702
3703	err = -EINVAL;
3704	p = p->mnt_parent;
3705	flags \|= MNT_TREE_BENEATH;
3706	}
3707
3708	/*
3709	* Don't move a mount tree containing unbindable mounts to a destination
3710	* mount which is shared.
3711	*/
3712	if (IS_MNT_SHARED(p) && tree_contains_unbindable(mnt: old))
3713	goto out;
3714	err = -ELOOP;
3715	if (!check_for_nsfs_mounts(subtree: old))
3716	goto out;
3717	for (; mnt_has_parent(mnt: p); p = p->mnt_parent)
3718	if (p == old)
3719	goto out;
3720
3721	err = attach_recursive_mnt(source_mnt: old, top_mnt: real_mount(mnt: new_path->mnt), dest_mp: mp, flags);
3722	if (err)
3723	goto out;
3724
3725	/ if the mount is moved, it should no longer be expire*
3726	* automatically */
3727	list_del_init(entry: &old->mnt_expire);
3728	if (attached)
3729	put_mountpoint(mp: old_mp);
3730	out:
3731	unlock_mount(where: mp);
3732	if (!err) {
3733	if (attached) {
3734	mntput_no_expire(mnt: parent);
3735	} else {
3736	/ Make sure we notice when we leak mounts. /
3737	VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
3738	free_mnt_ns(ns);
3739	}
3740	}
3741	return err;
3742	}
3743
3744	static int do_move_mount_old(struct path path, const* char *old_name)
3745	{
3746	struct path old_path;
3747	int err;
3748
3749	if (!old_name \|\| !*old_name)
3750	return -EINVAL;
3751
3752	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
3753	if (err)
3754	return err;
3755
3756	err = do_move_mount(old_path: &old_path, new_path: path, flags: `0`);
3757	path_put(&old_path);
3758	return err;
3759	}
3760
3761	/*
3762	* add a mount into a namespace's mount tree
3763	*/
3764	static int do_add_mount(struct mount newmnt, struct* mountpoint *mp,
3765	const struct path path, int* mnt_flags)
3766	{
3767	struct mount *parent = real_mount(mnt: path->mnt);
3768
3769	mnt_flags &= ~MNT_INTERNAL_FLAGS;
3770
3771	if (unlikely(!check_mnt(parent))) {
3772	/ that's acceptable only for automounts done in private ns /
3773	if (!(mnt_flags & MNT_SHRINKABLE))
3774	return -EINVAL;
3775	/ ... and for those we'd better have mountpoint still alive /
3776	if (!parent->mnt_ns)
3777	return -EINVAL;
3778	}
3779
3780	/ Refuse the same filesystem on the same mount point /
3781	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
3782	return -EBUSY;
3783
3784	if (d_is_symlink(dentry: newmnt->mnt.mnt_root))
3785	return -EINVAL;
3786
3787	newmnt->mnt.mnt_flags = mnt_flags;
3788	return graft_tree(mnt: newmnt, p: parent, mp);
3789	}
3790
3791	static bool mount_too_revealing(const struct super_block sb, int* *new_mnt_flags);
3792
3793	/*
3794	* Create a new mount using a superblock configuration and request it
3795	* be added to the namespace tree.
3796	*/
3797	static int do_new_mount_fc(struct fs_context fc, struct* path *mountpoint,
3798	unsigned int mnt_flags)
3799	{
3800	struct vfsmount *mnt;
3801	struct mountpoint *mp;
3802	struct super_block *sb = fc->root->d_sb;
3803	int error;
3804
3805	error = security_sb_kern_mount(sb);
3806	if (!error && mount_too_revealing(sb, new_mnt_flags: &mnt_flags))
3807	error = -EPERM;
3808
3809	if (unlikely(error)) {
3810	fc_drop_locked(fc);
3811	return error;
3812	}
3813
3814	up_write(sem: &sb->s_umount);
3815
3816	mnt = vfs_create_mount(fc);
3817	if (IS_ERR(ptr: mnt))
3818	return PTR_ERR(ptr: mnt);
3819
3820	mnt_warn_timestamp_expiry(mountpoint, mnt);
3821
3822	mp = lock_mount(path: mountpoint);
3823	if (IS_ERR(ptr: mp)) {
3824	mntput(mnt);
3825	return PTR_ERR(ptr: mp);
3826	}
3827	error = do_add_mount(newmnt: real_mount(mnt), mp, path: mountpoint, mnt_flags);
3828	unlock_mount(where: mp);
3829	if (error < `0`)
3830	mntput(mnt);
3831	return error;
3832	}
3833
3834	/*
3835	* create a new mount for userspace and request it to be added into the
3836	* namespace's tree
3837	*/
3838	static int do_new_mount(struct path path, const* char fstype, int* sb_flags,
3839	int mnt_flags, const char name, void* *data)
3840	{
3841	struct file_system_type *type;
3842	struct fs_context *fc;
3843	const char *subtype = NULL;
3844	int err = `0`;
3845
3846	if (!fstype)
3847	return -EINVAL;
3848
3849	type = get_fs_type(name: fstype);
3850	if (!type)
3851	return -ENODEV;
3852
3853	if (type->fs_flags & FS_HAS_SUBTYPE) {
3854	subtype = strchr(fstype, `'.'`);
3855	if (subtype) {
3856	subtype++;
3857	if (!*subtype) {
3858	put_filesystem(fs: type);
3859	return -EINVAL;
3860	}
3861	}
3862	}
3863
3864	fc = fs_context_for_mount(fs_type: type, sb_flags);
3865	put_filesystem(fs: type);
3866	if (IS_ERR(ptr: fc))
3867	return PTR_ERR(ptr: fc);
3868
3869	/*
3870	* Indicate to the filesystem that the mount request is coming
3871	* from the legacy mount system call.
3872	*/
3873	fc->oldapi = true;
3874
3875	if (subtype)
3876	err = vfs_parse_fs_string(fc, key: "subtype",
3877	value: subtype, strlen(subtype));
3878	if (!err && name)
3879	err = vfs_parse_fs_string(fc, key: "source", value: name, strlen(name));
3880	if (!err)
3881	err = parse_monolithic_mount_data(fc, data);
3882	if (!err && !mount_capable(fc))
3883	err = -EPERM;
3884	if (!err)
3885	err = vfs_get_tree(fc);
3886	if (!err)
3887	err = do_new_mount_fc(fc, mountpoint: path, mnt_flags);
3888
3889	put_fs_context(fc);
3890	return err;
3891	}
3892
3893	int finish_automount(struct vfsmount m, const* struct path *path)
3894	{
3895	struct dentry *dentry = path->dentry;
3896	struct mountpoint *mp;
3897	struct mount *mnt;
3898	int err;
3899
3900	if (!m)
3901	return `0`;
3902	if (IS_ERR(ptr: m))
3903	return PTR_ERR(ptr: m);
3904
3905	mnt = real_mount(mnt: m);
3906
3907	if (m->mnt_sb == path->mnt->mnt_sb &&
3908	m->mnt_root == dentry) {
3909	err = -ELOOP;
3910	goto discard;
3911	}
3912
3913	/*
3914	* we don't want to use lock_mount() - in this case finding something
3915	* that overmounts our mountpoint to be means "quitely drop what we've
3916	* got", not "try to mount it on top".
3917	*/
3918	inode_lock(inode: dentry->d_inode);
3919	namespace_lock();
3920	if (unlikely(cant_mount(dentry))) {
3921	err = -ENOENT;
3922	goto discard_locked;
3923	}
3924	if (path_overmounted(path)) {
3925	err = `0`;
3926	goto discard_locked;
3927	}
3928	mp = get_mountpoint(dentry);
3929	if (IS_ERR(ptr: mp)) {
3930	err = PTR_ERR(ptr: mp);
3931	goto discard_locked;
3932	}
3933
3934	err = do_add_mount(newmnt: mnt, mp, path, mnt_flags: path->mnt->mnt_flags \| MNT_SHRINKABLE);
3935	unlock_mount(where: mp);
3936	if (unlikely(err))
3937	goto discard;
3938	return `0`;
3939
3940	discard_locked:
3941	namespace_unlock();
3942	inode_unlock(inode: dentry->d_inode);
3943	discard:
3944	/ remove m from any expiration list it may be on /
3945	if (!list_empty(head: &mnt->mnt_expire)) {
3946	namespace_lock();
3947	list_del_init(entry: &mnt->mnt_expire);
3948	namespace_unlock();
3949	}
3950	mntput(m);
3951	return err;
3952	}
3953
3954	/**
3955	* mnt_set_expiry - Put a mount on an expiration list
3956	* @mnt: The mount to list.
3957	* @expiry_list: The list to add the mount to.
3958	*/
3959	void mnt_set_expiry(struct vfsmount mnt, struct* list_head *expiry_list)
3960	{
3961	namespace_lock();
3962
3963	list_add_tail(new: &real_mount(mnt)->mnt_expire, head: expiry_list);
3964
3965	namespace_unlock();
3966	}
3967	EXPORT_SYMBOL(mnt_set_expiry);
3968
3969	/*
3970	* process a list of expirable mountpoints with the intent of discarding any
3971	* mountpoints that aren't in use and haven't been touched since last we came
3972	* here
3973	*/
3974	void mark_mounts_for_expiry(struct list_head *mounts)
3975	{
3976	struct mount mnt, next;
3977	LIST_HEAD(graveyard);
3978
3979	if (list_empty(head: mounts))
3980	return;
3981
3982	namespace_lock();
3983	lock_mount_hash();
3984
3985	/ extract from the expiration list every vfsmount that matches the*
3986	* following criteria:
3987	* - already mounted
3988	* - only referenced by its parent vfsmount
3989	* - still marked for expiry (marked on the last call here; marks are
3990	* cleared by mntput())
3991	*/
3992	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3993	if (!is_mounted(mnt: &mnt->mnt))
3994	continue;
3995	if (!xchg(&mnt->mnt_expiry_mark, `1`) \|\|
3996	propagate_mount_busy(mnt, `1`))
3997	continue;
3998	list_move(list: &mnt->mnt_expire, head: &graveyard);
3999	}
4000	while (!list_empty(head: &graveyard)) {
4001	mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
4002	touch_mnt_namespace(ns: mnt->mnt_ns);
4003	umount_tree(mnt, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
4004	}
4005	unlock_mount_hash();
4006	namespace_unlock();
4007	}
4008
4009	EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
4010
4011	/*
4012	* Ripoff of 'select_parent()'
4013	*
4014	* search the list of submounts for a given mountpoint, and move any
4015	* shrinkable submounts to the 'graveyard' list.
4016	*/
4017	static int select_submounts(struct mount parent, struct* list_head *graveyard)
4018	{
4019	struct mount *this_parent = parent;
4020	struct list_head *next;
4021	int found = `0`;
4022
4023	repeat:
4024	next = this_parent->mnt_mounts.next;
4025	resume:
4026	while (next != &this_parent->mnt_mounts) {
4027	struct list_head *tmp = next;
4028	struct mount mnt = list_entry(tmp, struct* mount, mnt_child);
4029
4030	next = tmp->next;
4031	if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
4032	continue;
4033	/*
4034	* Descend a level if the d_mounts list is non-empty.
4035	*/
4036	if (!list_empty(head: &mnt->mnt_mounts)) {
4037	this_parent = mnt;
4038	goto repeat;
4039	}
4040
4041	if (!propagate_mount_busy(mnt, `1`)) {
4042	list_move_tail(list: &mnt->mnt_expire, head: graveyard);
4043	found++;
4044	}
4045	}
4046	/*
4047	* All done at this level ... ascend and resume the search
4048	*/
4049	if (this_parent != parent) {
4050	next = this_parent->mnt_child.next;
4051	this_parent = this_parent->mnt_parent;
4052	goto resume;
4053	}
4054	return found;
4055	}
4056
4057	/*
4058	* process a list of expirable mountpoints with the intent of discarding any
4059	* submounts of a specific parent mountpoint
4060	*
4061	* mount_lock must be held for write
4062	*/
4063	static void shrink_submounts(struct mount *mnt)
4064	{
4065	LIST_HEAD(graveyard);
4066	struct mount *m;
4067
4068	/ extract submounts of 'mountpoint' from the expiration list /
4069	while (select_submounts(parent: mnt, graveyard: &graveyard)) {
4070	while (!list_empty(head: &graveyard)) {
4071	m = list_first_entry(&graveyard, struct mount,
4072	mnt_expire);
4073	touch_mnt_namespace(ns: m->mnt_ns);
4074	umount_tree(mnt: m, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
4075	}
4076	}
4077	}
4078
4079	static void copy_mount_options(const* void __user * data)
4080	{
4081	char *copy;
4082	unsigned left, offset;
4083
4084	if (!data)
4085	return NULL;
4086
4087	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
4088	if (!copy)
4089	return ERR_PTR(error: -ENOMEM);
4090
4091	left = copy_from_user(to: copy, from: data, PAGE_SIZE);
4092
4093	/*
4094	* Not all architectures have an exact copy_from_user(). Resort to
4095	* byte at a time.
4096	*/
4097	offset = PAGE_SIZE - left;
4098	while (left) {
4099	char c;
4100	if (get_user(c, (const char __user *)data + offset))
4101	break;
4102	copy[offset] = c;
4103	left--;
4104	offset++;
4105	}
4106
4107	if (left == PAGE_SIZE) {
4108	kfree(objp: copy);
4109	return ERR_PTR(error: -EFAULT);
4110	}
4111
4112	return copy;
4113	}
4114
4115	static char copy_mount_string(const* void __user *data)
4116	{
4117	return data ? strndup_user(data, PATH_MAX) : NULL;
4118	}
4119
4120	/*
4121	* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
4122	* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
4123	*
4124	* data is a (void *) that can point to any structure up to
4125	* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
4126	* information (or be NULL).
4127	*
4128	* Pre-0.97 versions of mount() didn't have a flags word.
4129	* When the flags word was introduced its top half was required
4130	* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
4131	* Therefore, if this magic number is present, it carries no information
4132	* and must be discarded.
4133	*/
4134	int path_mount(const char dev_name, struct* path *path,
4135	const char type_page, unsigned* long flags, void *data_page)
4136	{
4137	unsigned int mnt_flags = `0`, sb_flags;
4138	int ret;
4139
4140	/ Discard magic /
4141	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
4142	flags &= ~MS_MGC_MSK;
4143
4144	/ Basic sanity checks /
4145	if (data_page)
4146	((char *)data_page)[PAGE_SIZE - `1`] = `0`;
4147
4148	if (flags & MS_NOUSER)
4149	return -EINVAL;
4150
4151	ret = security_sb_mount(dev_name, path, type: type_page, flags, data: data_page);
4152	if (ret)
4153	return ret;
4154	if (!may_mount())
4155	return -EPERM;
4156	if (flags & SB_MANDLOCK)
4157	warn_mandlock();
4158
4159	/ Default to relatime unless overriden /
4160	if (!(flags & MS_NOATIME))
4161	mnt_flags \|= MNT_RELATIME;
4162
4163	/ Separate the per-mountpoint flags /
4164	if (flags & MS_NOSUID)
4165	mnt_flags \|= MNT_NOSUID;
4166	if (flags & MS_NODEV)
4167	mnt_flags \|= MNT_NODEV;
4168	if (flags & MS_NOEXEC)
4169	mnt_flags \|= MNT_NOEXEC;
4170	if (flags & MS_NOATIME)
4171	mnt_flags \|= MNT_NOATIME;
4172	if (flags & MS_NODIRATIME)
4173	mnt_flags \|= MNT_NODIRATIME;
4174	if (flags & MS_STRICTATIME)
4175	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
4176	if (flags & MS_RDONLY)
4177	mnt_flags \|= MNT_READONLY;
4178	if (flags & MS_NOSYMFOLLOW)
4179	mnt_flags \|= MNT_NOSYMFOLLOW;
4180
4181	/ The default atime for remount is preservation /
4182	if ((flags & MS_REMOUNT) &&
4183	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
4184	MS_STRICTATIME)) == `0`)) {
4185	mnt_flags &= ~MNT_ATIME_MASK;
4186	mnt_flags \|= path->mnt->mnt_flags & MNT_ATIME_MASK;
4187	}
4188
4189	sb_flags = flags & (SB_RDONLY \|
4190	SB_SYNCHRONOUS \|
4191	SB_MANDLOCK \|
4192	SB_DIRSYNC \|
4193	SB_SILENT \|
4194	SB_POSIXACL \|
4195	SB_LAZYTIME \|
4196	SB_I_VERSION);
4197
4198	if ((flags & (MS_REMOUNT \| MS_BIND)) == (MS_REMOUNT \| MS_BIND))
4199	return do_reconfigure_mnt(path, mnt_flags);
4200	if (flags & MS_REMOUNT)
4201	return do_remount(path, ms_flags: flags, sb_flags, mnt_flags, data: data_page);
4202	if (flags & MS_BIND)
4203	return do_loopback(path, old_name: dev_name, recurse: flags & MS_REC);
4204	if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
4205	return do_change_type(path, ms_flags: flags);
4206	if (flags & MS_MOVE)
4207	return do_move_mount_old(path, old_name: dev_name);
4208
4209	return do_new_mount(path, fstype: type_page, sb_flags, mnt_flags, name: dev_name,
4210	data: data_page);
4211	}
4212
4213	int do_mount(const char dev_name, const* char __user *dir_name,
4214	const char type_page, unsigned* long flags, void *data_page)
4215	{
4216	struct path path;
4217	int ret;
4218
4219	ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
4220	if (ret)
4221	return ret;
4222	ret = path_mount(dev_name, path: &path, type_page, flags, data_page);
4223	path_put(&path);
4224	return ret;
4225	}
4226
4227	static struct ucounts inc_mnt_namespaces(struct* user_namespace *ns)
4228	{
4229	return inc_ucount(ns, current_euid(), type: UCOUNT_MNT_NAMESPACES);
4230	}
4231
4232	static void dec_mnt_namespaces(struct ucounts *ucounts)
4233	{
4234	dec_ucount(ucounts, type: UCOUNT_MNT_NAMESPACES);
4235	}
4236
4237	static void free_mnt_ns(struct mnt_namespace *ns)
4238	{
4239	if (!is_anon_ns(ns))
4240	ns_free_inum(&ns->ns);
4241	dec_mnt_namespaces(ucounts: ns->ucounts);
4242	mnt_ns_tree_remove(ns);
4243	}
4244
/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  A 64bit
 * number incrementing at 10GHz would still take roughly 58 years
 * (2^64 / 10^10 seconds) to wrap, and mount namespaces are allocated
 * far more slowly than that, so we can ignore the possibility.
 */
4252	static atomic64_t mnt_ns_seq = ATOMIC64_INIT(`1`);
4253
4254	static struct mnt_namespace alloc_mnt_ns(struct* user_namespace *user_ns, bool anon)
4255	{
4256	struct mnt_namespace *new_ns;
4257	struct ucounts *ucounts;
4258	int ret;
4259
4260	ucounts = inc_mnt_namespaces(ns: user_ns);
4261	if (!ucounts)
4262	return ERR_PTR(error: -ENOSPC);
4263
4264	new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
4265	if (!new_ns) {
4266	dec_mnt_namespaces(ucounts);
4267	return ERR_PTR(error: -ENOMEM);
4268	}
4269	if (!anon) {
4270	ret = ns_alloc_inum(ns: &new_ns->ns);
4271	if (ret) {
4272	kfree(objp: new_ns);
4273	dec_mnt_namespaces(ucounts);
4274	return ERR_PTR(error: ret);
4275	}
4276	}
4277	new_ns->ns.ops = &mntns_operations;
4278	if (!anon)
4279	new_ns->seq = atomic64_inc_return(v: &mnt_ns_seq);
4280	refcount_set(r: &new_ns->ns.count, n: `1`);
4281	refcount_set(r: &new_ns->passive, n: `1`);
4282	new_ns->mounts = RB_ROOT;
4283	INIT_LIST_HEAD(list: &new_ns->mnt_ns_list);
4284	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
4285	init_waitqueue_head(&new_ns->poll);
4286	new_ns->user_ns = get_user_ns(ns: user_ns);
4287	new_ns->ucounts = ucounts;
4288	return new_ns;
4289	}
4290
4291	__latent_entropy
4292	struct mnt_namespace copy_mnt_ns(unsigned* long flags, struct mnt_namespace *ns,
4293	struct user_namespace user_ns, struct* fs_struct *new_fs)
4294	{
4295	struct mnt_namespace *new_ns;
4296	struct vfsmount rootmnt = NULL, pwdmnt = NULL;
4297	struct mount p, q;
4298	struct mount *old;
4299	struct mount *new;
4300	int copy_flags;
4301
4302	BUG_ON(!ns);
4303
4304	if (likely(!(flags & CLONE_NEWNS))) {
4305	get_mnt_ns(ns);
4306	return ns;
4307	}
4308
4309	old = ns->root;
4310
4311	new_ns = alloc_mnt_ns(user_ns, anon: false);
4312	if (IS_ERR(ptr: new_ns))
4313	return new_ns;
4314
4315	namespace_lock();
4316	/ First pass: copy the tree topology /
4317	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
4318	if (user_ns != ns->user_ns)
4319	copy_flags \|= CL_SHARED_TO_SLAVE;
4320	new = copy_tree(src_root: old, dentry: old->mnt.mnt_root, flag: copy_flags);
4321	if (IS_ERR(ptr: new)) {
4322	namespace_unlock();
4323	ns_free_inum(&new_ns->ns);
4324	dec_mnt_namespaces(ucounts: new_ns->ucounts);
4325	mnt_ns_release(ns: new_ns);
4326	return ERR_CAST(ptr: new);
4327	}
4328	if (user_ns != ns->user_ns) {
4329	lock_mount_hash();
4330	lock_mnt_tree(mnt: new);
4331	unlock_mount_hash();
4332	}
4333	new_ns->root = new;
4334
4335	/*
4336	* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
4337	* as belonging to new namespace. We have already acquired a private
4338	* fs_struct, so tsk->fs->lock is not needed.
4339	*/
4340	p = old;
4341	q = new;
4342	while (p) {
4343	mnt_add_to_ns(ns: new_ns, mnt: q);
4344	new_ns->nr_mounts++;
4345	if (new_fs) {
4346	if (&p->mnt == new_fs->root.mnt) {
4347	new_fs->root.mnt = mntget(&q->mnt);
4348	rootmnt = &p->mnt;
4349	}
4350	if (&p->mnt == new_fs->pwd.mnt) {
4351	new_fs->pwd.mnt = mntget(&q->mnt);
4352	pwdmnt = &p->mnt;
4353	}
4354	}
4355	p = next_mnt(p, root: old);
4356	q = next_mnt(p: q, root: new);
4357	if (!q)
4358	break;
4359	// an mntns binding we'd skipped?
4360	while (p->mnt.mnt_root != q->mnt.mnt_root)
4361	p = next_mnt(p: skip_mnt_tree(p), root: old);
4362	}
4363	namespace_unlock();
4364
4365	if (rootmnt)
4366	mntput(rootmnt);
4367	if (pwdmnt)
4368	mntput(pwdmnt);
4369
4370	mnt_ns_tree_add(ns: new_ns);
4371	return new_ns;
4372	}
4373
4374	struct dentry mount_subtree(struct* vfsmount m, const* char *name)
4375	{
4376	struct mount *mnt = real_mount(mnt: m);
4377	struct mnt_namespace *ns;
4378	struct super_block *s;
4379	struct path path;
4380	int err;
4381
4382	ns = alloc_mnt_ns(user_ns: &init_user_ns, anon: true);
4383	if (IS_ERR(ptr: ns)) {
4384	mntput(m);
4385	return ERR_CAST(ptr: ns);
4386	}
4387	ns->root = mnt;
4388	ns->nr_mounts++;
4389	mnt_add_to_ns(ns, mnt);
4390
4391	err = vfs_path_lookup(m->mnt_root, m,
4392	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
4393
4394	put_mnt_ns(ns);
4395
4396	if (err)
4397	return ERR_PTR(error: err);
4398
4399	/ trade a vfsmount reference for active sb one /
4400	s = path.mnt->mnt_sb;
4401	atomic_inc(v: &s->s_active);
4402	mntput(path.mnt);
4403	/ lock the sucker /
4404	down_write(sem: &s->s_umount);
4405	/ ... and return the root of (sub)tree on it /
4406	return path.dentry;
4407	}
4408	EXPORT_SYMBOL(mount_subtree);
4409
4410	SYSCALL_DEFINE5(mount, char __user , dev_name, char* __user *, dir_name,
4411	char __user , type, unsigned* long, flags, void __user *, data)
4412	{
4413	int ret;
4414	char *kernel_type;
4415	char *kernel_dev;
4416	void *options;
4417
4418	kernel_type = copy_mount_string(data: type);
4419	ret = PTR_ERR(ptr: kernel_type);
4420	if (IS_ERR(ptr: kernel_type))
4421	goto out_type;
4422
4423	kernel_dev = copy_mount_string(data: dev_name);
4424	ret = PTR_ERR(ptr: kernel_dev);
4425	if (IS_ERR(ptr: kernel_dev))
4426	goto out_dev;
4427
4428	options = copy_mount_options(data);
4429	ret = PTR_ERR(ptr: options);
4430	if (IS_ERR(ptr: options))
4431	goto out_data;
4432
4433	ret = do_mount(dev_name: kernel_dev, dir_name, type_page: kernel_type, flags, data_page: options);
4434
4435	kfree(objp: options);
4436	out_data:
4437	kfree(objp: kernel_dev);
4438	out_dev:
4439	kfree(objp: kernel_type);
4440	out_type:
4441	return ret;
4442	}
4443
/* MOUNT_ATTR_* bits that fsmount() accepts in its attr_flags argument. */
#define FSMOUNT_VALID_FLAGS						\
	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |	\
	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |	\
	 MOUNT_ATTR_NOSYMFOLLOW)

/*
 * MOUNT_ATTR_* bits that build_mount_kattr() accepts in attr_set/attr_clr:
 * everything fsmount() takes plus MOUNT_ATTR_IDMAP.
 */
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)

/* Propagation types build_mount_kattr() accepts in mount_attr.propagation. */
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
4453
4454	static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
4455	{
4456	unsigned int mnt_flags = `0`;
4457
4458	if (attr_flags & MOUNT_ATTR_RDONLY)
4459	mnt_flags \|= MNT_READONLY;
4460	if (attr_flags & MOUNT_ATTR_NOSUID)
4461	mnt_flags \|= MNT_NOSUID;
4462	if (attr_flags & MOUNT_ATTR_NODEV)
4463	mnt_flags \|= MNT_NODEV;
4464	if (attr_flags & MOUNT_ATTR_NOEXEC)
4465	mnt_flags \|= MNT_NOEXEC;
4466	if (attr_flags & MOUNT_ATTR_NODIRATIME)
4467	mnt_flags \|= MNT_NODIRATIME;
4468	if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
4469	mnt_flags \|= MNT_NOSYMFOLLOW;
4470
4471	return mnt_flags;
4472	}
4473
4474	/*
4475	* Create a kernel mount representation for a new, prepared superblock
4476	* (specified by fs_fd) and attach to an open_tree-like file descriptor.
4477	*/
4478	SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
4479	unsigned int, attr_flags)
4480	{
4481	struct mnt_namespace *ns;
4482	struct fs_context *fc;
4483	struct file *file;
4484	struct path newmount;
4485	struct mount *mnt;
4486	unsigned int mnt_flags = `0`;
4487	long ret;
4488
4489	if (!may_mount())
4490	return -EPERM;
4491
4492	if ((flags & ~(FSMOUNT_CLOEXEC)) != `0`)
4493	return -EINVAL;
4494
4495	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
4496	return -EINVAL;
4497
4498	mnt_flags = attr_flags_to_mnt_flags(attr_flags);
4499
4500	switch (attr_flags & MOUNT_ATTR__ATIME) {
4501	case MOUNT_ATTR_STRICTATIME:
4502	break;
4503	case MOUNT_ATTR_NOATIME:
4504	mnt_flags \|= MNT_NOATIME;
4505	break;
4506	case MOUNT_ATTR_RELATIME:
4507	mnt_flags \|= MNT_RELATIME;
4508	break;
4509	default:
4510	return -EINVAL;
4511	}
4512
4513	CLASS(fd, f)(fd: fs_fd);
4514	if (fd_empty(f))
4515	return -EBADF;
4516
4517	if (fd_file(f)->f_op != &fscontext_fops)
4518	return -EINVAL;
4519
4520	fc = fd_file(f)->private_data;
4521
4522	ret = mutex_lock_interruptible(&fc->uapi_mutex);
4523	if (ret < `0`)
4524	return ret;
4525
4526	/ There must be a valid superblock or we can't mount it /
4527	ret = -EINVAL;
4528	if (!fc->root)
4529	goto err_unlock;
4530
4531	ret = -EPERM;
4532	if (mount_too_revealing(sb: fc->root->d_sb, new_mnt_flags: &mnt_flags)) {
4533	pr_warn("VFS: Mount too revealing\n");
4534	goto err_unlock;
4535	}
4536
4537	ret = -EBUSY;
4538	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
4539	goto err_unlock;
4540
4541	if (fc->sb_flags & SB_MANDLOCK)
4542	warn_mandlock();
4543
4544	newmount.mnt = vfs_create_mount(fc);
4545	if (IS_ERR(ptr: newmount.mnt)) {
4546	ret = PTR_ERR(ptr: newmount.mnt);
4547	goto err_unlock;
4548	}
4549	newmount.dentry = dget(dentry: fc->root);
4550	newmount.mnt->mnt_flags = mnt_flags;
4551
4552	/ We've done the mount bit - now move the file context into more or*
4553	* less the same state as if we'd done an fspick(). We don't want to
4554	* do any memory allocation or anything like that at this point as we
4555	* don't want to have to handle any errors incurred.
4556	*/
4557	vfs_clean_context(fc);
4558
4559	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, anon: true);
4560	if (IS_ERR(ptr: ns)) {
4561	ret = PTR_ERR(ptr: ns);
4562	goto err_path;
4563	}
4564	mnt = real_mount(mnt: newmount.mnt);
4565	ns->root = mnt;
4566	ns->nr_mounts = `1`;
4567	mnt_add_to_ns(ns, mnt);
4568	mntget(newmount.mnt);
4569
4570	/ Attach to an apparent O_PATH fd with a note that we need to unmount*
4571	* it, not just simply put it.
4572	*/
4573	file = dentry_open(path: &newmount, O_PATH, creds: fc->cred);
4574	if (IS_ERR(ptr: file)) {
4575	dissolve_on_fput(mnt: newmount.mnt);
4576	ret = PTR_ERR(ptr: file);
4577	goto err_path;
4578	}
4579	file->f_mode \|= FMODE_NEED_UNMOUNT;
4580
4581	ret = get_unused_fd_flags(flags: (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : `0`);
4582	if (ret >= `0`)
4583	fd_install(fd: ret, file);
4584	else
4585	fput(file);
4586
4587	err_path:
4588	path_put(&newmount);
4589	err_unlock:
4590	mutex_unlock(lock: &fc->uapi_mutex);
4591	return ret;
4592	}
4593
4594	static inline int vfs_move_mount(struct path from_path, struct* path *to_path,
4595	enum mnt_tree_flags_t mflags)
4596	{
4597	int ret;
4598
4599	ret = security_move_mount(from_path, to_path);
4600	if (ret)
4601	return ret;
4602
4603	if (mflags & MNT_TREE_PROPAGATION)
4604	return do_set_group(from_path, to_path);
4605
4606	return do_move_mount(old_path: from_path, new_path: to_path, flags: mflags);
4607	}
4608
4609	/*
4610	* Move a mount from one place to another. In combination with
4611	* fsopen()/fsmount() this is used to install a new mount and in combination
4612	* with open_tree(OPEN_TREE_CLONE [\| AT_RECURSIVE]) it can be used to copy
4613	* a mount subtree.
4614	*
4615	* Note the flags value is a combination of MOVE_MOUNT_* flags.
4616	*/
4617	SYSCALL_DEFINE5(move_mount,
4618	int, from_dfd, const char __user *, from_pathname,
4619	int, to_dfd, const char __user *, to_pathname,
4620	unsigned int, flags)
4621	{
4622	struct path to_path __free(path_put) = {};
4623	struct path from_path __free(path_put) = {};
4624	struct filename *to_name __free(putname) = NULL;
4625	struct filename *from_name __free(putname) = NULL;
4626	unsigned int lflags, uflags;
4627	enum mnt_tree_flags_t mflags = `0`;
4628	int ret = `0`;
4629
4630	if (!may_mount())
4631	return -EPERM;
4632
4633	if (flags & ~MOVE_MOUNT__MASK)
4634	return -EINVAL;
4635
4636	if ((flags & (MOVE_MOUNT_BENEATH \| MOVE_MOUNT_SET_GROUP)) ==
4637	(MOVE_MOUNT_BENEATH \| MOVE_MOUNT_SET_GROUP))
4638	return -EINVAL;
4639
4640	if (flags & MOVE_MOUNT_SET_GROUP) mflags \|= MNT_TREE_PROPAGATION;
4641	if (flags & MOVE_MOUNT_BENEATH) mflags \|= MNT_TREE_BENEATH;
4642
4643	lflags = `0`;
4644	if (flags & MOVE_MOUNT_F_SYMLINKS) lflags \|= LOOKUP_FOLLOW;
4645	if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags \|= LOOKUP_AUTOMOUNT;
4646	uflags = `0`;
4647	if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH;
4648	from_name = getname_maybe_null(name: from_pathname, flags: uflags);
4649	if (IS_ERR(ptr: from_name))
4650	return PTR_ERR(ptr: from_name);
4651
4652	lflags = `0`;
4653	if (flags & MOVE_MOUNT_T_SYMLINKS) lflags \|= LOOKUP_FOLLOW;
4654	if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags \|= LOOKUP_AUTOMOUNT;
4655	uflags = `0`;
4656	if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH;
4657	to_name = getname_maybe_null(name: to_pathname, flags: uflags);
4658	if (IS_ERR(ptr: to_name))
4659	return PTR_ERR(ptr: to_name);
4660
4661	if (!to_name && to_dfd >= `0`) {
4662	CLASS(fd_raw, f_to)(fd: to_dfd);
4663	if (fd_empty(f: f_to))
4664	return -EBADF;
4665
4666	to_path = fd_file(f_to)->f_path;
4667	path_get(&to_path);
4668	} else {
4669	ret = filename_lookup(dfd: to_dfd, name: to_name, flags: lflags, path: &to_path, NULL);
4670	if (ret)
4671	return ret;
4672	}
4673
4674	if (!from_name && from_dfd >= `0`) {
4675	CLASS(fd_raw, f_from)(fd: from_dfd);
4676	if (fd_empty(f: f_from))
4677	return -EBADF;
4678
4679	return vfs_move_mount(from_path: &fd_file(f_from)->f_path, to_path: &to_path, mflags);
4680	}
4681
4682	ret = filename_lookup(dfd: from_dfd, name: from_name, flags: lflags, path: &from_path, NULL);
4683	if (ret)
4684	return ret;
4685
4686	return vfs_move_mount(from_path: &from_path, to_path: &to_path, mflags);
4687	}
4688
4689	/*
4690	* Return true if path is reachable from root
4691	*
4692	* namespace_sem or mount_lock is held
4693	*/
4694	bool is_path_reachable(struct mount mnt, struct* dentry *dentry,
4695	const struct path *root)
4696	{
4697	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
4698	dentry = mnt->mnt_mountpoint;
4699	mnt = mnt->mnt_parent;
4700	}
4701	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
4702	}
4703
4704	bool path_is_under(const struct path path1, const* struct path *path2)
4705	{
4706	bool res;
4707	read_seqlock_excl(sl: &mount_lock);
4708	res = is_path_reachable(mnt: real_mount(mnt: path1->mnt), dentry: path1->dentry, root: path2);
4709	read_sequnlock_excl(sl: &mount_lock);
4710	return res;
4711	}
4712	EXPORT_SYMBOL(path_is_under);
4713
4714	/*
4715	* pivot_root Semantics:
4716	* Moves the root file system of the current process to the directory put_old,
4717	* makes new_root as the new root file system of the current process, and sets
4718	* root/cwd of all processes which had them on the current root to new_root.
4719	*
4720	* Restrictions:
4721	* The new_root and put_old must be directories, and must not be on the
4722	* same file system as the current process root. The put_old must be
4723	* underneath new_root, i.e. adding a non-zero number of /.. to the string
4724	* pointed to by put_old must yield the same directory as new_root. No other
4725	* file system may be mounted on put_old. After all, new_root is a mountpoint.
4726	*
4727	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
4728	* See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
4729	* in this situation.
4730	*
4731	* Notes:
4732	* - we don't move root/cwd if they are not at the root (reason: if something
4733	* cared enough to change them, it's probably wrong to force them elsewhere)
4734	* - it's okay to pick a root that isn't the root of a file system, e.g.
4735	* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
4736	* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
4737	* first.
4738	*/
4739	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
4740	const char __user *, put_old)
4741	{
4742	struct path new, old, root;
4743	struct mount new_mnt, root_mnt, old_mnt, root_parent, *ex_parent;
4744	struct mountpoint old_mp, root_mp;
4745	int error;
4746
4747	if (!may_mount())
4748	return -EPERM;
4749
4750	error = user_path_at(AT_FDCWD, new_root,
4751	LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &new);
4752	if (error)
4753	goto out0;
4754
4755	error = user_path_at(AT_FDCWD, put_old,
4756	LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &old);
4757	if (error)
4758	goto out1;
4759
4760	error = security_sb_pivotroot(old_path: &old, new_path: &new);
4761	if (error)
4762	goto out2;
4763
4764	get_fs_root(current->fs, root: &root);
4765	old_mp = lock_mount(path: &old);
4766	error = PTR_ERR(ptr: old_mp);
4767	if (IS_ERR(ptr: old_mp))
4768	goto out3;
4769
4770	error = -EINVAL;
4771	new_mnt = real_mount(mnt: new.mnt);
4772	root_mnt = real_mount(mnt: root.mnt);
4773	old_mnt = real_mount(mnt: old.mnt);
4774	ex_parent = new_mnt->mnt_parent;
4775	root_parent = root_mnt->mnt_parent;
4776	if (IS_MNT_SHARED(old_mnt) \|\|
4777	IS_MNT_SHARED(ex_parent) \|\|
4778	IS_MNT_SHARED(root_parent))
4779	goto out4;
4780	if (!check_mnt(mnt: root_mnt) \|\| !check_mnt(mnt: new_mnt))
4781	goto out4;
4782	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
4783	goto out4;
4784	error = -ENOENT;
4785	if (d_unlinked(dentry: new.dentry))
4786	goto out4;
4787	error = -EBUSY;
4788	if (new_mnt == root_mnt \|\| old_mnt == root_mnt)
4789	goto out4; / loop, on the same file system /
4790	error = -EINVAL;
4791	if (!path_mounted(path: &root))
4792	goto out4; / not a mountpoint /
4793	if (!mnt_has_parent(mnt: root_mnt))
4794	goto out4; / not attached /
4795	if (!path_mounted(path: &new))
4796	goto out4; / not a mountpoint /
4797	if (!mnt_has_parent(mnt: new_mnt))
4798	goto out4; / not attached /
4799	/ make sure we can reach put_old from new_root /
4800	if (!is_path_reachable(mnt: old_mnt, dentry: old.dentry, root: &new))
4801	goto out4;
4802	/ make certain new is below the root /
4803	if (!is_path_reachable(mnt: new_mnt, dentry: new.dentry, root: &root))
4804	goto out4;
4805	lock_mount_hash();
4806	umount_mnt(mnt: new_mnt);
4807	root_mp = unhash_mnt(mnt: root_mnt); / we'll need its mountpoint /
4808	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
4809	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
4810	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
4811	}
4812	/ mount old root on put_old /
4813	attach_mnt(mnt: root_mnt, parent: old_mnt, mp: old_mp, beneath: false);
4814	/ mount new_root on / /
4815	attach_mnt(mnt: new_mnt, parent: root_parent, mp: root_mp, beneath: false);
4816	mnt_add_count(mnt: root_parent, n: -`1`);
4817	touch_mnt_namespace(current->nsproxy->mnt_ns);
4818	/ A moved mount should not expire automatically /
4819	list_del_init(entry: &new_mnt->mnt_expire);
4820	put_mountpoint(mp: root_mp);
4821	unlock_mount_hash();
4822	mnt_notify_add(m: root_mnt);
4823	mnt_notify_add(m: new_mnt);
4824	chroot_fs_refs(&root, &new);
4825	error = `0`;
4826	out4:
4827	unlock_mount(where: old_mp);
4828	if (!error)
4829	mntput_no_expire(mnt: ex_parent);
4830	out3:
4831	path_put(&root);
4832	out2:
4833	path_put(&old);
4834	out1:
4835	path_put(&new);
4836	out0:
4837	return error;
4838	}
4839
4840	static unsigned int recalc_flags(struct mount_kattr kattr, struct* mount *mnt)
4841	{
4842	unsigned int flags = mnt->mnt.mnt_flags;
4843
4844	/ flags to clear /
4845	flags &= ~kattr->attr_clr;
4846	/ flags to raise /
4847	flags \|= kattr->attr_set;
4848
4849	return flags;
4850	}
4851
4852	static int can_idmap_mount(const struct mount_kattr kattr, struct* mount *mnt)
4853	{
4854	struct vfsmount *m = &mnt->mnt;
4855	struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
4856
4857	if (!kattr->mnt_idmap)
4858	return `0`;
4859
4860	/*
4861	* Creating an idmapped mount with the filesystem wide idmapping
4862	* doesn't make sense so block that. We don't allow mushy semantics.
4863	*/
4864	if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
4865	return -EINVAL;
4866
4867	/*
4868	* We only allow an mount to change it's idmapping if it has
4869	* never been accessible to userspace.
4870	*/
4871	if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(mnt: m))
4872	return -EPERM;
4873
4874	/ The underlying filesystem doesn't support idmapped mounts yet. /
4875	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
4876	return -EINVAL;
4877
4878	/ The filesystem has turned off idmapped mounts. /
4879	if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
4880	return -EINVAL;
4881
4882	/ We're not controlling the superblock. /
4883	if (!ns_capable(ns: fs_userns, CAP_SYS_ADMIN))
4884	return -EPERM;
4885
4886	/ Mount has already been visible in the filesystem hierarchy. /
4887	if (!is_anon_ns(ns: mnt->mnt_ns))
4888	return -EINVAL;
4889
4890	return `0`;
4891	}
4892
4893	/**
4894	* mnt_allow_writers() - check whether the attribute change allows writers
4895	* @kattr: the new mount attributes
4896	* @mnt: the mount to which @kattr will be applied
4897	*
4898	* Check whether thew new mount attributes in @kattr allow concurrent writers.
4899	*
4900	* Return: true if writers need to be held, false if not
4901	*/
4902	static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
4903	const struct mount *mnt)
4904	{
4905	return (!(kattr->attr_set & MNT_READONLY) \|\|
4906	(mnt->mnt.mnt_flags & MNT_READONLY)) &&
4907	!kattr->mnt_idmap;
4908	}
4909
4910	static int mount_setattr_prepare(struct mount_kattr kattr, struct* mount *mnt)
4911	{
4912	struct mount *m;
4913	int err;
4914
4915	for (m = mnt; m; m = next_mnt(p: m, root: mnt)) {
4916	if (!can_change_locked_flags(mnt: m, mnt_flags: recalc_flags(kattr, mnt: m))) {
4917	err = -EPERM;
4918	break;
4919	}
4920
4921	err = can_idmap_mount(kattr, mnt: m);
4922	if (err)
4923	break;
4924
4925	if (!mnt_allow_writers(kattr, mnt: m)) {
4926	err = mnt_hold_writers(mnt: m);
4927	if (err)
4928	break;
4929	}
4930
4931	if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
4932	return `0`;
4933	}
4934
4935	if (err) {
4936	struct mount *p;
4937
4938	/*
4939	* If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
4940	* be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
4941	* mounts and needs to take care to include the first mount.
4942	*/
4943	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
4944	/ If we had to hold writers unblock them. /
4945	if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
4946	mnt_unhold_writers(mnt: p);
4947
4948	/*
4949	* We're done once the first mount we changed got
4950	* MNT_WRITE_HOLD unset.
4951	*/
4952	if (p == m)
4953	break;
4954	}
4955	}
4956	return err;
4957	}
4958
4959	static void do_idmap_mount(const struct mount_kattr kattr, struct* mount *mnt)
4960	{
4961	struct mnt_idmap *old_idmap;
4962
4963	if (!kattr->mnt_idmap)
4964	return;
4965
4966	old_idmap = mnt_idmap(mnt: &mnt->mnt);
4967
4968	/ Pairs with smp_load_acquire() in mnt_idmap(). /
4969	smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
4970	mnt_idmap_put(idmap: old_idmap);
4971	}
4972
4973	static void mount_setattr_commit(struct mount_kattr kattr, struct* mount *mnt)
4974	{
4975	struct mount *m;
4976
4977	for (m = mnt; m; m = next_mnt(p: m, root: mnt)) {
4978	unsigned int flags;
4979
4980	do_idmap_mount(kattr, mnt: m);
4981	flags = recalc_flags(kattr, mnt: m);
4982	WRITE_ONCE(m->mnt.mnt_flags, flags);
4983
4984	/ If we had to hold writers unblock them. /
4985	if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
4986	mnt_unhold_writers(mnt: m);
4987
4988	if (kattr->propagation)
4989	change_mnt_propagation(m, kattr->propagation);
4990	if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
4991	break;
4992	}
4993	touch_mnt_namespace(ns: mnt->mnt_ns);
4994	}
4995
4996	static int do_mount_setattr(struct path path, struct* mount_kattr *kattr)
4997	{
4998	struct mount *mnt = real_mount(mnt: path->mnt);
4999	int err = `0`;
5000
5001	if (!path_mounted(path))
5002	return -EINVAL;
5003
5004	if (kattr->mnt_userns) {
5005	struct mnt_idmap *mnt_idmap;
5006
5007	mnt_idmap = alloc_mnt_idmap(mnt_userns: kattr->mnt_userns);
5008	if (IS_ERR(ptr: mnt_idmap))
5009	return PTR_ERR(ptr: mnt_idmap);
5010	kattr->mnt_idmap = mnt_idmap;
5011	}
5012
5013	if (kattr->propagation) {
5014	/*
5015	* Only take namespace_lock() if we're actually changing
5016	* propagation.
5017	*/
5018	namespace_lock();
5019	if (kattr->propagation == MS_SHARED) {
5020	err = invent_group_ids(mnt, recurse: kattr->kflags & MOUNT_KATTR_RECURSE);
5021	if (err) {
5022	namespace_unlock();
5023	return err;
5024	}
5025	}
5026	}
5027
5028	err = -EINVAL;
5029	lock_mount_hash();
5030
5031	/ Ensure that this isn't anything purely vfs internal. /
5032	if (!is_mounted(mnt: &mnt->mnt))
5033	goto out;
5034
5035	/*
5036	* If this is an attached mount make sure it's located in the callers
5037	* mount namespace. If it's not don't let the caller interact with it.
5038	*
5039	* If this mount doesn't have a parent it's most often simply a
5040	* detached mount with an anonymous mount namespace. IOW, something
5041	* that's simply not attached yet. But there are apparently also users
5042	* that do change mount properties on the rootfs itself. That obviously
5043	* neither has a parent nor is it a detached mount so we cannot
5044	* unconditionally check for detached mounts.
5045	*/
5046	if ((mnt_has_parent(mnt) \|\| !is_anon_ns(ns: mnt->mnt_ns)) && !check_mnt(mnt))
5047	goto out;
5048
5049	/*
5050	* First, we get the mount tree in a shape where we can change mount
5051	* properties without failure. If we succeeded to do so we commit all
5052	* changes and if we failed we clean up.
5053	*/
5054	err = mount_setattr_prepare(kattr, mnt);
5055	if (!err)
5056	mount_setattr_commit(kattr, mnt);
5057
5058	out:
5059	unlock_mount_hash();
5060
5061	if (kattr->propagation) {
5062	if (err)
5063	cleanup_group_ids(mnt, NULL);
5064	namespace_unlock();
5065	}
5066
5067	return err;
5068	}
5069
5070	static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
5071	struct mount_kattr *kattr)
5072	{
5073	struct ns_common *ns;
5074	struct user_namespace *mnt_userns;
5075
5076	if (!((attr->attr_set \| attr->attr_clr) & MOUNT_ATTR_IDMAP))
5077	return `0`;
5078
5079	if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
5080	/*
5081	* We can only remove an idmapping if it's never been
5082	* exposed to userspace.
5083	*/
5084	if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
5085	return -EINVAL;
5086
5087	/*
5088	* Removal of idmappings is equivalent to setting
5089	* nop_mnt_idmap.
5090	*/
5091	if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
5092	kattr->mnt_idmap = &nop_mnt_idmap;
5093	return `0`;
5094	}
5095	}
5096
5097	if (attr->userns_fd > INT_MAX)
5098	return -EINVAL;
5099
5100	CLASS(fd, f)(fd: attr->userns_fd);
5101	if (fd_empty(f))
5102	return -EBADF;
5103
5104	if (!proc_ns_file(fd_file(f)))
5105	return -EINVAL;
5106
5107	ns = get_proc_ns(file_inode(fd_file(f)));
5108	if (ns->ops->type != CLONE_NEWUSER)
5109	return -EINVAL;
5110
5111	/*
5112	* The initial idmapping cannot be used to create an idmapped
5113	* mount. We use the initial idmapping as an indicator of a mount
5114	* that is not idmapped. It can simply be passed into helpers that
5115	* are aware of idmapped mounts as a convenient shortcut. A user
5116	* can just create a dedicated identity mapping to achieve the same
5117	* result.
5118	*/
5119	mnt_userns = container_of(ns, struct user_namespace, ns);
5120	if (mnt_userns == &init_user_ns)
5121	return -EPERM;
5122
5123	/ We're not controlling the target namespace. /
5124	if (!ns_capable(ns: mnt_userns, CAP_SYS_ADMIN))
5125	return -EPERM;
5126
5127	kattr->mnt_userns = get_user_ns(ns: mnt_userns);
5128	return `0`;
5129	}
5130
5131	static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
5132	struct mount_kattr *kattr)
5133	{
5134	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
5135	return -EINVAL;
5136	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > `1`)
5137	return -EINVAL;
5138	kattr->propagation = attr->propagation;
5139
5140	if ((attr->attr_set \| attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
5141	return -EINVAL;
5142
5143	kattr->attr_set = attr_flags_to_mnt_flags(attr_flags: attr->attr_set);
5144	kattr->attr_clr = attr_flags_to_mnt_flags(attr_flags: attr->attr_clr);
5145
5146	/*
5147	* Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
5148	* users wanting to transition to a different atime setting cannot
5149	* simply specify the atime setting in @attr_set, but must also
5150	* specify MOUNT_ATTR__ATIME in the @attr_clr field.
5151	* So ensure that MOUNT_ATTR__ATIME can't be partially set in
5152	* @attr_clr and that @attr_set can't have any atime bits set if
5153	* MOUNT_ATTR__ATIME isn't set in @attr_clr.
5154	*/
5155	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
5156	if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
5157	return -EINVAL;
5158
5159	/*
5160	* Clear all previous time settings as they are mutually
5161	* exclusive.
5162	*/
5163	kattr->attr_clr \|= MNT_RELATIME \| MNT_NOATIME;
5164	switch (attr->attr_set & MOUNT_ATTR__ATIME) {
5165	case MOUNT_ATTR_RELATIME:
5166	kattr->attr_set \|= MNT_RELATIME;
5167	break;
5168	case MOUNT_ATTR_NOATIME:
5169	kattr->attr_set \|= MNT_NOATIME;
5170	break;
5171	case MOUNT_ATTR_STRICTATIME:
5172	break;
5173	default:
5174	return -EINVAL;
5175	}
5176	} else {
5177	if (attr->attr_set & MOUNT_ATTR__ATIME)
5178	return -EINVAL;
5179	}
5180
5181	return build_mount_idmapped(attr, usize, kattr);
5182	}
5183
5184	static void finish_mount_kattr(struct mount_kattr *kattr)
5185	{
5186	if (kattr->mnt_userns) {
5187	put_user_ns(ns: kattr->mnt_userns);
5188	kattr->mnt_userns = NULL;
5189	}
5190
5191	if (kattr->mnt_idmap)
5192	mnt_idmap_put(idmap: kattr->mnt_idmap);
5193	}
5194
5195	static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
5196	struct mount_kattr *kattr)
5197	{
5198	int ret;
5199	struct mount_attr attr;
5200
5201	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
5202
5203	if (unlikely(usize > PAGE_SIZE))
5204	return -E2BIG;
5205	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
5206	return -EINVAL;
5207
5208	if (!may_mount())
5209	return -EPERM;
5210
5211	ret = copy_struct_from_user(dst: &attr, ksize: sizeof(attr), src: uattr, usize);
5212	if (ret)
5213	return ret;
5214
5215	/ Don't bother walking through the mounts if this is a nop. /
5216	if (attr.attr_set == `0` &&
5217	attr.attr_clr == `0` &&
5218	attr.propagation == `0`)
5219	return `0`; / Tell caller to not bother. /
5220
5221	ret = build_mount_kattr(attr: &attr, usize, kattr);
5222	if (ret < `0`)
5223	return ret;
5224
5225	return `1`;
5226	}
5227
5228	SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
5229	unsigned int, flags, struct mount_attr __user *, uattr,
5230	size_t, usize)
5231	{
5232	int err;
5233	struct path target;
5234	struct mount_kattr kattr;
5235	unsigned int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
5236
5237	if (flags & ~(AT_EMPTY_PATH \|
5238	AT_RECURSIVE \|
5239	AT_SYMLINK_NOFOLLOW \|
5240	AT_NO_AUTOMOUNT))
5241	return -EINVAL;
5242
5243	if (flags & AT_NO_AUTOMOUNT)
5244	lookup_flags &= ~LOOKUP_AUTOMOUNT;
5245	if (flags & AT_SYMLINK_NOFOLLOW)
5246	lookup_flags &= ~LOOKUP_FOLLOW;
5247	if (flags & AT_EMPTY_PATH)
5248	lookup_flags \|= LOOKUP_EMPTY;
5249
5250	kattr = (struct mount_kattr) {
5251	.lookup_flags = lookup_flags,
5252	};
5253
5254	if (flags & AT_RECURSIVE)
5255	kattr.kflags \|= MOUNT_KATTR_RECURSE;
5256
5257	err = wants_mount_setattr(uattr, usize, kattr: &kattr);
5258	if (err <= `0`)
5259	return err;
5260
5261	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
5262	if (!err) {
5263	err = do_mount_setattr(path: &target, kattr: &kattr);
5264	path_put(&target);
5265	}
5266	finish_mount_kattr(kattr: &kattr);
5267	return err;
5268	}
5269
5270	SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
5271	unsigned, flags, struct mount_attr __user *, uattr,
5272	size_t, usize)
5273	{
5274	struct file __free(fput) *file = NULL;
5275	int fd;
5276
5277	if (!uattr && usize)
5278	return -EINVAL;
5279
5280	file = vfs_open_tree(dfd, filename, flags);
5281	if (IS_ERR(ptr: file))
5282	return PTR_ERR(ptr: file);
5283
5284	if (uattr) {
5285	int ret;
5286	struct mount_kattr kattr = {};
5287
5288	kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
5289	if (flags & AT_RECURSIVE)
5290	kattr.kflags \|= MOUNT_KATTR_RECURSE;
5291
5292	ret = wants_mount_setattr(uattr, usize, kattr: &kattr);
5293	if (ret < `0`)
5294	return ret;
5295
5296	if (ret) {
5297	ret = do_mount_setattr(path: &file->f_path, kattr: &kattr);
5298	if (ret)
5299	return ret;
5300
5301	finish_mount_kattr(kattr: &kattr);
5302	}
5303	}
5304
5305	fd = get_unused_fd_flags(flags: flags & O_CLOEXEC);
5306	if (fd < `0`)
5307	return fd;
5308
5309	fd_install(fd, no_free_ptr(file));
5310	return fd;
5311	}
5312
5313	int show_path(struct seq_file m, struct* dentry *root)
5314	{
5315	if (root->d_sb->s_op->show_path)
5316	return root->d_sb->s_op->show_path(m, root);
5317
5318	seq_dentry(m, root, " \t\n\\");
5319	return `0`;
5320	}
5321
5322	static struct vfsmount lookup_mnt_in_ns(u64 id, struct* mnt_namespace *ns)
5323	{
5324	struct mount *mnt = mnt_find_id_at(ns, mnt_id: id);
5325
5326	if (!mnt \|\| mnt->mnt_id_unique != id)
5327	return NULL;
5328
5329	return &mnt->mnt;
5330	}
5331
/*
 * Kernel-side state for one statmount() request.
 */
struct kstatmount {
	/* Userspace destination buffer and its size as passed by the caller. */
	struct statmount __user *buf;
	size_t bufsize;
	/* Mount being described; looked up in do_statmount(). */
	struct vfsmount *mnt;
	/* Idmapping of @mnt, sampled once in do_statmount(). */
	struct mnt_idmap *idmap;
	/* Requested STATMOUNT_* bits (taken from the request's param field). */
	u64 mask;
	/* Root that the mount point path is printed relative to. */
	struct path root;
	/* Staging buffer for all string results. */
	struct seq_file seq;

	/* Must be last --ends in a flexible-array member. */
	struct statmount sm;
};
5344
5345	static u64 mnt_to_attr_flags(struct vfsmount *mnt)
5346	{
5347	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
5348	u64 attr_flags = `0`;
5349
5350	if (mnt_flags & MNT_READONLY)
5351	attr_flags \|= MOUNT_ATTR_RDONLY;
5352	if (mnt_flags & MNT_NOSUID)
5353	attr_flags \|= MOUNT_ATTR_NOSUID;
5354	if (mnt_flags & MNT_NODEV)
5355	attr_flags \|= MOUNT_ATTR_NODEV;
5356	if (mnt_flags & MNT_NOEXEC)
5357	attr_flags \|= MOUNT_ATTR_NOEXEC;
5358	if (mnt_flags & MNT_NODIRATIME)
5359	attr_flags \|= MOUNT_ATTR_NODIRATIME;
5360	if (mnt_flags & MNT_NOSYMFOLLOW)
5361	attr_flags \|= MOUNT_ATTR_NOSYMFOLLOW;
5362
5363	if (mnt_flags & MNT_NOATIME)
5364	attr_flags \|= MOUNT_ATTR_NOATIME;
5365	else if (mnt_flags & MNT_RELATIME)
5366	attr_flags \|= MOUNT_ATTR_RELATIME;
5367	else
5368	attr_flags \|= MOUNT_ATTR_STRICTATIME;
5369
5370	if (is_idmapped_mnt(mnt))
5371	attr_flags \|= MOUNT_ATTR_IDMAP;
5372
5373	return attr_flags;
5374	}
5375
5376	static u64 mnt_to_propagation_flags(struct mount *m)
5377	{
5378	u64 propagation = `0`;
5379
5380	if (IS_MNT_SHARED(m))
5381	propagation \|= MS_SHARED;
5382	if (IS_MNT_SLAVE(m))
5383	propagation \|= MS_SLAVE;
5384	if (IS_MNT_UNBINDABLE(m))
5385	propagation \|= MS_UNBINDABLE;
5386	if (!propagation)
5387	propagation \|= MS_PRIVATE;
5388
5389	return propagation;
5390	}
5391
5392	static void statmount_sb_basic(struct kstatmount *s)
5393	{
5394	struct super_block *sb = s->mnt->mnt_sb;
5395
5396	s->sm.mask \|= STATMOUNT_SB_BASIC;
5397	s->sm.sb_dev_major = MAJOR(sb->s_dev);
5398	s->sm.sb_dev_minor = MINOR(sb->s_dev);
5399	s->sm.sb_magic = sb->s_magic;
5400	s->sm.sb_flags = sb->s_flags & (SB_RDONLY\|SB_SYNCHRONOUS\|SB_DIRSYNC\|SB_LAZYTIME);
5401	}
5402
5403	static void statmount_mnt_basic(struct kstatmount *s)
5404	{
5405	struct mount *m = real_mount(mnt: s->mnt);
5406
5407	s->sm.mask \|= STATMOUNT_MNT_BASIC;
5408	s->sm.mnt_id = m->mnt_id_unique;
5409	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
5410	s->sm.mnt_id_old = m->mnt_id;
5411	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
5412	s->sm.mnt_attr = mnt_to_attr_flags(mnt: &m->mnt);
5413	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
5414	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : `0`;
5415	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : `0`;
5416	}
5417
5418	static void statmount_propagate_from(struct kstatmount *s)
5419	{
5420	struct mount *m = real_mount(mnt: s->mnt);
5421
5422	s->sm.mask \|= STATMOUNT_PROPAGATE_FROM;
5423	if (IS_MNT_SLAVE(m))
5424	s->sm.propagate_from = get_dominating_id(mnt: m, root: &current->fs->root);
5425	}
5426
5427	static int statmount_mnt_root(struct kstatmount s, struct* seq_file *seq)
5428	{
5429	int ret;
5430	size_t start = seq->count;
5431
5432	ret = show_path(m: seq, root: s->mnt->mnt_root);
5433	if (ret)
5434	return ret;
5435
5436	if (unlikely(seq_has_overflowed(seq)))
5437	return -EAGAIN;
5438
5439	/*
5440	* Unescape the result. It would be better if supplied string was not
5441	* escaped in the first place, but that's a pretty invasive change.
5442	*/
5443	seq->buf[seq->count] = `'\0'`;
5444	seq->count = start;
5445	seq_commit(m: seq, num: string_unescape_inplace(buf: seq->buf + start, UNESCAPE_OCTAL));
5446	return `0`;
5447	}
5448
5449	static int statmount_mnt_point(struct kstatmount s, struct* seq_file *seq)
5450	{
5451	struct vfsmount *mnt = s->mnt;
5452	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
5453	int err;
5454
5455	err = seq_path_root(m: seq, path: &mnt_path, root: &s->root, esc: "");
5456	return err == SEQ_SKIP ? `0` : err;
5457	}
5458
5459	static int statmount_fs_type(struct kstatmount s, struct* seq_file *seq)
5460	{
5461	struct super_block *sb = s->mnt->mnt_sb;
5462
5463	seq_puts(m: seq, s: sb->s_type->name);
5464	return `0`;
5465	}
5466
5467	static void statmount_fs_subtype(struct kstatmount s, struct* seq_file *seq)
5468	{
5469	struct super_block *sb = s->mnt->mnt_sb;
5470
5471	if (sb->s_subtype)
5472	seq_puts(m: seq, s: sb->s_subtype);
5473	}
5474
5475	static int statmount_sb_source(struct kstatmount s, struct* seq_file *seq)
5476	{
5477	struct super_block *sb = s->mnt->mnt_sb;
5478	struct mount *r = real_mount(mnt: s->mnt);
5479
5480	if (sb->s_op->show_devname) {
5481	size_t start = seq->count;
5482	int ret;
5483
5484	ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
5485	if (ret)
5486	return ret;
5487
5488	if (unlikely(seq_has_overflowed(seq)))
5489	return -EAGAIN;
5490
5491	/ Unescape the result /
5492	seq->buf[seq->count] = `'\0'`;
5493	seq->count = start;
5494	seq_commit(m: seq, num: string_unescape_inplace(buf: seq->buf + start, UNESCAPE_OCTAL));
5495	} else {
5496	seq_puts(m: seq, s: r->mnt_devname);
5497	}
5498	return `0`;
5499	}
5500
5501	static void statmount_mnt_ns_id(struct kstatmount s, struct* mnt_namespace *ns)
5502	{
5503	s->sm.mask \|= STATMOUNT_MNT_NS_ID;
5504	s->sm.mnt_ns_id = ns->seq;
5505	}
5506
5507	static int statmount_mnt_opts(struct kstatmount s, struct* seq_file *seq)
5508	{
5509	struct vfsmount *mnt = s->mnt;
5510	struct super_block *sb = mnt->mnt_sb;
5511	size_t start = seq->count;
5512	int err;
5513
5514	err = security_sb_show_options(m: seq, sb);
5515	if (err)
5516	return err;
5517
5518	if (sb->s_op->show_options) {
5519	err = sb->s_op->show_options(seq, mnt->mnt_root);
5520	if (err)
5521	return err;
5522	}
5523
5524	if (unlikely(seq_has_overflowed(seq)))
5525	return -EAGAIN;
5526
5527	if (seq->count == start)
5528	return `0`;
5529
5530	/ skip leading comma /
5531	memmove(seq->buf + start, seq->buf + start + `1`,
5532	seq->count - start - `1`);
5533	seq->count--;
5534
5535	return `0`;
5536	}
5537
5538	static inline int statmount_opt_process(struct seq_file *seq, size_t start)
5539	{
5540	char buf_end, opt_end, src, dst;
5541	int count = `0`;
5542
5543	if (unlikely(seq_has_overflowed(seq)))
5544	return -EAGAIN;
5545
5546	buf_end = seq->buf + seq->count;
5547	dst = seq->buf + start;
5548	src = dst + `1`; / skip initial comma /
5549
5550	if (src >= buf_end) {
5551	seq->count = start;
5552	return `0`;
5553	}
5554
5555	*buf_end = `'\0'`;
5556	for (; src < buf_end; src = opt_end + `1`) {
5557	opt_end = strchrnul(src, `','`);
5558	*opt_end = `'\0'`;
5559	dst += string_unescape(src, dst, size: `0`, UNESCAPE_OCTAL) + `1`;
5560	if (WARN_ON_ONCE(++count == INT_MAX))
5561	return -EOVERFLOW;
5562	}
5563	seq->count = dst - `1` - seq->buf;
5564	return count;
5565	}
5566
5567	static int statmount_opt_array(struct kstatmount s, struct* seq_file *seq)
5568	{
5569	struct vfsmount *mnt = s->mnt;
5570	struct super_block *sb = mnt->mnt_sb;
5571	size_t start = seq->count;
5572	int err;
5573
5574	if (!sb->s_op->show_options)
5575	return `0`;
5576
5577	err = sb->s_op->show_options(seq, mnt->mnt_root);
5578	if (err)
5579	return err;
5580
5581	err = statmount_opt_process(seq, start);
5582	if (err < `0`)
5583	return err;
5584
5585	s->sm.opt_num = err;
5586	return `0`;
5587	}
5588
5589	static int statmount_opt_sec_array(struct kstatmount s, struct* seq_file *seq)
5590	{
5591	struct vfsmount *mnt = s->mnt;
5592	struct super_block *sb = mnt->mnt_sb;
5593	size_t start = seq->count;
5594	int err;
5595
5596	err = security_sb_show_options(m: seq, sb);
5597	if (err)
5598	return err;
5599
5600	err = statmount_opt_process(seq, start);
5601	if (err < `0`)
5602	return err;
5603
5604	s->sm.opt_sec_num = err;
5605	return `0`;
5606	}
5607
5608	static inline int statmount_mnt_uidmap(struct kstatmount s, struct* seq_file *seq)
5609	{
5610	int ret;
5611
5612	ret = statmount_mnt_idmap(idmap: s->idmap, seq, uid_map: true);
5613	if (ret < `0`)
5614	return ret;
5615
5616	s->sm.mnt_uidmap_num = ret;
5617	/*
5618	* Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
5619	* mappings. This allows userspace to distinguish between a
5620	* non-idmapped mount and an idmapped mount where none of the
5621	* individual mappings are valid in the caller's idmapping.
5622	*/
5623	if (is_valid_mnt_idmap(idmap: s->idmap))
5624	s->sm.mask \|= STATMOUNT_MNT_UIDMAP;
5625	return `0`;
5626	}
5627
5628	static inline int statmount_mnt_gidmap(struct kstatmount s, struct* seq_file *seq)
5629	{
5630	int ret;
5631
5632	ret = statmount_mnt_idmap(idmap: s->idmap, seq, uid_map: false);
5633	if (ret < `0`)
5634	return ret;
5635
5636	s->sm.mnt_gidmap_num = ret;
5637	/*
5638	* Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
5639	* mappings. This allows userspace to distinguish between a
5640	* non-idmapped mount and an idmapped mount where none of the
5641	* individual mappings are valid in the caller's idmapping.
5642	*/
5643	if (is_valid_mnt_idmap(idmap: s->idmap))
5644	s->sm.mask \|= STATMOUNT_MNT_GIDMAP;
5645	return `0`;
5646	}
5647
5648	static int statmount_string(struct kstatmount *s, u64 flag)
5649	{
5650	int ret = `0`;
5651	size_t kbufsize;
5652	struct seq_file *seq = &s->seq;
5653	struct statmount *sm = &s->sm;
5654	u32 start, *offp;
5655
5656	/ Reserve an empty string at the beginning for any unset offsets /
5657	if (!seq->count)
5658	seq_putc(m: seq, c: `0`);
5659
5660	start = seq->count;
5661
5662	switch (flag) {
5663	case STATMOUNT_FS_TYPE:
5664	offp = &sm->fs_type;
5665	ret = statmount_fs_type(s, seq);
5666	break;
5667	case STATMOUNT_MNT_ROOT:
5668	offp = &sm->mnt_root;
5669	ret = statmount_mnt_root(s, seq);
5670	break;
5671	case STATMOUNT_MNT_POINT:
5672	offp = &sm->mnt_point;
5673	ret = statmount_mnt_point(s, seq);
5674	break;
5675	case STATMOUNT_MNT_OPTS:
5676	offp = &sm->mnt_opts;
5677	ret = statmount_mnt_opts(s, seq);
5678	break;
5679	case STATMOUNT_OPT_ARRAY:
5680	offp = &sm->opt_array;
5681	ret = statmount_opt_array(s, seq);
5682	break;
5683	case STATMOUNT_OPT_SEC_ARRAY:
5684	offp = &sm->opt_sec_array;
5685	ret = statmount_opt_sec_array(s, seq);
5686	break;
5687	case STATMOUNT_FS_SUBTYPE:
5688	offp = &sm->fs_subtype;
5689	statmount_fs_subtype(s, seq);
5690	break;
5691	case STATMOUNT_SB_SOURCE:
5692	offp = &sm->sb_source;
5693	ret = statmount_sb_source(s, seq);
5694	break;
5695	case STATMOUNT_MNT_UIDMAP:
5696	sm->mnt_uidmap = start;
5697	ret = statmount_mnt_uidmap(s, seq);
5698	break;
5699	case STATMOUNT_MNT_GIDMAP:
5700	sm->mnt_gidmap = start;
5701	ret = statmount_mnt_gidmap(s, seq);
5702	break;
5703	default:
5704	WARN_ON_ONCE(true);
5705	return -EINVAL;
5706	}
5707
5708	/*
5709	* If nothing was emitted, return to avoid setting the flag
5710	* and terminating the buffer.
5711	*/
5712	if (seq->count == start)
5713	return ret;
5714	if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
5715	return -EOVERFLOW;
5716	if (kbufsize >= s->bufsize)
5717	return -EOVERFLOW;
5718
5719	/ signal a retry /
5720	if (unlikely(seq_has_overflowed(seq)))
5721	return -EAGAIN;
5722
5723	if (ret)
5724	return ret;
5725
5726	seq->buf[seq->count++] = `'\0'`;
5727	sm->mask \|= flag;
5728	*offp = start;
5729	return `0`;
5730	}
5731
5732	static int copy_statmount_to_user(struct kstatmount *s)
5733	{
5734	struct statmount *sm = &s->sm;
5735	struct seq_file *seq = &s->seq;
5736	char __user str = ((char* __user )s->buf) + sizeof(sm);
5737	size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
5738
5739	if (seq->count && copy_to_user(to: str, from: seq->buf, n: seq->count))
5740	return -EFAULT;
5741
5742	/ Return the number of bytes copied to the buffer /
5743	sm->size = copysize + seq->count;
5744	if (copy_to_user(to: s->buf, from: sm, n: copysize))
5745	return -EFAULT;
5746
5747	return `0`;
5748	}
5749
5750	static struct mount listmnt_next(struct* mount *curr, bool reverse)
5751	{
5752	struct rb_node *node;
5753
5754	if (reverse)
5755	node = rb_prev(&curr->mnt_node);
5756	else
5757	node = rb_next(&curr->mnt_node);
5758
5759	return node_to_mount(node);
5760	}
5761
5762	static int grab_requested_root(struct mnt_namespace ns, struct* path *root)
5763	{
5764	struct mount first, child;
5765
5766	rwsem_assert_held(sem: &namespace_sem);
5767
5768	/ We're looking at our own ns, just use get_fs_root. /
5769	if (ns == current->nsproxy->mnt_ns) {
5770	get_fs_root(current->fs, root);
5771	return `0`;
5772	}
5773
5774	/*
5775	* We have to find the first mount in our ns and use that, however it
5776	* may not exist, so handle that properly.
5777	*/
5778	if (mnt_ns_empty(ns))
5779	return -ENOENT;
5780
5781	first = child = ns->root;
5782	for (;;) {
5783	child = listmnt_next(curr: child, reverse: false);
5784	if (!child)
5785	return -ENOENT;
5786	if (child->mnt_parent == first)
5787	break;
5788	}
5789
5790	root->mnt = mntget(&child->mnt);
5791	root->dentry = dget(dentry: root->mnt->mnt_root);
5792	return `0`;
5793	}
5794
/*
 * Every STATMOUNT_* flag this kernel can answer.
 * This must be updated whenever a new flag is added.
 */
#define STATMOUNT_SUPPORTED \
	(STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC | \
	 STATMOUNT_PROPAGATE_FROM | STATMOUNT_MNT_ROOT | \
	 STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE | \
	 STATMOUNT_MNT_NS_ID | STATMOUNT_MNT_OPTS | \
	 STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
	 STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
	 STATMOUNT_SUPPORTED_MASK | \
	 STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
5811
5812	static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
5813	struct mnt_namespace *ns)
5814	{
5815	struct path root __free(path_put) = {};
5816	struct mount *m;
5817	int err;
5818
5819	/ Has the namespace already been emptied? /
5820	if (mnt_ns_id && mnt_ns_empty(ns))
5821	return -ENOENT;
5822
5823	s->mnt = lookup_mnt_in_ns(id: mnt_id, ns);
5824	if (!s->mnt)
5825	return -ENOENT;
5826
5827	err = grab_requested_root(ns, root: &root);
5828	if (err)
5829	return err;
5830
5831	/*
5832	* Don't trigger audit denials. We just want to determine what
5833	* mounts to show users.
5834	*/
5835	m = real_mount(mnt: s->mnt);
5836	if (!is_path_reachable(mnt: m, dentry: m->mnt.mnt_root, root: &root) &&
5837	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
5838	return -EPERM;
5839
5840	err = security_sb_statfs(dentry: s->mnt->mnt_root);
5841	if (err)
5842	return err;
5843
5844	s->root = root;
5845
5846	/*
5847	* Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
5848	* can change concurrently as we only hold the read-side of the
5849	* namespace semaphore and mount properties may change with only
5850	* the mount lock held.
5851	*
5852	* We could sample the mount lock sequence counter to detect
5853	* those changes and retry. But it's not worth it. Worst that
5854	* happens is that the mnt->mnt_idmap pointer is already changed
5855	* while mnt->mnt_flags isn't or vica versa. So what.
5856	*
5857	* Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
5858	* via READ_ONCE()/WRITE_ONCE() and guard against theoretical
5859	* torn read/write. That's all we care about right now.
5860	*/
5861	s->idmap = mnt_idmap(mnt: s->mnt);
5862	if (s->mask & STATMOUNT_MNT_BASIC)
5863	statmount_mnt_basic(s);
5864
5865	if (s->mask & STATMOUNT_SB_BASIC)
5866	statmount_sb_basic(s);
5867
5868	if (s->mask & STATMOUNT_PROPAGATE_FROM)
5869	statmount_propagate_from(s);
5870
5871	if (s->mask & STATMOUNT_FS_TYPE)
5872	err = statmount_string(s, STATMOUNT_FS_TYPE);
5873
5874	if (!err && s->mask & STATMOUNT_MNT_ROOT)
5875	err = statmount_string(s, STATMOUNT_MNT_ROOT);
5876
5877	if (!err && s->mask & STATMOUNT_MNT_POINT)
5878	err = statmount_string(s, STATMOUNT_MNT_POINT);
5879
5880	if (!err && s->mask & STATMOUNT_MNT_OPTS)
5881	err = statmount_string(s, STATMOUNT_MNT_OPTS);
5882
5883	if (!err && s->mask & STATMOUNT_OPT_ARRAY)
5884	err = statmount_string(s, STATMOUNT_OPT_ARRAY);
5885
5886	if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
5887	err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
5888
5889	if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
5890	err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
5891
5892	if (!err && s->mask & STATMOUNT_SB_SOURCE)
5893	err = statmount_string(s, STATMOUNT_SB_SOURCE);
5894
5895	if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
5896	err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
5897
5898	if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
5899	err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
5900
5901	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
5902	statmount_mnt_ns_id(s, ns);
5903
5904	if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
5905	s->sm.mask \|= STATMOUNT_SUPPORTED_MASK;
5906	s->sm.supported_mask = STATMOUNT_SUPPORTED;
5907	}
5908
5909	if (err)
5910	return err;
5911
5912	/ Are there bits in the return mask not present in STATMOUNT_SUPPORTED? /
5913	WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
5914
5915	return `0`;
5916	}
5917
5918	static inline bool retry_statmount(const long ret, size_t *seq_size)
5919	{
5920	if (likely(ret != -EAGAIN))
5921	return false;
5922	if (unlikely(check_mul_overflow(*seq_size, `2`, seq_size)))
5923	return false;
5924	if (unlikely(*seq_size > MAX_RW_COUNT))
5925	return false;
5926	return true;
5927	}
5928
/* Request bits whose results are stored in the string staging buffer. */
#define STATMOUNT_STRING_REQ \
	(STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
	 STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
	 STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
	 STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
	 STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
5934
5935	static int prepare_kstatmount(struct kstatmount ks, struct* mnt_id_req *kreq,
5936	struct statmount __user *buf, size_t bufsize,
5937	size_t seq_size)
5938	{
5939	if (!access_ok(buf, bufsize))
5940	return -EFAULT;
5941
5942	memset(ks, `0`, sizeof(*ks));
5943	ks->mask = kreq->param;
5944	ks->buf = buf;
5945	ks->bufsize = bufsize;
5946
5947	if (ks->mask & STATMOUNT_STRING_REQ) {
5948	if (bufsize == sizeof(ks->sm))
5949	return -EOVERFLOW;
5950
5951	ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
5952	if (!ks->seq.buf)
5953	return -ENOMEM;
5954
5955	ks->seq.size = seq_size;
5956	}
5957
5958	return `0`;
5959	}
5960
5961	static int copy_mnt_id_req(const struct mnt_id_req __user *req,
5962	struct mnt_id_req *kreq)
5963	{
5964	int ret;
5965	size_t usize;
5966
5967	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
5968
5969	ret = get_user(usize, &req->size);
5970	if (ret)
5971	return -EFAULT;
5972	if (unlikely(usize > PAGE_SIZE))
5973	return -E2BIG;
5974	if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
5975	return -EINVAL;
5976	memset(kreq, `0`, sizeof(*kreq));
5977	ret = copy_struct_from_user(dst: kreq, ksize: sizeof(*kreq), src: req, usize);
5978	if (ret)
5979	return ret;
5980	if (kreq->spare != `0`)
5981	return -EINVAL;
5982	/ The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. /
5983	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
5984	return -EINVAL;
5985	return `0`;
5986	}
5987
5988	/*
5989	* If the user requested a specific mount namespace id, look that up and return
5990	* that, or if not simply grab a passive reference on our mount namespace and
5991	* return that.
5992	*/
5993	static struct mnt_namespace grab_requested_mnt_ns(const* struct mnt_id_req *kreq)
5994	{
5995	struct mnt_namespace *mnt_ns;
5996
5997	if (kreq->mnt_ns_id && kreq->spare)
5998	return ERR_PTR(error: -EINVAL);
5999
6000	if (kreq->mnt_ns_id)
6001	return lookup_mnt_ns(mnt_ns_id: kreq->mnt_ns_id);
6002
6003	if (kreq->spare) {
6004	struct ns_common *ns;
6005
6006	CLASS(fd, f)(fd: kreq->spare);
6007	if (fd_empty(f))
6008	return ERR_PTR(error: -EBADF);
6009
6010	if (!proc_ns_file(fd_file(f)))
6011	return ERR_PTR(error: -EINVAL);
6012
6013	ns = get_proc_ns(file_inode(fd_file(f)));
6014	if (ns->ops->type != CLONE_NEWNS)
6015	return ERR_PTR(error: -EINVAL);
6016
6017	mnt_ns = to_mnt_ns(ns);
6018	} else {
6019	mnt_ns = current->nsproxy->mnt_ns;
6020	}
6021
6022	refcount_inc(r: &mnt_ns->passive);
6023	return mnt_ns;
6024	}
6025
6026	SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
6027	struct statmount __user *, buf, size_t, bufsize,
6028	unsigned int, flags)
6029	{
6030	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
6031	struct kstatmount *ks __free(kfree) = NULL;
6032	struct mnt_id_req kreq;
6033	/ We currently support retrieval of 3 strings. /
6034	size_t seq_size = `3` * PATH_MAX;
6035	int ret;
6036
6037	if (flags)
6038	return -EINVAL;
6039
6040	ret = copy_mnt_id_req(req, kreq: &kreq);
6041	if (ret)
6042	return ret;
6043
6044	ns = grab_requested_mnt_ns(kreq: &kreq);
6045	if (!ns)
6046	return -ENOENT;
6047
6048	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
6049	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
6050	return -ENOENT;
6051
6052	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
6053	if (!ks)
6054	return -ENOMEM;
6055
6056	retry:
6057	ret = prepare_kstatmount(ks, kreq: &kreq, buf, bufsize, seq_size);
6058	if (ret)
6059	return ret;
6060
6061	scoped_guard(rwsem_read, &namespace_sem)
6062	ret = do_statmount(s: ks, mnt_id: kreq.mnt_id, mnt_ns_id: kreq.mnt_ns_id, ns);
6063
6064	if (!ret)
6065	ret = copy_statmount_to_user(s: ks);
6066	kvfree(addr: ks->seq.buf);
6067	if (retry_statmount(ret, seq_size: &seq_size))
6068	goto retry;
6069	return ret;
6070	}
6071
6072	static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
6073	u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
6074	bool reverse)
6075	{
6076	struct path root __free(path_put) = {};
6077	struct path orig;
6078	struct mount r, first;
6079	ssize_t ret;
6080
6081	rwsem_assert_held(sem: &namespace_sem);
6082
6083	ret = grab_requested_root(ns, root: &root);
6084	if (ret)
6085	return ret;
6086
6087	if (mnt_parent_id == LSMT_ROOT) {
6088	orig = root;
6089	} else {
6090	orig.mnt = lookup_mnt_in_ns(id: mnt_parent_id, ns);
6091	if (!orig.mnt)
6092	return -ENOENT;
6093	orig.dentry = orig.mnt->mnt_root;
6094	}
6095
6096	/*
6097	* Don't trigger audit denials. We just want to determine what
6098	* mounts to show users.
6099	*/
6100	if (!is_path_reachable(mnt: real_mount(mnt: orig.mnt), dentry: orig.dentry, root: &root) &&
6101	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
6102	return -EPERM;
6103
6104	ret = security_sb_statfs(dentry: orig.dentry);
6105	if (ret)
6106	return ret;
6107
6108	if (!last_mnt_id) {
6109	if (reverse)
6110	first = node_to_mount(node: ns->mnt_last_node);
6111	else
6112	first = node_to_mount(node: ns->mnt_first_node);
6113	} else {
6114	if (reverse)
6115	first = mnt_find_id_at_reverse(ns, mnt_id: last_mnt_id - `1`);
6116	else
6117	first = mnt_find_id_at(ns, mnt_id: last_mnt_id + `1`);
6118	}
6119
6120	for (ret = `0`, r = first; r && nr_mnt_ids; r = listmnt_next(curr: r, reverse)) {
6121	if (r->mnt_id_unique == mnt_parent_id)
6122	continue;
6123	if (!is_path_reachable(mnt: r, dentry: r->mnt.mnt_root, root: &orig))
6124	continue;
6125	*mnt_ids = r->mnt_id_unique;
6126	mnt_ids++;
6127	nr_mnt_ids--;
6128	ret++;
6129	}
6130	return ret;
6131	}
6132
6133	SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
6134	u64 __user , mnt_ids, size_t, nr_mnt_ids, unsigned* int, flags)
6135	{
6136	u64 *kmnt_ids __free(kvfree) = NULL;
6137	const size_t maxcount = `1000000`;
6138	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
6139	struct mnt_id_req kreq;
6140	u64 last_mnt_id;
6141	ssize_t ret;
6142
6143	if (flags & ~LISTMOUNT_REVERSE)
6144	return -EINVAL;
6145
6146	/*
6147	* If the mount namespace really has more than 1 million mounts the
6148	* caller must iterate over the mount namespace (and reconsider their
6149	* system design...).
6150	*/
6151	if (unlikely(nr_mnt_ids > maxcount))
6152	return -EOVERFLOW;
6153
6154	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
6155	return -EFAULT;
6156
6157	ret = copy_mnt_id_req(req, kreq: &kreq);
6158	if (ret)
6159	return ret;
6160
6161	last_mnt_id = kreq.param;
6162	/ The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. /
6163	if (last_mnt_id != `0` && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
6164	return -EINVAL;
6165
6166	kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
6167	GFP_KERNEL_ACCOUNT);
6168	if (!kmnt_ids)
6169	return -ENOMEM;
6170
6171	ns = grab_requested_mnt_ns(kreq: &kreq);
6172	if (!ns)
6173	return -ENOENT;
6174
6175	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
6176	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
6177	return -ENOENT;
6178
6179	/*
6180	* We only need to guard against mount topology changes as
6181	* listmount() doesn't care about any mount properties.
6182	*/
6183	scoped_guard(rwsem_read, &namespace_sem)
6184	ret = do_listmount(ns, mnt_parent_id: kreq.mnt_id, last_mnt_id, mnt_ids: kmnt_ids,
6185	nr_mnt_ids, reverse: (flags & LISTMOUNT_REVERSE));
6186	if (ret <= `0`)
6187	return ret;
6188
6189	if (copy_to_user(to: mnt_ids, from: kmnt_ids, n: ret * sizeof(*mnt_ids)))
6190	return -EFAULT;
6191
6192	return ret;
6193	}
6194
6195	static void __init init_mount_tree(void)
6196	{
6197	struct vfsmount *mnt;
6198	struct mount *m;
6199	struct mnt_namespace *ns;
6200	struct path root;
6201
6202	mnt = vfs_kern_mount(&rootfs_fs_type, `0`, "rootfs", NULL);
6203	if (IS_ERR(ptr: mnt))
6204	panic(fmt: "Can't create rootfs");
6205
6206	ns = alloc_mnt_ns(user_ns: &init_user_ns, anon: false);
6207	if (IS_ERR(ptr: ns))
6208	panic(fmt: "Can't allocate initial namespace");
6209	m = real_mount(mnt);
6210	ns->root = m;
6211	ns->nr_mounts = `1`;
6212	mnt_add_to_ns(ns, mnt: m);
6213	init_task.nsproxy->mnt_ns = ns;
6214	get_mnt_ns(ns);
6215
6216	root.mnt = mnt;
6217	root.dentry = mnt->mnt_root;
6218	mnt->mnt_flags \|= MNT_LOCKED;
6219
6220	set_fs_pwd(current->fs, &root);
6221	set_fs_root(current->fs, &root);
6222
6223	mnt_ns_tree_add(ns);
6224	}
6225
6226	void __init mnt_init(void)
6227	{
6228	int err;
6229
6230	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
6231	`0`, SLAB_HWCACHE_ALIGN\|SLAB_PANIC\|SLAB_ACCOUNT, NULL);
6232
6233	mount_hashtable = alloc_large_system_hash(tablename: "Mount-cache",
6234	bucketsize: sizeof(struct hlist_head),
6235	numentries: mhash_entries, scale: `19`,
6236	HASH_ZERO,
6237	hash_shift: &m_hash_shift, hash_mask: &m_hash_mask, low_limit: `0`, high_limit: `0`);
6238	mountpoint_hashtable = alloc_large_system_hash(tablename: "Mountpoint-cache",
6239	bucketsize: sizeof(struct hlist_head),
6240	numentries: mphash_entries, scale: `19`,
6241	HASH_ZERO,
6242	hash_shift: &mp_hash_shift, hash_mask: &mp_hash_mask, low_limit: `0`, high_limit: `0`);
6243
6244	if (!mount_hashtable \|\| !mountpoint_hashtable)
6245	panic(fmt: "Failed to allocate mount hash table\n");
6246
6247	kernfs_init();
6248
6249	err = sysfs_init();
6250	if (err)
6251	printk(KERN_WARNING "%s: sysfs_init error: %d\n",
6252	__func__, err);
6253	fs_kobj = kobject_create_and_add(name: "fs", NULL);
6254	if (!fs_kobj)
6255	printk(KERN_WARNING "%s: kobj create error\n", __func__);
6256	shmem_init();
6257	init_rootfs();
6258	init_mount_tree();
6259	}
6260
6261	void put_mnt_ns(struct mnt_namespace *ns)
6262	{
6263	if (!refcount_dec_and_test(r: &ns->ns.count))
6264	return;
6265	drop_collected_mounts(mnt: &ns->root->mnt);
6266	free_mnt_ns(ns);
6267	}
6268
6269	struct vfsmount kern_mount(struct* file_system_type *type)
6270	{
6271	struct vfsmount *mnt;
6272	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
6273	if (!IS_ERR(ptr: mnt)) {
6274	/*
6275	* it is a longterm mount, don't release mnt until
6276	* we unmount before file sys is unregistered
6277	*/
6278	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
6279	}
6280	return mnt;
6281	}
6282	EXPORT_SYMBOL_GPL(kern_mount);
6283
/*
 * Release a mount obtained from kern_mount().  An ERR_PTR() is accepted
 * and ignored.
 */
void kern_unmount(struct vfsmount *mnt)
{
	if (IS_ERR(mnt))
		return;

	/* release long term mount so mount point can be released */
	mnt_make_shortterm(mnt);
	synchronize_rcu();	/* yecchhh... */
	mntput(mnt);
}
6293	EXPORT_SYMBOL(kern_unmount);
6294
/*
 * Release @num kernel mounts at once: all of them are made short-term
 * first, so that a single expedited RCU synchronisation covers the whole
 * batch, and only then are the references dropped.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
	unsigned int idx;

	for (idx = 0; idx < num; idx++)
		mnt_make_shortterm(mnt[idx]);

	synchronize_rcu_expedited();

	for (idx = 0; idx < num; idx++)
		mntput(mnt[idx]);
}
6305	EXPORT_SYMBOL(kern_unmount_array);
6306
6307	bool our_mnt(struct vfsmount *mnt)
6308	{
6309	return check_mnt(mnt: real_mount(mnt));
6310	}
6311
6312	bool current_chrooted(void)
6313	{
6314	/ Does the current process have a non-standard root /
6315	struct path ns_root;
6316	struct path fs_root;
6317	bool chrooted;
6318
6319	/ Find the namespace root /
6320	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
6321	ns_root.dentry = ns_root.mnt->mnt_root;
6322	path_get(&ns_root);
6323	while (d_mountpoint(dentry: ns_root.dentry) && follow_down_one(&ns_root))
6324	;
6325
6326	get_fs_root(current->fs, root: &fs_root);
6327
6328	chrooted = !path_equal(path1: &fs_root, path2: &ns_root);
6329
6330	path_put(&fs_root);
6331	path_put(&ns_root);
6332
6333	return chrooted;
6334	}
6335
6336	static bool mnt_already_visible(struct mnt_namespace *ns,
6337	const struct super_block *sb,
6338	int *new_mnt_flags)
6339	{
6340	int new_flags = *new_mnt_flags;
6341	struct mount mnt, n;
6342	bool visible = false;
6343
6344	down_read(sem: &namespace_sem);
6345	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
6346	struct mount *child;
6347	int mnt_flags;
6348
6349	if (mnt->mnt.mnt_sb->s_type != sb->s_type)
6350	continue;
6351
6352	/ This mount is not fully visible if it's root directory*
6353	* is not the root directory of the filesystem.
6354	*/
6355	if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
6356	continue;
6357
6358	/ A local view of the mount flags /
6359	mnt_flags = mnt->mnt.mnt_flags;
6360
6361	/ Don't miss readonly hidden in the superblock flags /
6362	if (sb_rdonly(sb: mnt->mnt.mnt_sb))
6363	mnt_flags \|= MNT_LOCK_READONLY;
6364
6365	/ Verify the mount flags are equal to or more permissive*
6366	* than the proposed new mount.
6367	*/
6368	if ((mnt_flags & MNT_LOCK_READONLY) &&
6369	!(new_flags & MNT_READONLY))
6370	continue;
6371	if ((mnt_flags & MNT_LOCK_ATIME) &&
6372	((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
6373	continue;
6374
6375	/ This mount is not fully visible if there are any*
6376	* locked child mounts that cover anything except for
6377	* empty directories.
6378	*/
6379	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
6380	struct inode *inode = child->mnt_mountpoint->d_inode;
6381	/ Only worry about locked mounts /
6382	if (!(child->mnt.mnt_flags & MNT_LOCKED))
6383	continue;
6384	/ Is the directory permanently empty? /
6385	if (!is_empty_dir_inode(inode))
6386	goto next;
6387	}
6388	/ Preserve the locked attributes /
6389	*new_mnt_flags \|= mnt_flags & (MNT_LOCK_READONLY \| \
6390	MNT_LOCK_ATIME);
6391	visible = true;
6392	goto found;
6393	next: ;
6394	}
6395	found:
6396	up_read(sem: &namespace_sem);
6397	return visible;
6398	}
6399
6400	static bool mount_too_revealing(const struct super_block sb, int* *new_mnt_flags)
6401	{
6402	const unsigned long required_iflags = SB_I_NOEXEC \| SB_I_NODEV;
6403	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
6404	unsigned long s_iflags;
6405
6406	if (ns->user_ns == &init_user_ns)
6407	return false;
6408
6409	/ Can this filesystem be too revealing? /
6410	s_iflags = sb->s_iflags;
6411	if (!(s_iflags & SB_I_USERNS_VISIBLE))
6412	return false;
6413
6414	if ((s_iflags & required_iflags) != required_iflags) {
6415	WARN_ONCE(`1`, "Expected s_iflags to contain 0x%lx\n",
6416	required_iflags);
6417	return true;
6418	}
6419
6420	return !mnt_already_visible(ns, sb, new_mnt_flags);
6421	}
6422
6423	bool mnt_may_suid(struct vfsmount *mnt)
6424	{
6425	/*
6426	* Foreign mounts (accessed via fchdir or through /proc
6427	* symlinks) are always treated as if they are nosuid. This
6428	* prevents namespaces from trusting potentially unsafe
6429	* suid/sgid bits, file caps, or security labels that originate
6430	* in other namespaces.
6431	*/
6432	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(mnt: real_mount(mnt)) &&
6433	current_in_userns(target_ns: mnt->mnt_sb->s_user_ns);
6434	}
6435
6436	static struct ns_common mntns_get(struct* task_struct *task)
6437	{
6438	struct ns_common *ns = NULL;
6439	struct nsproxy *nsproxy;
6440
6441	task_lock(p: task);
6442	nsproxy = task->nsproxy;
6443	if (nsproxy) {
6444	ns = &nsproxy->mnt_ns->ns;
6445	get_mnt_ns(ns: to_mnt_ns(ns));
6446	}
6447	task_unlock(p: task);
6448
6449	return ns;
6450	}
6451
/*
 * proc_ns_operations ->put: drop a reference on the mount namespace that
 * embeds @ns.
 */
static void mntns_put(struct ns_common *ns)
{
	struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

	put_mnt_ns(mnt_ns);
}
6456
6457	static int mntns_install(struct nsset nsset, struct* ns_common *ns)
6458	{
6459	struct nsproxy *nsproxy = nsset->nsproxy;
6460	struct fs_struct *fs = nsset->fs;
6461	struct mnt_namespace mnt_ns = to_mnt_ns(ns), old_mnt_ns;
6462	struct user_namespace *user_ns = nsset->cred->user_ns;
6463	struct path root;
6464	int err;
6465
6466	if (!ns_capable(ns: mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
6467	!ns_capable(ns: user_ns, CAP_SYS_CHROOT) \|\|
6468	!ns_capable(ns: user_ns, CAP_SYS_ADMIN))
6469	return -EPERM;
6470
6471	if (is_anon_ns(ns: mnt_ns))
6472	return -EINVAL;
6473
6474	if (fs->users != `1`)
6475	return -EINVAL;
6476
6477	get_mnt_ns(ns: mnt_ns);
6478	old_mnt_ns = nsproxy->mnt_ns;
6479	nsproxy->mnt_ns = mnt_ns;
6480
6481	/ Find the root /
6482	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
6483	"/", LOOKUP_DOWN, &root);
6484	if (err) {
6485	/ revert to old namespace /
6486	nsproxy->mnt_ns = old_mnt_ns;
6487	put_mnt_ns(ns: mnt_ns);
6488	return err;
6489	}
6490
6491	put_mnt_ns(ns: old_mnt_ns);
6492
6493	/ Update the pwd and root /
6494	set_fs_pwd(fs, &root);
6495	set_fs_root(fs, &root);
6496
6497	path_put(&root);
6498	return `0`;
6499	}
6500
6501	static struct user_namespace mntns_owner(struct* ns_common *ns)
6502	{
6503	return to_mnt_ns(ns)->user_ns;
6504	}
6505
6506	const struct proc_ns_operations mntns_operations = {
6507	.name = "mnt",
6508	.type = CLONE_NEWNS,
6509	.get = mntns_get,
6510	.put = mntns_put,
6511	.install = mntns_install,
6512	.owner = mntns_owner,
6513	};
6514
6515	#ifdef CONFIG_SYSCTL
6516	static const struct ctl_table fs_namespace_sysctls[] = {
6517	{
6518	.procname = "mount-max",
6519	.data = &sysctl_mount_max,
6520	.maxlen = sizeof(unsigned int),
6521	.mode = `0644`,
6522	.proc_handler = proc_dointvec_minmax,
6523	.extra1 = SYSCTL_ONE,
6524	},
6525	};
6526
6527	static int __init init_fs_namespace_sysctls(void)
6528	{
6529	register_sysctl_init("fs", fs_namespace_sysctls);
6530	return `0`;
6531	}
6532	fs_initcall(init_fs_namespace_sysctls);
6533
6534	#endif /* CONFIG_SYSCTL */
6535

source code of linux/fs/namespace.c