open.c source code [linux/fs/open.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/open.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	#include <linux/string.h>
9	#include <linux/mm.h>
10	#include <linux/file.h>
11	#include <linux/fdtable.h>
12	#include <linux/fsnotify.h>
13	#include <linux/module.h>
14	#include <linux/tty.h>
15	#include <linux/namei.h>
16	#include <linux/backing-dev.h>
17	#include <linux/capability.h>
18	#include <linux/securebits.h>
19	#include <linux/security.h>
20	#include <linux/mount.h>
21	#include <linux/fcntl.h>
22	#include <linux/slab.h>
23	#include <linux/uaccess.h>
24	#include <linux/fs.h>
25	#include <linux/personality.h>
26	#include <linux/pagemap.h>
27	#include <linux/syscalls.h>
28	#include <linux/rcupdate.h>
29	#include <linux/audit.h>
30	#include <linux/falloc.h>
31	#include <linux/fs_struct.h>
32	#include <linux/dnotify.h>
33	#include <linux/compat.h>
34	#include <linux/mnt_idmapping.h>
35	#include <linux/filelock.h>
36
37	#include "internal.h"
38
39	int do_truncate(struct mnt_idmap idmap, struct* dentry *dentry,
40	loff_t length, unsigned int time_attrs, struct file *filp)
41	{
42	int ret;
43	struct iattr newattrs;
44
45	/ Not pretty: "inode->i_size" shouldn't really be signed. But it is. /
46	if (length < `0`)
47	return -EINVAL;
48
49	newattrs.ia_size = length;
50	newattrs.ia_valid = ATTR_SIZE \| time_attrs;
51	if (filp) {
52	newattrs.ia_file = filp;
53	newattrs.ia_valid \|= ATTR_FILE;
54	}
55
56	/ Remove suid, sgid, and file capabilities on truncate too /
57	ret = dentry_needs_remove_privs(idmap, dentry);
58	if (ret < `0`)
59	return ret;
60	if (ret)
61	newattrs.ia_valid \|= ret \| ATTR_FORCE;
62
63	ret = inode_lock_killable(inode: dentry->d_inode);
64	if (ret)
65	return ret;
66
67	/ Note any delegations or leases have already been broken: /
68	ret = notify_change(idmap, dentry, &newattrs, NULL);
69	inode_unlock(inode: dentry->d_inode);
70	return ret;
71	}
72
73	int vfs_truncate(const struct path *path, loff_t length)
74	{
75	struct mnt_idmap *idmap;
76	struct inode *inode;
77	int error;
78
79	inode = path->dentry->d_inode;
80
81	/ For directories it's -EISDIR, for other non-regulars - -EINVAL /
82	if (S_ISDIR(inode->i_mode))
83	return -EISDIR;
84	if (!S_ISREG(inode->i_mode))
85	return -EINVAL;
86
87	idmap = mnt_idmap(mnt: path->mnt);
88	error = inode_permission(idmap, inode, MAY_WRITE);
89	if (error)
90	return error;
91
92	error = fsnotify_truncate_perm(path, length);
93	if (error)
94	return error;
95
96	error = mnt_want_write(mnt: path->mnt);
97	if (error)
98	return error;
99
100	error = -EPERM;
101	if (IS_APPEND(inode))
102	goto mnt_drop_write_and_out;
103
104	error = get_write_access(inode);
105	if (error)
106	goto mnt_drop_write_and_out;
107
108	/*
109	* Make sure that there are no leases. get_write_access() protects
110	* against the truncate racing with a lease-granting setlease().
111	*/
112	error = break_lease(inode, O_WRONLY);
113	if (error)
114	goto put_write_and_out;
115
116	error = security_path_truncate(path);
117	if (!error)
118	error = do_truncate(idmap, dentry: path->dentry, length, time_attrs: `0`, NULL);
119
120	put_write_and_out:
121	put_write_access(inode);
122	mnt_drop_write_and_out:
123	mnt_drop_write(mnt: path->mnt);
124
125	return error;
126	}
127	EXPORT_SYMBOL_GPL(vfs_truncate);
128
129	int do_sys_truncate(const char __user *pathname, loff_t length)
130	{
131	unsigned int lookup_flags = LOOKUP_FOLLOW;
132	struct path path;
133	int error;
134
135	if (length < `0`) / sorry, but loff_t says... /
136	return -EINVAL;
137
138	retry:
139	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
140	if (!error) {
141	error = vfs_truncate(&path, length);
142	path_put(&path);
143	}
144	if (retry_estale(error, flags: lookup_flags)) {
145	lookup_flags \|= LOOKUP_REVAL;
146	goto retry;
147	}
148	return error;
149	}
150
151	SYSCALL_DEFINE2(truncate, const char __user , path, long*, length)
152	{
153	return do_sys_truncate(pathname: path, length);
154	}
155
156	#ifdef CONFIG_COMPAT
157	COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
158	{
159	return do_sys_truncate(pathname: path, length);
160	}
161	#endif
162
163	int do_ftruncate(struct file file, loff_t length, int* small)
164	{
165	struct inode *inode;
166	struct dentry *dentry;
167	int error;
168
169	/ explicitly opened as large or we are on 64-bit box /
170	if (file->f_flags & O_LARGEFILE)
171	small = `0`;
172
173	dentry = file->f_path.dentry;
174	inode = dentry->d_inode;
175	if (!S_ISREG(inode->i_mode) \|\| !(file->f_mode & FMODE_WRITE))
176	return -EINVAL;
177
178	/ Cannot ftruncate over 2^31 bytes without large file support /
179	if (small && length > MAX_NON_LFS)
180	return -EINVAL;
181
182	/ Check IS_APPEND on real upper inode /
183	if (IS_APPEND(file_inode(file)))
184	return -EPERM;
185
186	error = security_file_truncate(file);
187	if (error)
188	return error;
189
190	error = fsnotify_truncate_perm(path: &file->f_path, length);
191	if (error)
192	return error;
193
194	sb_start_write(sb: inode->i_sb);
195	error = do_truncate(idmap: file_mnt_idmap(file), dentry, length,
196	ATTR_MTIME \| ATTR_CTIME, filp: file);
197	sb_end_write(sb: inode->i_sb);
198
199	return error;
200	}
201
202	int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
203	{
204	if (length < `0`)
205	return -EINVAL;
206	CLASS(fd, f)(fd);
207	if (fd_empty(f))
208	return -EBADF;
209
210	return do_ftruncate(fd_file(f), length, small);
211	}
212
213	SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
214	{
215	return do_sys_ftruncate(fd, length, small: `1`);
216	}
217
218	#ifdef CONFIG_COMPAT
219	COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
220	{
221	return do_sys_ftruncate(fd, length, small: `1`);
222	}
223	#endif
224
/* LFS versions of truncate are only needed on 32 bit machines */
226	#if BITS_PER_LONG == 32
227	SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
228	{
229	return do_sys_truncate(path, length);
230	}
231
232	SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
233	{
234	return do_sys_ftruncate(fd, length, `0`);
235	}
236	#endif /* BITS_PER_LONG == 32 */
237
238	#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
239	COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
240	compat_arg_u64_dual(length))
241	{
242	return ksys_truncate(pathname, compat_arg_u64_glue(length));
243	}
244	#endif
245
246	#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
247	COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
248	compat_arg_u64_dual(length))
249	{
250	return ksys_ftruncate(fd, compat_arg_u64_glue(length));
251	}
252	#endif
253
254	int vfs_fallocate(struct file file, int* mode, loff_t offset, loff_t len)
255	{
256	struct inode *inode = file_inode(f: file);
257	int ret;
258	loff_t sum;
259
260	if (offset < `0` \|\| len <= `0`)
261	return -EINVAL;
262
263	if (mode & ~(FALLOC_FL_MODE_MASK \| FALLOC_FL_KEEP_SIZE))
264	return -EOPNOTSUPP;
265
266	/*
267	* Modes are exclusive, even if that is not obvious from the encoding
268	* as bit masks and the mix with the flag in the same namespace.
269	*
270	* To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is
271	* encoded as no bit set.
272	*/
273	switch (mode & FALLOC_FL_MODE_MASK) {
274	case FALLOC_FL_ALLOCATE_RANGE:
275	case FALLOC_FL_UNSHARE_RANGE:
276	case FALLOC_FL_ZERO_RANGE:
277	break;
278	case FALLOC_FL_PUNCH_HOLE:
279	if (!(mode & FALLOC_FL_KEEP_SIZE))
280	return -EOPNOTSUPP;
281	break;
282	case FALLOC_FL_COLLAPSE_RANGE:
283	case FALLOC_FL_INSERT_RANGE:
284	if (mode & FALLOC_FL_KEEP_SIZE)
285	return -EOPNOTSUPP;
286	break;
287	default:
288	return -EOPNOTSUPP;
289	}
290
291	if (!(file->f_mode & FMODE_WRITE))
292	return -EBADF;
293
294	/*
295	* On append-only files only space preallocation is supported.
296	*/
297	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
298	return -EPERM;
299
300	if (IS_IMMUTABLE(inode))
301	return -EPERM;
302
303	/*
304	* We cannot allow any fallocate operation on an active swapfile
305	*/
306	if (IS_SWAPFILE(inode))
307	return -ETXTBSY;
308
309	/*
310	* Revalidate the write permissions, in case security policy has
311	* changed since the files were opened.
312	*/
313	ret = security_file_permission(file, MAY_WRITE);
314	if (ret)
315	return ret;
316
317	ret = fsnotify_file_area_perm(file, MAY_WRITE, ppos: &offset, count: len);
318	if (ret)
319	return ret;
320
321	if (S_ISFIFO(inode->i_mode))
322	return -ESPIPE;
323
324	if (S_ISDIR(inode->i_mode))
325	return -EISDIR;
326
327	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
328	return -ENODEV;
329
330	/ Check for wraparound /
331	if (check_add_overflow(offset, len, &sum))
332	return -EFBIG;
333
334	if (sum > inode->i_sb->s_maxbytes)
335	return -EFBIG;
336
337	if (!file->f_op->fallocate)
338	return -EOPNOTSUPP;
339
340	file_start_write(file);
341	ret = file->f_op->fallocate(file, mode, offset, len);
342
343	/*
344	* Create inotify and fanotify events.
345	*
346	* To keep the logic simple always create events if fallocate succeeds.
347	* This implies that events are even created if the file size remains
348	* unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
349	*/
350	if (ret == `0`)
351	fsnotify_modify(file);
352
353	file_end_write(file);
354	return ret;
355	}
356	EXPORT_SYMBOL_GPL(vfs_fallocate);
357
358	int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
359	{
360	CLASS(fd, f)(fd);
361
362	if (fd_empty(f))
363	return -EBADF;
364
365	return vfs_fallocate(fd_file(f), mode, offset, len);
366	}
367
368	SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
369	{
370	return ksys_fallocate(fd, mode, offset, len);
371	}
372
373	#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
374	COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
375	compat_arg_u64_dual(len))
376	{
377	return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset),
378	compat_arg_u64_glue(len));
379	}
380	#endif
381
382	/*
383	* access() needs to use the real uid/gid, not the effective uid/gid.
384	* We do this by temporarily clearing all FS-related capabilities and
385	* switching the fsuid/fsgid around to the real ones.
386	*
387	* Creating new credentials is expensive, so we try to skip doing it,
388	* which we can if the result would match what we already got.
389	*/
390	static bool access_need_override_creds(int flags)
391	{
392	const struct cred *cred;
393
394	if (flags & AT_EACCESS)
395	return false;
396
397	cred = current_cred();
398	if (!uid_eq(left: cred->fsuid, right: cred->uid) \|\|
399	!gid_eq(left: cred->fsgid, right: cred->gid))
400	return true;
401
402	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
403	kuid_t root_uid = make_kuid(from: cred->user_ns, uid: `0`);
404	if (!uid_eq(left: cred->uid, right: root_uid)) {
405	if (!cap_isclear(a: cred->cap_effective))
406	return true;
407	} else {
408	if (!cap_isidentical(a: cred->cap_effective,
409	b: cred->cap_permitted))
410	return true;
411	}
412	}
413
414	return false;
415	}
416
417	static const struct cred access_override_creds(void*)
418	{
419	struct cred *override_cred;
420
421	override_cred = prepare_creds();
422	if (!override_cred)
423	return NULL;
424
425	/*
426	* XXX access_need_override_creds performs checks in hopes of skipping
427	* this work. Make sure it stays in sync if making any changes in this
428	* routine.
429	*/
430
431	override_cred->fsuid = override_cred->uid;
432	override_cred->fsgid = override_cred->gid;
433
434	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
435	/ Clear the capabilities if we switch to a non-root user /
436	kuid_t root_uid = make_kuid(from: override_cred->user_ns, uid: `0`);
437	if (!uid_eq(left: override_cred->uid, right: root_uid))
438	cap_clear(override_cred->cap_effective);
439	else
440	override_cred->cap_effective =
441	override_cred->cap_permitted;
442	}
443
444	/*
445	* The new set of credentials can only be used in
446	* task-synchronous circumstances, and does not need
447	* RCU freeing, unless somebody then takes a separate
448	* reference to it.
449	*
450	* NOTE! This is _only_ true because this credential
451	* is used purely for override_creds() that installs
452	* it as the subjective cred. Other threads will be
453	* accessing ->real_cred, not the subjective cred.
454	*
455	* If somebody _does_ make a copy of this (using the
456	* 'get_current_cred()' function), that will clear the
457	* non_rcu field, because now that other user may be
458	* expecting RCU freeing. But normal thread-synchronous
459	* cred accesses will keep things non-racy to avoid RCU
460	* freeing.
461	*/
462	override_cred->non_rcu = `1`;
463	return override_creds(override_cred);
464	}
465
466	static int do_faccessat(int dfd, const char __user filename, int* mode, int flags)
467	{
468	struct path path;
469	struct inode *inode;
470	int res;
471	unsigned int lookup_flags = LOOKUP_FOLLOW;
472	const struct cred *old_cred = NULL;
473
474	if (mode & ~S_IRWXO) / where's F_OK, X_OK, W_OK, R_OK? /
475	return -EINVAL;
476
477	if (flags & ~(AT_EACCESS \| AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH))
478	return -EINVAL;
479
480	if (flags & AT_SYMLINK_NOFOLLOW)
481	lookup_flags &= ~LOOKUP_FOLLOW;
482	if (flags & AT_EMPTY_PATH)
483	lookup_flags \|= LOOKUP_EMPTY;
484
485	if (access_need_override_creds(flags)) {
486	old_cred = access_override_creds();
487	if (!old_cred)
488	return -ENOMEM;
489	}
490
491	retry:
492	res = user_path_at(dfd, filename, lookup_flags, &path);
493	if (res)
494	goto out;
495
496	inode = d_backing_inode(upper: path.dentry);
497
498	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
499	/*
500	* MAY_EXEC on regular files is denied if the fs is mounted
501	* with the "noexec" flag.
502	*/
503	res = -EACCES;
504	if (path_noexec(path: &path))
505	goto out_path_release;
506	}
507
508	res = inode_permission(mnt_idmap(mnt: path.mnt), inode, mode \| MAY_ACCESS);
509	/ SuS v2 requires we report a read only fs too /
510	if (res \|\| !(mode & S_IWOTH) \|\| special_file(inode->i_mode))
511	goto out_path_release;
512	/*
513	* This is a rare case where using __mnt_is_readonly()
514	* is OK without a mnt_want/drop_write() pair. Since
515	* no actual write to the fs is performed here, we do
516	* not need to telegraph to that to anyone.
517	*
518	* By doing this, we accept that this access is
519	* inherently racy and know that the fs may change
520	* state before we even see this result.
521	*/
522	if (__mnt_is_readonly(mnt: path.mnt))
523	res = -EROFS;
524
525	out_path_release:
526	path_put(&path);
527	if (retry_estale(error: res, flags: lookup_flags)) {
528	lookup_flags \|= LOOKUP_REVAL;
529	goto retry;
530	}
531	out:
532	if (old_cred)
533	put_cred(cred: revert_creds(revert_cred: old_cred));
534
535	return res;
536	}
537
538	SYSCALL_DEFINE3(faccessat, int, dfd, const char __user , filename, int*, mode)
539	{
540	return do_faccessat(dfd, filename, mode, flags: `0`);
541	}
542
543	SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user , filename, int*, mode,
544	int, flags)
545	{
546	return do_faccessat(dfd, filename, mode, flags);
547	}
548
549	SYSCALL_DEFINE2(access, const char __user , filename, int*, mode)
550	{
551	return do_faccessat(AT_FDCWD, filename, mode, flags: `0`);
552	}
553
554	SYSCALL_DEFINE1(chdir, const char __user *, filename)
555	{
556	struct path path;
557	int error;
558	unsigned int lookup_flags = LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
559	retry:
560	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
561	if (error)
562	goto out;
563
564	error = path_permission(path: &path, MAY_EXEC \| MAY_CHDIR);
565	if (error)
566	goto dput_and_out;
567
568	set_fs_pwd(current->fs, &path);
569
570	dput_and_out:
571	path_put(&path);
572	if (retry_estale(error, flags: lookup_flags)) {
573	lookup_flags \|= LOOKUP_REVAL;
574	goto retry;
575	}
576	out:
577	return error;
578	}
579
580	SYSCALL_DEFINE1(fchdir, unsigned int, fd)
581	{
582	CLASS(fd_raw, f)(fd);
583	int error;
584
585	if (fd_empty(f))
586	return -EBADF;
587
588	if (!d_can_lookup(fd_file(f)->f_path.dentry))
589	return -ENOTDIR;
590
591	error = file_permission(fd_file(f), MAY_EXEC \| MAY_CHDIR);
592	if (!error)
593	set_fs_pwd(current->fs, &fd_file(f)->f_path);
594	return error;
595	}
596
597	SYSCALL_DEFINE1(chroot, const char __user *, filename)
598	{
599	struct path path;
600	int error;
601	unsigned int lookup_flags = LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
602	retry:
603	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
604	if (error)
605	goto out;
606
607	error = path_permission(path: &path, MAY_EXEC \| MAY_CHDIR);
608	if (error)
609	goto dput_and_out;
610
611	error = -EPERM;
612	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
613	goto dput_and_out;
614	error = security_path_chroot(path: &path);
615	if (error)
616	goto dput_and_out;
617
618	set_fs_root(current->fs, &path);
619	error = `0`;
620	dput_and_out:
621	path_put(&path);
622	if (retry_estale(error, flags: lookup_flags)) {
623	lookup_flags \|= LOOKUP_REVAL;
624	goto retry;
625	}
626	out:
627	return error;
628	}
629
630	int chmod_common(const struct path *path, umode_t mode)
631	{
632	struct inode *inode = path->dentry->d_inode;
633	struct inode *delegated_inode = NULL;
634	struct iattr newattrs;
635	int error;
636
637	error = mnt_want_write(mnt: path->mnt);
638	if (error)
639	return error;
640	retry_deleg:
641	error = inode_lock_killable(inode);
642	if (error)
643	goto out_mnt_unlock;
644	error = security_path_chmod(path, mode);
645	if (error)
646	goto out_unlock;
647	newattrs.ia_mode = (mode & S_IALLUGO) \| (inode->i_mode & ~S_IALLUGO);
648	newattrs.ia_valid = ATTR_MODE \| ATTR_CTIME;
649	error = notify_change(mnt_idmap(mnt: path->mnt), path->dentry,
650	&newattrs, &delegated_inode);
651	out_unlock:
652	inode_unlock(inode);
653	if (delegated_inode) {
654	error = break_deleg_wait(delegated_inode: &delegated_inode);
655	if (!error)
656	goto retry_deleg;
657	}
658	out_mnt_unlock:
659	mnt_drop_write(mnt: path->mnt);
660	return error;
661	}
662
663	int vfs_fchmod(struct file *file, umode_t mode)
664	{
665	audit_file(file);
666	return chmod_common(path: &file->f_path, mode);
667	}
668
669	SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
670	{
671	CLASS(fd, f)(fd);
672
673	if (fd_empty(f))
674	return -EBADF;
675
676	return vfs_fchmod(fd_file(f), mode);
677	}
678
679	static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
680	unsigned int flags)
681	{
682	struct path path;
683	int error;
684	unsigned int lookup_flags;
685
686	if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH)))
687	return -EINVAL;
688
689	lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? `0` : LOOKUP_FOLLOW;
690	if (flags & AT_EMPTY_PATH)
691	lookup_flags \|= LOOKUP_EMPTY;
692
693	retry:
694	error = user_path_at(dfd, filename, lookup_flags, &path);
695	if (!error) {
696	error = chmod_common(path: &path, mode);
697	path_put(&path);
698	if (retry_estale(error, flags: lookup_flags)) {
699	lookup_flags \|= LOOKUP_REVAL;
700	goto retry;
701	}
702	}
703	return error;
704	}
705
706	SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
707	umode_t, mode, unsigned int, flags)
708	{
709	return do_fchmodat(dfd, filename, mode, flags);
710	}
711
712	SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
713	umode_t, mode)
714	{
715	return do_fchmodat(dfd, filename, mode, flags: `0`);
716	}
717
718	SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
719	{
720	return do_fchmodat(AT_FDCWD, filename, mode, flags: `0`);
721	}
722
723	/*
724	* Check whether @kuid is valid and if so generate and set vfsuid_t in
725	* ia_vfsuid.
726	*
727	* Return: true if @kuid is valid, false if not.
728	*/
729	static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
730	{
731	if (!uid_valid(uid: kuid))
732	return false;
733	attr->ia_valid \|= ATTR_UID;
734	attr->ia_vfsuid = VFSUIDT_INIT(kuid);
735	return true;
736	}
737
738	/*
739	* Check whether @kgid is valid and if so generate and set vfsgid_t in
740	* ia_vfsgid.
741	*
742	* Return: true if @kgid is valid, false if not.
743	*/
744	static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
745	{
746	if (!gid_valid(gid: kgid))
747	return false;
748	attr->ia_valid \|= ATTR_GID;
749	attr->ia_vfsgid = VFSGIDT_INIT(kgid);
750	return true;
751	}
752
753	int chown_common(const struct path *path, uid_t user, gid_t group)
754	{
755	struct mnt_idmap *idmap;
756	struct user_namespace *fs_userns;
757	struct inode *inode = path->dentry->d_inode;
758	struct inode *delegated_inode = NULL;
759	int error;
760	struct iattr newattrs;
761	kuid_t uid;
762	kgid_t gid;
763
764	uid = make_kuid(current_user_ns(), uid: user);
765	gid = make_kgid(current_user_ns(), gid: group);
766
767	idmap = mnt_idmap(mnt: path->mnt);
768	fs_userns = i_user_ns(inode);
769
770	retry_deleg:
771	newattrs.ia_vfsuid = INVALID_VFSUID;
772	newattrs.ia_vfsgid = INVALID_VFSGID;
773	newattrs.ia_valid = ATTR_CTIME;
774	if ((user != (uid_t)-`1`) && !setattr_vfsuid(attr: &newattrs, kuid: uid))
775	return -EINVAL;
776	if ((group != (gid_t)-`1`) && !setattr_vfsgid(attr: &newattrs, kgid: gid))
777	return -EINVAL;
778	error = inode_lock_killable(inode);
779	if (error)
780	return error;
781	if (!S_ISDIR(inode->i_mode))
782	newattrs.ia_valid \|= ATTR_KILL_SUID \| ATTR_KILL_PRIV \|
783	setattr_should_drop_sgid(idmap, inode);
784	/ Continue to send actual fs values, not the mount values. /
785	error = security_path_chown(
786	path,
787	uid: from_vfsuid(idmap, fs_userns, vfsuid: newattrs.ia_vfsuid),
788	gid: from_vfsgid(idmap, fs_userns, vfsgid: newattrs.ia_vfsgid));
789	if (!error)
790	error = notify_change(idmap, path->dentry, &newattrs,
791	&delegated_inode);
792	inode_unlock(inode);
793	if (delegated_inode) {
794	error = break_deleg_wait(delegated_inode: &delegated_inode);
795	if (!error)
796	goto retry_deleg;
797	}
798	return error;
799	}
800
801	int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
802	int flag)
803	{
804	struct path path;
805	int error = -EINVAL;
806	int lookup_flags;
807
808	if ((flag & ~(AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH)) != `0`)
809	goto out;
810
811	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? `0` : LOOKUP_FOLLOW;
812	if (flag & AT_EMPTY_PATH)
813	lookup_flags \|= LOOKUP_EMPTY;
814	retry:
815	error = user_path_at(dfd, filename, lookup_flags, &path);
816	if (error)
817	goto out;
818	error = mnt_want_write(mnt: path.mnt);
819	if (error)
820	goto out_release;
821	error = chown_common(path: &path, user, group);
822	mnt_drop_write(mnt: path.mnt);
823	out_release:
824	path_put(&path);
825	if (retry_estale(error, flags: lookup_flags)) {
826	lookup_flags \|= LOOKUP_REVAL;
827	goto retry;
828	}
829	out:
830	return error;
831	}
832
833	SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
834	gid_t, group, int, flag)
835	{
836	return do_fchownat(dfd, filename, user, group, flag);
837	}
838
839	SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
840	{
841	return do_fchownat(AT_FDCWD, filename, user, group, flag: `0`);
842	}
843
844	SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
845	{
846	return do_fchownat(AT_FDCWD, filename, user, group,
847	AT_SYMLINK_NOFOLLOW);
848	}
849
850	int vfs_fchown(struct file *file, uid_t user, gid_t group)
851	{
852	int error;
853
854	error = mnt_want_write_file(file);
855	if (error)
856	return error;
857	audit_file(file);
858	error = chown_common(path: &file->f_path, user, group);
859	mnt_drop_write_file(file);
860	return error;
861	}
862
863	int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
864	{
865	CLASS(fd, f)(fd);
866
867	if (fd_empty(f))
868	return -EBADF;
869
870	return vfs_fchown(fd_file(f), user, group);
871	}
872
873	SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
874	{
875	return ksys_fchown(fd, user, group);
876	}
877
878	static inline int file_get_write_access(struct file *f)
879	{
880	int error;
881
882	error = get_write_access(inode: f->f_inode);
883	if (unlikely(error))
884	return error;
885	error = mnt_get_write_access(mnt: f->f_path.mnt);
886	if (unlikely(error))
887	goto cleanup_inode;
888	if (unlikely(f->f_mode & FMODE_BACKING)) {
889	error = mnt_get_write_access(mnt: backing_file_user_path(f)->mnt);
890	if (unlikely(error))
891	goto cleanup_mnt;
892	}
893	return `0`;
894
895	cleanup_mnt:
896	mnt_put_write_access(mnt: f->f_path.mnt);
897	cleanup_inode:
898	put_write_access(inode: f->f_inode);
899	return error;
900	}
901
902	static int do_dentry_open(struct file *f,
903	int (open)(struct* inode , struct* file *))
904	{
905	static const struct file_operations empty_fops = {};
906	struct inode *inode = f->f_path.dentry->d_inode;
907	int error;
908
909	path_get(&f->f_path);
910	f->f_inode = inode;
911	f->f_mapping = inode->i_mapping;
912	f->f_wb_err = filemap_sample_wb_err(mapping: f->f_mapping);
913	f->f_sb_err = file_sample_sb_err(file: f);
914
915	if (unlikely(f->f_flags & O_PATH)) {
916	f->f_mode = FMODE_PATH \| FMODE_OPENED;
917	file_set_fsnotify_mode(file: f, FMODE_NONOTIFY);
918	f->f_op = &empty_fops;
919	return `0`;
920	}
921
922	if ((f->f_mode & (FMODE_READ \| FMODE_WRITE)) == FMODE_READ) {
923	i_readcount_inc(inode);
924	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
925	error = file_get_write_access(f);
926	if (unlikely(error))
927	goto cleanup_file;
928	f->f_mode \|= FMODE_WRITER;
929	}
930
931	/ POSIX.1-2008/SUSv4 Section XSI 2.9.7 /
932	if (S_ISREG(inode->i_mode) \|\| S_ISDIR(inode->i_mode))
933	f->f_mode \|= FMODE_ATOMIC_POS;
934
935	f->f_op = fops_get(inode->i_fop);
936	if (WARN_ON(!f->f_op)) {
937	error = -ENODEV;
938	goto cleanup_all;
939	}
940
941	error = security_file_open(file: f);
942	if (error)
943	goto cleanup_all;
944
945	/*
946	* Set FMODE_NONOTIFY_* bits according to existing permission watches.
947	* If FMODE_NONOTIFY mode was already set for an fanotify fd or for a
948	* pseudo file, this call will not change the mode.
949	*/
950	file_set_fsnotify_mode_from_watchers(file: f);
951	error = fsnotify_open_perm(file: f);
952	if (error)
953	goto cleanup_all;
954
955	error = break_lease(inode: file_inode(f), mode: f->f_flags);
956	if (error)
957	goto cleanup_all;
958
959	/ normally all 3 are set; ->open() can clear them if needed /
960	f->f_mode \|= FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE;
961	if (!open)
962	open = f->f_op->open;
963	if (open) {
964	error = open(inode, f);
965	if (error)
966	goto cleanup_all;
967	}
968	f->f_mode \|= FMODE_OPENED;
969	if ((f->f_mode & FMODE_READ) &&
970	likely(f->f_op->read \|\| f->f_op->read_iter))
971	f->f_mode \|= FMODE_CAN_READ;
972	if ((f->f_mode & FMODE_WRITE) &&
973	likely(f->f_op->write \|\| f->f_op->write_iter))
974	f->f_mode \|= FMODE_CAN_WRITE;
975	if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
976	f->f_mode &= ~FMODE_LSEEK;
977	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
978	f->f_mode \|= FMODE_CAN_ODIRECT;
979
980	f->f_flags &= ~(O_CREAT \| O_EXCL \| O_NOCTTY \| O_TRUNC);
981	f->f_iocb_flags = iocb_flags(file: f);
982
983	file_ra_state_init(ra: &f->f_ra, mapping: f->f_mapping->host->i_mapping);
984
985	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
986	return -EINVAL;
987
988	/*
989	* XXX: Huge page cache doesn't support writing yet. Drop all page
990	* cache for this file before processing writes.
991	*/
992	if (f->f_mode & FMODE_WRITE) {
993	/*
994	* Depends on full fence from get_write_access() to synchronize
995	* against collapse_file() regarding i_writecount and nr_thps
996	* updates. Ensures subsequent insertion of THPs into the page
997	* cache will fail.
998	*/
999	if (filemap_nr_thps(mapping: inode->i_mapping)) {
1000	struct address_space *mapping = inode->i_mapping;
1001
1002	filemap_invalidate_lock(mapping: inode->i_mapping);
1003	/*
1004	* unmap_mapping_range just need to be called once
1005	* here, because the private pages is not need to be
1006	* unmapped mapping (e.g. data segment of dynamic
1007	* shared libraries here).
1008	*/
1009	unmap_mapping_range(mapping, holebegin: `0`, holelen: `0`, even_cows: `0`);
1010	truncate_inode_pages(mapping, `0`);
1011	filemap_invalidate_unlock(mapping: inode->i_mapping);
1012	}
1013	}
1014
1015	return `0`;
1016
1017	cleanup_all:
1018	if (WARN_ON_ONCE(error > `0`))
1019	error = -EINVAL;
1020	fops_put(f->f_op);
1021	put_file_access(file: f);
1022	cleanup_file:
1023	path_put(&f->f_path);
1024	f->f_path.mnt = NULL;
1025	f->f_path.dentry = NULL;
1026	f->f_inode = NULL;
1027	return error;
1028	}
1029
1030	/**
1031	* finish_open - finish opening a file
1032	* @file: file pointer
1033	* @dentry: pointer to dentry
1034	* @open: open callback
1035	*
1036	* This can be used to finish opening a file passed to i_op->atomic_open().
1037	*
1038	* If the open callback is set to NULL, then the standard f_op->open()
1039	* filesystem callback is substituted.
1040	*
1041	* NB: the dentry reference is _not_ consumed. If, for example, the dentry is
1042	* the return value of d_splice_alias(), then the caller needs to perform dput()
1043	* on it after finish_open().
1044	*
1045	* Returns zero on success or -errno if the open failed.
1046	*/
1047	int finish_open(struct file file, struct* dentry *dentry,
1048	int (open)(struct* inode , struct* file *))
1049	{
1050	BUG_ON(file->f_mode & FMODE_OPENED); / once it's opened, it's opened /
1051
1052	file->f_path.dentry = dentry;
1053	return do_dentry_open(f: file, open);
1054	}
1055	EXPORT_SYMBOL(finish_open);
1056
1057	/**
1058	* finish_no_open - finish ->atomic_open() without opening the file
1059	*
1060	* @file: file pointer
1061	* @dentry: dentry or NULL (as returned from ->lookup())
1062	*
1063	* This can be used to set the result of a successful lookup in ->atomic_open().
1064	*
1065	* NB: unlike finish_open() this function does consume the dentry reference and
1066	* the caller need not dput() it.
1067	*
1068	* Returns "0" which must be the return value of ->atomic_open() after having
1069	* called this function.
1070	*/
1071	int finish_no_open(struct file file, struct* dentry *dentry)
1072	{
1073	file->f_path.dentry = dentry;
1074	return `0`;
1075	}
1076	EXPORT_SYMBOL(finish_no_open);
1077
1078	char file_path(struct* file filp, char* buf, int* buflen)
1079	{
1080	return d_path(&filp->f_path, buf, buflen);
1081	}
1082	EXPORT_SYMBOL(file_path);
1083
1084	/**
1085	* vfs_open - open the file at the given path
1086	* @path: path to open
1087	* @file: newly allocated file with f_flag initialized
1088	*/
1089	int vfs_open(const struct path path, struct* file *file)
1090	{
1091	int ret;
1092
1093	file->f_path = *path;
1094	ret = do_dentry_open(f: file, NULL);
1095	if (!ret) {
1096	/*
1097	* Once we return a file with FMODE_OPENED, __fput() will call
1098	* fsnotify_close(), so we need fsnotify_open() here for
1099	* symmetry.
1100	*/
1101	fsnotify_open(file);
1102	}
1103	return ret;
1104	}
1105
1106	struct file dentry_open(const* struct path path, int* flags,
1107	const struct cred *cred)
1108	{
1109	int error;
1110	struct file *f;
1111
1112	/ We must always pass in a valid mount pointer. /
1113	BUG_ON(!path->mnt);
1114
1115	f = alloc_empty_file(flags, cred);
1116	if (!IS_ERR(ptr: f)) {
1117	error = vfs_open(path, file: f);
1118	if (error) {
1119	fput(f);
1120	f = ERR_PTR(error);
1121	}
1122	}
1123	return f;
1124	}
1125	EXPORT_SYMBOL(dentry_open);
1126
1127	struct file dentry_open_nonotify(const* struct path path, int* flags,
1128	const struct cred *cred)
1129	{
1130	struct file *f = alloc_empty_file(flags, cred);
1131	if (!IS_ERR(ptr: f)) {
1132	int error;
1133
1134	file_set_fsnotify_mode(file: f, FMODE_NONOTIFY);
1135	error = vfs_open(path, file: f);
1136	if (error) {
1137	fput(f);
1138	f = ERR_PTR(error);
1139	}
1140	}
1141	return f;
1142	}
1143
1144	/**
1145	* dentry_create - Create and open a file
1146	* @path: path to create
1147	* @flags: O_ flags
1148	* @mode: mode bits for new file
1149	* @cred: credentials to use
1150	*
1151	* Caller must hold the parent directory's lock, and have prepared
1152	* a negative dentry, placed in @path->dentry, for the new file.
1153	*
1154	* Caller sets @path->mnt to the vfsmount of the filesystem where
1155	* the new file is to be created. The parent directory and the
1156	* negative dentry must reside on the same filesystem instance.
1157	*
1158	* On success, returns a "struct file *". Otherwise a ERR_PTR
1159	* is returned.
1160	*/
1161	struct file dentry_create(const* struct path path, int* flags, umode_t mode,
1162	const struct cred *cred)
1163	{
1164	struct file *f;
1165	int error;
1166
1167	f = alloc_empty_file(flags, cred);
1168	if (IS_ERR(ptr: f))
1169	return f;
1170
1171	error = vfs_create(mnt_idmap(mnt: path->mnt),
1172	d_inode(dentry: path->dentry->d_parent),
1173	path->dentry, mode, true);
1174	if (!error)
1175	error = vfs_open(path, file: f);
1176
1177	if (unlikely(error)) {
1178	fput(f);
1179	return ERR_PTR(error);
1180	}
1181	return f;
1182	}
1183	EXPORT_SYMBOL(dentry_create);
1184
1185	/**
1186	* kernel_file_open - open a file for kernel internal use
1187	* @path: path of the file to open
1188	* @flags: open flags
1189	* @cred: credentials for open
1190	*
1191	* Open a file for use by in-kernel consumers. The file is not accounted
1192	* against nr_files and must not be installed into the file descriptor
1193	* table.
1194	*
1195	* Return: Opened file on success, an error pointer on failure.
1196	*/
1197	struct file kernel_file_open(const* struct path path, int* flags,
1198	const struct cred *cred)
1199	{
1200	struct file *f;
1201	int error;
1202
1203	f = alloc_empty_file_noaccount(flags, cred);
1204	if (IS_ERR(ptr: f))
1205	return f;
1206
1207	f->f_path = *path;
1208	error = do_dentry_open(f, NULL);
1209	if (error) {
1210	fput(f);
1211	return ERR_PTR(error);
1212	}
1213
1214	fsnotify_open(file: f);
1215	return f;
1216	}
1217	EXPORT_SYMBOL_GPL(kernel_file_open);
1218
/* Non-zero when @flags request creation of a file (O_CREAT or __O_TMPFILE). */
#define WILL_CREATE(flags)	((flags) & (O_CREAT | __O_TMPFILE))
/* The only open flags that may accompany O_PATH. */
#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
1221
1222	inline struct open_how build_open_how(int flags, umode_t mode)
1223	{
1224	struct open_how how = {
1225	.flags = flags & VALID_OPEN_FLAGS,
1226	.mode = mode & S_IALLUGO,
1227	};
1228
1229	/ O_PATH beats everything else. /
1230	if (how.flags & O_PATH)
1231	how.flags &= O_PATH_FLAGS;
1232	/ Modes should only be set for create-like flags. /
1233	if (!WILL_CREATE(how.flags))
1234	how.mode = `0`;
1235	return how;
1236	}
1237
1238	inline int build_open_flags(const struct open_how how, struct* open_flags *op)
1239	{
1240	u64 flags = how->flags;
1241	u64 strip = O_CLOEXEC;
1242	int lookup_flags = `0`;
1243	int acc_mode = ACC_MODE(flags);
1244
1245	BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
1246	"struct open_flags doesn't yet handle flags > 32 bits");
1247
1248	/*
1249	* Strip flags that aren't relevant in determining struct open_flags.
1250	*/
1251	flags &= ~strip;
1252
1253	/*
1254	* Older syscalls implicitly clear all of the invalid flags or argument
1255	* values before calling build_open_flags(), but openat2(2) checks all
1256	* of its arguments.
1257	*/
1258	if (flags & ~VALID_OPEN_FLAGS)
1259	return -EINVAL;
1260	if (how->resolve & ~VALID_RESOLVE_FLAGS)
1261	return -EINVAL;
1262
1263	/ Scoping flags are mutually exclusive. /
1264	if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
1265	return -EINVAL;
1266
1267	/ Deal with the mode. /
1268	if (WILL_CREATE(flags)) {
1269	if (how->mode & ~S_IALLUGO)
1270	return -EINVAL;
1271	op->mode = how->mode \| S_IFREG;
1272	} else {
1273	if (how->mode != `0`)
1274	return -EINVAL;
1275	op->mode = `0`;
1276	}
1277
1278	/*
1279	* Block bugs where O_DIRECTORY \| O_CREAT created regular files.
1280	* Note, that blocking O_DIRECTORY \| O_CREAT here also protects
1281	* O_TMPFILE below which requires O_DIRECTORY being raised.
1282	*/
1283	if ((flags & (O_DIRECTORY \| O_CREAT)) == (O_DIRECTORY \| O_CREAT))
1284	return -EINVAL;
1285
1286	/ Now handle the creative implementation of O_TMPFILE. /
1287	if (flags & __O_TMPFILE) {
1288	/*
1289	* In order to ensure programs get explicit errors when trying
1290	* to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
1291	* is raised alongside __O_TMPFILE.
1292	*/
1293	if (!(flags & O_DIRECTORY))
1294	return -EINVAL;
1295	if (!(acc_mode & MAY_WRITE))
1296	return -EINVAL;
1297	}
1298	if (flags & O_PATH) {
1299	/ O_PATH only permits certain other flags to be set. /
1300	if (flags & ~O_PATH_FLAGS)
1301	return -EINVAL;
1302	acc_mode = `0`;
1303	}
1304
1305	/*
1306	* O_SYNC is implemented as __O_SYNC\|O_DSYNC. As many places only
1307	* check for O_DSYNC if the need any syncing at all we enforce it's
1308	* always set instead of having to deal with possibly weird behaviour
1309	* for malicious applications setting only __O_SYNC.
1310	*/
1311	if (flags & __O_SYNC)
1312	flags \|= O_DSYNC;
1313
1314	op->open_flag = flags;
1315
1316	/ O_TRUNC implies we need access checks for write permissions /
1317	if (flags & O_TRUNC)
1318	acc_mode \|= MAY_WRITE;
1319
1320	/ Allow the LSM permission hook to distinguish append*
1321	access from general write access. /*
1322	if (flags & O_APPEND)
1323	acc_mode \|= MAY_APPEND;
1324
1325	op->acc_mode = acc_mode;
1326
1327	op->intent = flags & O_PATH ? `0` : LOOKUP_OPEN;
1328
1329	if (flags & O_CREAT) {
1330	op->intent \|= LOOKUP_CREATE;
1331	if (flags & O_EXCL) {
1332	op->intent \|= LOOKUP_EXCL;
1333	flags \|= O_NOFOLLOW;
1334	}
1335	}
1336
1337	if (flags & O_DIRECTORY)
1338	lookup_flags \|= LOOKUP_DIRECTORY;
1339	if (!(flags & O_NOFOLLOW))
1340	lookup_flags \|= LOOKUP_FOLLOW;
1341
1342	if (how->resolve & RESOLVE_NO_XDEV)
1343	lookup_flags \|= LOOKUP_NO_XDEV;
1344	if (how->resolve & RESOLVE_NO_MAGICLINKS)
1345	lookup_flags \|= LOOKUP_NO_MAGICLINKS;
1346	if (how->resolve & RESOLVE_NO_SYMLINKS)
1347	lookup_flags \|= LOOKUP_NO_SYMLINKS;
1348	if (how->resolve & RESOLVE_BENEATH)
1349	lookup_flags \|= LOOKUP_BENEATH;
1350	if (how->resolve & RESOLVE_IN_ROOT)
1351	lookup_flags \|= LOOKUP_IN_ROOT;
1352	if (how->resolve & RESOLVE_CACHED) {
1353	/ Don't bother even trying for create/truncate/tmpfile open /
1354	if (flags & (O_TRUNC \| O_CREAT \| __O_TMPFILE))
1355	return -EAGAIN;
1356	lookup_flags \|= LOOKUP_CACHED;
1357	}
1358
1359	op->lookup_flags = lookup_flags;
1360	return `0`;
1361	}
1362
1363	/**
1364	* file_open_name - open file and return file pointer
1365	*
1366	* @name: struct filename containing path to open
1367	* @flags: open flags as per the open(2) second argument
1368	* @mode: mode for the new file if O_CREAT is set, else ignored
1369	*
1370	* This is the helper to open a file from kernelspace if you really
1371	* have to. But in generally you should not do this, so please move
1372	* along, nothing to see here..
1373	*/
1374	struct file file_open_name(struct* filename name, int* flags, umode_t mode)
1375	{
1376	struct open_flags op;
1377	struct open_how how = build_open_how(flags, mode);
1378	int err = build_open_flags(how: &how, op: &op);
1379	if (err)
1380	return ERR_PTR(error: err);
1381	return do_filp_open(AT_FDCWD, pathname: name, op: &op);
1382	}
1383
1384	/**
1385	* filp_open - open file and return file pointer
1386	*
1387	* @filename: path to open
1388	* @flags: open flags as per the open(2) second argument
1389	* @mode: mode for the new file if O_CREAT is set, else ignored
1390	*
1391	* This is the helper to open a file from kernelspace if you really
1392	* have to. But in generally you should not do this, so please move
1393	* along, nothing to see here..
1394	*/
1395	struct file filp_open(const* char filename, int* flags, umode_t mode)
1396	{
1397	struct filename *name = getname_kernel(filename);
1398	struct file *file = ERR_CAST(ptr: name);
1399
1400	if (!IS_ERR(ptr: name)) {
1401	file = file_open_name(name, flags, mode);
1402	putname(name);
1403	}
1404	return file;
1405	}
1406	EXPORT_SYMBOL(filp_open);
1407
1408	struct file file_open_root(const* struct path *root,
1409	const char filename, int* flags, umode_t mode)
1410	{
1411	struct open_flags op;
1412	struct open_how how = build_open_how(flags, mode);
1413	int err = build_open_flags(how: &how, op: &op);
1414	if (err)
1415	return ERR_PTR(error: err);
1416	return do_file_open_root(root, filename, &op);
1417	}
1418	EXPORT_SYMBOL(file_open_root);
1419
1420	static int do_sys_openat2(int dfd, const char __user *filename,
1421	struct open_how *how)
1422	{
1423	struct open_flags op;
1424	struct filename *tmp;
1425	int err, fd;
1426
1427	err = build_open_flags(how, op: &op);
1428	if (unlikely(err))
1429	return err;
1430
1431	tmp = getname(name: filename);
1432	if (IS_ERR(ptr: tmp))
1433	return PTR_ERR(ptr: tmp);
1434
1435	fd = get_unused_fd_flags(flags: how->flags);
1436	if (likely(fd >= `0`)) {
1437	struct file *f = do_filp_open(dfd, pathname: tmp, op: &op);
1438	if (IS_ERR(ptr: f)) {
1439	put_unused_fd(fd);
1440	fd = PTR_ERR(ptr: f);
1441	} else {
1442	fd_install(fd, file: f);
1443	}
1444	}
1445	putname(name: tmp);
1446	return fd;
1447	}
1448
1449	int do_sys_open(int dfd, const char __user filename, int* flags, umode_t mode)
1450	{
1451	struct open_how how = build_open_how(flags, mode);
1452	return do_sys_openat2(dfd, filename, how: &how);
1453	}
1454
1455
1456	SYSCALL_DEFINE3(open, const char __user , filename, int*, flags, umode_t, mode)
1457	{
1458	if (force_o_largefile())
1459	flags \|= O_LARGEFILE;
1460	return do_sys_open(AT_FDCWD, filename, flags, mode);
1461	}
1462
1463	SYSCALL_DEFINE4(openat, int, dfd, const char __user , filename, int*, flags,
1464	umode_t, mode)
1465	{
1466	if (force_o_largefile())
1467	flags \|= O_LARGEFILE;
1468	return do_sys_open(dfd, filename, flags, mode);
1469	}
1470
1471	SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
1472	struct open_how __user *, how, size_t, usize)
1473	{
1474	int err;
1475	struct open_how tmp;
1476
1477	BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
1478	BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);
1479
1480	if (unlikely(usize < OPEN_HOW_SIZE_VER0))
1481	return -EINVAL;
1482	if (unlikely(usize > PAGE_SIZE))
1483	return -E2BIG;
1484
1485	err = copy_struct_from_user(dst: &tmp, ksize: sizeof(tmp), src: how, usize);
1486	if (err)
1487	return err;
1488
1489	audit_openat2_how(how: &tmp);
1490
1491	/ O_LARGEFILE is only allowed for non-O_PATH. /
1492	if (!(tmp.flags & O_PATH) && force_o_largefile())
1493	tmp.flags \|= O_LARGEFILE;
1494
1495	return do_sys_openat2(dfd, filename, how: &tmp);
1496	}
1497
#ifdef CONFIG_COMPAT
/*
 * Exactly like sys_open(), except that it doesn't set the
 * O_LARGEFILE flag.
 */
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	return do_sys_open(AT_FDCWD, filename, flags, mode);
}

/*
 * Exactly like sys_openat(), except that it doesn't set the
 * O_LARGEFILE flag.
 */
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
	return do_sys_open(dfd, filename, flags, mode);
}
#endif
1517
1518	#ifndef __alpha__
1519
1520	/*
1521	* For backward compatibility? Maybe this should be moved
1522	* into arch/i386 instead?
1523	*/
1524	SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
1525	{
1526	int flags = O_CREAT \| O_WRONLY \| O_TRUNC;
1527
1528	if (force_o_largefile())
1529	flags \|= O_LARGEFILE;
1530	return do_sys_open(AT_FDCWD, filename: pathname, flags, mode);
1531	}
1532	#endif
1533
1534	/*
1535	* "id" is the POSIX thread ID. We use the
1536	* files pointer for this..
1537	*/
1538	static int filp_flush(struct file *filp, fl_owner_t id)
1539	{
1540	int retval = `0`;
1541
1542	if (CHECK_DATA_CORRUPTION(file_count(filp) == `0`, filp,
1543	"VFS: Close: file count is 0 (f_op=%ps)",
1544	filp->f_op)) {
1545	return `0`;
1546	}
1547
1548	if (filp->f_op->flush)
1549	retval = filp->f_op->flush(filp, id);
1550
1551	if (likely(!(filp->f_mode & FMODE_PATH))) {
1552	dnotify_flush(filp, id);
1553	locks_remove_posix(filp, id);
1554	}
1555	return retval;
1556	}
1557
1558	int filp_close(struct file *filp, fl_owner_t id)
1559	{
1560	int retval;
1561
1562	retval = filp_flush(filp, id);
1563	fput_close(filp);
1564
1565	return retval;
1566	}
1567	EXPORT_SYMBOL(filp_close);
1568
1569	/*
1570	* Careful here! We test whether the file pointer is NULL before
1571	* releasing the fd. This ensures that one clone task can't release
1572	* an fd while another clone is opening it.
1573	*/
1574	SYSCALL_DEFINE1(close, unsigned int, fd)
1575	{
1576	int retval;
1577	struct file *file;
1578
1579	file = file_close_fd(fd);
1580	if (!file)
1581	return -EBADF;
1582
1583	retval = filp_flush(filp: file, current->files);
1584
1585	/*
1586	* We're returning to user space. Don't bother
1587	* with any delayed fput() cases.
1588	*/
1589	fput_close_sync(file);
1590
1591	if (likely(retval == `0`))
1592	return `0`;
1593
1594	/ can't restart close syscall because file table entry was cleared /
1595	if (retval == -ERESTARTSYS \|\|
1596	retval == -ERESTARTNOINTR \|\|
1597	retval == -ERESTARTNOHAND \|\|
1598	retval == -ERESTART_RESTARTBLOCK)
1599	retval = -EINTR;
1600
1601	return retval;
1602	}
1603
1604	/*
1605	* This routine simulates a hangup on the tty, to arrange that users
1606	* are given clean terminals at login time.
1607	*/
1608	SYSCALL_DEFINE0(vhangup)
1609	{
1610	if (capable(CAP_SYS_TTY_CONFIG)) {
1611	tty_vhangup_self();
1612	return `0`;
1613	}
1614	return -EPERM;
1615	}
1616
1617	/*
1618	* Called when an inode is about to be open.
1619	* We use this to disallow opening large files on 32bit systems if
1620	* the caller didn't specify O_LARGEFILE. On 64bit systems we force
1621	* on this flag in sys_open.
1622	*/
1623	int generic_file_open(struct inode * inode, struct file * filp)
1624	{
1625	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1626	return -EOVERFLOW;
1627	return `0`;
1628	}
1629
1630	EXPORT_SYMBOL(generic_file_open);
1631
1632	/*
1633	* This is used by subsystems that don't want seekable
1634	* file descriptors. The function is not supposed to ever fail, the only
1635	* reason it returns an 'int' and not 'void' is so that it can be plugged
1636	* directly into file_operations structure.
1637	*/
1638	int nonseekable_open(struct inode inode, struct* file *filp)
1639	{
1640	filp->f_mode &= ~(FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE);
1641	return `0`;
1642	}
1643
1644	EXPORT_SYMBOL(nonseekable_open);
1645
1646	/*
1647	* stream_open is used by subsystems that want stream-like file descriptors.
1648	* Such file descriptors are not seekable and don't have notion of position
1649	* (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
1650	* Contrary to file descriptors of other regular files, .read() and .write()
1651	* can run simultaneously.
1652	*
1653	* stream_open never fails and is marked to return int so that it could be
1654	* directly used as file_operations.open .
1655	*/
1656	int stream_open(struct inode inode, struct* file *filp)
1657	{
1658	filp->f_mode &= ~(FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE \| FMODE_ATOMIC_POS);
1659	filp->f_mode \|= FMODE_STREAM;
1660	return `0`;
1661	}
1662
1663	EXPORT_SYMBOL(stream_open);
1664

source code of linux/fs/open.c