xfs_file.c source code [linux/fs/xfs/xfs_file.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
4	* All Rights Reserved.
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_log_format.h"
11	#include "xfs_trans_resv.h"
12	#include "xfs_mount.h"
13	#include "xfs_inode.h"
14	#include "xfs_trans.h"
15	#include "xfs_inode_item.h"
16	#include "xfs_bmap.h"
17	#include "xfs_bmap_util.h"
18	#include "xfs_dir2.h"
19	#include "xfs_dir2_priv.h"
20	#include "xfs_ioctl.h"
21	#include "xfs_trace.h"
22	#include "xfs_log.h"
23	#include "xfs_icache.h"
24	#include "xfs_pnfs.h"
25	#include "xfs_iomap.h"
26	#include "xfs_reflink.h"
27	#include "xfs_file.h"
28	#include "xfs_aops.h"
29	#include "xfs_zone_alloc.h"
30
31	#include <linux/dax.h>
32	#include <linux/falloc.h>
33	#include <linux/backing-dev.h>
34	#include <linux/mman.h>
35	#include <linux/fadvise.h>
36	#include <linux/mount.h>
37
/*
 * Forward declaration of the file's VM operations table.  The definition is
 * not visible in this part of the file -- presumably it follows later.
 */
38	static const struct vm_operations_struct xfs_file_vm_ops;
39
40	/*
41	* Decide if the given file range is aligned to the size of the fundamental
42	* allocation unit for the file.
43	*/
44	bool
45	xfs_is_falloc_aligned(
46	struct xfs_inode *ip,
47	loff_t pos,
48	long long int len)
49	{
50	unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
51
52	if (!is_power_of_2(n: alloc_unit))
53	return isaligned_64(x: pos, y: alloc_unit) &&
54	isaligned_64(x: len, y: alloc_unit);
55
56	return !((pos \| len) & (alloc_unit - `1`));
57	}
58
59	/*
60	* Fsync operations on directories are much simpler than on regular files,
61	* as there is no file data to flush, and thus also no need for explicit
62	* cache flush operations, and there are no non-transaction metadata updates
63	* on directories either.
64	*/
65	STATIC int
66	xfs_dir_fsync(
67	struct file *file,
68	loff_t start,
69	loff_t end,
70	int datasync)
71	{
72	struct xfs_inode *ip = XFS_I(inode: file->f_mapping->host);
73
74	trace_xfs_dir_fsync(ip);
75	return xfs_log_force_inode(ip);
76	}
77
78	static xfs_csn_t
79	xfs_fsync_seq(
80	struct xfs_inode *ip,
81	bool datasync)
82	{
83	if (!xfs_ipincount(ip))
84	return `0`;
85	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
86	return `0`;
87	return ip->i_itemp->ili_commit_seq;
88	}
89
90	/*
91	* All metadata updates are logged, which means that we just have to flush the
92	* log up to the latest LSN that touched the inode.
93	*
94	* If we have concurrent fsync/fdatasync() calls, we need them to all block on
95	* the log force before we clear the ili_fsync_fields field. This ensures that
96	* we don't get a racing sync operation that does not wait for the metadata to
97	* hit the journal before returning. If we race with clearing ili_fsync_fields,
98	* then all that will happen is the log force will do nothing as the lsn will
99	* already be on disk. We can't race with setting ili_fsync_fields because that
100	* is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
101	* shared until after the ili_fsync_fields is cleared.
102	*/
103	static int
104	xfs_fsync_flush_log(
105	struct xfs_inode *ip,
106	bool datasync,
107	int *log_flushed)
108	{
109	int error = `0`;
110	xfs_csn_t seq;
111
112	xfs_ilock(ip, XFS_ILOCK_SHARED);
113	seq = xfs_fsync_seq(ip, datasync);
114	if (seq) {
115	error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
116	log_flushed);
117
118	spin_lock(lock: &ip->i_itemp->ili_lock);
119	ip->i_itemp->ili_fsync_fields = `0`;
120	spin_unlock(lock: &ip->i_itemp->ili_lock);
121	}
122	xfs_iunlock(ip, XFS_ILOCK_SHARED);
123	return error;
124	}
125
126	STATIC int
127	xfs_file_fsync(
128	struct file *file,
129	loff_t start,
130	loff_t end,
131	int datasync)
132	{
133	struct xfs_inode *ip = XFS_I(inode: file->f_mapping->host);
134	struct xfs_mount *mp = ip->i_mount;
135	int error, err2;
136	int log_flushed = `0`;
137
138	trace_xfs_file_fsync(ip);
139
140	error = file_write_and_wait_range(file, start, end);
141	if (error)
142	return error;
143
144	if (xfs_is_shutdown(mp))
145	return -EIO;
146
147	xfs_iflags_clear(ip, XFS_ITRUNCATED);
148
149	/*
150	* If we have an RT and/or log subvolume we need to make sure to flush
151	* the write cache the device used for file data first. This is to
152	* ensure newly written file data make it to disk before logging the new
153	* inode size in case of an extending write.
154	*/
155	if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
156	error = blkdev_issue_flush(bdev: mp->m_rtdev_targp->bt_bdev);
157	else if (mp->m_logdev_targp != mp->m_ddev_targp)
158	error = blkdev_issue_flush(bdev: mp->m_ddev_targp->bt_bdev);
159
160	/*
161	* Any inode that has dirty modifications in the log is pinned. The
162	* racy check here for a pinned inode will not catch modifications
163	* that happen concurrently to the fsync call, but fsync semantics
164	* only require to sync previously completed I/O.
165	*/
166	if (xfs_ipincount(ip)) {
167	err2 = xfs_fsync_flush_log(ip, datasync, log_flushed: &log_flushed);
168	if (err2 && !error)
169	error = err2;
170	}
171
172	/*
173	* If we only have a single device, and the log force about was
174	* a no-op we might have to flush the data device cache here.
175	* This can only happen for fdatasync/O_DSYNC if we were overwriting
176	* an already allocated file and thus do not have any metadata to
177	* commit.
178	*/
179	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
180	mp->m_logdev_targp == mp->m_ddev_targp) {
181	err2 = blkdev_issue_flush(bdev: mp->m_ddev_targp->bt_bdev);
182	if (err2 && !error)
183	error = err2;
184	}
185
186	return error;
187	}
188
189	static int
190	xfs_ilock_iocb(
191	struct kiocb *iocb,
192	unsigned int lock_mode)
193	{
194	struct xfs_inode *ip = XFS_I(inode: file_inode(f: iocb->ki_filp));
195
196	if (iocb->ki_flags & IOCB_NOWAIT) {
197	if (!xfs_ilock_nowait(ip, lock_mode))
198	return -EAGAIN;
199	} else {
200	xfs_ilock(ip, lock_mode);
201	}
202
203	return `0`;
204	}
205
206	static int
207	xfs_ilock_iocb_for_write(
208	struct kiocb *iocb,
209	unsigned int *lock_mode)
210	{
211	ssize_t ret;
212	struct xfs_inode *ip = XFS_I(inode: file_inode(f: iocb->ki_filp));
213
214	ret = xfs_ilock_iocb(iocb, lock_mode: *lock_mode);
215	if (ret)
216	return ret;
217
218	/*
219	* If a reflink remap is in progress we always need to take the iolock
220	* exclusively to wait for it to finish.
221	*/
222	if (*lock_mode == XFS_IOLOCK_SHARED &&
223	xfs_iflags_test(ip, XFS_IREMAPPING)) {
224	xfs_iunlock(ip, *lock_mode);
225	*lock_mode = XFS_IOLOCK_EXCL;
226	return xfs_ilock_iocb(iocb, lock_mode: *lock_mode);
227	}
228
229	return `0`;
230	}
231
232	STATIC ssize_t
233	xfs_file_dio_read(
234	struct kiocb *iocb,
235	struct iov_iter *to)
236	{
237	struct xfs_inode *ip = XFS_I(inode: file_inode(f: iocb->ki_filp));
238	ssize_t ret;
239
240	trace_xfs_file_direct_read(iocb, iter: to);
241
242	if (!iov_iter_count(i: to))
243	return `0`; / skip atime /
244
245	file_accessed(file: iocb->ki_filp);
246
247	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
248	if (ret)
249	return ret;
250	ret = iomap_dio_rw(iocb, iter: to, ops: &xfs_read_iomap_ops, NULL, dio_flags: `0`, NULL, done_before: `0`);
251	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
252
253	return ret;
254	}
255
256	static noinline ssize_t
257	xfs_file_dax_read(
258	struct kiocb *iocb,
259	struct iov_iter *to)
260	{
261	struct xfs_inode *ip = XFS_I(inode: iocb->ki_filp->f_mapping->host);
262	ssize_t ret = `0`;
263
264	trace_xfs_file_dax_read(iocb, iter: to);
265
266	if (!iov_iter_count(i: to))
267	return `0`; / skip atime /
268
269	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
270	if (ret)
271	return ret;
272	ret = dax_iomap_rw(iocb, iter: to, ops: &xfs_read_iomap_ops);
273	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
274
275	file_accessed(file: iocb->ki_filp);
276	return ret;
277	}
278
279	STATIC ssize_t
280	xfs_file_buffered_read(
281	struct kiocb *iocb,
282	struct iov_iter *to)
283	{
284	struct xfs_inode *ip = XFS_I(inode: file_inode(f: iocb->ki_filp));
285	ssize_t ret;
286
287	trace_xfs_file_buffered_read(iocb, iter: to);
288
289	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
290	if (ret)
291	return ret;
292	ret = generic_file_read_iter(iocb, to);
293	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
294
295	return ret;
296	}
297
298	STATIC ssize_t
299	xfs_file_read_iter(
300	struct kiocb *iocb,
301	struct iov_iter *to)
302	{
303	struct inode *inode = file_inode(f: iocb->ki_filp);
304	struct xfs_mount *mp = XFS_I(inode)->i_mount;
305	ssize_t ret = `0`;
306
307	XFS_STATS_INC(mp, xs_read_calls);
308
309	if (xfs_is_shutdown(mp))
310	return -EIO;
311
312	if (IS_DAX(inode))
313	ret = xfs_file_dax_read(iocb, to);
314	else if (iocb->ki_flags & IOCB_DIRECT)
315	ret = xfs_file_dio_read(iocb, to);
316	else
317	ret = xfs_file_buffered_read(iocb, to);
318
319	if (ret > `0`)
320	XFS_STATS_ADD(mp, xs_read_bytes, ret);
321	return ret;
322	}
323
324	STATIC ssize_t
325	xfs_file_splice_read(
326	struct file *in,
327	loff_t *ppos,
328	struct pipe_inode_info *pipe,
329	size_t len,
330	unsigned int flags)
331	{
332	struct inode *inode = file_inode(f: in);
333	struct xfs_inode *ip = XFS_I(inode);
334	struct xfs_mount *mp = ip->i_mount;
335	ssize_t ret = `0`;
336
337	XFS_STATS_INC(mp, xs_read_calls);
338
339	if (xfs_is_shutdown(mp))
340	return -EIO;
341
342	trace_xfs_file_splice_read(ip, offset: *ppos, count: len);
343
344	xfs_ilock(ip, XFS_IOLOCK_SHARED);
345	ret = filemap_splice_read(in, ppos, pipe, len, flags);
346	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
347	if (ret > `0`)
348	XFS_STATS_ADD(mp, xs_read_bytes, ret);
349	return ret;
350	}
351
352	/*
353	* Take care of zeroing post-EOF blocks when they might exist.
354	*
355	* Returns 0 if successfully, a negative error for a failure, or 1 if this
356	* function dropped the iolock and reacquired it exclusively and the caller
357	* needs to restart the write sanity checks.
358	*/
359	static ssize_t
360	xfs_file_write_zero_eof(
361	struct kiocb *iocb,
362	struct iov_iter *from,
363	unsigned int *iolock,
364	size_t count,
365	bool *drained_dio,
366	struct xfs_zone_alloc_ctx *ac)
367	{
368	struct xfs_inode *ip = XFS_I(inode: iocb->ki_filp->f_mapping->host);
369	loff_t isize;
370	int error;
371
372	/*
373	* We need to serialise against EOF updates that occur in IO completions
374	* here. We want to make sure that nobody is changing the size while
375	* we do this check until we have placed an IO barrier (i.e. hold
376	* XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
377	* spinlock effectively forms a memory barrier once we have
378	* XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
379	* hence be able to correctly determine if we need to run zeroing.
380	*/
381	spin_lock(lock: &ip->i_flags_lock);
382	isize = i_size_read(inode: VFS_I(ip));
383	if (iocb->ki_pos <= isize) {
384	spin_unlock(lock: &ip->i_flags_lock);
385	return `0`;
386	}
387	spin_unlock(lock: &ip->i_flags_lock);
388
389	if (iocb->ki_flags & IOCB_NOWAIT)
390	return -EAGAIN;
391
392	if (!*drained_dio) {
393	/*
394	* If zeroing is needed and we are currently holding the iolock
395	* shared, we need to update it to exclusive which implies
396	* having to redo all checks before.
397	*/
398	if (*iolock == XFS_IOLOCK_SHARED) {
399	xfs_iunlock(ip, *iolock);
400	*iolock = XFS_IOLOCK_EXCL;
401	xfs_ilock(ip, *iolock);
402	iov_iter_reexpand(i: from, count);
403	}
404
405	/*
406	* We now have an IO submission barrier in place, but AIO can do
407	* EOF updates during IO completion and hence we now need to
408	* wait for all of them to drain. Non-AIO DIO will have drained
409	* before we are given the XFS_IOLOCK_EXCL, and so for most
410	* cases this wait is a no-op.
411	*/
412	inode_dio_wait(inode: VFS_I(ip));
413	*drained_dio = true;
414	return `1`;
415	}
416
417	trace_xfs_zero_eof(ip, offset: isize, count: iocb->ki_pos - isize);
418
419	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
420	error = xfs_zero_range(ip, pos: isize, len: iocb->ki_pos - isize, ac, NULL);
421	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
422
423	return error;
424	}
425
426	/*
427	* Common pre-write limit and setup checks.
428	*
429	* Called with the iolock held either shared and exclusive according to
430	* @iolock, and returns with it held. Might upgrade the iolock to exclusive
431	* if called for a direct write beyond i_size.
432	*/
433	STATIC ssize_t
434	xfs_file_write_checks(
435	struct kiocb *iocb,
436	struct iov_iter *from,
437	unsigned int *iolock,
438	struct xfs_zone_alloc_ctx *ac)
439	{
440	struct inode *inode = iocb->ki_filp->f_mapping->host;
441	size_t count = iov_iter_count(i: from);
442	bool drained_dio = false;
443	ssize_t error;
444
445	restart:
446	error = generic_write_checks(iocb, from);
447	if (error <= `0`)
448	return error;
449
450	if (iocb->ki_flags & IOCB_NOWAIT) {
451	error = break_layout(inode, wait: false);
452	if (error == -EWOULDBLOCK)
453	error = -EAGAIN;
454	} else {
455	error = xfs_break_layouts(inode, iolock, reason: BREAK_WRITE);
456	}
457
458	if (error)
459	return error;
460
461	/*
462	* For changing security info in file_remove_privs() we need i_rwsem
463	* exclusively.
464	*/
465	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
466	xfs_iunlock(XFS_I(inode), *iolock);
467	*iolock = XFS_IOLOCK_EXCL;
468	error = xfs_ilock_iocb(iocb, lock_mode: *iolock);
469	if (error) {
470	*iolock = `0`;
471	return error;
472	}
473	goto restart;
474	}
475
476	/*
477	* If the offset is beyond the size of the file, we need to zero all
478	* blocks that fall between the existing EOF and the start of this
479	* write.
480	*
481	* We can do an unlocked check for i_size here safely as I/O completion
482	* can only extend EOF. Truncate is locked out at this point, so the
483	* EOF can not move backwards, only forwards. Hence we only need to take
484	* the slow path when we are at or beyond the current EOF.
485	*/
486	if (iocb->ki_pos > i_size_read(inode)) {
487	error = xfs_file_write_zero_eof(iocb, from, iolock, count,
488	drained_dio: &drained_dio, ac);
489	if (error == `1`)
490	goto restart;
491	if (error)
492	return error;
493	}
494
495	return kiocb_modified(iocb);
496	}
497
498	static ssize_t
499	xfs_zoned_write_space_reserve(
500	struct xfs_inode *ip,
501	struct kiocb *iocb,
502	struct iov_iter *from,
503	unsigned int flags,
504	struct xfs_zone_alloc_ctx *ac)
505	{
506	loff_t count = iov_iter_count(i: from);
507	int error;
508
509	if (iocb->ki_flags & IOCB_NOWAIT)
510	flags \|= XFS_ZR_NOWAIT;
511
512	/*
513	* Check the rlimit and LFS boundary first so that we don't over-reserve
514	* by possibly a lot.
515	*
516	* The generic write path will redo this check later, and it might have
517	* changed by then. If it got expanded we'll stick to our earlier
518	* smaller limit, and if it is decreased the new smaller limit will be
519	* used and our extra space reservation will be returned after finishing
520	* the write.
521	*/
522	error = generic_write_check_limits(file: iocb->ki_filp, pos: iocb->ki_pos, count: &count);
523	if (error)
524	return error;
525
526	/*
527	* Sloppily round up count to file system blocks.
528	*
529	* This will often reserve an extra block, but that avoids having to look
530	* at the start offset, which isn't stable for O_APPEND until taking the
531	* iolock. Also we need to reserve a block each for zeroing the old
532	* EOF block and the new start block if they are unaligned.
533	*
534	* Any remaining block will be returned after the write.
535	*/
536	return xfs_zoned_space_reserve(ip,
537	XFS_B_TO_FSB(ip->i_mount, count) + `1` + `2`, flags, ac);
538	}
539
540	static int
541	xfs_dio_write_end_io(
542	struct kiocb *iocb,
543	ssize_t size,
544	int error,
545	unsigned flags)
546	{
547	struct inode *inode = file_inode(f: iocb->ki_filp);
548	struct xfs_inode *ip = XFS_I(inode);
549	loff_t offset = iocb->ki_pos;
550	unsigned int nofs_flag;
551
552	ASSERT(!xfs_is_zoned_inode(ip) \|\|
553	!(flags & (IOMAP_DIO_UNWRITTEN \| IOMAP_DIO_COW)));
554
555	trace_xfs_end_io_direct_write(ip, offset, count: size);
556
557	if (xfs_is_shutdown(mp: ip->i_mount))
558	return -EIO;
559
560	if (error)
561	return error;
562	if (!size)
563	return `0`;
564
565	/*
566	* Capture amount written on completion as we can't reliably account
567	* for it on submission.
568	*/
569	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
570
571	/*
572	* We can allocate memory here while doing writeback on behalf of
573	* memory reclaim. To avoid memory allocation deadlocks set the
574	* task-wide nofs context for the following operations.
575	*/
576	nofs_flag = memalloc_nofs_save();
577
578	if (flags & IOMAP_DIO_COW) {
579	if (iocb->ki_flags & IOCB_ATOMIC)
580	error = xfs_reflink_end_atomic_cow(ip, offset, count: size);
581	else
582	error = xfs_reflink_end_cow(ip, offset, count: size);
583	if (error)
584	goto out;
585	}
586
587	/*
588	* Unwritten conversion updates the in-core isize after extent
589	* conversion but before updating the on-disk size. Updating isize any
590	* earlier allows a racing dio read to find unwritten extents before
591	* they are converted.
592	*/
593	if (flags & IOMAP_DIO_UNWRITTEN) {
594	error = xfs_iomap_write_unwritten(ip, offset, size, true);
595	goto out;
596	}
597
598	/*
599	* We need to update the in-core inode size here so that we don't end up
600	* with the on-disk inode size being outside the in-core inode size. We
601	* have no other method of updating EOF for AIO, so always do it here
602	* if necessary.
603	*
604	* We need to lock the test/set EOF update as we can be racing with
605	* other IO completions here to update the EOF. Failing to serialise
606	* here can result in EOF moving backwards and Bad Things Happen when
607	* that occurs.
608	*
609	* As IO completion only ever extends EOF, we can do an unlocked check
610	* here to avoid taking the spinlock. If we land within the current EOF,
611	* then we do not need to do an extending update at all, and we don't
612	* need to take the lock to check this. If we race with an update moving
613	* EOF, then we'll either still be beyond EOF and need to take the lock,
614	* or we'll be within EOF and we don't need to take it at all.
615	*/
616	if (offset + size <= i_size_read(inode))
617	goto out;
618
619	spin_lock(lock: &ip->i_flags_lock);
620	if (offset + size > i_size_read(inode)) {
621	i_size_write(inode, i_size: offset + size);
622	spin_unlock(lock: &ip->i_flags_lock);
623	error = xfs_setfilesize(ip, offset, size);
624	} else {
625	spin_unlock(lock: &ip->i_flags_lock);
626	}
627
628	out:
629	memalloc_nofs_restore(flags: nofs_flag);
630	return error;
631	}
632
633	static const struct iomap_dio_ops xfs_dio_write_ops = {
634	.end_io = xfs_dio_write_end_io,
635	};
636
637	static void
638	xfs_dio_zoned_submit_io(
639	const struct iomap_iter *iter,
640	struct bio *bio,
641	loff_t file_offset)
642	{
643	struct xfs_mount *mp = XFS_I(inode: iter->inode)->i_mount;
644	struct xfs_zone_alloc_ctx *ac = iter->private;
645	xfs_filblks_t count_fsb;
646	struct iomap_ioend *ioend;
647
648	count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
649	if (count_fsb > ac->reserved_blocks) {
650	xfs_err(mp,
651	"allocation (%lld) larger than reservation (%lld).",
652	count_fsb, ac->reserved_blocks);
653	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
654	bio_io_error(bio);
655	return;
656	}
657	ac->reserved_blocks -= count_fsb;
658
659	bio->bi_end_io = xfs_end_bio;
660	ioend = iomap_init_ioend(inode: iter->inode, bio, file_offset,
661	IOMAP_IOEND_DIRECT);
662	xfs_zone_alloc_and_submit(ioend, oz: &ac->open_zone);
663	}
664
665	static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
666	.bio_set = &iomap_ioend_bioset,
667	.submit_io = xfs_dio_zoned_submit_io,
668	.end_io = xfs_dio_write_end_io,
669	};
670
671	/*
672	* Handle block aligned direct I/O writes.
673	*/
674	static noinline ssize_t
675	xfs_file_dio_write_aligned(
676	struct xfs_inode *ip,
677	struct kiocb *iocb,
678	struct iov_iter *from,
679	const struct iomap_ops *ops,
680	const struct iomap_dio_ops *dops,
681	struct xfs_zone_alloc_ctx *ac)
682	{
683	unsigned int iolock = XFS_IOLOCK_SHARED;
684	ssize_t ret;
685
686	ret = xfs_ilock_iocb_for_write(iocb, lock_mode: &iolock);
687	if (ret)
688	return ret;
689	ret = xfs_file_write_checks(iocb, from, iolock: &iolock, ac);
690	if (ret)
691	goto out_unlock;
692
693	/*
694	* We don't need to hold the IOLOCK exclusively across the IO, so demote
695	* the iolock back to shared if we had to take the exclusive lock in
696	* xfs_file_write_checks() for other reasons.
697	*/
698	if (iolock == XFS_IOLOCK_EXCL) {
699	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
700	iolock = XFS_IOLOCK_SHARED;
701	}
702	trace_xfs_file_direct_write(iocb, iter: from);
703	ret = iomap_dio_rw(iocb, iter: from, ops, dops, dio_flags: `0`, private: ac, done_before: `0`);
704	out_unlock:
705	xfs_iunlock(ip, iolock);
706	return ret;
707	}
708
709	/*
710	* Handle block aligned direct I/O writes to zoned devices.
711	*/
712	static noinline ssize_t
713	xfs_file_dio_write_zoned(
714	struct xfs_inode *ip,
715	struct kiocb *iocb,
716	struct iov_iter *from)
717	{
718	struct xfs_zone_alloc_ctx ac = { };
719	ssize_t ret;
720
721	ret = xfs_zoned_write_space_reserve(ip, iocb, from, flags: `0`, ac: &ac);
722	if (ret < `0`)
723	return ret;
724	ret = xfs_file_dio_write_aligned(ip, iocb, from,
725	ops: &xfs_zoned_direct_write_iomap_ops,
726	dops: &xfs_dio_zoned_write_ops, ac: &ac);
727	xfs_zoned_space_unreserve(ip, ac: &ac);
728	return ret;
729	}
730
731	/*
732	* Handle block atomic writes
733	*
734	* Two methods of atomic writes are supported:
735	* - REQ_ATOMIC-based, which would typically use some form of HW offload in the
736	* disk
737	* - COW-based, which uses a COW fork as a staging extent for data updates
738	* before atomically updating extent mappings for the range being written
739	*
740	*/
741	static noinline ssize_t
742	xfs_file_dio_write_atomic(
743	struct xfs_inode *ip,
744	struct kiocb *iocb,
745	struct iov_iter *from)
746	{
747	unsigned int iolock = XFS_IOLOCK_SHARED;
748	ssize_t ret, ocount = iov_iter_count(i: from);
749	const struct iomap_ops *dops;
750
751	/*
752	* HW offload should be faster, so try that first if it is already
753	* known that the write length is not too large.
754	*/
755	if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
756	dops = &xfs_atomic_write_cow_iomap_ops;
757	else
758	dops = &xfs_direct_write_iomap_ops;
759
760	retry:
761	ret = xfs_ilock_iocb_for_write(iocb, lock_mode: &iolock);
762	if (ret)
763	return ret;
764
765	ret = xfs_file_write_checks(iocb, from, iolock: &iolock, NULL);
766	if (ret)
767	goto out_unlock;
768
769	/ Demote similar to xfs_file_dio_write_aligned() /
770	if (iolock == XFS_IOLOCK_EXCL) {
771	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772	iolock = XFS_IOLOCK_SHARED;
773	}
774
775	trace_xfs_file_direct_write(iocb, iter: from);
776	ret = iomap_dio_rw(iocb, iter: from, ops: dops, dops: &xfs_dio_write_ops,
777	dio_flags: `0`, NULL, done_before: `0`);
778
779	/*
780	* The retry mechanism is based on the ->iomap_begin method returning
781	* -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
782	* possible. The REQ_ATOMIC-based method typically not be possible if
783	* the write spans multiple extents or the disk blocks are misaligned.
784	*/
785	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
786	xfs_iunlock(ip, iolock);
787	dops = &xfs_atomic_write_cow_iomap_ops;
788	goto retry;
789	}
790
791	out_unlock:
792	if (iolock)
793	xfs_iunlock(ip, iolock);
794	return ret;
795	}
796
797	/*
798	* Handle block unaligned direct I/O writes
799	*
800	* In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
801	* them to be done in parallel with reads and other direct I/O writes. However,
802	* if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
803	* to do sub-block zeroing and that requires serialisation against other direct
804	* I/O to the same block. In this case we need to serialise the submission of
805	* the unaligned I/O so that we don't get racing block zeroing in the dio layer.
806	* In the case where sub-block zeroing is not required, we can do concurrent
807	* sub-block dios to the same block successfully.
808	*
809	* Optimistically submit the I/O using the shared lock first, but use the
810	* IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
811	* if block allocation or partial block zeroing would be required. In that case
812	* we try again with the exclusive lock.
813	*/
814	static noinline ssize_t
815	xfs_file_dio_write_unaligned(
816	struct xfs_inode *ip,
817	struct kiocb *iocb,
818	struct iov_iter *from)
819	{
820	size_t isize = i_size_read(inode: VFS_I(ip));
821	size_t count = iov_iter_count(i: from);
822	unsigned int iolock = XFS_IOLOCK_SHARED;
823	unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
824	ssize_t ret;
825
826	/*
827	* Extending writes need exclusivity because of the sub-block zeroing
828	* that the DIO code always does for partial tail blocks beyond EOF, so
829	* don't even bother trying the fast path in this case.
830	*/
831	if (iocb->ki_pos > isize \|\| iocb->ki_pos + count >= isize) {
832	if (iocb->ki_flags & IOCB_NOWAIT)
833	return -EAGAIN;
834	retry_exclusive:
835	iolock = XFS_IOLOCK_EXCL;
836	flags = IOMAP_DIO_FORCE_WAIT;
837	}
838
839	ret = xfs_ilock_iocb_for_write(iocb, lock_mode: &iolock);
840	if (ret)
841	return ret;
842
843	/*
844	* We can't properly handle unaligned direct I/O to reflink files yet,
845	* as we can't unshare a partial block.
846	*/
847	if (xfs_is_cow_inode(ip)) {
848	trace_xfs_reflink_bounce_dio_write(iocb, iter: from);
849	ret = -ENOTBLK;
850	goto out_unlock;
851	}
852
853	ret = xfs_file_write_checks(iocb, from, iolock: &iolock, NULL);
854	if (ret)
855	goto out_unlock;
856
857	/*
858	* If we are doing exclusive unaligned I/O, this must be the only I/O
859	* in-flight. Otherwise we risk data corruption due to unwritten extent
860	* conversions from the AIO end_io handler. Wait for all other I/O to
861	* drain first.
862	*/
863	if (flags & IOMAP_DIO_FORCE_WAIT)
864	inode_dio_wait(inode: VFS_I(ip));
865
866	trace_xfs_file_direct_write(iocb, iter: from);
867	ret = iomap_dio_rw(iocb, iter: from, ops: &xfs_direct_write_iomap_ops,
868	dops: &xfs_dio_write_ops, dio_flags: flags, NULL, done_before: `0`);
869
870	/*
871	* Retry unaligned I/O with exclusive blocking semantics if the DIO
872	* layer rejected it for mapping or locking reasons. If we are doing
873	* nonblocking user I/O, propagate the error.
874	*/
875	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
876	ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
877	xfs_iunlock(ip, iolock);
878	goto retry_exclusive;
879	}
880
881	out_unlock:
882	if (iolock)
883	xfs_iunlock(ip, iolock);
884	return ret;
885	}
886
887	static ssize_t
888	xfs_file_dio_write(
889	struct kiocb *iocb,
890	struct iov_iter *from)
891	{
892	struct xfs_inode *ip = XFS_I(inode: file_inode(f: iocb->ki_filp));
893	struct xfs_buftarg *target = xfs_inode_buftarg(ip);
894	size_t count = iov_iter_count(i: from);
895
896	/ direct I/O must be aligned to device logical sector size /
897	if ((iocb->ki_pos \| count) & target->bt_logical_sectormask)
898	return -EINVAL;
899
900	/*
901	* For always COW inodes we also must check the alignment of each
902	* individual iovec segment, as they could end up with different
903	* I/Os due to the way bio_iov_iter_get_pages works, and we'd
904	* then overwrite an already written block.
905	*/
906	if (((iocb->ki_pos \| count) & ip->i_mount->m_blockmask) \|\|
907	(xfs_is_always_cow_inode(ip) &&
908	(iov_iter_alignment(i: from) & ip->i_mount->m_blockmask)))
909	return xfs_file_dio_write_unaligned(ip, iocb, from);
910	if (xfs_is_zoned_inode(ip))
911	return xfs_file_dio_write_zoned(ip, iocb, from);
912	if (iocb->ki_flags & IOCB_ATOMIC)
913	return xfs_file_dio_write_atomic(ip, iocb, from);
914	return xfs_file_dio_write_aligned(ip, iocb, from,
915	ops: &xfs_direct_write_iomap_ops, dops: &xfs_dio_write_ops, NULL);
916	}
917
918	static noinline ssize_t
919	xfs_file_dax_write(
920	struct kiocb *iocb,
921	struct iov_iter *from)
922	{
923	struct inode *inode = iocb->ki_filp->f_mapping->host;
924	struct xfs_inode *ip = XFS_I(inode);
925	unsigned int iolock = XFS_IOLOCK_EXCL;
926	ssize_t ret, error = `0`;
927	loff_t pos;
928
929	ret = xfs_ilock_iocb(iocb, lock_mode: iolock);
930	if (ret)
931	return ret;
932	ret = xfs_file_write_checks(iocb, from, iolock: &iolock, NULL);
933	if (ret)
934	goto out;
935
936	pos = iocb->ki_pos;
937
938	trace_xfs_file_dax_write(iocb, iter: from);
939	ret = dax_iomap_rw(iocb, iter: from, ops: &xfs_dax_write_iomap_ops);
940	if (ret > `0` && iocb->ki_pos > i_size_read(inode)) {
941	i_size_write(inode, i_size: iocb->ki_pos);
942	error = xfs_setfilesize(ip, offset: pos, size: ret);
943	}
944	out:
945	if (iolock)
946	xfs_iunlock(ip, iolock);
947	if (error)
948	return error;
949
950	if (ret > `0`) {
951	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
952
953	/ Handle various SYNC-type writes /
954	ret = generic_write_sync(iocb, count: ret);
955	}
956	return ret;
957	}
958
959	STATIC ssize_t
960	xfs_file_buffered_write(
961	struct kiocb *iocb,
962	struct iov_iter *from)
963	{
964	struct inode *inode = iocb->ki_filp->f_mapping->host;
965	struct xfs_inode *ip = XFS_I(inode);
966	ssize_t ret;
967	bool cleared_space = false;
968	unsigned int iolock;
969
970	write_retry:
971	iolock = XFS_IOLOCK_EXCL;
972	ret = xfs_ilock_iocb(iocb, lock_mode: iolock);
973	if (ret)
974	return ret;
975
976	ret = xfs_file_write_checks(iocb, from, iolock: &iolock, NULL);
977	if (ret)
978	goto out;
979
980	trace_xfs_file_buffered_write(iocb, iter: from);
981	ret = iomap_file_buffered_write(iocb, from,
982	ops: &xfs_buffered_write_iomap_ops, NULL);
983
984	/*
985	* If we hit a space limit, try to free up some lingering preallocated
986	* space before returning an error. In the case of ENOSPC, first try to
987	* write back all dirty inodes to free up some of the excess reserved
988	* metadata space. This reduces the chances that the eofblocks scan
989	* waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
990	* also behaves as a filter to prevent too many eofblocks scans from
991	* running at the same time. Use a synchronous scan to increase the
992	* effectiveness of the scan.
993	*/
994	if (ret == -EDQUOT && !cleared_space) {
995	xfs_iunlock(ip, iolock);
996	xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
997	cleared_space = true;
998	goto write_retry;
999	} else if (ret == -ENOSPC && !cleared_space) {
1000	struct xfs_icwalk icw = {`0`};
1001
1002	cleared_space = true;
1003	xfs_flush_inodes(mp: ip->i_mount);
1004
1005	xfs_iunlock(ip, iolock);
1006	icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
1007	xfs_blockgc_free_space(mp: ip->i_mount, icm: &icw);
1008	goto write_retry;
1009	}
1010
1011	out:
1012	if (iolock)
1013	xfs_iunlock(ip, iolock);
1014
1015	if (ret > `0`) {
1016	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
1017	/ Handle various SYNC-type writes /
1018	ret = generic_write_sync(iocb, count: ret);
1019	}
1020	return ret;
1021	}
1022
1023	STATIC ssize_t
1024	xfs_file_buffered_write_zoned(
1025	struct kiocb *iocb,
1026	struct iov_iter *from)
1027	{
1028	struct xfs_inode *ip = XFS_I(inode: iocb->ki_filp->f_mapping->host);
1029	struct xfs_mount *mp = ip->i_mount;
1030	unsigned int iolock = XFS_IOLOCK_EXCL;
1031	bool cleared_space = false;
1032	struct xfs_zone_alloc_ctx ac = { };
1033	ssize_t ret;
1034
1035	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, ac: &ac);
1036	if (ret < `0`)
1037	return ret;
1038
1039	ret = xfs_ilock_iocb(iocb, lock_mode: iolock);
1040	if (ret)
1041	goto out_unreserve;
1042
1043	ret = xfs_file_write_checks(iocb, from, iolock: &iolock, ac: &ac);
1044	if (ret)
1045	goto out_unlock;
1046
1047	/*
1048	* Truncate the iter to the length that we were actually able to
1049	* allocate blocks for. This needs to happen after
1050	* xfs_file_write_checks, because that assigns ki_pos for O_APPEND
1051	* writes.
1052	*/
1053	iov_iter_truncate(i: from,
1054	count: XFS_FSB_TO_B(mp, ac.reserved_blocks) -
1055	(iocb->ki_pos & mp->m_blockmask));
1056	if (!iov_iter_count(i: from))
1057	goto out_unlock;
1058
1059	retry:
1060	trace_xfs_file_buffered_write(iocb, iter: from);
1061	ret = iomap_file_buffered_write(iocb, from,
1062	ops: &xfs_buffered_write_iomap_ops, private: &ac);
1063	if (ret == -ENOSPC && !cleared_space) {
1064	/*
1065	* Kick off writeback to convert delalloc space and release the
1066	* usually too pessimistic indirect block reservations.
1067	*/
1068	xfs_flush_inodes(mp);
1069	cleared_space = true;
1070	goto retry;
1071	}
1072
1073	out_unlock:
1074	xfs_iunlock(ip, iolock);
1075	out_unreserve:
1076	xfs_zoned_space_unreserve(ip, ac: &ac);
1077	if (ret > `0`) {
1078	XFS_STATS_ADD(mp, xs_write_bytes, ret);
1079	ret = generic_write_sync(iocb, count: ret);
1080	}
1081	return ret;
1082	}
1083
1084	STATIC ssize_t
1085	xfs_file_write_iter(
1086	struct kiocb *iocb,
1087	struct iov_iter *from)
1088	{
1089	struct inode *inode = iocb->ki_filp->f_mapping->host;
1090	struct xfs_inode *ip = XFS_I(inode);
1091	ssize_t ret;
1092	size_t ocount = iov_iter_count(i: from);
1093
1094	XFS_STATS_INC(ip->i_mount, xs_write_calls);
1095
1096	if (ocount == `0`)
1097	return `0`;
1098
1099	if (xfs_is_shutdown(mp: ip->i_mount))
1100	return -EIO;
1101
1102	if (IS_DAX(inode))
1103	return xfs_file_dax_write(iocb, from);
1104
1105	if (iocb->ki_flags & IOCB_ATOMIC) {
1106	if (ocount < xfs_get_atomic_write_min(ip))
1107	return -EINVAL;
1108
1109	if (ocount > xfs_get_atomic_write_max(ip))
1110	return -EINVAL;
1111
1112	ret = generic_atomic_write_valid(iocb, iter: from);
1113	if (ret)
1114	return ret;
1115	}
1116
1117	if (iocb->ki_flags & IOCB_DIRECT) {
1118	/*
1119	* Allow a directio write to fall back to a buffered
1120	* write only in the case that we're doing a reflink
1121	* CoW. In all other directio scenarios we do not
1122	* allow an operation to fall back to buffered mode.
1123	*/
1124	ret = xfs_file_dio_write(iocb, from);
1125	if (ret != -ENOTBLK)
1126	return ret;
1127	}
1128
1129	if (xfs_is_zoned_inode(ip))
1130	return xfs_file_buffered_write_zoned(iocb, from);
1131	return xfs_file_buffered_write(iocb, from);
1132	}
1133
1134	/ Does this file, inode, or mount want synchronous writes? /
1135	static inline bool xfs_file_sync_writes(struct file *filp)
1136	{
1137	struct xfs_inode *ip = XFS_I(inode: file_inode(f: filp));
1138
1139	if (xfs_has_wsync(mp: ip->i_mount))
1140	return true;
1141	if (filp->f_flags & (__O_SYNC \| O_DSYNC))
1142	return true;
1143	if (IS_SYNC(file_inode(filp)))
1144	return true;
1145
1146	return false;
1147	}
1148
1149	static int
1150	xfs_falloc_newsize(
1151	struct file *file,
1152	int mode,
1153	loff_t offset,
1154	loff_t len,
1155	loff_t *new_size)
1156	{
1157	struct inode *inode = file_inode(f: file);
1158
1159	if ((mode & FALLOC_FL_KEEP_SIZE) \|\| offset + len <= i_size_read(inode))
1160	return `0`;
1161	*new_size = offset + len;
1162	return inode_newsize_ok(inode, offset: *new_size);
1163	}
1164
1165	static int
1166	xfs_falloc_setsize(
1167	struct file *file,
1168	loff_t new_size)
1169	{
1170	struct iattr iattr = {
1171	.ia_valid = ATTR_SIZE,
1172	.ia_size = new_size,
1173	};
1174
1175	if (!new_size)
1176	return `0`;
1177	return xfs_vn_setattr_size(idmap: file_mnt_idmap(file), dentry: file_dentry(file),
1178	vap: &iattr);
1179	}
1180
1181	static int
1182	xfs_falloc_collapse_range(
1183	struct file *file,
1184	loff_t offset,
1185	loff_t len,
1186	struct xfs_zone_alloc_ctx *ac)
1187	{
1188	struct inode *inode = file_inode(f: file);
1189	loff_t new_size = i_size_read(inode) - len;
1190	int error;
1191
1192	if (!xfs_is_falloc_aligned(ip: XFS_I(inode), pos: offset, len))
1193	return -EINVAL;
1194
1195	/*
1196	* There is no need to overlap collapse range with EOF, in which case it
1197	* is effectively a truncate operation
1198	*/
1199	if (offset + len >= i_size_read(inode))
1200	return -EINVAL;
1201
1202	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
1203	if (error)
1204	return error;
1205	return xfs_falloc_setsize(file, new_size);
1206	}
1207
1208	static int
1209	xfs_falloc_insert_range(
1210	struct file *file,
1211	loff_t offset,
1212	loff_t len)
1213	{
1214	struct inode *inode = file_inode(f: file);
1215	loff_t isize = i_size_read(inode);
1216	int error;
1217
1218	if (!xfs_is_falloc_aligned(ip: XFS_I(inode), pos: offset, len))
1219	return -EINVAL;
1220
1221	/*
1222	* New inode size must not exceed ->s_maxbytes, accounting for
1223	* possible signed overflow.
1224	*/
1225	if (inode->i_sb->s_maxbytes - isize < len)
1226	return -EFBIG;
1227
1228	/ Offset should be less than i_size /
1229	if (offset >= isize)
1230	return -EINVAL;
1231
1232	error = xfs_falloc_setsize(file, new_size: isize + len);
1233	if (error)
1234	return error;
1235
1236	/*
1237	* Perform hole insertion now that the file size has been updated so
1238	* that if we crash during the operation we don't leave shifted extents
1239	* past EOF and hence losing access to the data that is contained within
1240	* them.
1241	*/
1242	return xfs_insert_file_space(XFS_I(inode), offset, len);
1243	}
1244
1245	/*
1246	* Punch a hole and prealloc the range. We use a hole punch rather than
1247	* unwritten extent conversion for two reasons:
1248	*
1249	* 1.) Hole punch handles partial block zeroing for us.
1250	* 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
1251	* virtue of the hole punch.
1252	*/
1253	static int
1254	xfs_falloc_zero_range(
1255	struct file *file,
1256	int mode,
1257	loff_t offset,
1258	loff_t len,
1259	struct xfs_zone_alloc_ctx *ac)
1260	{
1261	struct inode *inode = file_inode(f: file);
1262	unsigned int blksize = i_blocksize(node: inode);
1263	loff_t new_size = `0`;
1264	int error;
1265
1266	trace_xfs_zero_file_space(ip: XFS_I(inode));
1267
1268	error = xfs_falloc_newsize(file, mode, offset, len, new_size: &new_size);
1269	if (error)
1270	return error;
1271
1272	error = xfs_free_file_space(ip: XFS_I(inode), offset, len, ac);
1273	if (error)
1274	return error;
1275
1276	len = round_up(offset + len, blksize) - round_down(offset, blksize);
1277	offset = round_down(offset, blksize);
1278	error = xfs_alloc_file_space(ip: XFS_I(inode), offset, len);
1279	if (error)
1280	return error;
1281	return xfs_falloc_setsize(file, new_size);
1282	}
1283
1284	static int
1285	xfs_falloc_unshare_range(
1286	struct file *file,
1287	int mode,
1288	loff_t offset,
1289	loff_t len)
1290	{
1291	struct inode *inode = file_inode(f: file);
1292	loff_t new_size = `0`;
1293	int error;
1294
1295	error = xfs_falloc_newsize(file, mode, offset, len, new_size: &new_size);
1296	if (error)
1297	return error;
1298
1299	error = xfs_reflink_unshare(ip: XFS_I(inode), offset, len);
1300	if (error)
1301	return error;
1302
1303	error = xfs_alloc_file_space(ip: XFS_I(inode), offset, len);
1304	if (error)
1305	return error;
1306	return xfs_falloc_setsize(file, new_size);
1307	}
1308
1309	static int
1310	xfs_falloc_allocate_range(
1311	struct file *file,
1312	int mode,
1313	loff_t offset,
1314	loff_t len)
1315	{
1316	struct inode *inode = file_inode(f: file);
1317	loff_t new_size = `0`;
1318	int error;
1319
1320	/*
1321	* If always_cow mode we can't use preallocations and thus should not
1322	* create them.
1323	*/
1324	if (xfs_is_always_cow_inode(ip: XFS_I(inode)))
1325	return -EOPNOTSUPP;
1326
1327	error = xfs_falloc_newsize(file, mode, offset, len, new_size: &new_size);
1328	if (error)
1329	return error;
1330
1331	error = xfs_alloc_file_space(ip: XFS_I(inode), offset, len);
1332	if (error)
1333	return error;
1334	return xfs_falloc_setsize(file, new_size);
1335	}
1336
/* Every fallocate mode flag that xfs_file_fallocate() accepts. */
#define XFS_FALLOC_FL_SUPPORTED		\
	(FALLOC_FL_KEEP_SIZE |		\
	 FALLOC_FL_PUNCH_HOLE |		\
	 FALLOC_FL_COLLAPSE_RANGE |	\
	 FALLOC_FL_ZERO_RANGE |		\
	 FALLOC_FL_INSERT_RANGE |	\
	 FALLOC_FL_UNSHARE_RANGE)
1341
1342	STATIC long
1343	__xfs_file_fallocate(
1344	struct file *file,
1345	int mode,
1346	loff_t offset,
1347	loff_t len,
1348	struct xfs_zone_alloc_ctx *ac)
1349	{
1350	struct inode *inode = file_inode(f: file);
1351	struct xfs_inode *ip = XFS_I(inode);
1352	long error;
1353	uint iolock = XFS_IOLOCK_EXCL \| XFS_MMAPLOCK_EXCL;
1354
1355	xfs_ilock(ip, iolock);
1356	error = xfs_break_layouts(inode, iolock: &iolock, reason: BREAK_UNMAP);
1357	if (error)
1358	goto out_unlock;
1359
1360	/*
1361	* Must wait for all AIO to complete before we continue as AIO can
1362	* change the file size on completion without holding any locks we
1363	* currently hold. We must do this first because AIO can update both
1364	* the on disk and in memory inode sizes, and the operations that follow
1365	* require the in-memory size to be fully up-to-date.
1366	*/
1367	inode_dio_wait(inode);
1368
1369	error = file_modified(file);
1370	if (error)
1371	goto out_unlock;
1372
1373	switch (mode & FALLOC_FL_MODE_MASK) {
1374	case FALLOC_FL_PUNCH_HOLE:
1375	error = xfs_free_file_space(ip, offset, len, ac);
1376	break;
1377	case FALLOC_FL_COLLAPSE_RANGE:
1378	error = xfs_falloc_collapse_range(file, offset, len, ac);
1379	break;
1380	case FALLOC_FL_INSERT_RANGE:
1381	error = xfs_falloc_insert_range(file, offset, len);
1382	break;
1383	case FALLOC_FL_ZERO_RANGE:
1384	error = xfs_falloc_zero_range(file, mode, offset, len, ac);
1385	break;
1386	case FALLOC_FL_UNSHARE_RANGE:
1387	error = xfs_falloc_unshare_range(file, mode, offset, len);
1388	break;
1389	case FALLOC_FL_ALLOCATE_RANGE:
1390	error = xfs_falloc_allocate_range(file, mode, offset, len);
1391	break;
1392	default:
1393	error = -EOPNOTSUPP;
1394	break;
1395	}
1396
1397	if (!error && xfs_file_sync_writes(filp: file))
1398	error = xfs_log_force_inode(ip);
1399
1400	out_unlock:
1401	xfs_iunlock(ip, iolock);
1402	return error;
1403	}
1404
1405	static long
1406	xfs_file_zoned_fallocate(
1407	struct file *file,
1408	int mode,
1409	loff_t offset,
1410	loff_t len)
1411	{
1412	struct xfs_zone_alloc_ctx ac = { };
1413	struct xfs_inode *ip = XFS_I(inode: file_inode(f: file));
1414	int error;
1415
1416	error = xfs_zoned_space_reserve(ip, `2`, XFS_ZR_RESERVED, &ac);
1417	if (error)
1418	return error;
1419	error = __xfs_file_fallocate(file, mode, offset, len, ac: &ac);
1420	xfs_zoned_space_unreserve(ip, ac: &ac);
1421	return error;
1422	}
1423
1424	static long
1425	xfs_file_fallocate(
1426	struct file *file,
1427	int mode,
1428	loff_t offset,
1429	loff_t len)
1430	{
1431	struct inode *inode = file_inode(f: file);
1432
1433	if (!S_ISREG(inode->i_mode))
1434	return -EINVAL;
1435	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1436	return -EOPNOTSUPP;
1437
1438	/*
1439	* For zoned file systems, zeroing the first and last block of a hole
1440	* punch requires allocating a new block to rewrite the remaining data
1441	* and new zeroes out of place. Get a reservations for those before
1442	* taking the iolock. Dip into the reserved pool because we are
1443	* expected to be able to punch a hole even on a completely full
1444	* file system.
1445	*/
1446	if (xfs_is_zoned_inode(ip: XFS_I(inode)) &&
1447	(mode & (FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_ZERO_RANGE \|
1448	FALLOC_FL_COLLAPSE_RANGE)))
1449	return xfs_file_zoned_fallocate(file, mode, offset, len);
1450	return __xfs_file_fallocate(file, mode, offset, len, NULL);
1451	}
1452
1453	STATIC int
1454	xfs_file_fadvise(
1455	struct file *file,
1456	loff_t start,
1457	loff_t end,
1458	int advice)
1459	{
1460	struct xfs_inode *ip = XFS_I(inode: file_inode(f: file));
1461	int ret;
1462	int lockflags = `0`;
1463
1464	/*
1465	* Operations creating pages in page cache need protection from hole
1466	* punching and similar ops
1467	*/
1468	if (advice == POSIX_FADV_WILLNEED) {
1469	lockflags = XFS_IOLOCK_SHARED;
1470	xfs_ilock(ip, lockflags);
1471	}
1472	ret = generic_fadvise(file, offset: start, len: end, advice);
1473	if (lockflags)
1474	xfs_iunlock(ip, lockflags);
1475	return ret;
1476	}
1477
1478	STATIC loff_t
1479	xfs_file_remap_range(
1480	struct file *file_in,
1481	loff_t pos_in,
1482	struct file *file_out,
1483	loff_t pos_out,
1484	loff_t len,
1485	unsigned int remap_flags)
1486	{
1487	struct inode *inode_in = file_inode(f: file_in);
1488	struct xfs_inode *src = XFS_I(inode: inode_in);
1489	struct inode *inode_out = file_inode(f: file_out);
1490	struct xfs_inode *dest = XFS_I(inode: inode_out);
1491	struct xfs_mount *mp = src->i_mount;
1492	loff_t remapped = `0`;
1493	xfs_extlen_t cowextsize;
1494	int ret;
1495
1496	if (remap_flags & ~(REMAP_FILE_DEDUP \| REMAP_FILE_ADVISORY))
1497	return -EINVAL;
1498
1499	if (!xfs_has_reflink(mp))
1500	return -EOPNOTSUPP;
1501
1502	if (xfs_is_shutdown(mp))
1503	return -EIO;
1504
1505	/ Prepare and then clone file data. /
1506	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1507	len: &len, remap_flags);
1508	if (ret \|\| len == `0`)
1509	return ret;
1510
1511	trace_xfs_reflink_remap_range(src, soffset: pos_in, len, dest, doffset: pos_out);
1512
1513	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, remap_len: len,
1514	remapped: &remapped);
1515	if (ret)
1516	goto out_unlock;
1517
1518	/*
1519	* Carry the cowextsize hint from src to dest if we're sharing the
1520	* entire source file to the entire destination file, the source file
1521	* has a cowextsize hint, and the destination file does not.
1522	*/
1523	cowextsize = `0`;
1524	if (pos_in == `0` && len == i_size_read(inode_in) &&
1525	(src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1526	pos_out == `0` && len >= i_size_read(inode_out) &&
1527	!(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1528	cowextsize = src->i_cowextsize;
1529
1530	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1531	remap_flags);
1532	if (ret)
1533	goto out_unlock;
1534
1535	if (xfs_file_sync_writes(filp: file_in) \|\| xfs_file_sync_writes(filp: file_out))
1536	xfs_log_force_inode(ip: dest);
1537	out_unlock:
1538	xfs_iunlock2_remapping(ip1: src, ip2: dest);
1539	if (ret)
1540	trace_xfs_reflink_remap_range_error(ip: dest, error: ret, _RET_IP_);
1541	/*
1542	* If the caller did not set CAN_SHORTEN, then it is not prepared to
1543	* handle partial results -- either the whole remap succeeds, or we
1544	* must say why it did not. In this case, any error should be returned
1545	* to the caller.
1546	*/
1547	if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
1548	return ret;
1549	return remapped > `0` ? remapped : ret;
1550	}
1551
1552	STATIC int
1553	xfs_file_open(
1554	struct inode *inode,
1555	struct file *file)
1556	{
1557	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1558	return -EIO;
1559	file->f_mode \|= FMODE_NOWAIT \| FMODE_CAN_ODIRECT;
1560	if (xfs_get_atomic_write_min(ip: XFS_I(inode)) > `0`)
1561	file->f_mode \|= FMODE_CAN_ATOMIC_WRITE;
1562	return generic_file_open(inode, filp: file);
1563	}
1564
1565	STATIC int
1566	xfs_dir_open(
1567	struct inode *inode,
1568	struct file *file)
1569	{
1570	struct xfs_inode *ip = XFS_I(inode);
1571	unsigned int mode;
1572	int error;
1573
1574	if (xfs_is_shutdown(mp: ip->i_mount))
1575	return -EIO;
1576	error = generic_file_open(inode, filp: file);
1577	if (error)
1578	return error;
1579
1580	/*
1581	* If there are any blocks, read-ahead block 0 as we're almost
1582	* certain to have the next operation be a read there.
1583	*/
1584	mode = xfs_ilock_data_map_shared(ip);
1585	if (ip->i_df.if_nextents > `0`)
1586	error = xfs_dir3_data_readahead(ip, `0`, `0`);
1587	xfs_iunlock(ip, mode);
1588	return error;
1589	}
1590
1591	/*
1592	* Don't bother propagating errors. We're just doing cleanup, and the caller
1593	* ignores the return value anyway.
1594	*/
1595	STATIC int
1596	xfs_file_release(
1597	struct inode *inode,
1598	struct file *file)
1599	{
1600	struct xfs_inode *ip = XFS_I(inode);
1601	struct xfs_mount *mp = ip->i_mount;
1602
1603	/*
1604	* If this is a read-only mount or the file system has been shut down,
1605	* don't generate I/O.
1606	*/
1607	if (xfs_is_readonly(mp) \|\| xfs_is_shutdown(mp))
1608	return `0`;
1609
1610	/*
1611	* If we previously truncated this file and removed old data in the
1612	* process, we want to initiate "early" writeout on the last close.
1613	* This is an attempt to combat the notorious NULL files problem which
1614	* is particularly noticeable from a truncate down, buffered (re-)write
1615	* (delalloc), followed by a crash. What we are effectively doing here
1616	* is significantly reducing the time window where we'd otherwise be
1617	* exposed to that problem.
1618	*/
1619	if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
1620	xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
1621	if (ip->i_delayed_blks > `0`)
1622	filemap_flush(inode->i_mapping);
1623	}
1624
1625	/*
1626	* XFS aggressively preallocates post-EOF space to generate contiguous
1627	* allocations for writers that append to the end of the file.
1628	*
1629	* To support workloads that close and reopen the file frequently, these
1630	* preallocations usually persist after a close unless it is the first
1631	* close for the inode. This is a tradeoff to generate tightly packed
1632	* data layouts for unpacking tarballs or similar archives that write
1633	* one file after another without going back to it while keeping the
1634	* preallocation for files that have recurring open/write/close cycles.
1635	*
1636	* This heuristic is skipped for inodes with the append-only flag as
1637	* that flag is rather pointless for inodes written only once.
1638	*
1639	* There is no point in freeing blocks here for open but unlinked files
1640	* as they will be taken care of by the inactivation path soon.
1641	*
1642	* When releasing a read-only context, don't flush data or trim post-EOF
1643	* blocks. This avoids open/read/close workloads from removing EOF
1644	* blocks that other writers depend upon to reduce fragmentation.
1645	*
1646	* Inodes on the zoned RT device never have preallocations, so skip
1647	* taking the locks below.
1648	*/
1649	if (!inode->i_nlink \|\|
1650	!(file->f_mode & FMODE_WRITE) \|\|
1651	(ip->i_diflags & XFS_DIFLAG_APPEND) \|\|
1652	xfs_is_zoned_inode(ip))
1653	return `0`;
1654
1655	/*
1656	* If we can't get the iolock just skip truncating the blocks past EOF
1657	* because we could deadlock with the mmap_lock otherwise. We'll get
1658	* another chance to drop them once the last reference to the inode is
1659	* dropped, so we'll never leak blocks permanently.
1660	*/
1661	if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
1662	xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1663	if (xfs_can_free_eofblocks(ip) &&
1664	!xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
1665	xfs_free_eofblocks(ip);
1666	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1667	}
1668
1669	return `0`;
1670	}
1671
1672	STATIC int
1673	xfs_file_readdir(
1674	struct file *file,
1675	struct dir_context *ctx)
1676	{
1677	struct inode *inode = file_inode(f: file);
1678	xfs_inode_t *ip = XFS_I(inode);
1679	size_t bufsize;
1680
1681	/*
1682	* The Linux API doesn't pass down the total size of the buffer
1683	* we read into down to the filesystem. With the filldir concept
1684	* it's not needed for correct information, but the XFS dir2 leaf
1685	* code wants an estimate of the buffer size to calculate it's
1686	* readahead window and size the buffers used for mapping to
1687	* physical blocks.
1688	*
1689	* Try to give it an estimate that's good enough, maybe at some
1690	* point we can change the ->readdir prototype to include the
1691	* buffer size. For now we use the current glibc buffer size.
1692	*/
1693	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1694
1695	return xfs_readdir(NULL, ip, ctx, bufsize);
1696	}
1697
1698	STATIC loff_t
1699	xfs_file_llseek(
1700	struct file *file,
1701	loff_t offset,
1702	int whence)
1703	{
1704	struct inode *inode = file->f_mapping->host;
1705
1706	if (xfs_is_shutdown(mp: XFS_I(inode)->i_mount))
1707	return -EIO;
1708
1709	switch (whence) {
1710	default:
1711	return generic_file_llseek(file, offset, whence);
1712	case SEEK_HOLE:
1713	offset = iomap_seek_hole(inode, offset, ops: &xfs_seek_iomap_ops);
1714	break;
1715	case SEEK_DATA:
1716	offset = iomap_seek_data(inode, offset, ops: &xfs_seek_iomap_ops);
1717	break;
1718	}
1719
1720	if (offset < `0`)
1721	return offset;
1722	return vfs_setpos(file, offset, maxsize: inode->i_sb->s_maxbytes);
1723	}
1724
1725	static inline vm_fault_t
1726	xfs_dax_fault_locked(
1727	struct vm_fault *vmf,
1728	unsigned int order,
1729	bool write_fault)
1730	{
1731	vm_fault_t ret;
1732	pfn_t pfn;
1733
1734	if (!IS_ENABLED(CONFIG_FS_DAX)) {
1735	ASSERT(`0`);
1736	return VM_FAULT_SIGBUS;
1737	}
1738	ret = dax_iomap_fault(vmf, order, pfnp: &pfn, NULL,
1739	ops: (write_fault && !vmf->cow_page) ?
1740	&xfs_dax_write_iomap_ops :
1741	&xfs_read_iomap_ops);
1742	if (ret & VM_FAULT_NEEDDSYNC)
1743	ret = dax_finish_sync_fault(vmf, order, pfn);
1744	return ret;
1745	}
1746
1747	static vm_fault_t
1748	xfs_dax_read_fault(
1749	struct vm_fault *vmf,
1750	unsigned int order)
1751	{
1752	struct xfs_inode *ip = XFS_I(inode: file_inode(f: vmf->vma->vm_file));
1753	vm_fault_t ret;
1754
1755	trace_xfs_read_fault(ip, order);
1756
1757	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1758	ret = xfs_dax_fault_locked(vmf, order, write_fault: false);
1759	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1760
1761	return ret;
1762	}
1763
1764	/*
1765	* Locking for serialisation of IO during page faults. This results in a lock
1766	* ordering of:
1767	*
1768	* mmap_lock (MM)
1769	* sb_start_pagefault(vfs, freeze)
1770	* invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1771	* page_lock (MM)
1772	* i_lock (XFS - extent map serialisation)
1773	*/
1774	static vm_fault_t
1775	__xfs_write_fault(
1776	struct vm_fault *vmf,
1777	unsigned int order,
1778	struct xfs_zone_alloc_ctx *ac)
1779	{
1780	struct inode *inode = file_inode(f: vmf->vma->vm_file);
1781	struct xfs_inode *ip = XFS_I(inode);
1782	unsigned int lock_mode = XFS_MMAPLOCK_SHARED;
1783	vm_fault_t ret;
1784
1785	trace_xfs_write_fault(ip, order);
1786
1787	sb_start_pagefault(sb: inode->i_sb);
1788	file_update_time(file: vmf->vma->vm_file);
1789
1790	/*
1791	* Normally we only need the shared mmaplock, but if a reflink remap is
1792	* in progress we take the exclusive lock to wait for the remap to
1793	* finish before taking a write fault.
1794	*/
1795	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1796	if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
1797	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1798	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1799	lock_mode = XFS_MMAPLOCK_EXCL;
1800	}
1801
1802	if (IS_DAX(inode))
1803	ret = xfs_dax_fault_locked(vmf, order, write_fault: true);
1804	else
1805	ret = iomap_page_mkwrite(vmf, ops: &xfs_buffered_write_iomap_ops,
1806	private: ac);
1807	xfs_iunlock(ip, lock_mode);
1808
1809	sb_end_pagefault(sb: inode->i_sb);
1810	return ret;
1811	}
1812
1813	static vm_fault_t
1814	xfs_write_fault_zoned(
1815	struct vm_fault *vmf,
1816	unsigned int order)
1817	{
1818	struct xfs_inode *ip = XFS_I(inode: file_inode(f: vmf->vma->vm_file));
1819	unsigned int len = folio_size(page_folio(vmf->page));
1820	struct xfs_zone_alloc_ctx ac = { };
1821	int error;
1822	vm_fault_t ret;
1823
1824	/*
1825	* This could over-allocate as it doesn't check for truncation.
1826	*
1827	* But as the overallocation is limited to less than a folio and will be
1828	* release instantly that's just fine.
1829	*/
1830	error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), `0`,
1831	&ac);
1832	if (error < `0`)
1833	return vmf_fs_error(err: error);
1834	ret = __xfs_write_fault(vmf, order, ac: &ac);
1835	xfs_zoned_space_unreserve(ip, ac: &ac);
1836	return ret;
1837	}
1838
1839	static vm_fault_t
1840	xfs_write_fault(
1841	struct vm_fault *vmf,
1842	unsigned int order)
1843	{
1844	if (xfs_is_zoned_inode(ip: XFS_I(inode: file_inode(f: vmf->vma->vm_file))))
1845	return xfs_write_fault_zoned(vmf, order);
1846	return __xfs_write_fault(vmf, order, NULL);
1847	}
1848
1849	static inline bool
1850	xfs_is_write_fault(
1851	struct vm_fault *vmf)
1852	{
1853	return (vmf->flags & FAULT_FLAG_WRITE) &&
1854	(vmf->vma->vm_flags & VM_SHARED);
1855	}
1856
1857	static vm_fault_t
1858	xfs_filemap_fault(
1859	struct vm_fault *vmf)
1860	{
1861	struct inode *inode = file_inode(f: vmf->vma->vm_file);
1862
1863	/ DAX can shortcut the normal fault path on write faults! /
1864	if (IS_DAX(inode)) {
1865	if (xfs_is_write_fault(vmf))
1866	return xfs_write_fault(vmf, order: `0`);
1867	return xfs_dax_read_fault(vmf, order: `0`);
1868	}
1869
1870	trace_xfs_read_fault(ip: XFS_I(inode), order: `0`);
1871	return filemap_fault(vmf);
1872	}
1873
1874	static vm_fault_t
1875	xfs_filemap_huge_fault(
1876	struct vm_fault *vmf,
1877	unsigned int order)
1878	{
1879	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1880	return VM_FAULT_FALLBACK;
1881
1882	/ DAX can shortcut the normal fault path on write faults! /
1883	if (xfs_is_write_fault(vmf))
1884	return xfs_write_fault(vmf, order);
1885	return xfs_dax_read_fault(vmf, order);
1886	}
1887
1888	static vm_fault_t
1889	xfs_filemap_page_mkwrite(
1890	struct vm_fault *vmf)
1891	{
1892	return xfs_write_fault(vmf, order: `0`);
1893	}
1894
1895	/*
1896	* pfn_mkwrite was originally intended to ensure we capture time stamp updates
1897	* on write faults. In reality, it needs to serialise against truncate and
1898	* prepare memory for writing so handle is as standard write fault.
1899	*/
1900	static vm_fault_t
1901	xfs_filemap_pfn_mkwrite(
1902	struct vm_fault *vmf)
1903	{
1904	return xfs_write_fault(vmf, order: `0`);
1905	}
1906
1907	static const struct vm_operations_struct xfs_file_vm_ops = {
1908	.fault = xfs_filemap_fault,
1909	.huge_fault = xfs_filemap_huge_fault,
1910	.map_pages = filemap_map_pages,
1911	.page_mkwrite = xfs_filemap_page_mkwrite,
1912	.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1913	};
1914
1915	STATIC int
1916	xfs_file_mmap(
1917	struct file *file,
1918	struct vm_area_struct *vma)
1919	{
1920	struct inode *inode = file_inode(f: file);
1921	struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1922
1923	/*
1924	* We don't support synchronous mappings for non-DAX files and
1925	* for DAX files if underneath dax_device is not synchronous.
1926	*/
1927	if (!daxdev_mapping_supported(vma, dax_dev: target->bt_daxdev))
1928	return -EOPNOTSUPP;
1929
1930	file_accessed(file);
1931	vma->vm_ops = &xfs_file_vm_ops;
1932	if (IS_DAX(inode))
1933	vm_flags_set(vma, VM_HUGEPAGE);
1934	return `0`;
1935	}
1936
1937	const struct file_operations xfs_file_operations = {
1938	.llseek = xfs_file_llseek,
1939	.read_iter = xfs_file_read_iter,
1940	.write_iter = xfs_file_write_iter,
1941	.splice_read = xfs_file_splice_read,
1942	.splice_write = iter_file_splice_write,
1943	.iopoll = iocb_bio_iopoll,
1944	.unlocked_ioctl = xfs_file_ioctl,
1945	#ifdef CONFIG_COMPAT
1946	.compat_ioctl = xfs_file_compat_ioctl,
1947	#endif
1948	.mmap = xfs_file_mmap,
1949	.open = xfs_file_open,
1950	.release = xfs_file_release,
1951	.fsync = xfs_file_fsync,
1952	.get_unmapped_area = thp_get_unmapped_area,
1953	.fallocate = xfs_file_fallocate,
1954	.fadvise = xfs_file_fadvise,
1955	.remap_file_range = xfs_file_remap_range,
1956	.fop_flags = FOP_MMAP_SYNC \| FOP_BUFFER_RASYNC \|
1957	FOP_BUFFER_WASYNC \| FOP_DIO_PARALLEL_WRITE \|
1958	FOP_DONTCACHE,
1959	};
1960
1961	const struct file_operations xfs_dir_file_operations = {
1962	.open = xfs_dir_open,
1963	.read = generic_read_dir,
1964	.iterate_shared = xfs_file_readdir,
1965	.llseek = generic_file_llseek,
1966	.unlocked_ioctl = xfs_file_ioctl,
1967	#ifdef CONFIG_COMPAT
1968	.compat_ioctl = xfs_file_compat_ioctl,
1969	#endif
1970	.fsync = xfs_dir_fsync,
1971	};
1972

source code of linux/fs/xfs/xfs_file.c