xfs_iomap.c source code [linux/fs/xfs/xfs_iomap.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2006 Silicon Graphics, Inc.
4	* Copyright (c) 2016-2018 Christoph Hellwig.
5	* All Rights Reserved.
6	*/
7	#include "xfs.h"
8	#include "xfs_fs.h"
9	#include "xfs_shared.h"
10	#include "xfs_format.h"
11	#include "xfs_log_format.h"
12	#include "xfs_trans_resv.h"
13	#include "xfs_mount.h"
14	#include "xfs_inode.h"
15	#include "xfs_btree.h"
16	#include "xfs_bmap_btree.h"
17	#include "xfs_bmap.h"
18	#include "xfs_bmap_util.h"
19	#include "xfs_errortag.h"
20	#include "xfs_error.h"
21	#include "xfs_trans.h"
22	#include "xfs_trans_space.h"
23	#include "xfs_inode_item.h"
24	#include "xfs_iomap.h"
25	#include "xfs_trace.h"
26	#include "xfs_quota.h"
27	#include "xfs_dquot_item.h"
28	#include "xfs_dquot.h"
29	#include "xfs_reflink.h"
30	#include "xfs_health.h"
31
/*
 * Round the offset @off down to a multiple of 2^(mp)->m_allocsize_log.
 *
 * Both macro arguments are parenthesized so that arbitrary expressions
 * (e.g. "&mount" or "a + b") expand with the intended precedence.
 */
#define XFS_ALLOC_ALIGN(mp, off) \
	(((off) >> (mp)->m_allocsize_log) << (mp)->m_allocsize_log)
34
35	static int
36	xfs_alert_fsblock_zero(
37	xfs_inode_t *ip,
38	xfs_bmbt_irec_t *imap)
39	{
40	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
41	"Access to block zero in inode %llu "
42	"start_block: %llx start_off: %llx "
43	"blkcnt: %llx extent-state: %x",
44	(unsigned long long)ip->i_ino,
45	(unsigned long long)imap->br_startblock,
46	(unsigned long long)imap->br_startoff,
47	(unsigned long long)imap->br_blockcount,
48	imap->br_state);
49	xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
50	return -EFSCORRUPTED;
51	}
52
53	u64
54	xfs_iomap_inode_sequence(
55	struct xfs_inode *ip,
56	u16 iomap_flags)
57	{
58	u64 cookie = `0`;
59
60	if (iomap_flags & IOMAP_F_XATTR)
61	return READ_ONCE(ip->i_af.if_seq);
62	if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
63	cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << `32`;
64	return cookie \| READ_ONCE(ip->i_df.if_seq);
65	}
66
67	/*
68	* Check that the iomap passed to us is still valid for the given offset and
69	* length.
70	*/
71	static bool
72	xfs_iomap_valid(
73	struct inode *inode,
74	const struct iomap *iomap)
75	{
76	struct xfs_inode *ip = XFS_I(inode);
77
78	if (iomap->validity_cookie !=
79	xfs_iomap_inode_sequence(ip, iomap_flags: iomap->flags)) {
80	trace_xfs_iomap_invalid(ip, iomap);
81	return false;
82	}
83
84	XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS);
85	return true;
86	}
87
88	static const struct iomap_folio_ops xfs_iomap_folio_ops = {
89	.iomap_valid = xfs_iomap_valid,
90	};
91
92	int
93	xfs_bmbt_to_iomap(
94	struct xfs_inode *ip,
95	struct iomap *iomap,
96	struct xfs_bmbt_irec *imap,
97	unsigned int mapping_flags,
98	u16 iomap_flags,
99	u64 sequence_cookie)
100	{
101	struct xfs_mount *mp = ip->i_mount;
102	struct xfs_buftarg *target = xfs_inode_buftarg(ip);
103
104	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
105	xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
106	return xfs_alert_fsblock_zero(ip, imap);
107	}
108
109	if (imap->br_startblock == HOLESTARTBLOCK) {
110	iomap->addr = IOMAP_NULL_ADDR;
111	iomap->type = IOMAP_HOLE;
112	} else if (imap->br_startblock == DELAYSTARTBLOCK \|\|
113	isnullstartblock(imap->br_startblock)) {
114	iomap->addr = IOMAP_NULL_ADDR;
115	iomap->type = IOMAP_DELALLOC;
116	} else {
117	iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
118	if (mapping_flags & IOMAP_DAX)
119	iomap->addr += target->bt_dax_part_off;
120
121	if (imap->br_state == XFS_EXT_UNWRITTEN)
122	iomap->type = IOMAP_UNWRITTEN;
123	else
124	iomap->type = IOMAP_MAPPED;
125
126	}
127	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
128	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
129	if (mapping_flags & IOMAP_DAX)
130	iomap->dax_dev = target->bt_daxdev;
131	else
132	iomap->bdev = target->bt_bdev;
133	iomap->flags = iomap_flags;
134
135	if (xfs_ipincount(ip) &&
136	(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
137	iomap->flags \|= IOMAP_F_DIRTY;
138
139	iomap->validity_cookie = sequence_cookie;
140	iomap->folio_ops = &xfs_iomap_folio_ops;
141	return `0`;
142	}
143
144	static void
145	xfs_hole_to_iomap(
146	struct xfs_inode *ip,
147	struct iomap *iomap,
148	xfs_fileoff_t offset_fsb,
149	xfs_fileoff_t end_fsb)
150	{
151	struct xfs_buftarg *target = xfs_inode_buftarg(ip);
152
153	iomap->addr = IOMAP_NULL_ADDR;
154	iomap->type = IOMAP_HOLE;
155	iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
156	iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
157	iomap->bdev = target->bt_bdev;
158	iomap->dax_dev = target->bt_daxdev;
159	}
160
161	static inline xfs_fileoff_t
162	xfs_iomap_end_fsb(
163	struct xfs_mount *mp,
164	loff_t offset,
165	loff_t count)
166	{
167	ASSERT(offset <= mp->m_super->s_maxbytes);
168	return min(XFS_B_TO_FSB(mp, offset + count),
169	XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
170	}
171
172	static xfs_extlen_t
173	xfs_eof_alignment(
174	struct xfs_inode *ip)
175	{
176	struct xfs_mount *mp = ip->i_mount;
177	xfs_extlen_t align = `0`;
178
179	if (!XFS_IS_REALTIME_INODE(ip)) {
180	/*
181	* Round up the allocation request to a stripe unit
182	* (m_dalign) boundary if the file size is >= stripe unit
183	* size, and we are allocating past the allocation eof.
184	*
185	* If mounted with the "-o swalloc" option the alignment is
186	* increased from the strip unit size to the stripe width.
187	*/
188	if (mp->m_swidth && xfs_has_swalloc(mp))
189	align = mp->m_swidth;
190	else if (mp->m_dalign)
191	align = mp->m_dalign;
192
193	if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
194	align = `0`;
195	}
196
197	return align;
198	}
199
200	/*
201	* Check if last_fsb is outside the last extent, and if so grow it to the next
202	* stripe unit boundary.
203	*/
204	xfs_fileoff_t
205	xfs_iomap_eof_align_last_fsb(
206	struct xfs_inode *ip,
207	xfs_fileoff_t end_fsb)
208	{
209	struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
210	xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
211	xfs_extlen_t align = xfs_eof_alignment(ip);
212	struct xfs_bmbt_irec irec;
213	struct xfs_iext_cursor icur;
214
215	ASSERT(!xfs_need_iread_extents(ifp));
216
217	/*
218	* Always round up the allocation request to the extent hint boundary.
219	*/
220	if (extsz) {
221	if (align)
222	align = roundup_64(align, extsz);
223	else
224	align = extsz;
225	}
226
227	if (align) {
228	xfs_fileoff_t aligned_end_fsb = roundup_64(end_fsb, align);
229
230	xfs_iext_last(ifp, &icur);
231	if (!xfs_iext_get_extent(ifp, &icur, &irec) \|\|
232	aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
233	return aligned_end_fsb;
234	}
235
236	return end_fsb;
237	}
238
239	int
240	xfs_iomap_write_direct(
241	struct xfs_inode *ip,
242	xfs_fileoff_t offset_fsb,
243	xfs_fileoff_t count_fsb,
244	unsigned int flags,
245	struct xfs_bmbt_irec *imap,
246	u64 *seq)
247	{
248	struct xfs_mount *mp = ip->i_mount;
249	struct xfs_trans *tp;
250	xfs_filblks_t resaligned;
251	int nimaps;
252	unsigned int dblocks, rblocks;
253	bool force = false;
254	int error;
255	int bmapi_flags = XFS_BMAPI_PREALLOC;
256	int nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT;
257
258	ASSERT(count_fsb > `0`);
259
260	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
261	xfs_get_extsz_hint(ip));
262	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
263	dblocks = XFS_DIOSTRAT_SPACE_RES(mp, `0`);
264	rblocks = resaligned;
265	} else {
266	dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
267	rblocks = `0`;
268	}
269
270	error = xfs_qm_dqattach(ip);
271	if (error)
272	return error;
273
274	/*
275	* For DAX, we do not allocate unwritten extents, but instead we zero
276	* the block before we commit the transaction. Ideally we'd like to do
277	* this outside the transaction context, but if we commit and then crash
278	* we may not have zeroed the blocks and this will be exposed on
279	* recovery of the allocation. Hence we must zero before commit.
280	*
281	* Further, if we are mapping unwritten extents here, we need to zero
282	* and convert them to written so that we don't need an unwritten extent
283	* callback for DAX. This also means that we need to be able to dip into
284	* the reserve block pool for bmbt block allocation if there is no space
285	* left but we need to do unwritten extent conversion.
286	*/
287	if (flags & IOMAP_DAX) {
288	bmapi_flags = XFS_BMAPI_CONVERT \| XFS_BMAPI_ZERO;
289	if (imap->br_state == XFS_EXT_UNWRITTEN) {
290	force = true;
291	nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT;
292	dblocks = XFS_DIOSTRAT_SPACE_RES(mp, `0`) << `1`;
293	}
294	}
295
296	error = xfs_trans_alloc_inode(ip, resv: &M_RES(mp)->tr_write, dblocks,
297	rblocks, force, tpp: &tp);
298	if (error)
299	return error;
300
301	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
302	if (error == -EFBIG)
303	error = xfs_iext_count_upgrade(tp, ip, nr_exts);
304	if (error)
305	goto out_trans_cancel;
306
307	/*
308	* From this point onwards we overwrite the imap pointer that the
309	* caller gave to us.
310	*/
311	nimaps = `1`;
312	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, `0`,
313	imap, &nimaps);
314	if (error)
315	goto out_trans_cancel;
316
317	/*
318	* Complete the transaction
319	*/
320	error = xfs_trans_commit(tp);
321	if (error)
322	goto out_unlock;
323
324	/*
325	* Copy any maps to caller's array and return any error.
326	*/
327	if (nimaps == `0`) {
328	error = -ENOSPC;
329	goto out_unlock;
330	}
331
332	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
333	xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
334	error = xfs_alert_fsblock_zero(ip, imap);
335	}
336
337	out_unlock:
338	*seq = xfs_iomap_inode_sequence(ip, iomap_flags: `0`);
339	xfs_iunlock(ip, XFS_ILOCK_EXCL);
340	return error;
341
342	out_trans_cancel:
343	xfs_trans_cancel(tp);
344	goto out_unlock;
345	}
346
347	STATIC bool
348	xfs_quota_need_throttle(
349	struct xfs_inode *ip,
350	xfs_dqtype_t type,
351	xfs_fsblock_t alloc_blocks)
352	{
353	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
354
355	if (!dq \|\| !xfs_this_quota_on(ip->i_mount, type))
356	return false;
357
358	/ no hi watermark, no throttle /
359	if (!dq->q_prealloc_hi_wmark)
360	return false;
361
362	/ under the lo watermark, no throttle /
363	if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
364	return false;
365
366	return true;
367	}
368
369	STATIC void
370	xfs_quota_calc_throttle(
371	struct xfs_inode *ip,
372	xfs_dqtype_t type,
373	xfs_fsblock_t *qblocks,
374	int *qshift,
375	int64_t *qfreesp)
376	{
377	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
378	int64_t freesp;
379	int shift = `0`;
380
381	/ no dq, or over hi wmark, squash the prealloc completely /
382	if (!dq \|\| dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
383	*qblocks = `0`;
384	*qfreesp = `0`;
385	return;
386	}
387
388	freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
389	if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
390	shift = `2`;
391	if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
392	shift += `2`;
393	if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
394	shift += `2`;
395	}
396
397	if (freesp < *qfreesp)
398	*qfreesp = freesp;
399
400	/ only overwrite the throttle values if we are more aggressive /
401	if ((freesp >> shift) < (qblocks >> qshift)) {
402	*qblocks = freesp;
403	*qshift = shift;
404	}
405	}
406
407	/*
408	* If we don't have a user specified preallocation size, dynamically increase
409	* the preallocation size as the size of the file grows. Cap the maximum size
410	* at a single extent or less if the filesystem is near full. The closer the
411	* filesystem is to being full, the smaller the maximum preallocation.
412	*/
413	STATIC xfs_fsblock_t
414	xfs_iomap_prealloc_size(
415	struct xfs_inode *ip,
416	int whichfork,
417	loff_t offset,
418	loff_t count,
419	struct xfs_iext_cursor *icur)
420	{
421	struct xfs_iext_cursor ncur = *icur;
422	struct xfs_bmbt_irec prev, got;
423	struct xfs_mount *mp = ip->i_mount;
424	struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
425	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
426	int64_t freesp;
427	xfs_fsblock_t qblocks;
428	xfs_fsblock_t alloc_blocks = `0`;
429	xfs_extlen_t plen;
430	int shift = `0`;
431	int qshift = `0`;
432
433	/*
434	* As an exception we don't do any preallocation at all if the file is
435	* smaller than the minimum preallocation and we are using the default
436	* dynamic preallocation scheme, as it is likely this is the only write
437	* to the file that is going to be done.
438	*/
439	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))
440	return `0`;
441
442	/*
443	* Use the minimum preallocation size for small files or if we are
444	* writing right after a hole.
445	*/
446	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) \|\|
447	!xfs_iext_prev_extent(ifp, &ncur, &prev) \|\|
448	prev.br_startoff + prev.br_blockcount < offset_fsb)
449	return mp->m_allocsize_blocks;
450
451	/*
452	* Take the size of the preceding data extents as the basis for the
453	* preallocation size. Note that we don't care if the previous extents
454	* are written or not.
455	*/
456	plen = prev.br_blockcount;
457	while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
458	if (plen > XFS_MAX_BMBT_EXTLEN / `2` \|\|
459	isnullstartblock(got.br_startblock) \|\|
460	got.br_startoff + got.br_blockcount != prev.br_startoff \|\|
461	got.br_startblock + got.br_blockcount != prev.br_startblock)
462	break;
463	plen += got.br_blockcount;
464	prev = got;
465	}
466
467	/*
468	* If the size of the extents is greater than half the maximum extent
469	* length, then use the current offset as the basis. This ensures that
470	* for large files the preallocation size always extends to
471	* XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe
472	* unit/width alignment of real extents.
473	*/
474	alloc_blocks = plen * `2`;
475	if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
476	alloc_blocks = XFS_B_TO_FSB(mp, offset);
477	qblocks = alloc_blocks;
478
479	/*
480	* XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc
481	* down to the nearest power of two value after throttling. To prevent
482	* the round down from unconditionally reducing the maximum supported
483	* prealloc size, we round up first, apply appropriate throttling, round
484	* down and cap the value to XFS_BMBT_MAX_EXTLEN.
485	*/
486	alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
487	alloc_blocks);
488
489	freesp = percpu_counter_read_positive(fbc: &mp->m_fdblocks);
490	if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
491	shift = `2`;
492	if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
493	shift++;
494	if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
495	shift++;
496	if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
497	shift++;
498	if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
499	shift++;
500	}
501
502	/*
503	* Check each quota to cap the prealloc size, provide a shift value to
504	* throttle with and adjust amount of available space.
505	*/
506	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
507	xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
508	&freesp);
509	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
510	xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
511	&freesp);
512	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
513	xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
514	&freesp);
515
516	/*
517	* The final prealloc size is set to the minimum of free space available
518	* in each of the quotas and the overall filesystem.
519	*
520	* The shift throttle value is set to the maximum value as determined by
521	* the global low free space values and per-quota low free space values.
522	*/
523	alloc_blocks = min(alloc_blocks, qblocks);
524	shift = max(shift, qshift);
525
526	if (shift)
527	alloc_blocks >>= shift;
528	/*
529	* rounddown_pow_of_two() returns an undefined result if we pass in
530	* alloc_blocks = 0.
531	*/
532	if (alloc_blocks)
533	alloc_blocks = rounddown_pow_of_two(alloc_blocks);
534	if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
535	alloc_blocks = XFS_MAX_BMBT_EXTLEN;
536
537	/*
538	* If we are still trying to allocate more space than is
539	* available, squash the prealloc hard. This can happen if we
540	* have a large file on a small filesystem and the above
541	* lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN.
542	*/
543	while (alloc_blocks && alloc_blocks >= freesp)
544	alloc_blocks >>= `4`;
545	if (alloc_blocks < mp->m_allocsize_blocks)
546	alloc_blocks = mp->m_allocsize_blocks;
547	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
548	mp->m_allocsize_blocks);
549	return alloc_blocks;
550	}
551
552	int
553	xfs_iomap_write_unwritten(
554	xfs_inode_t *ip,
555	xfs_off_t offset,
556	xfs_off_t count,
557	bool update_isize)
558	{
559	xfs_mount_t *mp = ip->i_mount;
560	xfs_fileoff_t offset_fsb;
561	xfs_filblks_t count_fsb;
562	xfs_filblks_t numblks_fsb;
563	int nimaps;
564	xfs_trans_t *tp;
565	xfs_bmbt_irec_t imap;
566	struct inode *inode = VFS_I(ip);
567	xfs_fsize_t i_size;
568	uint resblks;
569	int error;
570
571	trace_xfs_unwritten_convert(ip, offset, count);
572
573	offset_fsb = XFS_B_TO_FSBT(mp, offset);
574	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
575	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
576
577	/*
578	* Reserve enough blocks in this transaction for two complete extent
579	* btree splits. We may be converting the middle part of an unwritten
580	* extent and in this case we will insert two new extents in the btree
581	* each of which could cause a full split.
582	*
583	* This reservation amount will be used in the first call to
584	* xfs_bmbt_split() to select an AG with enough space to satisfy the
585	* rest of the operation.
586	*/
587	resblks = XFS_DIOSTRAT_SPACE_RES(mp, `0`) << `1`;
588
589	/ Attach dquots so that bmbt splits are accounted correctly. /
590	error = xfs_qm_dqattach(ip);
591	if (error)
592	return error;
593
594	do {
595	/*
596	* Set up a transaction to convert the range of extents
597	* from unwritten to real. Do allocations in a loop until
598	* we have covered the range passed in.
599	*
600	* Note that we can't risk to recursing back into the filesystem
601	* here as we might be asked to write out the same inode that we
602	* complete here and might deadlock on the iolock.
603	*/
604	error = xfs_trans_alloc_inode(ip, resv: &M_RES(mp)->tr_write, dblocks: resblks,
605	rblocks: `0`, force: true, tpp: &tp);
606	if (error)
607	return error;
608
609	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
610	XFS_IEXT_WRITE_UNWRITTEN_CNT);
611	if (error == -EFBIG)
612	error = xfs_iext_count_upgrade(tp, ip,
613	XFS_IEXT_WRITE_UNWRITTEN_CNT);
614	if (error)
615	goto error_on_bmapi_transaction;
616
617	/*
618	* Modify the unwritten extent state of the buffer.
619	*/
620	nimaps = `1`;
621	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
622	XFS_BMAPI_CONVERT, resblks, &imap,
623	&nimaps);
624	if (error)
625	goto error_on_bmapi_transaction;
626
627	/*
628	* Log the updated inode size as we go. We have to be careful
629	* to only log it up to the actual write offset if it is
630	* halfway into a block.
631	*/
632	i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
633	if (i_size > offset + count)
634	i_size = offset + count;
635	if (update_isize && i_size > i_size_read(inode))
636	i_size_write(inode, i_size);
637	i_size = xfs_new_eof(ip, i_size);
638	if (i_size) {
639	ip->i_disk_size = i_size;
640	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
641	}
642
643	error = xfs_trans_commit(tp);
644	xfs_iunlock(ip, XFS_ILOCK_EXCL);
645	if (error)
646	return error;
647
648	if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
649	xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
650	return xfs_alert_fsblock_zero(ip, &imap);
651	}
652
653	if ((numblks_fsb = imap.br_blockcount) == `0`) {
654	/*
655	* The numblks_fsb value should always get
656	* smaller, otherwise the loop is stuck.
657	*/
658	ASSERT(imap.br_blockcount);
659	break;
660	}
661	offset_fsb += numblks_fsb;
662	count_fsb -= numblks_fsb;
663	} while (count_fsb > `0`);
664
665	return `0`;
666
667	error_on_bmapi_transaction:
668	xfs_trans_cancel(tp);
669	xfs_iunlock(ip, XFS_ILOCK_EXCL);
670	return error;
671	}
672
673	static inline bool
674	imap_needs_alloc(
675	struct inode *inode,
676	unsigned flags,
677	struct xfs_bmbt_irec *imap,
678	int nimaps)
679	{
680	/ don't allocate blocks when just zeroing /
681	if (flags & IOMAP_ZERO)
682	return false;
683	if (!nimaps \|\|
684	imap->br_startblock == HOLESTARTBLOCK \|\|
685	imap->br_startblock == DELAYSTARTBLOCK)
686	return true;
687	/ we convert unwritten extents before copying the data for DAX /
688	if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
689	return true;
690	return false;
691	}
692
693	static inline bool
694	imap_needs_cow(
695	struct xfs_inode *ip,
696	unsigned int flags,
697	struct xfs_bmbt_irec *imap,
698	int nimaps)
699	{
700	if (!xfs_is_cow_inode(ip))
701	return false;
702
703	/ when zeroing we don't have to COW holes or unwritten extents /
704	if (flags & IOMAP_ZERO) {
705	if (!nimaps \|\|
706	imap->br_startblock == HOLESTARTBLOCK \|\|
707	imap->br_state == XFS_EXT_UNWRITTEN)
708	return false;
709	}
710
711	return true;
712	}
713
714	static int
715	xfs_ilock_for_iomap(
716	struct xfs_inode *ip,
717	unsigned flags,
718	unsigned *lockmode)
719	{
720	unsigned int mode = *lockmode;
721	bool is_write = flags & (IOMAP_WRITE \| IOMAP_ZERO);
722
723	/*
724	* COW writes may allocate delalloc space or convert unwritten COW
725	* extents, so we need to make sure to take the lock exclusively here.
726	*/
727	if (xfs_is_cow_inode(ip) && is_write)
728	mode = XFS_ILOCK_EXCL;
729
730	/*
731	* Extents not yet cached requires exclusive access, don't block. This
732	* is an opencoded xfs_ilock_data_map_shared() call but with
733	* non-blocking behaviour.
734	*/
735	if (xfs_need_iread_extents(&ip->i_df)) {
736	if (flags & IOMAP_NOWAIT)
737	return -EAGAIN;
738	mode = XFS_ILOCK_EXCL;
739	}
740
741	relock:
742	if (flags & IOMAP_NOWAIT) {
743	if (!xfs_ilock_nowait(ip, mode))
744	return -EAGAIN;
745	} else {
746	xfs_ilock(ip, mode);
747	}
748
749	/*
750	* The reflink iflag could have changed since the earlier unlocked
751	* check, so if we got ILOCK_SHARED for a write and but we're now a
752	* reflink inode we have to switch to ILOCK_EXCL and relock.
753	*/
754	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
755	xfs_iunlock(ip, mode);
756	mode = XFS_ILOCK_EXCL;
757	goto relock;
758	}
759
760	*lockmode = mode;
761	return `0`;
762	}
763
764	/*
765	* Check that the imap we are going to return to the caller spans the entire
766	* range that the caller requested for the IO.
767	*/
768	static bool
769	imap_spans_range(
770	struct xfs_bmbt_irec *imap,
771	xfs_fileoff_t offset_fsb,
772	xfs_fileoff_t end_fsb)
773	{
774	if (imap->br_startoff > offset_fsb)
775	return false;
776	if (imap->br_startoff + imap->br_blockcount < end_fsb)
777	return false;
778	return true;
779	}
780
781	static int
782	xfs_direct_write_iomap_begin(
783	struct inode *inode,
784	loff_t offset,
785	loff_t length,
786	unsigned flags,
787	struct iomap *iomap,
788	struct iomap *srcmap)
789	{
790	struct xfs_inode *ip = XFS_I(inode);
791	struct xfs_mount *mp = ip->i_mount;
792	struct xfs_bmbt_irec imap, cmap;
793	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
794	xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
795	int nimaps = `1`, error = `0`;
796	bool shared = false;
797	u16 iomap_flags = `0`;
798	unsigned int lockmode = XFS_ILOCK_SHARED;
799	u64 seq;
800
801	ASSERT(flags & (IOMAP_WRITE \| IOMAP_ZERO));
802
803	if (xfs_is_shutdown(mp))
804	return -EIO;
805
806	/*
807	* Writes that span EOF might trigger an IO size update on completion,
808	* so consider them to be dirty for the purposes of O_DSYNC even if
809	* there is no other metadata changes pending or have been made here.
810	*/
811	if (offset + length > i_size_read(inode))
812	iomap_flags \|= IOMAP_F_DIRTY;
813
814	error = xfs_ilock_for_iomap(ip, flags, lockmode: &lockmode);
815	if (error)
816	return error;
817
818	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
819	&nimaps, `0`);
820	if (error)
821	goto out_unlock;
822
823	if (imap_needs_cow(ip, flags, imap: &imap, nimaps)) {
824	error = -EAGAIN;
825	if (flags & IOMAP_NOWAIT)
826	goto out_unlock;
827
828	/ may drop and re-acquire the ilock /
829	error = xfs_reflink_allocate_cow(ip, imap: &imap, cmap: &cmap, shared: &shared,
830	lockmode: &lockmode,
831	convert_now: (flags & IOMAP_DIRECT) \|\| IS_DAX(inode));
832	if (error)
833	goto out_unlock;
834	if (shared)
835	goto out_found_cow;
836	end_fsb = imap.br_startoff + imap.br_blockcount;
837	length = XFS_FSB_TO_B(mp, end_fsb) - offset;
838	}
839
840	if (imap_needs_alloc(inode, flags, imap: &imap, nimaps))
841	goto allocate_blocks;
842
843	/*
844	* NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
845	* a single map so that we avoid partial IO failures due to the rest of
846	* the I/O range not covered by this map triggering an EAGAIN condition
847	* when it is subsequently mapped and aborting the I/O.
848	*/
849	if (flags & (IOMAP_NOWAIT \| IOMAP_OVERWRITE_ONLY)) {
850	error = -EAGAIN;
851	if (!imap_spans_range(&imap, offset_fsb, end_fsb))
852	goto out_unlock;
853	}
854
855	/*
856	* For overwrite only I/O, we cannot convert unwritten extents without
857	* requiring sub-block zeroing. This can only be done under an
858	* exclusive IOLOCK, hence return -EAGAIN if this is not a written
859	* extent to tell the caller to try again.
860	*/
861	if (flags & IOMAP_OVERWRITE_ONLY) {
862	error = -EAGAIN;
863	if (imap.br_state != XFS_EXT_NORM &&
864	((offset \| length) & mp->m_blockmask))
865	goto out_unlock;
866	}
867
868	seq = xfs_iomap_inode_sequence(ip, iomap_flags);
869	xfs_iunlock(ip, lockmode);
870	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
871	return xfs_bmbt_to_iomap(ip, iomap, imap: &imap, mapping_flags: flags, iomap_flags, sequence_cookie: seq);
872
873	allocate_blocks:
874	error = -EAGAIN;
875	if (flags & (IOMAP_NOWAIT \| IOMAP_OVERWRITE_ONLY))
876	goto out_unlock;
877
878	/*
879	* We cap the maximum length we map to a sane size to keep the chunks
880	* of work done where somewhat symmetric with the work writeback does.
881	* This is a completely arbitrary number pulled out of thin air as a
882	* best guess for initial testing.
883	*
884	* Note that the values needs to be less than 32-bits wide until the
885	* lower level functions are updated.
886	*/
887	length = min_t(loff_t, length, `1024` * PAGE_SIZE);
888	end_fsb = xfs_iomap_end_fsb(mp, offset, length);
889
890	if (offset + length > XFS_ISIZE(ip))
891	end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
892	else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
893	end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
894	xfs_iunlock(ip, lockmode);
895
896	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
897	flags, &imap, &seq);
898	if (error)
899	return error;
900
901	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
902	return xfs_bmbt_to_iomap(ip, iomap, imap: &imap, mapping_flags: flags,
903	iomap_flags: iomap_flags \| IOMAP_F_NEW, sequence_cookie: seq);
904
905	out_found_cow:
906	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
907	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
908	if (imap.br_startblock != HOLESTARTBLOCK) {
909	seq = xfs_iomap_inode_sequence(ip, iomap_flags: `0`);
910	error = xfs_bmbt_to_iomap(ip, iomap: srcmap, imap: &imap, mapping_flags: flags, iomap_flags: `0`, sequence_cookie: seq);
911	if (error)
912	goto out_unlock;
913	}
914	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
915	xfs_iunlock(ip, lockmode);
916	return xfs_bmbt_to_iomap(ip, iomap, imap: &cmap, mapping_flags: flags, IOMAP_F_SHARED, sequence_cookie: seq);
917
918	out_unlock:
919	if (lockmode)
920	xfs_iunlock(ip, lockmode);
921	return error;
922	}
923
924	const struct iomap_ops xfs_direct_write_iomap_ops = {
925	.iomap_begin = xfs_direct_write_iomap_begin,
926	};
927
928	static int
929	xfs_dax_write_iomap_end(
930	struct inode *inode,
931	loff_t pos,
932	loff_t length,
933	ssize_t written,
934	unsigned flags,
935	struct iomap *iomap)
936	{
937	struct xfs_inode *ip = XFS_I(inode);
938
939	if (!xfs_is_cow_inode(ip))
940	return `0`;
941
942	if (!written) {
943	xfs_reflink_cancel_cow_range(ip, offset: pos, count: length, cancel_real: true);
944	return `0`;
945	}
946
947	return xfs_reflink_end_cow(ip, offset: pos, count: written);
948	}
949
950	const struct iomap_ops xfs_dax_write_iomap_ops = {
951	.iomap_begin = xfs_direct_write_iomap_begin,
952	.iomap_end = xfs_dax_write_iomap_end,
953	};
954
955	static int
956	xfs_buffered_write_iomap_begin(
957	struct inode *inode,
958	loff_t offset,
959	loff_t count,
960	unsigned flags,
961	struct iomap *iomap,
962	struct iomap *srcmap)
963	{
964	struct xfs_inode *ip = XFS_I(inode);
965	struct xfs_mount *mp = ip->i_mount;
966	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
967	xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
968	struct xfs_bmbt_irec imap, cmap;
969	struct xfs_iext_cursor icur, ccur;
970	xfs_fsblock_t prealloc_blocks = `0`;
971	bool eof = false, cow_eof = false, shared = false;
972	int allocfork = XFS_DATA_FORK;
973	int error = `0`;
974	unsigned int lockmode = XFS_ILOCK_EXCL;
975	u64 seq;
976
977	if (xfs_is_shutdown(mp))
978	return -EIO;
979
980	/ we can't use delayed allocations when using extent size hints /
981	if (xfs_get_extsz_hint(ip))
982	return xfs_direct_write_iomap_begin(inode, offset, length: count,
983	flags, iomap, srcmap);
984
985	ASSERT(!XFS_IS_REALTIME_INODE(ip));
986
987	error = xfs_qm_dqattach(ip);
988	if (error)
989	return error;
990
991	error = xfs_ilock_for_iomap(ip, flags, lockmode: &lockmode);
992	if (error)
993	return error;
994
995	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) \|\|
996	XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
997	xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
998	error = -EFSCORRUPTED;
999	goto out_unlock;
1000	}
1001
1002	XFS_STATS_INC(mp, xs_blk_mapw);
1003
1004	error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
1005	if (error)
1006	goto out_unlock;
1007
1008	/*
1009	* Search the data fork first to look up our source mapping. We
1010	* always need the data fork map, as we have to return it to the
1011	* iomap code so that the higher level write code can read data in to
1012	* perform read-modify-write cycles for unaligned writes.
1013	*/
1014	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
1015	if (eof)
1016	imap.br_startoff = end_fsb; / fake hole until the end /
1017
1018	/ We never need to allocate blocks for zeroing or unsharing a hole. /
1019	if ((flags & (IOMAP_UNSHARE \| IOMAP_ZERO)) &&
1020	imap.br_startoff > offset_fsb) {
1021	xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
1022	goto out_unlock;
1023	}
1024
1025	/*
1026	* Search the COW fork extent list even if we did not find a data fork
1027	* extent. This serves two purposes: first this implements the
1028	* speculative preallocation using cowextsize, so that we also unshare
1029	* block adjacent to shared blocks instead of just the shared blocks
1030	* themselves. Second the lookup in the extent list is generally faster
1031	* than going out to the shared extent tree.
1032	*/
1033	if (xfs_is_cow_inode(ip)) {
1034	if (!ip->i_cowfp) {
1035	ASSERT(!xfs_is_reflink_inode(ip));
1036	xfs_ifork_init_cow(ip);
1037	}
1038	cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
1039	&ccur, &cmap);
1040	if (!cow_eof && cmap.br_startoff <= offset_fsb) {
1041	trace_xfs_reflink_cow_found(ip, irec: &cmap);
1042	goto found_cow;
1043	}
1044	}
1045
1046	if (imap.br_startoff <= offset_fsb) {
1047	/*
1048	* For reflink files we may need a delalloc reservation when
1049	* overwriting shared extents. This includes zeroing of
1050	* existing extents that contain data.
1051	*/
1052	if (!xfs_is_cow_inode(ip) \|\|
1053	((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
1054	trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
1055	&imap);
1056	goto found_imap;
1057	}
1058
1059	xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
1060
1061	/ Trim the mapping to the nearest shared extent boundary. /
1062	error = xfs_bmap_trim_cow(ip, imap: &imap, shared: &shared);
1063	if (error)
1064	goto out_unlock;
1065
1066	/ Not shared? Just report the (potentially capped) extent. /
1067	if (!shared) {
1068	trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
1069	&imap);
1070	goto found_imap;
1071	}
1072
1073	/*
1074	* Fork all the shared blocks from our write offset until the
1075	* end of the extent.
1076	*/
1077	allocfork = XFS_COW_FORK;
1078	end_fsb = imap.br_startoff + imap.br_blockcount;
1079	} else {
1080	/*
1081	* We cap the maximum length we map here to MAX_WRITEBACK_PAGES
1082	* pages to keep the chunks of work done where somewhat
1083	* symmetric with the work writeback does. This is a completely
1084	* arbitrary number pulled out of thin air.
1085	*
1086	* Note that the values needs to be less than 32-bits wide until
1087	* the lower level functions are updated.
1088	*/
1089	count = min_t(loff_t, count, `1024` * PAGE_SIZE);
1090	end_fsb = xfs_iomap_end_fsb(mp, offset, count);
1091
1092	if (xfs_is_always_cow_inode(ip))
1093	allocfork = XFS_COW_FORK;
1094	}
1095
1096	if (eof && offset + count > XFS_ISIZE(ip)) {
1097	/*
1098	* Determine the initial size of the preallocation.
1099	* We clean up any extra preallocation when the file is closed.
1100	*/
1101	if (xfs_has_allocsize(mp))
1102	prealloc_blocks = mp->m_allocsize_blocks;
1103	else if (allocfork == XFS_DATA_FORK)
1104	prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
1105	offset, count, &icur);
1106	else
1107	prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
1108	offset, count, &ccur);
1109	if (prealloc_blocks) {
1110	xfs_extlen_t align;
1111	xfs_off_t end_offset;
1112	xfs_fileoff_t p_end_fsb;
1113
1114	end_offset = XFS_ALLOC_ALIGN(mp, offset + count - `1`);
1115	p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
1116	prealloc_blocks;
1117
1118	align = xfs_eof_alignment(ip);
1119	if (align)
1120	p_end_fsb = roundup_64(p_end_fsb, align);
1121
1122	p_end_fsb = min(p_end_fsb,
1123	XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
1124	ASSERT(p_end_fsb > offset_fsb);
1125	prealloc_blocks = p_end_fsb - end_fsb;
1126	}
1127	}
1128
1129	retry:
1130	error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
1131	end_fsb - offset_fsb, prealloc_blocks,
1132	allocfork == XFS_DATA_FORK ? &imap : &cmap,
1133	allocfork == XFS_DATA_FORK ? &icur : &ccur,
1134	allocfork == XFS_DATA_FORK ? eof : cow_eof);
1135	switch (error) {
1136	case `0`:
1137	break;
1138	case -ENOSPC:
1139	case -EDQUOT:
1140	/ retry without any preallocation /
1141	trace_xfs_delalloc_enospc(ip, offset, count);
1142	if (prealloc_blocks) {
1143	prealloc_blocks = `0`;
1144	goto retry;
1145	}
1146	fallthrough;
1147	default:
1148	goto out_unlock;
1149	}
1150
1151	if (allocfork == XFS_COW_FORK) {
1152	trace_xfs_iomap_alloc(ip, offset, count, whichfork: allocfork, irec: &cmap);
1153	goto found_cow;
1154	}
1155
1156	/*
1157	* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
1158	* them out if the write happens to fail.
1159	*/
1160	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
1161	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1162	trace_xfs_iomap_alloc(ip, offset, count, whichfork: allocfork, irec: &imap);
1163	return xfs_bmbt_to_iomap(ip, iomap, imap: &imap, mapping_flags: flags, IOMAP_F_NEW, sequence_cookie: seq);
1164
1165	found_imap:
1166	seq = xfs_iomap_inode_sequence(ip, iomap_flags: `0`);
1167	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1168	return xfs_bmbt_to_iomap(ip, iomap, imap: &imap, mapping_flags: flags, iomap_flags: `0`, sequence_cookie: seq);
1169
1170	found_cow:
1171	seq = xfs_iomap_inode_sequence(ip, iomap_flags: `0`);
1172	if (imap.br_startoff <= offset_fsb) {
1173	error = xfs_bmbt_to_iomap(ip, iomap: srcmap, imap: &imap, mapping_flags: flags, iomap_flags: `0`, sequence_cookie: seq);
1174	if (error)
1175	goto out_unlock;
1176	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
1177	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1178	return xfs_bmbt_to_iomap(ip, iomap, imap: &cmap, mapping_flags: flags,
1179	IOMAP_F_SHARED, sequence_cookie: seq);
1180	}
1181
1182	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
1183	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1184	return xfs_bmbt_to_iomap(ip, iomap, imap: &cmap, mapping_flags: flags, iomap_flags: `0`, sequence_cookie: seq);
1185
1186	out_unlock:
1187	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1188	return error;
1189	}
1190
1191	static int
1192	xfs_buffered_write_delalloc_punch(
1193	struct inode *inode,
1194	loff_t offset,
1195	loff_t length)
1196	{
1197	return xfs_bmap_punch_delalloc_range(ip: XFS_I(inode), start_byte: offset,
1198	end_byte: offset + length);
1199	}
1200
1201	static int
1202	xfs_buffered_write_iomap_end(
1203	struct inode *inode,
1204	loff_t offset,
1205	loff_t length,
1206	ssize_t written,
1207	unsigned flags,
1208	struct iomap *iomap)
1209	{
1210
1211	struct xfs_mount *mp = XFS_M(inode->i_sb);
1212	int error;
1213
1214	error = iomap_file_buffered_write_punch_delalloc(inode, iomap, pos: offset,
1215	length, written, punch: &xfs_buffered_write_delalloc_punch);
1216	if (error && !xfs_is_shutdown(mp)) {
1217	xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
1218	__func__, XFS_I(inode)->i_ino);
1219	return error;
1220	}
1221	return `0`;
1222	}
1223
1224	const struct iomap_ops xfs_buffered_write_iomap_ops = {
1225	.iomap_begin = xfs_buffered_write_iomap_begin,
1226	.iomap_end = xfs_buffered_write_iomap_end,
1227	};
1228
1229	/*
1230	* iomap_page_mkwrite() will never fail in a way that requires delalloc extents
1231	* that it allocated to be revoked. Hence we do not need an .iomap_end method
1232	* for this operation.
1233	*/
1234	const struct iomap_ops xfs_page_mkwrite_iomap_ops = {
1235	.iomap_begin = xfs_buffered_write_iomap_begin,
1236	};
1237
1238	static int
1239	xfs_read_iomap_begin(
1240	struct inode *inode,
1241	loff_t offset,
1242	loff_t length,
1243	unsigned flags,
1244	struct iomap *iomap,
1245	struct iomap *srcmap)
1246	{
1247	struct xfs_inode *ip = XFS_I(inode);
1248	struct xfs_mount *mp = ip->i_mount;
1249	struct xfs_bmbt_irec imap;
1250	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
1251	xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
1252	int nimaps = `1`, error = `0`;
1253	bool shared = false;
1254	unsigned int lockmode = XFS_ILOCK_SHARED;
1255	u64 seq;
1256
1257	ASSERT(!(flags & (IOMAP_WRITE \| IOMAP_ZERO)));
1258
1259	if (xfs_is_shutdown(mp))
1260	return -EIO;
1261
1262	error = xfs_ilock_for_iomap(ip, flags, lockmode: &lockmode);
1263	if (error)
1264	return error;
1265	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1266	&nimaps, `0`);
1267	if (!error && ((flags & IOMAP_REPORT) \|\| IS_DAX(inode)))
1268	error = xfs_reflink_trim_around_shared(ip, irec: &imap, shared: &shared);
1269	seq = xfs_iomap_inode_sequence(ip, iomap_flags: shared ? IOMAP_F_SHARED : `0`);
1270	xfs_iunlock(ip, lockmode);
1271
1272	if (error)
1273	return error;
1274	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
1275	return xfs_bmbt_to_iomap(ip, iomap, imap: &imap, mapping_flags: flags,
1276	iomap_flags: shared ? IOMAP_F_SHARED : `0`, sequence_cookie: seq);
1277	}
1278
1279	const struct iomap_ops xfs_read_iomap_ops = {
1280	.iomap_begin = xfs_read_iomap_begin,
1281	};
1282
1283	static int
1284	xfs_seek_iomap_begin(
1285	struct inode *inode,
1286	loff_t offset,
1287	loff_t length,
1288	unsigned flags,
1289	struct iomap *iomap,
1290	struct iomap *srcmap)
1291	{
1292	struct xfs_inode *ip = XFS_I(inode);
1293	struct xfs_mount *mp = ip->i_mount;
1294	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
1295	xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
1296	xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
1297	struct xfs_iext_cursor icur;
1298	struct xfs_bmbt_irec imap, cmap;
1299	int error = `0`;
1300	unsigned lockmode;
1301	u64 seq;
1302
1303	if (xfs_is_shutdown(mp))
1304	return -EIO;
1305
1306	lockmode = xfs_ilock_data_map_shared(ip);
1307	error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
1308	if (error)
1309	goto out_unlock;
1310
1311	if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
1312	/*
1313	* If we found a data extent we are done.
1314	*/
1315	if (imap.br_startoff <= offset_fsb)
1316	goto done;
1317	data_fsb = imap.br_startoff;
1318	} else {
1319	/*
1320	* Fake a hole until the end of the file.
1321	*/
1322	data_fsb = xfs_iomap_end_fsb(mp, offset, length);
1323	}
1324
1325	/*
1326	* If a COW fork extent covers the hole, report it - capped to the next
1327	* data fork extent:
1328	*/
1329	if (xfs_inode_has_cow_data(ip) &&
1330	xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
1331	cow_fsb = cmap.br_startoff;
1332	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
1333	if (data_fsb < cow_fsb + cmap.br_blockcount)
1334	end_fsb = min(end_fsb, data_fsb);
1335	xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb);
1336	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
1337	error = xfs_bmbt_to_iomap(ip, iomap, imap: &cmap, mapping_flags: flags,
1338	IOMAP_F_SHARED, sequence_cookie: seq);
1339	/*
1340	* This is a COW extent, so we must probe the page cache
1341	* because there could be dirty page cache being backed
1342	* by this extent.
1343	*/
1344	iomap->type = IOMAP_UNWRITTEN;
1345	goto out_unlock;
1346	}
1347
1348	/*
1349	* Else report a hole, capped to the next found data or COW extent.
1350	*/
1351	if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
1352	imap.br_blockcount = cow_fsb - offset_fsb;
1353	else
1354	imap.br_blockcount = data_fsb - offset_fsb;
1355	imap.br_startoff = offset_fsb;
1356	imap.br_startblock = HOLESTARTBLOCK;
1357	imap.br_state = XFS_EXT_NORM;
1358	done:
1359	seq = xfs_iomap_inode_sequence(ip, iomap_flags: `0`);
1360	xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
1361	error = xfs_bmbt_to_iomap(ip, iomap, imap: &imap, mapping_flags: flags, iomap_flags: `0`, sequence_cookie: seq);
1362	out_unlock:
1363	xfs_iunlock(ip, lockmode);
1364	return error;
1365	}
1366
1367	const struct iomap_ops xfs_seek_iomap_ops = {
1368	.iomap_begin = xfs_seek_iomap_begin,
1369	};
1370
1371	static int
1372	xfs_xattr_iomap_begin(
1373	struct inode *inode,
1374	loff_t offset,
1375	loff_t length,
1376	unsigned flags,
1377	struct iomap *iomap,
1378	struct iomap *srcmap)
1379	{
1380	struct xfs_inode *ip = XFS_I(inode);
1381	struct xfs_mount *mp = ip->i_mount;
1382	xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
1383	xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
1384	struct xfs_bmbt_irec imap;
1385	int nimaps = `1`, error = `0`;
1386	unsigned lockmode;
1387	int seq;
1388
1389	if (xfs_is_shutdown(mp))
1390	return -EIO;
1391
1392	lockmode = xfs_ilock_attr_map_shared(ip);
1393
1394	/ if there are no attribute fork or extents, return ENOENT /
1395	if (!xfs_inode_has_attr_fork(ip) \|\| !ip->i_af.if_nextents) {
1396	error = -ENOENT;
1397	goto out_unlock;
1398	}
1399
1400	ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL);
1401	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1402	&nimaps, XFS_BMAPI_ATTRFORK);
1403	out_unlock:
1404
1405	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR);
1406	xfs_iunlock(ip, lockmode);
1407
1408	if (error)
1409	return error;
1410	ASSERT(nimaps);
1411	return xfs_bmbt_to_iomap(ip, iomap, imap: &imap, mapping_flags: flags, IOMAP_F_XATTR, sequence_cookie: seq);
1412	}
1413
1414	const struct iomap_ops xfs_xattr_iomap_ops = {
1415	.iomap_begin = xfs_xattr_iomap_begin,
1416	};
1417
1418	int
1419	xfs_zero_range(
1420	struct xfs_inode *ip,
1421	loff_t pos,
1422	loff_t len,
1423	bool *did_zero)
1424	{
1425	struct inode *inode = VFS_I(ip);
1426
1427	if (IS_DAX(inode))
1428	return dax_zero_range(inode, pos, len, did_zero,
1429	ops: &xfs_dax_write_iomap_ops);
1430	return iomap_zero_range(inode, pos, len, did_zero,
1431	ops: &xfs_buffered_write_iomap_ops);
1432	}
1433
1434	int
1435	xfs_truncate_page(
1436	struct xfs_inode *ip,
1437	loff_t pos,
1438	bool *did_zero)
1439	{
1440	struct inode *inode = VFS_I(ip);
1441
1442	if (IS_DAX(inode))
1443	return dax_truncate_page(inode, pos, did_zero,
1444	ops: &xfs_dax_write_iomap_ops);
1445	return iomap_truncate_page(inode, pos, did_zero,
1446	ops: &xfs_buffered_write_iomap_ops);
1447	}
1448

source code of linux/fs/xfs/xfs_iomap.c