xfs_bmap_util.c source code [linux/fs/xfs/xfs_bmap_util.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2006 Silicon Graphics, Inc.
4	* Copyright (c) 2012 Red Hat, Inc.
5	* All Rights Reserved.
6	*/
7	#include "xfs.h"
8	#include "xfs_fs.h"
9	#include "xfs_shared.h"
10	#include "xfs_format.h"
11	#include "xfs_log_format.h"
12	#include "xfs_trans_resv.h"
13	#include "xfs_bit.h"
14	#include "xfs_mount.h"
15	#include "xfs_defer.h"
16	#include "xfs_inode.h"
17	#include "xfs_btree.h"
18	#include "xfs_trans.h"
19	#include "xfs_alloc.h"
20	#include "xfs_bmap.h"
21	#include "xfs_bmap_util.h"
22	#include "xfs_bmap_btree.h"
23	#include "xfs_rtalloc.h"
24	#include "xfs_error.h"
25	#include "xfs_quota.h"
26	#include "xfs_trans_space.h"
27	#include "xfs_trace.h"
28	#include "xfs_icache.h"
29	#include "xfs_iomap.h"
30	#include "xfs_reflink.h"
31	#include "xfs_rtbitmap.h"
32
/* Kernel only BMAP related definitions and functions */
34
35	/*
36	* Convert the given file system block to a disk block. We have to treat it
37	* differently based on whether the file is a real time file or not, because the
38	* bmap code does.
39	*/
40	xfs_daddr_t
41	xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
42	{
43	if (XFS_IS_REALTIME_INODE(ip))
44	return XFS_FSB_TO_BB(ip->i_mount, fsb);
45	return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
46	}
47
48	/*
49	* Routine to zero an extent on disk allocated to the specific inode.
50	*
51	* The VFS functions take a linearised filesystem block offset, so we have to
52	* convert the sparse xfs fsb to the right format first.
53	* VFS types are real funky, too.
54	*/
55	int
56	xfs_zero_extent(
57	struct xfs_inode *ip,
58	xfs_fsblock_t start_fsb,
59	xfs_off_t count_fsb)
60	{
61	struct xfs_mount *mp = ip->i_mount;
62	struct xfs_buftarg *target = xfs_inode_buftarg(ip);
63	xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
64	sector_t block = XFS_BB_TO_FSBT(mp, sector);
65
66	return blkdev_issue_zeroout(bdev: target->bt_bdev,
67	sector: block << (mp->m_super->s_blocksize_bits - `9`),
68	nr_sects: count_fsb << (mp->m_super->s_blocksize_bits - `9`),
69	GFP_KERNEL, flags: `0`);
70	}
71
72	/*
73	* Extent tree block counting routines.
74	*/
75
76	/*
77	* Count leaf blocks given a range of extent records. Delayed allocation
78	* extents are not counted towards the totals.
79	*/
80	xfs_extnum_t
81	xfs_bmap_count_leaves(
82	struct xfs_ifork *ifp,
83	xfs_filblks_t *count)
84	{
85	struct xfs_iext_cursor icur;
86	struct xfs_bmbt_irec got;
87	xfs_extnum_t numrecs = `0`;
88
89	for_each_xfs_iext(ifp, &icur, &got) {
90	if (!isnullstartblock(got.br_startblock)) {
91	*count += got.br_blockcount;
92	numrecs++;
93	}
94	}
95
96	return numrecs;
97	}
98
99	/*
100	* Count fsblocks of the given fork. Delayed allocation extents are
101	* not counted towards the totals.
102	*/
103	int
104	xfs_bmap_count_blocks(
105	struct xfs_trans *tp,
106	struct xfs_inode *ip,
107	int whichfork,
108	xfs_extnum_t *nextents,
109	xfs_filblks_t *count)
110	{
111	struct xfs_mount *mp = ip->i_mount;
112	struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
113	struct xfs_btree_cur *cur;
114	xfs_extlen_t btblocks = `0`;
115	int error;
116
117	*nextents = `0`;
118	*count = `0`;
119
120	if (!ifp)
121	return `0`;
122
123	switch (ifp->if_format) {
124	case XFS_DINODE_FMT_BTREE:
125	error = xfs_iread_extents(tp, ip, whichfork);
126	if (error)
127	return error;
128
129	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
130	error = xfs_btree_count_blocks(cur, &btblocks);
131	xfs_btree_del_cursor(cur, error);
132	if (error)
133	return error;
134
135	/*
136	* xfs_btree_count_blocks includes the root block contained in
137	* the inode fork in @btblocks, so subtract one because we're
138	* only interested in allocated disk blocks.
139	*/
140	*count += btblocks - `1`;
141
142	fallthrough;
143	case XFS_DINODE_FMT_EXTENTS:
144	*nextents = xfs_bmap_count_leaves(ifp, count);
145	break;
146	}
147
148	return `0`;
149	}
150
151	static int
152	xfs_getbmap_report_one(
153	struct xfs_inode *ip,
154	struct getbmapx *bmv,
155	struct kgetbmap *out,
156	int64_t bmv_end,
157	struct xfs_bmbt_irec *got)
158	{
159	struct kgetbmap *p = out + bmv->bmv_entries;
160	bool shared = false;
161	int error;
162
163	error = xfs_reflink_trim_around_shared(ip, irec: got, shared: &shared);
164	if (error)
165	return error;
166
167	if (isnullstartblock(got->br_startblock) \|\|
168	got->br_startblock == DELAYSTARTBLOCK) {
169	/*
170	* Take the flush completion as being a point-in-time snapshot
171	* where there are no delalloc extents, and if any new ones
172	* have been created racily, just skip them as being 'after'
173	* the flush and so don't get reported.
174	*/
175	if (!(bmv->bmv_iflags & BMV_IF_DELALLOC))
176	return `0`;
177
178	p->bmv_oflags \|= BMV_OF_DELALLOC;
179	p->bmv_block = -`2`;
180	} else {
181	p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
182	}
183
184	if (got->br_state == XFS_EXT_UNWRITTEN &&
185	(bmv->bmv_iflags & BMV_IF_PREALLOC))
186	p->bmv_oflags \|= BMV_OF_PREALLOC;
187
188	if (shared)
189	p->bmv_oflags \|= BMV_OF_SHARED;
190
191	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
192	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
193
194	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
195	bmv->bmv_length = max(`0LL`, bmv_end - bmv->bmv_offset);
196	bmv->bmv_entries++;
197	return `0`;
198	}
199
200	static void
201	xfs_getbmap_report_hole(
202	struct xfs_inode *ip,
203	struct getbmapx *bmv,
204	struct kgetbmap *out,
205	int64_t bmv_end,
206	xfs_fileoff_t bno,
207	xfs_fileoff_t end)
208	{
209	struct kgetbmap *p = out + bmv->bmv_entries;
210
211	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
212	return;
213
214	p->bmv_block = -`1`;
215	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
216	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
217
218	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
219	bmv->bmv_length = max(`0LL`, bmv_end - bmv->bmv_offset);
220	bmv->bmv_entries++;
221	}
222
223	static inline bool
224	xfs_getbmap_full(
225	struct getbmapx *bmv)
226	{
227	return bmv->bmv_length == `0` \|\| bmv->bmv_entries >= bmv->bmv_count - `1`;
228	}
229
230	static bool
231	xfs_getbmap_next_rec(
232	struct xfs_bmbt_irec *rec,
233	xfs_fileoff_t total_end)
234	{
235	xfs_fileoff_t end = rec->br_startoff + rec->br_blockcount;
236
237	if (end == total_end)
238	return false;
239
240	rec->br_startoff += rec->br_blockcount;
241	if (!isnullstartblock(rec->br_startblock) &&
242	rec->br_startblock != DELAYSTARTBLOCK)
243	rec->br_startblock += rec->br_blockcount;
244	rec->br_blockcount = total_end - end;
245	return true;
246	}
247
248	/*
249	* Get inode's extents as described in bmv, and format for output.
250	* Calls formatter to fill the user's buffer until all extents
251	* are mapped, until the passed-in bmv->bmv_count slots have
252	* been filled, or until the formatter short-circuits the loop,
253	* if it is tracking filled-in extents on its own.
254	*/
255	int / error code /
256	xfs_getbmap(
257	struct xfs_inode *ip,
258	struct getbmapx bmv, /* user bmap structure /
259	struct kgetbmap *out)
260	{
261	struct xfs_mount *mp = ip->i_mount;
262	int iflags = bmv->bmv_iflags;
263	int whichfork, lock, error = `0`;
264	int64_t bmv_end, max_len;
265	xfs_fileoff_t bno, first_bno;
266	struct xfs_ifork *ifp;
267	struct xfs_bmbt_irec got, rec;
268	xfs_filblks_t len;
269	struct xfs_iext_cursor icur;
270
271	if (bmv->bmv_iflags & ~BMV_IF_VALID)
272	return -EINVAL;
273	#ifndef DEBUG
274	/ Only allow CoW fork queries if we're debugging. /
275	if (iflags & BMV_IF_COWFORK)
276	return -EINVAL;
277	#endif
278	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
279	return -EINVAL;
280
281	if (bmv->bmv_length < -`1`)
282	return -EINVAL;
283	bmv->bmv_entries = `0`;
284	if (bmv->bmv_length == `0`)
285	return `0`;
286
287	if (iflags & BMV_IF_ATTRFORK)
288	whichfork = XFS_ATTR_FORK;
289	else if (iflags & BMV_IF_COWFORK)
290	whichfork = XFS_COW_FORK;
291	else
292	whichfork = XFS_DATA_FORK;
293
294	xfs_ilock(ip, XFS_IOLOCK_SHARED);
295	switch (whichfork) {
296	case XFS_ATTR_FORK:
297	lock = xfs_ilock_attr_map_shared(ip);
298	if (!xfs_inode_has_attr_fork(ip))
299	goto out_unlock_ilock;
300
301	max_len = `1LL` << `32`;
302	break;
303	case XFS_COW_FORK:
304	lock = XFS_ILOCK_SHARED;
305	xfs_ilock(ip, lock);
306
307	/ No CoW fork? Just return /
308	if (!xfs_ifork_ptr(ip, whichfork))
309	goto out_unlock_ilock;
310
311	if (xfs_get_cowextsz_hint(ip))
312	max_len = mp->m_super->s_maxbytes;
313	else
314	max_len = XFS_ISIZE(ip);
315	break;
316	case XFS_DATA_FORK:
317	if (!(iflags & BMV_IF_DELALLOC) &&
318	(ip->i_delayed_blks \|\| XFS_ISIZE(ip) > ip->i_disk_size)) {
319	error = filemap_write_and_wait(mapping: VFS_I(ip)->i_mapping);
320	if (error)
321	goto out_unlock_iolock;
322
323	/*
324	* Even after flushing the inode, there can still be
325	* delalloc blocks on the inode beyond EOF due to
326	* speculative preallocation. These are not removed
327	* until the release function is called or the inode
328	* is inactivated. Hence we cannot assert here that
329	* ip->i_delayed_blks == 0.
330	*/
331	}
332
333	if (xfs_get_extsz_hint(ip) \|\|
334	(ip->i_diflags &
335	(XFS_DIFLAG_PREALLOC \| XFS_DIFLAG_APPEND)))
336	max_len = mp->m_super->s_maxbytes;
337	else
338	max_len = XFS_ISIZE(ip);
339
340	lock = xfs_ilock_data_map_shared(ip);
341	break;
342	}
343
344	ifp = xfs_ifork_ptr(ip, whichfork);
345
346	switch (ifp->if_format) {
347	case XFS_DINODE_FMT_EXTENTS:
348	case XFS_DINODE_FMT_BTREE:
349	break;
350	case XFS_DINODE_FMT_LOCAL:
351	/ Local format inode forks report no extents. /
352	goto out_unlock_ilock;
353	default:
354	error = -EINVAL;
355	goto out_unlock_ilock;
356	}
357
358	if (bmv->bmv_length == -`1`) {
359	max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
360	bmv->bmv_length = max(`0LL`, max_len - bmv->bmv_offset);
361	}
362
363	bmv_end = bmv->bmv_offset + bmv->bmv_length;
364
365	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
366	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
367
368	error = xfs_iread_extents(NULL, ip, whichfork);
369	if (error)
370	goto out_unlock_ilock;
371
372	if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
373	/*
374	* Report a whole-file hole if the delalloc flag is set to
375	* stay compatible with the old implementation.
376	*/
377	if (iflags & BMV_IF_DELALLOC)
378	xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
379	XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
380	goto out_unlock_ilock;
381	}
382
383	while (!xfs_getbmap_full(bmv)) {
384	xfs_trim_extent(&got, first_bno, len);
385
386	/*
387	* Report an entry for a hole if this extent doesn't directly
388	* follow the previous one.
389	*/
390	if (got.br_startoff > bno) {
391	xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
392	got.br_startoff);
393	if (xfs_getbmap_full(bmv))
394	break;
395	}
396
397	/*
398	* In order to report shared extents accurately, we report each
399	* distinct shared / unshared part of a single bmbt record with
400	* an individual getbmapx record.
401	*/
402	bno = got.br_startoff + got.br_blockcount;
403	rec = got;
404	do {
405	error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
406	got: &rec);
407	if (error \|\| xfs_getbmap_full(bmv))
408	goto out_unlock_ilock;
409	} while (xfs_getbmap_next_rec(&rec, bno));
410
411	if (!xfs_iext_next_extent(ifp, &icur, &got)) {
412	xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
413
414	if (bmv->bmv_entries > `0`)
415	out[bmv->bmv_entries - `1`].bmv_oflags \|=
416	BMV_OF_LAST;
417
418	if (whichfork != XFS_ATTR_FORK && bno < end &&
419	!xfs_getbmap_full(bmv)) {
420	xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
421	bno, end);
422	}
423	break;
424	}
425
426	if (bno >= first_bno + len)
427	break;
428	}
429
430	out_unlock_ilock:
431	xfs_iunlock(ip, lock);
432	out_unlock_iolock:
433	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
434	return error;
435	}
436
437	/*
438	* Dead simple method of punching delalyed allocation blocks from a range in
439	* the inode. This will always punch out both the start and end blocks, even
440	* if the ranges only partially overlap them, so it is up to the caller to
441	* ensure that partial blocks are not passed in.
442	*/
443	int
444	xfs_bmap_punch_delalloc_range(
445	struct xfs_inode *ip,
446	xfs_off_t start_byte,
447	xfs_off_t end_byte)
448	{
449	struct xfs_mount *mp = ip->i_mount;
450	struct xfs_ifork *ifp = &ip->i_df;
451	xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
452	xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
453	struct xfs_bmbt_irec got, del;
454	struct xfs_iext_cursor icur;
455	int error = `0`;
456
457	ASSERT(!xfs_need_iread_extents(ifp));
458
459	xfs_ilock(ip, XFS_ILOCK_EXCL);
460	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
461	goto out_unlock;
462
463	while (got.br_startoff + got.br_blockcount > start_fsb) {
464	del = got;
465	xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);
466
467	/*
468	* A delete can push the cursor forward. Step back to the
469	* previous extent on non-delalloc or extents outside the
470	* target range.
471	*/
472	if (!del.br_blockcount \|\|
473	!isnullstartblock(del.br_startblock)) {
474	if (!xfs_iext_prev_extent(ifp, &icur, &got))
475	break;
476	continue;
477	}
478
479	error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
480	&got, &del);
481	if (error \|\| !xfs_iext_get_extent(ifp, &icur, &got))
482	break;
483	}
484
485	out_unlock:
486	xfs_iunlock(ip, XFS_ILOCK_EXCL);
487	return error;
488	}
489
490	/*
491	* Test whether it is appropriate to check an inode for and free post EOF
492	* blocks. The 'force' parameter determines whether we should also consider
493	* regular files that are marked preallocated or append-only.
494	*/
495	bool
496	xfs_can_free_eofblocks(
497	struct xfs_inode *ip,
498	bool force)
499	{
500	struct xfs_bmbt_irec imap;
501	struct xfs_mount *mp = ip->i_mount;
502	xfs_fileoff_t end_fsb;
503	xfs_fileoff_t last_fsb;
504	int nimaps = `1`;
505	int error;
506
507	/*
508	* Caller must either hold the exclusive io lock; or be inactivating
509	* the inode, which guarantees there are no other users of the inode.
510	*/
511	if (!(VFS_I(ip)->i_state & I_FREEING))
512	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
513
514	/ prealloc/delalloc exists only on regular files /
515	if (!S_ISREG(VFS_I(ip)->i_mode))
516	return false;
517
518	/*
519	* Zero sized files with no cached pages and delalloc blocks will not
520	* have speculative prealloc/delalloc blocks to remove.
521	*/
522	if (VFS_I(ip)->i_size == `0` &&
523	VFS_I(ip)->i_mapping->nrpages == `0` &&
524	ip->i_delayed_blks == `0`)
525	return false;
526
527	/ If we haven't read in the extent list, then don't do it now. /
528	if (xfs_need_iread_extents(&ip->i_df))
529	return false;
530
531	/*
532	* Do not free real preallocated or append-only files unless the file
533	* has delalloc blocks and we are forced to remove them.
534	*/
535	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC \| XFS_DIFLAG_APPEND))
536	if (!force \|\| ip->i_delayed_blks == `0`)
537	return false;
538
539	/*
540	* Do not try to free post-EOF blocks if EOF is beyond the end of the
541	* range supported by the page cache, because the truncation will loop
542	* forever.
543	*/
544	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
545	if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > `1`)
546	end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
547	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
548	if (last_fsb <= end_fsb)
549	return false;
550
551	/*
552	* Look up the mapping for the first block past EOF. If we can't find
553	* it, there's nothing to free.
554	*/
555	xfs_ilock(ip, XFS_ILOCK_SHARED);
556	error = xfs_bmapi_read(ip, end_fsb, last_fsb - end_fsb, &imap, &nimaps,
557	`0`);
558	xfs_iunlock(ip, XFS_ILOCK_SHARED);
559	if (error \|\| nimaps == `0`)
560	return false;
561
562	/*
563	* If there's a real mapping there or there are delayed allocation
564	* reservations, then we have post-EOF blocks to try to free.
565	*/
566	return imap.br_startblock != HOLESTARTBLOCK \|\| ip->i_delayed_blks;
567	}
568
569	/*
570	* This is called to free any blocks beyond eof. The caller must hold
571	* IOLOCK_EXCL unless we are in the inode reclaim path and have the only
572	* reference to the inode.
573	*/
574	int
575	xfs_free_eofblocks(
576	struct xfs_inode *ip)
577	{
578	struct xfs_trans *tp;
579	struct xfs_mount *mp = ip->i_mount;
580	int error;
581
582	/ Attach the dquots to the inode up front. /
583	error = xfs_qm_dqattach(ip);
584	if (error)
585	return error;
586
587	/ Wait on dio to ensure i_size has settled. /
588	inode_dio_wait(inode: VFS_I(ip));
589
590	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_itruncate, blocks: `0`, rtextents: `0`, flags: `0`, tpp: &tp);
591	if (error) {
592	ASSERT(xfs_is_shutdown(mp));
593	return error;
594	}
595
596	xfs_ilock(ip, XFS_ILOCK_EXCL);
597	xfs_trans_ijoin(tp, ip, `0`);
598
599	/*
600	* Do not update the on-disk file size. If we update the on-disk file
601	* size and then the system crashes before the contents of the file are
602	* flushed to disk then the files may be full of holes (ie NULL files
603	* bug).
604	*/
605	error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
606	XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
607	if (error)
608	goto err_cancel;
609
610	error = xfs_trans_commit(tp);
611	if (error)
612	goto out_unlock;
613
614	xfs_inode_clear_eofblocks_tag(ip);
615	goto out_unlock;
616
617	err_cancel:
618	/*
619	* If we get an error at this point we simply don't
620	* bother truncating the file.
621	*/
622	xfs_trans_cancel(tp);
623	out_unlock:
624	xfs_iunlock(ip, XFS_ILOCK_EXCL);
625	return error;
626	}
627
628	int
629	xfs_alloc_file_space(
630	struct xfs_inode *ip,
631	xfs_off_t offset,
632	xfs_off_t len)
633	{
634	xfs_mount_t *mp = ip->i_mount;
635	xfs_off_t count;
636	xfs_filblks_t allocatesize_fsb;
637	xfs_extlen_t extsz, temp;
638	xfs_fileoff_t startoffset_fsb;
639	xfs_fileoff_t endoffset_fsb;
640	int rt;
641	xfs_trans_t *tp;
642	xfs_bmbt_irec_t imaps[`1`], *imapp;
643	int error;
644
645	trace_xfs_alloc_file_space(ip);
646
647	if (xfs_is_shutdown(mp))
648	return -EIO;
649
650	error = xfs_qm_dqattach(ip);
651	if (error)
652	return error;
653
654	if (len <= `0`)
655	return -EINVAL;
656
657	rt = XFS_IS_REALTIME_INODE(ip);
658	extsz = xfs_get_extsz_hint(ip);
659
660	count = len;
661	imapp = &imaps[`0`];
662	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
663	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
664	allocatesize_fsb = endoffset_fsb - startoffset_fsb;
665
666	/*
667	* Allocate file space until done or until there is an error
668	*/
669	while (allocatesize_fsb && !error) {
670	xfs_fileoff_t s, e;
671	unsigned int dblocks, rblocks, resblks;
672	int nimaps = `1`;
673
674	/*
675	* Determine space reservations for data/realtime.
676	*/
677	if (unlikely(extsz)) {
678	s = startoffset_fsb;
679	do_div(s, extsz);
680	s *= extsz;
681	e = startoffset_fsb + allocatesize_fsb;
682	div_u64_rem(startoffset_fsb, extsz, &temp);
683	if (temp)
684	e += temp;
685	div_u64_rem(e, extsz, &temp);
686	if (temp)
687	e += extsz - temp;
688	} else {
689	s = `0`;
690	e = allocatesize_fsb;
691	}
692
693	/*
694	* The transaction reservation is limited to a 32-bit block
695	* count, hence we need to limit the number of blocks we are
696	* trying to reserve to avoid an overflow. We can't allocate
697	* more than @nimaps extents, and an extent is limited on disk
698	* to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
699	* limit.
700	*/
701	resblks = min_t(xfs_fileoff_t, (e - s),
702	(XFS_MAX_BMBT_EXTLEN * nimaps));
703	if (unlikely(rt)) {
704	dblocks = XFS_DIOSTRAT_SPACE_RES(mp, `0`);
705	rblocks = resblks;
706	} else {
707	dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
708	rblocks = `0`;
709	}
710
711	error = xfs_trans_alloc_inode(ip, resv: &M_RES(mp)->tr_write,
712	dblocks, rblocks, force: false, tpp: &tp);
713	if (error)
714	break;
715
716	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
717	XFS_IEXT_ADD_NOSPLIT_CNT);
718	if (error == -EFBIG)
719	error = xfs_iext_count_upgrade(tp, ip,
720	XFS_IEXT_ADD_NOSPLIT_CNT);
721	if (error)
722	goto error;
723
724	error = xfs_bmapi_write(tp, ip, startoffset_fsb,
725	allocatesize_fsb, XFS_BMAPI_PREALLOC, `0`, imapp,
726	&nimaps);
727	if (error)
728	goto error;
729
730	ip->i_diflags \|= XFS_DIFLAG_PREALLOC;
731	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
732
733	error = xfs_trans_commit(tp);
734	xfs_iunlock(ip, XFS_ILOCK_EXCL);
735	if (error)
736	break;
737
738	/*
739	* If the allocator cannot find a single free extent large
740	* enough to cover the start block of the requested range,
741	* xfs_bmapi_write will return 0 but leave *nimaps set to 0.
742	*
743	* In that case we simply need to keep looping with the same
744	* startoffset_fsb so that one of the following allocations
745	* will eventually reach the requested range.
746	*/
747	if (nimaps) {
748	startoffset_fsb += imapp->br_blockcount;
749	allocatesize_fsb -= imapp->br_blockcount;
750	}
751	}
752
753	return error;
754
755	error:
756	xfs_trans_cancel(tp);
757	xfs_iunlock(ip, XFS_ILOCK_EXCL);
758	return error;
759	}
760
761	static int
762	xfs_unmap_extent(
763	struct xfs_inode *ip,
764	xfs_fileoff_t startoffset_fsb,
765	xfs_filblks_t len_fsb,
766	int *done)
767	{
768	struct xfs_mount *mp = ip->i_mount;
769	struct xfs_trans *tp;
770	uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, `0`);
771	int error;
772
773	error = xfs_trans_alloc_inode(ip, resv: &M_RES(mp)->tr_write, dblocks: resblks, rblocks: `0`,
774	force: false, tpp: &tp);
775	if (error)
776	return error;
777
778	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
779	XFS_IEXT_PUNCH_HOLE_CNT);
780	if (error == -EFBIG)
781	error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
782	if (error)
783	goto out_trans_cancel;
784
785	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, `0`, `2`, done);
786	if (error)
787	goto out_trans_cancel;
788
789	error = xfs_trans_commit(tp);
790	out_unlock:
791	xfs_iunlock(ip, XFS_ILOCK_EXCL);
792	return error;
793
794	out_trans_cancel:
795	xfs_trans_cancel(tp);
796	goto out_unlock;
797	}
798
799	/ Caller must first wait for the completion of any pending DIOs if required. /
800	int
801	xfs_flush_unmap_range(
802	struct xfs_inode *ip,
803	xfs_off_t offset,
804	xfs_off_t len)
805	{
806	struct xfs_mount *mp = ip->i_mount;
807	struct inode *inode = VFS_I(ip);
808	xfs_off_t rounding, start, end;
809	int error;
810
811	rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE);
812	start = round_down(offset, rounding);
813	end = round_up(offset + len, rounding) - `1`;
814
815	error = filemap_write_and_wait_range(mapping: inode->i_mapping, lstart: start, lend: end);
816	if (error)
817	return error;
818	truncate_pagecache_range(inode, offset: start, end);
819	return `0`;
820	}
821
822	int
823	xfs_free_file_space(
824	struct xfs_inode *ip,
825	xfs_off_t offset,
826	xfs_off_t len)
827	{
828	struct xfs_mount *mp = ip->i_mount;
829	xfs_fileoff_t startoffset_fsb;
830	xfs_fileoff_t endoffset_fsb;
831	int done = `0`, error;
832
833	trace_xfs_free_file_space(ip);
834
835	error = xfs_qm_dqattach(ip);
836	if (error)
837	return error;
838
839	if (len <= `0`) / if nothing being freed /
840	return `0`;
841
842	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
843	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
844
845	/ We can only free complete realtime extents. /
846	if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > `1`) {
847	startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
848	endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
849	}
850
851	/*
852	* Need to zero the stuff we're not freeing, on disk.
853	*/
854	if (endoffset_fsb > startoffset_fsb) {
855	while (!done) {
856	error = xfs_unmap_extent(ip, startoffset_fsb,
857	endoffset_fsb - startoffset_fsb, &done);
858	if (error)
859	return error;
860	}
861	}
862
863	/*
864	* Now that we've unmap all full blocks we'll have to zero out any
865	* partial block at the beginning and/or end. xfs_zero_range is smart
866	* enough to skip any holes, including those we just created, but we
867	* must take care not to zero beyond EOF and enlarge i_size.
868	*/
869	if (offset >= XFS_ISIZE(ip))
870	return `0`;
871	if (offset + len > XFS_ISIZE(ip))
872	len = XFS_ISIZE(ip) - offset;
873	error = xfs_zero_range(ip, pos: offset, len, NULL);
874	if (error)
875	return error;
876
877	/*
878	* If we zeroed right up to EOF and EOF straddles a page boundary we
879	* must make sure that the post-EOF area is also zeroed because the
880	* page could be mmap'd and xfs_zero_range doesn't do that for us.
881	* Writeback of the eof page will do this, albeit clumsily.
882	*/
883	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > `0`) {
884	error = filemap_write_and_wait_range(mapping: VFS_I(ip)->i_mapping,
885	round_down(offset + len, PAGE_SIZE), LLONG_MAX);
886	}
887
888	return error;
889	}
890
891	static int
892	xfs_prepare_shift(
893	struct xfs_inode *ip,
894	loff_t offset)
895	{
896	struct xfs_mount *mp = ip->i_mount;
897	int error;
898
899	/*
900	* Trim eofblocks to avoid shifting uninitialized post-eof preallocation
901	* into the accessible region of the file.
902	*/
903	if (xfs_can_free_eofblocks(ip, force: true)) {
904	error = xfs_free_eofblocks(ip);
905	if (error)
906	return error;
907	}
908
909	/*
910	* Shift operations must stabilize the start block offset boundary along
911	* with the full range of the operation. If we don't, a COW writeback
912	* completion could race with an insert, front merge with the start
913	* extent (after split) during the shift and corrupt the file. Start
914	* with the block just prior to the start to stabilize the boundary.
915	*/
916	offset = round_down(offset, mp->m_sb.sb_blocksize);
917	if (offset)
918	offset -= mp->m_sb.sb_blocksize;
919
920	/*
921	* Writeback and invalidate cache for the remainder of the file as we're
922	* about to shift down every extent from offset to EOF.
923	*/
924	error = xfs_flush_unmap_range(ip, offset, len: XFS_ISIZE(ip));
925	if (error)
926	return error;
927
928	/*
929	* Clean out anything hanging around in the cow fork now that
930	* we've flushed all the dirty data out to disk to avoid having
931	* CoW extents at the wrong offsets.
932	*/
933	if (xfs_inode_has_cow_data(ip)) {
934	error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
935	true);
936	if (error)
937	return error;
938	}
939
940	return `0`;
941	}
942
943	/*
944	* xfs_collapse_file_space()
945	* This routine frees disk space and shift extent for the given file.
946	* The first thing we do is to free data blocks in the specified range
947	* by calling xfs_free_file_space(). It would also sync dirty data
948	* and invalidate page cache over the region on which collapse range
949	* is working. And Shift extent records to the left to cover a hole.
950	* RETURNS:
951	* 0 on success
952	* errno on error
953	*
954	*/
955	int
956	xfs_collapse_file_space(
957	struct xfs_inode *ip,
958	xfs_off_t offset,
959	xfs_off_t len)
960	{
961	struct xfs_mount *mp = ip->i_mount;
962	struct xfs_trans *tp;
963	int error;
964	xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
965	xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
966	bool done = false;
967
968	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL \| XFS_MMAPLOCK_EXCL);
969
970	trace_xfs_collapse_file_space(ip);
971
972	error = xfs_free_file_space(ip, offset, len);
973	if (error)
974	return error;
975
976	error = xfs_prepare_shift(ip, offset);
977	if (error)
978	return error;
979
980	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_write, blocks: `0`, rtextents: `0`, flags: `0`, tpp: &tp);
981	if (error)
982	return error;
983
984	xfs_ilock(ip, XFS_ILOCK_EXCL);
985	xfs_trans_ijoin(tp, ip, `0`);
986
987	while (!done) {
988	error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
989	&done);
990	if (error)
991	goto out_trans_cancel;
992	if (done)
993	break;
994
995	/ finish any deferred frees and roll the transaction /
996	error = xfs_defer_finish(&tp);
997	if (error)
998	goto out_trans_cancel;
999	}
1000
1001	error = xfs_trans_commit(tp);
1002	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1003	return error;
1004
1005	out_trans_cancel:
1006	xfs_trans_cancel(tp);
1007	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1008	return error;
1009	}
1010
1011	/*
1012	* xfs_insert_file_space()
1013	* This routine create hole space by shifting extents for the given file.
1014	* The first thing we do is to sync dirty data and invalidate page cache
1015	* over the region on which insert range is working. And split an extent
1016	* to two extents at given offset by calling xfs_bmap_split_extent.
1017	* And shift all extent records which are laying between [offset,
1018	* last allocated extent] to the right to reserve hole range.
1019	* RETURNS:
1020	* 0 on success
1021	* errno on error
1022	*/
1023	int
1024	xfs_insert_file_space(
1025	struct xfs_inode *ip,
1026	loff_t offset,
1027	loff_t len)
1028	{
1029	struct xfs_mount *mp = ip->i_mount;
1030	struct xfs_trans *tp;
1031	int error;
1032	xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, offset);
1033	xfs_fileoff_t next_fsb = NULLFSBLOCK;
1034	xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
1035	bool done = false;
1036
1037	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL \| XFS_MMAPLOCK_EXCL);
1038
1039	trace_xfs_insert_file_space(ip);
1040
1041	error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
1042	if (error)
1043	return error;
1044
1045	error = xfs_prepare_shift(ip, offset);
1046	if (error)
1047	return error;
1048
1049	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_write,
1050	blocks: XFS_DIOSTRAT_SPACE_RES(mp, `0`), rtextents: `0`, flags: `0`, tpp: &tp);
1051	if (error)
1052	return error;
1053
1054	xfs_ilock(ip, XFS_ILOCK_EXCL);
1055	xfs_trans_ijoin(tp, ip, `0`);
1056
1057	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
1058	XFS_IEXT_PUNCH_HOLE_CNT);
1059	if (error == -EFBIG)
1060	error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
1061	if (error)
1062	goto out_trans_cancel;
1063
1064	/*
1065	* The extent shifting code works on extent granularity. So, if stop_fsb
1066	* is not the starting block of extent, we need to split the extent at
1067	* stop_fsb.
1068	*/
1069	error = xfs_bmap_split_extent(tp, ip, stop_fsb);
1070	if (error)
1071	goto out_trans_cancel;
1072
1073	do {
1074	error = xfs_defer_finish(&tp);
1075	if (error)
1076	goto out_trans_cancel;
1077
1078	error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
1079	&done, stop_fsb);
1080	if (error)
1081	goto out_trans_cancel;
1082	} while (!done);
1083
1084	error = xfs_trans_commit(tp);
1085	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1086	return error;
1087
1088	out_trans_cancel:
1089	xfs_trans_cancel(tp);
1090	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1091	return error;
1092	}
1093
1094	/*
1095	* We need to check that the format of the data fork in the temporary inode is
1096	* valid for the target inode before doing the swap. This is not a problem with
1097	* attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1098	* data fork depending on the space the attribute fork is taking so we can get
1099	* invalid formats on the target inode.
1100	*
1101	* E.g. target has space for 7 extents in extent format, temp inode only has
1102	* space for 6. If we defragment down to 7 extents, then the tmp format is a
1103	* btree, but when swapped it needs to be in extent format. Hence we can't just
1104	* blindly swap data forks on attr2 filesystems.
1105	*
1106	* Note that we check the swap in both directions so that we don't end up with
1107	* a corrupt temporary inode, either.
1108	*
1109	* Note that fixing the way xfs_fsr sets up the attribute fork in the source
1110	* inode will prevent this situation from occurring, so all we do here is
1111	* reject and log the attempt. basically we are putting the responsibility on
1112	* userspace to get this right.
1113	*/
1114	static int
1115	xfs_swap_extents_check_format(
1116	struct xfs_inode ip, /* target inode /
1117	struct xfs_inode tip) /* tmp inode /
1118	{
1119	struct xfs_ifork *ifp = &ip->i_df;
1120	struct xfs_ifork *tifp = &tip->i_df;
1121
1122	/ User/group/project quota ids must match if quotas are enforced. /
1123	if (XFS_IS_QUOTA_ON(ip->i_mount) &&
1124	(!uid_eq(left: VFS_I(ip)->i_uid, right: VFS_I(ip: tip)->i_uid) \|\|
1125	!gid_eq(left: VFS_I(ip)->i_gid, right: VFS_I(ip: tip)->i_gid) \|\|
1126	ip->i_projid != tip->i_projid))
1127	return -EINVAL;
1128
1129	/ Should never get a local format /
1130	if (ifp->if_format == XFS_DINODE_FMT_LOCAL \|\|
1131	tifp->if_format == XFS_DINODE_FMT_LOCAL)
1132	return -EINVAL;
1133
1134	/*
1135	* if the target inode has less extents that then temporary inode then
1136	* why did userspace call us?
1137	*/
1138	if (ifp->if_nextents < tifp->if_nextents)
1139	return -EINVAL;
1140
1141	/*
1142	* If we have to use the (expensive) rmap swap method, we can
1143	* handle any number of extents and any format.
1144	*/
1145	if (xfs_has_rmapbt(mp: ip->i_mount))
1146	return `0`;
1147
1148	/*
1149	* if the target inode is in extent form and the temp inode is in btree
1150	* form then we will end up with the target inode in the wrong format
1151	* as we already know there are less extents in the temp inode.
1152	*/
1153	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
1154	tifp->if_format == XFS_DINODE_FMT_BTREE)
1155	return -EINVAL;
1156
1157	/ Check temp in extent form to max in target /
1158	if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
1159	tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1160	return -EINVAL;
1161
1162	/ Check target in extent form to max in temp /
1163	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
1164	ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1165	return -EINVAL;
1166
1167	/*
1168	* If we are in a btree format, check that the temp root block will fit
1169	* in the target and that it has enough extents to be in btree format
1170	* in the target.
1171	*
1172	* Note that we have to be careful to allow btree->extent conversions
1173	* (a common defrag case) which will occur when the temp inode is in
1174	* extent format...
1175	*/
1176	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
1177	if (xfs_inode_has_attr_fork(ip) &&
1178	XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
1179	return -EINVAL;
1180	if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1181	return -EINVAL;
1182	}
1183
1184	/ Reciprocal target->temp btree format checks /
1185	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
1186	if (xfs_inode_has_attr_fork(ip: tip) &&
1187	XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(ip: tip))
1188	return -EINVAL;
1189	if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1190	return -EINVAL;
1191	}
1192
1193	return `0`;
1194	}
1195
1196	static int
1197	xfs_swap_extent_flush(
1198	struct xfs_inode *ip)
1199	{
1200	int error;
1201
1202	error = filemap_write_and_wait(mapping: VFS_I(ip)->i_mapping);
1203	if (error)
1204	return error;
1205	truncate_pagecache_range(inode: VFS_I(ip), offset: `0`, end: -`1`);
1206
1207	/ Verify O_DIRECT for ftmp /
1208	if (VFS_I(ip)->i_mapping->nrpages)
1209	return -EINVAL;
1210	return `0`;
1211	}
1212
1213	/*
1214	* Move extents from one file to another, when rmap is enabled.
1215	*/
1216	STATIC int
1217	xfs_swap_extent_rmap(
1218	struct xfs_trans **tpp,
1219	struct xfs_inode *ip,
1220	struct xfs_inode *tip)
1221	{
1222	struct xfs_trans tp = tpp;
1223	struct xfs_bmbt_irec irec;
1224	struct xfs_bmbt_irec uirec;
1225	struct xfs_bmbt_irec tirec;
1226	xfs_fileoff_t offset_fsb;
1227	xfs_fileoff_t end_fsb;
1228	xfs_filblks_t count_fsb;
1229	int error;
1230	xfs_filblks_t ilen;
1231	xfs_filblks_t rlen;
1232	int nimaps;
1233	uint64_t tip_flags2;
1234
1235	/*
1236	* If the source file has shared blocks, we must flag the donor
1237	* file as having shared blocks so that we get the shared-block
1238	* rmap functions when we go to fix up the rmaps. The flags
1239	* will be switch for reals later.
1240	*/
1241	tip_flags2 = tip->i_diflags2;
1242	if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
1243	tip->i_diflags2 \|= XFS_DIFLAG2_REFLINK;
1244
1245	offset_fsb = `0`;
1246	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
1247	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
1248
1249	while (count_fsb) {
1250	/ Read extent from the donor file /
1251	nimaps = `1`;
1252	error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
1253	&nimaps, `0`);
1254	if (error)
1255	goto out;
1256	ASSERT(nimaps == `1`);
1257	ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
1258
1259	trace_xfs_swap_extent_rmap_remap(ip: tip, irec: &tirec);
1260	ilen = tirec.br_blockcount;
1261
1262	/ Unmap the old blocks in the source file. /
1263	while (tirec.br_blockcount) {
1264	ASSERT(tp->t_highest_agno == NULLAGNUMBER);
1265	trace_xfs_swap_extent_rmap_remap_piece(ip: tip, irec: &tirec);
1266
1267	/ Read extent from the source file /
1268	nimaps = `1`;
1269	error = xfs_bmapi_read(ip, tirec.br_startoff,
1270	tirec.br_blockcount, &irec,
1271	&nimaps, `0`);
1272	if (error)
1273	goto out;
1274	ASSERT(nimaps == `1`);
1275	ASSERT(tirec.br_startoff == irec.br_startoff);
1276	trace_xfs_swap_extent_rmap_remap_piece(ip, irec: &irec);
1277
1278	/ Trim the extent. /
1279	uirec = tirec;
1280	uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
1281	tirec.br_blockcount,
1282	irec.br_blockcount);
1283	trace_xfs_swap_extent_rmap_remap_piece(ip: tip, irec: &uirec);
1284
1285	if (xfs_bmap_is_real_extent(&uirec)) {
1286	error = xfs_iext_count_may_overflow(ip,
1287	XFS_DATA_FORK,
1288	XFS_IEXT_SWAP_RMAP_CNT);
1289	if (error == -EFBIG)
1290	error = xfs_iext_count_upgrade(tp, ip,
1291	XFS_IEXT_SWAP_RMAP_CNT);
1292	if (error)
1293	goto out;
1294	}
1295
1296	if (xfs_bmap_is_real_extent(&irec)) {
1297	error = xfs_iext_count_may_overflow(tip,
1298	XFS_DATA_FORK,
1299	XFS_IEXT_SWAP_RMAP_CNT);
1300	if (error == -EFBIG)
1301	error = xfs_iext_count_upgrade(tp, ip,
1302	XFS_IEXT_SWAP_RMAP_CNT);
1303	if (error)
1304	goto out;
1305	}
1306
1307	/ Remove the mapping from the donor file. /
1308	xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec);
1309
1310	/ Remove the mapping from the source file. /
1311	xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec);
1312
1313	/ Map the donor file's blocks into the source file. /
1314	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec);
1315
1316	/ Map the source file's blocks into the donor file. /
1317	xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec);
1318
1319	error = xfs_defer_finish(tpp);
1320	tp = *tpp;
1321	if (error)
1322	goto out;
1323
1324	tirec.br_startoff += rlen;
1325	if (tirec.br_startblock != HOLESTARTBLOCK &&
1326	tirec.br_startblock != DELAYSTARTBLOCK)
1327	tirec.br_startblock += rlen;
1328	tirec.br_blockcount -= rlen;
1329	}
1330
1331	/ Roll on... /
1332	count_fsb -= ilen;
1333	offset_fsb += ilen;
1334	}
1335
1336	tip->i_diflags2 = tip_flags2;
1337	return `0`;
1338
1339	out:
1340	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
1341	tip->i_diflags2 = tip_flags2;
1342	return error;
1343	}
1344
1345	/ Swap the extents of two files by swapping data forks. /
1346	STATIC int
1347	xfs_swap_extent_forks(
1348	struct xfs_trans *tp,
1349	struct xfs_inode *ip,
1350	struct xfs_inode *tip,
1351	int *src_log_flags,
1352	int *target_log_flags)
1353	{
1354	xfs_filblks_t aforkblks = `0`;
1355	xfs_filblks_t taforkblks = `0`;
1356	xfs_extnum_t junk;
1357	uint64_t tmp;
1358	int error;
1359
1360	/*
1361	* Count the number of extended attribute blocks
1362	*/
1363	if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > `0` &&
1364	ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
1365	error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
1366	&aforkblks);
1367	if (error)
1368	return error;
1369	}
1370	if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > `0` &&
1371	tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
1372	error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
1373	&taforkblks);
1374	if (error)
1375	return error;
1376	}
1377
1378	/*
1379	* Btree format (v3) inodes have the inode number stamped in the bmbt
1380	* block headers. We can't start changing the bmbt blocks until the
1381	* inode owner change is logged so recovery does the right thing in the
1382	* event of a crash. Set the owner change log flags now and leave the
1383	* bmbt scan as the last step.
1384	*/
1385	if (xfs_has_v3inodes(mp: ip->i_mount)) {
1386	if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
1387	(*target_log_flags) \|= XFS_ILOG_DOWNER;
1388	if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
1389	(*src_log_flags) \|= XFS_ILOG_DOWNER;
1390	}
1391
1392	/*
1393	* Swap the data forks of the inodes
1394	*/
1395	swap(ip->i_df, tip->i_df);
1396
1397	/*
1398	* Fix the on-disk inode values
1399	*/
1400	tmp = (uint64_t)ip->i_nblocks;
1401	ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
1402	tip->i_nblocks = tmp + taforkblks - aforkblks;
1403
1404	/*
1405	* The extents in the source inode could still contain speculative
1406	* preallocation beyond EOF (e.g. the file is open but not modified
1407	* while defrag is in progress). In that case, we need to copy over the
1408	* number of delalloc blocks the data fork in the source inode is
1409	* tracking beyond EOF so that when the fork is truncated away when the
1410	* temporary inode is unlinked we don't underrun the i_delayed_blks
1411	* counter on that inode.
1412	*/
1413	ASSERT(tip->i_delayed_blks == `0`);
1414	tip->i_delayed_blks = ip->i_delayed_blks;
1415	ip->i_delayed_blks = `0`;
1416
1417	switch (ip->i_df.if_format) {
1418	case XFS_DINODE_FMT_EXTENTS:
1419	(*src_log_flags) \|= XFS_ILOG_DEXT;
1420	break;
1421	case XFS_DINODE_FMT_BTREE:
1422	ASSERT(!xfs_has_v3inodes(ip->i_mount) \|\|
1423	(*src_log_flags & XFS_ILOG_DOWNER));
1424	(*src_log_flags) \|= XFS_ILOG_DBROOT;
1425	break;
1426	}
1427
1428	switch (tip->i_df.if_format) {
1429	case XFS_DINODE_FMT_EXTENTS:
1430	(*target_log_flags) \|= XFS_ILOG_DEXT;
1431	break;
1432	case XFS_DINODE_FMT_BTREE:
1433	(*target_log_flags) \|= XFS_ILOG_DBROOT;
1434	ASSERT(!xfs_has_v3inodes(ip->i_mount) \|\|
1435	(*target_log_flags & XFS_ILOG_DOWNER));
1436	break;
1437	}
1438
1439	return `0`;
1440	}
1441
1442	/*
1443	* Fix up the owners of the bmbt blocks to refer to the current inode. The
1444	* change owner scan attempts to order all modified buffers in the current
1445	* transaction. In the event of ordered buffer failure, the offending buffer is
1446	* physically logged as a fallback and the scan returns -EAGAIN. We must roll
1447	* the transaction in this case to replenish the fallback log reservation and
1448	* restart the scan. This process repeats until the scan completes.
1449	*/
1450	static int
1451	xfs_swap_change_owner(
1452	struct xfs_trans **tpp,
1453	struct xfs_inode *ip,
1454	struct xfs_inode *tmpip)
1455	{
1456	int error;
1457	struct xfs_trans tp = tpp;
1458
1459	do {
1460	error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
1461	NULL);
1462	/ success or fatal error /
1463	if (error != -EAGAIN)
1464	break;
1465
1466	error = xfs_trans_roll(tpp);
1467	if (error)
1468	break;
1469	tp = *tpp;
1470
1471	/*
1472	* Redirty both inodes so they can relog and keep the log tail
1473	* moving forward.
1474	*/
1475	xfs_trans_ijoin(tp, ip, `0`);
1476	xfs_trans_ijoin(tp, tmpip, `0`);
1477	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1478	xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
1479	} while (true);
1480
1481	return error;
1482	}
1483
1484	int
1485	xfs_swap_extents(
1486	struct xfs_inode ip, /* target inode /
1487	struct xfs_inode tip, /* tmp inode /
1488	struct xfs_swapext *sxp)
1489	{
1490	struct xfs_mount *mp = ip->i_mount;
1491	struct xfs_trans *tp;
1492	struct xfs_bstat *sbp = &sxp->sx_stat;
1493	int src_log_flags, target_log_flags;
1494	int error = `0`;
1495	uint64_t f;
1496	int resblks = `0`;
1497	unsigned int flags = `0`;
1498	struct timespec64 ctime, mtime;
1499
1500	/*
1501	* Lock the inodes against other IO, page faults and truncate to
1502	* begin with. Then we can ensure the inodes are flushed and have no
1503	* page cache safely. Once we have done this we can take the ilocks and
1504	* do the rest of the checks.
1505	*/
1506	lock_two_nondirectories(VFS_I(ip), VFS_I(ip: tip));
1507	filemap_invalidate_lock_two(mapping1: VFS_I(ip)->i_mapping,
1508	mapping2: VFS_I(ip: tip)->i_mapping);
1509
1510	/ Verify that both files have the same format /
1511	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(ip: tip)->i_mode & S_IFMT)) {
1512	error = -EINVAL;
1513	goto out_unlock;
1514	}
1515
1516	/ Verify both files are either real-time or non-realtime /
1517	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1518	error = -EINVAL;
1519	goto out_unlock;
1520	}
1521
1522	error = xfs_qm_dqattach(ip);
1523	if (error)
1524	goto out_unlock;
1525
1526	error = xfs_qm_dqattach(tip);
1527	if (error)
1528	goto out_unlock;
1529
1530	error = xfs_swap_extent_flush(ip);
1531	if (error)
1532	goto out_unlock;
1533	error = xfs_swap_extent_flush(ip: tip);
1534	if (error)
1535	goto out_unlock;
1536
1537	if (xfs_inode_has_cow_data(ip: tip)) {
1538	error = xfs_reflink_cancel_cow_range(tip, `0`, NULLFILEOFF, true);
1539	if (error)
1540	goto out_unlock;
1541	}
1542
1543	/*
1544	* Extent "swapping" with rmap requires a permanent reservation and
1545	* a block reservation because it's really just a remap operation
1546	* performed with log redo items!
1547	*/
1548	if (xfs_has_rmapbt(mp)) {
1549	int w = XFS_DATA_FORK;
1550	uint32_t ipnext = ip->i_df.if_nextents;
1551	uint32_t tipnext = tip->i_df.if_nextents;
1552
1553	/*
1554	* Conceptually this shouldn't affect the shape of either bmbt,
1555	* but since we atomically move extents one by one, we reserve
1556	* enough space to rebuild both trees.
1557	*/
1558	resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
1559	resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
1560
1561	/*
1562	* If either inode straddles a bmapbt block allocation boundary,
1563	* the rmapbt algorithm triggers repeated allocs and frees as
1564	* extents are remapped. This can exhaust the block reservation
1565	* prematurely and cause shutdown. Return freed blocks to the
1566	* transaction reservation to counter this behavior.
1567	*/
1568	flags \|= XFS_TRANS_RES_FDBLKS;
1569	}
1570	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_write, blocks: resblks, rtextents: `0`, flags,
1571	tpp: &tp);
1572	if (error)
1573	goto out_unlock;
1574
1575	/*
1576	* Lock and join the inodes to the tansaction so that transaction commit
1577	* or cancel will unlock the inodes from this point onwards.
1578	*/
1579	xfs_lock_two_inodes(ip0: ip, XFS_ILOCK_EXCL, ip1: tip, XFS_ILOCK_EXCL);
1580	xfs_trans_ijoin(tp, ip, `0`);
1581	xfs_trans_ijoin(tp, tip, `0`);
1582
1583
1584	/ Verify all data are being swapped /
1585	if (sxp->sx_offset != `0` \|\|
1586	sxp->sx_length != ip->i_disk_size \|\|
1587	sxp->sx_length != tip->i_disk_size) {
1588	error = -EFAULT;
1589	goto out_trans_cancel;
1590	}
1591
1592	trace_xfs_swap_extent_before(ip, which: `0`);
1593	trace_xfs_swap_extent_before(ip: tip, which: `1`);
1594
1595	/ check inode formats now that data is flushed /
1596	error = xfs_swap_extents_check_format(ip, tip);
1597	if (error) {
1598	xfs_notice(mp,
1599	"%s: inode 0x%llx format is incompatible for exchanging.",
1600	__func__, ip->i_ino);
1601	goto out_trans_cancel;
1602	}
1603
1604	/*
1605	* Compare the current change & modify times with that
1606	* passed in. If they differ, we abort this swap.
1607	* This is the mechanism used to ensure the calling
1608	* process that the file was not changed out from
1609	* under it.
1610	*/
1611	ctime = inode_get_ctime(inode: VFS_I(ip));
1612	mtime = inode_get_mtime(inode: VFS_I(ip));
1613	if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) \|\|
1614	(sbp->bs_ctime.tv_nsec != ctime.tv_nsec) \|\|
1615	(sbp->bs_mtime.tv_sec != mtime.tv_sec) \|\|
1616	(sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) {
1617	error = -EBUSY;
1618	goto out_trans_cancel;
1619	}
1620
1621	/*
1622	* Note the trickiness in setting the log flags - we set the owner log
1623	* flag on the opposite inode (i.e. the inode we are setting the new
1624	* owner to be) because once we swap the forks and log that, log
1625	* recovery is going to see the fork as owned by the swapped inode,
1626	* not the pre-swapped inodes.
1627	*/
1628	src_log_flags = XFS_ILOG_CORE;
1629	target_log_flags = XFS_ILOG_CORE;
1630
1631	if (xfs_has_rmapbt(mp))
1632	error = xfs_swap_extent_rmap(tpp: &tp, ip, tip);
1633	else
1634	error = xfs_swap_extent_forks(tp, ip, tip, src_log_flags: &src_log_flags,
1635	target_log_flags: &target_log_flags);
1636	if (error)
1637	goto out_trans_cancel;
1638
1639	/ Do we have to swap reflink flags? /
1640	if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
1641	(tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
1642	f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
1643	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1644	ip->i_diflags2 \|= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
1645	tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1646	tip->i_diflags2 \|= f & XFS_DIFLAG2_REFLINK;
1647	}
1648
1649	/ Swap the cow forks. /
1650	if (xfs_has_reflink(mp)) {
1651	ASSERT(!ip->i_cowfp \|\|
1652	ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
1653	ASSERT(!tip->i_cowfp \|\|
1654	tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
1655
1656	swap(ip->i_cowfp, tip->i_cowfp);
1657
1658	if (ip->i_cowfp && ip->i_cowfp->if_bytes)
1659	xfs_inode_set_cowblocks_tag(ip);
1660	else
1661	xfs_inode_clear_cowblocks_tag(ip);
1662	if (tip->i_cowfp && tip->i_cowfp->if_bytes)
1663	xfs_inode_set_cowblocks_tag(ip: tip);
1664	else
1665	xfs_inode_clear_cowblocks_tag(ip: tip);
1666	}
1667
1668	xfs_trans_log_inode(tp, ip, src_log_flags);
1669	xfs_trans_log_inode(tp, tip, target_log_flags);
1670
1671	/*
1672	* The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
1673	* have inode number owner values in the bmbt blocks that still refer to
1674	* the old inode. Scan each bmbt to fix up the owner values with the
1675	* inode number of the current inode.
1676	*/
1677	if (src_log_flags & XFS_ILOG_DOWNER) {
1678	error = xfs_swap_change_owner(tpp: &tp, ip, tmpip: tip);
1679	if (error)
1680	goto out_trans_cancel;
1681	}
1682	if (target_log_flags & XFS_ILOG_DOWNER) {
1683	error = xfs_swap_change_owner(tpp: &tp, ip: tip, tmpip: ip);
1684	if (error)
1685	goto out_trans_cancel;
1686	}
1687
1688	/*
1689	* If this is a synchronous mount, make sure that the
1690	* transaction goes to disk before returning to the user.
1691	*/
1692	if (xfs_has_wsync(mp))
1693	xfs_trans_set_sync(tp);
1694
1695	error = xfs_trans_commit(tp);
1696
1697	trace_xfs_swap_extent_after(ip, which: `0`);
1698	trace_xfs_swap_extent_after(ip: tip, which: `1`);
1699
1700	out_unlock_ilock:
1701	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1702	xfs_iunlock(tip, XFS_ILOCK_EXCL);
1703	out_unlock:
1704	filemap_invalidate_unlock_two(mapping1: VFS_I(ip)->i_mapping,
1705	mapping2: VFS_I(ip: tip)->i_mapping);
1706	unlock_two_nondirectories(VFS_I(ip), VFS_I(ip: tip));
1707	return error;
1708
1709	out_trans_cancel:
1710	xfs_trans_cancel(tp);
1711	goto out_unlock_ilock;
1712	}
1713

source code of linux/fs/xfs/xfs_bmap_util.c