xfs_mount.c source code [linux/fs/xfs/xfs_mount.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
4	* All Rights Reserved.
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_log_format.h"
11	#include "xfs_trans_resv.h"
12	#include "xfs_bit.h"
13	#include "xfs_sb.h"
14	#include "xfs_mount.h"
15	#include "xfs_inode.h"
16	#include "xfs_dir2.h"
17	#include "xfs_ialloc.h"
18	#include "xfs_alloc.h"
19	#include "xfs_rtalloc.h"
20	#include "xfs_bmap.h"
21	#include "xfs_trans.h"
22	#include "xfs_trans_priv.h"
23	#include "xfs_log.h"
24	#include "xfs_log_priv.h"
25	#include "xfs_error.h"
26	#include "xfs_quota.h"
27	#include "xfs_fsops.h"
28	#include "xfs_icache.h"
29	#include "xfs_sysfs.h"
30	#include "xfs_rmap_btree.h"
31	#include "xfs_refcount_btree.h"
32	#include "xfs_reflink.h"
33	#include "xfs_extent_busy.h"
34	#include "xfs_health.h"
35	#include "xfs_trace.h"
36	#include "xfs_ag.h"
37	#include "xfs_rtbitmap.h"
38	#include "xfs_metafile.h"
39	#include "xfs_rtgroup.h"
40	#include "xfs_rtrmap_btree.h"
41	#include "xfs_rtrefcount_btree.h"
42	#include "scrub/stats.h"
43	#include "xfs_zone_alloc.h"
44
45	static DEFINE_MUTEX(xfs_uuid_table_mutex);
46	static int xfs_uuid_table_size;
47	static uuid_t *xfs_uuid_table;
48
49	void
50	xfs_uuid_table_free(void)
51	{
52	if (xfs_uuid_table_size == `0`)
53	return;
54	kfree(objp: xfs_uuid_table);
55	xfs_uuid_table = NULL;
56	xfs_uuid_table_size = `0`;
57	}
58
59	/*
60	* See if the UUID is unique among mounted XFS filesystems.
61	* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
62	*/
63	STATIC int
64	xfs_uuid_mount(
65	struct xfs_mount *mp)
66	{
67	uuid_t *uuid = &mp->m_sb.sb_uuid;
68	int hole, i;
69
70	/ Publish UUID in struct super_block /
71	super_set_uuid(sb: mp->m_super, uuid: uuid->b, len: sizeof(*uuid));
72
73	if (xfs_has_nouuid(mp))
74	return `0`;
75
76	if (uuid_is_null(uuid)) {
77	xfs_warn(mp, "Filesystem has null UUID - can't mount");
78	return -EINVAL;
79	}
80
81	mutex_lock(&xfs_uuid_table_mutex);
82	for (i = `0`, hole = -`1`; i < xfs_uuid_table_size; i++) {
83	if (uuid_is_null(uuid: &xfs_uuid_table[i])) {
84	hole = i;
85	continue;
86	}
87	if (uuid_equal(u1: uuid, u2: &xfs_uuid_table[i]))
88	goto out_duplicate;
89	}
90
91	if (hole < `0`) {
92	xfs_uuid_table = krealloc(xfs_uuid_table,
93	(xfs_uuid_table_size + `1`) * sizeof(*xfs_uuid_table),
94	GFP_KERNEL \| __GFP_NOFAIL);
95	hole = xfs_uuid_table_size++;
96	}
97	xfs_uuid_table[hole] = *uuid;
98	mutex_unlock(lock: &xfs_uuid_table_mutex);
99
100	return `0`;
101
102	out_duplicate:
103	mutex_unlock(lock: &xfs_uuid_table_mutex);
104	xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
105	return -EINVAL;
106	}
107
108	STATIC void
109	xfs_uuid_unmount(
110	struct xfs_mount *mp)
111	{
112	uuid_t *uuid = &mp->m_sb.sb_uuid;
113	int i;
114
115	if (xfs_has_nouuid(mp))
116	return;
117
118	mutex_lock(&xfs_uuid_table_mutex);
119	for (i = `0`; i < xfs_uuid_table_size; i++) {
120	if (uuid_is_null(uuid: &xfs_uuid_table[i]))
121	continue;
122	if (!uuid_equal(u1: uuid, u2: &xfs_uuid_table[i]))
123	continue;
124	memset(&xfs_uuid_table[i], `0`, sizeof(uuid_t));
125	break;
126	}
127	ASSERT(i < xfs_uuid_table_size);
128	mutex_unlock(lock: &xfs_uuid_table_mutex);
129	}
130
131	/*
132	* Check size of device based on the (data/realtime) block count.
133	* Note: this check is used by the growfs code as well as mount.
134	*/
135	int
136	xfs_sb_validate_fsb_count(
137	xfs_sb_t *sbp,
138	uint64_t nblocks)
139	{
140	uint64_t max_bytes;
141
142	ASSERT(sbp->sb_blocklog >= BBSHIFT);
143
144	if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes))
145	return -EFBIG;
146
147	/ Limited by ULONG_MAX of page cache index /
148	if (max_bytes >> PAGE_SHIFT > ULONG_MAX)
149	return -EFBIG;
150	return `0`;
151	}
152
153	/*
154	* xfs_readsb
155	*
156	* Does the initial read of the superblock.
157	*/
158	int
159	xfs_readsb(
160	struct xfs_mount *mp,
161	int flags)
162	{
163	unsigned int sector_size;
164	struct xfs_buf *bp;
165	struct xfs_sb *sbp = &mp->m_sb;
166	int error;
167	int loud = !(flags & XFS_MFSI_QUIET);
168	const struct xfs_buf_ops *buf_ops;
169
170	ASSERT(mp->m_sb_bp == NULL);
171	ASSERT(mp->m_ddev_targp != NULL);
172
173	/*
174	* For the initial read, we must guess at the sector
175	* size based on the block device. It's enough to
176	* get the sb_sectsize out of the superblock and
177	* then reread with the proper length.
178	* We don't verify it yet, because it may not be complete.
179	*/
180	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
181	buf_ops = NULL;
182
183	/*
184	* Allocate a (locked) buffer to hold the superblock. This will be kept
185	* around at all times to optimize access to the superblock.
186	*/
187	reread:
188	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
189	BTOBB(sector_size), &bp, buf_ops);
190	if (error) {
191	if (loud)
192	xfs_warn(mp, "SB validate failed with error %d.", error);
193	/ bad CRC means corrupted metadata /
194	if (error == -EFSBADCRC)
195	error = -EFSCORRUPTED;
196	return error;
197	}
198
199	/*
200	* Initialize the mount structure from the superblock.
201	*/
202	xfs_sb_from_disk(sbp, bp->b_addr);
203
204	/*
205	* If we haven't validated the superblock, do so now before we try
206	* to check the sector size and reread the superblock appropriately.
207	*/
208	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
209	if (loud)
210	xfs_warn(mp, "Invalid superblock magic number");
211	error = -EINVAL;
212	goto release_buf;
213	}
214
215	/*
216	* We must be able to do sector-sized and sector-aligned IO.
217	*/
218	if (sector_size > sbp->sb_sectsize) {
219	if (loud)
220	xfs_warn(mp, "device supports %u byte sectors (not %u)",
221	sector_size, sbp->sb_sectsize);
222	error = -ENOSYS;
223	goto release_buf;
224	}
225
226	if (buf_ops == NULL) {
227	/*
228	* Re-read the superblock so the buffer is correctly sized,
229	* and properly verified.
230	*/
231	xfs_buf_relse(bp);
232	sector_size = sbp->sb_sectsize;
233	buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
234	goto reread;
235	}
236
237	mp->m_features \|= xfs_sb_version_to_features(sbp);
238	xfs_reinit_percpu_counters(mp);
239
240	/*
241	* If logged xattrs are enabled after log recovery finishes, then set
242	* the opstate so that log recovery will work properly.
243	*/
244	if (xfs_sb_version_haslogxattrs(&mp->m_sb))
245	xfs_set_using_logged_xattrs(mp);
246
247	/ no need to be quiet anymore, so reset the buf ops /
248	bp->b_ops = &xfs_sb_buf_ops;
249
250	mp->m_sb_bp = bp;
251	xfs_buf_unlock(bp);
252	return `0`;
253
254	release_buf:
255	xfs_buf_relse(bp);
256	return error;
257	}
258
259	/*
260	* If the sunit/swidth change would move the precomputed root inode value, we
261	* must reject the ondisk change because repair will stumble over that.
262	* However, we allow the mount to proceed because we never rejected this
263	* combination before. Returns true to update the sb, false otherwise.
264	*/
265	static inline int
266	xfs_check_new_dalign(
267	struct xfs_mount *mp,
268	int new_dalign,
269	bool *update_sb)
270	{
271	struct xfs_sb *sbp = &mp->m_sb;
272	xfs_ino_t calc_ino;
273
274	calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
275	trace_xfs_check_new_dalign(mp, new_dalign, calc_rootino: calc_ino);
276
277	if (sbp->sb_rootino == calc_ino) {
278	*update_sb = true;
279	return `0`;
280	}
281
282	xfs_warn(mp,
283	"Cannot change stripe alignment; would require moving root inode.");
284
285	/*
286	* XXX: Next time we add a new incompat feature, this should start
287	* returning -EINVAL to fail the mount. Until then, spit out a warning
288	* that we're ignoring the administrator's instructions.
289	*/
290	xfs_warn(mp, "Skipping superblock stripe alignment update.");
291	*update_sb = false;
292	return `0`;
293	}
294
295	/*
296	* If we were provided with new sunit/swidth values as mount options, make sure
297	* that they pass basic alignment and superblock feature checks, and convert
298	* them into the same units (FSB) that everything else expects. This step
299	* /must/ be done before computing the inode geometry.
300	*/
301	STATIC int
302	xfs_validate_new_dalign(
303	struct xfs_mount *mp)
304	{
305	if (mp->m_dalign == `0`)
306	return `0`;
307
308	/*
309	* If stripe unit and stripe width are not multiples
310	* of the fs blocksize turn off alignment.
311	*/
312	if ((BBTOB(mp->m_dalign) & mp->m_blockmask) \|\|
313	(BBTOB(mp->m_swidth) & mp->m_blockmask)) {
314	xfs_warn(mp,
315	"alignment check failed: sunit/swidth vs. blocksize(%d)",
316	mp->m_sb.sb_blocksize);
317	return -EINVAL;
318	}
319
320	/*
321	* Convert the stripe unit and width to FSBs.
322	*/
323	mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
324	if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
325	xfs_warn(mp,
326	"alignment check failed: sunit/swidth vs. agsize(%d)",
327	mp->m_sb.sb_agblocks);
328	return -EINVAL;
329	}
330
331	if (!mp->m_dalign) {
332	xfs_warn(mp,
333	"alignment check failed: sunit(%d) less than bsize(%d)",
334	mp->m_dalign, mp->m_sb.sb_blocksize);
335	return -EINVAL;
336	}
337
338	mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
339
340	if (!xfs_has_dalign(mp)) {
341	xfs_warn(mp,
342	"cannot change alignment: superblock does not support data alignment");
343	return -EINVAL;
344	}
345
346	return `0`;
347	}
348
349	/ Update alignment values based on mount options and sb values. /
350	STATIC int
351	xfs_update_alignment(
352	struct xfs_mount *mp)
353	{
354	struct xfs_sb *sbp = &mp->m_sb;
355
356	if (mp->m_dalign) {
357	bool update_sb;
358	int error;
359
360	if (sbp->sb_unit == mp->m_dalign &&
361	sbp->sb_width == mp->m_swidth)
362	return `0`;
363
364	error = xfs_check_new_dalign(mp, new_dalign: mp->m_dalign, update_sb: &update_sb);
365	if (error \|\| !update_sb)
366	return error;
367
368	sbp->sb_unit = mp->m_dalign;
369	sbp->sb_width = mp->m_swidth;
370	mp->m_update_sb = true;
371	} else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) {
372	mp->m_dalign = sbp->sb_unit;
373	mp->m_swidth = sbp->sb_width;
374	}
375
376	return `0`;
377	}
378
379	/*
380	* precalculate the low space thresholds for dynamic speculative preallocation.
381	*/
382	void
383	xfs_set_low_space_thresholds(
384	struct xfs_mount *mp)
385	{
386	uint64_t dblocks = mp->m_sb.sb_dblocks;
387	uint64_t rtexts = mp->m_sb.sb_rextents;
388	int i;
389
390	do_div(dblocks, `100`);
391	do_div(rtexts, `100`);
392
393	for (i = `0`; i < XFS_LOWSP_MAX; i++) {
394	mp->m_low_space[i] = dblocks * (i + `1`);
395	mp->m_low_rtexts[i] = rtexts * (i + `1`);
396	}
397	}
398
399	/*
400	* Check that the data (and log if separate) is an ok size.
401	*/
402	STATIC int
403	xfs_check_sizes(
404	struct xfs_mount *mp)
405	{
406	struct xfs_buf *bp;
407	xfs_daddr_t d;
408	int error;
409
410	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
411	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
412	xfs_warn(mp, "filesystem size mismatch detected");
413	return -EFBIG;
414	}
415	error = xfs_buf_read_uncached(target: mp->m_ddev_targp,
416	daddr: d - XFS_FSS_TO_BB(mp, `1`),
417	numblks: XFS_FSS_TO_BB(mp, `1`), bpp: &bp, NULL);
418	if (error) {
419	xfs_warn(mp, "last sector read failed");
420	return error;
421	}
422	xfs_buf_relse(bp);
423
424	if (mp->m_logdev_targp == mp->m_ddev_targp)
425	return `0`;
426
427	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
428	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
429	xfs_warn(mp, "log size mismatch detected");
430	return -EFBIG;
431	}
432	error = xfs_buf_read_uncached(target: mp->m_logdev_targp,
433	daddr: d - XFS_FSB_TO_BB(mp, `1`),
434	numblks: XFS_FSB_TO_BB(mp, `1`), bpp: &bp, NULL);
435	if (error) {
436	xfs_warn(mp, "log device read failed");
437	return error;
438	}
439	xfs_buf_relse(bp);
440	return `0`;
441	}
442
443	/*
444	* Clear the quotaflags in memory and in the superblock.
445	*/
446	int
447	xfs_mount_reset_sbqflags(
448	struct xfs_mount *mp)
449	{
450	mp->m_qflags = `0`;
451
452	/ It is OK to look at sb_qflags in the mount path without m_sb_lock. /
453	if (mp->m_sb.sb_qflags == `0`)
454	return `0`;
455	spin_lock(lock: &mp->m_sb_lock);
456	mp->m_sb.sb_qflags = `0`;
457	spin_unlock(lock: &mp->m_sb_lock);
458
459	if (!xfs_fs_writable(mp, level: SB_FREEZE_WRITE))
460	return `0`;
461
462	return xfs_sync_sb(mp, false);
463	}
464
465	static const char *const xfs_free_pool_name[] = {
466	[XC_FREE_BLOCKS] = "free blocks",
467	[XC_FREE_RTEXTENTS] = "free rt extents",
468	[XC_FREE_RTAVAILABLE] = "available rt extents",
469	};
470
471	uint64_t
472	xfs_default_resblks(
473	struct xfs_mount *mp,
474	enum xfs_free_counter ctr)
475	{
476	switch (ctr) {
477	case XC_FREE_BLOCKS:
478	/*
479	* Default to 5% or 8192 FSBs of space reserved, whichever is
480	* smaller.
481	*
482	* This is intended to cover concurrent allocation transactions
483	* when we initially hit ENOSPC. These each require a 4 block
484	* reservation. Hence by default we cover roughly 2000
485	* concurrent allocation reservations.
486	*/
487	return min(div_u64(mp->m_sb.sb_dblocks, `20`), `8192ULL`);
488	case XC_FREE_RTEXTENTS:
489	case XC_FREE_RTAVAILABLE:
490	if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
491	return xfs_zoned_default_resblks(mp, ctr: ctr);
492	return `0`;
493	default:
494	ASSERT(`0`);
495	return `0`;
496	}
497	}
498
499	/ Ensure the summary counts are correct. /
500	STATIC int
501	xfs_check_summary_counts(
502	struct xfs_mount *mp)
503	{
504	int error = `0`;
505
506	/*
507	* The AG0 superblock verifier rejects in-progress filesystems,
508	* so we should never see the flag set this far into mounting.
509	*/
510	if (mp->m_sb.sb_inprogress) {
511	xfs_err(mp, "sb_inprogress set after log recovery??");
512	WARN_ON(`1`);
513	return -EFSCORRUPTED;
514	}
515
516	/*
517	* Now the log is mounted, we know if it was an unclean shutdown or
518	* not. If it was, with the first phase of recovery has completed, we
519	* have consistent AG blocks on disk. We have not recovered EFIs yet,
520	* but they are recovered transactionally in the second recovery phase
521	* later.
522	*
523	* If the log was clean when we mounted, we can check the summary
524	* counters. If any of them are obviously incorrect, we can recompute
525	* them from the AGF headers in the next step.
526	*/
527	if (xfs_is_clean(mp) &&
528	(mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks \|\|
529	!xfs_verify_icount(mp, mp->m_sb.sb_icount) \|\|
530	mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
531	xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
532
533	/*
534	* We can safely re-initialise incore superblock counters from the
535	* per-ag data. These may not be correct if the filesystem was not
536	* cleanly unmounted, so we waited for recovery to finish before doing
537	* this.
538	*
539	* If the filesystem was cleanly unmounted or the previous check did
540	* not flag anything weird, then we can trust the values in the
541	* superblock to be correct and we don't need to do anything here.
542	* Otherwise, recalculate the summary counters.
543	*/
544	if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) \|\|
545	xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
546	error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
547	if (error)
548	return error;
549	}
550
551	/*
552	* Older kernels misused sb_frextents to reflect both incore
553	* reservations made by running transactions and the actual count of
554	* free rt extents in the ondisk metadata. Transactions committed
555	* during runtime can therefore contain a superblock update that
556	* undercounts the number of free rt extents tracked in the rt bitmap.
557	* A clean unmount record will have the correct frextents value since
558	* there can be no other transactions running at that point.
559	*
560	* If we're mounting the rt volume after recovering the log, recompute
561	* frextents from the rtbitmap file to fix the inconsistency.
562	*/
563	if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
564	error = xfs_rtalloc_reinit_frextents(mp);
565	if (error)
566	return error;
567	}
568
569	return `0`;
570	}
571
572	static void
573	xfs_unmount_check(
574	struct xfs_mount *mp)
575	{
576	if (xfs_is_shutdown(mp))
577	return;
578
579	if (percpu_counter_sum(fbc: &mp->m_ifree) >
580	percpu_counter_sum(fbc: &mp->m_icount)) {
581	xfs_alert(mp, "ifree/icount mismatch at unmount");
582	xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
583	}
584	}
585
586	/*
587	* Flush and reclaim dirty inodes in preparation for unmount. Inodes and
588	* internal inode structures can be sitting in the CIL and AIL at this point,
589	* so we need to unpin them, write them back and/or reclaim them before unmount
590	* can proceed. In other words, callers are required to have inactivated all
591	* inodes.
592	*
593	* An inode cluster that has been freed can have its buffer still pinned in
594	* memory because the transaction is still sitting in a iclog. The stale inodes
595	* on that buffer will be pinned to the buffer until the transaction hits the
596	* disk and the callbacks run. Pushing the AIL will skip the stale inodes and
597	* may never see the pinned buffer, so nothing will push out the iclog and
598	* unpin the buffer.
599	*
600	* Hence we need to force the log to unpin everything first. However, log
601	* forces don't wait for the discards they issue to complete, so we have to
602	* explicitly wait for them to complete here as well.
603	*
604	* Then we can tell the world we are unmounting so that error handling knows
605	* that the filesystem is going away and we should error out anything that we
606	* have been retrying in the background. This will prevent never-ending
607	* retries in AIL pushing from hanging the unmount.
608	*
609	* Finally, we can push the AIL to clean all the remaining dirty objects, then
610	* reclaim the remaining inodes that are still in memory at this point in time.
611	*/
612	static void
613	xfs_unmount_flush_inodes(
614	struct xfs_mount *mp)
615	{
616	xfs_log_force(mp, XFS_LOG_SYNC);
617	xfs_extent_busy_wait_all(mp);
618	flush_workqueue(xfs_discard_wq);
619
620	xfs_set_unmounting(mp);
621
622	xfs_ail_push_all_sync(ailp: mp->m_ail);
623	xfs_inodegc_stop(mp);
624	cancel_delayed_work_sync(dwork: &mp->m_reclaim_work);
625	xfs_reclaim_inodes(mp);
626	xfs_health_unmount(mp);
627	}
628
629	static void
630	xfs_mount_setup_inode_geom(
631	struct xfs_mount *mp)
632	{
633	struct xfs_ino_geometry *igeo = M_IGEO(mp);
634
635	igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp);
636	ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp));
637
638	xfs_ialloc_setup_geometry(mp);
639	}
640
641	/ Mount the metadata directory tree root. /
642	STATIC int
643	xfs_mount_setup_metadir(
644	struct xfs_mount *mp)
645	{
646	int error;
647
648	/ Load the metadata directory root inode into memory. /
649	error = xfs_metafile_iget(mp, mp->m_sb.sb_metadirino, XFS_METAFILE_DIR,
650	&mp->m_metadirip);
651	if (error)
652	xfs_warn(mp, "Failed to load metadir root directory, error %d",
653	error);
654	return error;
655	}
656
657	/ Compute maximum possible height for per-AG btree types for this fs. /
658	static inline void
659	xfs_agbtree_compute_maxlevels(
660	struct xfs_mount *mp)
661	{
662	unsigned int levels;
663
664	levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
665	levels = max(levels, mp->m_rmap_maxlevels);
666	mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
667	}
668
669	/ Maximum atomic write IO size that the kernel allows. /
670	static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp)
671	{
672	return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
673	}
674
/*
 * Return the greatest power of two that evenly divides @nr, i.e. the value of
 * its lowest set bit (12 -> 4, 96 -> 32, any odd number -> 1).
 *
 * Returns 0 when @nr is 0, which has no lowest set bit.
 */
static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
{
	/*
	 * In unsigned arithmetic, nr & -nr isolates the lowest set bit.  Unlike
	 * the shift-based form 1 << (ffs(nr) - 1), this is well defined for
	 * every input: there is no shift by -1 when nr is 0 and no shift of a
	 * signed 1 into the sign bit when only the top bit of nr is set.
	 */
	return nr & (0U - nr);
}
679
680	/*
681	* If the data device advertises atomic write support, limit the size of data
682	* device atomic writes to the greatest power-of-two factor of the AG size so
683	* that every atomic write unit aligns with the start of every AG. This is
684	* required so that the per-AG allocations for an atomic write will always be
685	* aligned compatibly with the alignment requirements of the storage.
686	*
687	* If the data device doesn't advertise atomic writes, then there are no
688	* alignment restrictions and the largest out-of-place write we can do
689	* ourselves is the number of blocks that user files can allocate from any AG.
690	*/
691	static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp)
692	{
693	if (mp->m_ddev_targp->bt_bdev_awu_min > `0`)
694	return max_pow_of_two_factor(nr: mp->m_sb.sb_agblocks);
695	return rounddown_pow_of_two(mp->m_ag_max_usable);
696	}
697
698	/*
699	* Reflink on the realtime device requires rtgroups, and atomic writes require
700	* reflink.
701	*
702	* If the realtime device advertises atomic write support, limit the size of
703	* data device atomic writes to the greatest power-of-two factor of the rtgroup
704	* size so that every atomic write unit aligns with the start of every rtgroup.
705	* This is required so that the per-rtgroup allocations for an atomic write
706	* will always be aligned compatibly with the alignment requirements of the
707	* storage.
708	*
709	* If the rt device doesn't advertise atomic writes, then there are no
710	* alignment restrictions and the largest out-of-place write we can do
711	* ourselves is the number of blocks that user files can allocate from any
712	* rtgroup.
713	*/
714	static inline xfs_extlen_t xfs_calc_rtgroup_awu_max(struct xfs_mount *mp)
715	{
716	struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG];
717
718	if (rgs->blocks == `0`)
719	return `0`;
720	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_bdev_awu_min > `0`)
721	return max_pow_of_two_factor(nr: rgs->blocks);
722	return rounddown_pow_of_two(rgs->blocks);
723	}
724
725	/ Compute the maximum atomic write unit size for each section. /
726	static inline void
727	xfs_calc_atomic_write_unit_max(
728	struct xfs_mount *mp)
729	{
730	struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG];
731	struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG];
732
733	const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
734	const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp);
735	const xfs_extlen_t max_agsize = xfs_calc_perag_awu_max(mp);
736	const xfs_extlen_t max_rgsize = xfs_calc_rtgroup_awu_max(mp);
737
738	ags->awu_max = min3(max_write, max_ioend, max_agsize);
739	rgs->awu_max = min3(max_write, max_ioend, max_rgsize);
740
741	trace_xfs_calc_atomic_write_unit_max(mp, max_write: max_write, max_ioend: max_ioend,
742	max_agsize: max_agsize, max_rgsize: max_rgsize);
743	}
744
745	/*
746	* Try to set the atomic write maximum to a new value that we got from
747	* userspace via mount option.
748	*/
749	int
750	xfs_set_max_atomic_write_opt(
751	struct xfs_mount *mp,
752	unsigned long long new_max_bytes)
753	{
754	const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);
755	const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
756	const xfs_extlen_t max_group =
757	max(mp->m_groups[XG_TYPE_AG].blocks,
758	mp->m_groups[XG_TYPE_RTG].blocks);
759	const xfs_extlen_t max_group_write =
760	max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp));
761	int error;
762
763	if (new_max_bytes == `0`)
764	goto set_limit;
765
766	ASSERT(max_write <= U32_MAX);
767
768	/ generic_atomic_write_valid enforces power of two length /
769	if (!is_power_of_2(n: new_max_bytes)) {
770	xfs_warn(mp,
771	"max atomic write size of %llu bytes is not a power of 2",
772	new_max_bytes);
773	return -EINVAL;
774	}
775
776	if (new_max_bytes & mp->m_blockmask) {
777	xfs_warn(mp,
778	"max atomic write size of %llu bytes not aligned with fsblock",
779	new_max_bytes);
780	return -EINVAL;
781	}
782
783	if (new_max_fsbs > max_write) {
784	xfs_warn(mp,
785	"max atomic write size of %lluk cannot be larger than max write size %lluk",
786	new_max_bytes >> `10`,
787	XFS_FSB_TO_B(mp, max_write) >> `10`);
788	return -EINVAL;
789	}
790
791	if (new_max_fsbs > max_group) {
792	xfs_warn(mp,
793	"max atomic write size of %lluk cannot be larger than allocation group size %lluk",
794	new_max_bytes >> `10`,
795	XFS_FSB_TO_B(mp, max_group) >> `10`);
796	return -EINVAL;
797	}
798
799	if (new_max_fsbs > max_group_write) {
800	xfs_warn(mp,
801	"max atomic write size of %lluk cannot be larger than max allocation group write size %lluk",
802	new_max_bytes >> `10`,
803	XFS_FSB_TO_B(mp, max_group_write) >> `10`);
804	return -EINVAL;
805	}
806
807	set_limit:
808	error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs);
809	if (error) {
810	xfs_warn(mp,
811	"cannot support completing atomic writes of %lluk",
812	new_max_bytes >> `10`);
813	return error;
814	}
815
816	xfs_calc_atomic_write_unit_max(mp);
817	mp->m_awu_max_bytes = new_max_bytes;
818	return `0`;
819	}
820
821	/ Compute maximum possible height for realtime btree types for this fs. /
822	static inline void
823	xfs_rtbtree_compute_maxlevels(
824	struct xfs_mount *mp)
825	{
826	mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels,
827	mp->m_rtrefc_maxlevels);
828	}
829
830	/*
831	* This function does the following on an initial mount of a file system:
832	* - reads the superblock from disk and init the mount struct
833	* - if we're a 32-bit kernel, do a size check on the superblock
834	* so we don't mount terabyte filesystems
835	* - init mount struct realtime fields
836	* - allocate inode hash table for fs
837	* - init directory manager
838	* - perform recovery and init the log manager
839	*/
840	int
841	xfs_mountfs(
842	struct xfs_mount *mp)
843	{
844	struct xfs_sb *sbp = &(mp->m_sb);
845	struct xfs_inode *rip;
846	struct xfs_ino_geometry *igeo = M_IGEO(mp);
847	uint quotamount = `0`;
848	uint quotaflags = `0`;
849	int error = `0`;
850	int i;
851
852	xfs_sb_mount_common(mp, sbp);
853
854	/*
855	* Check for a mismatched features2 values. Older kernels read & wrote
856	* into the wrong sb offset for sb_features2 on some platforms due to
857	* xfs_sb_t not being 64bit size aligned when sb_features2 was added,
858	* which made older superblock reading/writing routines swap it as a
859	* 64-bit value.
860	*
861	* For backwards compatibility, we make both slots equal.
862	*
863	* If we detect a mismatched field, we OR the set bits into the existing
864	* features2 field in case it has already been modified; we don't want
865	* to lose any features. We then update the bad location with the ORed
866	* value so that older kernels will see any features2 flags. The
867	* superblock writeback code ensures the new sb_features2 is copied to
868	* sb_bad_features2 before it is logged or written to disk.
869	*/
870	if (xfs_sb_has_mismatched_features2(sbp)) {
871	xfs_warn(mp, "correcting sb_features alignment problem");
872	sbp->sb_features2 \|= sbp->sb_bad_features2;
873	mp->m_update_sb = true;
874	}
875
876
877	/ always use v2 inodes by default now /
878	if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
879	mp->m_sb.sb_versionnum \|= XFS_SB_VERSION_NLINKBIT;
880	mp->m_features \|= XFS_FEAT_NLINK;
881	mp->m_update_sb = true;
882	}
883
884	/*
885	* If we were given new sunit/swidth options, do some basic validation
886	* checks and convert the incore dalign and swidth values to the
887	* same units (FSB) that everything else uses. This /must/ happen
888	* before computing the inode geometry.
889	*/
890	error = xfs_validate_new_dalign(mp);
891	if (error)
892	goto out;
893
894	xfs_alloc_compute_maxlevels(mp);
895	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
896	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
897	xfs_mount_setup_inode_geom(mp);
898	xfs_rmapbt_compute_maxlevels(mp);
899	xfs_rtrmapbt_compute_maxlevels(mp);
900	xfs_refcountbt_compute_maxlevels(mp);
901	xfs_rtrefcountbt_compute_maxlevels(mp);
902
903	xfs_agbtree_compute_maxlevels(mp);
904	xfs_rtbtree_compute_maxlevels(mp);
905
906	/*
907	* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
908	* is NOT aligned turn off m_dalign since allocator alignment is within
909	* an ag, therefore ag has to be aligned at stripe boundary. Note that
910	* we must compute the free space and rmap btree geometry before doing
911	* this.
912	*/
913	error = xfs_update_alignment(mp);
914	if (error)
915	goto out;
916
917	/ enable fail_at_unmount as default /
918	mp->m_fail_unmount = true;
919
920	error = xfs_mount_sysfs_init(mp);
921	if (error)
922	goto out_remove_scrub_stats;
923
924	xchk_stats_register(cs: mp->m_scrub_stats, parent: mp->m_debugfs);
925
926	error = xfs_errortag_init(mp);
927	if (error)
928	goto out_remove_sysfs;
929
930	error = xfs_uuid_mount(mp);
931	if (error)
932	goto out_remove_errortag;
933
934	/*
935	* Update the preferred write size based on the information from the
936	* on-disk superblock.
937	*/
938	mp->m_allocsize_log =
939	max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
940	mp->m_allocsize_blocks = `1U` << (mp->m_allocsize_log - sbp->sb_blocklog);
941
942	/ set the low space thresholds for dynamic preallocation /
943	xfs_set_low_space_thresholds(mp);
944
945	/*
946	* If enabled, sparse inode chunk alignment is expected to match the
947	* cluster size. Full inode chunk alignment must match the chunk size,
948	* but that is checked on sb read verification...
949	*/
950	if (xfs_has_sparseinodes(mp) &&
951	mp->m_sb.sb_spino_align !=
952	XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
953	xfs_warn(mp,
954	"Sparse inode block alignment (%u) must match cluster size (%llu).",
955	mp->m_sb.sb_spino_align,
956	XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
957	error = -EINVAL;
958	goto out_remove_uuid;
959	}
960
961	/*
962	* Check that the data (and log if separate) is an ok size.
963	*/
964	error = xfs_check_sizes(mp);
965	if (error)
966	goto out_remove_uuid;
967
968	/*
969	* Initialize realtime fields in the mount structure
970	*/
971	error = xfs_rtmount_init(mp);
972	if (error) {
973	xfs_warn(mp, "RT mount failed");
974	goto out_remove_uuid;
975	}
976
977	/*
978	* Copies the low order bits of the timestamp and the randomly
979	* set "sequence" number out of a UUID.
980	*/
981	mp->m_fixedfsid[`0`] =
982	(get_unaligned_be16(p: &sbp->sb_uuid.b[`8`]) << `16`) \|
983	get_unaligned_be16(p: &sbp->sb_uuid.b[`4`]);
984	mp->m_fixedfsid[`1`] = get_unaligned_be32(p: &sbp->sb_uuid.b[`0`]);
985
986	error = xfs_da_mount(mp);
987	if (error) {
988	xfs_warn(mp, "Failed dir/attr init: %d", error);
989	goto out_remove_uuid;
990	}
991
992	/*
993	* Initialize the precomputed transaction reservations values.
994	*/
995	xfs_trans_init(mp);
996
997	/*
998	* Allocate and initialize the per-ag data.
999	*/
1000	error = xfs_initialize_perag(mp, `0`, sbp->sb_agcount,
1001	mp->m_sb.sb_dblocks, &mp->m_maxagi);
1002	if (error) {
1003	xfs_warn(mp, "Failed per-ag init: %d", error);
1004	goto out_free_dir;
1005	}
1006
1007	error = xfs_initialize_rtgroups(mp, `0`, sbp->sb_rgcount,
1008	mp->m_sb.sb_rextents);
1009	if (error) {
1010	xfs_warn(mp, "Failed rtgroup init: %d", error);
1011	goto out_free_perag;
1012	}
1013
1014	if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
1015	xfs_warn(mp, "no log defined");
1016	error = -EFSCORRUPTED;
1017	goto out_free_rtgroup;
1018	}
1019
1020	error = xfs_inodegc_register_shrinker(mp);
1021	if (error)
1022	goto out_fail_wait;
1023
1024	/*
1025	* If we're resuming quota status, pick up the preliminary qflags from
1026	* the ondisk superblock so that we know if we should recover dquots.
1027	*/
1028	if (xfs_is_resuming_quotaon(mp))
1029	xfs_qm_resume_quotaon(mp);
1030
1031	/*
1032	* Log's mount-time initialization. The first part of recovery can place
1033	* some items on the AIL, to be handled when recovery is finished or
1034	* cancelled.
1035	*/
1036	error = xfs_log_mount(mp, log_target: mp->m_logdev_targp,
1037	start_block: XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1038	num_bblocks: XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1039	if (error) {
1040	xfs_warn(mp, "log mount failed");
1041	goto out_inodegc_shrinker;
1042	}
1043
1044	/*
1045	* If we're resuming quota status and recovered the log, re-sample the
1046	* qflags from the ondisk superblock now that we've recovered it, just
1047	* in case someone shut down enforcement just before a crash.
1048	*/
1049	if (xfs_clear_resuming_quotaon(mp) && xlog_recovery_needed(log: mp->m_log))
1050	xfs_qm_resume_quotaon(mp);
1051
1052	/*
1053	* If logged xattrs are still enabled after log recovery finishes, then
1054	* they'll be available until unmount. Otherwise, turn them off.
1055	*/
1056	if (xfs_sb_version_haslogxattrs(&mp->m_sb))
1057	xfs_set_using_logged_xattrs(mp);
1058	else
1059	xfs_clear_using_logged_xattrs(mp);
1060
1061	/ Enable background inode inactivation workers. /
1062	xfs_inodegc_start(mp);
1063	xfs_blockgc_start(mp);
1064
1065	/*
1066	* Now that we've recovered any pending superblock feature bit
1067	* additions, we can finish setting up the attr2 behaviour for the
1068	* mount. The noattr2 option overrides the superblock flag, so only
1069	* check the superblock feature flag if the mount option is not set.
1070	*/
1071	if (xfs_has_noattr2(mp)) {
1072	mp->m_features &= ~XFS_FEAT_ATTR2;
1073	} else if (!xfs_has_attr2(mp) &&
1074	(mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
1075	mp->m_features \|= XFS_FEAT_ATTR2;
1076	}
1077
1078	if (xfs_has_metadir(mp)) {
1079	error = xfs_mount_setup_metadir(mp);
1080	if (error)
1081	goto out_free_metadir;
1082	}
1083
1084	/*
1085	* Get and sanity-check the root inode.
1086	* Save the pointer to it in the mount structure.
1087	*/
1088	error = xfs_iget(mp, NULL, ino: sbp->sb_rootino, XFS_IGET_UNTRUSTED,
1089	XFS_ILOCK_EXCL, ipp: &rip);
1090	if (error) {
1091	xfs_warn(mp,
1092	"Failed to read root inode 0x%llx, error %d",
1093	sbp->sb_rootino, -error);
1094	goto out_free_metadir;
1095	}
1096
1097	ASSERT(rip != NULL);
1098
1099	if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
1100	xfs_warn(mp, "corrupted root inode %llu: not a directory",
1101	(unsigned long long)rip->i_ino);
1102	xfs_iunlock(rip, XFS_ILOCK_EXCL);
1103	error = -EFSCORRUPTED;
1104	goto out_rele_rip;
1105	}
1106	mp->m_rootip = rip; / save it /
1107
1108	xfs_iunlock(rip, XFS_ILOCK_EXCL);
1109
1110	/*
1111	* Initialize realtime inode pointers in the mount structure
1112	*/
1113	error = xfs_rtmount_inodes(mp);
1114	if (error) {
1115	/*
1116	* Free up the root inode.
1117	*/
1118	xfs_warn(mp, "failed to read RT inodes");
1119	goto out_rele_rip;
1120	}
1121
1122	/ Make sure the summary counts are ok. /
1123	error = xfs_check_summary_counts(mp);
1124	if (error)
1125	goto out_rtunmount;
1126
1127	/*
1128	* If this is a read-only mount defer the superblock updates until
1129	* the next remount into writeable mode. Otherwise we would never
1130	* perform the update e.g. for the root filesystem.
1131	*/
1132	if (mp->m_update_sb && !xfs_is_readonly(mp)) {
1133	error = xfs_sync_sb(mp, false);
1134	if (error) {
1135	xfs_warn(mp, "failed to write sb changes");
1136	goto out_rtunmount;
1137	}
1138	}
1139
1140	/*
1141	* Initialise the XFS quota management subsystem for this mount
1142	*/
1143	if (XFS_IS_QUOTA_ON(mp)) {
1144	error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
1145	if (error)
1146	goto out_rtunmount;
1147	} else {
1148	/*
1149	* If a file system had quotas running earlier, but decided to
1150	* mount without -o uquota/pquota/gquota options, revoke the
1151	* quotachecked license.
1152	*/
1153	if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1154	xfs_notice(mp, "resetting quota flags");
1155	error = xfs_mount_reset_sbqflags(mp);
1156	if (error)
1157	goto out_rtunmount;
1158	}
1159	}
1160
1161	/*
1162	* Finish recovering the file system. This part needed to be delayed
1163	* until after the root and real-time bitmap inodes were consistently
1164	* read in. Temporarily create per-AG space reservations for metadata
1165	* btree shape changes because space freeing transactions (for inode
1166	* inactivation) require the per-AG reservation in lieu of reserving
1167	* blocks.
1168	*/
1169	error = xfs_fs_reserve_ag_blocks(mp);
1170	if (error && error == -ENOSPC)
1171	xfs_warn(mp,
1172	"ENOSPC reserving per-AG metadata pool, log recovery may fail.");
1173	error = xfs_log_mount_finish(mp);
1174	xfs_fs_unreserve_ag_blocks(mp);
1175	if (error) {
1176	xfs_warn(mp, "log mount finish failed");
1177	goto out_rtunmount;
1178	}
1179
1180	/*
1181	* Now the log is fully replayed, we can transition to full read-only
1182	* mode for read-only mounts. This will sync all the metadata and clean
1183	* the log so that the recovery we just performed does not have to be
1184	* replayed again on the next mount.
1185	*
1186	* We use the same quiesce mechanism as the rw->ro remount, as they are
1187	* semantically identical operations.
1188	*/
1189	if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
1190	xfs_log_clean(mp);
1191
1192	if (xfs_has_zoned(mp)) {
1193	error = xfs_mount_zones(mp);
1194	if (error)
1195	goto out_rtunmount;
1196	}
1197
1198	/*
1199	* Complete the quota initialisation, post-log-replay component.
1200	*/
1201	if (quotamount) {
1202	ASSERT(mp->m_qflags == `0`);
1203	mp->m_qflags = quotaflags;
1204
1205	xfs_qm_mount_quotas(mp);
1206	}
1207
1208	/*
1209	* Now we are mounted, reserve a small amount of unused space for
1210	* privileged transactions. This is needed so that transaction
1211	* space required for critical operations can dip into this pool
1212	* when at ENOSPC. This is needed for operations like create with
1213	* attr, unwritten extent conversion at ENOSPC, garbage collection
1214	* etc. Data allocations are not allowed to use this reserved space.
1215	*
1216	* This may drive us straight to ENOSPC on mount, but that implies
1217	* we were already there on the last unmount. Warn if this occurs.
1218	*/
1219	if (!xfs_is_readonly(mp)) {
1220	for (i = `0`; i < XC_FREE_NR; i++) {
1221	error = xfs_reserve_blocks(mp, i,
1222	xfs_default_resblks(mp, i));
1223	if (error)
1224	xfs_warn(mp,
1225	"Unable to allocate reserve blocks. Continuing without reserve pool for %s.",
1226	xfs_free_pool_name[i]);
1227	}
1228
1229	/ Reserve AG blocks for future btree expansion. /
1230	error = xfs_fs_reserve_ag_blocks(mp);
1231	if (error && error != -ENOSPC)
1232	goto out_agresv;
1233
1234	xfs_zone_gc_start(mp);
1235	}
1236
1237	/*
1238	* Pre-calculate atomic write unit max. This involves computations
1239	* derived from transaction reservations, so we must do this after the
1240	* log is fully initialized.
1241	*/
1242	error = xfs_set_max_atomic_write_opt(mp, new_max_bytes: mp->m_awu_max_bytes);
1243	if (error)
1244	goto out_agresv;
1245
1246	return `0`;
1247
1248	out_agresv:
1249	xfs_fs_unreserve_ag_blocks(mp);
1250	xfs_qm_unmount_quotas(mp);
1251	if (xfs_has_zoned(mp))
1252	xfs_unmount_zones(mp);
1253	out_rtunmount:
1254	xfs_rtunmount_inodes(mp);
1255	out_rele_rip:
1256	xfs_irele(ip: rip);
1257	/ Clean out dquots that might be in memory after quotacheck. /
1258	xfs_qm_unmount(mp);
1259	out_free_metadir:
1260	if (mp->m_metadirip)
1261	xfs_irele(ip: mp->m_metadirip);
1262
1263	/*
1264	* Inactivate all inodes that might still be in memory after a log
1265	* intent recovery failure so that reclaim can free them. Metadata
1266	* inodes and the root directory shouldn't need inactivation, but the
1267	* mount failed for some reason, so pull down all the state and flee.
1268	*/
1269	xfs_inodegc_flush(mp);
1270
1271	/*
1272	* Flush all inode reclamation work and flush the log.
1273	* We have to do this /after/ rtunmount and qm_unmount because those
1274	* two will have scheduled delayed reclaim for the rt/quota inodes.
1275	*
1276	* This is slightly different from the unmountfs call sequence
1277	* because we could be tearing down a partially set up mount. In
1278	* particular, if log_mount_finish fails we bail out without calling
1279	* qm_unmount_quotas and therefore rely on qm_unmount to release the
1280	* quota inodes.
1281	*/
1282	xfs_unmount_flush_inodes(mp);
1283	xfs_log_mount_cancel(mp);
1284	out_inodegc_shrinker:
1285	shrinker_free(shrinker: mp->m_inodegc_shrinker);
1286	out_fail_wait:
1287	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
1288	xfs_buftarg_drain(mp->m_logdev_targp);
1289	xfs_buftarg_drain(mp->m_ddev_targp);
1290	out_free_rtgroup:
1291	xfs_free_rtgroups(mp, `0`, mp->m_sb.sb_rgcount);
1292	out_free_perag:
1293	xfs_free_perag_range(mp, `0`, mp->m_sb.sb_agcount);
1294	out_free_dir:
1295	xfs_da_unmount(mp);
1296	out_remove_uuid:
1297	xfs_uuid_unmount(mp);
1298	out_remove_errortag:
1299	xfs_errortag_del(mp);
1300	out_remove_sysfs:
1301	xfs_mount_sysfs_del(mp);
1302	out_remove_scrub_stats:
1303	xchk_stats_unregister(cs: mp->m_scrub_stats);
1304	out:
1305	return error;
1306	}
1307
1308	/*
1309	* This flushes out the inodes,dquots and the superblock, unmounts the
1310	* log and makes sure that incore structures are freed.
1311	*/
1312	void
1313	xfs_unmountfs(
1314	struct xfs_mount *mp)
1315	{
1316	int error;
1317
1318	/*
1319	* Perform all on-disk metadata updates required to inactivate inodes
1320	* that the VFS evicted earlier in the unmount process. Freeing inodes
1321	* and discarding CoW fork preallocations can cause shape changes to
1322	* the free inode and refcount btrees, respectively, so we must finish
1323	* this before we discard the metadata space reservations. Metadata
1324	* inodes and the root directory do not require inactivation.
1325	*/
1326	xfs_inodegc_flush(mp);
1327
1328	xfs_blockgc_stop(mp);
1329	if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
1330	xfs_zone_gc_stop(mp);
1331	xfs_fs_unreserve_ag_blocks(mp);
1332	xfs_qm_unmount_quotas(mp);
1333	if (xfs_has_zoned(mp))
1334	xfs_unmount_zones(mp);
1335	xfs_rtunmount_inodes(mp);
1336	xfs_irele(ip: mp->m_rootip);
1337	if (mp->m_metadirip)
1338	xfs_irele(ip: mp->m_metadirip);
1339
1340	xfs_unmount_flush_inodes(mp);
1341
1342	xfs_qm_unmount(mp);
1343
1344	/*
1345	* Unreserve any blocks we have so that when we unmount we don't account
1346	* the reserved free space as used. This is really only necessary for
1347	* lazy superblock counting because it trusts the incore superblock
1348	* counters to be absolutely correct on clean unmount.
1349	*
1350	* We don't bother correcting this elsewhere for lazy superblock
1351	* counting because on mount of an unclean filesystem we reconstruct the
1352	* correct counter value and this is irrelevant.
1353	*
1354	* For non-lazy counter filesystems, this doesn't matter at all because
1355	* we only every apply deltas to the superblock and hence the incore
1356	* value does not matter....
1357	*/
1358	error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, `0`);
1359	if (error)
1360	xfs_warn(mp, "Unable to free reserved block pool. "
1361	"Freespace may not be correct on next mount.");
1362	xfs_unmount_check(mp);
1363
1364	/*
1365	* Indicate that it's ok to clear log incompat bits before cleaning
1366	* the log and writing the unmount record.
1367	*/
1368	xfs_set_done_with_log_incompat(mp);
1369	xfs_log_unmount(mp);
1370	xfs_da_unmount(mp);
1371	xfs_uuid_unmount(mp);
1372
1373	#if defined(DEBUG)
1374	xfs_errortag_clearall(mp);
1375	#endif
1376	shrinker_free(shrinker: mp->m_inodegc_shrinker);
1377	xfs_free_rtgroups(mp, `0`, mp->m_sb.sb_rgcount);
1378	xfs_free_perag_range(mp, `0`, mp->m_sb.sb_agcount);
1379	xfs_errortag_del(mp);
1380	xchk_stats_unregister(cs: mp->m_scrub_stats);
1381	xfs_mount_sysfs_del(mp);
1382	}
1383
1384	/*
1385	* Determine whether modifications can proceed. The caller specifies the minimum
1386	* freeze level for which modifications should not be allowed. This allows
1387	* certain operations to proceed while the freeze sequence is in progress, if
1388	* necessary.
1389	*/
1390	bool
1391	xfs_fs_writable(
1392	struct xfs_mount *mp,
1393	int level)
1394	{
1395	ASSERT(level > SB_UNFROZEN);
1396	if ((mp->m_super->s_writers.frozen >= level) \|\|
1397	xfs_is_shutdown(mp) \|\| xfs_is_readonly(mp))
1398	return false;
1399
1400	return true;
1401	}
1402
1403	/*
1404	* Estimate the amount of free space that is not available to userspace and is
1405	* not explicitly reserved from the incore fdblocks. This includes:
1406	*
1407	* - The minimum number of blocks needed to support splitting a bmap btree
1408	* - The blocks currently in use by the freespace btrees because they record
1409	* the actual blocks that will fill per-AG metadata space reservations
1410	*/
1411	uint64_t
1412	xfs_freecounter_unavailable(
1413	struct xfs_mount *mp,
1414	enum xfs_free_counter ctr)
1415	{
1416	if (ctr != XC_FREE_BLOCKS)
1417	return `0`;
1418	return mp->m_alloc_set_aside + atomic64_read(v: &mp->m_allocbt_blks);
1419	}
1420
1421	void
1422	xfs_add_freecounter(
1423	struct xfs_mount *mp,
1424	enum xfs_free_counter ctr,
1425	uint64_t delta)
1426	{
1427	struct xfs_freecounter *counter = &mp->m_free[ctr];
1428	uint64_t res_used;
1429
1430	/*
1431	* If the reserve pool is depleted, put blocks back into it first.
1432	* Most of the time the pool is full.
1433	*/
1434	if (likely(counter->res_avail == counter->res_total)) {
1435	percpu_counter_add(fbc: &counter->count, amount: delta);
1436	return;
1437	}
1438
1439	spin_lock(lock: &mp->m_sb_lock);
1440	res_used = counter->res_total - counter->res_avail;
1441	if (res_used > delta) {
1442	counter->res_avail += delta;
1443	} else {
1444	delta -= res_used;
1445	counter->res_avail = counter->res_total;
1446	percpu_counter_add(fbc: &counter->count, amount: delta);
1447	}
1448	spin_unlock(lock: &mp->m_sb_lock);
1449	}
1450
1451
1452	/ Adjust in-core free blocks or RT extents. /
1453	int
1454	xfs_dec_freecounter(
1455	struct xfs_mount *mp,
1456	enum xfs_free_counter ctr,
1457	uint64_t delta,
1458	bool rsvd)
1459	{
1460	struct xfs_freecounter *counter = &mp->m_free[ctr];
1461	s32 batch;
1462
1463	ASSERT(ctr < XC_FREE_NR);
1464
1465	/*
1466	* Taking blocks away, need to be more accurate the closer we
1467	* are to zero.
1468	*
1469	* If the counter has a value of less than 2 * max batch size,
1470	* then make everything serialise as we are real close to
1471	* ENOSPC.
1472	*/
1473	if (__percpu_counter_compare(fbc: &counter->count, rhs: `2` * XFS_FDBLOCKS_BATCH,
1474	XFS_FDBLOCKS_BATCH) < `0`)
1475	batch = `1`;
1476	else
1477	batch = XFS_FDBLOCKS_BATCH;
1478
1479	/*
1480	* Set aside allocbt blocks because these blocks are tracked as free
1481	* space but not available for allocation. Technically this means that a
1482	* single reservation cannot consume all remaining free space, but the
1483	* ratio of allocbt blocks to usable free blocks should be rather small.
1484	* The tradeoff without this is that filesystems that maintain high
1485	* perag block reservations can over reserve physical block availability
1486	* and fail physical allocation, which leads to much more serious
1487	* problems (i.e. transaction abort, pagecache discards, etc.) than
1488	* slightly premature -ENOSPC.
1489	*/
1490	percpu_counter_add_batch(fbc: &counter->count, amount: -((int64_t)delta), batch);
1491	if (__percpu_counter_compare(fbc: &counter->count,
1492	rhs: xfs_freecounter_unavailable(mp, ctr: ctr),
1493	XFS_FDBLOCKS_BATCH) < `0`) {
1494	/*
1495	* Lock up the sb for dipping into reserves before releasing the
1496	* space that took us to ENOSPC.
1497	*/
1498	spin_lock(lock: &mp->m_sb_lock);
1499	percpu_counter_add(fbc: &counter->count, amount: delta);
1500	if (!rsvd)
1501	goto fdblocks_enospc;
1502	if (delta > counter->res_avail) {
1503	if (ctr == XC_FREE_BLOCKS)
1504	xfs_warn_once(mp,
1505	"Reserve blocks depleted! Consider increasing reserve pool size.");
1506	goto fdblocks_enospc;
1507	}
1508	counter->res_avail -= delta;
1509	trace_xfs_freecounter_reserved(mp, ctr: ctr, delta, _RET_IP_);
1510	spin_unlock(lock: &mp->m_sb_lock);
1511	}
1512
1513	/ we had space! /
1514	return `0`;
1515
1516	fdblocks_enospc:
1517	trace_xfs_freecounter_enospc(mp, ctr: ctr, delta, _RET_IP_);
1518	spin_unlock(lock: &mp->m_sb_lock);
1519	return -ENOSPC;
1520	}
1521
1522	/*
1523	* Used to free the superblock along various error paths.
1524	*/
1525	void
1526	xfs_freesb(
1527	struct xfs_mount *mp)
1528	{
1529	struct xfs_buf *bp = mp->m_sb_bp;
1530
1531	xfs_buf_lock(bp);
1532	mp->m_sb_bp = NULL;
1533	xfs_buf_relse(bp);
1534	}
1535
1536	/*
1537	* If the underlying (data/log/rt) device is readonly, there are some
1538	* operations that cannot proceed.
1539	*/
1540	int
1541	xfs_dev_is_read_only(
1542	struct xfs_mount *mp,
1543	char *message)
1544	{
1545	if (xfs_readonly_buftarg(mp->m_ddev_targp) \|\|
1546	xfs_readonly_buftarg(mp->m_logdev_targp) \|\|
1547	(mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1548	xfs_notice(mp, "%s required on read-only device.", message);
1549	xfs_notice(mp, "write access unavailable, cannot proceed.");
1550	return -EROFS;
1551	}
1552	return `0`;
1553	}
1554
1555	/ Force the summary counters to be recalculated at next mount. /
1556	void
1557	xfs_force_summary_recalc(
1558	struct xfs_mount *mp)
1559	{
1560	if (!xfs_has_lazysbcount(mp))
1561	return;
1562
1563	xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
1564	}
1565
1566	/*
1567	* Enable a log incompat feature flag in the primary superblock. The caller
1568	* cannot have any other transactions in progress.
1569	*/
1570	int
1571	xfs_add_incompat_log_feature(
1572	struct xfs_mount *mp,
1573	uint32_t feature)
1574	{
1575	struct xfs_dsb *dsb;
1576	int error;
1577
1578	ASSERT(hweight32(feature) == `1`);
1579	ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
1580
1581	/*
1582	* Force the log to disk and kick the background AIL thread to reduce
1583	* the chances that the bwrite will stall waiting for the AIL to unpin
1584	* the primary superblock buffer. This isn't a data integrity
1585	* operation, so we don't need a synchronous push.
1586	*/
1587	error = xfs_log_force(mp, XFS_LOG_SYNC);
1588	if (error)
1589	return error;
1590	xfs_ail_push_all(ailp: mp->m_ail);
1591
1592	/*
1593	* Lock the primary superblock buffer to serialize all callers that
1594	* are trying to set feature bits.
1595	*/
1596	xfs_buf_lock(mp->m_sb_bp);
1597	xfs_buf_hold(bp: mp->m_sb_bp);
1598
1599	if (xfs_is_shutdown(mp)) {
1600	error = -EIO;
1601	goto rele;
1602	}
1603
1604	if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature))
1605	goto rele;
1606
1607	/*
1608	* Write the primary superblock to disk immediately, because we need
1609	* the log_incompat bit to be set in the primary super now to protect
1610	* the log items that we're going to commit later.
1611	*/
1612	dsb = mp->m_sb_bp->b_addr;
1613	xfs_sb_to_disk(dsb, &mp->m_sb);
1614	dsb->sb_features_log_incompat \|= cpu_to_be32(feature);
1615	error = xfs_bwrite(bp: mp->m_sb_bp);
1616	if (error)
1617	goto shutdown;
1618
1619	/*
1620	* Add the feature bits to the incore superblock before we unlock the
1621	* buffer.
1622	*/
1623	xfs_sb_add_incompat_log_features(&mp->m_sb, feature);
1624	xfs_buf_relse(bp: mp->m_sb_bp);
1625
1626	/ Log the superblock to disk. /
1627	return xfs_sync_sb(mp, false);
1628	shutdown:
1629	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1630	rele:
1631	xfs_buf_relse(bp: mp->m_sb_bp);
1632	return error;
1633	}
1634
1635	/*
1636	* Clear all the log incompat flags from the superblock.
1637	*
1638	* The caller cannot be in a transaction, must ensure that the log does not
1639	* contain any log items protected by any log incompat bit, and must ensure
1640	* that there are no other threads that depend on the state of the log incompat
1641	* feature flags in the primary super.
1642	*
1643	* Returns true if the superblock is dirty.
1644	*/
1645	bool
1646	xfs_clear_incompat_log_features(
1647	struct xfs_mount *mp)
1648	{
1649	bool ret = false;
1650
1651	if (!xfs_has_crc(mp) \|\|
1652	!xfs_sb_has_incompat_log_feature(&mp->m_sb,
1653	XFS_SB_FEAT_INCOMPAT_LOG_ALL) \|\|
1654	xfs_is_shutdown(mp) \|\|
1655	!xfs_is_done_with_log_incompat(mp))
1656	return false;
1657
1658	/*
1659	* Update the incore superblock. We synchronize on the primary super
1660	* buffer lock to be consistent with the add function, though at least
1661	* in theory this shouldn't be necessary.
1662	*/
1663	xfs_buf_lock(mp->m_sb_bp);
1664	xfs_buf_hold(bp: mp->m_sb_bp);
1665
1666	if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
1667	XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
1668	xfs_sb_remove_incompat_log_features(&mp->m_sb);
1669	ret = true;
1670	}
1671
1672	xfs_buf_relse(bp: mp->m_sb_bp);
1673	return ret;
1674	}
1675
1676	/*
1677	* Update the in-core delayed block counter.
1678	*
1679	* We prefer to update the counter without having to take a spinlock for every
1680	* counter update (i.e. batching). Each change to delayed allocation
1681	* reservations can change can easily exceed the default percpu counter
1682	* batching, so we use a larger batch factor here.
1683	*
1684	* Note that we don't currently have any callers requiring fast summation
1685	* (e.g. percpu_counter_read) so we can use a big batch value here.
1686	*/
1687	#define XFS_DELALLOC_BATCH (4096)
1688	void
1689	xfs_mod_delalloc(
1690	struct xfs_inode *ip,
1691	int64_t data_delta,
1692	int64_t ind_delta)
1693	{
1694	struct xfs_mount *mp = ip->i_mount;
1695
1696	if (XFS_IS_REALTIME_INODE(ip)) {
1697	percpu_counter_add_batch(fbc: &mp->m_delalloc_rtextents,
1698	amount: xfs_blen_to_rtbxlen(mp, data_delta),
1699	XFS_DELALLOC_BATCH);
1700	if (!ind_delta)
1701	return;
1702	data_delta = `0`;
1703	}
1704	percpu_counter_add_batch(fbc: &mp->m_delalloc_blks, amount: data_delta + ind_delta,
1705	XFS_DELALLOC_BATCH);
1706	}
1707

source code of linux/fs/xfs/xfs_mount.c