1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (C) 2022-2023 Oracle. All Rights Reserved. |
4 | * Author: Darrick J. Wong <djwong@kernel.org> |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_trans_resv.h" |
11 | #include "xfs_mount.h" |
12 | #include "xfs_btree.h" |
13 | #include "xfs_btree_staging.h" |
14 | #include "xfs_log_format.h" |
15 | #include "xfs_trans.h" |
16 | #include "xfs_sb.h" |
17 | #include "xfs_inode.h" |
18 | #include "xfs_alloc.h" |
19 | #include "xfs_rmap.h" |
20 | #include "xfs_ag.h" |
21 | #include "xfs_defer.h" |
22 | #include "scrub/scrub.h" |
23 | #include "scrub/common.h" |
24 | #include "scrub/trace.h" |
25 | #include "scrub/repair.h" |
26 | #include "scrub/newbt.h" |
27 | |
28 | /* |
29 | * Estimate proper slack values for a btree that's being reloaded. |
30 | * |
31 | * Under most circumstances, we'll take whatever default loading value the |
32 | * btree bulk loading code calculates for us. However, there are some |
33 | * exceptions to this rule: |
34 | * |
35 | * (0) If someone turned one of the debug knobs. |
36 | * (1) If this is a per-AG btree and the AG has less than 10% space free. |
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In any of these cases, format the new btree blocks almost completely full
 * to minimize space usage.
41 | */ |
42 | static void |
43 | xrep_newbt_estimate_slack( |
44 | struct xrep_newbt *xnr) |
45 | { |
46 | struct xfs_scrub *sc = xnr->sc; |
47 | struct xfs_btree_bload *bload = &xnr->bload; |
48 | uint64_t free; |
49 | uint64_t sz; |
50 | |
51 | /* |
52 | * The xfs_globals values are set to -1 (i.e. take the bload defaults) |
53 | * unless someone has set them otherwise, so we just pull the values |
54 | * here. |
55 | */ |
56 | bload->leaf_slack = xfs_globals.bload_leaf_slack; |
57 | bload->node_slack = xfs_globals.bload_node_slack; |
58 | |
59 | if (sc->ops->type == ST_PERAG) { |
60 | free = sc->sa.pag->pagf_freeblks; |
61 | sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); |
62 | } else { |
63 | free = percpu_counter_sum(&sc->mp->m_fdblocks); |
64 | sz = sc->mp->m_sb.sb_dblocks; |
65 | } |
66 | |
67 | /* No further changes if there's more than 10% free space left. */ |
68 | if (free >= div_u64(sz, 10)) |
69 | return; |
70 | |
71 | /* |
72 | * We're low on space; load the btrees as tightly as possible. Leave |
73 | * a couple of open slots in each btree block so that we don't end up |
74 | * splitting the btrees like crazy after a mount. |
75 | */ |
76 | if (bload->leaf_slack < 0) |
77 | bload->leaf_slack = 2; |
78 | if (bload->node_slack < 0) |
79 | bload->node_slack = 2; |
80 | } |
81 | |
82 | /* Initialize accounting resources for staging a new AG btree. */ |
83 | void |
84 | xrep_newbt_init_ag( |
85 | struct xrep_newbt *xnr, |
86 | struct xfs_scrub *sc, |
87 | const struct xfs_owner_info *oinfo, |
88 | xfs_fsblock_t alloc_hint, |
89 | enum xfs_ag_resv_type resv) |
90 | { |
91 | memset(xnr, 0, sizeof(struct xrep_newbt)); |
92 | xnr->sc = sc; |
93 | xnr->oinfo = *oinfo; /* structure copy */ |
94 | xnr->alloc_hint = alloc_hint; |
95 | xnr->resv = resv; |
96 | INIT_LIST_HEAD(&xnr->resv_list); |
97 | xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ |
98 | xrep_newbt_estimate_slack(xnr); |
99 | } |
100 | |
101 | /* Initialize accounting resources for staging a new inode fork btree. */ |
102 | int |
103 | xrep_newbt_init_inode( |
104 | struct xrep_newbt *xnr, |
105 | struct xfs_scrub *sc, |
106 | int whichfork, |
107 | const struct xfs_owner_info *oinfo) |
108 | { |
109 | struct xfs_ifork *ifp; |
110 | |
111 | ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); |
112 | if (!ifp) |
113 | return -ENOMEM; |
114 | |
115 | xrep_newbt_init_ag(xnr, sc, oinfo, |
116 | XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), |
117 | XFS_AG_RESV_NONE); |
118 | xnr->ifake.if_fork = ifp; |
119 | xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); |
120 | return 0; |
121 | } |
122 | |
123 | /* |
124 | * Initialize accounting resources for staging a new btree. Callers are |
125 | * expected to add their own reservations (and clean them up) manually. |
126 | */ |
127 | void |
128 | xrep_newbt_init_bare( |
129 | struct xrep_newbt *xnr, |
130 | struct xfs_scrub *sc) |
131 | { |
132 | xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, |
133 | XFS_AG_RESV_NONE); |
134 | } |
135 | |
136 | /* |
137 | * Designate specific blocks to be used to build our new btree. @pag must be |
138 | * a passive reference. |
139 | */ |
140 | STATIC int |
141 | xrep_newbt_add_blocks( |
142 | struct xrep_newbt *xnr, |
143 | struct xfs_perag *pag, |
144 | const struct xfs_alloc_arg *args) |
145 | { |
146 | struct xfs_mount *mp = xnr->sc->mp; |
147 | struct xrep_newbt_resv *resv; |
148 | int error; |
149 | |
150 | resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); |
151 | if (!resv) |
152 | return -ENOMEM; |
153 | |
154 | INIT_LIST_HEAD(&resv->list); |
155 | resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); |
156 | resv->len = args->len; |
157 | resv->used = 0; |
158 | resv->pag = xfs_perag_hold(pag); |
159 | |
160 | if (args->tp) { |
161 | ASSERT(xnr->oinfo.oi_offset == 0); |
162 | |
163 | error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); |
164 | if (error) |
165 | goto out_pag; |
166 | } |
167 | |
168 | list_add_tail(&resv->list, &xnr->resv_list); |
169 | return 0; |
170 | out_pag: |
171 | xfs_perag_put(resv->pag); |
172 | kfree(resv); |
173 | return error; |
174 | } |
175 | |
176 | /* |
177 | * Add an extent to the new btree reservation pool. Callers are required to |
178 | * reap this reservation manually if the repair is cancelled. @pag must be a |
179 | * passive reference. |
180 | */ |
181 | int |
182 | xrep_newbt_add_extent( |
183 | struct xrep_newbt *xnr, |
184 | struct xfs_perag *pag, |
185 | xfs_agblock_t agbno, |
186 | xfs_extlen_t len) |
187 | { |
188 | struct xfs_mount *mp = xnr->sc->mp; |
189 | struct xfs_alloc_arg args = { |
190 | .tp = NULL, /* no autoreap */ |
191 | .oinfo = xnr->oinfo, |
192 | .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), |
193 | .len = len, |
194 | .resv = xnr->resv, |
195 | }; |
196 | |
197 | return xrep_newbt_add_blocks(xnr, pag, &args); |
198 | } |
199 | |
200 | /* Don't let our allocation hint take us beyond this AG */ |
201 | static inline void |
202 | xrep_newbt_validate_ag_alloc_hint( |
203 | struct xrep_newbt *xnr) |
204 | { |
205 | struct xfs_scrub *sc = xnr->sc; |
206 | xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); |
207 | |
208 | if (agno == sc->sa.pag->pag_agno && |
209 | xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) |
210 | return; |
211 | |
212 | xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, |
213 | XFS_AGFL_BLOCK(sc->mp) + 1); |
214 | } |
215 | |
/*
 * Allocate disk space for a new per-AG btree.  Extents are allocated in
 * batches from the AG under repair and queued on xnr->resv_list for the
 * bulk loader to claim later.  Returns 0, -ENOSPC, or a negative errno.
 */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			/* Take as much of the remainder as we can get. */
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		/* Clamp the hint so we stay inside the AG being repaired. */
		xrep_newbt_validate_ag_alloc_hint(xnr);

		/* Let the caller override the allocation strategy. */
		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		/* NULLFSBLOCK means the allocator found no space. */
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		/* A per-AG allocation must never cross into another AG. */
		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		/* Track the new extent (and schedule its autoreap). */
		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		/* Try to place the next batch right after this one. */
		xnr->alloc_hint = args.fsbno + args.len;

		/* Finish deferred work and roll the transaction. */
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
277 | |
278 | /* Don't let our allocation hint take us beyond EOFS */ |
279 | static inline void |
280 | xrep_newbt_validate_file_alloc_hint( |
281 | struct xrep_newbt *xnr) |
282 | { |
283 | struct xfs_scrub *sc = xnr->sc; |
284 | |
285 | if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) |
286 | return; |
287 | |
288 | xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); |
289 | } |
290 | |
/*
 * Allocate disk space for our new file-based btree.  Unlike the per-AG
 * variant, extents may come from any AG; each batch is queued on
 * xnr->resv_list for the bulk loader.  Returns 0, -ENOSPC, or a negative
 * errno.
 */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			/* Take as much of the remainder as we can get. */
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		/* Keep the hint within the filesystem. */
		xrep_newbt_validate_file_alloc_hint(xnr);

		/* Let the caller override the allocation strategy. */
		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		/* NULLFSBLOCK means the allocator found no space. */
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		/*
		 * Take a reference on the AG we allocated from;
		 * xrep_newbt_add_blocks takes its own hold on it.
		 */
		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		/* Try to place the next batch right after this one. */
		xnr->alloc_hint = args.fsbno + args.len;

		/* Finish deferred work and roll the transaction. */
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
353 | |
354 | /* Allocate disk space for our new btree. */ |
355 | int |
356 | xrep_newbt_alloc_blocks( |
357 | struct xrep_newbt *xnr, |
358 | uint64_t nr_blocks) |
359 | { |
360 | if (xnr->sc->ip) |
361 | return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); |
362 | return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); |
363 | } |
364 | |
365 | /* |
366 | * Free the unused part of a space extent that was reserved for a new ondisk |
367 | * structure. Returns the number of EFIs logged or a negative errno. |
368 | */ |
369 | STATIC int |
370 | xrep_newbt_free_extent( |
371 | struct xrep_newbt *xnr, |
372 | struct xrep_newbt_resv *resv, |
373 | bool btree_committed) |
374 | { |
375 | struct xfs_scrub *sc = xnr->sc; |
376 | xfs_agblock_t free_agbno = resv->agbno; |
377 | xfs_extlen_t free_aglen = resv->len; |
378 | xfs_fsblock_t fsbno; |
379 | int error; |
380 | |
381 | if (!btree_committed || resv->used == 0) { |
382 | /* |
383 | * If we're not committing a new btree or we didn't use the |
384 | * space reservation, let the existing EFI free the entire |
385 | * space extent. |
386 | */ |
387 | trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, |
388 | free_agbno, free_aglen, xnr->oinfo.oi_owner); |
389 | xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); |
390 | return 1; |
391 | } |
392 | |
393 | /* |
394 | * We used space and committed the btree. Cancel the autoreap, remove |
395 | * the written blocks from the reservation, and possibly log a new EFI |
396 | * to free any unused reservation space. |
397 | */ |
398 | xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); |
399 | free_agbno += resv->used; |
400 | free_aglen -= resv->used; |
401 | |
402 | if (free_aglen == 0) |
403 | return 0; |
404 | |
405 | trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, |
406 | free_aglen, xnr->oinfo.oi_owner); |
407 | |
408 | ASSERT(xnr->resv != XFS_AG_RESV_AGFL); |
409 | ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); |
410 | |
411 | /* |
412 | * Use EFIs to free the reservations. This reduces the chance |
413 | * that we leak blocks if the system goes down. |
414 | */ |
415 | fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); |
416 | error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, |
417 | xnr->resv, true); |
418 | if (error) |
419 | return error; |
420 | |
421 | return 1; |
422 | } |
423 | |
/*
 * Free all the accounting info and disk space we reserved for a new btree.
 * @btree_committed is true if the blocks consumed so far are now part of an
 * on-disk btree and must not be freed.  Returns a negative errno if the
 * unused space could not be freed.
 */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int		ret;

		/* ret is the number of EFIs logged, or a negative errno. */
		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		/*
		 * Periodically finish the deferred work so that we don't pin
		 * too many EFIs in a single transaction.
		 */
		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	/* Finish any EFIs logged since the last flush. */
	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	/* Inode-based staging also allocated a fake fork; release it. */
	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}
486 | |
487 | /* |
488 | * Free all the accounting info and unused disk space allocations after |
489 | * committing a new btree. |
490 | */ |
491 | int |
492 | xrep_newbt_commit( |
493 | struct xrep_newbt *xnr) |
494 | { |
495 | return xrep_newbt_free(xnr, true); |
496 | } |
497 | |
498 | /* |
499 | * Free all the accounting info and all of the disk space we reserved for a new |
500 | * btree that we're not going to commit. We want to try to roll things back |
501 | * cleanly for things like ENOSPC midway through allocation. |
502 | */ |
503 | void |
504 | xrep_newbt_cancel( |
505 | struct xrep_newbt *xnr) |
506 | { |
507 | xrep_newbt_free(xnr, false); |
508 | } |
509 | |
/*
 * Feed one of the reserved btree blocks to the bulk loader.  The block's
 * address is written into @ptr in whichever format (long or short) the
 * btree cursor uses.  Returns -ENOSPC if all reservations are exhausted.
 */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
			xnr->oinfo.oi_owner);

	/* Long pointers hold a big-endian fsblock; short ones an agblock. */
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
								agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}
555 | |
556 | /* How many reserved blocks are unused? */ |
557 | unsigned int |
558 | xrep_newbt_unused_blocks( |
559 | struct xrep_newbt *xnr) |
560 | { |
561 | struct xrep_newbt_resv *resv; |
562 | unsigned int unused = 0; |
563 | |
564 | list_for_each_entry(resv, &xnr->resv_list, list) |
565 | unused += resv->len - resv->used; |
566 | return unused; |
567 | } |
568 | |