1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (C) 2022-2023 Oracle. All Rights Reserved. |
4 | * Author: Darrick J. Wong <djwong@kernel.org> |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_trans_resv.h" |
11 | #include "xfs_mount.h" |
12 | #include "xfs_btree.h" |
13 | #include "xfs_btree_staging.h" |
14 | #include "xfs_log_format.h" |
15 | #include "xfs_trans.h" |
16 | #include "xfs_sb.h" |
17 | #include "xfs_inode.h" |
18 | #include "xfs_alloc.h" |
19 | #include "xfs_rmap.h" |
20 | #include "xfs_ag.h" |
21 | #include "xfs_defer.h" |
22 | #include "scrub/scrub.h" |
23 | #include "scrub/common.h" |
24 | #include "scrub/trace.h" |
25 | #include "scrub/repair.h" |
26 | #include "scrub/newbt.h" |
27 | |
28 | /* |
29 | * Estimate proper slack values for a btree that's being reloaded. |
30 | * |
31 | * Under most circumstances, we'll take whatever default loading value the |
32 | * btree bulk loading code calculates for us. However, there are some |
33 | * exceptions to this rule: |
34 | * |
35 | * (0) If someone turned one of the debug knobs. |
36 | * (1) If this is a per-AG btree and the AG has less than 10% space free. |
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In any of these cases, format the new btree blocks almost completely full
 * to minimize space usage.
41 | */ |
42 | static void |
43 | xrep_newbt_estimate_slack( |
44 | struct xrep_newbt *xnr) |
45 | { |
46 | struct xfs_scrub *sc = xnr->sc; |
47 | struct xfs_btree_bload *bload = &xnr->bload; |
48 | uint64_t free; |
49 | uint64_t sz; |
50 | |
51 | /* |
52 | * The xfs_globals values are set to -1 (i.e. take the bload defaults) |
53 | * unless someone has set them otherwise, so we just pull the values |
54 | * here. |
55 | */ |
56 | bload->leaf_slack = xfs_globals.bload_leaf_slack; |
57 | bload->node_slack = xfs_globals.bload_node_slack; |
58 | |
59 | if (sc->ops->type == ST_PERAG) { |
60 | free = sc->sa.pag->pagf_freeblks; |
61 | sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); |
62 | } else { |
63 | free = percpu_counter_sum(&sc->mp->m_fdblocks); |
64 | sz = sc->mp->m_sb.sb_dblocks; |
65 | } |
66 | |
67 | /* No further changes if there's more than 10% free space left. */ |
68 | if (free >= div_u64(sz, 10)) |
69 | return; |
70 | |
71 | /* |
72 | * We're low on space; load the btrees as tightly as possible. Leave |
73 | * a couple of open slots in each btree block so that we don't end up |
74 | * splitting the btrees like crazy after a mount. |
75 | */ |
76 | if (bload->leaf_slack < 0) |
77 | bload->leaf_slack = 2; |
78 | if (bload->node_slack < 0) |
79 | bload->node_slack = 2; |
80 | } |
81 | |
82 | /* Initialize accounting resources for staging a new AG btree. */ |
83 | void |
84 | xrep_newbt_init_ag( |
85 | struct xrep_newbt *xnr, |
86 | struct xfs_scrub *sc, |
87 | const struct xfs_owner_info *oinfo, |
88 | xfs_fsblock_t alloc_hint, |
89 | enum xfs_ag_resv_type resv) |
90 | { |
91 | memset(xnr, 0, sizeof(struct xrep_newbt)); |
92 | xnr->sc = sc; |
93 | xnr->oinfo = *oinfo; /* structure copy */ |
94 | xnr->alloc_hint = alloc_hint; |
95 | xnr->resv = resv; |
96 | INIT_LIST_HEAD(&xnr->resv_list); |
97 | xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ |
98 | xrep_newbt_estimate_slack(xnr); |
99 | } |
100 | |
101 | /* Initialize accounting resources for staging a new inode fork btree. */ |
102 | int |
103 | xrep_newbt_init_inode( |
104 | struct xrep_newbt *xnr, |
105 | struct xfs_scrub *sc, |
106 | int whichfork, |
107 | const struct xfs_owner_info *oinfo) |
108 | { |
109 | struct xfs_ifork *ifp; |
110 | |
111 | ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); |
112 | if (!ifp) |
113 | return -ENOMEM; |
114 | |
115 | xrep_newbt_init_ag(xnr, sc, oinfo, |
116 | XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), |
117 | XFS_AG_RESV_NONE); |
118 | xnr->ifake.if_fork = ifp; |
119 | xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); |
120 | return 0; |
121 | } |
122 | |
123 | /* |
124 | * Initialize accounting resources for staging a new btree. Callers are |
125 | * expected to add their own reservations (and clean them up) manually. |
126 | */ |
127 | void |
128 | xrep_newbt_init_bare( |
129 | struct xrep_newbt *xnr, |
130 | struct xfs_scrub *sc) |
131 | { |
132 | xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, |
133 | XFS_AG_RESV_NONE); |
134 | } |
135 | |
136 | /* |
137 | * Designate specific blocks to be used to build our new btree. @pag must be |
138 | * a passive reference. |
139 | */ |
140 | STATIC int |
141 | xrep_newbt_add_blocks( |
142 | struct xrep_newbt *xnr, |
143 | struct xfs_perag *pag, |
144 | const struct xfs_alloc_arg *args) |
145 | { |
146 | struct xfs_mount *mp = xnr->sc->mp; |
147 | struct xrep_newbt_resv *resv; |
148 | int error; |
149 | |
150 | resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); |
151 | if (!resv) |
152 | return -ENOMEM; |
153 | |
154 | INIT_LIST_HEAD(&resv->list); |
155 | resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); |
156 | resv->len = args->len; |
157 | resv->used = 0; |
158 | resv->pag = xfs_perag_hold(pag); |
159 | |
160 | if (args->tp) { |
161 | ASSERT(xnr->oinfo.oi_offset == 0); |
162 | |
163 | error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); |
164 | if (error) |
165 | goto out_pag; |
166 | } |
167 | |
168 | list_add_tail(&resv->list, &xnr->resv_list); |
169 | return 0; |
170 | out_pag: |
171 | xfs_perag_put(resv->pag); |
172 | kfree(resv); |
173 | return error; |
174 | } |
175 | |
176 | /* |
177 | * Add an extent to the new btree reservation pool. Callers are required to |
178 | * reap this reservation manually if the repair is cancelled. @pag must be a |
179 | * passive reference. |
180 | */ |
181 | int |
182 | xrep_newbt_add_extent( |
183 | struct xrep_newbt *xnr, |
184 | struct xfs_perag *pag, |
185 | xfs_agblock_t agbno, |
186 | xfs_extlen_t len) |
187 | { |
188 | struct xfs_mount *mp = xnr->sc->mp; |
189 | struct xfs_alloc_arg args = { |
190 | .tp = NULL, /* no autoreap */ |
191 | .oinfo = xnr->oinfo, |
192 | .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), |
193 | .len = len, |
194 | .resv = xnr->resv, |
195 | }; |
196 | |
197 | return xrep_newbt_add_blocks(xnr, pag, &args); |
198 | } |
199 | |
200 | /* Don't let our allocation hint take us beyond this AG */ |
201 | static inline void |
202 | xrep_newbt_validate_ag_alloc_hint( |
203 | struct xrep_newbt *xnr) |
204 | { |
205 | struct xfs_scrub *sc = xnr->sc; |
206 | xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); |
207 | |
208 | if (agno == sc->sa.pag->pag_agno && |
209 | xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) |
210 | return; |
211 | |
212 | xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, |
213 | XFS_AGFL_BLOCK(sc->mp) + 1); |
214 | } |
215 | |
/*
 * Allocate disk space for a new per-AG btree.  Extents are allocated in
 * batches from the AG under repair and queued on xnr->resv_list for the
 * bulk loader to claim later.  Returns 0, -ENOSPC, or a negative errno.
 */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			/* Take as much of the remainder as we can get. */
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		/* Clamp the hint so we stay inside the AG being repaired. */
		xrep_newbt_validate_ag_alloc_hint(xnr);

		/* Let the caller override the allocation strategy. */
		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		/* NULLFSBLOCK means the allocator found no space. */
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		/* A per-AG allocation must never cross into another AG. */
		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		/* Track the new extent (and schedule its autoreap). */
		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		/* Try to place the next batch right after this one. */
		xnr->alloc_hint = args.fsbno + args.len;

		/* Finish deferred work and roll the transaction. */
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
277 | |
278 | /* Don't let our allocation hint take us beyond EOFS */ |
279 | static inline void |
280 | xrep_newbt_validate_file_alloc_hint( |
281 | struct xrep_newbt *xnr) |
282 | { |
283 | struct xfs_scrub *sc = xnr->sc; |
284 | |
285 | if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) |
286 | return; |
287 | |
288 | xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); |
289 | } |
290 | |
/*
 * Allocate disk space for our new file-based btree.  Unlike the per-AG
 * variant, extents may come from any AG; each batch is queued on
 * xnr->resv_list for the bulk loader.  Returns 0, -ENOSPC, or a negative
 * errno.
 */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			/* Take as much of the remainder as we can get. */
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		/* Keep the hint within the filesystem. */
		xrep_newbt_validate_file_alloc_hint(xnr);

		/* Let the caller override the allocation strategy. */
		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		/* NULLFSBLOCK means the allocator found no space. */
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		/*
		 * Take a reference on the AG we allocated from;
		 * xrep_newbt_add_blocks takes its own hold on it.
		 */
		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		/* Try to place the next batch right after this one. */
		xnr->alloc_hint = args.fsbno + args.len;

		/* Finish deferred work and roll the transaction. */
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
353 | |
354 | /* Allocate disk space for our new btree. */ |
355 | int |
356 | xrep_newbt_alloc_blocks( |
357 | struct xrep_newbt *xnr, |
358 | uint64_t nr_blocks) |
359 | { |
360 | if (xnr->sc->ip) |
361 | return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); |
362 | return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); |
363 | } |
364 | |
365 | /* |
366 | * Free the unused part of a space extent that was reserved for a new ondisk |
367 | * structure. Returns the number of EFIs logged or a negative errno. |
368 | */ |
369 | STATIC int |
370 | xrep_newbt_free_extent( |
371 | struct xrep_newbt *xnr, |
372 | struct xrep_newbt_resv *resv, |
373 | bool btree_committed) |
374 | { |
375 | struct xfs_scrub *sc = xnr->sc; |
376 | xfs_agblock_t free_agbno = resv->agbno; |
377 | xfs_extlen_t free_aglen = resv->len; |
378 | xfs_fsblock_t fsbno; |
379 | int error; |
380 | |
381 | if (!btree_committed || resv->used == 0) { |
382 | /* |
383 | * If we're not committing a new btree or we didn't use the |
384 | * space reservation, let the existing EFI free the entire |
385 | * space extent. |
386 | */ |
387 | trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, |
388 | free_agbno, free_aglen, xnr->oinfo.oi_owner); |
389 | xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); |
390 | return 1; |
391 | } |
392 | |
393 | /* |
394 | * We used space and committed the btree. Cancel the autoreap, remove |
395 | * the written blocks from the reservation, and possibly log a new EFI |
396 | * to free any unused reservation space. |
397 | */ |
398 | xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); |
399 | free_agbno += resv->used; |
400 | free_aglen -= resv->used; |
401 | |
402 | if (free_aglen == 0) |
403 | return 0; |
404 | |
405 | trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, |
406 | free_aglen, xnr->oinfo.oi_owner); |
407 | |
408 | ASSERT(xnr->resv != XFS_AG_RESV_AGFL); |
409 | ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); |
410 | |
411 | /* |
412 | * Use EFIs to free the reservations. This reduces the chance |
413 | * that we leak blocks if the system goes down. |
414 | */ |
415 | fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); |
416 | error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, |
417 | xnr->resv, true); |
418 | if (error) |
419 | return error; |
420 | |
421 | return 1; |
422 | } |
423 | |
/*
 * Free all the accounting info and disk space we reserved for a new btree.
 * @btree_committed is true if the blocks consumed so far are now part of an
 * on-disk btree and must not be freed.  Returns a negative errno if the
 * unused space could not be freed.
 */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int		ret;

		/* ret is the number of EFIs logged, or a negative errno. */
		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		/*
		 * Periodically finish the deferred work so that we don't pin
		 * too many EFIs in a single transaction.
		 */
		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	/* Finish any EFIs logged since the last flush. */
	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	/* Inode-based staging also allocated a fake fork; release it. */
	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}
486 | |
487 | /* |
488 | * Free all the accounting info and unused disk space allocations after |
489 | * committing a new btree. |
490 | */ |
491 | int |
492 | xrep_newbt_commit( |
493 | struct xrep_newbt *xnr) |
494 | { |
495 | return xrep_newbt_free(xnr, true); |
496 | } |
497 | |
498 | /* |
499 | * Free all the accounting info and all of the disk space we reserved for a new |
500 | * btree that we're not going to commit. We want to try to roll things back |
501 | * cleanly for things like ENOSPC midway through allocation. |
502 | */ |
503 | void |
504 | xrep_newbt_cancel( |
505 | struct xrep_newbt *xnr) |
506 | { |
507 | xrep_newbt_free(xnr, false); |
508 | } |
509 | |
/*
 * Feed one of the reserved btree blocks to the bulk loader.  The block's
 * address is written into @ptr in whichever format (long or short) the
 * btree cursor uses.  Returns -ENOSPC if all reservations are exhausted.
 */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
			xnr->oinfo.oi_owner);

	/* Long pointers hold a big-endian fsblock; short ones an agblock. */
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
								agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}
555 | |
556 | /* How many reserved blocks are unused? */ |
557 | unsigned int |
558 | xrep_newbt_unused_blocks( |
559 | struct xrep_newbt *xnr) |
560 | { |
561 | struct xrep_newbt_resv *resv; |
562 | unsigned int unused = 0; |
563 | |
564 | list_for_each_entry(resv, &xnr->resv_list, list) |
565 | unused += resv->len - resv->used; |
566 | return unused; |
567 | } |
568 | |