// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_inode.h"
#include "xfs_refcount.h"
#include "xfs_extent_busy.h"
#include "xfs_health.h"
#include "xfs_bmap.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Free Space Btree Repair
 * =======================
 *
 * The reverse mappings are supposed to record all space usage for the entire
 * AG.  Therefore, we can recreate the free extent records in an AG by looking
 * for gaps in the physical extents recorded in the rmapbt.  These records are
 * staged in @free_records.  Identifying the gaps is more difficult on a
 * reflink filesystem because rmap records are allowed to overlap.
 *
 * Because the final step of building a new index is to free the space used by
 * the old index, repair needs to find that space.  Unfortunately, all
 * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
 * the same rmapbt owner code (OWN_AG), so this is not straightforward.
 *
 * The scan of the reverse mapping information records the space used by OWN_AG
 * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed.  While
 * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
 * record all visited rmap btree blocks and all blocks owned by the AGFL.
 *
 * After that, the definition of @old_allocbt_blocks shifts.  This expression
 * identifies possible former bnobt/cntbt blocks:
 *
 *	(OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
 *
 * Substituting from the definitions above, that becomes:
 *
 *	old_allocbt_blocks & ~not_allocbt_blocks
 *
 * The OWN_AG bitmap itself isn't needed after this point, so what we really do
 * instead is:
 *
 *	old_allocbt_blocks &= ~not_allocbt_blocks;
 *
 * After this point, @old_allocbt_blocks is a bitmap of alleged former
 * bnobt/cntbt blocks.  The xagb_bitmap_disunion operation modifies its first
 * parameter in place to avoid copying records around.
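 *
 * For example (a contrived layout): if the OWN_AG rmaps cover blocks 4-11
 * of an AG, the rmapbt occupies block 4, and the AGFL holds block 5, then
 * old_allocbt_blocks = {4..11}, not_allocbt_blocks = {4, 5}, and the
 * disunion leaves {6..11} as the candidate former bnobt/cntbt blocks.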
 *
 * Next, some of the space described by @free_records is diverted to the newbt
 * reservation and used to format new btree blocks.  The remaining records are
 * written to the new btree indices.  We reconstruct both bnobt and cntbt at
 * the same time since we've already done all the work.
 *
 * We use the prefix 'xrep_abt' here because we regenerate both free space
 * allocation btrees at the same time.
 */

struct xrep_abt {
	/* Blocks owned by the rmapbt or the agfl. */
	struct xagb_bitmap	not_allocbt_blocks;

	/* All OWN_AG blocks. */
	struct xagb_bitmap	old_allocbt_blocks;

	/*
	 * New bnobt information.  All btree block reservations are added to
	 * the reservation list in new_bnobt.
	 */
	struct xrep_newbt	new_bnobt;

	/* new cntbt information */
	struct xrep_newbt	new_cntbt;

	/* Free space extents. */
	struct xfarray		*free_records;

	struct xfs_scrub	*sc;

	/* Number of non-null records in @free_records. */
	uint64_t		nr_real_records;

	/* get_records()'s position in the free space record array. */
	xfarray_idx_t		array_cur;

	/*
	 * Next block we anticipate seeing in the rmap records.  If the next
	 * rmap record is greater than next_agbno, we have found unused space.
	 */
	xfs_agblock_t		next_agbno;

	/* Number of free blocks in this AG. */
	xfs_agblock_t		nr_blocks;

	/* Longest free extent we found in the AG. */
	xfs_agblock_t		longest;
};

/* Set up to repair AG free space btrees. */
int
xrep_setup_ag_allocbt(
	struct xfs_scrub	*sc)
{
	unsigned int		busy_gen;

	/*
	 * Make sure the busy extent list is clear because we can't put extents
	 * on there twice.
	 */
	busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
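	/*
	 * Note: we sample pagb_gen before checking for emptiness so that, if
	 * the list turns out to be busy, the flush below waits for the busy
	 * extents we just observed to clear.
	 */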
	if (xfs_extent_busy_list_empty(sc->sa.pag))
		return 0;

	return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
}

/* Check for any obvious conflicts in the free extent. */
STATIC int
xrep_abt_check_free_ext(
	struct xfs_scrub	*sc,
	const struct xfs_alloc_rec_incore *rec)
{
	enum xbtree_recpacking	outcome;
	int			error;

	if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->ar_startblock, rec->ar_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/* Must not be shared or CoW staging. */
	if (sc->sa.refc_cur) {
		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;

		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_COW, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Stash a free space record for all the space since the last bno we found
 * all the way up to @end.
 */
static int
xrep_abt_stash(
	struct xrep_abt		*ra,
	xfs_agblock_t		end)
{
	struct xfs_alloc_rec_incore arec = {
		.ar_startblock	= ra->next_agbno,
		.ar_blockcount	= end - ra->next_agbno,
	};
	struct xfs_scrub	*sc = ra->sc;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	error = xrep_abt_check_free_ext(ra->sc, &arec);
	if (error)
		return error;

	trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);

	error = xfarray_append(ra->free_records, &arec);
	if (error)
		return error;

	ra->nr_blocks += arec.ar_blockcount;
	return 0;
}

/* Record extents that aren't in use from gaps in the rmap records. */
STATIC int
xrep_abt_walk_rmap(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *rec,
	void			*priv)
{
	struct xrep_abt		*ra = priv;
	int			error;

	/* Record all the OWN_AG blocks... */
	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
		error = xagb_bitmap_set(&ra->old_allocbt_blocks,
				rec->rm_startblock, rec->rm_blockcount);
		if (error)
			return error;
	}

	/* ...and all the rmapbt blocks... */
	error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
	if (error)
		return error;

	/* ...and all the free space. */
	if (rec->rm_startblock > ra->next_agbno) {
		error = xrep_abt_stash(ra, rec->rm_startblock);
		if (error)
			return error;
	}

	/*
	 * rmap records can overlap on reflink filesystems, so project
	 * next_agbno as far out into the AG space as we currently know about.
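	 *
	 * For example (contrived numbers): given overlapping rmaps (10, 5)
	 * and (12, 8), the first record pushes next_agbno to 15; the second
	 * starts below 15, so it opens no gap, and next_agbno advances to 20.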
	 */
	ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
			rec->rm_startblock + rec->rm_blockcount);
	return 0;
}

/* Collect an AGFL block for the not-to-release list. */
static int
xrep_abt_walk_agfl(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

	return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
}

/*
 * Compare two free space extents by block number.  We want to sort in order of
 * increasing block number.
 */
static int
xrep_bnobt_extent_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_alloc_rec_incore *ap = a;
	const struct xfs_alloc_rec_incore *bp = b;

	if (ap->ar_startblock > bp->ar_startblock)
		return 1;
	else if (ap->ar_startblock < bp->ar_startblock)
		return -1;
	return 0;
}

/*
 * Re-sort the free extents by block number so that we can put the records into
 * the bnobt in the correct order.  Make sure the records do not overlap in
 * physical space.
 */
STATIC int
xrep_bnobt_sort_records(
	struct xrep_abt		*ra)
{
	struct xfs_alloc_rec_incore arec;
	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
	xfs_agblock_t		next_agbno = 0;
	int			error;

	error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
	if (error)
		return error;

	while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
		if (arec.ar_startblock < next_agbno)
			return -EFSCORRUPTED;

		next_agbno = arec.ar_startblock + arec.ar_blockcount;
	}

	return error;
}

/*
 * Compare two free space extents by length and then block number.  We want
 * to sort first in order of increasing length and then in order of increasing
 * block number.
 */
static int
xrep_cntbt_extent_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_alloc_rec_incore *ap = a;
	const struct xfs_alloc_rec_incore *bp = b;

	if (ap->ar_blockcount > bp->ar_blockcount)
		return 1;
	else if (ap->ar_blockcount < bp->ar_blockcount)
		return -1;
	return xrep_bnobt_extent_cmp(a, b);
}

/*
 * Sort the free extents by length so that we can put the records into the
 * cntbt in the correct order.  Don't let userspace kill us if we're resorting
 * after allocating btree blocks.
 */
STATIC int
xrep_cntbt_sort_records(
	struct xrep_abt		*ra,
	bool			is_resort)
{
	return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
			is_resort ? 0 : XFARRAY_SORT_KILLABLE);
}

/*
 * Iterate all reverse mappings to find (1) the gaps between rmap records (all
 * unowned space), (2) the OWN_AG extents (which encompass the free space
 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
 * blocks.  The free space is (1) + (2) - (3) - (4).
 */
STATIC int
xrep_abt_find_freespace(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_buf		*agfl_bp;
	xfs_agblock_t		agend;
	int			error;

	xagb_bitmap_init(&ra->not_allocbt_blocks);

	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Iterate all the reverse mappings to find gaps in the physical
	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
	 */
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
	if (error)
		goto err;

	/* Insert a record for space between the last rmap and EOAG. */
	agend = be32_to_cpu(agf->agf_length);
	if (ra->next_agbno < agend) {
		error = xrep_abt_stash(ra, agend);
		if (error)
			goto err;
	}

	/* Collect all the AGFL blocks. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		goto err;

	error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
	if (error)
		goto err_agfl;

	/* Compute the old bnobt/cntbt blocks. */
	error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
			&ra->not_allocbt_blocks);
	if (error)
		goto err_agfl;

	ra->nr_real_records = xfarray_length(ra->free_records);
err_agfl:
	xfs_trans_brelse(sc->tp, agfl_bp);
err:
	xchk_ag_btcur_free(&sc->sa);
	xagb_bitmap_destroy(&ra->not_allocbt_blocks);
	return error;
}

/*
 * We're going to use the observed free space records to reserve blocks for the
 * new free space btrees, so we play an iterative game where we try to converge
 * on the number of blocks we need:
 *
 * 1. Estimate how many blocks we'll need to store the records.
 * 2. If the first free record has more blocks than we need, we're done.
 *    We will have to re-sort the records prior to building the cntbt.
 * 3. If that record has exactly the number of blocks we need, null out the
 *    record.  We're done.
 * 4. Otherwise, we still need more blocks.  Null out the record, subtract its
 *    length from the number of blocks we need, and go back to step 1.
 *
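 * For example (contrived numbers, assuming the estimate holds at six blocks
 * throughout): given free records of length 3 and 5, we consume all five
 * blocks of the longer record and null it out; on the next pass we still
 * need one block, so we carve it off the length-3 record, shrink that
 * record in place, and return with a re-sort pending.
 *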
 * Fortunately, we don't have to do any transaction work to play this game, so
 * we don't have to tear down the staging cursors.
 */
STATIC int
xrep_abt_reserve_space(
	struct xrep_abt		*ra,
	struct xfs_btree_cur	*bno_cur,
	struct xfs_btree_cur	*cnt_cur,
	bool			*needs_resort)
{
	struct xfs_scrub	*sc = ra->sc;
	xfarray_idx_t		record_nr;
	unsigned int		allocated = 0;
	int			error = 0;

	record_nr = xfarray_length(ra->free_records) - 1;
	do {
		struct xfs_alloc_rec_incore arec;
		uint64_t	required;
		unsigned int	desired;
		unsigned int	len;

		/* Compute how many blocks we'll need. */
		error = xfs_btree_bload_compute_geometry(cnt_cur,
				&ra->new_cntbt.bload, ra->nr_real_records);
		if (error)
			break;

		error = xfs_btree_bload_compute_geometry(bno_cur,
				&ra->new_bnobt.bload, ra->nr_real_records);
		if (error)
			break;

		/* How many btree blocks do we need to store all records? */
		required = ra->new_bnobt.bload.nr_blocks +
			   ra->new_cntbt.bload.nr_blocks;
		ASSERT(required < INT_MAX);

		/* If we've reserved enough blocks, we're done. */
		if (allocated >= required)
			break;

		desired = required - allocated;

		/* We need space but there's none left; bye! */
		if (ra->nr_real_records == 0) {
			error = -ENOSPC;
			break;
		}

		/* Grab the first record from the list. */
		error = xfarray_load(ra->free_records, record_nr, &arec);
		if (error)
			break;

		ASSERT(arec.ar_blockcount <= UINT_MAX);
		len = min_t(unsigned int, arec.ar_blockcount, desired);

		trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
				arec.ar_startblock, len, XFS_RMAP_OWN_AG);

		error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
				arec.ar_startblock, len);
		if (error)
			break;
		allocated += len;
		ra->nr_blocks -= len;

		if (arec.ar_blockcount > desired) {
			/*
			 * Record has more space than we need.  The number of
			 * free records doesn't change, so shrink the free
			 * record, inform the caller that the records are no
			 * longer sorted by length, and exit.
			 */
			arec.ar_startblock += desired;
			arec.ar_blockcount -= desired;
			error = xfarray_store(ra->free_records, record_nr,
					&arec);
			if (error)
				break;

			*needs_resort = true;
			return 0;
		}

		/*
		 * We're going to use up the entire record, so unset it and
		 * move on to the next one.  This changes the number of free
		 * records (but doesn't break the sorting order), so we must
		 * go around the loop once more to re-run _bload_init.
		 */
		error = xfarray_unset(ra->free_records, record_nr);
		if (error)
			break;
		ra->nr_real_records--;
		record_nr--;
	} while (1);

	return error;
}

STATIC int
xrep_abt_dispose_one(
	struct xrep_abt		*ra,
	struct xrep_newbt_resv	*resv)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_agblock_t		free_agbno = resv->agbno + resv->used;
	xfs_extlen_t		free_aglen = resv->len - resv->used;
	int			error;

	ASSERT(pag == resv->pag);

	/* Add a deferred rmap for each extent we used. */
	if (resv->used > 0)
		xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
				resv->used, XFS_RMAP_OWN_AG);

	/*
	 * For each reserved btree block we didn't use, add it to the free
	 * space btree.  We didn't touch fdblocks when we reserved them, so
	 * we don't touch it now.
	 */
	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, ra->new_bnobt.oinfo.oi_owner);

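	/* The trailing "true" is the skip_discard flag, so no online discard. */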
	error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
			&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
	if (error)
		return error;

	return xrep_defer_finish(sc);
}

/*
 * Deal with all the space we reserved.  Blocks that were allocated for the
 * free space btrees need to have a (deferred) rmap added for the OWN_AG
 * allocation, and blocks that didn't get used can be freed via the usual
 * (deferred) means.
 */
STATIC void
xrep_abt_dispose_reservations(
	struct xrep_abt		*ra,
	int			error)
{
	struct xrep_newbt_resv	*resv, *n;

	if (error)
		goto junkit;

	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
		error = xrep_abt_dispose_one(ra, resv);
		if (error)
			goto junkit;
	}

junkit:
	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
		xfs_perag_put(resv->pag);
		list_del(&resv->list);
		kfree(resv);
	}

	xrep_newbt_cancel(&ra->new_bnobt);
	xrep_newbt_cancel(&ra->new_cntbt);
}

/* Retrieve free space data for bulk load. */
STATIC int
xrep_abt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
	struct xrep_abt		*ra = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load_next(ra->free_records, &ra->array_cur,
				arec);
		if (error)
			return error;

		ra->longest = max(ra->longest, arec->ar_blockcount);

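		/*
		 * @arec points at cur->bc_rec.a, so the record we just loaded
		 * is already staged in the cursor; format it into the block.
		 */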
		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_abt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

	return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
}

/*
 * Reset the AGF counters to reflect the free space btrees that we just
 * rebuilt, then reinitialize the per-AG data.
 */
STATIC int
xrep_abt_reset_counters(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	unsigned int		freesp_btreeblks = 0;

	/*
	 * Compute the contribution to agf_btreeblks for the new free space
	 * btrees.  This is the computed btree size minus anything we didn't
	 * use.
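	 *
	 * The btree roots are not counted in agf_btreeblks (note the matching
	 * "- 1" applied to agf_rmap_blocks below), hence the one-block
	 * subtraction for each new btree.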
	 */
	freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
	freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;

	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
	freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);

	/*
	 * The AGF header contains extra information related to the free space
	 * btrees, so we must update those fields here.
	 */
	agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
				(be32_to_cpu(agf->agf_rmap_blocks) - 1));
	agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
	agf->agf_longest = cpu_to_be32(ra->longest);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
						 XFS_AGF_LONGEST |
						 XFS_AGF_FREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the allocbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_bno_level = pag->pagf_bno_level;
	pag->pagf_repair_cnt_level = pag->pagf_cnt_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}

/*
 * Use the collected free space information to stage new free space btrees.
 * If this is successful we'll return with the new btree root information
 * logged to the repair transaction but not yet committed.
 */
STATIC int
xrep_abt_build_new_trees(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	bool			needs_resort = false;
	int			error;

	/*
	 * Sort the free extents by length so that we can set up the free space
	 * btrees in as few extents as possible.  This reduces the amount of
	 * deferred rmap / free work we have to do at the end.
	 */
	error = xrep_cntbt_sort_records(ra, false);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_bare(&ra->new_bnobt, sc);
	xrep_newbt_init_bare(&ra->new_cntbt, sc);

	ra->new_bnobt.bload.get_records = xrep_abt_get_records;
	ra->new_cntbt.bload.get_records = xrep_abt_get_records;

	ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
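
	/*
	 * Note that both new btrees draw their blocks from the same
	 * reservation list: xrep_abt_claim_block always claims from
	 * new_bnobt, no matter which cursor is asking (see the new_bnobt
	 * comment in struct xrep_abt above).
	 */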

	/* Allocate cursors for the staged btrees. */
	bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);

	cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btrees. */
	error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
	if (error)
		goto err_cur;

	/*
	 * If we need to re-sort the free extents by length, do so so that we
	 * can put the records into the cntbt in the correct order.
	 */
	if (needs_resort) {
		error = xrep_cntbt_sort_records(ra, needs_resort);
		if (error)
			goto err_cur;
	}

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the alternate incore btree
	 * height so that we don't trip the verifiers when writing the new
	 * btree blocks to disk.
	 */
	pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
	pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;

	/* Load the free space by length tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	ra->longest = 0;
	error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
	if (error)
		goto err_levels;

	error = xrep_bnobt_sort_records(ra);
	if (error)
		goto err_levels;

	/* Load the free space by block number tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
	if (error)
		goto err_levels;

	/*
	 * Install the new btrees in the AG header.  After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(bno_cur, 0);
	xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(cnt_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_abt_reset_counters(ra);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	xrep_abt_dispose_reservations(ra, error);

	return xrep_roll_ag_trans(sc);

err_levels:
	pag->pagf_repair_bno_level = 0;
	pag->pagf_repair_cnt_level = 0;
err_cur:
	xfs_btree_del_cursor(cnt_cur, error);
	xfs_btree_del_cursor(bno_cur, error);
err_newbt:
	xrep_abt_dispose_reservations(ra, error);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_abt_remove_old_trees(
	struct xrep_abt		*ra)
{
	struct xfs_perag	*pag = ra->sc->sa.pag;
	int			error;

	/* Free the old btree blocks if they're not in use. */
	error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
			&XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
	if (error)
		return error;

	/*
	 * Now that we've zapped all the old allocbt blocks we can turn off
	 * the alternate height mechanism.
	 */
	pag->pagf_repair_bno_level = 0;
	pag->pagf_repair_cnt_level = 0;
	return 0;
}

/* Repair the freespace btrees for some AG. */
int
xrep_allocbt(
	struct xfs_scrub	*sc)
{
	struct xrep_abt		*ra;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
	if (!ra)
		return -ENOMEM;
	ra->sc = sc;

	/* We rebuild both data structures. */
	sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;

	/*
	 * Make sure the busy extent list is clear because we can't put extents
	 * on there twice.  In theory we cleared this before we started, but
	 * let's not risk the filesystem.
	 */
	if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
		error = -EDEADLOCK;
		goto out_ra;
	}

	/* Set up enough storage to handle maximally fragmented free space. */
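	/*
	 * (The AG headers occupy the start of the AG, so at worst every other
	 * block thereafter is free, i.e. at most sb_agblocks / 2 records.)
	 */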
	descr = xchk_xfile_ag_descr(sc, "free space records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
			sizeof(struct xfs_alloc_rec_incore),
			&ra->free_records);
	kfree(descr);
	if (error)
		goto out_ra;

	/* Collect the free space data and find the old btree blocks. */
	xagb_bitmap_init(&ra->old_allocbt_blocks);
	error = xrep_abt_find_freespace(ra);
	if (error)
		goto out_bitmap;

	/* Rebuild the free space information. */
	error = xrep_abt_build_new_trees(ra);
	if (error)
		goto out_bitmap;

	/* Kill the old trees. */
	error = xrep_abt_remove_old_trees(ra);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&ra->old_allocbt_blocks);
	xfarray_destroy(ra->free_records);
out_ra:
	kfree(ra);
	return error;
}

/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_allocbt(
	struct xfs_scrub	*sc)
{
	__u32			old_type = sc->sm->sm_type;
	int			error;

	/*
	 * We must update sm_type temporarily so that the tree-to-tree cross
	 * reference checks will work in the correct direction, and also so
	 * that tracing will report correctly if there are more errors.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
	error = xchk_allocbt(sc);
	if (error)
		goto out;

	sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
	error = xchk_allocbt(sc);
out:
	sc->sm->sm_type = old_type;
	return error;
}