1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_defer.h"
13#include "xfs_btree.h"
14#include "xfs_btree_staging.h"
15#include "xfs_buf_mem.h"
16#include "xfs_btree_mem.h"
17#include "xfs_bit.h"
18#include "xfs_log_format.h"
19#include "xfs_trans.h"
20#include "xfs_sb.h"
21#include "xfs_alloc.h"
22#include "xfs_alloc_btree.h"
23#include "xfs_ialloc.h"
24#include "xfs_ialloc_btree.h"
25#include "xfs_rmap.h"
26#include "xfs_rmap_btree.h"
27#include "xfs_inode.h"
28#include "xfs_icache.h"
29#include "xfs_bmap.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_refcount.h"
32#include "xfs_refcount_btree.h"
33#include "xfs_ag.h"
34#include "scrub/xfs_scrub.h"
35#include "scrub/scrub.h"
36#include "scrub/common.h"
37#include "scrub/btree.h"
38#include "scrub/trace.h"
39#include "scrub/repair.h"
40#include "scrub/bitmap.h"
41#include "scrub/agb_bitmap.h"
42#include "scrub/xfile.h"
43#include "scrub/xfarray.h"
44#include "scrub/iscan.h"
45#include "scrub/newbt.h"
46#include "scrub/reap.h"
47
48/*
49 * Reverse Mapping Btree Repair
50 * ============================
51 *
52 * This is the most involved of all the AG space btree rebuilds. Everywhere
53 * else in XFS we lock inodes and then AG data structures, but generating the
54 * list of rmap records requires that we be able to scan both block mapping
55 * btrees of every inode in the filesystem to see if it owns any extents in
56 * this AG. We can't tolerate any inode updates while we do this, so we
57 * freeze the filesystem to lock everyone else out, and grant ourselves
58 * special privileges to run transactions with regular background reclamation
59 * turned off.
60 *
61 * We also have to be very careful not to allow inode reclaim to start a
62 * transaction because all transactions (other than our own) will block.
63 * Deferred inode inactivation helps us out there.
64 *
65 * I) Reverse mappings for all non-space metadata and file data are collected
66 * according to the following algorithm:
67 *
68 * 1. For each fork of each inode:
69 * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
70 * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
 * bmaps into rmap records (see 1.4). Set bits in BMBIT for each btree
72 * block.
73 * 1.3. If the incore extent map is loaded but the fork is in btree format,
74 * just visit the bmbt blocks to set the corresponding BMBIT areas.
75 * 1.4. From the incore extent map, accumulate each bmap that falls into our
76 * target AG. Remember, multiple bmap records can map to a single rmap
77 * record, so we cannot simply emit rmap records 1:1.
78 * 1.5. Emit rmap records for each extent in BMBIT and free it.
79 * 2. Create bitmaps INOBIT and ICHUNKBIT.
80 * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
81 * and set bits in INOBIT for each btree block. If the inobt has no records
82 * at all, we must be careful to record its root in INOBIT.
83 * 4. For each block in the finobt, set the corresponding INOBIT area.
84 * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
85 * 6. Create bitmaps REFCBIT and COWBIT.
86 * 7. For each CoW staging extent in the refcountbt, set the corresponding
87 * areas in COWBIT.
88 * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
89 * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
90 * A. Emit rmap for the AG headers.
91 * B. Emit rmap for the log, if there is one.
92 *
93 * II) The rmapbt shape and space metadata rmaps are computed as follows:
94 *
95 * 1. Count the rmaps collected in the previous step. (= NR)
96 * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
97 * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
98 * 4. Create bitmap AGBIT.
99 * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
100 * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
101 * 7. Count the extents in AGBIT. (= AGNR)
102 * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
 * 9. If RMB' > RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
104 * and clear AGBIT. Go to step 5.
105 * A. Emit rmaps for each extent in AGBIT.
106 *
107 * III) The rmapbt is constructed and set in place as follows:
108 *
109 * 1. Sort the rmap records.
110 * 2. Bulk load the rmaps.
111 *
112 * IV) Reap the old btree blocks.
113 *
114 * 1. Create a bitmap OLDRMBIT.
115 * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
116 * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
117 * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are
118 * the parts of the AG that the rmap didn't find during its scan of the
119 * primary metadata and aren't known to be in the free space, which implies
120 * that they were the old rmapbt blocks.
121 * 5. Commit.
122 *
123 * We use the 'xrep_rmap' prefix for all the rmap functions.
124 */
125
/*
 * Context for collecting rmaps.
 *
 * One of these is allocated by xrep_setup_ag_rmapbt() and attached to the
 * scrub context through sc->buf.
 */
struct xrep_rmap {
	/* new rmapbt information: bulk-load geometry and reserved blocks */
	struct xrep_newbt	new_btree;

	/*
	 * lock for the xfbtree and xfile; xrep_rmap_stash() holds it across
	 * each in-memory btree update and the commit/cancel that follows
	 */
	struct mutex		lock;

	/* rmap records generated from primary metadata */
	struct xfbtree		rmap_btree;

	/* scrub context that owns this repair */
	struct xfs_scrub	*sc;

	/* in-memory btree cursor for the xfs_btree_bload iteration */
	struct xfs_btree_cur	*mcur;

	/* Hooks into rmap update code. */
	struct xfs_rmap_hook	rhook;

	/*
	 * inode scan cursor; also carries the "aborted" state that
	 * xrep_rmap_stash() sets when an in-memory btree update fails
	 */
	struct xchk_iscan	iscan;

	/*
	 * Number of non-freespace records found.  Recounted by
	 * xrep_rmap_check_record() when the stashed records are validated.
	 */
	unsigned long long	nr_records;

	/*
	 * bnobt/cntbt contribution to btreeblks.  xrep_rmap_try_reserve()
	 * computes this as the number of blocks in both btrees minus two
	 * (presumably to exclude the two root blocks -- TODO confirm).
	 */
	xfs_agblock_t		freesp_btblocks;

	/* old agf_rmap_blocks counter */
	unsigned int		old_rmapbt_fsbcount;
};
157
158/* Set us up to repair reverse mapping btrees. */
159int
160xrep_setup_ag_rmapbt(
161 struct xfs_scrub *sc)
162{
163 struct xrep_rmap *rr;
164 char *descr;
165 int error;
166
167 xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
168
169 descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
170 error = xrep_setup_xfbtree(sc, descr);
171 kfree(descr);
172 if (error)
173 return error;
174
175 rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
176 if (!rr)
177 return -ENOMEM;
178
179 rr->sc = sc;
180 sc->buf = rr;
181 return 0;
182}
183
184/* Make sure there's nothing funny about this mapping. */
185STATIC int
186xrep_rmap_check_mapping(
187 struct xfs_scrub *sc,
188 const struct xfs_rmap_irec *rec)
189{
190 enum xbtree_recpacking outcome;
191 int error;
192
193 if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
194 return -EFSCORRUPTED;
195
196 /* Make sure this isn't free space. */
197 error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
198 rec->rm_blockcount, &outcome);
199 if (error)
200 return error;
201 if (outcome != XBTREE_RECPACKING_EMPTY)
202 return -EFSCORRUPTED;
203
204 return 0;
205}
206
207/* Store a reverse-mapping record. */
208static inline int
209xrep_rmap_stash(
210 struct xrep_rmap *rr,
211 xfs_agblock_t startblock,
212 xfs_extlen_t blockcount,
213 uint64_t owner,
214 uint64_t offset,
215 unsigned int flags)
216{
217 struct xfs_rmap_irec rmap = {
218 .rm_startblock = startblock,
219 .rm_blockcount = blockcount,
220 .rm_owner = owner,
221 .rm_offset = offset,
222 .rm_flags = flags,
223 };
224 struct xfs_scrub *sc = rr->sc;
225 struct xfs_btree_cur *mcur;
226 int error = 0;
227
228 if (xchk_should_terminate(sc, &error))
229 return error;
230
231 if (xchk_iscan_aborted(&rr->iscan))
232 return -EFSCORRUPTED;
233
234 trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
235
236 mutex_lock(&rr->lock);
237 mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
238 error = xfs_rmap_map_raw(mcur, &rmap);
239 xfs_btree_del_cursor(mcur, error);
240 if (error)
241 goto out_cancel;
242
243 error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
244 if (error)
245 goto out_abort;
246
247 mutex_unlock(&rr->lock);
248 return 0;
249
250out_cancel:
251 xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
252out_abort:
253 xchk_iscan_abort(&rr->iscan);
254 mutex_unlock(&rr->lock);
255 return error;
256}
257
/*
 * Arguments passed through xagb_bitmap_walk() to xrep_rmap_stash_run() so
 * that every run of set bits becomes an rmap with the same owner and flags.
 */
struct xrep_rmap_stash_run {
	/* repair context that receives the records */
	struct xrep_rmap	*rr;

	/* rm_owner value for every record emitted */
	uint64_t		owner;

	/* XFS_RMAP_* flags for every record emitted */
	unsigned int		rmap_flags;
};
263
264static int
265xrep_rmap_stash_run(
266 uint32_t start,
267 uint32_t len,
268 void *priv)
269{
270 struct xrep_rmap_stash_run *rsr = priv;
271 struct xrep_rmap *rr = rsr->rr;
272
273 return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
274}
275
276/*
277 * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
278 * that the ranges are in units of FS blocks.
279 */
280STATIC int
281xrep_rmap_stash_bitmap(
282 struct xrep_rmap *rr,
283 struct xagb_bitmap *bitmap,
284 const struct xfs_owner_info *oinfo)
285{
286 struct xrep_rmap_stash_run rsr = {
287 .rr = rr,
288 .owner = oinfo->oi_owner,
289 .rmap_flags = 0,
290 };
291
292 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
293 rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
294 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
295 rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
296
297 return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
298}
299
300/* Section (I): Finding all file and bmbt extents. */
301
/*
 * Context for accumulating rmaps for an inode fork.  Set up on the stack by
 * xrep_rmap_scan_ifork() for each fork that gets scanned.
 */
struct xrep_rmap_ifork {
	/*
	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
	 * single rmap.  rm_blockcount == 0 means nothing is pending; rm_owner
	 * is preset to the inode number.
	 */
	struct xfs_rmap_irec	accum;

	/* Bitmap of bmbt blocks in this AG. */
	struct xagb_bitmap	bmbt_blocks;

	/* repair context that receives the records */
	struct xrep_rmap	*rr;

	/* Which inode fork? */
	int			whichfork;
};
318
319/* Stash an rmap that we accumulated while walking an inode fork. */
320STATIC int
321xrep_rmap_stash_accumulated(
322 struct xrep_rmap_ifork *rf)
323{
324 if (rf->accum.rm_blockcount == 0)
325 return 0;
326
327 return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
328 rf->accum.rm_blockcount, rf->accum.rm_owner,
329 rf->accum.rm_offset, rf->accum.rm_flags);
330}
331
332/* Accumulate a bmbt record. */
333STATIC int
334xrep_rmap_visit_bmbt(
335 struct xfs_btree_cur *cur,
336 struct xfs_bmbt_irec *rec,
337 void *priv)
338{
339 struct xrep_rmap_ifork *rf = priv;
340 struct xfs_mount *mp = rf->rr->sc->mp;
341 struct xfs_rmap_irec *accum = &rf->accum;
342 xfs_agblock_t agbno;
343 unsigned int rmap_flags = 0;
344 int error;
345
346 if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
347 rf->rr->sc->sa.pag->pag_agno)
348 return 0;
349
350 agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
351 if (rf->whichfork == XFS_ATTR_FORK)
352 rmap_flags |= XFS_RMAP_ATTR_FORK;
353 if (rec->br_state == XFS_EXT_UNWRITTEN)
354 rmap_flags |= XFS_RMAP_UNWRITTEN;
355
356 /* If this bmap is adjacent to the previous one, just add it. */
357 if (accum->rm_blockcount > 0 &&
358 rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
359 agbno == accum->rm_startblock + accum->rm_blockcount &&
360 rmap_flags == accum->rm_flags) {
361 accum->rm_blockcount += rec->br_blockcount;
362 return 0;
363 }
364
365 /* Otherwise stash the old rmap and start accumulating a new one. */
366 error = xrep_rmap_stash_accumulated(rf);
367 if (error)
368 return error;
369
370 accum->rm_startblock = agbno;
371 accum->rm_blockcount = rec->br_blockcount;
372 accum->rm_offset = rec->br_startoff;
373 accum->rm_flags = rmap_flags;
374 return 0;
375}
376
377/* Add a btree block to the bitmap. */
378STATIC int
379xrep_rmap_visit_iroot_btree_block(
380 struct xfs_btree_cur *cur,
381 int level,
382 void *priv)
383{
384 struct xrep_rmap_ifork *rf = priv;
385 struct xfs_buf *bp;
386 xfs_fsblock_t fsbno;
387 xfs_agblock_t agbno;
388
389 xfs_btree_get_block(cur, level, &bp);
390 if (!bp)
391 return 0;
392
393 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
394 if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
395 return 0;
396
397 agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
398 return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
399}
400
401/*
402 * Iterate a metadata btree rooted in an inode to collect rmap records for
403 * anything in this fork that matches the AG.
404 */
405STATIC int
406xrep_rmap_scan_iroot_btree(
407 struct xrep_rmap_ifork *rf,
408 struct xfs_btree_cur *cur)
409{
410 struct xfs_owner_info oinfo;
411 struct xrep_rmap *rr = rf->rr;
412 int error;
413
414 xagb_bitmap_init(&rf->bmbt_blocks);
415
416 /* Record all the blocks in the btree itself. */
417 error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
418 XFS_BTREE_VISIT_ALL, rf);
419 if (error)
420 goto out;
421
422 /* Emit rmaps for the btree blocks. */
423 xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
424 error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
425 if (error)
426 goto out;
427
428 /* Stash any remaining accumulated rmaps. */
429 error = xrep_rmap_stash_accumulated(rf);
430out:
431 xagb_bitmap_destroy(&rf->bmbt_blocks);
432 return error;
433}
434
435static inline bool
436is_rt_data_fork(
437 struct xfs_inode *ip,
438 int whichfork)
439{
440 return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
441}
442
443/*
444 * Iterate the block mapping btree to collect rmap records for anything in this
445 * fork that matches the AG. Sets @mappings_done to true if we've scanned the
446 * block mappings in this fork.
447 */
448STATIC int
449xrep_rmap_scan_bmbt(
450 struct xrep_rmap_ifork *rf,
451 struct xfs_inode *ip,
452 bool *mappings_done)
453{
454 struct xrep_rmap *rr = rf->rr;
455 struct xfs_btree_cur *cur;
456 struct xfs_ifork *ifp;
457 int error;
458
459 *mappings_done = false;
460 ifp = xfs_ifork_ptr(ip, rf->whichfork);
461 cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
462
463 if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
464 xfs_need_iread_extents(ifp)) {
465 /*
466 * If the incore extent cache isn't loaded, scan the bmbt for
467 * mapping records. This avoids loading the incore extent
468 * tree, which will increase memory pressure at a time when
469 * we're trying to run as quickly as we possibly can. Ignore
470 * realtime extents.
471 */
472 error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
473 if (error)
474 goto out_cur;
475
476 *mappings_done = true;
477 }
478
479 /* Scan for the bmbt blocks, which always live on the data device. */
480 error = xrep_rmap_scan_iroot_btree(rf, cur);
481out_cur:
482 xfs_btree_del_cursor(cur, error);
483 return error;
484}
485
486/*
487 * Iterate the in-core extent cache to collect rmap records for anything in
488 * this fork that matches the AG.
489 */
490STATIC int
491xrep_rmap_scan_iext(
492 struct xrep_rmap_ifork *rf,
493 struct xfs_ifork *ifp)
494{
495 struct xfs_bmbt_irec rec;
496 struct xfs_iext_cursor icur;
497 int error;
498
499 for_each_xfs_iext(ifp, &icur, &rec) {
500 if (isnullstartblock(rec.br_startblock))
501 continue;
502 error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
503 if (error)
504 return error;
505 }
506
507 return xrep_rmap_stash_accumulated(rf);
508}
509
510/* Find all the extents from a given AG in an inode fork. */
511STATIC int
512xrep_rmap_scan_ifork(
513 struct xrep_rmap *rr,
514 struct xfs_inode *ip,
515 int whichfork)
516{
517 struct xrep_rmap_ifork rf = {
518 .accum = { .rm_owner = ip->i_ino, },
519 .rr = rr,
520 .whichfork = whichfork,
521 };
522 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
523 int error = 0;
524
525 if (!ifp)
526 return 0;
527
528 if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
529 bool mappings_done;
530
531 /*
532 * Scan the bmap btree for data device mappings. This includes
533 * the btree blocks themselves, even if this is a realtime
534 * file.
535 */
536 error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
537 if (error || mappings_done)
538 return error;
539 } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
540 return 0;
541 }
542
543 /* Scan incore extent cache if this isn't a realtime file. */
544 if (xfs_ifork_is_realtime(ip, whichfork))
545 return 0;
546
547 return xrep_rmap_scan_iext(&rf, ifp);
548}
549
550/*
551 * Take ILOCK on a file that we want to scan.
552 *
553 * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
554 * attr bmbt. Otherwise, take ILOCK_SHARED.
555 */
556static inline unsigned int
557xrep_rmap_scan_ilock(
558 struct xfs_inode *ip)
559{
560 uint lock_mode = XFS_ILOCK_SHARED;
561
562 if (xfs_need_iread_extents(&ip->i_df)) {
563 lock_mode = XFS_ILOCK_EXCL;
564 goto lock;
565 }
566
567 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
568 lock_mode = XFS_ILOCK_EXCL;
569
570lock:
571 xfs_ilock(ip, lock_mode);
572 return lock_mode;
573}
574
575/* Record reverse mappings for a file. */
576STATIC int
577xrep_rmap_scan_inode(
578 struct xrep_rmap *rr,
579 struct xfs_inode *ip)
580{
581 unsigned int lock_mode = 0;
582 int error;
583
584 /*
585 * Directory updates (create/link/unlink/rename) drop the directory's
586 * ILOCK before finishing any rmapbt updates associated with directory
587 * shape changes. For this scan to coordinate correctly with the live
588 * update hook, we must take the only lock (i_rwsem) that is held all
589 * the way to dir op completion. This will get fixed by the parent
590 * pointer patchset.
591 */
592 if (S_ISDIR(VFS_I(ip)->i_mode)) {
593 lock_mode = XFS_IOLOCK_SHARED;
594 xfs_ilock(ip, lock_mode);
595 }
596 lock_mode |= xrep_rmap_scan_ilock(ip);
597
598 /* Check the data fork. */
599 error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
600 if (error)
601 goto out_unlock;
602
603 /* Check the attr fork. */
604 error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
605 if (error)
606 goto out_unlock;
607
608 /* COW fork extents are "owned" by the refcount btree. */
609
610 xchk_iscan_mark_visited(&rr->iscan, ip);
611out_unlock:
612 xfs_iunlock(ip, lock_mode);
613 return error;
614}
615
616/* Section (I): Find all AG metadata extents except for free space metadata. */
617
/*
 * Context for the inobt walk: the blocks of the inode btrees and the blocks
 * of the inode chunks are gathered in separate bitmaps because they are
 * stashed with different rmap owners.
 */
struct xrep_rmap_inodes {
	/* repair context */
	struct xrep_rmap	*rr;
	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
};
623
/*
 * Record inode btree rmaps.
 *
 * Called for each inobt record.  Marks the btree blocks on the cursor's
 * current path in INOBIT, validates the record, and marks the blocks backing
 * the inode chunk (or, for a sparse chunk, only the populated parts) in
 * ICHUNKBIT.  Returns -EFSCORRUPTED if the record fails validation.
 */
STATIC int
xrep_rmap_walk_inobt(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xrep_rmap_inodes		*ri = priv;
	struct xfs_mount		*mp = cur->bc_mp;
	xfs_agblock_t			agbno;
	xfs_extlen_t			aglen;
	xfs_agino_t			agino;
	xfs_agino_t			iperhole;
	unsigned int			i;
	int				error;

	/* Record the inobt blocks. */
	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
	if (error)
		return error;

	xfs_inobt_btrec_to_irec(mp, rec, &irec);
	if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
		return -EFSCORRUPTED;

	agino = irec.ir_startino;

	/*
	 * Record a non-sparse inode chunk.  The chunk occupies at least one
	 * block even when a block holds more inodes than a chunk does.
	 */
	if (!xfs_inobt_issparse(irec.ir_holemask)) {
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		aglen = max_t(xfs_extlen_t, 1,
				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);

		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
	}

	/*
	 * Iterate each chunk.  Each step covers iperhole inodes: the larger
	 * of one block's worth of inodes and one holemask bit's worth.  That
	 * corresponds to aglen blocks and to
	 * iperhole / XFS_INODES_PER_HOLEMASK_BIT holemask bits, so only the
	 * first holemask bit of each step is tested.
	 */
	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
			XFS_INODES_PER_HOLEMASK_BIT);
	aglen = iperhole / mp->m_sb.sb_inopblock;
	for (i = 0, agino = irec.ir_startino;
	     i < XFS_INOBT_HOLEMASK_BITS;
	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
		/* Skip holes. */
		if (irec.ir_holemask & (1 << i))
			continue;

		/* Record the inode chunk otherwise. */
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
		if (error)
			return error;
	}

	return 0;
}
681
682/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
683STATIC int
684xrep_rmap_find_inode_rmaps(
685 struct xrep_rmap *rr)
686{
687 struct xrep_rmap_inodes ri = {
688 .rr = rr,
689 };
690 struct xfs_scrub *sc = rr->sc;
691 int error;
692
693 xagb_bitmap_init(&ri.inobt_blocks);
694 xagb_bitmap_init(&ri.ichunk_blocks);
695
696 /*
697 * Iterate every record in the inobt so we can capture all the inode
698 * chunks and the blocks in the inobt itself.
699 */
700 error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
701 if (error)
702 goto out_bitmap;
703
704 /*
705 * Note that if there are zero records in the inobt then query_all does
706 * nothing and we have to account the empty inobt root manually.
707 */
708 if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
709 struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
710
711 error = xagb_bitmap_set(&ri.inobt_blocks,
712 be32_to_cpu(agi->agi_root), 1);
713 if (error)
714 goto out_bitmap;
715 }
716
717 /* Scan the finobt too. */
718 if (xfs_has_finobt(sc->mp)) {
719 error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
720 sc->sa.fino_cur);
721 if (error)
722 goto out_bitmap;
723 }
724
725 /* Generate rmaps for everything. */
726 error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
727 &XFS_RMAP_OINFO_INOBT);
728 if (error)
729 goto out_bitmap;
730 error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
731 &XFS_RMAP_OINFO_INODES);
732
733out_bitmap:
734 xagb_bitmap_destroy(&ri.inobt_blocks);
735 xagb_bitmap_destroy(&ri.ichunk_blocks);
736 return error;
737}
738
739/* Record a CoW staging extent. */
740STATIC int
741xrep_rmap_walk_cowblocks(
742 struct xfs_btree_cur *cur,
743 const struct xfs_refcount_irec *irec,
744 void *priv)
745{
746 struct xagb_bitmap *bitmap = priv;
747
748 if (!xfs_refcount_check_domain(irec) ||
749 irec->rc_domain != XFS_REFC_DOMAIN_COW)
750 return -EFSCORRUPTED;
751
752 return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
753}
754
755/*
756 * Collect rmaps for the blocks containing the refcount btree, and all CoW
757 * staging extents.
758 */
759STATIC int
760xrep_rmap_find_refcount_rmaps(
761 struct xrep_rmap *rr)
762{
763 struct xagb_bitmap refcountbt_blocks; /* REFCBIT */
764 struct xagb_bitmap cow_blocks; /* COWBIT */
765 struct xfs_refcount_irec low = {
766 .rc_startblock = 0,
767 .rc_domain = XFS_REFC_DOMAIN_COW,
768 };
769 struct xfs_refcount_irec high = {
770 .rc_startblock = -1U,
771 .rc_domain = XFS_REFC_DOMAIN_COW,
772 };
773 struct xfs_scrub *sc = rr->sc;
774 int error;
775
776 if (!xfs_has_reflink(sc->mp))
777 return 0;
778
779 xagb_bitmap_init(&refcountbt_blocks);
780 xagb_bitmap_init(&cow_blocks);
781
782 /* refcountbt */
783 error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
784 if (error)
785 goto out_bitmap;
786
787 /* Collect rmaps for CoW staging extents. */
788 error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
789 xrep_rmap_walk_cowblocks, &cow_blocks);
790 if (error)
791 goto out_bitmap;
792
793 /* Generate rmaps for everything. */
794 error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
795 if (error)
796 goto out_bitmap;
797 error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
798 &XFS_RMAP_OINFO_REFC);
799
800out_bitmap:
801 xagb_bitmap_destroy(&cow_blocks);
802 xagb_bitmap_destroy(&refcountbt_blocks);
803 return error;
804}
805
806/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
807STATIC int
808xrep_rmap_find_agheader_rmaps(
809 struct xrep_rmap *rr)
810{
811 struct xfs_scrub *sc = rr->sc;
812
813 /* Create a record for the AG sb->agfl. */
814 return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
815 XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
816 XFS_RMAP_OWN_FS, 0, 0);
817}
818
819/* Generate rmaps for the log, if it's in this AG. */
820STATIC int
821xrep_rmap_find_log_rmaps(
822 struct xrep_rmap *rr)
823{
824 struct xfs_scrub *sc = rr->sc;
825
826 if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
827 return 0;
828
829 return xrep_rmap_stash(rr,
830 XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
831 sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
832}
833
834/* Check and count all the records that we gathered. */
835STATIC int
836xrep_rmap_check_record(
837 struct xfs_btree_cur *cur,
838 const struct xfs_rmap_irec *rec,
839 void *priv)
840{
841 struct xrep_rmap *rr = priv;
842 int error;
843
844 error = xrep_rmap_check_mapping(rr->sc, rec);
845 if (error)
846 return error;
847
848 rr->nr_records++;
849 return 0;
850}
851
/*
 * Generate all the reverse-mappings for this AG except those for the free
 * space metadata.  This implements section (I) above and proceeds in three
 * phases:
 *
 *  1. With the AG headers locked, stash rmaps for the per-AG metadata
 *     (inode btrees and chunks, refcountbt and CoW staging extents, AG
 *     headers, and the log).
 *  2. Drop the AG headers and the transaction, then scan every inode in the
 *     filesystem with an empty transaction to stash file and bmbt rmaps.
 *  3. Allocate a real transaction, relock the AG headers, then validate and
 *     count every stashed record (rr->nr_records).
 *
 * Returns 0 or a negative errno; -EFSCORRUPTED if the inode scan was aborted
 * or a stashed record fails validation.  The caller must clean up if
 * anything goes wrong.
 */
STATIC int
xrep_rmap_find_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xchk_ag		*sa = &sc->sa;
	struct xfs_inode	*ip;
	struct xfs_btree_cur	*mcur;
	int			error;

	/* Find all the per-AG metadata. */
	xrep_ag_btcur_init(sc, &sc->sa);

	error = xrep_rmap_find_inode_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_refcount_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_agheader_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_log_rmaps(rr);
end_agscan:
	/* The AG btree cursors are released whether or not the scan worked. */
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/*
	 * Set up for a potentially lengthy filesystem scan by reducing our
	 * transaction resource usage for the duration.  Specifically:
	 *
	 * Unlock the AG header buffers and cancel the transaction to release
	 * the log grant space while we scan the filesystem.
	 *
	 * Create a new empty transaction to eliminate the possibility of the
	 * inode scan deadlocking on cyclical metadata.
	 *
	 * We pass the empty transaction to the file scanning function to avoid
	 * repeatedly cycling empty transactions.  This can be done even though
	 * we take the IOLOCK to quiesce the file because empty transactions
	 * do not take sb_internal.
	 */
	sa->agf_bp = NULL;
	sa->agi_bp = NULL;
	xchk_trans_cancel(sc);
	error = xchk_trans_alloc_empty(sc);
	if (error)
		return error;

	/*
	 * Iterate all AGs for inodes rmaps.  The iterator returns 1 for each
	 * inode handed to us; any other value ends the loop and is treated as
	 * the result of the scan.
	 */
	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
		error = xrep_rmap_scan_inode(rr, ip);
		xchk_irele(sc, ip);
		if (error)
			break;

		if (xchk_should_terminate(sc, &error))
			break;
	}
	xchk_iscan_iter_finish(&rr->iscan);
	if (error)
		return error;

	/*
	 * Switch out for a real transaction and lock the AG headers in
	 * preparation for building a new tree.
	 */
	xchk_trans_cancel(sc);
	error = xchk_setup_fs(sc);
	if (error)
		return error;
	error = xchk_perag_drain_and_lock(sc);
	if (error)
		return error;

	/*
	 * If a hook failed to update the in-memory btree, we lack the data to
	 * continue the repair.
	 */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	/*
	 * Now that we have everything locked again, we need to count the
	 * number of rmap records stashed in the btree.  This should reflect
	 * all actively-owned space in the filesystem.  At the same time, check
	 * all our records before we start building a new btree, which requires
	 * a bnobt cursor.
	 */
	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	rr->nr_records = 0;
	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);

	/* Both cursors only exist for the duration of the record check. */
	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	xfs_btree_del_cursor(mcur, error);

	return error;
}
964
965/* Section (II): Reserving space for new rmapbt and setting free space bitmap */
966
/* Context passed to xrep_rmap_walk_agfl() through xfs_agfl_walk(). */
struct xrep_rmap_agfl {
	/* bitmap in which each AGFL block gets marked */
	struct xagb_bitmap	*bitmap;

	/*
	 * AG being repaired.  NOTE(review): filled in by
	 * xrep_rmap_try_reserve() but not read by the walk callback in the
	 * code visible here.
	 */
	xfs_agnumber_t		agno;
};
971
972/* Add an AGFL block to the rmap list. */
973STATIC int
974xrep_rmap_walk_agfl(
975 struct xfs_mount *mp,
976 xfs_agblock_t agbno,
977 void *priv)
978{
979 struct xrep_rmap_agfl *ra = priv;
980
981 return xagb_bitmap_set(ra->bitmap, agbno, 1);
982}
983
/*
 * Run one round of reserving space for the new rmapbt and recomputing the
 * number of blocks needed to store the previously observed rmapbt records and
 * the ones we'll create for the free space metadata.  When we don't need more
 * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
 * true.
 *
 * @blocks_reserved carries the number of blocks reserved by earlier rounds
 * (the caller starts it at zero) and is updated to the current target before
 * the geometry is recomputed.  @done is written only when the round
 * succeeds; on error the function returns early and leaves @freesp_blocks
 * partially populated for the caller to destroy.
 */
STATIC int
xrep_rmap_try_reserve(
	struct xrep_rmap	*rr,
	struct xfs_btree_cur	*rmap_cur,
	struct xagb_bitmap	*freesp_blocks,
	uint64_t		*blocks_reserved,
	bool			*done)
{
	struct xrep_rmap_agfl	ra = {
		.bitmap		= freesp_blocks,
		.agno		= rr->sc->sa.pag->pag_agno,
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xrep_newbt_resv	*resv, *n;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_buf		*agfl_bp;
	uint64_t		nr_blocks; /* RMB */
	uint64_t		freesp_records;
	int			error;

	/*
	 * We're going to recompute new_btree.bload.nr_blocks at the end of
	 * this function to reflect however many btree blocks we need to store
	 * all the rmap records (including the ones that reflect the changes we
	 * made to support the new rmapbt blocks), so we save the old value
	 * here so we can decide if we've reserved enough blocks.
	 */
	nr_blocks = rr->new_btree.bload.nr_blocks;

	/*
	 * Make sure we've reserved enough space for the new btree.  This can
	 * change the shape of the free space btrees, which can cause secondary
	 * interactions with the rmap records because all three space btrees
	 * have the same rmap owner.  We'll account for all that below.
	 */
	error = xrep_newbt_alloc_blocks(&rr->new_btree,
			nr_blocks - *blocks_reserved);
	if (error)
		return error;

	*blocks_reserved = rr->new_btree.bload.nr_blocks;

	/*
	 * Clear everything in the bitmap.
	 * NOTE(review): the bitmap is reused below without being
	 * reinitialized, so this relies on xagb_bitmap_destroy() leaving it
	 * empty but usable -- confirm against the bitmap implementation.
	 */
	xagb_bitmap_destroy(freesp_blocks);

	/* Set all the bnobt blocks in the bitmap. */
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	if (error)
		return error;

	/* Set all the cntbt blocks in the bitmap. */
	sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
	xfs_btree_del_cursor(sc->sa.cnt_cur, error);
	sc->sa.cnt_cur = NULL;
	if (error)
		return error;

	/*
	 * Record our new btreeblks value.  At this point the bitmap holds
	 * only bnobt and cntbt blocks, so this has to be sampled before the
	 * rmapbt reservations and AGFL blocks are added below.  The "- 2"
	 * presumably discounts the two root blocks -- TODO confirm.
	 */
	rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;

	/* Set all the new rmapbt blocks in the bitmap. */
	list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
		error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
		if (error)
			return error;
	}

	/* Set all the AGFL blocks in the bitmap. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
	if (error)
		return error;

	/* Count the extents in the bitmap. */
	freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);

	/* Compute how many blocks we'll need for all the rmaps. */
	error = xfs_btree_bload_compute_geometry(rmap_cur,
			&rr->new_btree.bload, rr->nr_records + freesp_records);
	if (error)
		return error;

	/*
	 * We're done when we don't need more blocks, i.e. the size we
	 * reserved for going in (nr_blocks) covers the recomputed size.
	 */
	*done = nr_blocks >= rr->new_btree.bload.nr_blocks;
	return 0;
}
1086
1087/*
1088 * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
1089 * the free space metadata. This implements section (II) above.
1090 */
1091STATIC int
1092xrep_rmap_reserve_space(
1093 struct xrep_rmap *rr,
1094 struct xfs_btree_cur *rmap_cur)
1095{
1096 struct xagb_bitmap freesp_blocks; /* AGBIT */
1097 uint64_t blocks_reserved = 0;
1098 bool done = false;
1099 int error;
1100
1101 /* Compute how many blocks we'll need for the rmaps collected so far. */
1102 error = xfs_btree_bload_compute_geometry(rmap_cur,
1103 &rr->new_btree.bload, rr->nr_records);
1104 if (error)
1105 return error;
1106
1107 /* Last chance to abort before we start committing fixes. */
1108 if (xchk_should_terminate(rr->sc, &error))
1109 return error;
1110
1111 xagb_bitmap_init(&freesp_blocks);
1112
1113 /*
1114 * Iteratively reserve space for the new rmapbt and recompute the
1115 * number of blocks needed to store the previously observed rmapbt
1116 * records and the ones we'll create for the free space metadata.
1117 * Finish when we don't need more blocks.
1118 */
1119 do {
1120 error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
1121 &blocks_reserved, &done);
1122 if (error)
1123 goto out_bitmap;
1124 } while (!done);
1125
1126 /* Emit rmaps for everything in the free space bitmap. */
1127 xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
1128 error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
1129 xchk_ag_btcur_free(&rr->sc->sa);
1130
1131out_bitmap:
1132 xagb_bitmap_destroy(&freesp_blocks);
1133 return error;
1134}
1135
1136/* Section (III): Building the new rmap btree. */
1137
1138/* Update the AGF counters. */
1139STATIC int
1140xrep_rmap_reset_counters(
1141 struct xrep_rmap *rr)
1142{
1143 struct xfs_scrub *sc = rr->sc;
1144 struct xfs_perag *pag = sc->sa.pag;
1145 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
1146 xfs_agblock_t rmap_btblocks;
1147
1148 /*
1149 * The AGF header contains extra information related to the reverse
1150 * mapping btree, so we must update those fields here.
1151 */
1152 rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
1153 agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
1154 xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
1155
1156 /*
1157 * After we commit the new btree to disk, it is possible that the
1158 * process to reap the old btree blocks will race with the AIL trying
1159 * to checkpoint the old btree blocks into the filesystem. If the new
1160 * tree is shorter than the old one, the rmapbt write verifier will
1161 * fail and the AIL will shut down the filesystem.
1162 *
1163 * To avoid this, save the old incore btree height values as the alt
1164 * height values before re-initializing the perag info from the updated
1165 * AGF to capture all the new values.
1166 */
1167 pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
1168
1169 /* Reinitialize with the values we just logged. */
1170 return xrep_reinit_pagf(sc);
1171}
1172
1173/* Retrieve rmapbt data for bulk load. */
1174STATIC int
1175xrep_rmap_get_records(
1176 struct xfs_btree_cur *cur,
1177 unsigned int idx,
1178 struct xfs_btree_block *block,
1179 unsigned int nr_wanted,
1180 void *priv)
1181{
1182 struct xrep_rmap *rr = priv;
1183 union xfs_btree_rec *block_rec;
1184 unsigned int loaded;
1185 int error;
1186
1187 for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
1188 int stat = 0;
1189
1190 error = xfs_btree_increment(rr->mcur, 0, &stat);
1191 if (error)
1192 return error;
1193 if (!stat)
1194 return -EFSCORRUPTED;
1195
1196 error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
1197 if (error)
1198 return error;
1199 if (!stat)
1200 return -EFSCORRUPTED;
1201
1202 block_rec = xfs_btree_rec_addr(cur, idx, block);
1203 cur->bc_ops->init_rec_from_cur(cur, block_rec);
1204 }
1205
1206 return loaded;
1207}
1208
1209/* Feed one of the new btree blocks to the bulk loader. */
1210STATIC int
1211xrep_rmap_claim_block(
1212 struct xfs_btree_cur *cur,
1213 union xfs_btree_ptr *ptr,
1214 void *priv)
1215{
1216 struct xrep_rmap *rr = priv;
1217
1218 return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
1219}
1220
1221/* Custom allocation function for new rmap btrees. */
1222STATIC int
1223xrep_rmap_alloc_vextent(
1224 struct xfs_scrub *sc,
1225 struct xfs_alloc_arg *args,
1226 xfs_fsblock_t alloc_hint)
1227{
1228 int error;
1229
1230 /*
1231 * We don't want an rmap update on the allocation, since we iteratively
1232 * compute the OWN_AG records /after/ allocating blocks for the records
1233 * that we already know we need to store. Therefore, fix the freelist
1234 * with the NORMAP flag set so that we don't also try to create an rmap
1235 * for new AGFL blocks.
1236 */
1237 error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
1238 if (error)
1239 return error;
1240
1241 /*
1242 * If xrep_fix_freelist fixed the freelist by moving blocks from the
1243 * free space btrees or by removing blocks from the AGFL and queueing
1244 * an EFI to free the block, the transaction will be dirty. This
1245 * second case is of interest to us.
1246 *
1247 * Later on, we will need to compare gaps in the new recordset against
1248 * the block usage of all OWN_AG owners in order to free the old
1249 * btree's blocks, which means that we can't have EFIs for former AGFL
1250 * blocks attached to the repair transaction when we commit the new
1251 * btree.
1252 *
1253 * xrep_newbt_alloc_blocks guarantees this for us by calling
1254 * xrep_defer_finish to commit anything that fix_freelist may have
1255 * added to the transaction.
1256 */
1257 return xfs_alloc_vextent_near_bno(args, alloc_hint);
1258}
1259
1260
1261/* Count the records in this btree. */
1262STATIC int
1263xrep_rmap_count_records(
1264 struct xfs_btree_cur *cur,
1265 unsigned long long *nr)
1266{
1267 int running = 1;
1268 int error;
1269
1270 *nr = 0;
1271
1272 error = xfs_btree_goto_left_edge(cur);
1273 if (error)
1274 return error;
1275
1276 while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
1277 if (running)
1278 (*nr)++;
1279 }
1280
1281 return error;
1282}
1283/*
1284 * Use the collected rmap information to stage a new rmap btree. If this is
1285 * successful we'll return with the new btree root information logged to the
1286 * repair transaction but not yet committed. This implements section (III)
1287 * above.
1288 */
1289STATIC int
1290xrep_rmap_build_new_tree(
1291 struct xrep_rmap *rr)
1292{
1293 struct xfs_scrub *sc = rr->sc;
1294 struct xfs_perag *pag = sc->sa.pag;
1295 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
1296 struct xfs_btree_cur *rmap_cur;
1297 xfs_fsblock_t fsbno;
1298 int error;
1299
1300 /*
1301 * Preserve the old rmapbt block count so that we can adjust the
1302 * per-AG rmapbt reservation after we commit the new btree root and
1303 * want to dispose of the old btree blocks.
1304 */
1305 rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
1306
1307 /*
1308 * Prepare to construct the new btree by reserving disk space for the
1309 * new btree and setting up all the accounting information we'll need
1310 * to root the new btree while it's under construction and before we
1311 * attach it to the AG header. The new blocks are accounted to the
1312 * rmapbt per-AG reservation, which we will adjust further after
1313 * committing the new btree.
1314 */
1315 fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
1316 xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
1317 fsbno, XFS_AG_RESV_RMAPBT);
1318 rr->new_btree.bload.get_records = xrep_rmap_get_records;
1319 rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
1320 rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
1321 rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
1322 xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
1323
1324 /*
1325 * Initialize @rr->new_btree, reserve space for the new rmapbt,
1326 * and compute OWN_AG rmaps.
1327 */
1328 error = xrep_rmap_reserve_space(rr, rmap_cur);
1329 if (error)
1330 goto err_cur;
1331
1332 /*
1333 * Count the rmapbt records again, because the space reservation
1334 * for the rmapbt itself probably added more records to the btree.
1335 */
1336 rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
1337 &rr->rmap_btree);
1338
1339 error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
1340 if (error)
1341 goto err_mcur;
1342
1343 /*
1344 * Due to btree slack factors, it's possible for a new btree to be one
1345 * level taller than the old btree. Update the incore btree height so
1346 * that we don't trip the verifiers when writing the new btree blocks
1347 * to disk.
1348 */
1349 pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
1350
1351 /*
1352 * Move the cursor to the left edge of the tree so that the first
1353 * increment in ->get_records positions us at the first record.
1354 */
1355 error = xfs_btree_goto_left_edge(rr->mcur);
1356 if (error)
1357 goto err_level;
1358
1359 /* Add all observed rmap records. */
1360 error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
1361 if (error)
1362 goto err_level;
1363
1364 /*
1365 * Install the new btree in the AG header. After this point the old
1366 * btree is no longer accessible and the new tree is live.
1367 */
1368 xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
1369 xfs_btree_del_cursor(rmap_cur, 0);
1370 xfs_btree_del_cursor(rr->mcur, 0);
1371 rr->mcur = NULL;
1372
1373 /*
1374 * Now that we've written the new btree to disk, we don't need to keep
1375 * updating the in-memory btree. Abort the scan to stop live updates.
1376 */
1377 xchk_iscan_abort(&rr->iscan);
1378
1379 /*
1380 * The newly committed rmap recordset includes mappings for the blocks
1381 * that we reserved to build the new btree. If there is excess space
1382 * reservation to be freed, the corresponding rmap records must also be
1383 * removed.
1384 */
1385 rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
1386
1387 /* Reset the AGF counters now that we've changed the btree shape. */
1388 error = xrep_rmap_reset_counters(rr);
1389 if (error)
1390 goto err_newbt;
1391
1392 /* Dispose of any unused blocks and the accounting information. */
1393 error = xrep_newbt_commit(&rr->new_btree);
1394 if (error)
1395 return error;
1396
1397 return xrep_roll_ag_trans(sc);
1398
1399err_level:
1400 pag->pagf_repair_rmap_level = 0;
1401err_mcur:
1402 xfs_btree_del_cursor(rr->mcur, error);
1403err_cur:
1404 xfs_btree_del_cursor(rmap_cur, error);
1405err_newbt:
1406 xrep_newbt_cancel(&rr->new_btree);
1407 return error;
1408}
1409
1410/* Section (IV): Reaping the old btree. */
1411
1412struct xrep_rmap_find_gaps {
1413 struct xagb_bitmap rmap_gaps;
1414 xfs_agblock_t next_agbno;
1415};
1416
1417/* Subtract each free extent in the bnobt from the rmap gaps. */
1418STATIC int
1419xrep_rmap_find_freesp(
1420 struct xfs_btree_cur *cur,
1421 const struct xfs_alloc_rec_incore *rec,
1422 void *priv)
1423{
1424 struct xrep_rmap_find_gaps *rfg = priv;
1425
1426 return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
1427 rec->ar_blockcount);
1428}
1429
1430/* Record the free space we find, as part of cleaning out the btree. */
1431STATIC int
1432xrep_rmap_find_gaps(
1433 struct xfs_btree_cur *cur,
1434 const struct xfs_rmap_irec *rec,
1435 void *priv)
1436{
1437 struct xrep_rmap_find_gaps *rfg = priv;
1438 int error;
1439
1440 if (rec->rm_startblock > rfg->next_agbno) {
1441 error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
1442 rec->rm_startblock - rfg->next_agbno);
1443 if (error)
1444 return error;
1445 }
1446
1447 rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
1448 rec->rm_startblock + rec->rm_blockcount);
1449 return 0;
1450}
1451
1452/*
1453 * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make
1454 * a list of gaps in the rmap records and a list of the extents mentioned in
1455 * the bnobt. Any block that's in the new rmapbt gap list but not mentioned
1456 * in the bnobt is a block from the old rmapbt and can be removed.
1457 */
1458STATIC int
1459xrep_rmap_remove_old_tree(
1460 struct xrep_rmap *rr)
1461{
1462 struct xrep_rmap_find_gaps rfg = {
1463 .next_agbno = 0,
1464 };
1465 struct xfs_scrub *sc = rr->sc;
1466 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
1467 struct xfs_perag *pag = sc->sa.pag;
1468 struct xfs_btree_cur *mcur;
1469 xfs_agblock_t agend;
1470 int error;
1471
1472 xagb_bitmap_init(&rfg.rmap_gaps);
1473
1474 /* Compute free space from the new rmapbt. */
1475 mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
1476
1477 error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
1478 xfs_btree_del_cursor(mcur, error);
1479 if (error)
1480 goto out_bitmap;
1481
1482 /* Insert a record for space between the last rmap and EOAG. */
1483 agend = be32_to_cpu(agf->agf_length);
1484 if (rfg.next_agbno < agend) {
1485 error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
1486 agend - rfg.next_agbno);
1487 if (error)
1488 goto out_bitmap;
1489 }
1490
1491 /* Compute free space from the existing bnobt. */
1492 sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1493 sc->sa.pag);
1494 error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
1495 &rfg);
1496 xfs_btree_del_cursor(sc->sa.bno_cur, error);
1497 sc->sa.bno_cur = NULL;
1498 if (error)
1499 goto out_bitmap;
1500
1501 /*
1502 * Free the "free" blocks that the new rmapbt knows about but the bnobt
1503 * doesn't--these are the old rmapbt blocks. Credit the old rmapbt
1504 * block usage count back to the per-AG rmapbt reservation (and not
1505 * fdblocks, since the rmap btree lives in free space) to keep the
1506 * reservation and free space accounting correct.
1507 */
1508 error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
1509 &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
1510 if (error)
1511 goto out_bitmap;
1512
1513 /*
1514 * Now that we've zapped all the old rmapbt blocks we can turn off
1515 * the alternate height mechanism and reset the per-AG space
1516 * reservation.
1517 */
1518 pag->pagf_repair_rmap_level = 0;
1519 sc->flags |= XREP_RESET_PERAG_RESV;
1520out_bitmap:
1521 xagb_bitmap_destroy(&rfg.rmap_gaps);
1522 return error;
1523}
1524
1525static inline bool
1526xrep_rmapbt_want_live_update(
1527 struct xchk_iscan *iscan,
1528 const struct xfs_owner_info *oi)
1529{
1530 if (xchk_iscan_aborted(iscan))
1531 return false;
1532
1533 /*
1534 * Before unlocking the AG header to perform the inode scan, we
1535 * recorded reverse mappings for all AG metadata except for the OWN_AG
1536 * metadata. IOWs, the in-memory btree knows about the AG headers, the
1537 * two inode btrees, the CoW staging extents, and the refcount btrees.
1538 * For these types of metadata, we need to record the live updates in
1539 * the in-memory rmap btree.
1540 *
1541 * However, we do not scan the free space btrees or the AGFL until we
1542 * have re-locked the AGF and are ready to reserve space for the new
1543 * rmap btree, so we do not want live updates for OWN_AG metadata.
1544 */
1545 if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
1546 return oi->oi_owner != XFS_RMAP_OWN_AG;
1547
1548 /* Ignore updates to files that the scanner hasn't visited yet. */
1549 return xchk_iscan_want_live_update(iscan, oi->oi_owner);
1550}
1551
1552/*
1553 * Apply a rmapbt update from the regular filesystem into our shadow btree.
1554 * We're running from the thread that owns the AGF buffer and is generating
1555 * the update, so we must be careful about which parts of the struct xrep_rmap
1556 * that we change.
1557 */
1558static int
1559xrep_rmapbt_live_update(
1560 struct notifier_block *nb,
1561 unsigned long action,
1562 void *data)
1563{
1564 struct xfs_rmap_update_params *p = data;
1565 struct xrep_rmap *rr;
1566 struct xfs_mount *mp;
1567 struct xfs_btree_cur *mcur;
1568 struct xfs_trans *tp;
1569 void *txcookie;
1570 int error;
1571
1572 rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
1573 mp = rr->sc->mp;
1574
1575 if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
1576 goto out_unlock;
1577
1578 trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
1579
1580 error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
1581 if (error)
1582 goto out_abort;
1583
1584 mutex_lock(&rr->lock);
1585 mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
1586 error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
1587 p->blockcount, &p->oinfo, p->unwritten);
1588 xfs_btree_del_cursor(mcur, error);
1589 if (error)
1590 goto out_cancel;
1591
1592 error = xfbtree_trans_commit(&rr->rmap_btree, tp);
1593 if (error)
1594 goto out_cancel;
1595
1596 xrep_trans_cancel_hook_dummy(&txcookie, tp);
1597 mutex_unlock(&rr->lock);
1598 return NOTIFY_DONE;
1599
1600out_cancel:
1601 xfbtree_trans_cancel(&rr->rmap_btree, tp);
1602 xrep_trans_cancel_hook_dummy(&txcookie, tp);
1603out_abort:
1604 mutex_unlock(&rr->lock);
1605 xchk_iscan_abort(&rr->iscan);
1606out_unlock:
1607 return NOTIFY_DONE;
1608}
1609
1610/* Set up the filesystem scan components. */
1611STATIC int
1612xrep_rmap_setup_scan(
1613 struct xrep_rmap *rr)
1614{
1615 struct xfs_scrub *sc = rr->sc;
1616 int error;
1617
1618 mutex_init(&rr->lock);
1619
1620 /* Set up in-memory rmap btree */
1621 error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
1622 sc->sa.pag->pag_agno);
1623 if (error)
1624 goto out_mutex;
1625
1626 /* Retry iget every tenth of a second for up to 30 seconds. */
1627 xchk_iscan_start(sc, 30000, 100, &rr->iscan);
1628
1629 /*
1630 * Hook into live rmap operations so that we can update our in-memory
1631 * btree to reflect live changes on the filesystem. Since we drop the
1632 * AGF buffer to scan all the inodes, we need this piece to avoid
1633 * installing a stale btree.
1634 */
1635 ASSERT(sc->flags & XCHK_FSGATES_RMAP);
1636 xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
1637 error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
1638 if (error)
1639 goto out_iscan;
1640 return 0;
1641
1642out_iscan:
1643 xchk_iscan_teardown(&rr->iscan);
1644 xfbtree_destroy(&rr->rmap_btree);
1645out_mutex:
1646 mutex_destroy(&rr->lock);
1647 return error;
1648}
1649
1650/* Tear down scan components. */
1651STATIC void
1652xrep_rmap_teardown(
1653 struct xrep_rmap *rr)
1654{
1655 struct xfs_scrub *sc = rr->sc;
1656
1657 xchk_iscan_abort(&rr->iscan);
1658 xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
1659 xchk_iscan_teardown(&rr->iscan);
1660 xfbtree_destroy(&rr->rmap_btree);
1661 mutex_destroy(&rr->lock);
1662}
1663
1664/* Repair the rmap btree for some AG. */
1665int
1666xrep_rmapbt(
1667 struct xfs_scrub *sc)
1668{
1669 struct xrep_rmap *rr = sc->buf;
1670 int error;
1671
1672 error = xrep_rmap_setup_scan(rr);
1673 if (error)
1674 return error;
1675
1676 /*
1677 * Collect rmaps for everything in this AG that isn't space metadata.
1678 * These rmaps won't change even as we try to allocate blocks.
1679 */
1680 error = xrep_rmap_find_rmaps(rr);
1681 if (error)
1682 goto out_records;
1683
1684 /* Rebuild the rmap information. */
1685 error = xrep_rmap_build_new_tree(rr);
1686 if (error)
1687 goto out_records;
1688
1689 /* Kill the old tree. */
1690 error = xrep_rmap_remove_old_tree(rr);
1691 if (error)
1692 goto out_records;
1693
1694out_records:
1695 xrep_rmap_teardown(rr);
1696 return error;
1697}
1698

source code of linux/fs/xfs/scrub/rmap_repair.c