1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (c) 2018-2024 Oracle. All Rights Reserved. |
4 | * Author: Darrick J. Wong <djwong@kernel.org> |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_trans_resv.h" |
11 | #include "xfs_mount.h" |
12 | #include "xfs_defer.h" |
13 | #include "xfs_btree.h" |
14 | #include "xfs_btree_staging.h" |
15 | #include "xfs_buf_mem.h" |
16 | #include "xfs_btree_mem.h" |
17 | #include "xfs_bit.h" |
18 | #include "xfs_log_format.h" |
19 | #include "xfs_trans.h" |
20 | #include "xfs_sb.h" |
21 | #include "xfs_alloc.h" |
22 | #include "xfs_alloc_btree.h" |
23 | #include "xfs_ialloc.h" |
24 | #include "xfs_ialloc_btree.h" |
25 | #include "xfs_rmap.h" |
26 | #include "xfs_rmap_btree.h" |
27 | #include "xfs_inode.h" |
28 | #include "xfs_icache.h" |
29 | #include "xfs_bmap.h" |
30 | #include "xfs_bmap_btree.h" |
31 | #include "xfs_refcount.h" |
32 | #include "xfs_refcount_btree.h" |
33 | #include "xfs_ag.h" |
34 | #include "scrub/xfs_scrub.h" |
35 | #include "scrub/scrub.h" |
36 | #include "scrub/common.h" |
37 | #include "scrub/btree.h" |
38 | #include "scrub/trace.h" |
39 | #include "scrub/repair.h" |
40 | #include "scrub/bitmap.h" |
41 | #include "scrub/agb_bitmap.h" |
42 | #include "scrub/xfile.h" |
43 | #include "scrub/xfarray.h" |
44 | #include "scrub/iscan.h" |
45 | #include "scrub/newbt.h" |
46 | #include "scrub/reap.h" |
47 | |
48 | /* |
49 | * Reverse Mapping Btree Repair |
50 | * ============================ |
51 | * |
52 | * This is the most involved of all the AG space btree rebuilds. Everywhere |
53 | * else in XFS we lock inodes and then AG data structures, but generating the |
54 | * list of rmap records requires that we be able to scan both block mapping |
55 | * btrees of every inode in the filesystem to see if it owns any extents in |
56 | * this AG. We can't tolerate any inode updates while we do this, so we |
57 | * freeze the filesystem to lock everyone else out, and grant ourselves |
58 | * special privileges to run transactions with regular background reclamation |
59 | * turned off. |
60 | * |
61 | * We also have to be very careful not to allow inode reclaim to start a |
62 | * transaction because all transactions (other than our own) will block. |
63 | * Deferred inode inactivation helps us out there. |
64 | * |
65 | * I) Reverse mappings for all non-space metadata and file data are collected |
66 | * according to the following algorithm: |
67 | * |
68 | * 1. For each fork of each inode: |
69 | * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary. |
70 | * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate |
71 | * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree |
72 | * block. |
73 | * 1.3. If the incore extent map is loaded but the fork is in btree format, |
74 | * just visit the bmbt blocks to set the corresponding BMBIT areas. |
75 | * 1.4. From the incore extent map, accumulate each bmap that falls into our |
76 | * target AG. Remember, multiple bmap records can map to a single rmap |
77 | * record, so we cannot simply emit rmap records 1:1. |
78 | * 1.5. Emit rmap records for each extent in BMBIT and free it. |
79 | * 2. Create bitmaps INOBIT and ICHUNKBIT. |
80 | * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT, |
81 | * and set bits in INOBIT for each btree block. If the inobt has no records |
82 | * at all, we must be careful to record its root in INOBIT. |
83 | * 4. For each block in the finobt, set the corresponding INOBIT area. |
84 | * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them. |
85 | * 6. Create bitmaps REFCBIT and COWBIT. |
86 | * 7. For each CoW staging extent in the refcountbt, set the corresponding |
87 | * areas in COWBIT. |
88 | * 8. For each block in the refcountbt, set the corresponding REFCBIT area. |
89 | * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them. |
90 | * A. Emit rmap for the AG headers. |
91 | * B. Emit rmap for the log, if there is one. |
92 | * |
93 | * II) The rmapbt shape and space metadata rmaps are computed as follows: |
94 | * |
95 | * 1. Count the rmaps collected in the previous step. (= NR) |
96 | * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB) |
97 | * 3. Reserve RMB blocks through the newbt using the allocator in normap mode. |
98 | * 4. Create bitmap AGBIT. |
99 | * 5. For each reservation in the newbt, set the corresponding areas in AGBIT. |
100 | * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT. |
101 | * 7. Count the extents in AGBIT. (= AGNR) |
102 | * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB') |
103 | * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB', |
104 | * and clear AGBIT. Go to step 5. |
105 | * A. Emit rmaps for each extent in AGBIT. |
106 | * |
107 | * III) The rmapbt is constructed and set in place as follows: |
108 | * |
109 | * 1. Sort the rmap records. |
110 | * 2. Bulk load the rmaps. |
111 | * |
112 | * IV) Reap the old btree blocks. |
113 | * |
114 | * 1. Create a bitmap OLDRMBIT. |
115 | * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT. |
116 | * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT. |
117 | * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are |
118 | * the parts of the AG that the rmap didn't find during its scan of the |
119 | * primary metadata and aren't known to be in the free space, which implies |
120 | * that they were the old rmapbt blocks. |
121 | * 5. Commit. |
122 | * |
123 | * We use the 'xrep_rmap' prefix for all the rmap functions. |
124 | */ |
125 | |
/* Context for collecting and rebuilding the reverse mappings of one AG. */
struct xrep_rmap {
	/* new rmapbt information */
	struct xrep_newbt	new_btree;

	/* lock for the xfbtree and xfile */
	struct mutex		lock;

	/* rmap records generated from primary metadata, staged in memory */
	struct xfbtree		rmap_btree;

	/* scrub context that owns the transaction and AG header buffers */
	struct xfs_scrub	*sc;

	/* in-memory btree cursor for the xfs_btree_bload iteration */
	struct xfs_btree_cur	*mcur;

	/* Hooks into rmap update code. */
	struct xfs_rmap_hook	rhook;

	/* inode scan cursor */
	struct xchk_iscan	iscan;

	/* Number of non-freespace records found. */
	unsigned long long	nr_records;

	/* bnobt/cntbt contribution to btreeblks */
	xfs_agblock_t		freesp_btblocks;

	/* old agf_rmap_blocks counter, i.e. the pre-repair rmapbt size */
	unsigned int		old_rmapbt_fsbcount;
};
157 | |
158 | /* Set us up to repair reverse mapping btrees. */ |
159 | int |
160 | xrep_setup_ag_rmapbt( |
161 | struct xfs_scrub *sc) |
162 | { |
163 | struct xrep_rmap *rr; |
164 | char *descr; |
165 | int error; |
166 | |
167 | xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); |
168 | |
169 | descr = xchk_xfile_ag_descr(sc, "reverse mapping records" ); |
170 | error = xrep_setup_xfbtree(sc, descr); |
171 | kfree(descr); |
172 | if (error) |
173 | return error; |
174 | |
175 | rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS); |
176 | if (!rr) |
177 | return -ENOMEM; |
178 | |
179 | rr->sc = sc; |
180 | sc->buf = rr; |
181 | return 0; |
182 | } |
183 | |
184 | /* Make sure there's nothing funny about this mapping. */ |
185 | STATIC int |
186 | xrep_rmap_check_mapping( |
187 | struct xfs_scrub *sc, |
188 | const struct xfs_rmap_irec *rec) |
189 | { |
190 | enum xbtree_recpacking outcome; |
191 | int error; |
192 | |
193 | if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL) |
194 | return -EFSCORRUPTED; |
195 | |
196 | /* Make sure this isn't free space. */ |
197 | error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, |
198 | rec->rm_blockcount, &outcome); |
199 | if (error) |
200 | return error; |
201 | if (outcome != XBTREE_RECPACKING_EMPTY) |
202 | return -EFSCORRUPTED; |
203 | |
204 | return 0; |
205 | } |
206 | |
/*
 * Store a reverse-mapping record in the in-memory rmap btree.  Returns
 * -EFSCORRUPTED if the inode scan has been aborted (e.g. by a failed live
 * update hook), since the collected records would then be incomplete.
 */
static inline int
xrep_rmap_stash(
	struct xrep_rmap	*rr,
	xfs_agblock_t		startblock,
	xfs_extlen_t		blockcount,
	uint64_t		owner,
	uint64_t		offset,
	unsigned int		flags)
{
	struct xfs_rmap_irec	rmap = {
		.rm_startblock	= startblock,
		.rm_blockcount	= blockcount,
		.rm_owner	= owner,
		.rm_offset	= offset,
		.rm_flags	= flags,
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*mcur;
	int			error = 0;

	/* Periodically check for a fatal signal or an unmount in progress. */
	if (xchk_should_terminate(sc, &error))
		return error;

	/* Records are unusable if the scan was aborted; don't add more. */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);

	/* Serialize access to the in-memory btree (see rr->lock comment). */
	mutex_lock(&rr->lock);
	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
	error = xfs_rmap_map_raw(mcur, &rmap);
	xfs_btree_del_cursor(mcur, error);
	if (error)
		goto out_cancel;

	/* Commit the xfbtree changes made under this transaction. */
	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
	if (error)
		goto out_abort;

	mutex_unlock(&rr->lock);
	return 0;

out_cancel:
	/* Toss the uncommitted in-memory btree updates... */
	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
out_abort:
	/* ...and mark the scan aborted so the records won't be trusted. */
	xchk_iscan_abort(&rr->iscan);
	mutex_unlock(&rr->lock);
	return error;
}
257 | |
/* Context for stashing one rmap record per extent in an xagb_bitmap. */
struct xrep_rmap_stash_run {
	struct xrep_rmap	*rr;
	/* owner applied to every record emitted during this run */
	uint64_t		owner;
	/* rmap flags applied to every record emitted during this run */
	unsigned int		rmap_flags;
};
263 | |
264 | static int |
265 | xrep_rmap_stash_run( |
266 | uint32_t start, |
267 | uint32_t len, |
268 | void *priv) |
269 | { |
270 | struct xrep_rmap_stash_run *rsr = priv; |
271 | struct xrep_rmap *rr = rsr->rr; |
272 | |
273 | return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags); |
274 | } |
275 | |
276 | /* |
277 | * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure |
278 | * that the ranges are in units of FS blocks. |
279 | */ |
280 | STATIC int |
281 | xrep_rmap_stash_bitmap( |
282 | struct xrep_rmap *rr, |
283 | struct xagb_bitmap *bitmap, |
284 | const struct xfs_owner_info *oinfo) |
285 | { |
286 | struct xrep_rmap_stash_run rsr = { |
287 | .rr = rr, |
288 | .owner = oinfo->oi_owner, |
289 | .rmap_flags = 0, |
290 | }; |
291 | |
292 | if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) |
293 | rsr.rmap_flags |= XFS_RMAP_ATTR_FORK; |
294 | if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) |
295 | rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK; |
296 | |
297 | return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr); |
298 | } |
299 | |
300 | /* Section (I): Finding all file and bmbt extents. */ |
301 | |
/* Context for accumulating rmaps for an inode fork. */
struct xrep_rmap_ifork {
	/*
	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
	 * single rmap.  rm_blockcount == 0 means nothing is accumulated.
	 */
	struct xfs_rmap_irec	accum;

	/* Bitmap of bmbt blocks in this AG. */
	struct xagb_bitmap	bmbt_blocks;

	struct xrep_rmap	*rr;

	/* Which inode fork?  (XFS_DATA_FORK or XFS_ATTR_FORK) */
	int			whichfork;
};
318 | |
319 | /* Stash an rmap that we accumulated while walking an inode fork. */ |
320 | STATIC int |
321 | xrep_rmap_stash_accumulated( |
322 | struct xrep_rmap_ifork *rf) |
323 | { |
324 | if (rf->accum.rm_blockcount == 0) |
325 | return 0; |
326 | |
327 | return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock, |
328 | rf->accum.rm_blockcount, rf->accum.rm_owner, |
329 | rf->accum.rm_offset, rf->accum.rm_flags); |
330 | } |
331 | |
/*
 * Accumulate a bmbt record.  Adjacent mappings with identical flags are
 * merged into a single rmap; when a mapping is not contiguous with the
 * accumulator, the accumulated rmap is stashed and a new one is started.
 */
STATIC int
xrep_rmap_visit_bmbt(
	struct xfs_btree_cur	*cur,
	struct xfs_bmbt_irec	*rec,
	void			*priv)
{
	struct xrep_rmap_ifork	*rf = priv;
	struct xfs_mount	*mp = rf->rr->sc->mp;
	struct xfs_rmap_irec	*accum = &rf->accum;
	xfs_agblock_t		agbno;
	unsigned int		rmap_flags = 0;
	int			error;

	/* Ignore mappings that do not land in the AG we're repairing. */
	if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
			rf->rr->sc->sa.pag->pag_agno)
		return 0;

	agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
	if (rf->whichfork == XFS_ATTR_FORK)
		rmap_flags |= XFS_RMAP_ATTR_FORK;
	if (rec->br_state == XFS_EXT_UNWRITTEN)
		rmap_flags |= XFS_RMAP_UNWRITTEN;

	/* If this bmap is adjacent to the previous one, just add it. */
	if (accum->rm_blockcount > 0 &&
	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
	    agbno == accum->rm_startblock + accum->rm_blockcount &&
	    rmap_flags == accum->rm_flags) {
		accum->rm_blockcount += rec->br_blockcount;
		return 0;
	}

	/* Otherwise stash the old rmap and start accumulating a new one. */
	error = xrep_rmap_stash_accumulated(rf);
	if (error)
		return error;

	accum->rm_startblock = agbno;
	accum->rm_blockcount = rec->br_blockcount;
	accum->rm_offset = rec->br_startoff;
	accum->rm_flags = rmap_flags;
	return 0;
}
376 | |
377 | /* Add a btree block to the bitmap. */ |
378 | STATIC int |
379 | xrep_rmap_visit_iroot_btree_block( |
380 | struct xfs_btree_cur *cur, |
381 | int level, |
382 | void *priv) |
383 | { |
384 | struct xrep_rmap_ifork *rf = priv; |
385 | struct xfs_buf *bp; |
386 | xfs_fsblock_t fsbno; |
387 | xfs_agblock_t agbno; |
388 | |
389 | xfs_btree_get_block(cur, level, &bp); |
390 | if (!bp) |
391 | return 0; |
392 | |
393 | fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); |
394 | if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno) |
395 | return 0; |
396 | |
397 | agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); |
398 | return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1); |
399 | } |
400 | |
401 | /* |
402 | * Iterate a metadata btree rooted in an inode to collect rmap records for |
403 | * anything in this fork that matches the AG. |
404 | */ |
405 | STATIC int |
406 | xrep_rmap_scan_iroot_btree( |
407 | struct xrep_rmap_ifork *rf, |
408 | struct xfs_btree_cur *cur) |
409 | { |
410 | struct xfs_owner_info oinfo; |
411 | struct xrep_rmap *rr = rf->rr; |
412 | int error; |
413 | |
414 | xagb_bitmap_init(&rf->bmbt_blocks); |
415 | |
416 | /* Record all the blocks in the btree itself. */ |
417 | error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block, |
418 | XFS_BTREE_VISIT_ALL, rf); |
419 | if (error) |
420 | goto out; |
421 | |
422 | /* Emit rmaps for the btree blocks. */ |
423 | xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork); |
424 | error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo); |
425 | if (error) |
426 | goto out; |
427 | |
428 | /* Stash any remaining accumulated rmaps. */ |
429 | error = xrep_rmap_stash_accumulated(rf); |
430 | out: |
431 | xagb_bitmap_destroy(&rf->bmbt_blocks); |
432 | return error; |
433 | } |
434 | |
435 | static inline bool |
436 | is_rt_data_fork( |
437 | struct xfs_inode *ip, |
438 | int whichfork) |
439 | { |
440 | return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK; |
441 | } |
442 | |
443 | /* |
444 | * Iterate the block mapping btree to collect rmap records for anything in this |
445 | * fork that matches the AG. Sets @mappings_done to true if we've scanned the |
446 | * block mappings in this fork. |
447 | */ |
448 | STATIC int |
449 | xrep_rmap_scan_bmbt( |
450 | struct xrep_rmap_ifork *rf, |
451 | struct xfs_inode *ip, |
452 | bool *mappings_done) |
453 | { |
454 | struct xrep_rmap *rr = rf->rr; |
455 | struct xfs_btree_cur *cur; |
456 | struct xfs_ifork *ifp; |
457 | int error; |
458 | |
459 | *mappings_done = false; |
460 | ifp = xfs_ifork_ptr(ip, rf->whichfork); |
461 | cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork); |
462 | |
463 | if (!xfs_ifork_is_realtime(ip, rf->whichfork) && |
464 | xfs_need_iread_extents(ifp)) { |
465 | /* |
466 | * If the incore extent cache isn't loaded, scan the bmbt for |
467 | * mapping records. This avoids loading the incore extent |
468 | * tree, which will increase memory pressure at a time when |
469 | * we're trying to run as quickly as we possibly can. Ignore |
470 | * realtime extents. |
471 | */ |
472 | error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf); |
473 | if (error) |
474 | goto out_cur; |
475 | |
476 | *mappings_done = true; |
477 | } |
478 | |
479 | /* Scan for the bmbt blocks, which always live on the data device. */ |
480 | error = xrep_rmap_scan_iroot_btree(rf, cur); |
481 | out_cur: |
482 | xfs_btree_del_cursor(cur, error); |
483 | return error; |
484 | } |
485 | |
486 | /* |
487 | * Iterate the in-core extent cache to collect rmap records for anything in |
488 | * this fork that matches the AG. |
489 | */ |
490 | STATIC int |
491 | xrep_rmap_scan_iext( |
492 | struct xrep_rmap_ifork *rf, |
493 | struct xfs_ifork *ifp) |
494 | { |
495 | struct xfs_bmbt_irec rec; |
496 | struct xfs_iext_cursor icur; |
497 | int error; |
498 | |
499 | for_each_xfs_iext(ifp, &icur, &rec) { |
500 | if (isnullstartblock(rec.br_startblock)) |
501 | continue; |
502 | error = xrep_rmap_visit_bmbt(NULL, &rec, rf); |
503 | if (error) |
504 | return error; |
505 | } |
506 | |
507 | return xrep_rmap_stash_accumulated(rf); |
508 | } |
509 | |
/*
 * Find all the extents from a given AG in an inode fork.  Forks in local or
 * uuid format have no block mappings, so they contribute nothing here.
 */
STATIC int
xrep_rmap_scan_ifork(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xrep_rmap_ifork	rf = {
		.accum		= { .rm_owner = ip->i_ino, },
		.rr		= rr,
		.whichfork	= whichfork,
	};
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	int			error = 0;

	/* No fork at all?  Nothing to scan. */
	if (!ifp)
		return 0;

	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
		bool		mappings_done;

		/*
		 * Scan the bmap btree for data device mappings.  This includes
		 * the btree blocks themselves, even if this is a realtime
		 * file.
		 */
		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
		if (error || mappings_done)
			return error;
	} else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
		return 0;
	}

	/* Scan incore extent cache if this isn't a realtime file. */
	if (xfs_ifork_is_realtime(ip, whichfork))
		return 0;

	return xrep_rmap_scan_iext(&rf, ifp);
}
549 | |
550 | /* |
551 | * Take ILOCK on a file that we want to scan. |
552 | * |
553 | * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded |
554 | * attr bmbt. Otherwise, take ILOCK_SHARED. |
555 | */ |
556 | static inline unsigned int |
557 | xrep_rmap_scan_ilock( |
558 | struct xfs_inode *ip) |
559 | { |
560 | uint lock_mode = XFS_ILOCK_SHARED; |
561 | |
562 | if (xfs_need_iread_extents(&ip->i_df)) { |
563 | lock_mode = XFS_ILOCK_EXCL; |
564 | goto lock; |
565 | } |
566 | |
567 | if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) |
568 | lock_mode = XFS_ILOCK_EXCL; |
569 | |
570 | lock: |
571 | xfs_ilock(ip, lock_mode); |
572 | return lock_mode; |
573 | } |
574 | |
/*
 * Record reverse mappings for a file.  Scans both the data and attr forks,
 * then marks the inode visited so the live-update hooks can take over for
 * subsequent changes to this file.
 */
STATIC int
xrep_rmap_scan_inode(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip)
{
	/* Accumulates both the IOLOCK and the ILOCK flags for the unlock. */
	unsigned int		lock_mode = 0;
	int			error;

	/*
	 * Directory updates (create/link/unlink/rename) drop the directory's
	 * ILOCK before finishing any rmapbt updates associated with directory
	 * shape changes.  For this scan to coordinate correctly with the live
	 * update hook, we must take the only lock (i_rwsem) that is held all
	 * the way to dir op completion.  This will get fixed by the parent
	 * pointer patchset.
	 */
	if (S_ISDIR(VFS_I(ip)->i_mode)) {
		lock_mode = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lock_mode);
	}
	lock_mode |= xrep_rmap_scan_ilock(ip);

	/* Check the data fork. */
	error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
	if (error)
		goto out_unlock;

	/* Check the attr fork. */
	error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
	if (error)
		goto out_unlock;

	/* COW fork extents are "owned" by the refcount btree. */

	xchk_iscan_mark_visited(&rr->iscan, ip);
out_unlock:
	xfs_iunlock(ip, lock_mode);
	return error;
}
615 | |
616 | /* Section (I): Find all AG metadata extents except for free space metadata. */ |
617 | |
/* Context for collecting rmaps of the inode btrees and inode chunks. */
struct xrep_rmap_inodes {
	struct xrep_rmap	*rr;
	/* blocks of the inobt/finobt themselves */
	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
	/* blocks containing the inode chunks */
	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
};
623 | |
/*
 * Record inode btree rmaps: the btree blocks on the cursor path to this
 * record, plus the blocks of every inode chunk described by the record.
 */
STATIC int
xrep_rmap_walk_inobt(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xrep_rmap_inodes		*ri = priv;
	struct xfs_mount		*mp = cur->bc_mp;
	xfs_agblock_t			agbno;
	xfs_extlen_t			aglen;
	xfs_agino_t			agino;
	xfs_agino_t			iperhole;
	unsigned int			i;
	int				error;

	/* Record the inobt blocks. */
	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
	if (error)
		return error;

	xfs_inobt_btrec_to_irec(mp, rec, &irec);
	if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
		return -EFSCORRUPTED;

	agino = irec.ir_startino;

	/* Record a non-sparse inode chunk in one go. */
	if (!xfs_inobt_issparse(irec.ir_holemask)) {
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		/* Chunk length in blocks; at least one block. */
		aglen = max_t(xfs_extlen_t, 1,
				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);

		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
	}

	/*
	 * Iterate each chunk.  The step (iperhole) covers a whole number of
	 * fs blocks and a whole number of holemask bits, so each iteration
	 * looks at an independently-mappable piece of the sparse chunk.
	 */
	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
			XFS_INODES_PER_HOLEMASK_BIT);
	aglen = iperhole / mp->m_sb.sb_inopblock;
	for (i = 0, agino = irec.ir_startino;
	     i < XFS_INOBT_HOLEMASK_BITS;
	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
		/* Skip holes. */
		if (irec.ir_holemask & (1 << i))
			continue;

		/* Record the inode chunk otherwise. */
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
		if (error)
			return error;
	}

	return 0;
}
681 | |
/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
STATIC int
xrep_rmap_find_inode_rmaps(
	struct xrep_rmap	*rr)
{
	struct xrep_rmap_inodes	ri = {
		.rr		= rr,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	xagb_bitmap_init(&ri.inobt_blocks);
	xagb_bitmap_init(&ri.ichunk_blocks);

	/*
	 * Iterate every record in the inobt so we can capture all the inode
	 * chunks and the blocks in the inobt itself.
	 */
	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
	if (error)
		goto out_bitmap;

	/*
	 * Note that if there are zero records in the inobt then query_all does
	 * nothing and we have to account the empty inobt root manually.
	 */
	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;

		error = xagb_bitmap_set(&ri.inobt_blocks,
				be32_to_cpu(agi->agi_root), 1);
		if (error)
			goto out_bitmap;
	}

	/* Scan the finobt too; its blocks share the inobt owner. */
	if (xfs_has_finobt(sc->mp)) {
		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
				sc->sa.fino_cur);
		if (error)
			goto out_bitmap;
	}

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
			&XFS_RMAP_OINFO_INOBT);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
			&XFS_RMAP_OINFO_INODES);

out_bitmap:
	xagb_bitmap_destroy(&ri.inobt_blocks);
	xagb_bitmap_destroy(&ri.ichunk_blocks);
	return error;
}
738 | |
739 | /* Record a CoW staging extent. */ |
740 | STATIC int |
741 | xrep_rmap_walk_cowblocks( |
742 | struct xfs_btree_cur *cur, |
743 | const struct xfs_refcount_irec *irec, |
744 | void *priv) |
745 | { |
746 | struct xagb_bitmap *bitmap = priv; |
747 | |
748 | if (!xfs_refcount_check_domain(irec) || |
749 | irec->rc_domain != XFS_REFC_DOMAIN_COW) |
750 | return -EFSCORRUPTED; |
751 | |
752 | return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount); |
753 | } |
754 | |
755 | /* |
756 | * Collect rmaps for the blocks containing the refcount btree, and all CoW |
757 | * staging extents. |
758 | */ |
759 | STATIC int |
760 | xrep_rmap_find_refcount_rmaps( |
761 | struct xrep_rmap *rr) |
762 | { |
763 | struct xagb_bitmap refcountbt_blocks; /* REFCBIT */ |
764 | struct xagb_bitmap cow_blocks; /* COWBIT */ |
765 | struct xfs_refcount_irec low = { |
766 | .rc_startblock = 0, |
767 | .rc_domain = XFS_REFC_DOMAIN_COW, |
768 | }; |
769 | struct xfs_refcount_irec high = { |
770 | .rc_startblock = -1U, |
771 | .rc_domain = XFS_REFC_DOMAIN_COW, |
772 | }; |
773 | struct xfs_scrub *sc = rr->sc; |
774 | int error; |
775 | |
776 | if (!xfs_has_reflink(sc->mp)) |
777 | return 0; |
778 | |
779 | xagb_bitmap_init(&refcountbt_blocks); |
780 | xagb_bitmap_init(&cow_blocks); |
781 | |
782 | /* refcountbt */ |
783 | error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur); |
784 | if (error) |
785 | goto out_bitmap; |
786 | |
787 | /* Collect rmaps for CoW staging extents. */ |
788 | error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high, |
789 | xrep_rmap_walk_cowblocks, &cow_blocks); |
790 | if (error) |
791 | goto out_bitmap; |
792 | |
793 | /* Generate rmaps for everything. */ |
794 | error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); |
795 | if (error) |
796 | goto out_bitmap; |
797 | error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks, |
798 | &XFS_RMAP_OINFO_REFC); |
799 | |
800 | out_bitmap: |
801 | xagb_bitmap_destroy(&cow_blocks); |
802 | xagb_bitmap_destroy(&refcountbt_blocks); |
803 | return error; |
804 | } |
805 | |
806 | /* Generate rmaps for the AG headers (AGI/AGF/AGFL) */ |
807 | STATIC int |
808 | ( |
809 | struct xrep_rmap *rr) |
810 | { |
811 | struct xfs_scrub *sc = rr->sc; |
812 | |
813 | /* Create a record for the AG sb->agfl. */ |
814 | return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp), |
815 | XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1, |
816 | XFS_RMAP_OWN_FS, 0, 0); |
817 | } |
818 | |
819 | /* Generate rmaps for the log, if it's in this AG. */ |
820 | STATIC int |
821 | xrep_rmap_find_log_rmaps( |
822 | struct xrep_rmap *rr) |
823 | { |
824 | struct xfs_scrub *sc = rr->sc; |
825 | |
826 | if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno)) |
827 | return 0; |
828 | |
829 | return xrep_rmap_stash(rr, |
830 | XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart), |
831 | sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0); |
832 | } |
833 | |
834 | /* Check and count all the records that we gathered. */ |
835 | STATIC int |
836 | xrep_rmap_check_record( |
837 | struct xfs_btree_cur *cur, |
838 | const struct xfs_rmap_irec *rec, |
839 | void *priv) |
840 | { |
841 | struct xrep_rmap *rr = priv; |
842 | int error; |
843 | |
844 | error = xrep_rmap_check_mapping(rr->sc, rec); |
845 | if (error) |
846 | return error; |
847 | |
848 | rr->nr_records++; |
849 | return 0; |
850 | } |
851 | |
852 | /* |
853 | * Generate all the reverse-mappings for this AG, a list of the old rmapbt |
854 | * blocks, and the new btreeblks count. Figure out if we have enough free |
855 | * space to reconstruct the inode btrees. The caller must clean up the lists |
856 | * if anything goes wrong. This implements section (I) above. |
857 | */ |
858 | STATIC int |
859 | xrep_rmap_find_rmaps( |
860 | struct xrep_rmap *rr) |
861 | { |
862 | struct xfs_scrub *sc = rr->sc; |
863 | struct xchk_ag *sa = &sc->sa; |
864 | struct xfs_inode *ip; |
865 | struct xfs_btree_cur *mcur; |
866 | int error; |
867 | |
868 | /* Find all the per-AG metadata. */ |
869 | xrep_ag_btcur_init(sc, &sc->sa); |
870 | |
871 | error = xrep_rmap_find_inode_rmaps(rr); |
872 | if (error) |
873 | goto end_agscan; |
874 | |
875 | error = xrep_rmap_find_refcount_rmaps(rr); |
876 | if (error) |
877 | goto end_agscan; |
878 | |
879 | error = xrep_rmap_find_agheader_rmaps(rr); |
880 | if (error) |
881 | goto end_agscan; |
882 | |
883 | error = xrep_rmap_find_log_rmaps(rr); |
884 | end_agscan: |
885 | xchk_ag_btcur_free(&sc->sa); |
886 | if (error) |
887 | return error; |
888 | |
889 | /* |
890 | * Set up for a potentially lengthy filesystem scan by reducing our |
891 | * transaction resource usage for the duration. Specifically: |
892 | * |
893 | * Unlock the AG header buffers and cancel the transaction to release |
894 | * the log grant space while we scan the filesystem. |
895 | * |
896 | * Create a new empty transaction to eliminate the possibility of the |
897 | * inode scan deadlocking on cyclical metadata. |
898 | * |
899 | * We pass the empty transaction to the file scanning function to avoid |
900 | * repeatedly cycling empty transactions. This can be done even though |
901 | * we take the IOLOCK to quiesce the file because empty transactions |
902 | * do not take sb_internal. |
903 | */ |
904 | sa->agf_bp = NULL; |
905 | sa->agi_bp = NULL; |
906 | xchk_trans_cancel(sc); |
907 | error = xchk_trans_alloc_empty(sc); |
908 | if (error) |
909 | return error; |
910 | |
911 | /* Iterate all AGs for inodes rmaps. */ |
912 | while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { |
913 | error = xrep_rmap_scan_inode(rr, ip); |
914 | xchk_irele(sc, ip); |
915 | if (error) |
916 | break; |
917 | |
918 | if (xchk_should_terminate(sc, &error)) |
919 | break; |
920 | } |
921 | xchk_iscan_iter_finish(&rr->iscan); |
922 | if (error) |
923 | return error; |
924 | |
925 | /* |
926 | * Switch out for a real transaction and lock the AG headers in |
927 | * preparation for building a new tree. |
928 | */ |
929 | xchk_trans_cancel(sc); |
930 | error = xchk_setup_fs(sc); |
931 | if (error) |
932 | return error; |
933 | error = xchk_perag_drain_and_lock(sc); |
934 | if (error) |
935 | return error; |
936 | |
937 | /* |
938 | * If a hook failed to update the in-memory btree, we lack the data to |
939 | * continue the repair. |
940 | */ |
941 | if (xchk_iscan_aborted(&rr->iscan)) |
942 | return -EFSCORRUPTED; |
943 | |
944 | /* |
945 | * Now that we have everything locked again, we need to count the |
946 | * number of rmap records stashed in the btree. This should reflect |
947 | * all actively-owned space in the filesystem. At the same time, check |
948 | * all our records before we start building a new btree, which requires |
949 | * a bnobt cursor. |
950 | */ |
951 | mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); |
952 | sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, |
953 | sc->sa.pag); |
954 | |
955 | rr->nr_records = 0; |
956 | error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr); |
957 | |
958 | xfs_btree_del_cursor(sc->sa.bno_cur, error); |
959 | sc->sa.bno_cur = NULL; |
960 | xfs_btree_del_cursor(mcur, error); |
961 | |
962 | return error; |
963 | } |
964 | |
965 | /* Section (II): Reserving space for new rmapbt and setting free space bitmap */ |
966 | |
/* Context passed to xrep_rmap_walk_agfl while stashing AGFL blocks. */
struct xrep_rmap_agfl {
	/* Bitmap in which each visited AGFL block gets marked. */
	struct xagb_bitmap	*bitmap;
	/* AG being repaired; set by the caller but not read by the walker. */
	xfs_agnumber_t		agno;
};
971 | |
972 | /* Add an AGFL block to the rmap list. */ |
973 | STATIC int |
974 | xrep_rmap_walk_agfl( |
975 | struct xfs_mount *mp, |
976 | xfs_agblock_t agbno, |
977 | void *priv) |
978 | { |
979 | struct xrep_rmap_agfl *ra = priv; |
980 | |
981 | return xagb_bitmap_set(ra->bitmap, agbno, 1); |
982 | } |
983 | |
984 | /* |
985 | * Run one round of reserving space for the new rmapbt and recomputing the |
986 | * number of blocks needed to store the previously observed rmapbt records and |
987 | * the ones we'll create for the free space metadata. When we don't need more |
988 | * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to |
989 | * true. |
990 | */ |
991 | STATIC int |
992 | xrep_rmap_try_reserve( |
993 | struct xrep_rmap *rr, |
994 | struct xfs_btree_cur *rmap_cur, |
995 | struct xagb_bitmap *freesp_blocks, |
996 | uint64_t *blocks_reserved, |
997 | bool *done) |
998 | { |
999 | struct xrep_rmap_agfl ra = { |
1000 | .bitmap = freesp_blocks, |
1001 | .agno = rr->sc->sa.pag->pag_agno, |
1002 | }; |
1003 | struct xfs_scrub *sc = rr->sc; |
1004 | struct xrep_newbt_resv *resv, *n; |
1005 | struct xfs_agf *agf = sc->sa.agf_bp->b_addr; |
1006 | struct xfs_buf *agfl_bp; |
1007 | uint64_t nr_blocks; /* RMB */ |
1008 | uint64_t freesp_records; |
1009 | int error; |
1010 | |
1011 | /* |
1012 | * We're going to recompute new_btree.bload.nr_blocks at the end of |
1013 | * this function to reflect however many btree blocks we need to store |
1014 | * all the rmap records (including the ones that reflect the changes we |
1015 | * made to support the new rmapbt blocks), so we save the old value |
1016 | * here so we can decide if we've reserved enough blocks. |
1017 | */ |
1018 | nr_blocks = rr->new_btree.bload.nr_blocks; |
1019 | |
1020 | /* |
1021 | * Make sure we've reserved enough space for the new btree. This can |
1022 | * change the shape of the free space btrees, which can cause secondary |
1023 | * interactions with the rmap records because all three space btrees |
1024 | * have the same rmap owner. We'll account for all that below. |
1025 | */ |
1026 | error = xrep_newbt_alloc_blocks(&rr->new_btree, |
1027 | nr_blocks - *blocks_reserved); |
1028 | if (error) |
1029 | return error; |
1030 | |
1031 | *blocks_reserved = rr->new_btree.bload.nr_blocks; |
1032 | |
1033 | /* Clear everything in the bitmap. */ |
1034 | xagb_bitmap_destroy(freesp_blocks); |
1035 | |
1036 | /* Set all the bnobt blocks in the bitmap. */ |
1037 | sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, |
1038 | sc->sa.pag); |
1039 | error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur); |
1040 | xfs_btree_del_cursor(sc->sa.bno_cur, error); |
1041 | sc->sa.bno_cur = NULL; |
1042 | if (error) |
1043 | return error; |
1044 | |
1045 | /* Set all the cntbt blocks in the bitmap. */ |
1046 | sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, |
1047 | sc->sa.pag); |
1048 | error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur); |
1049 | xfs_btree_del_cursor(sc->sa.cnt_cur, error); |
1050 | sc->sa.cnt_cur = NULL; |
1051 | if (error) |
1052 | return error; |
1053 | |
1054 | /* Record our new btreeblks value. */ |
1055 | rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2; |
1056 | |
1057 | /* Set all the new rmapbt blocks in the bitmap. */ |
1058 | list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) { |
1059 | error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len); |
1060 | if (error) |
1061 | return error; |
1062 | } |
1063 | |
1064 | /* Set all the AGFL blocks in the bitmap. */ |
1065 | error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); |
1066 | if (error) |
1067 | return error; |
1068 | |
1069 | error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra); |
1070 | if (error) |
1071 | return error; |
1072 | |
1073 | /* Count the extents in the bitmap. */ |
1074 | freesp_records = xagb_bitmap_count_set_regions(freesp_blocks); |
1075 | |
1076 | /* Compute how many blocks we'll need for all the rmaps. */ |
1077 | error = xfs_btree_bload_compute_geometry(rmap_cur, |
1078 | &rr->new_btree.bload, rr->nr_records + freesp_records); |
1079 | if (error) |
1080 | return error; |
1081 | |
1082 | /* We're done when we don't need more blocks. */ |
1083 | *done = nr_blocks >= rr->new_btree.bload.nr_blocks; |
1084 | return 0; |
1085 | } |
1086 | |
1087 | /* |
1088 | * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for |
1089 | * the free space metadata. This implements section (II) above. |
1090 | */ |
1091 | STATIC int |
1092 | xrep_rmap_reserve_space( |
1093 | struct xrep_rmap *rr, |
1094 | struct xfs_btree_cur *rmap_cur) |
1095 | { |
1096 | struct xagb_bitmap freesp_blocks; /* AGBIT */ |
1097 | uint64_t blocks_reserved = 0; |
1098 | bool done = false; |
1099 | int error; |
1100 | |
1101 | /* Compute how many blocks we'll need for the rmaps collected so far. */ |
1102 | error = xfs_btree_bload_compute_geometry(rmap_cur, |
1103 | &rr->new_btree.bload, rr->nr_records); |
1104 | if (error) |
1105 | return error; |
1106 | |
1107 | /* Last chance to abort before we start committing fixes. */ |
1108 | if (xchk_should_terminate(rr->sc, &error)) |
1109 | return error; |
1110 | |
1111 | xagb_bitmap_init(&freesp_blocks); |
1112 | |
1113 | /* |
1114 | * Iteratively reserve space for the new rmapbt and recompute the |
1115 | * number of blocks needed to store the previously observed rmapbt |
1116 | * records and the ones we'll create for the free space metadata. |
1117 | * Finish when we don't need more blocks. |
1118 | */ |
1119 | do { |
1120 | error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks, |
1121 | &blocks_reserved, &done); |
1122 | if (error) |
1123 | goto out_bitmap; |
1124 | } while (!done); |
1125 | |
1126 | /* Emit rmaps for everything in the free space bitmap. */ |
1127 | xrep_ag_btcur_init(rr->sc, &rr->sc->sa); |
1128 | error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG); |
1129 | xchk_ag_btcur_free(&rr->sc->sa); |
1130 | |
1131 | out_bitmap: |
1132 | xagb_bitmap_destroy(&freesp_blocks); |
1133 | return error; |
1134 | } |
1135 | |
1136 | /* Section (III): Building the new rmap btree. */ |
1137 | |
/* Update the AGF counters to reflect the new rmap btree. */
STATIC int
xrep_rmap_reset_counters(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	xfs_agblock_t		rmap_btblocks;

	/*
	 * The AGF header contains extra information related to the reverse
	 * mapping btree, so we must update those fields here.
	 *
	 * NOTE(review): the -1 presumably excludes the new rmapbt root block
	 * from the agf_btreeblks count, mirroring the -2 applied for the two
	 * free space btree roots in xrep_rmap_try_reserve -- confirm.
	 */
	rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
	agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the rmapbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_rmap_level = pag->pagf_rmap_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}
1172 | |
/*
 * Retrieve rmapbt data for bulk load.  Copies up to @nr_wanted records from
 * the in-memory shadow btree (rr->mcur) into the staged btree @block starting
 * at @idx.  Returns the number of records loaded, or a negative errno.
 */
STATIC int
xrep_rmap_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xrep_rmap	*rr = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		int		stat = 0;

		/*
		 * Advance to the next shadow record.  The caller positioned
		 * rr->mcur at the left edge of the tree, so incrementing
		 * before reading lands us on the first record.  Running out
		 * of records here means the record count was wrong.
		 */
		error = xfs_btree_increment(rr->mcur, 0, &stat);
		if (error)
			return error;
		if (!stat)
			return -EFSCORRUPTED;

		/* Pull the record into the staging cursor's scratch area. */
		error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
		if (error)
			return error;
		if (!stat)
			return -EFSCORRUPTED;

		/* Format the record into the new btree block. */
		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}
1208 | |
1209 | /* Feed one of the new btree blocks to the bulk loader. */ |
1210 | STATIC int |
1211 | xrep_rmap_claim_block( |
1212 | struct xfs_btree_cur *cur, |
1213 | union xfs_btree_ptr *ptr, |
1214 | void *priv) |
1215 | { |
1216 | struct xrep_rmap *rr = priv; |
1217 | |
1218 | return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); |
1219 | } |
1220 | |
/*
 * Custom allocation function for new rmap btrees.  Fixes the AGFL without
 * recording rmaps, then allocates near @alloc_hint.
 */
STATIC int
xrep_rmap_alloc_vextent(
	struct xfs_scrub	*sc,
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		alloc_hint)
{
	int			error;

	/*
	 * We don't want an rmap update on the allocation, since we iteratively
	 * compute the OWN_AG records /after/ allocating blocks for the records
	 * that we already know we need to store.  Therefore, fix the freelist
	 * with the NORMAP flag set so that we don't also try to create an rmap
	 * for new AGFL blocks.
	 */
	error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
	if (error)
		return error;

	/*
	 * If xrep_fix_freelist fixed the freelist by moving blocks from the
	 * free space btrees or by removing blocks from the AGFL and queueing
	 * an EFI to free the block, the transaction will be dirty.  This
	 * second case is of interest to us.
	 *
	 * Later on, we will need to compare gaps in the new recordset against
	 * the block usage of all OWN_AG owners in order to free the old
	 * btree's blocks, which means that we can't have EFIs for former AGFL
	 * blocks attached to the repair transaction when we commit the new
	 * btree.
	 *
	 * xrep_newbt_alloc_blocks guarantees this for us by calling
	 * xrep_defer_finish to commit anything that fix_freelist may have
	 * added to the transaction.
	 */
	return xfs_alloc_vextent_near_bno(args, alloc_hint);
}
1259 | |
1260 | |
1261 | /* Count the records in this btree. */ |
1262 | STATIC int |
1263 | xrep_rmap_count_records( |
1264 | struct xfs_btree_cur *cur, |
1265 | unsigned long long *nr) |
1266 | { |
1267 | int running = 1; |
1268 | int error; |
1269 | |
1270 | *nr = 0; |
1271 | |
1272 | error = xfs_btree_goto_left_edge(cur); |
1273 | if (error) |
1274 | return error; |
1275 | |
1276 | while (running && !(error = xfs_btree_increment(cur, 0, &running))) { |
1277 | if (running) |
1278 | (*nr)++; |
1279 | } |
1280 | |
1281 | return error; |
1282 | } |
1283 | /* |
1284 | * Use the collected rmap information to stage a new rmap btree. If this is |
1285 | * successful we'll return with the new btree root information logged to the |
1286 | * repair transaction but not yet committed. This implements section (III) |
1287 | * above. |
1288 | */ |
1289 | STATIC int |
1290 | xrep_rmap_build_new_tree( |
1291 | struct xrep_rmap *rr) |
1292 | { |
1293 | struct xfs_scrub *sc = rr->sc; |
1294 | struct xfs_perag *pag = sc->sa.pag; |
1295 | struct xfs_agf *agf = sc->sa.agf_bp->b_addr; |
1296 | struct xfs_btree_cur *rmap_cur; |
1297 | xfs_fsblock_t fsbno; |
1298 | int error; |
1299 | |
1300 | /* |
1301 | * Preserve the old rmapbt block count so that we can adjust the |
1302 | * per-AG rmapbt reservation after we commit the new btree root and |
1303 | * want to dispose of the old btree blocks. |
1304 | */ |
1305 | rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks); |
1306 | |
1307 | /* |
1308 | * Prepare to construct the new btree by reserving disk space for the |
1309 | * new btree and setting up all the accounting information we'll need |
1310 | * to root the new btree while it's under construction and before we |
1311 | * attach it to the AG header. The new blocks are accounted to the |
1312 | * rmapbt per-AG reservation, which we will adjust further after |
1313 | * committing the new btree. |
1314 | */ |
1315 | fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp)); |
1316 | xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE, |
1317 | fsbno, XFS_AG_RESV_RMAPBT); |
1318 | rr->new_btree.bload.get_records = xrep_rmap_get_records; |
1319 | rr->new_btree.bload.claim_block = xrep_rmap_claim_block; |
1320 | rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent; |
1321 | rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag); |
1322 | xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake); |
1323 | |
1324 | /* |
1325 | * Initialize @rr->new_btree, reserve space for the new rmapbt, |
1326 | * and compute OWN_AG rmaps. |
1327 | */ |
1328 | error = xrep_rmap_reserve_space(rr, rmap_cur); |
1329 | if (error) |
1330 | goto err_cur; |
1331 | |
1332 | /* |
1333 | * Count the rmapbt records again, because the space reservation |
1334 | * for the rmapbt itself probably added more records to the btree. |
1335 | */ |
1336 | rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, |
1337 | &rr->rmap_btree); |
1338 | |
1339 | error = xrep_rmap_count_records(rr->mcur, &rr->nr_records); |
1340 | if (error) |
1341 | goto err_mcur; |
1342 | |
1343 | /* |
1344 | * Due to btree slack factors, it's possible for a new btree to be one |
1345 | * level taller than the old btree. Update the incore btree height so |
1346 | * that we don't trip the verifiers when writing the new btree blocks |
1347 | * to disk. |
1348 | */ |
1349 | pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height; |
1350 | |
1351 | /* |
1352 | * Move the cursor to the left edge of the tree so that the first |
1353 | * increment in ->get_records positions us at the first record. |
1354 | */ |
1355 | error = xfs_btree_goto_left_edge(rr->mcur); |
1356 | if (error) |
1357 | goto err_level; |
1358 | |
1359 | /* Add all observed rmap records. */ |
1360 | error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); |
1361 | if (error) |
1362 | goto err_level; |
1363 | |
1364 | /* |
1365 | * Install the new btree in the AG header. After this point the old |
1366 | * btree is no longer accessible and the new tree is live. |
1367 | */ |
1368 | xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp); |
1369 | xfs_btree_del_cursor(rmap_cur, 0); |
1370 | xfs_btree_del_cursor(rr->mcur, 0); |
1371 | rr->mcur = NULL; |
1372 | |
1373 | /* |
1374 | * Now that we've written the new btree to disk, we don't need to keep |
1375 | * updating the in-memory btree. Abort the scan to stop live updates. |
1376 | */ |
1377 | xchk_iscan_abort(&rr->iscan); |
1378 | |
1379 | /* |
1380 | * The newly committed rmap recordset includes mappings for the blocks |
1381 | * that we reserved to build the new btree. If there is excess space |
1382 | * reservation to be freed, the corresponding rmap records must also be |
1383 | * removed. |
1384 | */ |
1385 | rr->new_btree.oinfo = XFS_RMAP_OINFO_AG; |
1386 | |
1387 | /* Reset the AGF counters now that we've changed the btree shape. */ |
1388 | error = xrep_rmap_reset_counters(rr); |
1389 | if (error) |
1390 | goto err_newbt; |
1391 | |
1392 | /* Dispose of any unused blocks and the accounting information. */ |
1393 | error = xrep_newbt_commit(&rr->new_btree); |
1394 | if (error) |
1395 | return error; |
1396 | |
1397 | return xrep_roll_ag_trans(sc); |
1398 | |
1399 | err_level: |
1400 | pag->pagf_repair_rmap_level = 0; |
1401 | err_mcur: |
1402 | xfs_btree_del_cursor(rr->mcur, error); |
1403 | err_cur: |
1404 | xfs_btree_del_cursor(rmap_cur, error); |
1405 | err_newbt: |
1406 | xrep_newbt_cancel(&rr->new_btree); |
1407 | return error; |
1408 | } |
1409 | |
1410 | /* Section (IV): Reaping the old btree. */ |
1411 | |
/* Context for locating blocks claimed by no rmap record (section IV). */
struct xrep_rmap_find_gaps {
	/* Bitmap of AG blocks not covered by any rmap record. */
	struct xagb_bitmap	rmap_gaps;
	/* First AG block past the highest rmap record seen so far. */
	xfs_agblock_t		next_agbno;
};
1416 | |
1417 | /* Subtract each free extent in the bnobt from the rmap gaps. */ |
1418 | STATIC int |
1419 | xrep_rmap_find_freesp( |
1420 | struct xfs_btree_cur *cur, |
1421 | const struct xfs_alloc_rec_incore *rec, |
1422 | void *priv) |
1423 | { |
1424 | struct xrep_rmap_find_gaps *rfg = priv; |
1425 | |
1426 | return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock, |
1427 | rec->ar_blockcount); |
1428 | } |
1429 | |
1430 | /* Record the free space we find, as part of cleaning out the btree. */ |
1431 | STATIC int |
1432 | xrep_rmap_find_gaps( |
1433 | struct xfs_btree_cur *cur, |
1434 | const struct xfs_rmap_irec *rec, |
1435 | void *priv) |
1436 | { |
1437 | struct xrep_rmap_find_gaps *rfg = priv; |
1438 | int error; |
1439 | |
1440 | if (rec->rm_startblock > rfg->next_agbno) { |
1441 | error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno, |
1442 | rec->rm_startblock - rfg->next_agbno); |
1443 | if (error) |
1444 | return error; |
1445 | } |
1446 | |
1447 | rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno, |
1448 | rec->rm_startblock + rec->rm_blockcount); |
1449 | return 0; |
1450 | } |
1451 | |
1452 | /* |
1453 | * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make |
1454 | * a list of gaps in the rmap records and a list of the extents mentioned in |
1455 | * the bnobt. Any block that's in the new rmapbt gap list but not mentioned |
1456 | * in the bnobt is a block from the old rmapbt and can be removed. |
1457 | */ |
1458 | STATIC int |
1459 | xrep_rmap_remove_old_tree( |
1460 | struct xrep_rmap *rr) |
1461 | { |
1462 | struct xrep_rmap_find_gaps rfg = { |
1463 | .next_agbno = 0, |
1464 | }; |
1465 | struct xfs_scrub *sc = rr->sc; |
1466 | struct xfs_agf *agf = sc->sa.agf_bp->b_addr; |
1467 | struct xfs_perag *pag = sc->sa.pag; |
1468 | struct xfs_btree_cur *mcur; |
1469 | xfs_agblock_t agend; |
1470 | int error; |
1471 | |
1472 | xagb_bitmap_init(&rfg.rmap_gaps); |
1473 | |
1474 | /* Compute free space from the new rmapbt. */ |
1475 | mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); |
1476 | |
1477 | error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg); |
1478 | xfs_btree_del_cursor(mcur, error); |
1479 | if (error) |
1480 | goto out_bitmap; |
1481 | |
1482 | /* Insert a record for space between the last rmap and EOAG. */ |
1483 | agend = be32_to_cpu(agf->agf_length); |
1484 | if (rfg.next_agbno < agend) { |
1485 | error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno, |
1486 | agend - rfg.next_agbno); |
1487 | if (error) |
1488 | goto out_bitmap; |
1489 | } |
1490 | |
1491 | /* Compute free space from the existing bnobt. */ |
1492 | sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, |
1493 | sc->sa.pag); |
1494 | error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp, |
1495 | &rfg); |
1496 | xfs_btree_del_cursor(sc->sa.bno_cur, error); |
1497 | sc->sa.bno_cur = NULL; |
1498 | if (error) |
1499 | goto out_bitmap; |
1500 | |
1501 | /* |
1502 | * Free the "free" blocks that the new rmapbt knows about but the bnobt |
1503 | * doesn't--these are the old rmapbt blocks. Credit the old rmapbt |
1504 | * block usage count back to the per-AG rmapbt reservation (and not |
1505 | * fdblocks, since the rmap btree lives in free space) to keep the |
1506 | * reservation and free space accounting correct. |
1507 | */ |
1508 | error = xrep_reap_agblocks(sc, &rfg.rmap_gaps, |
1509 | &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT); |
1510 | if (error) |
1511 | goto out_bitmap; |
1512 | |
1513 | /* |
1514 | * Now that we've zapped all the old rmapbt blocks we can turn off |
1515 | * the alternate height mechanism and reset the per-AG space |
1516 | * reservation. |
1517 | */ |
1518 | pag->pagf_repair_rmap_level = 0; |
1519 | sc->flags |= XREP_RESET_PERAG_RESV; |
1520 | out_bitmap: |
1521 | xagb_bitmap_destroy(&rfg.rmap_gaps); |
1522 | return error; |
1523 | } |
1524 | |
1525 | static inline bool |
1526 | xrep_rmapbt_want_live_update( |
1527 | struct xchk_iscan *iscan, |
1528 | const struct xfs_owner_info *oi) |
1529 | { |
1530 | if (xchk_iscan_aborted(iscan)) |
1531 | return false; |
1532 | |
1533 | /* |
1534 | * Before unlocking the AG header to perform the inode scan, we |
1535 | * recorded reverse mappings for all AG metadata except for the OWN_AG |
1536 | * metadata. IOWs, the in-memory btree knows about the AG headers, the |
1537 | * two inode btrees, the CoW staging extents, and the refcount btrees. |
1538 | * For these types of metadata, we need to record the live updates in |
1539 | * the in-memory rmap btree. |
1540 | * |
1541 | * However, we do not scan the free space btrees or the AGFL until we |
1542 | * have re-locked the AGF and are ready to reserve space for the new |
1543 | * rmap btree, so we do not want live updates for OWN_AG metadata. |
1544 | */ |
1545 | if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) |
1546 | return oi->oi_owner != XFS_RMAP_OWN_AG; |
1547 | |
1548 | /* Ignore updates to files that the scanner hasn't visited yet. */ |
1549 | return xchk_iscan_want_live_update(iscan, oi->oi_owner); |
1550 | } |
1551 | |
1552 | /* |
1553 | * Apply a rmapbt update from the regular filesystem into our shadow btree. |
1554 | * We're running from the thread that owns the AGF buffer and is generating |
1555 | * the update, so we must be careful about which parts of the struct xrep_rmap |
1556 | * that we change. |
1557 | */ |
1558 | static int |
1559 | xrep_rmapbt_live_update( |
1560 | struct notifier_block *nb, |
1561 | unsigned long action, |
1562 | void *data) |
1563 | { |
1564 | struct xfs_rmap_update_params *p = data; |
1565 | struct xrep_rmap *rr; |
1566 | struct xfs_mount *mp; |
1567 | struct xfs_btree_cur *mcur; |
1568 | struct xfs_trans *tp; |
1569 | void *txcookie; |
1570 | int error; |
1571 | |
1572 | rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb); |
1573 | mp = rr->sc->mp; |
1574 | |
1575 | if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) |
1576 | goto out_unlock; |
1577 | |
1578 | trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p); |
1579 | |
1580 | error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); |
1581 | if (error) |
1582 | goto out_abort; |
1583 | |
1584 | mutex_lock(&rr->lock); |
1585 | mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree); |
1586 | error = __xfs_rmap_finish_intent(mcur, action, p->startblock, |
1587 | p->blockcount, &p->oinfo, p->unwritten); |
1588 | xfs_btree_del_cursor(mcur, error); |
1589 | if (error) |
1590 | goto out_cancel; |
1591 | |
1592 | error = xfbtree_trans_commit(&rr->rmap_btree, tp); |
1593 | if (error) |
1594 | goto out_cancel; |
1595 | |
1596 | xrep_trans_cancel_hook_dummy(&txcookie, tp); |
1597 | mutex_unlock(&rr->lock); |
1598 | return NOTIFY_DONE; |
1599 | |
1600 | out_cancel: |
1601 | xfbtree_trans_cancel(&rr->rmap_btree, tp); |
1602 | xrep_trans_cancel_hook_dummy(&txcookie, tp); |
1603 | out_abort: |
1604 | mutex_unlock(&rr->lock); |
1605 | xchk_iscan_abort(&rr->iscan); |
1606 | out_unlock: |
1607 | return NOTIFY_DONE; |
1608 | } |
1609 | |
/*
 * Set up the filesystem scan components: the shadow in-memory rmap btree,
 * the inode scan cursor, and the live rmap update hook.  On failure,
 * everything set up so far is torn down in reverse order.
 */
STATIC int
xrep_rmap_setup_scan(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	mutex_init(&rr->lock);

	/* Set up in-memory rmap btree */
	error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
			sc->sa.pag->pag_agno);
	if (error)
		goto out_mutex;

	/* Retry iget every tenth of a second for up to 30 seconds. */
	xchk_iscan_start(sc, 30000, 100, &rr->iscan);

	/*
	 * Hook into live rmap operations so that we can update our in-memory
	 * btree to reflect live changes on the filesystem.  Since we drop the
	 * AGF buffer to scan all the inodes, we need this piece to avoid
	 * installing a stale btree.
	 */
	ASSERT(sc->flags & XCHK_FSGATES_RMAP);
	xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
	error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
	if (error)
		goto out_iscan;
	return 0;

out_iscan:
	xchk_iscan_teardown(&rr->iscan);
	xfbtree_destroy(&rr->rmap_btree);
out_mutex:
	mutex_destroy(&rr->lock);
	return error;
}
1649 | |
/* Tear down scan components. */
STATIC void
xrep_rmap_teardown(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;

	/*
	 * Abort the scan first so that the live update hook (which checks
	 * xchk_iscan_aborted) stops touching the shadow btree before we
	 * remove the hook and destroy the btree.
	 */
	xchk_iscan_abort(&rr->iscan);
	xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
	xchk_iscan_teardown(&rr->iscan);
	xfbtree_destroy(&rr->rmap_btree);
	mutex_destroy(&rr->lock);
}
1663 | |
1664 | /* Repair the rmap btree for some AG. */ |
1665 | int |
1666 | xrep_rmapbt( |
1667 | struct xfs_scrub *sc) |
1668 | { |
1669 | struct xrep_rmap *rr = sc->buf; |
1670 | int error; |
1671 | |
1672 | error = xrep_rmap_setup_scan(rr); |
1673 | if (error) |
1674 | return error; |
1675 | |
1676 | /* |
1677 | * Collect rmaps for everything in this AG that isn't space metadata. |
1678 | * These rmaps won't change even as we try to allocate blocks. |
1679 | */ |
1680 | error = xrep_rmap_find_rmaps(rr); |
1681 | if (error) |
1682 | goto out_records; |
1683 | |
1684 | /* Rebuild the rmap information. */ |
1685 | error = xrep_rmap_build_new_tree(rr); |
1686 | if (error) |
1687 | goto out_records; |
1688 | |
1689 | /* Kill the old tree. */ |
1690 | error = xrep_rmap_remove_old_tree(rr); |
1691 | if (error) |
1692 | goto out_records; |
1693 | |
1694 | out_records: |
1695 | xrep_rmap_teardown(rr); |
1696 | return error; |
1697 | } |
1698 | |