// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
#include "xfs_health.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Inode Btree Repair
 * ==================
 *
 * A quick refresher of inode btrees on a v5 filesystem:
 *
 * - Inode records are read into memory in units of 'inode clusters'. However
 *   many inodes fit in a cluster buffer is the smallest number of inodes that
 *   can be allocated or freed. Clusters are never smaller than one fs block
 *   though they can span multiple blocks. The size (in fs blocks) is
 *   computed with xfs_icluster_size_fsb(). The fs block alignment of a
 *   cluster is computed with xfs_ialloc_cluster_alignment().
 *
 * - Each inode btree record can describe a single 'inode chunk'. The chunk
 *   size is defined to be 64 inodes. If sparse inodes are enabled, every
 *   inobt record must be aligned to the chunk size; if not, every record must
 *   be aligned to the start of a cluster. It is possible to construct an XFS
 *   geometry where one inobt record maps to multiple inode clusters; it is
 *   also possible to construct a geometry where multiple inobt records map to
 *   different parts of one inode cluster.
 *
 * - If sparse inodes are not enabled, the smallest unit of allocation for
 *   inode records is enough to contain one inode chunk's worth of inodes.
 *
 * - If sparse inodes are enabled, the holemask field will be active. Each
 *   bit of the holemask represents 4 potential inodes; if set, the
 *   corresponding space does *not* contain inodes and must be left alone.
 *   Clusters cannot be smaller than 4 inodes. The smallest unit of allocation
 *   of inode records is one inode cluster.
 *
 * So what's the rebuild algorithm?
 *
 * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
 * records. The OWN_INOBT records are the old inode btree blocks and will be
 * cleared out after we've rebuilt the tree. Each possible inode cluster
 * within an OWN_INODES record will be read in; for each possible inobt record
 * associated with that cluster, compute the freemask calculated from the
 * i_mode data in the inode chunk. For sparse inodes the holemask will be
 * calculated by creating the properly aligned inobt record and punching out
 * any chunk that's missing. Inode allocations and frees grab the AGI first,
 * so repair protects itself from concurrent access by locking the AGI.
 *
 * Once we've reconstructed all the inode records, we can create new inode
 * btree roots and reload the btrees. We rebuild both inode trees at the same
 * time because they have the same rmap owner and it would be more complex to
 * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
 * blocks it owns. We have all the data we need to build both, so dump
 * everything and start over.
 *
 * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
 */
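
/*
 * One possible geometry, for illustration only: with 512-byte inodes and
 * 4096-byte fs blocks, a 16384-byte inode cluster buffer holds 32 inodes in
 * 4 fs blocks, so each 64-inode chunk spans two clusters, and each holemask
 * bit (4 inodes) covers half an fs block. Other combinations of inode size,
 * block size, and cluster buffer size produce different layouts.
 */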

struct xrep_ibt {
	/* Record under construction. */
	struct xfs_inobt_rec_incore	rie;

	/* new inobt information */
	struct xrep_newbt	new_inobt;

	/* new finobt information */
	struct xrep_newbt	new_finobt;

	/* Old inode btree blocks we found in the rmap. */
	struct xagb_bitmap	old_iallocbt_blocks;

	/* Reconstructed inode records. */
	struct xfarray		*inode_records;

	struct xfs_scrub	*sc;

	/* Number of inodes assigned disk space. */
	unsigned int		icount;

	/* Number of inodes in use. */
	unsigned int		iused;

	/* Number of finobt records needed. */
	unsigned int		finobt_recs;

	/* get_records()'s position in the inode record array. */
	xfarray_idx_t		array_cur;
};

/*
 * Is this inode in use? If the inode is in memory we can tell from i_mode,
 * otherwise we have to check di_mode in the on-disk buffer. We only care
 * that the high (i.e. non-permission) bits of _mode are zero. This should be
 * safe because repair keeps all AG headers locked until the end, and a
 * process trying to perform an inode allocation/free must lock the AGI.
 *
 * @cluster_ag_base is the inode offset of the cluster within the AG.
 * @cluster_bp is the cluster buffer.
 * @cluster_index is the inode offset within the inode cluster.
 */
STATIC int
xrep_ibt_check_ifree(
	struct xrep_ibt		*ri,
	xfs_agino_t		cluster_ag_base,
	struct xfs_buf		*cluster_bp,
	unsigned int		cluster_index,
	bool			*inuse)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_dinode	*dip;
	xfs_ino_t		fsino;
	xfs_agino_t		agino;
	xfs_agnumber_t		agno = ri->sc->sa.pag->pag_agno;
	unsigned int		cluster_buf_base;
	unsigned int		offset;
	int			error;

	agino = cluster_ag_base + cluster_index;
	fsino = XFS_AGINO_TO_INO(mp, agno, agino);

	/* Inode uncached or half assembled, read disk buffer */
	cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
	offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
	if (offset >= BBTOB(cluster_bp->b_length))
		return -EFSCORRUPTED;
	dip = xfs_buf_offset(cluster_bp, offset);
	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
		return -EFSCORRUPTED;

	if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
		return -EFSCORRUPTED;

	/* Will the in-core inode tell us if it's in use? */
	error = xchk_inode_is_allocated(sc, agino, inuse);
	if (!error)
		return 0;

	*inuse = dip->di_mode != 0;
	return 0;
}

/* Stash the accumulated inobt record for rebuilding. */
STATIC int
xrep_ibt_stash(
	struct xrep_ibt		*ri)
{
	int			error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

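	/* Derive the freecount from the accumulated free and hole masks. */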
	ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
	if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
		return -EFSCORRUPTED;

	if (ri->rie.ir_freecount > 0)
		ri->finobt_recs++;

	trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);

	error = xfarray_append(ri->inode_records, &ri->rie);
	if (error)
		return error;

	ri->rie.ir_startino = NULLAGINO;
	return 0;
}

/*
 * Given an extent of inodes and an inode cluster buffer, calculate the
 * location of the corresponding inobt record (creating it if necessary),
 * then update the parts of the holemask and freemask of that record that
 * correspond to the inode extent we were given.
 *
 * @cluster_ir_startino is the AG inode number of an inobt record that we're
 * proposing to create for this inode cluster. If sparse inodes are enabled,
 * we must round down to a chunk boundary to find the actual sparse record.
 * @cluster_bp is the buffer of the inode cluster.
 * @nr_inodes is the number of inodes to check from the cluster.
 */
STATIC int
xrep_ibt_cluster_record(
	struct xrep_ibt		*ri,
	xfs_agino_t		cluster_ir_startino,
	struct xfs_buf		*cluster_bp,
	unsigned int		nr_inodes)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	xfs_agino_t		ir_startino;
	unsigned int		cluster_base;
	unsigned int		cluster_index;
	int			error = 0;

	ir_startino = cluster_ir_startino;
	if (xfs_has_sparseinodes(mp))
		ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
	cluster_base = cluster_ir_startino - ir_startino;

	/*
	 * If the accumulated inobt record doesn't map this cluster, add it to
	 * the list and reset it.
	 */
	if (ri->rie.ir_startino != NULLAGINO &&
	    ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
		error = xrep_ibt_stash(ri);
		if (error)
			return error;
	}

	if (ri->rie.ir_startino == NULLAGINO) {
		ri->rie.ir_startino = ir_startino;
		ri->rie.ir_free = XFS_INOBT_ALL_FREE;
		ri->rie.ir_holemask = 0xFFFF;
		ri->rie.ir_count = 0;
	}

	/* Record the whole cluster. */
	ri->icount += nr_inodes;
	ri->rie.ir_count += nr_inodes;
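	/*
	 * Clear the holemask bits covering this cluster: each holemask bit
	 * covers XFS_INODES_PER_HOLEMASK_BIT (4) inodes, so the cluster spans
	 * nr_inodes / 4 bits starting at bit cluster_base / 4.
	 */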
	ri->rie.ir_holemask &= ~xfs_inobt_maskn(
				cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
				nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);

	/* Which inodes within this cluster are free? */
	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
		bool	inuse = false;

		error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
				cluster_bp, cluster_index, &inuse);
		if (error)
			return error;
		if (!inuse)
			continue;
		ri->iused++;
		ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
						   cluster_index);
	}
	return 0;
}

/*
 * For each inode cluster covering the physical extent recorded by the rmapbt,
 * we must calculate the properly aligned startino of that cluster, then
 * iterate each cluster to fill in used and filled masks appropriately. We
 * then use the (startino, used, filled) information to construct the
 * appropriate inode records.
 */
STATIC int
xrep_ibt_process_cluster(
	struct xrep_ibt		*ri,
	xfs_agblock_t		cluster_bno)
{
	struct xfs_imap		imap;
	struct xfs_buf		*cluster_bp;
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agino_t		cluster_ag_base;
	xfs_agino_t		irec_index;
	unsigned int		nr_inodes;
	int			error;

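	/*
	 * An inobt record covers at most one chunk (XFS_INODES_PER_CHUNK, or
	 * 64 inodes), so if a cluster holds more than one chunk's worth of
	 * inodes, walk the cluster one chunk-sized piece at a time.
	 */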
	nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
			XFS_INODES_PER_CHUNK);

	/*
	 * Grab the inode cluster buffer. This is safe to do with a broken
	 * inobt because imap_to_bp directly maps the buffer without touching
	 * either inode btree.
	 */
	imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
	imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
	imap.im_boffset = 0;
	error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
	if (error)
		return error;

	/*
	 * Record the contents of each possible inobt record mapping this
	 * cluster.
	 */
	cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
	for (irec_index = 0;
	     irec_index < igeo->inodes_per_cluster;
	     irec_index += XFS_INODES_PER_CHUNK) {
		error = xrep_ibt_cluster_record(ri,
				cluster_ag_base + irec_index, cluster_bp,
				nr_inodes);
		if (error)
			break;
	}

	xfs_trans_brelse(sc->tp, cluster_bp);
	return error;
}
/* Check for any obvious conflicts in the inode chunk extent. */
STATIC int
xrep_ibt_check_inode_ext(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agino_t		agino;
	enum xbtree_recpacking	outcome;
	int			error;

	/* Inode records must be within the AG. */
	if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
		return -EFSCORRUPTED;

	/* The entire record must align to the inode cluster size. */
	if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
	    !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
		return -EFSCORRUPTED;

	/*
	 * The entire record must also adhere to the inode cluster alignment
	 * size if sparse inodes are not enabled.
	 */
	if (!xfs_has_sparseinodes(mp) &&
	    (!IS_ALIGNED(agbno, igeo->cluster_align) ||
	     !IS_ALIGNED(agbno + len, igeo->cluster_align)))
		return -EFSCORRUPTED;

	/*
	 * On a sparse inode fs, this cluster could be part of a sparse chunk.
	 * Sparse clusters must be aligned to sparse chunk alignment.
	 */
	if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
	    (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
	     !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
		return -EFSCORRUPTED;

	/* Make sure the entire range of blocks maps to valid AG inodes. */
	agino = XFS_AGB_TO_AGINO(mp, agbno);
	if (!xfs_verify_agino(sc->sa.pag, agino))
		return -EFSCORRUPTED;

	agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
	if (!xfs_verify_agino(sc->sa.pag, agino))
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/* Found a fragment of the old inode btrees; dispose of them later. */
STATIC int
xrep_ibt_record_old_btree_blocks(
	struct xrep_ibt		*ri,
	const struct xfs_rmap_irec	*rec)
{
	if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
				rec->rm_blockcount))
		return -EFSCORRUPTED;

	return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
			rec->rm_blockcount);
}

/* Record extents that belong to inode cluster blocks. */
STATIC int
xrep_ibt_record_inode_blocks(
	struct xrep_ibt		*ri,
	const struct xfs_rmap_irec	*rec)
{
	struct xfs_mount	*mp = ri->sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agblock_t		cluster_base;
	int			error;

	error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
			rec->rm_blockcount);
	if (error)
		return error;

	trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
			rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
			rec->rm_offset, rec->rm_flags);

	/*
	 * Record the free/hole masks for each inode cluster that could be
	 * mapped by this rmap record.
	 */
	for (cluster_base = 0;
	     cluster_base < rec->rm_blockcount;
	     cluster_base += igeo->blocks_per_cluster) {
		error = xrep_ibt_process_cluster(ri,
				rec->rm_startblock + cluster_base);
		if (error)
			return error;
	}

	return 0;
}

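/*
 * Capture information from a single rmap record: stash old inode btree
 * blocks for later reaping, and turn inode cluster blocks into new inobt
 * records.
 */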
STATIC int
xrep_ibt_walk_rmap(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec	*rec,
	void			*priv)
{
	struct xrep_ibt		*ri = priv;
	int			error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

	switch (rec->rm_owner) {
	case XFS_RMAP_OWN_INOBT:
		return xrep_ibt_record_old_btree_blocks(ri, rec);
	case XFS_RMAP_OWN_INODES:
		return xrep_ibt_record_inode_blocks(ri, rec);
	}
	return 0;
}

/*
 * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
 * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct
 * the inode btrees. The caller must clean up the lists if anything goes
 * wrong.
 */
STATIC int
xrep_ibt_find_inodes(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	int			error;

	ri->rie.ir_startino = NULLAGINO;

	/* Collect all reverse mappings for inode blocks. */
	xrep_ag_btcur_init(sc, &sc->sa);
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/* If we have a record ready to go, add it to the array. */
	if (ri->rie.ir_startino != NULLAGINO)
		return xrep_ibt_stash(ri);

	return 0;
}

/* Update the AGI counters. */
STATIC int
xrep_ibt_reset_counters(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
	unsigned int		freecount = ri->icount - ri->iused;

	/* Trigger inode count recalculation */
	xfs_force_summary_recalc(sc->mp);

	/*
	 * The AGI header contains extra information related to the inode
	 * btrees, so we must update those fields here.
	 */
	agi->agi_count = cpu_to_be32(ri->icount);
	agi->agi_freecount = cpu_to_be32(freecount);
	xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
			   XFS_AGI_COUNT | XFS_AGI_FREECOUNT);

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagi(sc);
}

/* Retrieve finobt data for bulk load. */
STATIC int
xrep_fibt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
	struct xrep_ibt		*ri = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
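		/*
		 * The finobt only indexes records with at least one free
		 * inode, so skip records whose freecount is zero.
		 */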
		do {
			error = xfarray_load(ri->inode_records,
					ri->array_cur++, irec);
		} while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Retrieve inobt data for bulk load. */
STATIC int
xrep_ibt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
	struct xrep_ibt		*ri = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new inobt blocks to the bulk loader. */
STATIC int
xrep_ibt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_ibt		*ri = priv;

	return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
}

/* Feed one of the new finobt blocks to the bulk loader. */
STATIC int
xrep_fibt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_ibt		*ri = priv;

	return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
}

/* Make sure the records do not overlap in inumber address space. */
STATIC int
xrep_ibt_check_overlap(
	struct xrep_ibt		*ri)
{
	struct xfs_inobt_rec_incore	irec;
	xfarray_idx_t		cur;
	xfs_agino_t		next_agino = 0;
	int			error = 0;

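	/*
	 * Records were collected in increasing agino order, so a single
	 * linear scan comparing each record's startino against the end of
	 * the previous record is enough to detect overlaps.
	 */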
	foreach_xfarray_idx(ri->inode_records, cur) {
		if (xchk_should_terminate(ri->sc, &error))
			return error;

		error = xfarray_load(ri->inode_records, cur, &irec);
		if (error)
			return error;

		if (irec.ir_startino < next_agino)
			return -EFSCORRUPTED;

		next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
	}

	return error;
}

/* Build new inode btrees and dispose of the old one. */
STATIC int
xrep_ibt_build_new_trees(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_btree_cur	*ino_cur;
	struct xfs_btree_cur	*fino_cur = NULL;
	xfs_fsblock_t		fsbno;
	bool			need_finobt;
	int			error;

	need_finobt = xfs_has_finobt(sc->mp);

	/*
	 * Create new btrees for staging all the inobt records we collected
	 * earlier. The records were collected in order of increasing agino,
	 * so we do not have to sort them. Ensure there are no overlapping
	 * records.
	 */
	error = xrep_ibt_check_overlap(ri);
	if (error)
		return error;

	/*
	 * The new inode btrees will not be rooted in the AGI until we've
	 * successfully rebuilt the tree.
	 *
	 * Start by setting up the inobt staging cursor.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
			XFS_IBT_BLOCK(sc->mp));
	xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
			XFS_AG_RESV_NONE);
	ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
	ri->new_inobt.bload.get_records = xrep_ibt_get_records;

	ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
	xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
	error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
			xfarray_length(ri->inode_records));
	if (error)
		goto err_inocur;

	/* Set up finobt staging cursor. */
	if (need_finobt) {
		enum xfs_ag_resv_type	resv = XFS_AG_RESV_METADATA;

		if (sc->mp->m_finobt_nores)
			resv = XFS_AG_RESV_NONE;

		fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
				XFS_FIBT_BLOCK(sc->mp));
		xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
				fsbno, resv);
		ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
		ri->new_finobt.bload.get_records = xrep_fibt_get_records;

		fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
		xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
		error = xfs_btree_bload_compute_geometry(fino_cur,
				&ri->new_finobt.bload, ri->finobt_recs);
		if (error)
			goto err_finocur;
	}

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_finocur;

	/* Reserve all the space we need to build the new btrees. */
	error = xrep_newbt_alloc_blocks(&ri->new_inobt,
			ri->new_inobt.bload.nr_blocks);
	if (error)
		goto err_finocur;

	if (need_finobt) {
		error = xrep_newbt_alloc_blocks(&ri->new_finobt,
				ri->new_finobt.bload.nr_blocks);
		if (error)
			goto err_finocur;
	}

	/* Add all inobt records. */
	ri->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
	if (error)
		goto err_finocur;

	/* Add all finobt records. */
	if (need_finobt) {
		ri->array_cur = XFARRAY_CURSOR_INIT;
		error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
		if (error)
			goto err_finocur;
	}

	/*
	 * Install the new btrees in the AG header. After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
	xfs_btree_del_cursor(ino_cur, 0);

	if (fino_cur) {
		xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
		xfs_btree_del_cursor(fino_cur, 0);
	}

	/* Reset the AGI counters now that we've changed the inode roots. */
	error = xrep_ibt_reset_counters(ri);
	if (error)
		goto err_finobt;

	/* Free unused blocks and bitmap. */
	if (need_finobt) {
		error = xrep_newbt_commit(&ri->new_finobt);
		if (error)
			goto err_inobt;
	}
	error = xrep_newbt_commit(&ri->new_inobt);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

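/*
 * Error ladder: tear down whichever staging cursors are still open, then
 * cancel the blocks that were reserved for the new btrees.
 */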
err_finocur:
	if (need_finobt)
		xfs_btree_del_cursor(fino_cur, error);
err_inocur:
	xfs_btree_del_cursor(ino_cur, error);
err_finobt:
	if (need_finobt)
		xrep_newbt_cancel(&ri->new_finobt);
err_inobt:
	xrep_newbt_cancel(&ri->new_inobt);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_ibt_remove_old_trees(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	int			error;

	/*
	 * Free the old inode btree blocks if they're not in use. It's ok to
	 * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
	 * reservation because we reset the reservation before releasing the
	 * AGI and AGF header buffer locks.
	 */
	error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
			&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
	if (error)
		return error;

	/*
	 * If the finobt is enabled and has a per-AG reservation, make sure we
	 * reinitialize the per-AG reservations.
	 */
	if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
		sc->flags |= XREP_RESET_PERAG_RESV;

	return 0;
}

/* Repair both inode btrees. */
int
xrep_iallocbt(
	struct xfs_scrub	*sc)
{
	struct xrep_ibt		*ri;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	xfs_agino_t		first_agino, last_agino;
	int			error = 0;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
	if (!ri)
		return -ENOMEM;
	ri->sc = sc;

	/* We rebuild both inode btrees. */
	sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;

	/* Set up enough storage to handle an AG with nothing but inodes. */
	xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
	last_agino /= XFS_INODES_PER_CHUNK;
	descr = xchk_xfile_ag_descr(sc, "inode index records");
	error = xfarray_create(descr, last_agino,
			sizeof(struct xfs_inobt_rec_incore),
			&ri->inode_records);
	kfree(descr);
	if (error)
		goto out_ri;

	/* Collect the inode data and find the old btree blocks. */
	xagb_bitmap_init(&ri->old_iallocbt_blocks);
	error = xrep_ibt_find_inodes(ri);
	if (error)
		goto out_bitmap;

	/* Rebuild the inode indexes. */
	error = xrep_ibt_build_new_trees(ri);
	if (error)
		goto out_bitmap;

	/* Kill the old tree. */
	error = xrep_ibt_remove_old_trees(ri);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
	xfarray_destroy(ri->inode_records);
out_ri:
	kfree(ri);
	return error;
}

/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_iallocbt(
	struct xfs_scrub	*sc)
{
	__u32			old_type = sc->sm->sm_type;
	int			error;

	/*
	 * We must update sm_type temporarily so that the tree-to-tree cross
	 * reference checks will work in the correct direction, and also so
	 * that tracing will report correctly if there are more errors.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
	error = xchk_iallocbt(sc);
	if (error)
		goto out;

	if (xfs_has_finobt(sc->mp)) {
		sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
		error = xchk_iallocbt(sc);
	}

out:
	sc->sm->sm_type = old_type;
	return error;
}