1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (c) 2021-2024 Oracle. All Rights Reserved. |
4 | * Author: Darrick J. Wong <djwong@kernel.org> |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_trans_resv.h" |
11 | #include "xfs_mount.h" |
12 | #include "xfs_log_format.h" |
13 | #include "xfs_trans.h" |
14 | #include "xfs_inode.h" |
15 | #include "xfs_icache.h" |
16 | #include "xfs_iwalk.h" |
17 | #include "xfs_ialloc.h" |
18 | #include "xfs_dir2.h" |
19 | #include "xfs_dir2_priv.h" |
20 | #include "xfs_ag.h" |
21 | #include "scrub/scrub.h" |
22 | #include "scrub/common.h" |
23 | #include "scrub/repair.h" |
24 | #include "scrub/xfile.h" |
25 | #include "scrub/xfarray.h" |
26 | #include "scrub/iscan.h" |
27 | #include "scrub/nlinks.h" |
28 | #include "scrub/trace.h" |
29 | #include "scrub/readdir.h" |
30 | |
31 | /* |
32 | * Live Inode Link Count Checking |
33 | * ============================== |
34 | * |
35 | * Inode link counts are "summary" metadata, in the sense that they are |
36 | * computed as the number of directory entries referencing each file on the |
37 | * filesystem. Therefore, we compute the correct link counts by creating a |
38 | * shadow link count structure and walking every inode. |
39 | */ |
40 | |
41 | /* Set us up to scrub inode link counts. */ |
42 | int |
43 | xchk_setup_nlinks( |
44 | struct xfs_scrub *sc) |
45 | { |
46 | xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); |
47 | |
48 | sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); |
49 | if (!sc->buf) |
50 | return -ENOMEM; |
51 | |
52 | return xchk_setup_fs(sc); |
53 | } |
54 | |
55 | /* |
56 | * Part 1: Collecting file link counts. For each file, we create a shadow link |
57 | * counting structure, then walk the entire directory tree, incrementing parent |
58 | * and child link counts for each directory entry seen. |
59 | * |
60 | * To avoid false corruption reports in part 2, any failure in this part must |
61 | * set the INCOMPLETE flag even when a negative errno is returned. This care |
62 | * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, |
63 | * ECANCELED) that are absorbed into a scrub state flag update by |
64 | * xchk_*_process_error. Scrub and repair share the same incore data |
65 | * structures, so the INCOMPLETE flag is critical to prevent a repair based on |
66 | * insufficient information. |
67 | * |
68 | * Because we are scanning a live filesystem, it's possible that another thread |
69 | * will try to update the link counts for an inode that we've already scanned. |
70 | * This will cause our counts to be incorrect. Therefore, we hook all |
71 | * directory entry updates because that is when link count updates occur. By |
72 | * shadowing transaction updates in this manner, live nlink check can ensure by |
73 | * locking the inode and the shadow structure that its own copies are not out |
74 | * of date. Because the hook code runs in a different process context from the |
75 | * scrub code and the scrub state flags are not accessed atomically, failures |
76 | * in the hook code must abort the iscan and the scrubber must notice the |
77 | * aborted scan and set the incomplete flag. |
78 | * |
79 | * Note that we use jump labels and srcu notifier hooks to minimize the |
80 | * overhead when live nlinks is /not/ running. Locking order for nlink |
81 | * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock. |
82 | */ |
83 | |
84 | /* |
85 | * Add a delta to an nlink counter, clamping the value to U32_MAX. Because |
86 | * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results |
87 | * even if we lose some precision. |
88 | */ |
89 | static inline void |
90 | careful_add( |
91 | xfs_nlink_t *nlinkp, |
92 | int delta) |
93 | { |
94 | uint64_t new_value = (uint64_t)(*nlinkp) + delta; |
95 | |
96 | BUILD_BUG_ON(XFS_MAXLINK > U32_MAX); |
97 | *nlinkp = min_t(uint64_t, new_value, U32_MAX); |
98 | } |
99 | |
100 | /* Update incore link count information. Caller must hold the nlinks lock. */ |
101 | STATIC int |
102 | xchk_nlinks_update_incore( |
103 | struct xchk_nlink_ctrs *xnc, |
104 | xfs_ino_t ino, |
105 | int parents_delta, |
106 | int backrefs_delta, |
107 | int children_delta) |
108 | { |
109 | struct xchk_nlink nl; |
110 | int error; |
111 | |
112 | if (!xnc->nlinks) |
113 | return 0; |
114 | |
115 | error = xfarray_load_sparse(xnc->nlinks, ino, &nl); |
116 | if (error) |
117 | return error; |
118 | |
119 | trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta, |
120 | backrefs_delta, children_delta); |
121 | |
122 | careful_add(&nl.parents, parents_delta); |
123 | careful_add(&nl.backrefs, backrefs_delta); |
124 | careful_add(&nl.children, children_delta); |
125 | |
126 | nl.flags |= XCHK_NLINK_WRITTEN; |
127 | error = xfarray_store(xnc->nlinks, ino, &nl); |
128 | if (error == -EFBIG) { |
129 | /* |
130 | * EFBIG means we tried to store data at too high a byte offset |
131 | * in the sparse array. IOWs, we cannot complete the check and |
132 | * must notify userspace that the check was incomplete. |
133 | */ |
134 | error = -ECANCELED; |
135 | } |
136 | return error; |
137 | } |
138 | |
139 | /* |
140 | * Apply a link count change from the regular filesystem into our shadow link |
141 | * count structure based on a directory update in progress. |
142 | */ |
143 | STATIC int |
144 | xchk_nlinks_live_update( |
145 | struct notifier_block *nb, |
146 | unsigned long action, |
147 | void *data) |
148 | { |
149 | struct xfs_dir_update_params *p = data; |
150 | struct xchk_nlink_ctrs *xnc; |
151 | int error; |
152 | |
153 | xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); |
154 | |
155 | trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, |
156 | p->delta, p->name->name, p->name->len); |
157 | |
158 | /* |
159 | * If we've already scanned @dp, update the number of parents that link |
160 | * to @ip. If @ip is a subdirectory, update the number of child links |
161 | * going out of @dp. |
162 | */ |
163 | if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) { |
164 | mutex_lock(&xnc->lock); |
165 | error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta, |
166 | 0, 0); |
167 | if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode)) |
168 | error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, |
169 | 0, p->delta); |
170 | mutex_unlock(&xnc->lock); |
171 | if (error) |
172 | goto out_abort; |
173 | } |
174 | |
175 | /* |
176 | * If @ip is a subdirectory and we've already scanned it, update the |
177 | * number of backrefs pointing to @dp. |
178 | */ |
179 | if (S_ISDIR(VFS_IC(p->ip)->i_mode) && |
180 | xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) { |
181 | mutex_lock(&xnc->lock); |
182 | error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, |
183 | p->delta, 0); |
184 | mutex_unlock(&xnc->lock); |
185 | if (error) |
186 | goto out_abort; |
187 | } |
188 | |
189 | return NOTIFY_DONE; |
190 | |
191 | out_abort: |
192 | xchk_iscan_abort(&xnc->collect_iscan); |
193 | return NOTIFY_DONE; |
194 | } |
195 | |
196 | /* Bump the observed link count for the inode referenced by this entry. */ |
197 | STATIC int |
198 | xchk_nlinks_collect_dirent( |
199 | struct xfs_scrub *sc, |
200 | struct xfs_inode *dp, |
201 | xfs_dir2_dataptr_t dapos, |
202 | const struct xfs_name *name, |
203 | xfs_ino_t ino, |
204 | void *priv) |
205 | { |
206 | struct xchk_nlink_ctrs *xnc = priv; |
207 | bool dot = false, dotdot = false; |
208 | int error; |
209 | |
210 | /* Does this name make sense? */ |
211 | if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) { |
212 | error = -ECANCELED; |
213 | goto out_abort; |
214 | } |
215 | |
216 | if (name->len == 1 && name->name[0] == '.') |
217 | dot = true; |
218 | else if (name->len == 2 && name->name[0] == '.' && |
219 | name->name[1] == '.') |
220 | dotdot = true; |
221 | |
222 | /* Don't accept a '.' entry that points somewhere else. */ |
223 | if (dot && ino != dp->i_ino) { |
224 | error = -ECANCELED; |
225 | goto out_abort; |
226 | } |
227 | |
228 | /* Don't accept an invalid inode number. */ |
229 | if (!xfs_verify_dir_ino(sc->mp, ino)) { |
230 | error = -ECANCELED; |
231 | goto out_abort; |
232 | } |
233 | |
234 | /* Update the shadow link counts if we haven't already failed. */ |
235 | |
236 | if (xchk_iscan_aborted(&xnc->collect_iscan)) { |
237 | error = -ECANCELED; |
238 | goto out_incomplete; |
239 | } |
240 | |
241 | trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name); |
242 | |
243 | mutex_lock(&xnc->lock); |
244 | |
245 | /* |
246 | * If this is a dotdot entry, it is a back link from dp to ino. How |
247 | * we handle this depends on whether or not dp is the root directory. |
248 | * |
249 | * The root directory is its own parent, so we pretend the dotdot entry |
250 | * establishes the "parent" of the root directory. Increment the |
251 | * number of parents of the root directory. |
252 | * |
253 | * Otherwise, increment the number of backrefs pointing back to ino. |
254 | */ |
255 | if (dotdot) { |
256 | if (dp == sc->mp->m_rootip) |
257 | error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); |
258 | else |
259 | error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); |
260 | if (error) |
261 | goto out_unlock; |
262 | } |
263 | |
264 | /* |
265 | * If this dirent is a forward link from dp to ino, increment the |
266 | * number of parents linking into ino. |
267 | */ |
268 | if (!dot && !dotdot) { |
269 | error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); |
270 | if (error) |
271 | goto out_unlock; |
272 | } |
273 | |
274 | /* |
275 | * If this dirent is a forward link to a subdirectory, increment the |
276 | * number of child links of dp. |
277 | */ |
278 | if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) { |
279 | error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1); |
280 | if (error) |
281 | goto out_unlock; |
282 | } |
283 | |
284 | mutex_unlock(&xnc->lock); |
285 | return 0; |
286 | |
287 | out_unlock: |
288 | mutex_unlock(&xnc->lock); |
289 | out_abort: |
290 | xchk_iscan_abort(&xnc->collect_iscan); |
291 | out_incomplete: |
292 | xchk_set_incomplete(sc); |
293 | return error; |
294 | } |
295 | |
296 | /* Walk a directory to bump the observed link counts of the children. */ |
297 | STATIC int |
298 | xchk_nlinks_collect_dir( |
299 | struct xchk_nlink_ctrs *xnc, |
300 | struct xfs_inode *dp) |
301 | { |
302 | struct xfs_scrub *sc = xnc->sc; |
303 | unsigned int lock_mode; |
304 | int error = 0; |
305 | |
306 | /* Prevent anyone from changing this directory while we walk it. */ |
307 | xfs_ilock(dp, XFS_IOLOCK_SHARED); |
308 | lock_mode = xfs_ilock_data_map_shared(dp); |
309 | |
310 | /* |
311 | * The dotdot entry of an unlinked directory still points to the last |
312 | * parent, but the parent no longer links to this directory. Skip the |
313 | * directory to avoid overcounting. |
314 | */ |
315 | if (VFS_I(dp)->i_nlink == 0) |
316 | goto out_unlock; |
317 | |
318 | /* |
319 | * We cannot count file links if the directory looks as though it has |
320 | * been zapped by the inode record repair code. |
321 | */ |
322 | if (xchk_dir_looks_zapped(dp)) { |
323 | error = -EBUSY; |
324 | goto out_abort; |
325 | } |
326 | |
327 | error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc); |
328 | if (error == -ECANCELED) { |
329 | error = 0; |
330 | goto out_unlock; |
331 | } |
332 | if (error) |
333 | goto out_abort; |
334 | |
335 | xchk_iscan_mark_visited(&xnc->collect_iscan, dp); |
336 | goto out_unlock; |
337 | |
338 | out_abort: |
339 | xchk_set_incomplete(sc); |
340 | xchk_iscan_abort(&xnc->collect_iscan); |
341 | out_unlock: |
342 | xfs_iunlock(dp, lock_mode); |
343 | xfs_iunlock(dp, XFS_IOLOCK_SHARED); |
344 | return error; |
345 | } |
346 | |
347 | /* If this looks like a valid pointer, count it. */ |
348 | static inline int |
349 | xchk_nlinks_collect_metafile( |
350 | struct xchk_nlink_ctrs *xnc, |
351 | xfs_ino_t ino) |
352 | { |
353 | if (!xfs_verify_ino(xnc->sc->mp, ino)) |
354 | return 0; |
355 | |
356 | trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino); |
357 | return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); |
358 | } |
359 | |
360 | /* Bump the link counts of metadata files rooted in the superblock. */ |
361 | STATIC int |
362 | xchk_nlinks_collect_metafiles( |
363 | struct xchk_nlink_ctrs *xnc) |
364 | { |
365 | struct xfs_mount *mp = xnc->sc->mp; |
366 | int error = -ECANCELED; |
367 | |
368 | |
369 | if (xchk_iscan_aborted(&xnc->collect_iscan)) |
370 | goto out_incomplete; |
371 | |
372 | mutex_lock(&xnc->lock); |
373 | error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino); |
374 | if (error) |
375 | goto out_abort; |
376 | |
377 | error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino); |
378 | if (error) |
379 | goto out_abort; |
380 | |
381 | error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino); |
382 | if (error) |
383 | goto out_abort; |
384 | |
385 | error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino); |
386 | if (error) |
387 | goto out_abort; |
388 | |
389 | error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino); |
390 | if (error) |
391 | goto out_abort; |
392 | mutex_unlock(&xnc->lock); |
393 | |
394 | return 0; |
395 | |
396 | out_abort: |
397 | mutex_unlock(&xnc->lock); |
398 | xchk_iscan_abort(&xnc->collect_iscan); |
399 | out_incomplete: |
400 | xchk_set_incomplete(xnc->sc); |
401 | return error; |
402 | } |
403 | |
404 | /* Advance the collection scan cursor for this non-directory file. */ |
405 | static inline int |
406 | xchk_nlinks_collect_file( |
407 | struct xchk_nlink_ctrs *xnc, |
408 | struct xfs_inode *ip) |
409 | { |
410 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
411 | xchk_iscan_mark_visited(&xnc->collect_iscan, ip); |
412 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
413 | return 0; |
414 | } |
415 | |
416 | /* Walk all directories and count inode links. */ |
417 | STATIC int |
418 | xchk_nlinks_collect( |
419 | struct xchk_nlink_ctrs *xnc) |
420 | { |
421 | struct xfs_scrub *sc = xnc->sc; |
422 | struct xfs_inode *ip; |
423 | int error; |
424 | |
425 | /* Count the rt and quota files that are rooted in the superblock. */ |
426 | error = xchk_nlinks_collect_metafiles(xnc); |
427 | if (error) |
428 | return error; |
429 | |
430 | /* |
431 | * Set up for a potentially lengthy filesystem scan by reducing our |
432 | * transaction resource usage for the duration. Specifically: |
433 | * |
434 | * Cancel the transaction to release the log grant space while we scan |
435 | * the filesystem. |
436 | * |
437 | * Create a new empty transaction to eliminate the possibility of the |
438 | * inode scan deadlocking on cyclical metadata. |
439 | * |
440 | * We pass the empty transaction to the file scanning function to avoid |
441 | * repeatedly cycling empty transactions. This can be done even though |
442 | * we take the IOLOCK to quiesce the file because empty transactions |
443 | * do not take sb_internal. |
444 | */ |
445 | xchk_trans_cancel(sc); |
446 | error = xchk_trans_alloc_empty(sc); |
447 | if (error) |
448 | return error; |
449 | |
450 | while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) { |
451 | if (S_ISDIR(VFS_I(ip)->i_mode)) |
452 | error = xchk_nlinks_collect_dir(xnc, ip); |
453 | else |
454 | error = xchk_nlinks_collect_file(xnc, ip); |
455 | xchk_irele(sc, ip); |
456 | if (error) |
457 | break; |
458 | |
459 | if (xchk_should_terminate(sc, &error)) |
460 | break; |
461 | } |
462 | xchk_iscan_iter_finish(&xnc->collect_iscan); |
463 | if (error) { |
464 | xchk_set_incomplete(sc); |
465 | /* |
466 | * If we couldn't grab an inode that was busy with a state |
467 | * change, change the error code so that we exit to userspace |
468 | * as quickly as possible. |
469 | */ |
470 | if (error == -EBUSY) |
471 | return -ECANCELED; |
472 | return error; |
473 | } |
474 | |
475 | /* |
476 | * Switch out for a real transaction in preparation for building a new |
477 | * tree. |
478 | */ |
479 | xchk_trans_cancel(sc); |
480 | return xchk_setup_fs(sc); |
481 | } |
482 | |
483 | /* |
484 | * Part 2: Comparing file link counters. Walk each inode and compare the link |
485 | * counts against our shadow information; and then walk each shadow link count |
486 | * structure (that wasn't covered in the first part), comparing it against the |
487 | * file. |
488 | */ |
489 | |
490 | /* Read the observed link count for comparison with the actual inode. */ |
491 | STATIC int |
492 | xchk_nlinks_comparison_read( |
493 | struct xchk_nlink_ctrs *xnc, |
494 | xfs_ino_t ino, |
495 | struct xchk_nlink *obs) |
496 | { |
497 | struct xchk_nlink nl; |
498 | int error; |
499 | |
500 | error = xfarray_load_sparse(xnc->nlinks, ino, &nl); |
501 | if (error) |
502 | return error; |
503 | |
504 | nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN); |
505 | |
506 | error = xfarray_store(xnc->nlinks, ino, &nl); |
507 | if (error == -EFBIG) { |
508 | /* |
509 | * EFBIG means we tried to store data at too high a byte offset |
510 | * in the sparse array. IOWs, we cannot complete the check and |
511 | * must notify userspace that the check was incomplete. This |
512 | * shouldn't really happen outside of the collection phase. |
513 | */ |
514 | xchk_set_incomplete(xnc->sc); |
515 | return -ECANCELED; |
516 | } |
517 | if (error) |
518 | return error; |
519 | |
520 | /* Copy the counters, but do not expose the internal state. */ |
521 | obs->parents = nl.parents; |
522 | obs->backrefs = nl.backrefs; |
523 | obs->children = nl.children; |
524 | obs->flags = 0; |
525 | return 0; |
526 | } |
527 | |
528 | /* Check our link count against an inode. */ |
529 | STATIC int |
530 | xchk_nlinks_compare_inode( |
531 | struct xchk_nlink_ctrs *xnc, |
532 | struct xfs_inode *ip) |
533 | { |
534 | struct xchk_nlink obs; |
535 | struct xfs_scrub *sc = xnc->sc; |
536 | uint64_t total_links; |
537 | unsigned int actual_nlink; |
538 | int error; |
539 | |
540 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
541 | mutex_lock(&xnc->lock); |
542 | |
543 | if (xchk_iscan_aborted(&xnc->collect_iscan)) { |
544 | xchk_set_incomplete(xnc->sc); |
545 | error = -ECANCELED; |
546 | goto out_scanlock; |
547 | } |
548 | |
549 | error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs); |
550 | if (error) |
551 | goto out_scanlock; |
552 | |
553 | /* |
554 | * If we don't have ftype to get an accurate count of the subdirectory |
555 | * entries in this directory, take advantage of the fact that on a |
556 | * consistent ftype=0 filesystem, the number of subdirectory |
557 | * backreferences (dotdot entries) pointing towards this directory |
558 | * should be equal to the number of subdirectory entries in the |
559 | * directory. |
560 | */ |
561 | if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode)) |
562 | obs.children = obs.backrefs; |
563 | |
564 | total_links = xchk_nlink_total(ip, &obs); |
565 | actual_nlink = VFS_I(ip)->i_nlink; |
566 | |
567 | trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs); |
568 | |
569 | /* |
570 | * If we found so many parents that we'd overflow i_nlink, we must flag |
571 | * this as a corruption. The VFS won't let users increase the link |
572 | * count, but it will let them decrease it. |
573 | */ |
574 | if (total_links > XFS_MAXLINK) { |
575 | xchk_ino_set_corrupt(sc, ip->i_ino); |
576 | goto out_corrupt; |
577 | } |
578 | |
579 | /* Link counts should match. */ |
580 | if (total_links != actual_nlink) { |
581 | xchk_ino_set_corrupt(sc, ip->i_ino); |
582 | goto out_corrupt; |
583 | } |
584 | |
585 | if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) { |
586 | /* |
587 | * The collection phase ignores directories with zero link |
588 | * count, so we ignore them here too. |
589 | * |
590 | * The number of subdirectory backreferences (dotdot entries) |
591 | * pointing towards this directory should be equal to the |
592 | * number of subdirectory entries in the directory. |
593 | */ |
594 | if (obs.children != obs.backrefs) |
595 | xchk_ino_xref_set_corrupt(sc, ip->i_ino); |
596 | } else { |
597 | /* |
598 | * Non-directories and unlinked directories should not have |
599 | * back references. |
600 | */ |
601 | if (obs.backrefs != 0) { |
602 | xchk_ino_set_corrupt(sc, ip->i_ino); |
603 | goto out_corrupt; |
604 | } |
605 | |
606 | /* |
607 | * Non-directories and unlinked directories should not have |
608 | * children. |
609 | */ |
610 | if (obs.children != 0) { |
611 | xchk_ino_set_corrupt(sc, ip->i_ino); |
612 | goto out_corrupt; |
613 | } |
614 | } |
615 | |
616 | if (ip == sc->mp->m_rootip) { |
617 | /* |
618 | * For the root of a directory tree, both the '.' and '..' |
619 | * entries should point to the root directory. The dotdot |
620 | * entry is counted as a parent of the root /and/ a backref of |
621 | * the root directory. |
622 | */ |
623 | if (obs.parents != 1) { |
624 | xchk_ino_set_corrupt(sc, ip->i_ino); |
625 | goto out_corrupt; |
626 | } |
627 | } else if (actual_nlink > 0) { |
628 | /* |
629 | * Linked files that are not the root directory should have at |
630 | * least one parent. |
631 | */ |
632 | if (obs.parents == 0) { |
633 | xchk_ino_set_corrupt(sc, ip->i_ino); |
634 | goto out_corrupt; |
635 | } |
636 | } |
637 | |
638 | out_corrupt: |
639 | if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) |
640 | error = -ECANCELED; |
641 | out_scanlock: |
642 | mutex_unlock(&xnc->lock); |
643 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
644 | return error; |
645 | } |
646 | |
647 | /* |
648 | * Check our link count against an inode that wasn't checked previously. This |
649 | * is intended to catch directories with dangling links, though we could be |
650 | * racing with inode allocation in other threads. |
651 | */ |
652 | STATIC int |
653 | xchk_nlinks_compare_inum( |
654 | struct xchk_nlink_ctrs *xnc, |
655 | xfs_ino_t ino) |
656 | { |
657 | struct xchk_nlink obs; |
658 | struct xfs_mount *mp = xnc->sc->mp; |
659 | struct xfs_trans *tp = xnc->sc->tp; |
660 | struct xfs_buf *agi_bp; |
661 | struct xfs_inode *ip; |
662 | int error; |
663 | |
664 | /* |
665 | * The first iget failed, so try again with the variant that returns |
666 | * either an incore inode or the AGI buffer. If the function returns |
667 | * EINVAL/ENOENT, it should have passed us the AGI buffer so that we |
668 | * can guarantee that the inode won't be allocated while we check for |
669 | * a zero link count in the observed link count data. |
670 | */ |
671 | error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip); |
672 | if (!error) { |
673 | /* Actually got an inode, so use the inode compare. */ |
674 | error = xchk_nlinks_compare_inode(xnc, ip); |
675 | xchk_irele(xnc->sc, ip); |
676 | return error; |
677 | } |
678 | if (error == -ENOENT || error == -EINVAL) { |
679 | /* No inode was found. Check for zero link count below. */ |
680 | error = 0; |
681 | } |
682 | if (error) |
683 | goto out_agi; |
684 | |
685 | /* Ensure that we have protected against inode allocation/freeing. */ |
686 | if (agi_bp == NULL) { |
687 | ASSERT(agi_bp != NULL); |
688 | xchk_set_incomplete(xnc->sc); |
689 | return -ECANCELED; |
690 | } |
691 | |
692 | if (xchk_iscan_aborted(&xnc->collect_iscan)) { |
693 | xchk_set_incomplete(xnc->sc); |
694 | error = -ECANCELED; |
695 | goto out_agi; |
696 | } |
697 | |
698 | mutex_lock(&xnc->lock); |
699 | error = xchk_nlinks_comparison_read(xnc, ino, &obs); |
700 | if (error) |
701 | goto out_scanlock; |
702 | |
703 | trace_xchk_nlinks_check_zero(mp, ino, &obs); |
704 | |
705 | /* |
706 | * If we can't grab the inode, the link count had better be zero. We |
707 | * still hold the AGI to prevent inode allocation/freeing. |
708 | */ |
709 | if (xchk_nlink_total(NULL, &obs) != 0) { |
710 | xchk_ino_set_corrupt(xnc->sc, ino); |
711 | error = -ECANCELED; |
712 | } |
713 | |
714 | out_scanlock: |
715 | mutex_unlock(&xnc->lock); |
716 | out_agi: |
717 | if (agi_bp) |
718 | xfs_trans_brelse(tp, agi_bp); |
719 | return error; |
720 | } |
721 | |
722 | /* |
723 | * Try to visit every inode in the filesystem to compare the link count. Move |
724 | * on if we can't grab an inode, since we'll revisit unchecked nlink records in |
725 | * the second part. |
726 | */ |
727 | static int |
728 | xchk_nlinks_compare_iter( |
729 | struct xchk_nlink_ctrs *xnc, |
730 | struct xfs_inode **ipp) |
731 | { |
732 | int error; |
733 | |
734 | do { |
735 | error = xchk_iscan_iter(&xnc->compare_iscan, ipp); |
736 | } while (error == -EBUSY); |
737 | |
738 | return error; |
739 | } |
740 | |
741 | /* Compare the link counts we observed against the live information. */ |
742 | STATIC int |
743 | xchk_nlinks_compare( |
744 | struct xchk_nlink_ctrs *xnc) |
745 | { |
746 | struct xchk_nlink nl; |
747 | struct xfs_scrub *sc = xnc->sc; |
748 | struct xfs_inode *ip; |
749 | xfarray_idx_t cur = XFARRAY_CURSOR_INIT; |
750 | int error; |
751 | |
752 | if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) |
753 | return 0; |
754 | |
755 | /* |
756 | * Create a new empty transaction so that we can advance the iscan |
757 | * cursor without deadlocking if the inobt has a cycle and push on the |
758 | * inactivation workqueue. |
759 | */ |
760 | xchk_trans_cancel(sc); |
761 | error = xchk_trans_alloc_empty(sc); |
762 | if (error) |
763 | return error; |
764 | |
765 | /* |
766 | * Use the inobt to walk all allocated inodes to compare the link |
767 | * counts. Inodes skipped by _compare_iter will be tried again in the |
768 | * next phase of the scan. |
769 | */ |
770 | xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan); |
771 | while ((error = xchk_nlinks_compare_iter(xnc, ipp: &ip)) == 1) { |
772 | error = xchk_nlinks_compare_inode(xnc, ip); |
773 | xchk_iscan_mark_visited(&xnc->compare_iscan, ip); |
774 | xchk_irele(sc, ip); |
775 | if (error) |
776 | break; |
777 | |
778 | if (xchk_should_terminate(sc, &error)) |
779 | break; |
780 | } |
781 | xchk_iscan_iter_finish(&xnc->compare_iscan); |
782 | xchk_iscan_teardown(&xnc->compare_iscan); |
783 | if (error) |
784 | return error; |
785 | |
786 | if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) |
787 | return 0; |
788 | |
789 | /* |
790 | * Walk all the non-null nlink observations that weren't checked in the |
791 | * previous step. |
792 | */ |
793 | mutex_lock(&xnc->lock); |
794 | while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) { |
795 | xfs_ino_t ino = cur - 1; |
796 | |
797 | if (nl.flags & XCHK_NLINK_COMPARE_SCANNED) |
798 | continue; |
799 | |
800 | mutex_unlock(&xnc->lock); |
801 | |
802 | error = xchk_nlinks_compare_inum(xnc, ino); |
803 | if (error) |
804 | return error; |
805 | |
806 | if (xchk_should_terminate(xnc->sc, &error)) |
807 | return error; |
808 | |
809 | mutex_lock(&xnc->lock); |
810 | } |
811 | mutex_unlock(&xnc->lock); |
812 | |
813 | return error; |
814 | } |
815 | |
816 | /* Tear down everything associated with a nlinks check. */ |
817 | static void |
818 | xchk_nlinks_teardown_scan( |
819 | void *priv) |
820 | { |
821 | struct xchk_nlink_ctrs *xnc = priv; |
822 | |
823 | /* Discourage any hook functions that might be running. */ |
824 | xchk_iscan_abort(&xnc->collect_iscan); |
825 | |
826 | xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook); |
827 | |
828 | xfarray_destroy(xnc->nlinks); |
829 | xnc->nlinks = NULL; |
830 | |
831 | xchk_iscan_teardown(&xnc->collect_iscan); |
832 | mutex_destroy(&xnc->lock); |
833 | xnc->sc = NULL; |
834 | } |
835 | |
836 | /* |
837 | * Scan all inodes in the entire filesystem to generate link count data. If |
838 | * the scan is successful, the counts will be left alive for a repair. If any |
839 | * error occurs, we'll tear everything down. |
840 | */ |
841 | STATIC int |
842 | xchk_nlinks_setup_scan( |
843 | struct xfs_scrub *sc, |
844 | struct xchk_nlink_ctrs *xnc) |
845 | { |
846 | struct xfs_mount *mp = sc->mp; |
847 | char *descr; |
848 | unsigned long long max_inos; |
849 | xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1; |
850 | xfs_agino_t first_agino, last_agino; |
851 | int error; |
852 | |
853 | ASSERT(xnc->sc == NULL); |
854 | xnc->sc = sc; |
855 | |
856 | mutex_init(&xnc->lock); |
857 | |
858 | /* Retry iget every tenth of a second for up to 30 seconds. */ |
859 | xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan); |
860 | |
861 | /* |
862 | * Set up enough space to store an nlink record for the highest |
863 | * possible inode number in this system. |
864 | */ |
865 | xfs_agino_range(mp, last_agno, &first_agino, &last_agino); |
866 | max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1; |
867 | descr = xchk_xfile_descr(sc, "file link counts" ); |
868 | error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos), |
869 | sizeof(struct xchk_nlink), &xnc->nlinks); |
870 | kfree(descr); |
871 | if (error) |
872 | goto out_teardown; |
873 | |
874 | /* |
875 | * Hook into the directory entry code so that we can capture updates to |
876 | * file link counts. The hook only triggers for inodes that were |
877 | * already scanned, and the scanner thread takes each inode's ILOCK, |
878 | * which means that any in-progress inode updates will finish before we |
879 | * can scan the inode. |
880 | */ |
881 | ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); |
882 | xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update); |
883 | error = xfs_dir_hook_add(mp, &xnc->dhook); |
884 | if (error) |
885 | goto out_teardown; |
886 | |
887 | /* Use deferred cleanup to pass the inode link count data to repair. */ |
888 | sc->buf_cleanup = xchk_nlinks_teardown_scan; |
889 | return 0; |
890 | |
891 | out_teardown: |
892 | xchk_nlinks_teardown_scan(priv: xnc); |
893 | return error; |
894 | } |
895 | |
896 | /* Scrub the link count of all inodes on the filesystem. */ |
897 | int |
898 | xchk_nlinks( |
899 | struct xfs_scrub *sc) |
900 | { |
901 | struct xchk_nlink_ctrs *xnc = sc->buf; |
902 | int error = 0; |
903 | |
904 | /* Set ourselves up to check link counts on the live filesystem. */ |
905 | error = xchk_nlinks_setup_scan(sc, xnc); |
906 | if (error) |
907 | return error; |
908 | |
909 | /* Walk all inodes, picking up link count information. */ |
910 | error = xchk_nlinks_collect(xnc); |
911 | if (!xchk_xref_process_error(sc, 0, 0, &error)) |
912 | return error; |
913 | |
914 | /* Fail fast if we're not playing with a full dataset. */ |
915 | if (xchk_iscan_aborted(&xnc->collect_iscan)) |
916 | xchk_set_incomplete(sc); |
917 | if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) |
918 | return 0; |
919 | |
920 | /* Compare link counts. */ |
921 | error = xchk_nlinks_compare(xnc); |
922 | if (!xchk_xref_process_error(sc, 0, 0, &error)) |
923 | return error; |
924 | |
925 | /* Check one last time for an incomplete dataset. */ |
926 | if (xchk_iscan_aborted(&xnc->collect_iscan)) |
927 | xchk_set_incomplete(sc); |
928 | |
929 | return 0; |
930 | } |
931 | |