inode_repair.c source code [linux/fs/xfs/scrub/inode_repair.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* Copyright (C) 2018-2023 Oracle. All Rights Reserved.
4	* Author: Darrick J. Wong <djwong@kernel.org>
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_trans_resv.h"
11	#include "xfs_mount.h"
12	#include "xfs_defer.h"
13	#include "xfs_btree.h"
14	#include "xfs_bit.h"
15	#include "xfs_log_format.h"
16	#include "xfs_trans.h"
17	#include "xfs_sb.h"
18	#include "xfs_inode.h"
19	#include "xfs_icache.h"
20	#include "xfs_inode_buf.h"
21	#include "xfs_inode_fork.h"
22	#include "xfs_ialloc.h"
23	#include "xfs_da_format.h"
24	#include "xfs_reflink.h"
25	#include "xfs_alloc.h"
26	#include "xfs_rmap.h"
27	#include "xfs_rmap_btree.h"
28	#include "xfs_bmap.h"
29	#include "xfs_bmap_btree.h"
30	#include "xfs_bmap_util.h"
31	#include "xfs_dir2.h"
32	#include "xfs_dir2_priv.h"
33	#include "xfs_quota_defs.h"
34	#include "xfs_quota.h"
35	#include "xfs_ag.h"
36	#include "xfs_rtbitmap.h"
37	#include "xfs_attr_leaf.h"
38	#include "xfs_log_priv.h"
39	#include "xfs_health.h"
40	#include "xfs_symlink_remote.h"
41	#include "scrub/xfs_scrub.h"
42	#include "scrub/scrub.h"
43	#include "scrub/common.h"
44	#include "scrub/btree.h"
45	#include "scrub/trace.h"
46	#include "scrub/repair.h"
47	#include "scrub/iscan.h"
48	#include "scrub/readdir.h"
49
50	/*
51	* Inode Record Repair
52	* ===================
53	*
54	* Roughly speaking, inode problems can be classified based on whether or not
55	* they trip the dinode verifiers. If those trip, then we won't be able to
56	* xfs_iget ourselves the inode.
57	*
 * Therefore, the xrep_dinode_* functions fix anything that will cause the
 * inode buffer verifier or the dinode verifier to fail.  The xrep_inode_*
 * functions fix things on live incore inodes.  The inode repair functions
 * make decisions with security and usability implications when reviving a
 * file:
62	*
63	* - Files with zero di_mode or a garbage di_mode are converted to regular file
64	* that only root can read. This file may not actually contain user data,
65	* if the file was not previously a regular file. Setuid and setgid bits
66	* are cleared.
67	*
68	* - Zero-size directories can be truncated to look empty. It is necessary to
69	* run the bmapbtd and directory repair functions to fully rebuild the
70	* directory.
71	*
72	* - Zero-size symbolic link targets can be truncated to '?'. It is necessary
73	* to run the bmapbtd and symlink repair functions to salvage the symlink.
74	*
75	* - Invalid extent size hints will be removed.
76	*
77	* - Quotacheck will be scheduled if we repaired an inode that was so badly
78	* damaged that the ondisk inode had to be rebuilt.
79	*
80	* - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
81	* Setuid and setgid bits are cleared.
82	*
83	* - Data and attr forks are reset to extents format with zero extents if the
84	* fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta
85	* repair functions to recover the space mapping.
86	*
87	* - ACLs will not be recovered if the attr fork is zapped or the extended
88	* attribute structure itself requires salvaging.
89	*
90	* - If the attr fork is zapped, the user and group ids are reset to root and
91	* the setuid and setgid bits are removed.
92	*/
93
/*
 * All the information we need to repair the ondisk inode if we can't iget the
 * incore inode.  We don't allocate this buffer unless we're going to perform
 * a repair to the ondisk inode cluster buffer.
 */
struct xrep_inode {
	/* Inode mapping that we saved from the initial lookup attempt. */
	struct xfs_imap		imap;

	/* Scrub context that owns this state; set by xrep_setup_inode. */
	struct xfs_scrub	*sc;

	/* Blocks in use on the data device by data extents or bmbt blocks. */
	xfs_rfsblock_t		data_blocks;

	/* Blocks in use on the rt device. */
	xfs_rfsblock_t		rt_blocks;

	/* Blocks in use by the attr fork. */
	xfs_rfsblock_t		attr_blocks;

	/* Number of data device extents for the data fork. */
	xfs_extnum_t		data_extents;

	/*
	 * Number of realtime device extents for the data fork.  If
	 * data_extents and rt_extents indicate that the data fork has extents
	 * on both devices, we'll just back away slowly.
	 */
	xfs_extnum_t		rt_extents;

	/* Number of (data device) extents for the attr fork. */
	xfs_aextnum_t		attr_extents;

	/* Sick state to set after zapping parts of the inode. */
	unsigned int		ino_sick_mask;

	/* Must we remove all access from this file? */
	bool			zap_acls;

	/* Inode scanner to see if we can find the ftype from dirents */
	struct xchk_iscan	ftype_iscan;

	/*
	 * File type claimed by the directory entries that point to this inode;
	 * stays XFS_DIR3_FT_UNKNOWN until the scan finds such an entry.
	 */
	uint8_t			alleged_ftype;
};
137
138	/*
139	* Setup function for inode repair. @imap contains the ondisk inode mapping
140	* information so that we can correct the ondisk inode cluster buffer if
141	* necessary to make iget work.
142	*/
143	int
144	xrep_setup_inode(
145	struct xfs_scrub *sc,
146	const struct xfs_imap *imap)
147	{
148	struct xrep_inode *ri;
149
150	sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
151	if (!sc->buf)
152	return -ENOMEM;
153
154	ri = sc->buf;
155	memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
156	ri->sc = sc;
157	return `0`;
158	}
159
160	/*
161	* Make sure this ondisk inode can pass the inode buffer verifier. This is
162	* not the same as the dinode verifier.
163	*/
164	STATIC void
165	xrep_dinode_buf_core(
166	struct xfs_scrub *sc,
167	struct xfs_buf *bp,
168	unsigned int ioffset)
169	{
170	struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset);
171	struct xfs_trans *tp = sc->tp;
172	struct xfs_mount *mp = sc->mp;
173	xfs_agino_t agino;
174	bool crc_ok = false;
175	bool magic_ok = false;
176	bool unlinked_ok = false;
177
178	agino = be32_to_cpu(dip->di_next_unlinked);
179
180	if (xfs_verify_agino_or_null(bp->b_pag, agino))
181	unlinked_ok = true;
182
183	if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
184	xfs_dinode_good_version(mp, dip->di_version))
185	magic_ok = true;
186
187	if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
188	XFS_DINODE_CRC_OFF))
189	crc_ok = true;
190
191	if (magic_ok && unlinked_ok && crc_ok)
192	return;
193
194	if (!magic_ok) {
195	dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
196	dip->di_version = `3`;
197	}
198	if (!unlinked_ok)
199	dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
200	xfs_dinode_calc_crc(mp, dip);
201	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
202	xfs_trans_log_buf(tp, bp, ioffset,
203	ioffset + sizeof(struct xfs_dinode) - `1`);
204	}
205
206	/ Make sure this inode cluster buffer can pass the inode buffer verifier. /
207	STATIC void
208	xrep_dinode_buf(
209	struct xfs_scrub *sc,
210	struct xfs_buf *bp)
211	{
212	struct xfs_mount *mp = sc->mp;
213	int i;
214	int ni;
215
216	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
217	for (i = `0`; i < ni; i++)
218	xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
219	}
220
221	/ Reinitialize things that never change in an inode. /
222	STATIC void
223	xrep_dinode_header(
224	struct xfs_scrub *sc,
225	struct xfs_dinode *dip)
226	{
227	trace_xrep_dinode_header(sc, dip);
228
229	dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
230	if (!xfs_dinode_good_version(sc->mp, dip->di_version))
231	dip->di_version = `3`;
232	dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
233	uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
234	dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
235	}
236
237	/*
238	* If this directory entry points to the scrub target inode, then the directory
239	* we're scanning is the parent of the scrub target inode.
240	*/
241	STATIC int
242	xrep_dinode_findmode_dirent(
243	struct xfs_scrub *sc,
244	struct xfs_inode *dp,
245	xfs_dir2_dataptr_t dapos,
246	const struct xfs_name *name,
247	xfs_ino_t ino,
248	void *priv)
249	{
250	struct xrep_inode *ri = priv;
251	int error = `0`;
252
253	if (xchk_should_terminate(ri->sc, &error))
254	return error;
255
256	if (ino != sc->sm->sm_ino)
257	return `0`;
258
259	/ Ignore garbage directory entry names. /
260	if (name->len == `0` \|\| !xfs_dir2_namecheck(name->name, name->len))
261	return -EFSCORRUPTED;
262
263	/ Don't pick up dot or dotdot entries; we only want child dirents. /
264	if (xfs_dir2_samename(name, &xfs_name_dotdot) \|\|
265	xfs_dir2_samename(name, &xfs_name_dot))
266	return `0`;
267
268	/*
269	* Uhoh, more than one parent for this inode and they don't agree on
270	* the file type?
271	*/
272	if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
273	ri->alleged_ftype != name->type) {
274	trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
275	ri->alleged_ftype);
276	return -EFSCORRUPTED;
277	}
278
279	/ We found a potential parent; remember the ftype. /
280	trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
281	ri->alleged_ftype = name->type;
282	return `0`;
283	}
284
285	/*
286	* If this is a directory, walk the dirents looking for any that point to the
287	* scrub target inode.
288	*/
289	STATIC int
290	xrep_dinode_findmode_walk_directory(
291	struct xrep_inode *ri,
292	struct xfs_inode *dp)
293	{
294	struct xfs_scrub *sc = ri->sc;
295	unsigned int lock_mode;
296	int error = `0`;
297
298	/*
299	* Scan the directory to see if there it contains an entry pointing to
300	* the directory that we are repairing.
301	*/
302	lock_mode = xfs_ilock_data_map_shared(dp);
303
304	/*
305	* If this directory is known to be sick, we cannot scan it reliably
306	* and must abort.
307	*/
308	if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE \|
309	XFS_SICK_INO_BMBTD \|
310	XFS_SICK_INO_DIR)) {
311	error = -EFSCORRUPTED;
312	goto out_unlock;
313	}
314
315	/*
316	* We cannot complete our parent pointer scan if a directory looks as
317	* though it has been zapped by the inode record repair code.
318	*/
319	if (xchk_dir_looks_zapped(dp)) {
320	error = -EBUSY;
321	goto out_unlock;
322	}
323
324	error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
325	if (error)
326	goto out_unlock;
327
328	out_unlock:
329	xfs_iunlock(dp, lock_mode);
330	return error;
331	}
332
333	/*
334	* Try to find the mode of the inode being repaired by looking for directories
335	* that point down to this file.
336	*/
337	STATIC int
338	xrep_dinode_find_mode(
339	struct xrep_inode *ri,
340	uint16_t *mode)
341	{
342	struct xfs_scrub *sc = ri->sc;
343	struct xfs_inode *dp;
344	int error;
345
346	/ No ftype means we have no other metadata to consult. /
347	if (!xfs_has_ftype(sc->mp)) {
348	*mode = S_IFREG;
349	return `0`;
350	}
351
352	/*
353	* Scan all directories for parents that might point down to this
354	* inode. Skip the inode being repaired during the scan since it
355	* cannot be its own parent. Note that we still hold the AGI locked
356	* so there's a real possibility that _iscan_iter can return EBUSY.
357	*/
358	xchk_iscan_start(sc, `5000`, `100`, &ri->ftype_iscan);
359	ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
360	ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
361	while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == `1`) {
362	if (S_ISDIR(VFS_I(dp)->i_mode))
363	error = xrep_dinode_findmode_walk_directory(ri, dp);
364	xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
365	xchk_irele(sc, dp);
366	if (error < `0`)
367	break;
368	if (xchk_should_terminate(sc, &error))
369	break;
370	}
371	xchk_iscan_iter_finish(&ri->ftype_iscan);
372	xchk_iscan_teardown(&ri->ftype_iscan);
373
374	if (error == -EBUSY) {
375	if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
376	/*
377	* If we got an EBUSY after finding at least one
378	* dirent, that means the scan found an inode on the
379	* inactivation list and could not open it. Accept the
380	* alleged ftype and install a new mode below.
381	*/
382	error = `0`;
383	} else if (!(sc->flags & XCHK_TRY_HARDER)) {
384	/*
385	* Otherwise, retry the operation one time to see if
386	* the reason for the delay is an inode from the same
387	* cluster buffer waiting on the inactivation list.
388	*/
389	error = -EDEADLOCK;
390	}
391	}
392	if (error)
393	return error;
394
395	/*
396	* Convert the discovered ftype into the file mode. If all else fails,
397	* return S_IFREG.
398	*/
399	switch (ri->alleged_ftype) {
400	case XFS_DIR3_FT_DIR:
401	*mode = S_IFDIR;
402	break;
403	case XFS_DIR3_FT_WHT:
404	case XFS_DIR3_FT_CHRDEV:
405	*mode = S_IFCHR;
406	break;
407	case XFS_DIR3_FT_BLKDEV:
408	*mode = S_IFBLK;
409	break;
410	case XFS_DIR3_FT_FIFO:
411	*mode = S_IFIFO;
412	break;
413	case XFS_DIR3_FT_SOCK:
414	*mode = S_IFSOCK;
415	break;
416	case XFS_DIR3_FT_SYMLINK:
417	*mode = S_IFLNK;
418	break;
419	default:
420	*mode = S_IFREG;
421	break;
422	}
423	return `0`;
424	}
425
426	/ Turn di_mode into /something/ recognizable. Returns true if we succeed. /
427	STATIC int
428	xrep_dinode_mode(
429	struct xrep_inode *ri,
430	struct xfs_dinode *dip)
431	{
432	struct xfs_scrub *sc = ri->sc;
433	uint16_t mode = be16_to_cpu(dip->di_mode);
434	int error;
435
436	trace_xrep_dinode_mode(sc, dip);
437
438	if (mode == `0` \|\| xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
439	return `0`;
440
441	/ Try to fix the mode. If we cannot, then leave everything alone. /
442	error = xrep_dinode_find_mode(ri, &mode);
443	switch (error) {
444	case -EINTR:
445	case -EBUSY:
446	case -EDEADLOCK:
447	/ temporary failure or fatal signal /
448	return error;
449	case `0`:
450	/ found mode /
451	break;
452	default:
453	/ some other error, assume S_IFREG /
454	mode = S_IFREG;
455	break;
456	}
457
458	/ bad mode, so we set it to a file that only root can read /
459	dip->di_mode = cpu_to_be16(mode);
460	dip->di_uid = `0`;
461	dip->di_gid = `0`;
462	ri->zap_acls = true;
463	return `0`;
464	}
465
466	/ Fix any conflicting flags that the verifiers complain about. /
467	STATIC void
468	xrep_dinode_flags(
469	struct xfs_scrub *sc,
470	struct xfs_dinode *dip,
471	bool isrt)
472	{
473	struct xfs_mount *mp = sc->mp;
474	uint64_t flags2 = be64_to_cpu(dip->di_flags2);
475	uint16_t flags = be16_to_cpu(dip->di_flags);
476	uint16_t mode = be16_to_cpu(dip->di_mode);
477
478	trace_xrep_dinode_flags(sc, dip);
479
480	if (isrt)
481	flags \|= XFS_DIFLAG_REALTIME;
482	else
483	flags &= ~XFS_DIFLAG_REALTIME;
484
485	/*
486	* For regular files on a reflink filesystem, set the REFLINK flag to
487	* protect shared extents. A later stage will actually check those
488	* extents and clear the flag if possible.
489	*/
490	if (xfs_has_reflink(mp) && S_ISREG(mode))
491	flags2 \|= XFS_DIFLAG2_REFLINK;
492	else
493	flags2 &= ~(XFS_DIFLAG2_REFLINK \| XFS_DIFLAG2_COWEXTSIZE);
494	if (flags & XFS_DIFLAG_REALTIME)
495	flags2 &= ~XFS_DIFLAG2_REFLINK;
496	if (!xfs_has_bigtime(mp))
497	flags2 &= ~XFS_DIFLAG2_BIGTIME;
498	if (!xfs_has_large_extent_counts(mp))
499	flags2 &= ~XFS_DIFLAG2_NREXT64;
500	if (flags2 & XFS_DIFLAG2_NREXT64)
501	dip->di_nrext64_pad = `0`;
502	else if (dip->di_version >= `3`)
503	dip->di_v3_pad = `0`;
504	dip->di_flags = cpu_to_be16(flags);
505	dip->di_flags2 = cpu_to_be64(flags2);
506	}
507
508	/*
509	* Blow out symlink; now it points nowhere. We don't have to worry about
510	* incore state because this inode is failing the verifiers.
511	*/
512	STATIC void
513	xrep_dinode_zap_symlink(
514	struct xrep_inode *ri,
515	struct xfs_dinode *dip)
516	{
517	struct xfs_scrub *sc = ri->sc;
518	char *p;
519
520	trace_xrep_dinode_zap_symlink(sc, dip);
521
522	dip->di_format = XFS_DINODE_FMT_LOCAL;
523	dip->di_size = cpu_to_be64(`1`);
524	p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
525	*p = `'?'`;
526	ri->ino_sick_mask \|= XFS_SICK_INO_SYMLINK_ZAPPED;
527	}
528
529	/*
530	* Blow out dir, make the parent point to the root. In the future repair will
531	* reconstruct this directory for us. Note that there's no in-core directory
532	* inode because the sf verifier tripped, so we don't have to worry about the
533	* dentry cache.
534	*/
535	STATIC void
536	xrep_dinode_zap_dir(
537	struct xrep_inode *ri,
538	struct xfs_dinode *dip)
539	{
540	struct xfs_scrub *sc = ri->sc;
541	struct xfs_mount *mp = sc->mp;
542	struct xfs_dir2_sf_hdr *sfp;
543	int i8count;
544
545	trace_xrep_dinode_zap_dir(sc, dip);
546
547	dip->di_format = XFS_DINODE_FMT_LOCAL;
548	i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
549	sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
550	sfp->count = `0`;
551	sfp->i8count = i8count;
552	xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
553	dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
554	ri->ino_sick_mask \|= XFS_SICK_INO_DIR_ZAPPED;
555	}
556
557	/ Make sure we don't have a garbage file size. /
558	STATIC void
559	xrep_dinode_size(
560	struct xrep_inode *ri,
561	struct xfs_dinode *dip)
562	{
563	struct xfs_scrub *sc = ri->sc;
564	uint64_t size = be64_to_cpu(dip->di_size);
565	uint16_t mode = be16_to_cpu(dip->di_mode);
566
567	trace_xrep_dinode_size(sc, dip);
568
569	switch (mode & S_IFMT) {
570	case S_IFIFO:
571	case S_IFCHR:
572	case S_IFBLK:
573	case S_IFSOCK:
574	/ di_size can't be nonzero for special files /
575	dip->di_size = `0`;
576	break;
577	case S_IFREG:
578	/ Regular files can't be larger than 2^63-1 bytes. /
579	dip->di_size = cpu_to_be64(size & ~(`1ULL` << `63`));
580	break;
581	case S_IFLNK:
582	/*
583	* Truncate ridiculously oversized symlinks. If the size is
584	* zero, reset it to point to the current directory. Both of
585	* these conditions trigger dinode verifier errors, so there
586	* is no in-core state to reset.
587	*/
588	if (size > XFS_SYMLINK_MAXLEN)
589	dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
590	else if (size == `0`)
591	xrep_dinode_zap_symlink(ri, dip);
592	break;
593	case S_IFDIR:
594	/*
595	* Directories can't have a size larger than 32G. If the size
596	* is zero, reset it to an empty directory. Both of these
597	* conditions trigger dinode verifier errors, so there is no
598	* in-core state to reset.
599	*/
600	if (size > XFS_DIR2_SPACE_SIZE)
601	dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
602	else if (size == `0`)
603	xrep_dinode_zap_dir(ri, dip);
604	break;
605	}
606	}
607
608	/ Fix extent size hints. /
609	STATIC void
610	xrep_dinode_extsize_hints(
611	struct xfs_scrub *sc,
612	struct xfs_dinode *dip)
613	{
614	struct xfs_mount *mp = sc->mp;
615	uint64_t flags2 = be64_to_cpu(dip->di_flags2);
616	uint16_t flags = be16_to_cpu(dip->di_flags);
617	uint16_t mode = be16_to_cpu(dip->di_mode);
618
619	xfs_failaddr_t fa;
620
621	trace_xrep_dinode_extsize_hints(sc, dip);
622
623	fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
624	mode, flags);
625	if (fa) {
626	dip->di_extsize = `0`;
627	dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE \|
628	XFS_DIFLAG_EXTSZINHERIT);
629	}
630
631	if (dip->di_version < `3`)
632	return;
633
634	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
635	mode, flags, flags2);
636	if (fa) {
637	dip->di_cowextsize = `0`;
638	dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
639	}
640	}
641
642	/ Count extents and blocks for an inode given an rmap. /
643	STATIC int
644	xrep_dinode_walk_rmap(
645	struct xfs_btree_cur *cur,
646	const struct xfs_rmap_irec *rec,
647	void *priv)
648	{
649	struct xrep_inode *ri = priv;
650	int error = `0`;
651
652	if (xchk_should_terminate(ri->sc, &error))
653	return error;
654
655	/ We only care about this inode. /
656	if (rec->rm_owner != ri->sc->sm->sm_ino)
657	return `0`;
658
659	if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
660	ri->attr_blocks += rec->rm_blockcount;
661	if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
662	ri->attr_extents++;
663
664	return `0`;
665	}
666
667	ri->data_blocks += rec->rm_blockcount;
668	if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
669	ri->data_extents++;
670
671	return `0`;
672	}
673
674	/ Count extents and blocks for an inode from all AG rmap data. /
675	STATIC int
676	xrep_dinode_count_ag_rmaps(
677	struct xrep_inode *ri,
678	struct xfs_perag *pag)
679	{
680	struct xfs_btree_cur *cur;
681	struct xfs_buf *agf;
682	int error;
683
684	error = xfs_alloc_read_agf(pag, ri->sc->tp, `0`, &agf);
685	if (error)
686	return error;
687
688	cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
689	error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
690	xfs_btree_del_cursor(cur, error);
691	xfs_trans_brelse(ri->sc->tp, agf);
692	return error;
693	}
694
695	/ Count extents and blocks for a given inode from all rmap data. /
696	STATIC int
697	xrep_dinode_count_rmaps(
698	struct xrep_inode *ri)
699	{
700	struct xfs_perag *pag;
701	xfs_agnumber_t agno;
702	int error;
703
704	if (!xfs_has_rmapbt(ri->sc->mp) \|\| xfs_has_realtime(ri->sc->mp))
705	return -EOPNOTSUPP;
706
707	for_each_perag(ri->sc->mp, agno, pag) {
708	error = xrep_dinode_count_ag_rmaps(ri, pag);
709	if (error) {
710	xfs_perag_rele(pag);
711	return error;
712	}
713	}
714
715	/ Can't have extents on both the rt and the data device. /
716	if (ri->data_extents && ri->rt_extents)
717	return -EFSCORRUPTED;
718
719	trace_xrep_dinode_count_rmaps(ri->sc,
720	ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
721	ri->data_extents, ri->rt_extents, ri->attr_extents);
722	return `0`;
723	}
724
725	/ Return true if this extents-format ifork looks like garbage. /
726	STATIC bool
727	xrep_dinode_bad_extents_fork(
728	struct xfs_scrub *sc,
729	struct xfs_dinode *dip,
730	unsigned int dfork_size,
731	int whichfork)
732	{
733	struct xfs_bmbt_irec new;
734	struct xfs_bmbt_rec *dp;
735	xfs_extnum_t nex;
736	bool isrt;
737	unsigned int i;
738
739	nex = xfs_dfork_nextents(dip, whichfork);
740	if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
741	return true;
742
743	dp = XFS_DFORK_PTR(dip, whichfork);
744
745	isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
746	for (i = `0`; i < nex; i++, dp++) {
747	xfs_failaddr_t fa;
748
749	xfs_bmbt_disk_get_all(dp, &new);
750	fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
751	&new);
752	if (fa)
753	return true;
754	}
755
756	return false;
757	}
758
759	/ Return true if this btree-format ifork looks like garbage. /
760	STATIC bool
761	xrep_dinode_bad_bmbt_fork(
762	struct xfs_scrub *sc,
763	struct xfs_dinode *dip,
764	unsigned int dfork_size,
765	int whichfork)
766	{
767	struct xfs_bmdr_block *dfp;
768	xfs_extnum_t nex;
769	unsigned int i;
770	unsigned int dmxr;
771	unsigned int nrecs;
772	unsigned int level;
773
774	nex = xfs_dfork_nextents(dip, whichfork);
775	if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
776	return true;
777
778	if (dfork_size < sizeof(struct xfs_bmdr_block))
779	return true;
780
781	dfp = XFS_DFORK_PTR(dip, whichfork);
782	nrecs = be16_to_cpu(dfp->bb_numrecs);
783	level = be16_to_cpu(dfp->bb_level);
784
785	if (nrecs == `0` \|\| XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
786	return true;
787	if (level == `0` \|\| level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
788	return true;
789
790	dmxr = xfs_bmdr_maxrecs(dfork_size, `0`);
791	for (i = `1`; i <= nrecs; i++) {
792	struct xfs_bmbt_key *fkp;
793	xfs_bmbt_ptr_t *fpp;
794	xfs_fileoff_t fileoff;
795	xfs_fsblock_t fsbno;
796
797	fkp = XFS_BMDR_KEY_ADDR(dfp, i);
798	fileoff = be64_to_cpu(fkp->br_startoff);
799	if (!xfs_verify_fileoff(sc->mp, fileoff))
800	return true;
801
802	fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
803	fsbno = be64_to_cpu(*fpp);
804	if (!xfs_verify_fsbno(sc->mp, fsbno))
805	return true;
806	}
807
808	return false;
809	}
810
811	/*
812	* Check the data fork for things that will fail the ifork verifiers or the
813	* ifork formatters.
814	*/
815	STATIC bool
816	xrep_dinode_check_dfork(
817	struct xfs_scrub *sc,
818	struct xfs_dinode *dip,
819	uint16_t mode)
820	{
821	void *dfork_ptr;
822	int64_t data_size;
823	unsigned int fmt;
824	unsigned int dfork_size;
825
826	/*
827	* Verifier functions take signed int64_t, so check for bogus negative
828	* values first.
829	*/
830	data_size = be64_to_cpu(dip->di_size);
831	if (data_size < `0`)
832	return true;
833
834	fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
835	switch (mode & S_IFMT) {
836	case S_IFIFO:
837	case S_IFCHR:
838	case S_IFBLK:
839	case S_IFSOCK:
840	if (fmt != XFS_DINODE_FMT_DEV)
841	return true;
842	break;
843	case S_IFREG:
844	if (fmt == XFS_DINODE_FMT_LOCAL)
845	return true;
846	fallthrough;
847	case S_IFLNK:
848	case S_IFDIR:
849	switch (fmt) {
850	case XFS_DINODE_FMT_LOCAL:
851	case XFS_DINODE_FMT_EXTENTS:
852	case XFS_DINODE_FMT_BTREE:
853	break;
854	default:
855	return true;
856	}
857	break;
858	default:
859	return true;
860	}
861
862	dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
863	dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
864
865	switch (fmt) {
866	case XFS_DINODE_FMT_DEV:
867	break;
868	case XFS_DINODE_FMT_LOCAL:
869	/ dir/symlink structure cannot be larger than the fork /
870	if (data_size > dfork_size)
871	return true;
872	/ directory structure must pass verification. /
873	if (S_ISDIR(mode) &&
874	xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
875	return true;
876	/ symlink structure must pass verification. /
877	if (S_ISLNK(mode) &&
878	xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
879	return true;
880	break;
881	case XFS_DINODE_FMT_EXTENTS:
882	if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
883	XFS_DATA_FORK))
884	return true;
885	break;
886	case XFS_DINODE_FMT_BTREE:
887	if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
888	XFS_DATA_FORK))
889	return true;
890	break;
891	default:
892	return true;
893	}
894
895	return false;
896	}
897
898	static void
899	xrep_dinode_set_data_nextents(
900	struct xfs_dinode *dip,
901	xfs_extnum_t nextents)
902	{
903	if (xfs_dinode_has_large_extent_counts(dip))
904	dip->di_big_nextents = cpu_to_be64(nextents);
905	else
906	dip->di_nextents = cpu_to_be32(nextents);
907	}
908
909	static void
910	xrep_dinode_set_attr_nextents(
911	struct xfs_dinode *dip,
912	xfs_extnum_t nextents)
913	{
914	if (xfs_dinode_has_large_extent_counts(dip))
915	dip->di_big_anextents = cpu_to_be32(nextents);
916	else
917	dip->di_anextents = cpu_to_be16(nextents);
918	}
919
920	/ Reset the data fork to something sane. /
921	STATIC void
922	xrep_dinode_zap_dfork(
923	struct xrep_inode *ri,
924	struct xfs_dinode *dip,
925	uint16_t mode)
926	{
927	struct xfs_scrub *sc = ri->sc;
928
929	trace_xrep_dinode_zap_dfork(sc, dip);
930
931	ri->ino_sick_mask \|= XFS_SICK_INO_BMBTD_ZAPPED;
932
933	xrep_dinode_set_data_nextents(dip, `0`);
934	ri->data_blocks = `0`;
935	ri->rt_blocks = `0`;
936
937	/ Special files always get reset to DEV /
938	switch (mode & S_IFMT) {
939	case S_IFIFO:
940	case S_IFCHR:
941	case S_IFBLK:
942	case S_IFSOCK:
943	dip->di_format = XFS_DINODE_FMT_DEV;
944	dip->di_size = `0`;
945	return;
946	}
947
948	/*
949	* If we have data extents, reset to an empty map and hope the user
950	* will run the bmapbtd checker next.
951	*/
952	if (ri->data_extents \|\| ri->rt_extents \|\| S_ISREG(mode)) {
953	dip->di_format = XFS_DINODE_FMT_EXTENTS;
954	return;
955	}
956
957	/ Otherwise, reset the local format to the minimum. /
958	switch (mode & S_IFMT) {
959	case S_IFLNK:
960	xrep_dinode_zap_symlink(ri, dip);
961	break;
962	case S_IFDIR:
963	xrep_dinode_zap_dir(ri, dip);
964	break;
965	}
966	}
967
968	/*
969	* Check the attr fork for things that will fail the ifork verifiers or the
970	* ifork formatters.
971	*/
972	STATIC bool
973	xrep_dinode_check_afork(
974	struct xfs_scrub *sc,
975	struct xfs_dinode *dip)
976	{
977	struct xfs_attr_sf_hdr *afork_ptr;
978	size_t attr_size;
979	unsigned int afork_size;
980
981	if (XFS_DFORK_BOFF(dip) == `0`)
982	return dip->di_aformat != XFS_DINODE_FMT_EXTENTS \|\|
983	xfs_dfork_attr_extents(dip) != `0`;
984
985	afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
986	afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
987
988	switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
989	case XFS_DINODE_FMT_LOCAL:
990	/ Fork has to be large enough to extract the xattr size. /
991	if (afork_size < sizeof(struct xfs_attr_sf_hdr))
992	return true;
993
994	/ xattr structure cannot be larger than the fork /
995	attr_size = be16_to_cpu(afork_ptr->totsize);
996	if (attr_size > afork_size)
997	return true;
998
999	/ xattr structure must pass verification. /
1000	return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
1001	case XFS_DINODE_FMT_EXTENTS:
1002	if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
1003	XFS_ATTR_FORK))
1004	return true;
1005	break;
1006	case XFS_DINODE_FMT_BTREE:
1007	if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
1008	XFS_ATTR_FORK))
1009	return true;
1010	break;
1011	default:
1012	return true;
1013	}
1014
1015	return false;
1016	}
1017
1018	/*
1019	* Reset the attr fork to empty. Since the attr fork could have contained
1020	* ACLs, make the file readable only by root.
1021	*/
1022	STATIC void
1023	xrep_dinode_zap_afork(
1024	struct xrep_inode *ri,
1025	struct xfs_dinode *dip,
1026	uint16_t mode)
1027	{
1028	struct xfs_scrub *sc = ri->sc;
1029
1030	trace_xrep_dinode_zap_afork(sc, dip);
1031
1032	ri->ino_sick_mask \|= XFS_SICK_INO_BMBTA_ZAPPED;
1033
1034	dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
1035	xrep_dinode_set_attr_nextents(dip, `0`);
1036	ri->attr_blocks = `0`;
1037
1038	/*
1039	* If the data fork is in btree format, removing the attr fork entirely
1040	* might cause verifier failures if the next level down in the bmbt
1041	* could now fit in the data fork area.
1042	*/
1043	if (dip->di_format != XFS_DINODE_FMT_BTREE)
1044	dip->di_forkoff = `0`;
1045	dip->di_mode = cpu_to_be16(mode & ~`0777`);
1046	dip->di_uid = `0`;
1047	dip->di_gid = `0`;
1048	}
1049
1050	/ Make sure the fork offset is a sensible value. /
1051	STATIC void
1052	xrep_dinode_ensure_forkoff(
1053	struct xrep_inode *ri,
1054	struct xfs_dinode *dip,
1055	uint16_t mode)
1056	{
1057	struct xfs_bmdr_block *bmdr;
1058	struct xfs_scrub *sc = ri->sc;
1059	xfs_extnum_t attr_extents, data_extents;
1060	size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(`1`);
1061	unsigned int lit_sz = XFS_LITINO(sc->mp);
1062	unsigned int afork_min, dfork_min;
1063
1064	trace_xrep_dinode_ensure_forkoff(sc, dip);
1065
1066	/*
1067	* Before calling this function, xrep_dinode_core ensured that both
1068	* forks actually fit inside their respective literal areas. If this
1069	* was not the case, the fork was reset to FMT_EXTENTS with zero
1070	* records. If the rmapbt scan found attr or data fork blocks, this
1071	* will be noted in the dinode_stats, and we must leave enough room
1072	* for the bmap repair code to reconstruct the mapping structure.
1073	*
1074	* First, compute the minimum space required for the attr fork.
1075	*/
1076	switch (dip->di_aformat) {
1077	case XFS_DINODE_FMT_LOCAL:
1078	/*
1079	* If we still have a shortform xattr structure at all, that
1080	* means the attr fork area was exactly large enough to fit
1081	* the sf structure.
1082	*/
1083	afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
1084	break;
1085	case XFS_DINODE_FMT_EXTENTS:
1086	attr_extents = xfs_dfork_attr_extents(dip);
1087	if (attr_extents) {
1088	/*
1089	* We must maintain sufficient space to hold the entire
1090	* extent map array in the data fork. Note that we
1091	* previously zapped the fork if it had no chance of
1092	* fitting in the inode.
1093	*/
1094	afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
1095	} else if (ri->attr_extents > `0`) {
1096	/*
1097	* The attr fork thinks it has zero extents, but we
1098	* found some xattr extents. We need to leave enough
1099	* empty space here so that the incore attr fork will
1100	* get created (and hence trigger the attr fork bmap
1101	* repairer).
1102	*/
1103	afork_min = bmdr_minsz;
1104	} else {
1105	/ No extents on disk or found in rmapbt. /
1106	afork_min = `0`;
1107	}
1108	break;
1109	case XFS_DINODE_FMT_BTREE:
1110	/ Must have space for btree header and key/pointers. /
1111	bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
1112	afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
1113	break;
1114	default:
1115	/ We should never see any other formats. /
1116	afork_min = `0`;
1117	break;
1118	}
1119
1120	/ Compute the minimum space required for the data fork. /
1121	switch (dip->di_format) {
1122	case XFS_DINODE_FMT_DEV:
1123	dfork_min = sizeof(__be32);
1124	break;
1125	case XFS_DINODE_FMT_UUID:
1126	dfork_min = sizeof(uuid_t);
1127	break;
1128	case XFS_DINODE_FMT_LOCAL:
1129	/*
1130	* If we still have a shortform data fork at all, that means
1131	* the data fork area was large enough to fit whatever was in
1132	* there.
1133	*/
1134	dfork_min = be64_to_cpu(dip->di_size);
1135	break;
1136	case XFS_DINODE_FMT_EXTENTS:
1137	data_extents = xfs_dfork_data_extents(dip);
1138	if (data_extents) {
1139	/*
1140	* We must maintain sufficient space to hold the entire
1141	* extent map array in the data fork. Note that we
1142	* previously zapped the fork if it had no chance of
1143	* fitting in the inode.
1144	*/
1145	dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
1146	} else if (ri->data_extents > `0` \|\| ri->rt_extents > `0`) {
1147	/*
1148	* The data fork thinks it has zero extents, but we
1149	* found some data extents. We need to leave enough
1150	* empty space here so that the data fork bmap repair
1151	* will recover the mappings.
1152	*/
1153	dfork_min = bmdr_minsz;
1154	} else {
1155	/ No extents on disk or found in rmapbt. /
1156	dfork_min = `0`;
1157	}
1158	break;
1159	case XFS_DINODE_FMT_BTREE:
1160	/ Must have space for btree header and key/pointers. /
1161	bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
1162	dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
1163	break;
1164	default:
1165	dfork_min = `0`;
1166	break;
1167	}
1168
1169	/*
1170	* Round all values up to the nearest 8 bytes, because that is the
1171	* precision of di_forkoff.
1172	*/
1173	afork_min = roundup(afork_min, `8`);
1174	dfork_min = roundup(dfork_min, `8`);
1175	bmdr_minsz = roundup(bmdr_minsz, `8`);
1176
1177	ASSERT(dfork_min <= lit_sz);
1178	ASSERT(afork_min <= lit_sz);
1179
1180	/*
1181	* If the data fork was zapped and we don't have enough space for the
1182	* recovery fork, move the attr fork up.
1183	*/
1184	if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
1185	xfs_dfork_data_extents(dip) == `0` &&
1186	(ri->data_extents > `0` \|\| ri->rt_extents > `0`) &&
1187	bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
1188	if (bmdr_minsz + afork_min > lit_sz) {
1189	/*
1190	* The attr for and the stub fork we need to recover
1191	* the data fork won't both fit. Zap the attr fork.
1192	*/
1193	xrep_dinode_zap_afork(ri, dip, mode);
1194	afork_min = bmdr_minsz;
1195	} else {
1196	void before, after;
1197
1198	/ Otherwise, just slide the attr fork up. /
1199	before = XFS_DFORK_APTR(dip);
1200	dip->di_forkoff = bmdr_minsz >> `3`;
1201	after = XFS_DFORK_APTR(dip);
1202	memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
1203	}
1204	}
1205
1206	/*
1207	* If the attr fork was zapped and we don't have enough space for the
1208	* recovery fork, move the attr fork down.
1209	*/
1210	if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
1211	xfs_dfork_attr_extents(dip) == `0` &&
1212	ri->attr_extents > `0` &&
1213	bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
1214	if (dip->di_format == XFS_DINODE_FMT_BTREE) {
1215	/*
1216	* If the data fork is in btree format then we can't
1217	* adjust forkoff because that runs the risk of
1218	* violating the extents/btree format transition rules.
1219	*/
1220	} else if (bmdr_minsz + dfork_min > lit_sz) {
1221	/*
1222	* If we can't move the attr fork, too bad, we lose the
1223	* attr fork and leak its blocks.
1224	*/
1225	xrep_dinode_zap_afork(ri, dip, mode);
1226	} else {
1227	/*
1228	* Otherwise, just slide the attr fork down. The attr
1229	* fork is empty, so we don't have any old contents to
1230	* move here.
1231	*/
1232	dip->di_forkoff = (lit_sz - bmdr_minsz) >> `3`;
1233	}
1234	}
1235	}
1236
1237	/*
1238	* Zap the data/attr forks if we spot anything that isn't going to pass the
1239	* ifork verifiers or the ifork formatters, because we need to get the inode
1240	* into good enough shape that the higher level repair functions can run.
1241	*/
1242	STATIC void
1243	xrep_dinode_zap_forks(
1244	struct xrep_inode *ri,
1245	struct xfs_dinode *dip)
1246	{
1247	struct xfs_scrub *sc = ri->sc;
1248	xfs_extnum_t data_extents;
1249	xfs_extnum_t attr_extents;
1250	xfs_filblks_t nblocks;
1251	uint16_t mode;
1252	bool zap_datafork = false;
1253	bool zap_attrfork = ri->zap_acls;
1254
1255	trace_xrep_dinode_zap_forks(sc, dip);
1256
1257	mode = be16_to_cpu(dip->di_mode);
1258
1259	data_extents = xfs_dfork_data_extents(dip);
1260	attr_extents = xfs_dfork_attr_extents(dip);
1261	nblocks = be64_to_cpu(dip->di_nblocks);
1262
1263	/ Inode counters don't make sense? /
1264	if (data_extents > nblocks)
1265	zap_datafork = true;
1266	if (attr_extents > nblocks)
1267	zap_attrfork = true;
1268	if (data_extents + attr_extents > nblocks)
1269	zap_datafork = zap_attrfork = true;
1270
1271	if (!zap_datafork)
1272	zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
1273	if (!zap_attrfork)
1274	zap_attrfork = xrep_dinode_check_afork(sc, dip);
1275
1276	/ Zap whatever's bad. /
1277	if (zap_attrfork)
1278	xrep_dinode_zap_afork(ri, dip, mode);
1279	if (zap_datafork)
1280	xrep_dinode_zap_dfork(ri, dip, mode);
1281	xrep_dinode_ensure_forkoff(ri, dip, mode);
1282
1283	/*
1284	* Zero di_nblocks if we don't have any extents at all to satisfy the
1285	* buffer verifier.
1286	*/
1287	data_extents = xfs_dfork_data_extents(dip);
1288	attr_extents = xfs_dfork_attr_extents(dip);
1289	if (data_extents + attr_extents == `0`)
1290	dip->di_nblocks = `0`;
1291	}
1292
1293	/ Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. /
1294	STATIC int
1295	xrep_dinode_core(
1296	struct xrep_inode *ri)
1297	{
1298	struct xfs_scrub *sc = ri->sc;
1299	struct xfs_buf *bp;
1300	struct xfs_dinode *dip;
1301	xfs_ino_t ino = sc->sm->sm_ino;
1302	int error;
1303	int iget_error;
1304
1305	/ Figure out what this inode had mapped in both forks. /
1306	error = xrep_dinode_count_rmaps(ri);
1307	if (error)
1308	return error;
1309
1310	/ Read the inode cluster buffer. /
1311	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
1312	ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
1313	NULL);
1314	if (error)
1315	return error;
1316
1317	/ Make sure we can pass the inode buffer verifier. /
1318	xrep_dinode_buf(sc, bp);
1319	bp->b_ops = &xfs_inode_buf_ops;
1320
1321	/ Fix everything the verifier will complain about. /
1322	dip = xfs_buf_offset(bp, ri->imap.im_boffset);
1323	xrep_dinode_header(sc, dip);
1324	iget_error = xrep_dinode_mode(ri, dip);
1325	if (iget_error)
1326	goto write;
1327	xrep_dinode_flags(sc, dip, ri->rt_extents > `0`);
1328	xrep_dinode_size(ri, dip);
1329	xrep_dinode_extsize_hints(sc, dip);
1330	xrep_dinode_zap_forks(ri, dip);
1331
1332	write:
1333	/ Write out the inode. /
1334	trace_xrep_dinode_fixed(sc, dip);
1335	xfs_dinode_calc_crc(sc->mp, dip);
1336	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
1337	xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
1338	ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - `1`);
1339
1340	/*
1341	* In theory, we've fixed the ondisk inode record enough that we should
1342	* be able to load the inode into the cache. Try to iget that inode
1343	* now while we hold the AGI and the inode cluster buffer and take the
1344	* IOLOCK so that we can continue with repairs without anyone else
1345	* accessing the inode. If iget fails, we still need to commit the
1346	* changes.
1347	*/
1348	if (!iget_error)
1349	iget_error = xchk_iget(sc, ino, &sc->ip);
1350	if (!iget_error)
1351	xchk_ilock(sc, XFS_IOLOCK_EXCL);
1352
1353	/*
1354	* Commit the inode cluster buffer updates and drop the AGI buffer that
1355	* we've been holding since scrub setup. From here on out, repairs
1356	* deal only with the cached inode.
1357	*/
1358	error = xrep_trans_commit(sc);
1359	if (error)
1360	return error;
1361
1362	if (iget_error)
1363	return iget_error;
1364
1365	error = xchk_trans_alloc(sc, `0`);
1366	if (error)
1367	return error;
1368
1369	error = xrep_ino_dqattach(sc);
1370	if (error)
1371	return error;
1372
1373	xchk_ilock(sc, XFS_ILOCK_EXCL);
1374	if (ri->ino_sick_mask)
1375	xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
1376	return `0`;
1377	}
1378
1379	/ Fix everything xfs_dinode_verify cares about. /
1380	STATIC int
1381	xrep_dinode_problems(
1382	struct xrep_inode *ri)
1383	{
1384	struct xfs_scrub *sc = ri->sc;
1385	int error;
1386
1387	error = xrep_dinode_core(ri);
1388	if (error)
1389	return error;
1390
1391	/ We had to fix a totally busted inode, schedule quotacheck. /
1392	if (XFS_IS_UQUOTA_ON(sc->mp))
1393	xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1394	if (XFS_IS_GQUOTA_ON(sc->mp))
1395	xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1396	if (XFS_IS_PQUOTA_ON(sc->mp))
1397	xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1398
1399	return `0`;
1400	}
1401
1402	/*
1403	* Fix problems that the verifiers don't care about. In general these are
1404	* errors that don't cause problems elsewhere in the kernel that we can easily
1405	* detect, so we don't check them all that rigorously.
1406	*/
1407
1408	/ Make sure block and extent counts are ok. /
1409	STATIC int
1410	xrep_inode_blockcounts(
1411	struct xfs_scrub *sc)
1412	{
1413	struct xfs_ifork *ifp;
1414	xfs_filblks_t count;
1415	xfs_filblks_t acount;
1416	xfs_extnum_t nextents;
1417	int error;
1418
1419	trace_xrep_inode_blockcounts(sc);
1420
1421	/ Set data fork counters from the data fork mappings. /
1422	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
1423	&nextents, &count);
1424	if (error)
1425	return error;
1426	if (xfs_is_reflink_inode(sc->ip)) {
1427	/*
1428	* data fork blockcount can exceed physical storage if a user
1429	* reflinks the same block over and over again.
1430	*/
1431	;
1432	} else if (XFS_IS_REALTIME_INODE(sc->ip)) {
1433	if (count >= sc->mp->m_sb.sb_rblocks)
1434	return -EFSCORRUPTED;
1435	} else {
1436	if (count >= sc->mp->m_sb.sb_dblocks)
1437	return -EFSCORRUPTED;
1438	}
1439	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
1440	if (error)
1441	return error;
1442	sc->ip->i_df.if_nextents = nextents;
1443
1444	/ Set attr fork counters from the attr fork mappings. /
1445	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
1446	if (ifp) {
1447	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
1448	&nextents, &acount);
1449	if (error)
1450	return error;
1451	if (count >= sc->mp->m_sb.sb_dblocks)
1452	return -EFSCORRUPTED;
1453	error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
1454	nextents);
1455	if (error)
1456	return error;
1457	ifp->if_nextents = nextents;
1458	} else {
1459	acount = `0`;
1460	}
1461
1462	sc->ip->i_nblocks = count + acount;
1463	return `0`;
1464	}
1465
1466	/ Check for invalid uid/gid/prid. /
1467	STATIC void
1468	xrep_inode_ids(
1469	struct xfs_scrub *sc)
1470	{
1471	bool dirty = false;
1472
1473	trace_xrep_inode_ids(sc);
1474
1475	if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
1476	i_uid_write(VFS_I(sc->ip), `0`);
1477	dirty = true;
1478	if (XFS_IS_UQUOTA_ON(sc->mp))
1479	xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1480	}
1481
1482	if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
1483	i_gid_write(VFS_I(sc->ip), `0`);
1484	dirty = true;
1485	if (XFS_IS_GQUOTA_ON(sc->mp))
1486	xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1487	}
1488
1489	if (sc->ip->i_projid == -`1U`) {
1490	sc->ip->i_projid = `0`;
1491	dirty = true;
1492	if (XFS_IS_PQUOTA_ON(sc->mp))
1493	xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1494	}
1495
1496	/ strip setuid/setgid if we touched any of the ids /
1497	if (dirty)
1498	VFS_I(sc->ip)->i_mode &= ~(S_ISUID \| S_ISGID);
1499	}
1500
1501	static inline void
1502	xrep_clamp_timestamp(
1503	struct xfs_inode *ip,
1504	struct timespec64 *ts)
1505	{
1506	ts->tv_nsec = clamp_t(long, ts->tv_nsec, `0`, NSEC_PER_SEC);
1507	ts = timestamp_truncate(ts, VFS_I(ip));
1508	}
1509
1510	/ Nanosecond counters can't have more than 1 billion. /
1511	STATIC void
1512	xrep_inode_timestamps(
1513	struct xfs_inode *ip)
1514	{
1515	struct timespec64 tstamp;
1516	struct inode *inode = VFS_I(ip);
1517
1518	tstamp = inode_get_atime(inode);
1519	xrep_clamp_timestamp(ip, ts: &tstamp);
1520	inode_set_atime_to_ts(inode, tstamp);
1521
1522	tstamp = inode_get_mtime(inode);
1523	xrep_clamp_timestamp(ip, ts: &tstamp);
1524	inode_set_mtime_to_ts(inode, tstamp);
1525
1526	tstamp = inode_get_ctime(inode);
1527	xrep_clamp_timestamp(ip, ts: &tstamp);
1528	inode_set_ctime_to_ts(inode, tstamp);
1529
1530	xrep_clamp_timestamp(ip, ts: &ip->i_crtime);
1531	}
1532
1533	/ Fix inode flags that don't make sense together. /
1534	STATIC void
1535	xrep_inode_flags(
1536	struct xfs_scrub *sc)
1537	{
1538	uint16_t mode;
1539
1540	trace_xrep_inode_flags(sc);
1541
1542	mode = VFS_I(sc->ip)->i_mode;
1543
1544	/ Clear junk flags /
1545	if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
1546	sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
1547
1548	/ NEWRTBM only applies to realtime bitmaps /
1549	if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
1550	sc->ip->i_diflags \|= XFS_DIFLAG_NEWRTBM;
1551	else
1552	sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
1553
1554	/ These only make sense for directories. /
1555	if (!S_ISDIR(mode))
1556	sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT \|
1557	XFS_DIFLAG_EXTSZINHERIT \|
1558	XFS_DIFLAG_PROJINHERIT \|
1559	XFS_DIFLAG_NOSYMLINKS);
1560
1561	/ These only make sense for files. /
1562	if (!S_ISREG(mode))
1563	sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME \|
1564	XFS_DIFLAG_EXTSIZE);
1565
1566	/ These only make sense for non-rt files. /
1567	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1568	sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
1569
1570	/ Immutable and append only? Drop the append. /
1571	if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
1572	(sc->ip->i_diflags & XFS_DIFLAG_APPEND))
1573	sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
1574
1575	/ Clear junk flags. /
1576	if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
1577	sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
1578
1579	/ No reflink flag unless we support it and it's a file. /
1580	if (!xfs_has_reflink(sc->mp) \|\| !S_ISREG(mode))
1581	sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1582
1583	/ DAX only applies to files and dirs. /
1584	if (!(S_ISREG(mode) \|\| S_ISDIR(mode)))
1585	sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
1586
1587	/ No reflink files on the realtime device. /
1588	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1589	sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1590	}
1591
1592	/*
1593	* Fix size problems with block/node format directories. If we fail to find
1594	* the extent list, just bail out and let the bmapbtd repair functions clean
1595	* up that mess.
1596	*/
1597	STATIC void
1598	xrep_inode_blockdir_size(
1599	struct xfs_scrub *sc)
1600	{
1601	struct xfs_iext_cursor icur;
1602	struct xfs_bmbt_irec got;
1603	struct xfs_ifork *ifp;
1604	xfs_fileoff_t off;
1605	int error;
1606
1607	trace_xrep_inode_blockdir_size(sc);
1608
1609	error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
1610	if (error)
1611	return;
1612
1613	/ Find the last block before 32G; this is the dir size. /
1614	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1615	off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
1616	if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
1617	/ zero-extents directory? /
1618	return;
1619	}
1620
1621	off = got.br_startoff + got.br_blockcount;
1622	sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
1623	XFS_FSB_TO_B(sc->mp, off));
1624	}
1625
1626	/ Fix size problems with short format directories. /
1627	STATIC void
1628	xrep_inode_sfdir_size(
1629	struct xfs_scrub *sc)
1630	{
1631	struct xfs_ifork *ifp;
1632
1633	trace_xrep_inode_sfdir_size(sc);
1634
1635	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1636	sc->ip->i_disk_size = ifp->if_bytes;
1637	}
1638
1639	/*
1640	* Fix any irregularities in a directory inode's size now that we can iterate
1641	* extent maps and access other regular inode data.
1642	*/
1643	STATIC void
1644	xrep_inode_dir_size(
1645	struct xfs_scrub *sc)
1646	{
1647	trace_xrep_inode_dir_size(sc);
1648
1649	switch (sc->ip->i_df.if_format) {
1650	case XFS_DINODE_FMT_EXTENTS:
1651	case XFS_DINODE_FMT_BTREE:
1652	xrep_inode_blockdir_size(sc);
1653	break;
1654	case XFS_DINODE_FMT_LOCAL:
1655	xrep_inode_sfdir_size(sc);
1656	break;
1657	}
1658	}
1659
1660	/ Fix extent size hint problems. /
1661	STATIC void
1662	xrep_inode_extsize(
1663	struct xfs_scrub *sc)
1664	{
1665	/ Fix misaligned extent size hints on a directory. /
1666	if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
1667	(sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
1668	xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > `0`) {
1669	sc->ip->i_extsize = `0`;
1670	sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
1671	}
1672	}
1673
1674	/ Fix any irregularities in an inode that the verifiers don't catch. /
1675	STATIC int
1676	xrep_inode_problems(
1677	struct xfs_scrub *sc)
1678	{
1679	int error;
1680
1681	error = xrep_inode_blockcounts(sc);
1682	if (error)
1683	return error;
1684	xrep_inode_timestamps(sc->ip);
1685	xrep_inode_flags(sc);
1686	xrep_inode_ids(sc);
1687	/*
1688	* We can now do a better job fixing the size of a directory now that
1689	* we can scan the data fork extents than we could in xrep_dinode_size.
1690	*/
1691	if (S_ISDIR(VFS_I(sc->ip)->i_mode))
1692	xrep_inode_dir_size(sc);
1693	xrep_inode_extsize(sc);
1694
1695	trace_xrep_inode_fixed(sc);
1696	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
1697	return xrep_roll_trans(sc);
1698	}
1699
1700	/ Repair an inode's fields. /
1701	int
1702	xrep_inode(
1703	struct xfs_scrub *sc)
1704	{
1705	int error = `0`;
1706
1707	/*
1708	* No inode? That means we failed the _iget verifiers. Repair all
1709	* the things that the inode verifiers care about, then retry _iget.
1710	*/
1711	if (!sc->ip) {
1712	struct xrep_inode *ri = sc->buf;
1713
1714	ASSERT(ri != NULL);
1715
1716	error = xrep_dinode_problems(ri);
1717	if (error == -EBUSY) {
1718	/*
1719	* Directory scan to recover inode mode encountered a
1720	* busy inode, so we did not continue repairing things.
1721	*/
1722	return `0`;
1723	}
1724	if (error)
1725	return error;
1726
1727	/ By this point we had better have a working incore inode. /
1728	if (!sc->ip)
1729	return -EFSCORRUPTED;
1730	}
1731
1732	xfs_trans_ijoin(sc->tp, sc->ip, `0`);
1733
1734	/ If we found corruption of any kind, try to fix it. /
1735	if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) \|\|
1736	(sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
1737	error = xrep_inode_problems(sc);
1738	if (error)
1739	return error;
1740	}
1741
1742	/ See if we can clear the reflink flag. /
1743	if (xfs_is_reflink_inode(sc->ip)) {
1744	error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1745	if (error)
1746	return error;
1747	}
1748
1749	return xrep_defer_finish(sc);
1750	}
1751

source code of linux/fs/xfs/scrub/inode_repair.c