xfs_log_recover.c source code [linux/fs/xfs/xfs_log_recover.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2006 Silicon Graphics, Inc.
4	* All Rights Reserved.
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_log_format.h"
11	#include "xfs_trans_resv.h"
12	#include "xfs_bit.h"
13	#include "xfs_sb.h"
14	#include "xfs_mount.h"
15	#include "xfs_defer.h"
16	#include "xfs_inode.h"
17	#include "xfs_trans.h"
18	#include "xfs_log.h"
19	#include "xfs_log_priv.h"
20	#include "xfs_log_recover.h"
21	#include "xfs_trans_priv.h"
22	#include "xfs_alloc.h"
23	#include "xfs_ialloc.h"
24	#include "xfs_trace.h"
25	#include "xfs_icache.h"
26	#include "xfs_error.h"
27	#include "xfs_buf_item.h"
28	#include "xfs_ag.h"
29	#include "xfs_quota.h"
30	#include "xfs_reflink.h"
31
/*
 * Average of two block numbers (their sum shifted right by one). Both
 * arguments are parenthesized so that expression arguments keep their
 * intended precedence.
 */
#define BLK_AVG(blk1, blk2)	(((blk1) + (blk2)) >> 1)
33
34	STATIC int
35	xlog_find_zeroed(
36	struct xlog *,
37	xfs_daddr_t *);
38	STATIC int
39	xlog_clear_stale_blocks(
40	struct xlog *,
41	xfs_lsn_t);
42	STATIC int
43	xlog_do_recovery_pass(
44	struct xlog , xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t );
45
/*
 * Sector aligned buffer routines for buffer create/read/write/access
 */
49
50	/*
51	* Verify the log-relative block number and length in basic blocks are valid for
52	* an operation involving the given XFS log buffer. Returns true if the fields
53	* are valid, false otherwise.
54	*/
55	static inline bool
56	xlog_verify_bno(
57	struct xlog *log,
58	xfs_daddr_t blk_no,
59	int bbcount)
60	{
61	if (blk_no < `0` \|\| blk_no >= log->l_logBBsize)
62	return false;
63	if (bbcount <= `0` \|\| (blk_no + bbcount) > log->l_logBBsize)
64	return false;
65	return true;
66	}
67
68	/*
69	* Allocate a buffer to hold log data. The buffer needs to be able to map to
70	* a range of nbblks basic blocks at any valid offset within the log.
71	*/
72	static char *
73	xlog_alloc_buffer(
74	struct xlog *log,
75	int nbblks)
76	{
77	/*
78	* Pass log block 0 since we don't have an addr yet, buffer will be
79	* verified on read.
80	*/
81	if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, `0`, nbblks))) {
82	xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
83	nbblks);
84	return NULL;
85	}
86
87	/*
88	* We do log I/O in units of log sectors (a power-of-2 multiple of the
89	* basic block size), so we round up the requested size to accommodate
90	* the basic blocks required for complete log sectors.
91	*
92	* In addition, the buffer may be used for a non-sector-aligned block
93	* offset, in which case an I/O of the requested size could extend
94	* beyond the end of the buffer. If the requested size is only 1 basic
95	* block it will never straddle a sector boundary, so this won't be an
96	* issue. Nor will this be a problem if the log I/O is done in basic
97	* blocks (sector size 1). But otherwise we extend the buffer by one
98	* extra log sector to ensure there's space to accommodate this
99	* possibility.
100	*/
101	if (nbblks > `1` && log->l_sectBBsize > `1`)
102	nbblks += log->l_sectBBsize;
103	nbblks = round_up(nbblks, log->l_sectBBsize);
104	return kvzalloc(size: BBTOB(nbblks), GFP_KERNEL \| __GFP_RETRY_MAYFAIL);
105	}
106
107	/*
108	* Return the address of the start of the given block number's data
109	* in a log buffer. The buffer covers a log sector-aligned region.
110	*/
111	static inline unsigned int
112	xlog_align(
113	struct xlog *log,
114	xfs_daddr_t blk_no)
115	{
116	return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - `1`));
117	}
118
119	static int
120	xlog_do_io(
121	struct xlog *log,
122	xfs_daddr_t blk_no,
123	unsigned int nbblks,
124	char *data,
125	enum req_op op)
126	{
127	int error;
128
129	if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
130	xfs_warn(log->l_mp,
131	"Invalid log block/length (0x%llx, 0x%x) for buffer",
132	blk_no, nbblks);
133	return -EFSCORRUPTED;
134	}
135
136	blk_no = round_down(blk_no, log->l_sectBBsize);
137	nbblks = round_up(nbblks, log->l_sectBBsize);
138	ASSERT(nbblks > `0`);
139
140	error = xfs_rw_bdev(bdev: log->l_targ->bt_bdev, sector: log->l_logBBstart + blk_no,
141	count: BBTOB(nbblks), data, op);
142	if (error && !xlog_is_shutdown(log)) {
143	xfs_alert(log->l_mp,
144	"log recovery %s I/O error at daddr 0x%llx len %d error %d",
145	op == REQ_OP_WRITE ? "write" : "read",
146	blk_no, nbblks, error);
147	}
148	return error;
149	}
150
151	STATIC int
152	xlog_bread_noalign(
153	struct xlog *log,
154	xfs_daddr_t blk_no,
155	int nbblks,
156	char *data)
157	{
158	return xlog_do_io(log, blk_no, nbblks, data, op: REQ_OP_READ);
159	}
160
161	STATIC int
162	xlog_bread(
163	struct xlog *log,
164	xfs_daddr_t blk_no,
165	int nbblks,
166	char *data,
167	char **offset)
168	{
169	int error;
170
171	error = xlog_do_io(log, blk_no, nbblks, data, op: REQ_OP_READ);
172	if (!error)
173	*offset = data + xlog_align(log, blk_no);
174	return error;
175	}
176
177	STATIC int
178	xlog_bwrite(
179	struct xlog *log,
180	xfs_daddr_t blk_no,
181	int nbblks,
182	char *data)
183	{
184	return xlog_do_io(log, blk_no, nbblks, data, op: REQ_OP_WRITE);
185	}
186
#ifdef DEBUG
/*
 * dump debug superblock and log record information
 *
 * Prints the superblock uuid and expected log format next to the uuid and
 * format found in the log record header.
 */
STATIC void
xlog_header_check_dump(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
	xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
/* Non-debug builds compile the dump away entirely. */
#define xlog_header_check_dump(mp, head)
#endif
204
205	/*
206	* check log record header for recovery
207	*/
208	STATIC int
209	xlog_header_check_recover(
210	xfs_mount_t *mp,
211	xlog_rec_header_t *head)
212	{
213	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
214
215	/*
216	* IRIX doesn't write the h_fmt field and leaves it zeroed
217	* (XLOG_FMT_UNKNOWN). This stops us from trying to recover
218	* a dirty log created in IRIX.
219	*/
220	if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
221	xfs_warn(mp,
222	"dirty log written in incompatible format - can't recover");
223	xlog_header_check_dump(mp, head);
224	return -EFSCORRUPTED;
225	}
226	if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
227	&head->h_fs_uuid))) {
228	xfs_warn(mp,
229	"dirty log entry has mismatched uuid - can't recover");
230	xlog_header_check_dump(mp, head);
231	return -EFSCORRUPTED;
232	}
233	return `0`;
234	}
235
236	/*
237	* read the head block of the log and check the header
238	*/
239	STATIC int
240	xlog_header_check_mount(
241	xfs_mount_t *mp,
242	xlog_rec_header_t *head)
243	{
244	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
245
246	if (uuid_is_null(uuid: &head->h_fs_uuid)) {
247	/*
248	* IRIX doesn't write the h_fs_uuid or h_fmt fields. If
249	* h_fs_uuid is null, we assume this log was last mounted
250	* by IRIX and continue.
251	*/
252	xfs_warn(mp, "null uuid in log - IRIX style log");
253	} else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
254	&head->h_fs_uuid))) {
255	xfs_warn(mp, "log has mismatched uuid - can't recover");
256	xlog_header_check_dump(mp, head);
257	return -EFSCORRUPTED;
258	}
259	return `0`;
260	}
261
262	/*
263	* This routine finds (to an approximation) the first block in the physical
264	* log which contains the given cycle. It uses a binary search algorithm.
265	* Note that the algorithm can not be perfect because the disk will not
266	* necessarily be perfect.
267	*/
268	STATIC int
269	xlog_find_cycle_start(
270	struct xlog *log,
271	char *buffer,
272	xfs_daddr_t first_blk,
273	xfs_daddr_t *last_blk,
274	uint cycle)
275	{
276	char *offset;
277	xfs_daddr_t mid_blk;
278	xfs_daddr_t end_blk;
279	uint mid_cycle;
280	int error;
281
282	end_blk = *last_blk;
283	mid_blk = BLK_AVG(first_blk, end_blk);
284	while (mid_blk != first_blk && mid_blk != end_blk) {
285	error = xlog_bread(log, blk_no: mid_blk, nbblks: `1`, data: buffer, offset: &offset);
286	if (error)
287	return error;
288	mid_cycle = xlog_get_cycle(offset);
289	if (mid_cycle == cycle)
290	end_blk = mid_blk; / last_half_cycle == mid_cycle /
291	else
292	first_blk = mid_blk; / first_half_cycle == mid_cycle /
293	mid_blk = BLK_AVG(first_blk, end_blk);
294	}
295	ASSERT((mid_blk == first_blk && mid_blk+`1` == end_blk) \|\|
296	(mid_blk == end_blk && mid_blk-`1` == first_blk));
297
298	*last_blk = end_blk;
299
300	return `0`;
301	}
302
303	/*
304	* Check that a range of blocks does not contain stop_on_cycle_no.
305	* Fill in *new_blk with the block offset where such a block is
306	* found, or with -1 (an invalid block number) if there is no such
307	* block in the range. The scan needs to occur from front to back
308	* and the pointer into the region must be updated since a later
309	* routine will need to perform another test.
310	*/
311	STATIC int
312	xlog_find_verify_cycle(
313	struct xlog *log,
314	xfs_daddr_t start_blk,
315	int nbblks,
316	uint stop_on_cycle_no,
317	xfs_daddr_t *new_blk)
318	{
319	xfs_daddr_t i, j;
320	uint cycle;
321	char *buffer;
322	xfs_daddr_t bufblks;
323	char *buf = NULL;
324	int error = `0`;
325
326	/*
327	* Greedily allocate a buffer big enough to handle the full
328	* range of basic blocks we'll be examining. If that fails,
329	* try a smaller size. We need to be able to read at least
330	* a log sector, or we're out of luck.
331	*/
332	bufblks = roundup_pow_of_two(nbblks);
333	while (bufblks > log->l_logBBsize)
334	bufblks >>= `1`;
335	while (!(buffer = xlog_alloc_buffer(log, nbblks: bufblks))) {
336	bufblks >>= `1`;
337	if (bufblks < log->l_sectBBsize)
338	return -ENOMEM;
339	}
340
341	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
342	int bcount;
343
344	bcount = min(bufblks, (start_blk + nbblks - i));
345
346	error = xlog_bread(log, blk_no: i, nbblks: bcount, data: buffer, offset: &buf);
347	if (error)
348	goto out;
349
350	for (j = `0`; j < bcount; j++) {
351	cycle = xlog_get_cycle(buf);
352	if (cycle == stop_on_cycle_no) {
353	*new_blk = i+j;
354	goto out;
355	}
356
357	buf += BBSIZE;
358	}
359	}
360
361	*new_blk = -`1`;
362
363	out:
364	kvfree(addr: buffer);
365	return error;
366	}
367
368	static inline int
369	xlog_logrec_hblks(struct xlog log, struct* xlog_rec_header *rh)
370	{
371	if (xfs_has_logv2(mp: log->l_mp)) {
372	int h_size = be32_to_cpu(rh->h_size);
373
374	if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
375	h_size > XLOG_HEADER_CYCLE_SIZE)
376	return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
377	}
378	return `1`;
379	}
380
381	/*
382	* Potentially backup over partial log record write.
383	*
384	* In the typical case, last_blk is the number of the block directly after
385	* a good log record. Therefore, we subtract one to get the block number
386	* of the last block in the given buffer. extra_bblks contains the number
387	* of blocks we would have read on a previous read. This happens when the
388	* last log record is split over the end of the physical log.
389	*
390	* extra_bblks is the number of blocks potentially verified on a previous
391	* call to this routine.
392	*/
393	STATIC int
394	xlog_find_verify_log_record(
395	struct xlog *log,
396	xfs_daddr_t start_blk,
397	xfs_daddr_t *last_blk,
398	int extra_bblks)
399	{
400	xfs_daddr_t i;
401	char *buffer;
402	char *offset = NULL;
403	xlog_rec_header_t *head = NULL;
404	int error = `0`;
405	int smallmem = `0`;
406	int num_blks = *last_blk - start_blk;
407	int xhdrs;
408
409	ASSERT(start_blk != `0` \|\| *last_blk != start_blk);
410
411	buffer = xlog_alloc_buffer(log, nbblks: num_blks);
412	if (!buffer) {
413	buffer = xlog_alloc_buffer(log, nbblks: `1`);
414	if (!buffer)
415	return -ENOMEM;
416	smallmem = `1`;
417	} else {
418	error = xlog_bread(log, blk_no: start_blk, nbblks: num_blks, data: buffer, offset: &offset);
419	if (error)
420	goto out;
421	offset += ((num_blks - `1`) << BBSHIFT);
422	}
423
424	for (i = (*last_blk) - `1`; i >= `0`; i--) {
425	if (i < start_blk) {
426	/ valid log record not found /
427	xfs_warn(log->l_mp,
428	"Log inconsistent (didn't find previous header)");
429	ASSERT(`0`);
430	error = -EFSCORRUPTED;
431	goto out;
432	}
433
434	if (smallmem) {
435	error = xlog_bread(log, blk_no: i, nbblks: `1`, data: buffer, offset: &offset);
436	if (error)
437	goto out;
438	}
439
440	head = (xlog_rec_header_t *)offset;
441
442	if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
443	break;
444
445	if (!smallmem)
446	offset -= BBSIZE;
447	}
448
449	/*
450	* We hit the beginning of the physical log & still no header. Return
451	* to caller. If caller can handle a return of -1, then this routine
452	* will be called again for the end of the physical log.
453	*/
454	if (i == -`1`) {
455	error = `1`;
456	goto out;
457	}
458
459	/*
460	* We have the final block of the good log (the first block
461	* of the log record _before_ the head. So we check the uuid.
462	*/
463	if ((error = xlog_header_check_mount(log->l_mp, head)))
464	goto out;
465
466	/*
467	* We may have found a log record header before we expected one.
468	* last_blk will be the 1st block # with a given cycle #. We may end
469	* up reading an entire log record. In this case, we don't want to
470	* reset last_blk. Only when last_blk points in the middle of a log
471	* record do we update last_blk.
472	*/
473	xhdrs = xlog_logrec_hblks(log, head);
474
475	if (*last_blk - i + extra_bblks !=
476	BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
477	*last_blk = i;
478
479	out:
480	kvfree(addr: buffer);
481	return error;
482	}
483
484	/*
485	* Head is defined to be the point of the log where the next log write
486	* could go. This means that incomplete LR writes at the end are
487	* eliminated when calculating the head. We aren't guaranteed that previous
488	* LR have complete transactions. We only know that a cycle number of
489	* current cycle number -1 won't be present in the log if we start writing
490	* from our current block number.
491	*
492	* last_blk contains the block number of the first block with a given
493	* cycle number.
494	*
495	* Return: zero if normal, non-zero if error.
496	*/
497	STATIC int
498	xlog_find_head(
499	struct xlog *log,
500	xfs_daddr_t *return_head_blk)
501	{
502	char *buffer;
503	char *offset;
504	xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
505	int num_scan_bblks;
506	uint first_half_cycle, last_half_cycle;
507	uint stop_on_cycle;
508	int error, log_bbnum = log->l_logBBsize;
509
510	/ Is the end of the log device zeroed? /
511	error = xlog_find_zeroed(log, &first_blk);
512	if (error < `0`) {
513	xfs_warn(log->l_mp, "empty log check failed");
514	return error;
515	}
516	if (error == `1`) {
517	*return_head_blk = first_blk;
518
519	/ Is the whole lot zeroed? /
520	if (!first_blk) {
521	/ Linux XFS shouldn't generate totally zeroed logs -*
522	* mkfs etc write a dummy unmount record to a fresh
523	* log so we can store the uuid in there
524	*/
525	xfs_warn(log->l_mp, "totally zeroed log");
526	}
527
528	return `0`;
529	}
530
531	first_blk = `0`; / get cycle # of 1st block /
532	buffer = xlog_alloc_buffer(log, nbblks: `1`);
533	if (!buffer)
534	return -ENOMEM;
535
536	error = xlog_bread(log, blk_no: `0`, nbblks: `1`, data: buffer, offset: &offset);
537	if (error)
538	goto out_free_buffer;
539
540	first_half_cycle = xlog_get_cycle(offset);
541
542	last_blk = head_blk = log_bbnum - `1`; / get cycle # of last block /
543	error = xlog_bread(log, blk_no: last_blk, nbblks: `1`, data: buffer, offset: &offset);
544	if (error)
545	goto out_free_buffer;
546
547	last_half_cycle = xlog_get_cycle(offset);
548	ASSERT(last_half_cycle != `0`);
549
550	/*
551	* If the 1st half cycle number is equal to the last half cycle number,
552	* then the entire log is stamped with the same cycle number. In this
553	* case, head_blk can't be set to zero (which makes sense). The below
554	* math doesn't work out properly with head_blk equal to zero. Instead,
555	* we set it to log_bbnum which is an invalid block number, but this
556	* value makes the math correct. If head_blk doesn't changed through
557	* all the tests below, *head_blk is set to zero at the very end rather
558	* than log_bbnum. In a sense, log_bbnum and zero are the same block
559	* in a circular file.
560	*/
561	if (first_half_cycle == last_half_cycle) {
562	/*
563	* In this case we believe that the entire log should have
564	* cycle number last_half_cycle. We need to scan backwards
565	* from the end verifying that there are no holes still
566	* containing last_half_cycle - 1. If we find such a hole,
567	* then the start of that hole will be the new head. The
568	* simple case looks like
569	* x \| x ... \| x - 1 \| x
570	* Another case that fits this picture would be
571	* x \| x + 1 \| x ... \| x
572	* In this case the head really is somewhere at the end of the
573	* log, as one of the latest writes at the beginning was
574	* incomplete.
575	* One more case is
576	* x \| x + 1 \| x ... \| x - 1 \| x
577	* This is really the combination of the above two cases, and
578	* the head has to end up at the start of the x-1 hole at the
579	* end of the log.
580	*
581	* In the 256k log case, we will read from the beginning to the
582	* end of the log and search for cycle numbers equal to x-1.
583	* We don't worry about the x+1 blocks that we encounter,
584	* because we know that they cannot be the head since the log
585	* started with x.
586	*/
587	head_blk = log_bbnum;
588	stop_on_cycle = last_half_cycle - `1`;
589	} else {
590	/*
591	* In this case we want to find the first block with cycle
592	* number matching last_half_cycle. We expect the log to be
593	* some variation on
594	* x + 1 ... \| x ... \| x
595	* The first block with cycle number x (last_half_cycle) will
596	* be where the new head belongs. First we do a binary search
597	* for the first occurrence of last_half_cycle. The binary
598	* search may not be totally accurate, so then we scan back
599	* from there looking for occurrences of last_half_cycle before
600	* us. If that backwards scan wraps around the beginning of
601	* the log, then we look for occurrences of last_half_cycle - 1
602	* at the end of the log. The cases we're looking for look
603	* like
604	* v binary search stopped here
605	* x + 1 ... \| x \| x + 1 \| x ... \| x
606	* ^ but we want to locate this spot
607	* or
608	* <---------> less than scan distance
609	* x + 1 ... \| x ... \| x - 1 \| x
610	* ^ we want to locate this spot
611	*/
612	stop_on_cycle = last_half_cycle;
613	error = xlog_find_cycle_start(log, buffer, first_blk, last_blk: &head_blk,
614	cycle: last_half_cycle);
615	if (error)
616	goto out_free_buffer;
617	}
618
619	/*
620	* Now validate the answer. Scan back some number of maximum possible
621	* blocks and make sure each one has the expected cycle number. The
622	* maximum is determined by the total possible amount of buffering
623	* in the in-core log. The following number can be made tighter if
624	* we actually look at the block size of the filesystem.
625	*/
626	num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
627	if (head_blk >= num_scan_bblks) {
628	/*
629	* We are guaranteed that the entire check can be performed
630	* in one buffer.
631	*/
632	start_blk = head_blk - num_scan_bblks;
633	if ((error = xlog_find_verify_cycle(log,
634	start_blk, nbblks: num_scan_bblks,
635	stop_on_cycle_no: stop_on_cycle, new_blk: &new_blk)))
636	goto out_free_buffer;
637	if (new_blk != -`1`)
638	head_blk = new_blk;
639	} else { / need to read 2 parts of log /
640	/*
641	* We are going to scan backwards in the log in two parts.
642	* First we scan the physical end of the log. In this part
643	* of the log, we are looking for blocks with cycle number
644	* last_half_cycle - 1.
645	* If we find one, then we know that the log starts there, as
646	* we've found a hole that didn't get written in going around
647	* the end of the physical log. The simple case for this is
648	* x + 1 ... \| x ... \| x - 1 \| x
649	* <---------> less than scan distance
650	* If all of the blocks at the end of the log have cycle number
651	* last_half_cycle, then we check the blocks at the start of
652	* the log looking for occurrences of last_half_cycle. If we
653	* find one, then our current estimate for the location of the
654	* first occurrence of last_half_cycle is wrong and we move
655	* back to the hole we've found. This case looks like
656	* x + 1 ... \| x \| x + 1 \| x ...
657	* ^ binary search stopped here
658	* Another case we need to handle that only occurs in 256k
659	* logs is
660	* x + 1 ... \| x ... \| x+1 \| x ...
661	* ^ binary search stops here
662	* In a 256k log, the scan at the end of the log will see the
663	* x + 1 blocks. We need to skip past those since that is
664	* certainly not the head of the log. By searching for
665	* last_half_cycle-1 we accomplish that.
666	*/
667	ASSERT(head_blk <= INT_MAX &&
668	(xfs_daddr_t) num_scan_bblks >= head_blk);
669	start_blk = log_bbnum - (num_scan_bblks - head_blk);
670	if ((error = xlog_find_verify_cycle(log, start_blk,
671	nbblks: num_scan_bblks - (int)head_blk,
672	stop_on_cycle_no: (stop_on_cycle - `1`), new_blk: &new_blk)))
673	goto out_free_buffer;
674	if (new_blk != -`1`) {
675	head_blk = new_blk;
676	goto validate_head;
677	}
678
679	/*
680	* Scan beginning of log now. The last part of the physical
681	* log is good. This scan needs to verify that it doesn't find
682	* the last_half_cycle.
683	*/
684	start_blk = `0`;
685	ASSERT(head_blk <= INT_MAX);
686	if ((error = xlog_find_verify_cycle(log,
687	start_blk, nbblks: (int)head_blk,
688	stop_on_cycle_no: stop_on_cycle, new_blk: &new_blk)))
689	goto out_free_buffer;
690	if (new_blk != -`1`)
691	head_blk = new_blk;
692	}
693
694	validate_head:
695	/*
696	* Now we need to make sure head_blk is not pointing to a block in
697	* the middle of a log record.
698	*/
699	num_scan_bblks = XLOG_REC_SHIFT(log);
700	if (head_blk >= num_scan_bblks) {
701	start_blk = head_blk - num_scan_bblks; / don't read head_blk /
702
703	/ start ptr at last block ptr before head_blk /
704	error = xlog_find_verify_log_record(log, start_blk, last_blk: &head_blk, extra_bblks: `0`);
705	if (error == `1`)
706	error = -EIO;
707	if (error)
708	goto out_free_buffer;
709	} else {
710	start_blk = `0`;
711	ASSERT(head_blk <= INT_MAX);
712	error = xlog_find_verify_log_record(log, start_blk, last_blk: &head_blk, extra_bblks: `0`);
713	if (error < `0`)
714	goto out_free_buffer;
715	if (error == `1`) {
716	/ We hit the beginning of the log during our search /
717	start_blk = log_bbnum - (num_scan_bblks - head_blk);
718	new_blk = log_bbnum;
719	ASSERT(start_blk <= INT_MAX &&
720	(xfs_daddr_t) log_bbnum-start_blk >= `0`);
721	ASSERT(head_blk <= INT_MAX);
722	error = xlog_find_verify_log_record(log, start_blk,
723	last_blk: &new_blk, extra_bblks: (int)head_blk);
724	if (error == `1`)
725	error = -EIO;
726	if (error)
727	goto out_free_buffer;
728	if (new_blk != log_bbnum)
729	head_blk = new_blk;
730	} else if (error)
731	goto out_free_buffer;
732	}
733
734	kvfree(addr: buffer);
735	if (head_blk == log_bbnum)
736	*return_head_blk = `0`;
737	else
738	*return_head_blk = head_blk;
739	/*
740	* When returning here, we have a good block number. Bad block
741	* means that during a previous crash, we didn't have a clean break
742	* from cycle number N to cycle number N-1. In this case, we need
743	* to find the first block with cycle number N-1.
744	*/
745	return `0`;
746
747	out_free_buffer:
748	kvfree(addr: buffer);
749	if (error)
750	xfs_warn(log->l_mp, "failed to find log head");
751	return error;
752	}
753
754	/*
755	* Seek backwards in the log for log record headers.
756	*
757	* Given a starting log block, walk backwards until we find the provided number
758	* of records or hit the provided tail block. The return value is the number of
759	* records encountered or a negative error code. The log block and buffer
760	* pointer of the last record seen are returned in rblk and rhead respectively.
761	*/
762	STATIC int
763	xlog_rseek_logrec_hdr(
764	struct xlog *log,
765	xfs_daddr_t head_blk,
766	xfs_daddr_t tail_blk,
767	int count,
768	char *buffer,
769	xfs_daddr_t *rblk,
770	struct xlog_rec_header **rhead,
771	bool *wrapped)
772	{
773	int i;
774	int error;
775	int found = `0`;
776	char *offset = NULL;
777	xfs_daddr_t end_blk;
778
779	*wrapped = false;
780
781	/*
782	* Walk backwards from the head block until we hit the tail or the first
783	* block in the log.
784	*/
785	end_blk = head_blk > tail_blk ? tail_blk : `0`;
786	for (i = (int) head_blk - `1`; i >= end_blk; i--) {
787	error = xlog_bread(log, blk_no: i, nbblks: `1`, data: buffer, offset: &offset);
788	if (error)
789	goto out_error;
790
791	if ((__be32 ) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
792	*rblk = i;
793	rhead = (struct* xlog_rec_header *) offset;
794	if (++found == count)
795	break;
796	}
797	}
798
799	/*
800	* If we haven't hit the tail block or the log record header count,
801	* start looking again from the end of the physical log. Note that
802	* callers can pass head == tail if the tail is not yet known.
803	*/
804	if (tail_blk >= head_blk && found != count) {
805	for (i = log->l_logBBsize - `1`; i >= (int) tail_blk; i--) {
806	error = xlog_bread(log, blk_no: i, nbblks: `1`, data: buffer, offset: &offset);
807	if (error)
808	goto out_error;
809
810	if ((__be32 )offset ==
811	cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
812	*wrapped = true;
813	*rblk = i;
814	rhead = (struct* xlog_rec_header *) offset;
815	if (++found == count)
816	break;
817	}
818	}
819	}
820
821	return found;
822
823	out_error:
824	return error;
825	}
826
827	/*
828	* Seek forward in the log for log record headers.
829	*
830	* Given head and tail blocks, walk forward from the tail block until we find
831	* the provided number of records or hit the head block. The return value is the
832	* number of records encountered or a negative error code. The log block and
833	* buffer pointer of the last record seen are returned in rblk and rhead
834	* respectively.
835	*/
836	STATIC int
837	xlog_seek_logrec_hdr(
838	struct xlog *log,
839	xfs_daddr_t head_blk,
840	xfs_daddr_t tail_blk,
841	int count,
842	char *buffer,
843	xfs_daddr_t *rblk,
844	struct xlog_rec_header **rhead,
845	bool *wrapped)
846	{
847	int i;
848	int error;
849	int found = `0`;
850	char *offset = NULL;
851	xfs_daddr_t end_blk;
852
853	*wrapped = false;
854
855	/*
856	* Walk forward from the tail block until we hit the head or the last
857	* block in the log.
858	*/
859	end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - `1`;
860	for (i = (int) tail_blk; i <= end_blk; i++) {
861	error = xlog_bread(log, blk_no: i, nbblks: `1`, data: buffer, offset: &offset);
862	if (error)
863	goto out_error;
864
865	if ((__be32 ) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
866	*rblk = i;
867	rhead = (struct* xlog_rec_header *) offset;
868	if (++found == count)
869	break;
870	}
871	}
872
873	/*
874	* If we haven't hit the head block or the log record header count,
875	* start looking again from the start of the physical log.
876	*/
877	if (tail_blk > head_blk && found != count) {
878	for (i = `0`; i < (int) head_blk; i++) {
879	error = xlog_bread(log, blk_no: i, nbblks: `1`, data: buffer, offset: &offset);
880	if (error)
881	goto out_error;
882
883	if ((__be32 )offset ==
884	cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
885	*wrapped = true;
886	*rblk = i;
887	rhead = (struct* xlog_rec_header *) offset;
888	if (++found == count)
889	break;
890	}
891	}
892	}
893
894	return found;
895
896	out_error:
897	return error;
898	}
899
900	/*
901	* Calculate distance from head to tail (i.e., unused space in the log).
902	*/
903	static inline int
904	xlog_tail_distance(
905	struct xlog *log,
906	xfs_daddr_t head_blk,
907	xfs_daddr_t tail_blk)
908	{
909	if (head_blk < tail_blk)
910	return tail_blk - head_blk;
911
912	return tail_blk + (log->l_logBBsize - head_blk);
913	}
914
915	/*
916	* Verify the log tail. This is particularly important when torn or incomplete
917	* writes have been detected near the front of the log and the head has been
918	* walked back accordingly.
919	*
920	* We also have to handle the case where the tail was pinned and the head
921	* blocked behind the tail right before a crash. If the tail had been pushed
922	* immediately prior to the crash and the subsequent checkpoint was only
923	* partially written, it's possible it overwrote the last referenced tail in the
924	* log with garbage. This is not a coherency problem because the tail must have
925	* been pushed before it can be overwritten, but appears as log corruption to
926	* recovery because we have no way to know the tail was updated if the
927	* subsequent checkpoint didn't write successfully.
928	*
929	* Therefore, CRC check the log from tail to head. If a failure occurs and the
930	* offending record is within max iclog bufs from the head, walk the tail
931	* forward and retry until a valid tail is found or corruption is detected out
932	* of the range of a possible overwrite.
933	*/
934	STATIC int
935	xlog_verify_tail(
936	struct xlog *log,
937	xfs_daddr_t head_blk,
938	xfs_daddr_t *tail_blk,
939	int hsize)
940	{
941	struct xlog_rec_header *thead;
942	char *buffer;
943	xfs_daddr_t first_bad;
944	int error = `0`;
945	bool wrapped;
946	xfs_daddr_t tmp_tail;
947	xfs_daddr_t orig_tail = *tail_blk;
948
949	buffer = xlog_alloc_buffer(log, nbblks: `1`);
950	if (!buffer)
951	return -ENOMEM;
952
953	/*
954	* Make sure the tail points to a record (returns positive count on
955	* success).
956	*/
957	error = xlog_seek_logrec_hdr(log, head_blk, tail_blk: *tail_blk, count: `1`, buffer,
958	rblk: &tmp_tail, rhead: &thead, wrapped: &wrapped);
959	if (error < `0`)
960	goto out;
961	if (*tail_blk != tmp_tail)
962	*tail_blk = tmp_tail;
963
964	/*
965	* Run a CRC check from the tail to the head. We can't just check
966	* MAX_ICLOGS records past the tail because the tail may point to stale
967	* blocks cleared during the search for the head/tail. These blocks are
968	* overwritten with zero-length records and thus record count is not a
969	* reliable indicator of the iclog state before a crash.
970	*/
971	first_bad = `0`;
972	error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
973	XLOG_RECOVER_CRCPASS, &first_bad);
974	while ((error == -EFSBADCRC \|\| error == -EFSCORRUPTED) && first_bad) {
975	int tail_distance;
976
977	/*
978	* Is corruption within range of the head? If so, retry from
979	* the next record. Otherwise return an error.
980	*/
981	tail_distance = xlog_tail_distance(log, head_blk, tail_blk: first_bad);
982	if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
983	break;
984
985	/ skip to the next record; returns positive count on success /
986	error = xlog_seek_logrec_hdr(log, head_blk, tail_blk: first_bad, count: `2`,
987	buffer, rblk: &tmp_tail, rhead: &thead, wrapped: &wrapped);
988	if (error < `0`)
989	goto out;
990
991	*tail_blk = tmp_tail;
992	first_bad = `0`;
993	error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
994	XLOG_RECOVER_CRCPASS, &first_bad);
995	}
996
997	if (!error && *tail_blk != orig_tail)
998	xfs_warn(log->l_mp,
999	"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
1000	orig_tail, *tail_blk);
1001	out:
1002	kvfree(addr: buffer);
1003	return error;
1004	}
1005
1006	/*
1007	* Detect and trim torn writes from the head of the log.
1008	*
1009	* Storage without sector atomicity guarantees can result in torn writes in the
1010	* log in the event of a crash. Our only means to detect this scenario is via
1011	* CRC verification. While we can't always be certain that CRC verification
1012	* failure is due to a torn write vs. an unrelated corruption, we do know that
1013	* only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1014	* one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1015	* the log and treat failures in this range as torn writes as a matter of
1016	* policy. In the event of CRC failure, the head is walked back to the last good
1017	* record in the log and the tail is updated from that record and verified.
1018	*/
1019	STATIC int
1020	xlog_verify_head(
1021	struct xlog *log,
1022	xfs_daddr_t head_blk, /* in/out: unverified head /
1023	xfs_daddr_t tail_blk, /* out: tail block /
1024	char *buffer,
1025	xfs_daddr_t rhead_blk, /* start blk of last record /
1026	struct xlog_rec_header *rhead, /* ptr to last record /
1027	bool wrapped) /* last rec. wraps phys. log /
1028	{
1029	struct xlog_rec_header *tmp_rhead;
1030	char *tmp_buffer;
1031	xfs_daddr_t first_bad;
1032	xfs_daddr_t tmp_rhead_blk;
1033	int found;
1034	int error;
1035	bool tmp_wrapped;
1036
1037	/*
1038	* Check the head of the log for torn writes. Search backwards from the
1039	* head until we hit the tail or the maximum number of log record I/Os
1040	* that could have been in flight at one time. Use a temporary buffer so
1041	* we don't trash the rhead/buffer pointers from the caller.
1042	*/
1043	tmp_buffer = xlog_alloc_buffer(log, nbblks: `1`);
1044	if (!tmp_buffer)
1045	return -ENOMEM;
1046	error = xlog_rseek_logrec_hdr(log, head_blk, tail_blk,
1047	XLOG_MAX_ICLOGS, tmp_buffer,
1048	&tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1049	kvfree(addr: tmp_buffer);
1050	if (error < `0`)
1051	return error;
1052
1053	/*
1054	* Now run a CRC verification pass over the records starting at the
1055	* block found above to the current head. If a CRC failure occurs, the
1056	* log block of the first bad record is saved in first_bad.
1057	*/
1058	error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1059	XLOG_RECOVER_CRCPASS, &first_bad);
1060	if ((error == -EFSBADCRC \|\| error == -EFSCORRUPTED) && first_bad) {
1061	/*
1062	* We've hit a potential torn write. Reset the error and warn
1063	* about it.
1064	*/
1065	error = `0`;
1066	xfs_warn(log->l_mp,
1067	"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1068	first_bad, *head_blk);
1069
1070	/*
1071	* Get the header block and buffer pointer for the last good
1072	* record before the bad record.
1073	*
1074	* Note that xlog_find_tail() clears the blocks at the new head
1075	* (i.e., the records with invalid CRC) if the cycle number
1076	* matches the current cycle.
1077	*/
1078	found = xlog_rseek_logrec_hdr(log, head_blk: first_bad, tail_blk: *tail_blk, count: `1`,
1079	buffer, rblk: rhead_blk, rhead, wrapped);
1080	if (found < `0`)
1081	return found;
1082	if (found == `0`) / XXX: right thing to do here? /
1083	return -EIO;
1084
1085	/*
1086	* Reset the head block to the starting block of the first bad
1087	* log record and set the tail block based on the last good
1088	* record.
1089	*
1090	* Bail out if the updated head/tail match as this indicates
1091	* possible corruption outside of the acceptable
1092	* (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1093	*/
1094	*head_blk = first_bad;
1095	tail_blk = BLOCK_LSN(be64_to_cpu((rhead)->h_tail_lsn));
1096	if (head_blk == tail_blk) {
1097	ASSERT(`0`);
1098	return `0`;
1099	}
1100	}
1101	if (error)
1102	return error;
1103
1104	return xlog_verify_tail(log, head_blk: *head_blk, tail_blk,
1105	be32_to_cpu((*rhead)->h_size));
1106	}
1107
1108	/*
1109	* We need to make sure we handle log wrapping properly, so we can't use the
1110	* calculated logbno directly. Make sure it wraps to the correct bno inside the
1111	* log.
1112	*
1113	* The log is limited to 32 bit sizes, so we use the appropriate modulus
1114	* operation here and cast it back to a 64 bit daddr on return.
1115	*/
1116	static inline xfs_daddr_t
1117	xlog_wrap_logbno(
1118	struct xlog *log,
1119	xfs_daddr_t bno)
1120	{
1121	int mod;
1122
1123	div_s64_rem(dividend: bno, divisor: log->l_logBBsize, remainder: &mod);
1124	return mod;
1125	}
1126
1127	/*
1128	* Check whether the head of the log points to an unmount record. In other
1129	* words, determine whether the log is clean. If so, update the in-core state
1130	* appropriately.
1131	*/
1132	static int
1133	xlog_check_unmount_rec(
1134	struct xlog *log,
1135	xfs_daddr_t *head_blk,
1136	xfs_daddr_t *tail_blk,
1137	struct xlog_rec_header *rhead,
1138	xfs_daddr_t rhead_blk,
1139	char *buffer,
1140	bool *clean)
1141	{
1142	struct xlog_op_header *op_head;
1143	xfs_daddr_t umount_data_blk;
1144	xfs_daddr_t after_umount_blk;
1145	int hblks;
1146	int error;
1147	char *offset;
1148
1149	*clean = false;
1150
1151	/*
1152	* Look for unmount record. If we find it, then we know there was a
1153	* clean unmount. Since 'i' could be the last block in the physical
1154	* log, we convert to a log block before comparing to the head_blk.
1155	*
1156	* Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1157	* below. We won't want to clear the unmount record if there is one, so
1158	* we pass the lsn of the unmount record rather than the block after it.
1159	*/
1160	hblks = xlog_logrec_hblks(log, rh: rhead);
1161	after_umount_blk = xlog_wrap_logbno(log,
1162	bno: rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
1163
1164	if (*head_blk == after_umount_blk &&
1165	be32_to_cpu(rhead->h_num_logops) == `1`) {
1166	umount_data_blk = xlog_wrap_logbno(log, bno: rhead_blk + hblks);
1167	error = xlog_bread(log, blk_no: umount_data_blk, nbblks: `1`, data: buffer, offset: &offset);
1168	if (error)
1169	return error;
1170
1171	op_head = (struct xlog_op_header *)offset;
1172	if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1173	/*
1174	* Set tail and last sync so that newly written log
1175	* records will point recovery to after the current
1176	* unmount record.
1177	*/
1178	xlog_assign_atomic_lsn(lsn: &log->l_tail_lsn,
1179	cycle: log->l_curr_cycle, block: after_umount_blk);
1180	xlog_assign_atomic_lsn(lsn: &log->l_last_sync_lsn,
1181	cycle: log->l_curr_cycle, block: after_umount_blk);
1182	*tail_blk = after_umount_blk;
1183
1184	*clean = true;
1185	}
1186	}
1187
1188	return `0`;
1189	}
1190
1191	static void
1192	xlog_set_state(
1193	struct xlog *log,
1194	xfs_daddr_t head_blk,
1195	struct xlog_rec_header *rhead,
1196	xfs_daddr_t rhead_blk,
1197	bool bump_cycle)
1198	{
1199	/*
1200	* Reset log values according to the state of the log when we
1201	* crashed. In the case where head_blk == 0, we bump curr_cycle
1202	* one because the next write starts a new cycle rather than
1203	* continuing the cycle of the last good log record. At this
1204	* point we have guaranteed that all partial log records have been
1205	* accounted for. Therefore, we know that the last good log record
1206	* written was complete and ended exactly on the end boundary
1207	* of the physical log.
1208	*/
1209	log->l_prev_block = rhead_blk;
1210	log->l_curr_block = (int)head_blk;
1211	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1212	if (bump_cycle)
1213	log->l_curr_cycle++;
1214	atomic64_set(v: &log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1215	atomic64_set(v: &log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1216	xlog_assign_grant_head(head: &log->l_reserve_head.grant, cycle: log->l_curr_cycle,
1217	space: BBTOB(log->l_curr_block));
1218	xlog_assign_grant_head(head: &log->l_write_head.grant, cycle: log->l_curr_cycle,
1219	space: BBTOB(log->l_curr_block));
1220	}
1221
1222	/*
1223	* Find the sync block number or the tail of the log.
1224	*
1225	* This will be the block number of the last record to have its
1226	* associated buffers synced to disk. Every log record header has
1227	* a sync lsn embedded in it. LSNs hold block numbers, so it is easy
1228	* to get a sync block number. The only concern is to figure out which
1229	* log record header to believe.
1230	*
1231	* The following algorithm uses the log record header with the largest
1232	* lsn. The entire log record does not need to be valid. We only care
1233	* that the header is valid.
1234	*
1235	* We could speed up search by using current head_blk buffer, but it is not
1236	* available.
1237	*/
1238	STATIC int
1239	xlog_find_tail(
1240	struct xlog *log,
1241	xfs_daddr_t *head_blk,
1242	xfs_daddr_t *tail_blk)
1243	{
1244	xlog_rec_header_t *rhead;
1245	char *offset = NULL;
1246	char *buffer;
1247	int error;
1248	xfs_daddr_t rhead_blk;
1249	xfs_lsn_t tail_lsn;
1250	bool wrapped = false;
1251	bool clean = false;
1252
1253	/*
1254	* Find previous log record
1255	*/
1256	if ((error = xlog_find_head(log, return_head_blk: head_blk)))
1257	return error;
1258	ASSERT(*head_blk < INT_MAX);
1259
1260	buffer = xlog_alloc_buffer(log, nbblks: `1`);
1261	if (!buffer)
1262	return -ENOMEM;
1263	if (head_blk == `0`) { /* special case /
1264	error = xlog_bread(log, blk_no: `0`, nbblks: `1`, data: buffer, offset: &offset);
1265	if (error)
1266	goto done;
1267
1268	if (xlog_get_cycle(offset) == `0`) {
1269	*tail_blk = `0`;
1270	/ leave all other log inited values alone /
1271	goto done;
1272	}
1273	}
1274
1275	/*
1276	* Search backwards through the log looking for the log record header
1277	* block. This wraps all the way back around to the head so something is
1278	* seriously wrong if we can't find it.
1279	*/
1280	error = xlog_rseek_logrec_hdr(log, head_blk, head_blk, `1`, buffer,
1281	&rhead_blk, &rhead, &wrapped);
1282	if (error < `0`)
1283	goto done;
1284	if (!error) {
1285	xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1286	error = -EFSCORRUPTED;
1287	goto done;
1288	}
1289	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1290
1291	/*
1292	* Set the log state based on the current head record.
1293	*/
1294	xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
1295	tail_lsn = atomic64_read(&log->l_tail_lsn);
1296
1297	/*
1298	* Look for an unmount record at the head of the log. This sets the log
1299	* state to determine whether recovery is necessary.
1300	*/
1301	error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1302	rhead_blk, buffer, &clean);
1303	if (error)
1304	goto done;
1305
1306	/*
1307	* Verify the log head if the log is not clean (e.g., we have anything
1308	* but an unmount record at the head). This uses CRC verification to
1309	* detect and trim torn writes. If discovered, CRC failures are
1310	* considered torn writes and the log head is trimmed accordingly.
1311	*
1312	* Note that we can only run CRC verification when the log is dirty
1313	* because there's no guarantee that the log data behind an unmount
1314	* record is compatible with the current architecture.
1315	*/
1316	if (!clean) {
1317	xfs_daddr_t orig_head = *head_blk;
1318
1319	error = xlog_verify_head(log, head_blk, tail_blk, buffer,
1320	&rhead_blk, &rhead, &wrapped);
1321	if (error)
1322	goto done;
1323
1324	/ update in-core state again if the head changed /
1325	if (*head_blk != orig_head) {
1326	xlog_set_state(log, *head_blk, rhead, rhead_blk,
1327	wrapped);
1328	tail_lsn = atomic64_read(&log->l_tail_lsn);
1329	error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1330	rhead, rhead_blk, buffer,
1331	&clean);
1332	if (error)
1333	goto done;
1334	}
1335	}
1336
1337	/*
1338	* Note that the unmount was clean. If the unmount was not clean, we
1339	* need to know this to rebuild the superblock counters from the perag
1340	* headers if we have a filesystem using non-persistent counters.
1341	*/
1342	if (clean)
1343	set_bit(XFS_OPSTATE_CLEAN, addr: &log->l_mp->m_opstate);
1344
1345	/*
1346	* Make sure that there are no blocks in front of the head
1347	* with the same cycle number as the head. This can happen
1348	* because we allow multiple outstanding log writes concurrently,
1349	* and the later writes might make it out before earlier ones.
1350	*
1351	* We use the lsn from before modifying it so that we'll never
1352	* overwrite the unmount record after a clean unmount.
1353	*
1354	* Do this only if we are going to recover the filesystem
1355	*
1356	* NOTE: This used to say "if (!readonly)"
1357	* However on Linux, we can & do recover a read-only filesystem.
1358	* We only skip recovery if NORECOVERY is specified on mount,
1359	* in which case we would not be here.
1360	*
1361	* But... if the -device- itself is readonly, just skip this.
1362	* We can't recover this device anyway, so it won't matter.
1363	*/
1364	if (!xfs_readonly_buftarg(log->l_targ))
1365	error = xlog_clear_stale_blocks(log, tail_lsn);
1366
1367	done:
1368	kvfree(addr: buffer);
1369
1370	if (error)
1371	xfs_warn(log->l_mp, "failed to locate log tail");
1372	return error;
1373	}
1374
1375	/*
1376	* Is the log zeroed at all?
1377	*
1378	* The last binary search should be changed to perform an X block read
1379	* once X becomes small enough. You can then search linearly through
1380	* the X blocks. This will cut down on the number of reads we need to do.
1381	*
1382	* If the log is partially zeroed, this routine will pass back the blkno
1383	* of the first block with cycle number 0. It won't have a complete LR
1384	* preceding it.
1385	*
1386	* Return:
1387	* 0 => the log is completely written to
1388	* 1 => use *blk_no as the first block of the log
1389	* <0 => error has occurred
1390	*/
1391	STATIC int
1392	xlog_find_zeroed(
1393	struct xlog *log,
1394	xfs_daddr_t *blk_no)
1395	{
1396	char *buffer;
1397	char *offset;
1398	uint first_cycle, last_cycle;
1399	xfs_daddr_t new_blk, last_blk, start_blk;
1400	xfs_daddr_t num_scan_bblks;
1401	int error, log_bbnum = log->l_logBBsize;
1402	int ret = `1`;
1403
1404	*blk_no = `0`;
1405
1406	/ check totally zeroed log /
1407	buffer = xlog_alloc_buffer(log, nbblks: `1`);
1408	if (!buffer)
1409	return -ENOMEM;
1410	error = xlog_bread(log, blk_no: `0`, nbblks: `1`, data: buffer, offset: &offset);
1411	if (error)
1412	goto out_free_buffer;
1413
1414	first_cycle = xlog_get_cycle(offset);
1415	if (first_cycle == `0`) { / completely zeroed log /
1416	*blk_no = `0`;
1417	goto out_free_buffer;
1418	}
1419
1420	/ check partially zeroed log /
1421	error = xlog_bread(log, blk_no: log_bbnum-`1`, nbblks: `1`, data: buffer, offset: &offset);
1422	if (error)
1423	goto out_free_buffer;
1424
1425	last_cycle = xlog_get_cycle(offset);
1426	if (last_cycle != `0`) { / log completely written to /
1427	ret = `0`;
1428	goto out_free_buffer;
1429	}
1430
1431	/ we have a partially zeroed log /
1432	last_blk = log_bbnum-`1`;
1433	error = xlog_find_cycle_start(log, buffer, first_blk: `0`, last_blk: &last_blk, cycle: `0`);
1434	if (error)
1435	goto out_free_buffer;
1436
1437	/*
1438	* Validate the answer. Because there is no way to guarantee that
1439	* the entire log is made up of log records which are the same size,
1440	* we scan over the defined maximum blocks. At this point, the maximum
1441	* is not chosen to mean anything special. XXXmiken
1442	*/
1443	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1444	ASSERT(num_scan_bblks <= INT_MAX);
1445
1446	if (last_blk < num_scan_bblks)
1447	num_scan_bblks = last_blk;
1448	start_blk = last_blk - num_scan_bblks;
1449
1450	/*
1451	* We search for any instances of cycle number 0 that occur before
1452	* our current estimate of the head. What we're trying to detect is
1453	* 1 ... \| 0 \| 1 \| 0...
1454	* ^ binary search ends here
1455	*/
1456	if ((error = xlog_find_verify_cycle(log, start_blk,
1457	nbblks: (int)num_scan_bblks, stop_on_cycle_no: `0`, new_blk: &new_blk)))
1458	goto out_free_buffer;
1459	if (new_blk != -`1`)
1460	last_blk = new_blk;
1461
1462	/*
1463	* Potentially backup over partial log record write. We don't need
1464	* to search the end of the log because we know it is zero.
1465	*/
1466	error = xlog_find_verify_log_record(log, start_blk, last_blk: &last_blk, extra_bblks: `0`);
1467	if (error == `1`)
1468	error = -EIO;
1469	if (error)
1470	goto out_free_buffer;
1471
1472	*blk_no = last_blk;
1473	out_free_buffer:
1474	kvfree(addr: buffer);
1475	if (error)
1476	return error;
1477	return ret;
1478	}
1479
1480	/*
1481	* These are simple subroutines used by xlog_clear_stale_blocks() below
1482	* to initialize a buffer full of empty log record headers and write
1483	* them into the log.
1484	*/
1485	STATIC void
1486	xlog_add_record(
1487	struct xlog *log,
1488	char *buf,
1489	int cycle,
1490	int block,
1491	int tail_cycle,
1492	int tail_block)
1493	{
1494	xlog_rec_header_t recp = (xlog_rec_header_t )buf;
1495
1496	memset(buf, `0`, BBSIZE);
1497	recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1498	recp->h_cycle = cpu_to_be32(cycle);
1499	recp->h_version = cpu_to_be32(
1500	xfs_has_logv2(log->l_mp) ? `2` : `1`);
1501	recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1502	recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1503	recp->h_fmt = cpu_to_be32(XLOG_FMT);
1504	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1505	}
1506
1507	STATIC int
1508	xlog_write_log_records(
1509	struct xlog *log,
1510	int cycle,
1511	int start_block,
1512	int blocks,
1513	int tail_cycle,
1514	int tail_block)
1515	{
1516	char *offset;
1517	char *buffer;
1518	int balign, ealign;
1519	int sectbb = log->l_sectBBsize;
1520	int end_block = start_block + blocks;
1521	int bufblks;
1522	int error = `0`;
1523	int i, j = `0`;
1524
1525	/*
1526	* Greedily allocate a buffer big enough to handle the full
1527	* range of basic blocks to be written. If that fails, try
1528	* a smaller size. We need to be able to write at least a
1529	* log sector, or we're out of luck.
1530	*/
1531	bufblks = roundup_pow_of_two(blocks);
1532	while (bufblks > log->l_logBBsize)
1533	bufblks >>= `1`;
1534	while (!(buffer = xlog_alloc_buffer(log, nbblks: bufblks))) {
1535	bufblks >>= `1`;
1536	if (bufblks < sectbb)
1537	return -ENOMEM;
1538	}
1539
1540	/ We may need to do a read at the start to fill in part of*
1541	* the buffer in the starting sector not covered by the first
1542	* write below.
1543	*/
1544	balign = round_down(start_block, sectbb);
1545	if (balign != start_block) {
1546	error = xlog_bread_noalign(log, blk_no: start_block, nbblks: `1`, data: buffer);
1547	if (error)
1548	goto out_free_buffer;
1549
1550	j = start_block - balign;
1551	}
1552
1553	for (i = start_block; i < end_block; i += bufblks) {
1554	int bcount, endcount;
1555
1556	bcount = min(bufblks, end_block - start_block);
1557	endcount = bcount - j;
1558
1559	/ We may need to do a read at the end to fill in part of*
1560	* the buffer in the final sector not covered by the write.
1561	* If this is the same sector as the above read, skip it.
1562	*/
1563	ealign = round_down(end_block, sectbb);
1564	if (j == `0` && (start_block + endcount > ealign)) {
1565	error = xlog_bread_noalign(log, blk_no: ealign, nbblks: sectbb,
1566	data: buffer + BBTOB(ealign - start_block));
1567	if (error)
1568	break;
1569
1570	}
1571
1572	offset = buffer + xlog_align(log, blk_no: start_block);
1573	for (; j < endcount; j++) {
1574	xlog_add_record(log, buf: offset, cycle, block: i+j,
1575	tail_cycle, tail_block);
1576	offset += BBSIZE;
1577	}
1578	error = xlog_bwrite(log, blk_no: start_block, nbblks: endcount, data: buffer);
1579	if (error)
1580	break;
1581	start_block += endcount;
1582	j = `0`;
1583	}
1584
1585	out_free_buffer:
1586	kvfree(addr: buffer);
1587	return error;
1588	}
1589
1590	/*
1591	* This routine is called to blow away any incomplete log writes out
1592	* in front of the log head. We do this so that we won't become confused
1593	* if we come up, write only a little bit more, and then crash again.
1594	* If we leave the partial log records out there, this situation could
1595	* cause us to think those partial writes are valid blocks since they
1596	* have the current cycle number. We get rid of them by overwriting them
1597	* with empty log records with the old cycle number rather than the
1598	* current one.
1599	*
1600	* The tail lsn is passed in rather than taken from
1601	* the log so that we will not write over the unmount record after a
1602	* clean unmount in a 512 block log. Doing so would leave the log without
1603	* any valid log records in it until a new one was written. If we crashed
1604	* during that time we would not be able to recover.
1605	*/
1606	STATIC int
1607	xlog_clear_stale_blocks(
1608	struct xlog *log,
1609	xfs_lsn_t tail_lsn)
1610	{
1611	int tail_cycle, head_cycle;
1612	int tail_block, head_block;
1613	int tail_distance, max_distance;
1614	int distance;
1615	int error;
1616
1617	tail_cycle = CYCLE_LSN(tail_lsn);
1618	tail_block = BLOCK_LSN(tail_lsn);
1619	head_cycle = log->l_curr_cycle;
1620	head_block = log->l_curr_block;
1621
1622	/*
1623	* Figure out the distance between the new head of the log
1624	* and the tail. We want to write over any blocks beyond the
1625	* head that we may have written just before the crash, but
1626	* we don't want to overwrite the tail of the log.
1627	*/
1628	if (head_cycle == tail_cycle) {
1629	/*
1630	* The tail is behind the head in the physical log,
1631	* so the distance from the head to the tail is the
1632	* distance from the head to the end of the log plus
1633	* the distance from the beginning of the log to the
1634	* tail.
1635	*/
1636	if (XFS_IS_CORRUPT(log->l_mp,
1637	head_block < tail_block \|\|
1638	head_block >= log->l_logBBsize))
1639	return -EFSCORRUPTED;
1640	tail_distance = tail_block + (log->l_logBBsize - head_block);
1641	} else {
1642	/*
1643	* The head is behind the tail in the physical log,
1644	* so the distance from the head to the tail is just
1645	* the tail block minus the head block.
1646	*/
1647	if (XFS_IS_CORRUPT(log->l_mp,
1648	head_block >= tail_block \|\|
1649	head_cycle != tail_cycle + `1`))
1650	return -EFSCORRUPTED;
1651	tail_distance = tail_block - head_block;
1652	}
1653
1654	/*
1655	* If the head is right up against the tail, we can't clear
1656	* anything.
1657	*/
1658	if (tail_distance <= `0`) {
1659	ASSERT(tail_distance == `0`);
1660	return `0`;
1661	}
1662
1663	max_distance = XLOG_TOTAL_REC_SHIFT(log);
1664	/*
1665	* Take the smaller of the maximum amount of outstanding I/O
1666	* we could have and the distance to the tail to clear out.
1667	* We take the smaller so that we don't overwrite the tail and
1668	* we don't waste all day writing from the head to the tail
1669	* for no reason.
1670	*/
1671	max_distance = min(max_distance, tail_distance);
1672
1673	if ((head_block + max_distance) <= log->l_logBBsize) {
1674	/*
1675	* We can stomp all the blocks we need to without
1676	* wrapping around the end of the log. Just do it
1677	* in a single write. Use the cycle number of the
1678	* current cycle minus one so that the log will look like:
1679	* n ... \| n - 1 ...
1680	*/
1681	error = xlog_write_log_records(log, cycle: (head_cycle - `1`),
1682	start_block: head_block, blocks: max_distance, tail_cycle,
1683	tail_block);
1684	if (error)
1685	return error;
1686	} else {
1687	/*
1688	* We need to wrap around the end of the physical log in
1689	* order to clear all the blocks. Do it in two separate
1690	* I/Os. The first write should be from the head to the
1691	* end of the physical log, and it should use the current
1692	* cycle number minus one just like above.
1693	*/
1694	distance = log->l_logBBsize - head_block;
1695	error = xlog_write_log_records(log, cycle: (head_cycle - `1`),
1696	start_block: head_block, blocks: distance, tail_cycle,
1697	tail_block);
1698
1699	if (error)
1700	return error;
1701
1702	/*
1703	* Now write the blocks at the start of the physical log.
1704	* This writes the remainder of the blocks we want to clear.
1705	* It uses the current cycle number since we're now on the
1706	* same cycle as the head so that we get:
1707	* n ... n ... \| n - 1 ...
1708	* ^^^^^ blocks we're writing
1709	*/
1710	distance = max_distance - (log->l_logBBsize - head_block);
1711	error = xlog_write_log_records(log, cycle: head_cycle, start_block: `0`, blocks: distance,
1712	tail_cycle, tail_block);
1713	if (error)
1714	return error;
1715	}
1716
1717	return `0`;
1718	}
1719
1720	/*
1721	* Release the recovered intent item in the AIL that matches the given intent
1722	* type and intent id.
1723	*/
1724	void
1725	xlog_recover_release_intent(
1726	struct xlog *log,
1727	unsigned short intent_type,
1728	uint64_t intent_id)
1729	{
1730	struct xfs_defer_pending dfp, n;
1731
1732	list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
1733	struct xfs_log_item *lip = dfp->dfp_intent;
1734
1735	if (lip->li_type != intent_type)
1736	continue;
1737	if (!lip->li_ops->iop_match(lip, intent_id))
1738	continue;
1739
1740	ASSERT(xlog_item_is_intent(lip));
1741
1742	xfs_defer_cancel_recovery(log->l_mp, dfp);
1743	}
1744	}
1745
1746	int
1747	xlog_recover_iget(
1748	struct xfs_mount *mp,
1749	xfs_ino_t ino,
1750	struct xfs_inode **ipp)
1751	{
1752	int error;
1753
1754	error = xfs_iget(mp, NULL, ino, flags: `0`, lock_flags: `0`, ipp);
1755	if (error)
1756	return error;
1757
1758	error = xfs_qm_dqattach(*ipp);
1759	if (error) {
1760	xfs_irele(ip: *ipp);
1761	return error;
1762	}
1763
1764	if (VFS_I(ip: *ipp)->i_nlink == `0`)
1765	xfs_iflags_set(ip: *ipp, XFS_IRECOVERY);
1766
1767	return `0`;
1768	}
1769
1770	/******************************************************************************
1771	*
1772	* Log recover routines
1773	*
1774	******************************************************************************
1775	*/
1776	static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
1777	&xlog_buf_item_ops,
1778	&xlog_inode_item_ops,
1779	&xlog_dquot_item_ops,
1780	&xlog_quotaoff_item_ops,
1781	&xlog_icreate_item_ops,
1782	&xlog_efi_item_ops,
1783	&xlog_efd_item_ops,
1784	&xlog_rui_item_ops,
1785	&xlog_rud_item_ops,
1786	&xlog_cui_item_ops,
1787	&xlog_cud_item_ops,
1788	&xlog_bui_item_ops,
1789	&xlog_bud_item_ops,
1790	&xlog_attri_item_ops,
1791	&xlog_attrd_item_ops,
1792	};
1793
1794	static const struct xlog_recover_item_ops *
1795	xlog_find_item_ops(
1796	struct xlog_recover_item *item)
1797	{
1798	unsigned int i;
1799
1800	for (i = `0`; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
1801	if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
1802	return xlog_recover_item_ops[i];
1803
1804	return NULL;
1805	}
1806
1807	/*
1808	* Sort the log items in the transaction.
1809	*
1810	* The ordering constraints are defined by the inode allocation and unlink
1811	* behaviour. The rules are:
1812	*
1813	* 1. Every item is only logged once in a given transaction. Hence it
1814	* represents the last logged state of the item. Hence ordering is
1815	* dependent on the order in which operations need to be performed so
1816	* required initial conditions are always met.
1817	*
1818	* 2. Cancelled buffers are recorded in pass 1 in a separate table and
1819	* there's nothing to replay from them so we can simply cull them
1820	* from the transaction. However, we can't do that until after we've
1821	* replayed all the other items because they may be dependent on the
1822	* cancelled buffer and replaying the cancelled buffer can remove it
1823	* form the cancelled buffer table. Hence they have tobe done last.
1824	*
1825	* 3. Inode allocation buffers must be replayed before inode items that
1826	* read the buffer and replay changes into it. For filesystems using the
1827	* ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1828	* treated the same as inode allocation buffers as they create and
1829	* initialise the buffers directly.
1830	*
1831	* 4. Inode unlink buffers must be replayed after inode items are replayed.
1832	* This ensures that inodes are completely flushed to the inode buffer
1833	* in a "free" state before we remove the unlinked inode list pointer.
1834	*
1835	* Hence the ordering needs to be inode allocation buffers first, inode items
1836	* second, inode unlink buffers third and cancelled buffers last.
1837	*
1838	* But there's a problem with that - we can't tell an inode allocation buffer
1839	* apart from a regular buffer, so we can't separate them. We can, however,
1840	* tell an inode unlink buffer from the others, and so we can separate them out
1841	* from all the other buffers and move them to last.
1842	*
1843	* Hence, 4 lists, in order from head to tail:
1844	* - buffer_list for all buffers except cancelled/inode unlink buffers
1845	* - item_list for all non-buffer items
1846	* - inode_buffer_list for inode unlink buffers
1847	* - cancel_list for the cancelled buffers
1848	*
1849	* Note that we add objects to the tail of the lists so that first-to-last
1850	* ordering is preserved within the lists. Adding objects to the head of the
1851	* list means when we traverse from the head we walk them in last-to-first
1852	* order. For cancelled buffers and inode unlink buffers this doesn't matter,
1853	* but for all other items there may be specific ordering that we need to
1854	* preserve.
1855	*/
1856	STATIC int
1857	xlog_recover_reorder_trans(
1858	struct xlog *log,
1859	struct xlog_recover *trans,
1860	int pass)
1861	{
1862	struct xlog_recover_item item, n;
1863	int error = `0`;
1864	LIST_HEAD(sort_list);
1865	LIST_HEAD(cancel_list);
1866	LIST_HEAD(buffer_list);
1867	LIST_HEAD(inode_buffer_list);
1868	LIST_HEAD(item_list);
1869
1870	list_splice_init(list: &trans->r_itemq, head: &sort_list);
1871	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1872	enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
1873
1874	item->ri_ops = xlog_find_item_ops(item);
1875	if (!item->ri_ops) {
1876	xfs_warn(log->l_mp,
1877	"%s: unrecognized type of log operation (%d)",
1878	__func__, ITEM_TYPE(item));
1879	ASSERT(`0`);
1880	/*
1881	* return the remaining items back to the transaction
1882	* item list so they can be freed in caller.
1883	*/
1884	if (!list_empty(head: &sort_list))
1885	list_splice_init(list: &sort_list, head: &trans->r_itemq);
1886	error = -EFSCORRUPTED;
1887	break;
1888	}
1889
1890	if (item->ri_ops->reorder)
1891	fate = item->ri_ops->reorder(item);
1892
1893	switch (fate) {
1894	case XLOG_REORDER_BUFFER_LIST:
1895	list_move_tail(list: &item->ri_list, head: &buffer_list);
1896	break;
1897	case XLOG_REORDER_CANCEL_LIST:
1898	trace_xfs_log_recover_item_reorder_head(log,
1899	trans, item, pass);
1900	list_move(list: &item->ri_list, head: &cancel_list);
1901	break;
1902	case XLOG_REORDER_INODE_BUFFER_LIST:
1903	list_move(list: &item->ri_list, head: &inode_buffer_list);
1904	break;
1905	case XLOG_REORDER_ITEM_LIST:
1906	trace_xfs_log_recover_item_reorder_tail(log,
1907	trans, item, pass);
1908	list_move_tail(list: &item->ri_list, head: &item_list);
1909	break;
1910	}
1911	}
1912
1913	ASSERT(list_empty(&sort_list));
1914	if (!list_empty(head: &buffer_list))
1915	list_splice(list: &buffer_list, head: &trans->r_itemq);
1916	if (!list_empty(head: &item_list))
1917	list_splice_tail(list: &item_list, head: &trans->r_itemq);
1918	if (!list_empty(head: &inode_buffer_list))
1919	list_splice_tail(list: &inode_buffer_list, head: &trans->r_itemq);
1920	if (!list_empty(head: &cancel_list))
1921	list_splice_tail(list: &cancel_list, head: &trans->r_itemq);
1922	return error;
1923	}
1924
1925	void
1926	xlog_buf_readahead(
1927	struct xlog *log,
1928	xfs_daddr_t blkno,
1929	uint len,
1930	const struct xfs_buf_ops *ops)
1931	{
1932	if (!xlog_is_buffer_cancelled(log, blkno, len))
1933	xfs_buf_readahead(target: log->l_mp->m_ddev_targp, blkno, numblks: len, ops);
1934	}
1935
1936	/*
1937	* Create a deferred work structure for resuming and tracking the progress of a
1938	* log intent item that was found during recovery.
1939	*/
1940	void
1941	xlog_recover_intent_item(
1942	struct xlog *log,
1943	struct xfs_log_item *lip,
1944	xfs_lsn_t lsn,
1945	const struct xfs_defer_op_type *ops)
1946	{
1947	ASSERT(xlog_item_is_intent(lip));
1948
1949	xfs_defer_start_recovery(lip, &log->r_dfops, ops);
1950
1951	/*
1952	* Insert the intent into the AIL directly and drop one reference so
1953	* that finishing or canceling the work will drop the other.
1954	*/
1955	xfs_trans_ail_insert(log->l_ailp, lip, lsn);
1956	lip->li_ops->iop_unpin(lip, `0`);
1957	}
1958
1959	STATIC int
1960	xlog_recover_items_pass2(
1961	struct xlog *log,
1962	struct xlog_recover *trans,
1963	struct list_head *buffer_list,
1964	struct list_head *item_list)
1965	{
1966	struct xlog_recover_item *item;
1967	int error = `0`;
1968
1969	list_for_each_entry(item, item_list, ri_list) {
1970	trace_xfs_log_recover_item_recover(log, trans, item,
1971	XLOG_RECOVER_PASS2);
1972
1973	if (item->ri_ops->commit_pass2)
1974	error = item->ri_ops->commit_pass2(log, buffer_list,
1975	item, trans->r_lsn);
1976	if (error)
1977	return error;
1978	}
1979
1980	return error;
1981	}
1982
1983	/*
1984	* Perform the transaction.
1985	*
1986	* If the transaction modifies a buffer or inode, do it now. Otherwise,
1987	* EFIs and EFDs get queued up by adding entries into the AIL for them.
1988	*/
1989	STATIC int
1990	xlog_recover_commit_trans(
1991	struct xlog *log,
1992	struct xlog_recover *trans,
1993	int pass,
1994	struct list_head *buffer_list)
1995	{
1996	int error = `0`;
1997	int items_queued = `0`;
1998	struct xlog_recover_item *item;
1999	struct xlog_recover_item *next;
2000	LIST_HEAD (ra_list);
2001	LIST_HEAD (done_list);
2002
2003	#define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
2004
2005	hlist_del_init(n: &trans->r_list);
2006
2007	error = xlog_recover_reorder_trans(log, trans, pass);
2008	if (error)
2009	return error;
2010
2011	list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
2012	trace_xfs_log_recover_item_recover(log, trans, item, pass);
2013
2014	switch (pass) {
2015	case XLOG_RECOVER_PASS1:
2016	if (item->ri_ops->commit_pass1)
2017	error = item->ri_ops->commit_pass1(log, item);
2018	break;
2019	case XLOG_RECOVER_PASS2:
2020	if (item->ri_ops->ra_pass2)
2021	item->ri_ops->ra_pass2(log, item);
2022	list_move_tail(list: &item->ri_list, head: &ra_list);
2023	items_queued++;
2024	if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
2025	error = xlog_recover_items_pass2(log, trans,
2026	buffer_list, item_list: &ra_list);
2027	list_splice_tail_init(list: &ra_list, head: &done_list);
2028	items_queued = `0`;
2029	}
2030
2031	break;
2032	default:
2033	ASSERT(`0`);
2034	}
2035
2036	if (error)
2037	goto out;
2038	}
2039
2040	out:
2041	if (!list_empty(head: &ra_list)) {
2042	if (!error)
2043	error = xlog_recover_items_pass2(log, trans,
2044	buffer_list, item_list: &ra_list);
2045	list_splice_tail_init(list: &ra_list, head: &done_list);
2046	}
2047
2048	if (!list_empty(head: &done_list))
2049	list_splice_init(list: &done_list, head: &trans->r_itemq);
2050
2051	return error;
2052	}
2053
2054	STATIC void
2055	xlog_recover_add_item(
2056	struct list_head *head)
2057	{
2058	struct xlog_recover_item *item;
2059
2060	item = kzalloc(sizeof(struct xlog_recover_item),
2061	GFP_KERNEL \| __GFP_NOFAIL);
2062	INIT_LIST_HEAD(list: &item->ri_list);
2063	list_add_tail(new: &item->ri_list, head);
2064	}
2065
2066	STATIC int
2067	xlog_recover_add_to_cont_trans(
2068	struct xlog *log,
2069	struct xlog_recover *trans,
2070	char *dp,
2071	int len)
2072	{
2073	struct xlog_recover_item *item;
2074	char ptr, old_ptr;
2075	int old_len;
2076
2077	/*
2078	* If the transaction is empty, the header was split across this and the
2079	* previous record. Copy the rest of the header.
2080	*/
2081	if (list_empty(head: &trans->r_itemq)) {
2082	ASSERT(len <= sizeof(struct xfs_trans_header));
2083	if (len > sizeof(struct xfs_trans_header)) {
2084	xfs_warn(log->l_mp, "%s: bad header length", __func__);
2085	return -EFSCORRUPTED;
2086	}
2087
2088	xlog_recover_add_item(head: &trans->r_itemq);
2089	ptr = (char *)&trans->r_theader +
2090	sizeof(struct xfs_trans_header) - len;
2091	memcpy(ptr, dp, len);
2092	return `0`;
2093	}
2094
2095	/ take the tail entry /
2096	item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2097	ri_list);
2098
2099	old_ptr = item->ri_buf[item->ri_cnt-`1`].i_addr;
2100	old_len = item->ri_buf[item->ri_cnt-`1`].i_len;
2101
2102	ptr = kvrealloc(p: old_ptr, oldsize: old_len, newsize: len + old_len, GFP_KERNEL);
2103	if (!ptr)
2104	return -ENOMEM;
2105	memcpy(&ptr[old_len], dp, len);
2106	item->ri_buf[item->ri_cnt-`1`].i_len += len;
2107	item->ri_buf[item->ri_cnt-`1`].i_addr = ptr;
2108	trace_xfs_log_recover_item_add_cont(log, trans, item, pass: `0`);
2109	return `0`;
2110	}
2111
2112	/*
2113	* The next region to add is the start of a new region. It could be
2114	* a whole region or it could be the first part of a new region. Because
2115	* of this, the assumption here is that the type and size fields of all
2116	* format structures fit into the first 32 bits of the structure.
2117	*
2118	* This works because all regions must be 32 bit aligned. Therefore, we
2119	* either have both fields or we have neither field. In the case we have
2120	* neither field, the data part of the region is zero length. We only have
2121	* a log_op_header and can throw away the header since a new one will appear
2122	* later. If we have at least 4 bytes, then we can determine how many regions
2123	* will appear in the current log item.
2124	*/
2125	STATIC int
2126	xlog_recover_add_to_trans(
2127	struct xlog *log,
2128	struct xlog_recover *trans,
2129	char *dp,
2130	int len)
2131	{
2132	struct xfs_inode_log_format in_f; /* any will do /
2133	struct xlog_recover_item *item;
2134	char *ptr;
2135
2136	if (!len)
2137	return `0`;
2138	if (list_empty(head: &trans->r_itemq)) {
2139	/ we need to catch log corruptions here /
2140	if ((uint )dp != XFS_TRANS_HEADER_MAGIC) {
2141	xfs_warn(log->l_mp, "%s: bad header magic number",
2142	__func__);
2143	ASSERT(`0`);
2144	return -EFSCORRUPTED;
2145	}
2146
2147	if (len > sizeof(struct xfs_trans_header)) {
2148	xfs_warn(log->l_mp, "%s: bad header length", __func__);
2149	ASSERT(`0`);
2150	return -EFSCORRUPTED;
2151	}
2152
2153	/*
2154	* The transaction header can be arbitrarily split across op
2155	* records. If we don't have the whole thing here, copy what we
2156	* do have and handle the rest in the next record.
2157	*/
2158	if (len == sizeof(struct xfs_trans_header))
2159	xlog_recover_add_item(head: &trans->r_itemq);
2160	memcpy(&trans->r_theader, dp, len);
2161	return `0`;
2162	}
2163
2164	ptr = xlog_kvmalloc(buf_size: len);
2165	memcpy(ptr, dp, len);
2166	in_f = (struct xfs_inode_log_format *)ptr;
2167
2168	/ take the tail entry /
2169	item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2170	ri_list);
2171	if (item->ri_total != `0` &&
2172	item->ri_total == item->ri_cnt) {
2173	/ tail item is in use, get a new one /
2174	xlog_recover_add_item(head: &trans->r_itemq);
2175	item = list_entry(trans->r_itemq.prev,
2176	struct xlog_recover_item, ri_list);
2177	}
2178
2179	if (item->ri_total == `0`) { / first region to be added /
2180	if (in_f->ilf_size == `0` \|\|
2181	in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
2182	xfs_warn(log->l_mp,
2183	"bad number of regions (%d) in inode log format",
2184	in_f->ilf_size);
2185	ASSERT(`0`);
2186	kvfree(addr: ptr);
2187	return -EFSCORRUPTED;
2188	}
2189
2190	item->ri_total = in_f->ilf_size;
2191	item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t),
2192	GFP_KERNEL \| __GFP_NOFAIL);
2193	}
2194
2195	if (item->ri_total <= item->ri_cnt) {
2196	xfs_warn(log->l_mp,
2197	"log item region count (%d) overflowed size (%d)",
2198	item->ri_cnt, item->ri_total);
2199	ASSERT(`0`);
2200	kvfree(addr: ptr);
2201	return -EFSCORRUPTED;
2202	}
2203
2204	/ Description region is ri_buf[0] /
2205	item->ri_buf[item->ri_cnt].i_addr = ptr;
2206	item->ri_buf[item->ri_cnt].i_len = len;
2207	item->ri_cnt++;
2208	trace_xfs_log_recover_item_add(log, trans, item, pass: `0`);
2209	return `0`;
2210	}
2211
2212	/*
2213	* Free up any resources allocated by the transaction
2214	*
2215	* Remember that EFIs, EFDs, and IUNLINKs are handled later.
2216	*/
2217	STATIC void
2218	xlog_recover_free_trans(
2219	struct xlog_recover *trans)
2220	{
2221	struct xlog_recover_item item, n;
2222	int i;
2223
2224	hlist_del_init(n: &trans->r_list);
2225
2226	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2227	/ Free the regions in the item. /
2228	list_del(entry: &item->ri_list);
2229	for (i = `0`; i < item->ri_cnt; i++)
2230	kvfree(addr: item->ri_buf[i].i_addr);
2231	/ Free the item itself /
2232	kfree(objp: item->ri_buf);
2233	kfree(objp: item);
2234	}
2235	/ Free the transaction recover structure /
2236	kfree(objp: trans);
2237	}
2238
2239	/*
2240	* On error or completion, trans is freed.
2241	*/
2242	STATIC int
2243	xlog_recovery_process_trans(
2244	struct xlog *log,
2245	struct xlog_recover *trans,
2246	char *dp,
2247	unsigned int len,
2248	unsigned int flags,
2249	int pass,
2250	struct list_head *buffer_list)
2251	{
2252	int error = `0`;
2253	bool freeit = false;
2254
2255	/ mask off ophdr transaction container flags /
2256	flags &= ~XLOG_END_TRANS;
2257	if (flags & XLOG_WAS_CONT_TRANS)
2258	flags &= ~XLOG_CONTINUE_TRANS;
2259
2260	/*
2261	* Callees must not free the trans structure. We'll decide if we need to
2262	* free it or not based on the operation being done and it's result.
2263	*/
2264	switch (flags) {
2265	/ expected flag values /
2266	case `0`:
2267	case XLOG_CONTINUE_TRANS:
2268	error = xlog_recover_add_to_trans(log, trans, dp, len);
2269	break;
2270	case XLOG_WAS_CONT_TRANS:
2271	error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
2272	break;
2273	case XLOG_COMMIT_TRANS:
2274	error = xlog_recover_commit_trans(log, trans, pass,
2275	buffer_list);
2276	/ success or fail, we are now done with this transaction. /
2277	freeit = true;
2278	break;
2279
2280	/ unexpected flag values /
2281	case XLOG_UNMOUNT_TRANS:
2282	/ just skip trans /
2283	xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2284	freeit = true;
2285	break;
2286	case XLOG_START_TRANS:
2287	default:
2288	xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
2289	ASSERT(`0`);
2290	error = -EFSCORRUPTED;
2291	break;
2292	}
2293	if (error \|\| freeit)
2294	xlog_recover_free_trans(trans);
2295	return error;
2296	}
2297
2298	/*
2299	* Lookup the transaction recovery structure associated with the ID in the
2300	* current ophdr. If the transaction doesn't exist and the start flag is set in
2301	* the ophdr, then allocate a new transaction for future ID matches to find.
2302	* Either way, return what we found during the lookup - an existing transaction
2303	* or nothing.
2304	*/
2305	STATIC struct xlog_recover *
2306	xlog_recover_ophdr_to_trans(
2307	struct hlist_head rhash[],
2308	struct xlog_rec_header *rhead,
2309	struct xlog_op_header *ohead)
2310	{
2311	struct xlog_recover *trans;
2312	xlog_tid_t tid;
2313	struct hlist_head *rhp;
2314
2315	tid = be32_to_cpu(ohead->oh_tid);
2316	rhp = &rhash[XLOG_RHASH(tid)];
2317	hlist_for_each_entry(trans, rhp, r_list) {
2318	if (trans->r_log_tid == tid)
2319	return trans;
2320	}
2321
2322	/*
2323	* skip over non-start transaction headers - we could be
2324	* processing slack space before the next transaction starts
2325	*/
2326	if (!(ohead->oh_flags & XLOG_START_TRANS))
2327	return NULL;
2328
2329	ASSERT(be32_to_cpu(ohead->oh_len) == `0`);
2330
2331	/*
2332	* This is a new transaction so allocate a new recovery container to
2333	* hold the recovery ops that will follow.
2334	*/
2335	trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL \| __GFP_NOFAIL);
2336	trans->r_log_tid = tid;
2337	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
2338	INIT_LIST_HEAD(list: &trans->r_itemq);
2339	INIT_HLIST_NODE(h: &trans->r_list);
2340	hlist_add_head(n: &trans->r_list, h: rhp);
2341
2342	/*
2343	* Nothing more to do for this ophdr. Items to be added to this new
2344	* transaction will be in subsequent ophdr containers.
2345	*/
2346	return NULL;
2347	}
2348
2349	STATIC int
2350	xlog_recover_process_ophdr(
2351	struct xlog *log,
2352	struct hlist_head rhash[],
2353	struct xlog_rec_header *rhead,
2354	struct xlog_op_header *ohead,
2355	char *dp,
2356	char *end,
2357	int pass,
2358	struct list_head *buffer_list)
2359	{
2360	struct xlog_recover *trans;
2361	unsigned int len;
2362	int error;
2363
2364	/ Do we understand who wrote this op? /
2365	if (ohead->oh_clientid != XFS_TRANSACTION &&
2366	ohead->oh_clientid != XFS_LOG) {
2367	xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2368	__func__, ohead->oh_clientid);
2369	ASSERT(`0`);
2370	return -EFSCORRUPTED;
2371	}
2372
2373	/*
2374	* Check the ophdr contains all the data it is supposed to contain.
2375	*/
2376	len = be32_to_cpu(ohead->oh_len);
2377	if (dp + len > end) {
2378	xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
2379	WARN_ON(`1`);
2380	return -EFSCORRUPTED;
2381	}
2382
2383	trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
2384	if (!trans) {
2385	/ nothing to do, so skip over this ophdr /
2386	return `0`;
2387	}
2388
2389	/*
2390	* The recovered buffer queue is drained only once we know that all
2391	* recovery items for the current LSN have been processed. This is
2392	* required because:
2393	*
2394	* - Buffer write submission updates the metadata LSN of the buffer.
2395	* - Log recovery skips items with a metadata LSN >= the current LSN of
2396	* the recovery item.
2397	* - Separate recovery items against the same metadata buffer can share
2398	* a current LSN. I.e., consider that the LSN of a recovery item is
2399	* defined as the starting LSN of the first record in which its
2400	* transaction appears, that a record can hold multiple transactions,
2401	* and/or that a transaction can span multiple records.
2402	*
2403	* In other words, we are allowed to submit a buffer from log recovery
2404	* once per current LSN. Otherwise, we may incorrectly skip recovery
2405	* items and cause corruption.
2406	*
2407	* We don't know up front whether buffers are updated multiple times per
2408	* LSN. Therefore, track the current LSN of each commit log record as it
2409	* is processed and drain the queue when it changes. Use commit records
2410	* because they are ordered correctly by the logging code.
2411	*/
2412	if (log->l_recovery_lsn != trans->r_lsn &&
2413	ohead->oh_flags & XLOG_COMMIT_TRANS) {
2414	error = xfs_buf_delwri_submit(buffer_list);
2415	if (error)
2416	return error;
2417	log->l_recovery_lsn = trans->r_lsn;
2418	}
2419
2420	return xlog_recovery_process_trans(log, trans, dp, len,
2421	flags: ohead->oh_flags, pass, buffer_list);
2422	}
2423
2424	/*
2425	* There are two valid states of the r_state field. 0 indicates that the
2426	* transaction structure is in a normal state. We have either seen the
2427	* start of the transaction or the last operation we added was not a partial
2428	* operation. If the last operation we added to the transaction was a
2429	* partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2430	*
2431	* NOTE: skip LRs with 0 data length.
2432	*/
2433	STATIC int
2434	xlog_recover_process_data(
2435	struct xlog *log,
2436	struct hlist_head rhash[],
2437	struct xlog_rec_header *rhead,
2438	char *dp,
2439	int pass,
2440	struct list_head *buffer_list)
2441	{
2442	struct xlog_op_header *ohead;
2443	char *end;
2444	int num_logops;
2445	int error;
2446
2447	end = dp + be32_to_cpu(rhead->h_len);
2448	num_logops = be32_to_cpu(rhead->h_num_logops);
2449
2450	/ check the log format matches our own - else we can't recover /
2451	if (xlog_header_check_recover(log->l_mp, rhead))
2452	return -EIO;
2453
2454	trace_xfs_log_recover_record(log, rhead, pass);
2455	while ((dp < end) && num_logops) {
2456
2457	ohead = (struct xlog_op_header *)dp;
2458	dp += sizeof(*ohead);
2459	ASSERT(dp <= end);
2460
2461	/ errors will abort recovery /
2462	error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
2463	dp, end, pass, buffer_list);
2464	if (error)
2465	return error;
2466
2467	dp += be32_to_cpu(ohead->oh_len);
2468	num_logops--;
2469	}
2470	return `0`;
2471	}
2472
2473	/ Take all the collected deferred ops and finish them in order. /
2474	static int
2475	xlog_finish_defer_ops(
2476	struct xfs_mount *mp,
2477	struct list_head *capture_list)
2478	{
2479	struct xfs_defer_capture dfc, next;
2480	struct xfs_trans *tp;
2481	int error = `0`;
2482
2483	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2484	struct xfs_trans_res resv;
2485	struct xfs_defer_resources dres;
2486
2487	/*
2488	* Create a new transaction reservation from the captured
2489	* information. Set logcount to 1 to force the new transaction
2490	* to regrant every roll so that we can make forward progress
2491	* in recovery no matter how full the log might be.
2492	*/
2493	resv.tr_logres = dfc->dfc_logres;
2494	resv.tr_logcount = `1`;
2495	resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2496
2497	error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2498	dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
2499	if (error) {
2500	xlog_force_shutdown(log: mp->m_log, SHUTDOWN_LOG_IO_ERROR);
2501	return error;
2502	}
2503
2504	/*
2505	* Transfer to this new transaction all the dfops we captured
2506	* from recovering a single intent item.
2507	*/
2508	list_del_init(entry: &dfc->dfc_list);
2509	xfs_defer_ops_continue(dfc, tp, &dres);
2510	error = xfs_trans_commit(tp);
2511	xfs_defer_resources_rele(&dres);
2512	if (error)
2513	return error;
2514	}
2515
2516	ASSERT(list_empty(capture_list));
2517	return `0`;
2518	}
2519
2520	/ Release all the captured defer ops and capture structures in this list. /
2521	static void
2522	xlog_abort_defer_ops(
2523	struct xfs_mount *mp,
2524	struct list_head *capture_list)
2525	{
2526	struct xfs_defer_capture *dfc;
2527	struct xfs_defer_capture *next;
2528
2529	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2530	list_del_init(entry: &dfc->dfc_list);
2531	xfs_defer_ops_capture_abort(mp, dfc);
2532	}
2533	}
2534
2535	/*
2536	* When this is called, all of the log intent items which did not have
2537	* corresponding log done items should be in the AIL. What we do now is update
2538	* the data structures associated with each one.
2539	*
2540	* Since we process the log intent items in normal transactions, they will be
2541	* removed at some point after the commit. This prevents us from just walking
2542	* down the list processing each one. We'll use a flag in the intent item to
2543	* skip those that we've already processed and use the AIL iteration mechanism's
2544	* generation count to try to speed this up at least a bit.
2545	*
2546	* When we start, we know that the intents are the only things in the AIL. As we
2547	* process them, however, other items are added to the AIL. Hence we know we
2548	* have started recovery on all the pending intents when we find an non-intent
2549	* item in the AIL.
2550	*/
2551	STATIC int
2552	xlog_recover_process_intents(
2553	struct xlog *log)
2554	{
2555	LIST_HEAD(capture_list);
2556	struct xfs_defer_pending dfp, n;
2557	int error = `0`;
2558	#if defined(DEBUG) \|\| defined(XFS_WARN)
2559	xfs_lsn_t last_lsn;
2560
2561	last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
2562	#endif
2563
2564	list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
2565	ASSERT(xlog_item_is_intent(dfp->dfp_intent));
2566
2567	/*
2568	* We should never see a redo item with a LSN higher than
2569	* the last transaction we found in the log at the start
2570	* of recovery.
2571	*/
2572	ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= `0`);
2573
2574	/*
2575	* NOTE: If your intent processing routine can create more
2576	* deferred ops, you /must/ attach them to the capture list in
2577	* the recover routine or else those subsequent intents will be
2578	* replayed in the wrong order!
2579	*
2580	* The recovery function can free the log item, so we must not
2581	* access dfp->dfp_intent after it returns. It must dispose of
2582	* @dfp if it returns 0.
2583	*/
2584	error = xfs_defer_finish_recovery(log->l_mp, dfp,
2585	&capture_list);
2586	if (error)
2587	break;
2588	}
2589	if (error)
2590	goto err;
2591
2592	error = xlog_finish_defer_ops(mp: log->l_mp, capture_list: &capture_list);
2593	if (error)
2594	goto err;
2595
2596	return `0`;
2597	err:
2598	xlog_abort_defer_ops(mp: log->l_mp, capture_list: &capture_list);
2599	return error;
2600	}
2601
2602	/*
2603	* A cancel occurs when the mount has failed and we're bailing out. Release all
2604	* pending log intent items that we haven't started recovery on so they don't
2605	* pin the AIL.
2606	*/
2607	STATIC void
2608	xlog_recover_cancel_intents(
2609	struct xlog *log)
2610	{
2611	struct xfs_defer_pending dfp, n;
2612
2613	list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
2614	ASSERT(xlog_item_is_intent(dfp->dfp_intent));
2615
2616	xfs_defer_cancel_recovery(log->l_mp, dfp);
2617	}
2618	}
2619
2620	/*
2621	* Transfer ownership of the recovered pending work to the recovery transaction
2622	* and try to finish the work. If there is more work to be done, the dfp will
2623	* remain attached to the transaction. If not, the dfp is freed.
2624	*/
2625	int
2626	xlog_recover_finish_intent(
2627	struct xfs_trans *tp,
2628	struct xfs_defer_pending *dfp)
2629	{
2630	int error;
2631
2632	list_move(list: &dfp->dfp_list, head: &tp->t_dfops);
2633	error = xfs_defer_finish_one(tp, dfp);
2634	if (error == -EAGAIN)
2635	return `0`;
2636	return error;
2637	}
2638
2639	/*
2640	* This routine performs a transaction to null out a bad inode pointer
2641	* in an agi unlinked inode hash bucket.
2642	*/
2643	STATIC void
2644	xlog_recover_clear_agi_bucket(
2645	struct xfs_perag *pag,
2646	int bucket)
2647	{
2648	struct xfs_mount *mp = pag->pag_mount;
2649	struct xfs_trans *tp;
2650	struct xfs_agi *agi;
2651	struct xfs_buf *agibp;
2652	int offset;
2653	int error;
2654
2655	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_clearagi, blocks: `0`, rtextents: `0`, flags: `0`, tpp: &tp);
2656	if (error)
2657	goto out_error;
2658
2659	error = xfs_read_agi(pag, tp, &agibp);
2660	if (error)
2661	goto out_abort;
2662
2663	agi = agibp->b_addr;
2664	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
2665	offset = offsetof(xfs_agi_t, agi_unlinked) +
2666	(sizeof(xfs_agino_t) * bucket);
2667	xfs_trans_log_buf(tp, agibp, offset,
2668	(offset + sizeof(xfs_agino_t) - `1`));
2669
2670	error = xfs_trans_commit(tp);
2671	if (error)
2672	goto out_error;
2673	return;
2674
2675	out_abort:
2676	xfs_trans_cancel(tp);
2677	out_error:
2678	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
2679	pag->pag_agno);
2680	return;
2681	}
2682
2683	static int
2684	xlog_recover_iunlink_bucket(
2685	struct xfs_perag *pag,
2686	struct xfs_agi *agi,
2687	int bucket)
2688	{
2689	struct xfs_mount *mp = pag->pag_mount;
2690	struct xfs_inode *prev_ip = NULL;
2691	struct xfs_inode *ip;
2692	xfs_agino_t prev_agino, agino;
2693	int error = `0`;
2694
2695	agino = be32_to_cpu(agi->agi_unlinked[bucket]);
2696	while (agino != NULLAGINO) {
2697	error = xfs_iget(mp, NULL,
2698	XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
2699	`0`, `0`, &ip);
2700	if (error)
2701	break;
2702
2703	ASSERT(VFS_I(ip)->i_nlink == `0`);
2704	ASSERT(VFS_I(ip)->i_mode != `0`);
2705	xfs_iflags_clear(ip, XFS_IRECOVERY);
2706	agino = ip->i_next_unlinked;
2707
2708	if (prev_ip) {
2709	ip->i_prev_unlinked = prev_agino;
2710	xfs_irele(ip: prev_ip);
2711
2712	/*
2713	* Ensure the inode is removed from the unlinked list
2714	* before we continue so that it won't race with
2715	* building the in-memory list here. This could be
2716	* serialised with the agibp lock, but that just
2717	* serialises via lockstepping and it's much simpler
2718	* just to flush the inodegc queue and wait for it to
2719	* complete.
2720	*/
2721	error = xfs_inodegc_flush(mp);
2722	if (error)
2723	break;
2724	}
2725
2726	prev_agino = agino;
2727	prev_ip = ip;
2728	}
2729
2730	if (prev_ip) {
2731	int error2;
2732
2733	ip->i_prev_unlinked = prev_agino;
2734	xfs_irele(ip: prev_ip);
2735
2736	error2 = xfs_inodegc_flush(mp);
2737	if (error2 && !error)
2738	return error2;
2739	}
2740	return error;
2741	}
2742
2743	/*
2744	* Recover AGI unlinked lists
2745	*
2746	* This is called during recovery to process any inodes which we unlinked but
2747	* not freed when the system crashed. These inodes will be on the lists in the
2748	* AGI blocks. What we do here is scan all the AGIs and fully truncate and free
2749	* any inodes found on the lists. Each inode is removed from the lists when it
2750	* has been fully truncated and is freed. The freeing of the inode and its
2751	* removal from the list must be atomic.
2752	*
2753	* If everything we touch in the agi processing loop is already in memory, this
2754	* loop can hold the cpu for a long time. It runs without lock contention,
2755	* memory allocation contention, the need wait for IO, etc, and so will run
2756	* until we either run out of inodes to process, run low on memory or we run out
2757	* of log space.
2758	*
2759	* This behaviour is bad for latency on single CPU and non-preemptible kernels,
2760	* and can prevent other filesystem work (such as CIL pushes) from running. This
2761	* can lead to deadlocks if the recovery process runs out of log reservation
2762	* space. Hence we need to yield the CPU when there is other kernel work
2763	* scheduled on this CPU to ensure other scheduled work can run without undue
2764	* latency.
2765	*/
2766	static void
2767	xlog_recover_iunlink_ag(
2768	struct xfs_perag *pag)
2769	{
2770	struct xfs_agi *agi;
2771	struct xfs_buf *agibp;
2772	int bucket;
2773	int error;
2774
2775	error = xfs_read_agi(pag, NULL, &agibp);
2776	if (error) {
2777	/*
2778	* AGI is b0rked. Don't process it.
2779	*
2780	* We should probably mark the filesystem as corrupt after we've
2781	* recovered all the ag's we can....
2782	*/
2783	return;
2784	}
2785
2786	/*
2787	* Unlock the buffer so that it can be acquired in the normal course of
2788	* the transaction to truncate and free each inode. Because we are not
2789	* racing with anyone else here for the AGI buffer, we don't even need
2790	* to hold it locked to read the initial unlinked bucket entries out of
2791	* the buffer. We keep buffer reference though, so that it stays pinned
2792	* in memory while we need the buffer.
2793	*/
2794	agi = agibp->b_addr;
2795	xfs_buf_unlock(agibp);
2796
2797	for (bucket = `0`; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
2798	error = xlog_recover_iunlink_bucket(pag, agi, bucket);
2799	if (error) {
2800	/*
2801	* Bucket is unrecoverable, so only a repair scan can
2802	* free the remaining unlinked inodes. Just empty the
2803	* bucket and remaining inodes on it unreferenced and
2804	* unfreeable.
2805	*/
2806	xlog_recover_clear_agi_bucket(pag, bucket);
2807	}
2808	}
2809
2810	xfs_buf_rele(agibp);
2811	}
2812
2813	static void
2814	xlog_recover_process_iunlinks(
2815	struct xlog *log)
2816	{
2817	struct xfs_perag *pag;
2818	xfs_agnumber_t agno;
2819
2820	for_each_perag(log->l_mp, agno, pag)
2821	xlog_recover_iunlink_ag(pag);
2822	}
2823
2824	STATIC void
2825	xlog_unpack_data(
2826	struct xlog_rec_header *rhead,
2827	char *dp,
2828	struct xlog *log)
2829	{
2830	int i, j, k;
2831
2832	for (i = `0`; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
2833	i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
2834	(__be32 )dp = (__be32 )&rhead->h_cycle_data[i];
2835	dp += BBSIZE;
2836	}
2837
2838	if (xfs_has_logv2(mp: log->l_mp)) {
2839	xlog_in_core_2_t xhdr = (xlog_in_core_2_t )rhead;
2840	for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
2841	j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2842	k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2843	(__be32 )dp = xhdr[j].hic_xheader.xh_cycle_data[k];
2844	dp += BBSIZE;
2845	}
2846	}
2847	}
2848
2849	/*
2850	* CRC check, unpack and process a log record.
2851	*/
2852	STATIC int
2853	xlog_recover_process(
2854	struct xlog *log,
2855	struct hlist_head rhash[],
2856	struct xlog_rec_header *rhead,
2857	char *dp,
2858	int pass,
2859	struct list_head *buffer_list)
2860	{
2861	__le32 old_crc = rhead->h_crc;
2862	__le32 crc;
2863
2864	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
2865
2866	/*
2867	* Nothing else to do if this is a CRC verification pass. Just return
2868	* if this a record with a non-zero crc. Unfortunately, mkfs always
2869	* sets old_crc to 0 so we must consider this valid even on v5 supers.
2870	* Otherwise, return EFSBADCRC on failure so the callers up the stack
2871	* know precisely what failed.
2872	*/
2873	if (pass == XLOG_RECOVER_CRCPASS) {
2874	if (old_crc && crc != old_crc)
2875	return -EFSBADCRC;
2876	return `0`;
2877	}
2878
2879	/*
2880	* We're in the normal recovery path. Issue a warning if and only if the
2881	* CRC in the header is non-zero. This is an advisory warning and the
2882	* zero CRC check prevents warnings from being emitted when upgrading
2883	* the kernel from one that does not add CRCs by default.
2884	*/
2885	if (crc != old_crc) {
2886	if (old_crc \|\| xfs_has_crc(mp: log->l_mp)) {
2887	xfs_alert(log->l_mp,
2888	"log record CRC mismatch: found 0x%x, expected 0x%x.",
2889	le32_to_cpu(old_crc),
2890	le32_to_cpu(crc));
2891	xfs_hex_dump(p: dp, length: `32`);
2892	}
2893
2894	/*
2895	* If the filesystem is CRC enabled, this mismatch becomes a
2896	* fatal log corruption failure.
2897	*/
2898	if (xfs_has_crc(mp: log->l_mp)) {
2899	XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
2900	return -EFSCORRUPTED;
2901	}
2902	}
2903
2904	xlog_unpack_data(rhead, dp, log);
2905
2906	return xlog_recover_process_data(log, rhash, rhead, dp, pass,
2907	buffer_list);
2908	}
2909
2910	STATIC int
2911	xlog_valid_rec_header(
2912	struct xlog *log,
2913	struct xlog_rec_header *rhead,
2914	xfs_daddr_t blkno,
2915	int bufsize)
2916	{
2917	int hlen;
2918
2919	if (XFS_IS_CORRUPT(log->l_mp,
2920	rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
2921	return -EFSCORRUPTED;
2922	if (XFS_IS_CORRUPT(log->l_mp,
2923	(!rhead->h_version \|\|
2924	(be32_to_cpu(rhead->h_version) &
2925	(~XLOG_VERSION_OKBITS))))) {
2926	xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
2927	__func__, be32_to_cpu(rhead->h_version));
2928	return -EFSCORRUPTED;
2929	}
2930
2931	/*
2932	* LR body must have data (or it wouldn't have been written)
2933	* and h_len must not be greater than LR buffer size.
2934	*/
2935	hlen = be32_to_cpu(rhead->h_len);
2936	if (XFS_IS_CORRUPT(log->l_mp, hlen <= `0` \|\| hlen > bufsize))
2937	return -EFSCORRUPTED;
2938
2939	if (XFS_IS_CORRUPT(log->l_mp,
2940	blkno > log->l_logBBsize \|\| blkno > INT_MAX))
2941	return -EFSCORRUPTED;
2942	return `0`;
2943	}
2944
2945	/*
2946	* Read the log from tail to head and process the log records found.
2947	* Handle the two cases where the tail and head are in the same cycle
2948	* and where the active portion of the log wraps around the end of
2949	* the physical log separately. The pass parameter is passed through
2950	* to the routines called to process the data and is not looked at
2951	* here.
2952	*/
2953	STATIC int
2954	xlog_do_recovery_pass(
2955	struct xlog *log,
2956	xfs_daddr_t head_blk,
2957	xfs_daddr_t tail_blk,
2958	int pass,
2959	xfs_daddr_t first_bad) /* out: first bad log rec /
2960	{
2961	xlog_rec_header_t *rhead;
2962	xfs_daddr_t blk_no, rblk_no;
2963	xfs_daddr_t rhead_blk;
2964	char *offset;
2965	char hbp, dbp;
2966	int error = `0`, h_size, h_len;
2967	int error2 = `0`;
2968	int bblks, split_bblks;
2969	int hblks, split_hblks, wrapped_hblks;
2970	int i;
2971	struct hlist_head rhash[XLOG_RHASH_SIZE];
2972	LIST_HEAD (buffer_list);
2973
2974	ASSERT(head_blk != tail_blk);
2975	blk_no = rhead_blk = tail_blk;
2976
2977	for (i = `0`; i < XLOG_RHASH_SIZE; i++)
2978	INIT_HLIST_HEAD(&rhash[i]);
2979
2980	/*
2981	* Read the header of the tail block and get the iclog buffer size from
2982	* h_size. Use this to tell how many sectors make up the log header.
2983	*/
2984	if (xfs_has_logv2(mp: log->l_mp)) {
2985	/*
2986	* When using variable length iclogs, read first sector of
2987	* iclog header and extract the header size from it. Get a
2988	* new hbp that is the correct size.
2989	*/
2990	hbp = xlog_alloc_buffer(log, nbblks: `1`);
2991	if (!hbp)
2992	return -ENOMEM;
2993
2994	error = xlog_bread(log, blk_no: tail_blk, nbblks: `1`, data: hbp, offset: &offset);
2995	if (error)
2996	goto bread_err1;
2997
2998	rhead = (xlog_rec_header_t *)offset;
2999
3000	/*
3001	* xfsprogs has a bug where record length is based on lsunit but
3002	* h_size (iclog size) is hardcoded to 32k. Now that we
3003	* unconditionally CRC verify the unmount record, this means the
3004	* log buffer can be too small for the record and cause an
3005	* overrun.
3006	*
3007	* Detect this condition here. Use lsunit for the buffer size as
3008	* long as this looks like the mkfs case. Otherwise, return an
3009	* error to avoid a buffer overrun.
3010	*/
3011	h_size = be32_to_cpu(rhead->h_size);
3012	h_len = be32_to_cpu(rhead->h_len);
3013	if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
3014	rhead->h_num_logops == cpu_to_be32(`1`)) {
3015	xfs_warn(log->l_mp,
3016	"invalid iclog size (%d bytes), using lsunit (%d bytes)",
3017	h_size, log->l_mp->m_logbsize);
3018	h_size = log->l_mp->m_logbsize;
3019	}
3020
3021	error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
3022	if (error)
3023	goto bread_err1;
3024
3025	hblks = xlog_logrec_hblks(log, rhead);
3026	if (hblks != `1`) {
3027	kvfree(addr: hbp);
3028	hbp = xlog_alloc_buffer(log, nbblks: hblks);
3029	}
3030	} else {
3031	ASSERT(log->l_sectBBsize == `1`);
3032	hblks = `1`;
3033	hbp = xlog_alloc_buffer(log, nbblks: `1`);
3034	h_size = XLOG_BIG_RECORD_BSIZE;
3035	}
3036
3037	if (!hbp)
3038	return -ENOMEM;
3039	dbp = xlog_alloc_buffer(log, nbblks: BTOBB(h_size));
3040	if (!dbp) {
3041	kvfree(addr: hbp);
3042	return -ENOMEM;
3043	}
3044
3045	memset(rhash, `0`, sizeof(rhash));
3046	if (tail_blk > head_blk) {
3047	/*
3048	* Perform recovery around the end of the physical log.
3049	* When the head is not on the same cycle number as the tail,
3050	* we can't do a sequential recovery.
3051	*/
3052	while (blk_no < log->l_logBBsize) {
3053	/*
3054	* Check for header wrapping around physical end-of-log
3055	*/
3056	offset = hbp;
3057	split_hblks = `0`;
3058	wrapped_hblks = `0`;
3059	if (blk_no + hblks <= log->l_logBBsize) {
3060	/ Read header in one read /
3061	error = xlog_bread(log, blk_no, nbblks: hblks, data: hbp,
3062	offset: &offset);
3063	if (error)
3064	goto bread_err2;
3065	} else {
3066	/ This LR is split across physical log end /
3067	if (blk_no != log->l_logBBsize) {
3068	/ some data before physical log end /
3069	ASSERT(blk_no <= INT_MAX);
3070	split_hblks = log->l_logBBsize - (int)blk_no;
3071	ASSERT(split_hblks > `0`);
3072	error = xlog_bread(log, blk_no,
3073	nbblks: split_hblks, data: hbp,
3074	offset: &offset);
3075	if (error)
3076	goto bread_err2;
3077	}
3078
3079	/*
3080	* Note: this black magic still works with
3081	* large sector sizes (non-512) only because:
3082	* - we increased the buffer size originally
3083	* by 1 sector giving us enough extra space
3084	* for the second read;
3085	* - the log start is guaranteed to be sector
3086	* aligned;
3087	* - we read the log end (LR header start)
3088	* _first_, then the log start (LR header end)
3089	* - order is important.
3090	*/
3091	wrapped_hblks = hblks - split_hblks;
3092	error = xlog_bread_noalign(log, blk_no: `0`,
3093	nbblks: wrapped_hblks,
3094	data: offset + BBTOB(split_hblks));
3095	if (error)
3096	goto bread_err2;
3097	}
3098	rhead = (xlog_rec_header_t *)offset;
3099	error = xlog_valid_rec_header(log, rhead,
3100	split_hblks ? blk_no : `0`, h_size);
3101	if (error)
3102	goto bread_err2;
3103
3104	bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3105	blk_no += hblks;
3106
3107	/*
3108	* Read the log record data in multiple reads if it
3109	* wraps around the end of the log. Note that if the
3110	* header already wrapped, blk_no could point past the
3111	* end of the log. The record data is contiguous in
3112	* that case.
3113	*/
3114	if (blk_no + bblks <= log->l_logBBsize \|\|
3115	blk_no >= log->l_logBBsize) {
3116	rblk_no = xlog_wrap_logbno(log, bno: blk_no);
3117	error = xlog_bread(log, blk_no: rblk_no, nbblks: bblks, data: dbp,
3118	offset: &offset);
3119	if (error)
3120	goto bread_err2;
3121	} else {
3122	/ This log record is split across the*
3123	* physical end of log */
3124	offset = dbp;
3125	split_bblks = `0`;
3126	if (blk_no != log->l_logBBsize) {
3127	/ some data is before the physical*
3128	* end of log */
3129	ASSERT(!wrapped_hblks);
3130	ASSERT(blk_no <= INT_MAX);
3131	split_bblks =
3132	log->l_logBBsize - (int)blk_no;
3133	ASSERT(split_bblks > `0`);
3134	error = xlog_bread(log, blk_no,
3135	nbblks: split_bblks, data: dbp,
3136	offset: &offset);
3137	if (error)
3138	goto bread_err2;
3139	}
3140
3141	/*
3142	* Note: this black magic still works with
3143	* large sector sizes (non-512) only because:
3144	* - we increased the buffer size originally
3145	* by 1 sector giving us enough extra space
3146	* for the second read;
3147	* - the log start is guaranteed to be sector
3148	* aligned;
3149	* - we read the log end (LR header start)
3150	* _first_, then the log start (LR header end)
3151	* - order is important.
3152	*/
3153	error = xlog_bread_noalign(log, blk_no: `0`,
3154	nbblks: bblks - split_bblks,
3155	data: offset + BBTOB(split_bblks));
3156	if (error)
3157	goto bread_err2;
3158	}
3159
3160	error = xlog_recover_process(log, rhash, rhead, offset,
3161	pass, &buffer_list);
3162	if (error)
3163	goto bread_err2;
3164
3165	blk_no += bblks;
3166	rhead_blk = blk_no;
3167	}
3168
3169	ASSERT(blk_no >= log->l_logBBsize);
3170	blk_no -= log->l_logBBsize;
3171	rhead_blk = blk_no;
3172	}
3173
3174	/ read first part of physical log /
3175	while (blk_no < head_blk) {
3176	error = xlog_bread(log, blk_no, nbblks: hblks, data: hbp, offset: &offset);
3177	if (error)
3178	goto bread_err2;
3179
3180	rhead = (xlog_rec_header_t *)offset;
3181	error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
3182	if (error)
3183	goto bread_err2;
3184
3185	/ blocks in data section /
3186	bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3187	error = xlog_bread(log, blk_no: blk_no+hblks, nbblks: bblks, data: dbp,
3188	offset: &offset);
3189	if (error)
3190	goto bread_err2;
3191
3192	error = xlog_recover_process(log, rhash, rhead, offset, pass,
3193	&buffer_list);
3194	if (error)
3195	goto bread_err2;
3196
3197	blk_no += bblks + hblks;
3198	rhead_blk = blk_no;
3199	}
3200
3201	bread_err2:
3202	kvfree(addr: dbp);
3203	bread_err1:
3204	kvfree(addr: hbp);
3205
3206	/*
3207	* Submit buffers that have been dirtied by the last record recovered.
3208	*/
3209	if (!list_empty(head: &buffer_list)) {
3210	if (error) {
3211	/*
3212	* If there has been an item recovery error then we
3213	* cannot allow partial checkpoint writeback to
3214	* occur. We might have multiple checkpoints with the
3215	* same start LSN in this buffer list, and partial
3216	* writeback of a checkpoint in this situation can
3217	* prevent future recovery of all the changes in the
3218	* checkpoints at this start LSN.
3219	*
3220	* Note: Shutting down the filesystem will result in the
3221	* delwri submission marking all the buffers stale,
3222	* completing them and cleaning up _XBF_LOGRECOVERY
3223	* state without doing any IO.
3224	*/
3225	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3226	}
3227	error2 = xfs_buf_delwri_submit(&buffer_list);
3228	}
3229
3230	if (error && first_bad)
3231	*first_bad = rhead_blk;
3232
3233	/*
3234	* Transactions are freed at commit time but transactions without commit
3235	* records on disk are never committed. Free any that may be left in the
3236	* hash table.
3237	*/
3238	for (i = `0`; i < XLOG_RHASH_SIZE; i++) {
3239	struct hlist_node *tmp;
3240	struct xlog_recover *trans;
3241
3242	hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
3243	xlog_recover_free_trans(trans);
3244	}
3245
3246	return error ? error : error2;
3247	}
3248
3249	/*
3250	* Do the recovery of the log. We actually do this in two phases.
3251	* The two passes are necessary in order to implement the function
3252	* of cancelling a record written into the log. The first pass
3253	* determines those things which have been cancelled, and the
3254	* second pass replays log items normally except for those which
3255	* have been cancelled. The handling of the replay and cancellations
3256	* takes place in the log item type specific routines.
3257	*
3258	* The table of items which have cancel records in the log is allocated
3259	* and freed at this level, since only here do we know when all of
3260	* the log recovery has been completed.
3261	*/
3262	STATIC int
3263	xlog_do_log_recovery(
3264	struct xlog *log,
3265	xfs_daddr_t head_blk,
3266	xfs_daddr_t tail_blk)
3267	{
3268	int error;
3269
3270	ASSERT(head_blk != tail_blk);
3271
3272	/*
3273	* First do a pass to find all of the cancelled buf log items.
3274	* Store them in the buf_cancel_table for use in the second pass.
3275	*/
3276	error = xlog_alloc_buf_cancel_table(log);
3277	if (error)
3278	return error;
3279
3280	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3281	XLOG_RECOVER_PASS1, NULL);
3282	if (error != `0`)
3283	goto out_cancel;
3284
3285	/*
3286	* Then do a second pass to actually recover the items in the log.
3287	* When it is complete free the table of buf cancel items.
3288	*/
3289	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3290	XLOG_RECOVER_PASS2, NULL);
3291	if (!error)
3292	xlog_check_buf_cancel_table(log);
3293	out_cancel:
3294	xlog_free_buf_cancel_table(log);
3295	return error;
3296	}
3297
3298	/*
3299	* Do the actual recovery
3300	*/
3301	STATIC int
3302	xlog_do_recover(
3303	struct xlog *log,
3304	xfs_daddr_t head_blk,
3305	xfs_daddr_t tail_blk)
3306	{
3307	struct xfs_mount *mp = log->l_mp;
3308	struct xfs_buf *bp = mp->m_sb_bp;
3309	struct xfs_sb *sbp = &mp->m_sb;
3310	int error;
3311
3312	trace_xfs_log_recover(log, headblk: head_blk, tailblk: tail_blk);
3313
3314	/*
3315	* First replay the images in the log.
3316	*/
3317	error = xlog_do_log_recovery(log, head_blk, tail_blk);
3318	if (error)
3319	return error;
3320
3321	if (xlog_is_shutdown(log))
3322	return -EIO;
3323
3324	/*
3325	* We now update the tail_lsn since much of the recovery has completed
3326	* and there may be space available to use. If there were no extent
3327	* or iunlinks, we can free up the entire log and set the tail_lsn to
3328	* be the last_sync_lsn. This was set in xlog_find_tail to be the
3329	* lsn of the last known good LR on disk. If there are extent frees
3330	* or iunlinks they will have some entries in the AIL; so we look at
3331	* the AIL to determine how to set the tail_lsn.
3332	*/
3333	xlog_assign_tail_lsn(mp);
3334
3335	/*
3336	* Now that we've finished replaying all buffer and inode updates,
3337	* re-read the superblock and reverify it.
3338	*/
3339	xfs_buf_lock(bp);
3340	xfs_buf_hold(bp);
3341	error = _xfs_buf_read(bp, XBF_READ);
3342	if (error) {
3343	if (!xlog_is_shutdown(log)) {
3344	xfs_buf_ioerror_alert(bp, __this_address);
3345	ASSERT(`0`);
3346	}
3347	xfs_buf_relse(bp);
3348	return error;
3349	}
3350
3351	/ Convert superblock from on-disk format /
3352	xfs_sb_from_disk(sbp, bp->b_addr);
3353	xfs_buf_relse(bp);
3354
3355	/ re-initialise in-core superblock and geometry structures /
3356	mp->m_features \|= xfs_sb_version_to_features(sbp);
3357	xfs_reinit_percpu_counters(mp);
3358	error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
3359	&mp->m_maxagi);
3360	if (error) {
3361	xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
3362	return error;
3363	}
3364	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
3365
3366	/ Normal transactions can now occur /
3367	clear_bit(XLOG_ACTIVE_RECOVERY, addr: &log->l_opstate);
3368	return `0`;
3369	}
3370
3371	/*
3372	* Perform recovery and re-initialize some log variables in xlog_find_tail.
3373	*
3374	* Return error or zero.
3375	*/
3376	int
3377	xlog_recover(
3378	struct xlog *log)
3379	{
3380	xfs_daddr_t head_blk, tail_blk;
3381	int error;
3382
3383	/ find the tail of the log /
3384	error = xlog_find_tail(log, head_blk: &head_blk, tail_blk: &tail_blk);
3385	if (error)
3386	return error;
3387
3388	/*
3389	* The superblock was read before the log was available and thus the LSN
3390	* could not be verified. Check the superblock LSN against the current
3391	* LSN now that it's known.
3392	*/
3393	if (xfs_has_crc(mp: log->l_mp) &&
3394	!xfs_log_check_lsn(log->l_mp, xfs_lsn_t: log->l_mp->m_sb.sb_lsn))
3395	return -EINVAL;
3396
3397	if (tail_blk != head_blk) {
3398	/ There used to be a comment here:*
3399	*
3400	* disallow recovery on read-only mounts. note -- mount
3401	* checks for ENOSPC and turns it into an intelligent
3402	* error message.
3403	* ...but this is no longer true. Now, unless you specify
3404	* NORECOVERY (in which case this function would never be
3405	* called), we just go ahead and recover. We do this all
3406	* under the vfs layer, so we can get away with it unless
3407	* the device itself is read-only, in which case we fail.
3408	*/
3409	if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3410	return error;
3411	}
3412
3413	/*
3414	* Version 5 superblock log feature mask validation. We know the
3415	* log is dirty so check if there are any unknown log features
3416	* in what we need to recover. If there are unknown features
3417	* (e.g. unsupported transactions, then simply reject the
3418	* attempt at recovery before touching anything.
3419	*/
3420	if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
3421	xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
3422	XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
3423	xfs_warn(log->l_mp,
3424	"Superblock has unknown incompatible log features (0x%x) enabled.",
3425	(log->l_mp->m_sb.sb_features_log_incompat &
3426	XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
3427	xfs_warn(log->l_mp,
3428	"The log can not be fully and/or safely recovered by this kernel.");
3429	xfs_warn(log->l_mp,
3430	"Please recover the log on a kernel that supports the unknown features.");
3431	return -EINVAL;
3432	}
3433
3434	/*
3435	* Delay log recovery if the debug hook is set. This is debug
3436	* instrumentation to coordinate simulation of I/O failures with
3437	* log recovery.
3438	*/
3439	if (xfs_globals.log_recovery_delay) {
3440	xfs_notice(log->l_mp,
3441	"Delaying log recovery for %d seconds.",
3442	xfs_globals.log_recovery_delay);
3443	msleep(msecs: xfs_globals.log_recovery_delay * `1000`);
3444	}
3445
3446	xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3447	log->l_mp->m_logname ? log->l_mp->m_logname
3448	: "internal");
3449
3450	error = xlog_do_recover(log, head_blk, tail_blk);
3451	set_bit(XLOG_RECOVERY_NEEDED, addr: &log->l_opstate);
3452	}
3453	return error;
3454	}
3455
3456	/*
3457	* In the first part of recovery we replay inodes and buffers and build up the
3458	* list of intents which need to be processed. Here we process the intents and
3459	* clean up the on disk unlinked inode lists. This is separated from the first
3460	* part of recovery so that the root and real-time bitmap inodes can be read in
3461	* from disk in between the two stages. This is necessary so that we can free
3462	* space in the real-time portion of the file system.
3463	*
3464	* We run this whole process under GFP_NOFS allocation context. We do a
3465	* combination of non-transactional and transactional work, yet we really don't
3466	* want to recurse into the filesystem from direct reclaim during any of this
3467	* processing. This allows all the recovery code run here not to care about the
3468	* memory allocation context it is running in.
3469	*/
3470	int
3471	xlog_recover_finish(
3472	struct xlog *log)
3473	{
3474	unsigned int nofs_flags = memalloc_nofs_save();
3475	int error;
3476
3477	error = xlog_recover_process_intents(log);
3478	if (error) {
3479	/*
3480	* Cancel all the unprocessed intent items now so that we don't
3481	* leave them pinned in the AIL. This can cause the AIL to
3482	* livelock on the pinned item if anyone tries to push the AIL
3483	* (inode reclaim does this) before we get around to
3484	* xfs_log_mount_cancel.
3485	*/
3486	xlog_recover_cancel_intents(log);
3487	xfs_alert(log->l_mp, "Failed to recover intents");
3488	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3489	goto out_error;
3490	}
3491
3492	/*
3493	* Sync the log to get all the intents out of the AIL. This isn't
3494	* absolutely necessary, but it helps in case the unlink transactions
3495	* would have problems pushing the intents out of the way.
3496	*/
3497	xfs_log_force(mp: log->l_mp, XFS_LOG_SYNC);
3498
3499	/*
3500	* Now that we've recovered the log and all the intents, we can clear
3501	* the log incompat feature bits in the superblock because there's no
3502	* longer anything to protect. We rely on the AIL push to write out the
3503	* updated superblock after everything else.
3504	*/
3505	if (xfs_clear_incompat_log_features(mp: log->l_mp)) {
3506	error = xfs_sync_sb(log->l_mp, false);
3507	if (error < `0`) {
3508	xfs_alert(log->l_mp,
3509	"Failed to clear log incompat features on recovery");
3510	goto out_error;
3511	}
3512	}
3513
3514	xlog_recover_process_iunlinks(log);
3515
3516	/*
3517	* Recover any CoW staging blocks that are still referenced by the
3518	* ondisk refcount metadata. During mount there cannot be any live
3519	* staging extents as we have not permitted any user modifications.
3520	* Therefore, it is safe to free them all right now, even on a
3521	* read-only mount.
3522	*/
3523	error = xfs_reflink_recover_cow(mp: log->l_mp);
3524	if (error) {
3525	xfs_alert(log->l_mp,
3526	"Failed to recover leftover CoW staging extents, err %d.",
3527	error);
3528	/*
3529	* If we get an error here, make sure the log is shut down
3530	* but return zero so that any log items committed since the
3531	* end of intents processing can be pushed through the CIL
3532	* and AIL.
3533	*/
3534	xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3535	error = `0`;
3536	goto out_error;
3537	}
3538
3539	out_error:
3540	memalloc_nofs_restore(flags: nofs_flags);
3541	return error;
3542	}
3543
/*
 * Cancel any intents left over from recovery.  Nothing is done unless
 * xlog_recovery_needed() reports that recovery was flagged for this log.
 */
void
xlog_recover_cancel(
	struct xlog	*log)
{
	if (xlog_recovery_needed(log))
		xlog_recover_cancel_intents(log);
}
3551
3552

source code of linux/fs/xfs/xfs_log_recover.c