1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (c) 2000-2006 Silicon Graphics, Inc. |
4 | * All Rights Reserved. |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_log_format.h" |
11 | #include "xfs_trans_resv.h" |
12 | #include "xfs_bit.h" |
13 | #include "xfs_mount.h" |
14 | #include "xfs_trans.h" |
15 | #include "xfs_buf_item.h" |
16 | #include "xfs_trans_priv.h" |
17 | #include "xfs_trace.h" |
18 | #include "xfs_log.h" |
19 | #include "xfs_log_priv.h" |
20 | #include "xfs_log_recover.h" |
21 | #include "xfs_error.h" |
22 | #include "xfs_inode.h" |
23 | #include "xfs_dir2.h" |
24 | #include "xfs_quota.h" |
25 | |
/*
 * This is the number of entries (buckets) in the l_buf_cancel_table used
 * during recovery.
 */
#define XLOG_BC_TABLE_SIZE	64

/*
 * Pick the l_buf_cancel_table bucket for a block number.
 *
 * The block number is converted to an unsigned 64-bit value and reduced modulo
 * the table size.  Every macro argument is parenthesised so that an expression
 * argument (for example "start + offset") is cast and reduced as a whole
 * rather than only its first operand.
 */
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
	((log)->l_buf_cancel_table + ((uint64_t)(blkno) % XLOG_BC_TABLE_SIZE))
34 | |
35 | /* |
36 | * This structure is used during recovery to record the buf log items which |
37 | * have been canceled and should not be replayed. |
38 | */ |
39 | struct xfs_buf_cancel { |
40 | xfs_daddr_t bc_blkno; |
41 | uint bc_len; |
42 | int bc_refcount; |
43 | struct list_head bc_list; |
44 | }; |
45 | |
46 | static struct xfs_buf_cancel * |
47 | xlog_find_buffer_cancelled( |
48 | struct xlog *log, |
49 | xfs_daddr_t blkno, |
50 | uint len) |
51 | { |
52 | struct list_head *bucket; |
53 | struct xfs_buf_cancel *bcp; |
54 | |
55 | if (!log->l_buf_cancel_table) |
56 | return NULL; |
57 | |
58 | bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); |
59 | list_for_each_entry(bcp, bucket, bc_list) { |
60 | if (bcp->bc_blkno == blkno && bcp->bc_len == len) |
61 | return bcp; |
62 | } |
63 | |
64 | return NULL; |
65 | } |
66 | |
67 | static bool |
68 | xlog_add_buffer_cancelled( |
69 | struct xlog *log, |
70 | xfs_daddr_t blkno, |
71 | uint len) |
72 | { |
73 | struct xfs_buf_cancel *bcp; |
74 | |
75 | /* |
76 | * If we find an existing cancel record, this indicates that the buffer |
77 | * was cancelled multiple times. To ensure that during pass 2 we keep |
78 | * the record in the table until we reach its last occurrence in the |
79 | * log, a reference count is kept to tell how many times we expect to |
80 | * see this record during the second pass. |
81 | */ |
82 | bcp = xlog_find_buffer_cancelled(log, blkno, len); |
83 | if (bcp) { |
84 | bcp->bc_refcount++; |
85 | return false; |
86 | } |
87 | |
88 | bcp = kmalloc(size: sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL); |
89 | bcp->bc_blkno = blkno; |
90 | bcp->bc_len = len; |
91 | bcp->bc_refcount = 1; |
92 | list_add_tail(new: &bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); |
93 | return true; |
94 | } |
95 | |
96 | /* |
97 | * Check if there is and entry for blkno, len in the buffer cancel record table. |
98 | */ |
99 | bool |
100 | xlog_is_buffer_cancelled( |
101 | struct xlog *log, |
102 | xfs_daddr_t blkno, |
103 | uint len) |
104 | { |
105 | return xlog_find_buffer_cancelled(log, blkno, len) != NULL; |
106 | } |
107 | |
108 | /* |
109 | * Check if there is and entry for blkno, len in the buffer cancel record table, |
110 | * and decremented the reference count on it if there is one. |
111 | * |
112 | * Remove the cancel record once the refcount hits zero, so that if the same |
113 | * buffer is re-used again after its last cancellation we actually replay the |
114 | * changes made at that point. |
115 | */ |
116 | static bool |
117 | xlog_put_buffer_cancelled( |
118 | struct xlog *log, |
119 | xfs_daddr_t blkno, |
120 | uint len) |
121 | { |
122 | struct xfs_buf_cancel *bcp; |
123 | |
124 | bcp = xlog_find_buffer_cancelled(log, blkno, len); |
125 | if (!bcp) { |
126 | ASSERT(0); |
127 | return false; |
128 | } |
129 | |
130 | if (--bcp->bc_refcount == 0) { |
131 | list_del(entry: &bcp->bc_list); |
132 | kfree(objp: bcp); |
133 | } |
134 | return true; |
135 | } |
136 | |
137 | /* log buffer item recovery */ |
138 | |
139 | /* |
140 | * Sort buffer items for log recovery. Most buffer items should end up on the |
141 | * buffer list and are recovered first, with the following exceptions: |
142 | * |
143 | * 1. XFS_BLF_CANCEL buffers must be processed last because some log items |
144 | * might depend on the incor ecancellation record, and replaying a cancelled |
145 | * buffer item can remove the incore record. |
146 | * |
147 | * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that |
148 | * we replay di_next_unlinked only after flushing the inode 'free' state |
149 | * to the inode buffer. |
150 | * |
151 | * See xlog_recover_reorder_trans for more details. |
152 | */ |
153 | STATIC enum xlog_recover_reorder |
154 | xlog_recover_buf_reorder( |
155 | struct xlog_recover_item *item) |
156 | { |
157 | struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; |
158 | |
159 | if (buf_f->blf_flags & XFS_BLF_CANCEL) |
160 | return XLOG_REORDER_CANCEL_LIST; |
161 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) |
162 | return XLOG_REORDER_INODE_BUFFER_LIST; |
163 | return XLOG_REORDER_BUFFER_LIST; |
164 | } |
165 | |
166 | STATIC void |
167 | xlog_recover_buf_ra_pass2( |
168 | struct xlog *log, |
169 | struct xlog_recover_item *item) |
170 | { |
171 | struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; |
172 | |
173 | xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); |
174 | } |
175 | |
176 | /* |
177 | * Build up the table of buf cancel records so that we don't replay cancelled |
178 | * data in the second pass. |
179 | */ |
180 | static int |
181 | xlog_recover_buf_commit_pass1( |
182 | struct xlog *log, |
183 | struct xlog_recover_item *item) |
184 | { |
185 | struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; |
186 | |
187 | if (!xfs_buf_log_check_iovec(iovec: &item->ri_buf[0])) { |
188 | xfs_err(log->l_mp, "bad buffer log item size (%d)" , |
189 | item->ri_buf[0].i_len); |
190 | return -EFSCORRUPTED; |
191 | } |
192 | |
193 | if (!(bf->blf_flags & XFS_BLF_CANCEL)) |
194 | trace_xfs_log_recover_buf_not_cancel(log, buf_f: bf); |
195 | else if (xlog_add_buffer_cancelled(log, blkno: bf->blf_blkno, len: bf->blf_len)) |
196 | trace_xfs_log_recover_buf_cancel_add(log, buf_f: bf); |
197 | else |
198 | trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f: bf); |
199 | return 0; |
200 | } |
201 | |
202 | /* |
203 | * Validate the recovered buffer is of the correct type and attach the |
204 | * appropriate buffer operations to them for writeback. Magic numbers are in a |
205 | * few places: |
206 | * the first 16 bits of the buffer (inode buffer, dquot buffer), |
207 | * the first 32 bits of the buffer (most blocks), |
208 | * inside a struct xfs_da_blkinfo at the start of the buffer. |
209 | */ |
210 | static void |
211 | xlog_recover_validate_buf_type( |
212 | struct xfs_mount *mp, |
213 | struct xfs_buf *bp, |
214 | struct xfs_buf_log_format *buf_f, |
215 | xfs_lsn_t current_lsn) |
216 | { |
217 | struct xfs_da_blkinfo *info = bp->b_addr; |
218 | uint32_t magic32; |
219 | uint16_t magic16; |
220 | uint16_t magicda; |
221 | char *warnmsg = NULL; |
222 | |
223 | /* |
224 | * We can only do post recovery validation on items on CRC enabled |
225 | * fielsystems as we need to know when the buffer was written to be able |
226 | * to determine if we should have replayed the item. If we replay old |
227 | * metadata over a newer buffer, then it will enter a temporarily |
228 | * inconsistent state resulting in verification failures. Hence for now |
229 | * just avoid the verification stage for non-crc filesystems |
230 | */ |
231 | if (!xfs_has_crc(mp)) |
232 | return; |
233 | |
234 | magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); |
235 | magic16 = be16_to_cpu(*(__be16*)bp->b_addr); |
236 | magicda = be16_to_cpu(info->magic); |
237 | switch (xfs_blft_from_flags(buf_f)) { |
238 | case XFS_BLFT_BTREE_BUF: |
239 | switch (magic32) { |
240 | case XFS_ABTB_CRC_MAGIC: |
241 | case XFS_ABTB_MAGIC: |
242 | bp->b_ops = &xfs_bnobt_buf_ops; |
243 | break; |
244 | case XFS_ABTC_CRC_MAGIC: |
245 | case XFS_ABTC_MAGIC: |
246 | bp->b_ops = &xfs_cntbt_buf_ops; |
247 | break; |
248 | case XFS_IBT_CRC_MAGIC: |
249 | case XFS_IBT_MAGIC: |
250 | bp->b_ops = &xfs_inobt_buf_ops; |
251 | break; |
252 | case XFS_FIBT_CRC_MAGIC: |
253 | case XFS_FIBT_MAGIC: |
254 | bp->b_ops = &xfs_finobt_buf_ops; |
255 | break; |
256 | case XFS_BMAP_CRC_MAGIC: |
257 | case XFS_BMAP_MAGIC: |
258 | bp->b_ops = &xfs_bmbt_buf_ops; |
259 | break; |
260 | case XFS_RMAP_CRC_MAGIC: |
261 | bp->b_ops = &xfs_rmapbt_buf_ops; |
262 | break; |
263 | case XFS_REFC_CRC_MAGIC: |
264 | bp->b_ops = &xfs_refcountbt_buf_ops; |
265 | break; |
266 | default: |
267 | warnmsg = "Bad btree block magic!" ; |
268 | break; |
269 | } |
270 | break; |
271 | case XFS_BLFT_AGF_BUF: |
272 | if (magic32 != XFS_AGF_MAGIC) { |
273 | warnmsg = "Bad AGF block magic!" ; |
274 | break; |
275 | } |
276 | bp->b_ops = &xfs_agf_buf_ops; |
277 | break; |
278 | case XFS_BLFT_AGFL_BUF: |
279 | if (magic32 != XFS_AGFL_MAGIC) { |
280 | warnmsg = "Bad AGFL block magic!" ; |
281 | break; |
282 | } |
283 | bp->b_ops = &xfs_agfl_buf_ops; |
284 | break; |
285 | case XFS_BLFT_AGI_BUF: |
286 | if (magic32 != XFS_AGI_MAGIC) { |
287 | warnmsg = "Bad AGI block magic!" ; |
288 | break; |
289 | } |
290 | bp->b_ops = &xfs_agi_buf_ops; |
291 | break; |
292 | case XFS_BLFT_UDQUOT_BUF: |
293 | case XFS_BLFT_PDQUOT_BUF: |
294 | case XFS_BLFT_GDQUOT_BUF: |
295 | #ifdef CONFIG_XFS_QUOTA |
296 | if (magic16 != XFS_DQUOT_MAGIC) { |
297 | warnmsg = "Bad DQUOT block magic!" ; |
298 | break; |
299 | } |
300 | bp->b_ops = &xfs_dquot_buf_ops; |
301 | #else |
302 | xfs_alert(mp, |
303 | "Trying to recover dquots without QUOTA support built in!" ); |
304 | ASSERT(0); |
305 | #endif |
306 | break; |
307 | case XFS_BLFT_DINO_BUF: |
308 | if (magic16 != XFS_DINODE_MAGIC) { |
309 | warnmsg = "Bad INODE block magic!" ; |
310 | break; |
311 | } |
312 | bp->b_ops = &xfs_inode_buf_ops; |
313 | break; |
314 | case XFS_BLFT_SYMLINK_BUF: |
315 | if (magic32 != XFS_SYMLINK_MAGIC) { |
316 | warnmsg = "Bad symlink block magic!" ; |
317 | break; |
318 | } |
319 | bp->b_ops = &xfs_symlink_buf_ops; |
320 | break; |
321 | case XFS_BLFT_DIR_BLOCK_BUF: |
322 | if (magic32 != XFS_DIR2_BLOCK_MAGIC && |
323 | magic32 != XFS_DIR3_BLOCK_MAGIC) { |
324 | warnmsg = "Bad dir block magic!" ; |
325 | break; |
326 | } |
327 | bp->b_ops = &xfs_dir3_block_buf_ops; |
328 | break; |
329 | case XFS_BLFT_DIR_DATA_BUF: |
330 | if (magic32 != XFS_DIR2_DATA_MAGIC && |
331 | magic32 != XFS_DIR3_DATA_MAGIC) { |
332 | warnmsg = "Bad dir data magic!" ; |
333 | break; |
334 | } |
335 | bp->b_ops = &xfs_dir3_data_buf_ops; |
336 | break; |
337 | case XFS_BLFT_DIR_FREE_BUF: |
338 | if (magic32 != XFS_DIR2_FREE_MAGIC && |
339 | magic32 != XFS_DIR3_FREE_MAGIC) { |
340 | warnmsg = "Bad dir3 free magic!" ; |
341 | break; |
342 | } |
343 | bp->b_ops = &xfs_dir3_free_buf_ops; |
344 | break; |
345 | case XFS_BLFT_DIR_LEAF1_BUF: |
346 | if (magicda != XFS_DIR2_LEAF1_MAGIC && |
347 | magicda != XFS_DIR3_LEAF1_MAGIC) { |
348 | warnmsg = "Bad dir leaf1 magic!" ; |
349 | break; |
350 | } |
351 | bp->b_ops = &xfs_dir3_leaf1_buf_ops; |
352 | break; |
353 | case XFS_BLFT_DIR_LEAFN_BUF: |
354 | if (magicda != XFS_DIR2_LEAFN_MAGIC && |
355 | magicda != XFS_DIR3_LEAFN_MAGIC) { |
356 | warnmsg = "Bad dir leafn magic!" ; |
357 | break; |
358 | } |
359 | bp->b_ops = &xfs_dir3_leafn_buf_ops; |
360 | break; |
361 | case XFS_BLFT_DA_NODE_BUF: |
362 | if (magicda != XFS_DA_NODE_MAGIC && |
363 | magicda != XFS_DA3_NODE_MAGIC) { |
364 | warnmsg = "Bad da node magic!" ; |
365 | break; |
366 | } |
367 | bp->b_ops = &xfs_da3_node_buf_ops; |
368 | break; |
369 | case XFS_BLFT_ATTR_LEAF_BUF: |
370 | if (magicda != XFS_ATTR_LEAF_MAGIC && |
371 | magicda != XFS_ATTR3_LEAF_MAGIC) { |
372 | warnmsg = "Bad attr leaf magic!" ; |
373 | break; |
374 | } |
375 | bp->b_ops = &xfs_attr3_leaf_buf_ops; |
376 | break; |
377 | case XFS_BLFT_ATTR_RMT_BUF: |
378 | if (magic32 != XFS_ATTR3_RMT_MAGIC) { |
379 | warnmsg = "Bad attr remote magic!" ; |
380 | break; |
381 | } |
382 | bp->b_ops = &xfs_attr3_rmt_buf_ops; |
383 | break; |
384 | case XFS_BLFT_SB_BUF: |
385 | if (magic32 != XFS_SB_MAGIC) { |
386 | warnmsg = "Bad SB block magic!" ; |
387 | break; |
388 | } |
389 | bp->b_ops = &xfs_sb_buf_ops; |
390 | break; |
391 | #ifdef CONFIG_XFS_RT |
392 | case XFS_BLFT_RTBITMAP_BUF: |
393 | case XFS_BLFT_RTSUMMARY_BUF: |
394 | /* no magic numbers for verification of RT buffers */ |
395 | bp->b_ops = &xfs_rtbuf_ops; |
396 | break; |
397 | #endif /* CONFIG_XFS_RT */ |
398 | default: |
399 | xfs_warn(mp, "Unknown buffer type %d!" , |
400 | xfs_blft_from_flags(buf_f)); |
401 | break; |
402 | } |
403 | |
404 | /* |
405 | * Nothing else to do in the case of a NULL current LSN as this means |
406 | * the buffer is more recent than the change in the log and will be |
407 | * skipped. |
408 | */ |
409 | if (current_lsn == NULLCOMMITLSN) |
410 | return; |
411 | |
412 | if (warnmsg) { |
413 | xfs_warn(mp, warnmsg); |
414 | ASSERT(0); |
415 | } |
416 | |
417 | /* |
418 | * We must update the metadata LSN of the buffer as it is written out to |
419 | * ensure that older transactions never replay over this one and corrupt |
420 | * the buffer. This can occur if log recovery is interrupted at some |
421 | * point after the current transaction completes, at which point a |
422 | * subsequent mount starts recovery from the beginning. |
423 | * |
424 | * Write verifiers update the metadata LSN from log items attached to |
425 | * the buffer. Therefore, initialize a bli purely to carry the LSN to |
426 | * the verifier. |
427 | */ |
428 | if (bp->b_ops) { |
429 | struct xfs_buf_log_item *bip; |
430 | |
431 | bp->b_flags |= _XBF_LOGRECOVERY; |
432 | xfs_buf_item_init(bp, mp); |
433 | bip = bp->b_log_item; |
434 | bip->bli_item.li_lsn = current_lsn; |
435 | } |
436 | } |
437 | |
438 | /* |
439 | * Perform a 'normal' buffer recovery. Each logged region of the |
440 | * buffer should be copied over the corresponding region in the |
441 | * given buffer. The bitmap in the buf log format structure indicates |
442 | * where to place the logged data. |
443 | */ |
444 | STATIC void |
445 | xlog_recover_do_reg_buffer( |
446 | struct xfs_mount *mp, |
447 | struct xlog_recover_item *item, |
448 | struct xfs_buf *bp, |
449 | struct xfs_buf_log_format *buf_f, |
450 | xfs_lsn_t current_lsn) |
451 | { |
452 | int i; |
453 | int bit; |
454 | int nbits; |
455 | xfs_failaddr_t fa; |
456 | const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); |
457 | |
458 | trace_xfs_log_recover_buf_reg_buf(log: mp->m_log, buf_f); |
459 | |
460 | bit = 0; |
461 | i = 1; /* 0 is the buf format structure */ |
462 | while (1) { |
463 | bit = xfs_next_bit(buf_f->blf_data_map, |
464 | buf_f->blf_map_size, bit); |
465 | if (bit == -1) |
466 | break; |
467 | nbits = xfs_contig_bits(buf_f->blf_data_map, |
468 | buf_f->blf_map_size, bit); |
469 | ASSERT(nbits > 0); |
470 | ASSERT(item->ri_buf[i].i_addr != NULL); |
471 | ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); |
472 | ASSERT(BBTOB(bp->b_length) >= |
473 | ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); |
474 | |
475 | /* |
476 | * The dirty regions logged in the buffer, even though |
477 | * contiguous, may span multiple chunks. This is because the |
478 | * dirty region may span a physical page boundary in a buffer |
479 | * and hence be split into two separate vectors for writing into |
480 | * the log. Hence we need to trim nbits back to the length of |
481 | * the current region being copied out of the log. |
482 | */ |
483 | if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) |
484 | nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; |
485 | |
486 | /* |
487 | * Do a sanity check if this is a dquot buffer. Just checking |
488 | * the first dquot in the buffer should do. XXXThis is |
489 | * probably a good thing to do for other buf types also. |
490 | */ |
491 | fa = NULL; |
492 | if (buf_f->blf_flags & |
493 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { |
494 | if (item->ri_buf[i].i_addr == NULL) { |
495 | xfs_alert(mp, |
496 | "XFS: NULL dquot in %s." , __func__); |
497 | goto next; |
498 | } |
499 | if (item->ri_buf[i].i_len < size_disk_dquot) { |
500 | xfs_alert(mp, |
501 | "XFS: dquot too small (%d) in %s." , |
502 | item->ri_buf[i].i_len, __func__); |
503 | goto next; |
504 | } |
505 | fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); |
506 | if (fa) { |
507 | xfs_alert(mp, |
508 | "dquot corrupt at %pS trying to replay into block 0x%llx" , |
509 | fa, xfs_buf_daddr(bp)); |
510 | goto next; |
511 | } |
512 | } |
513 | |
514 | memcpy(xfs_buf_offset(bp, |
515 | (uint)bit << XFS_BLF_SHIFT), /* dest */ |
516 | item->ri_buf[i].i_addr, /* source */ |
517 | nbits<<XFS_BLF_SHIFT); /* length */ |
518 | next: |
519 | i++; |
520 | bit += nbits; |
521 | } |
522 | |
523 | /* Shouldn't be any more regions */ |
524 | ASSERT(i == item->ri_total); |
525 | |
526 | xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); |
527 | } |
528 | |
529 | /* |
530 | * Perform a dquot buffer recovery. |
531 | * Simple algorithm: if we have found a QUOTAOFF log item of the same type |
532 | * (ie. USR or GRP), then just toss this buffer away; don't recover it. |
533 | * Else, treat it as a regular buffer and do recovery. |
534 | * |
535 | * Return false if the buffer was tossed and true if we recovered the buffer to |
536 | * indicate to the caller if the buffer needs writing. |
537 | */ |
538 | STATIC bool |
539 | xlog_recover_do_dquot_buffer( |
540 | struct xfs_mount *mp, |
541 | struct xlog *log, |
542 | struct xlog_recover_item *item, |
543 | struct xfs_buf *bp, |
544 | struct xfs_buf_log_format *buf_f) |
545 | { |
546 | uint type; |
547 | |
548 | trace_xfs_log_recover_buf_dquot_buf(log, buf_f); |
549 | |
550 | /* |
551 | * Filesystems are required to send in quota flags at mount time. |
552 | */ |
553 | if (!mp->m_qflags) |
554 | return false; |
555 | |
556 | type = 0; |
557 | if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) |
558 | type |= XFS_DQTYPE_USER; |
559 | if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) |
560 | type |= XFS_DQTYPE_PROJ; |
561 | if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) |
562 | type |= XFS_DQTYPE_GROUP; |
563 | /* |
564 | * This type of quotas was turned off, so ignore this buffer |
565 | */ |
566 | if (log->l_quotaoffs_flag & type) |
567 | return false; |
568 | |
569 | xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); |
570 | return true; |
571 | } |
572 | |
573 | /* |
574 | * Perform recovery for a buffer full of inodes. In these buffers, the only |
575 | * data which should be recovered is that which corresponds to the |
576 | * di_next_unlinked pointers in the on disk inode structures. The rest of the |
577 | * data for the inodes is always logged through the inodes themselves rather |
578 | * than the inode buffer and is recovered in xlog_recover_inode_pass2(). |
579 | * |
580 | * The only time when buffers full of inodes are fully recovered is when the |
581 | * buffer is full of newly allocated inodes. In this case the buffer will |
582 | * not be marked as an inode buffer and so will be sent to |
583 | * xlog_recover_do_reg_buffer() below during recovery. |
584 | */ |
585 | STATIC int |
586 | xlog_recover_do_inode_buffer( |
587 | struct xfs_mount *mp, |
588 | struct xlog_recover_item *item, |
589 | struct xfs_buf *bp, |
590 | struct xfs_buf_log_format *buf_f) |
591 | { |
592 | int i; |
593 | int item_index = 0; |
594 | int bit = 0; |
595 | int nbits = 0; |
596 | int reg_buf_offset = 0; |
597 | int reg_buf_bytes = 0; |
598 | int next_unlinked_offset; |
599 | int inodes_per_buf; |
600 | xfs_agino_t *logged_nextp; |
601 | xfs_agino_t *buffer_nextp; |
602 | |
603 | trace_xfs_log_recover_buf_inode_buf(log: mp->m_log, buf_f); |
604 | |
605 | /* |
606 | * Post recovery validation only works properly on CRC enabled |
607 | * filesystems. |
608 | */ |
609 | if (xfs_has_crc(mp)) |
610 | bp->b_ops = &xfs_inode_buf_ops; |
611 | |
612 | inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; |
613 | for (i = 0; i < inodes_per_buf; i++) { |
614 | next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + |
615 | offsetof(struct xfs_dinode, di_next_unlinked); |
616 | |
617 | while (next_unlinked_offset >= |
618 | (reg_buf_offset + reg_buf_bytes)) { |
619 | /* |
620 | * The next di_next_unlinked field is beyond |
621 | * the current logged region. Find the next |
622 | * logged region that contains or is beyond |
623 | * the current di_next_unlinked field. |
624 | */ |
625 | bit += nbits; |
626 | bit = xfs_next_bit(buf_f->blf_data_map, |
627 | buf_f->blf_map_size, bit); |
628 | |
629 | /* |
630 | * If there are no more logged regions in the |
631 | * buffer, then we're done. |
632 | */ |
633 | if (bit == -1) |
634 | return 0; |
635 | |
636 | nbits = xfs_contig_bits(buf_f->blf_data_map, |
637 | buf_f->blf_map_size, bit); |
638 | ASSERT(nbits > 0); |
639 | reg_buf_offset = bit << XFS_BLF_SHIFT; |
640 | reg_buf_bytes = nbits << XFS_BLF_SHIFT; |
641 | item_index++; |
642 | } |
643 | |
644 | /* |
645 | * If the current logged region starts after the current |
646 | * di_next_unlinked field, then move on to the next |
647 | * di_next_unlinked field. |
648 | */ |
649 | if (next_unlinked_offset < reg_buf_offset) |
650 | continue; |
651 | |
652 | ASSERT(item->ri_buf[item_index].i_addr != NULL); |
653 | ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); |
654 | ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); |
655 | |
656 | /* |
657 | * The current logged region contains a copy of the |
658 | * current di_next_unlinked field. Extract its value |
659 | * and copy it to the buffer copy. |
660 | */ |
661 | logged_nextp = item->ri_buf[item_index].i_addr + |
662 | next_unlinked_offset - reg_buf_offset; |
663 | if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { |
664 | xfs_alert(mp, |
665 | "Bad inode buffer log record (ptr = " PTR_FMT", bp = " PTR_FMT"). " |
666 | "Trying to replay bad (0) inode di_next_unlinked field." , |
667 | item, bp); |
668 | return -EFSCORRUPTED; |
669 | } |
670 | |
671 | buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); |
672 | *buffer_nextp = *logged_nextp; |
673 | |
674 | /* |
675 | * If necessary, recalculate the CRC in the on-disk inode. We |
676 | * have to leave the inode in a consistent state for whoever |
677 | * reads it next.... |
678 | */ |
679 | xfs_dinode_calc_crc(mp, |
680 | xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); |
681 | |
682 | } |
683 | |
684 | return 0; |
685 | } |
686 | |
687 | /* |
688 | * V5 filesystems know the age of the buffer on disk being recovered. We can |
689 | * have newer objects on disk than we are replaying, and so for these cases we |
690 | * don't want to replay the current change as that will make the buffer contents |
691 | * temporarily invalid on disk. |
692 | * |
693 | * The magic number might not match the buffer type we are going to recover |
694 | * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence |
695 | * extract the LSN of the existing object in the buffer based on it's current |
696 | * magic number. If we don't recognise the magic number in the buffer, then |
697 | * return a LSN of -1 so that the caller knows it was an unrecognised block and |
698 | * so can recover the buffer. |
699 | * |
700 | * Note: we cannot rely solely on magic number matches to determine that the |
701 | * buffer has a valid LSN - we also need to verify that it belongs to this |
702 | * filesystem, so we need to extract the object's LSN and compare it to that |
703 | * which we read from the superblock. If the UUIDs don't match, then we've got a |
704 | * stale metadata block from an old filesystem instance that we need to recover |
705 | * over the top of. |
706 | */ |
707 | static xfs_lsn_t |
708 | xlog_recover_get_buf_lsn( |
709 | struct xfs_mount *mp, |
710 | struct xfs_buf *bp, |
711 | struct xfs_buf_log_format *buf_f) |
712 | { |
713 | uint32_t magic32; |
714 | uint16_t magic16; |
715 | uint16_t magicda; |
716 | void *blk = bp->b_addr; |
717 | uuid_t *uuid; |
718 | xfs_lsn_t lsn = -1; |
719 | uint16_t blft; |
720 | |
721 | /* v4 filesystems always recover immediately */ |
722 | if (!xfs_has_crc(mp)) |
723 | goto recover_immediately; |
724 | |
725 | /* |
726 | * realtime bitmap and summary file blocks do not have magic numbers or |
727 | * UUIDs, so we must recover them immediately. |
728 | */ |
729 | blft = xfs_blft_from_flags(buf_f); |
730 | if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF) |
731 | goto recover_immediately; |
732 | |
733 | magic32 = be32_to_cpu(*(__be32 *)blk); |
734 | switch (magic32) { |
735 | case XFS_ABTB_CRC_MAGIC: |
736 | case XFS_ABTC_CRC_MAGIC: |
737 | case XFS_ABTB_MAGIC: |
738 | case XFS_ABTC_MAGIC: |
739 | case XFS_RMAP_CRC_MAGIC: |
740 | case XFS_REFC_CRC_MAGIC: |
741 | case XFS_FIBT_CRC_MAGIC: |
742 | case XFS_FIBT_MAGIC: |
743 | case XFS_IBT_CRC_MAGIC: |
744 | case XFS_IBT_MAGIC: { |
745 | struct xfs_btree_block *btb = blk; |
746 | |
747 | lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); |
748 | uuid = &btb->bb_u.s.bb_uuid; |
749 | break; |
750 | } |
751 | case XFS_BMAP_CRC_MAGIC: |
752 | case XFS_BMAP_MAGIC: { |
753 | struct xfs_btree_block *btb = blk; |
754 | |
755 | lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); |
756 | uuid = &btb->bb_u.l.bb_uuid; |
757 | break; |
758 | } |
759 | case XFS_AGF_MAGIC: |
760 | lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); |
761 | uuid = &((struct xfs_agf *)blk)->agf_uuid; |
762 | break; |
763 | case XFS_AGFL_MAGIC: |
764 | lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); |
765 | uuid = &((struct xfs_agfl *)blk)->agfl_uuid; |
766 | break; |
767 | case XFS_AGI_MAGIC: |
768 | lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); |
769 | uuid = &((struct xfs_agi *)blk)->agi_uuid; |
770 | break; |
771 | case XFS_SYMLINK_MAGIC: |
772 | lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); |
773 | uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; |
774 | break; |
775 | case XFS_DIR3_BLOCK_MAGIC: |
776 | case XFS_DIR3_DATA_MAGIC: |
777 | case XFS_DIR3_FREE_MAGIC: |
778 | lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); |
779 | uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; |
780 | break; |
781 | case XFS_ATTR3_RMT_MAGIC: |
782 | /* |
783 | * Remote attr blocks are written synchronously, rather than |
784 | * being logged. That means they do not contain a valid LSN |
785 | * (i.e. transactionally ordered) in them, and hence any time we |
786 | * see a buffer to replay over the top of a remote attribute |
787 | * block we should simply do so. |
788 | */ |
789 | goto recover_immediately; |
790 | case XFS_SB_MAGIC: |
791 | /* |
792 | * superblock uuids are magic. We may or may not have a |
793 | * sb_meta_uuid on disk, but it will be set in the in-core |
794 | * superblock. We set the uuid pointer for verification |
795 | * according to the superblock feature mask to ensure we check |
796 | * the relevant UUID in the superblock. |
797 | */ |
798 | lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); |
799 | if (xfs_has_metauuid(mp)) |
800 | uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; |
801 | else |
802 | uuid = &((struct xfs_dsb *)blk)->sb_uuid; |
803 | break; |
804 | default: |
805 | break; |
806 | } |
807 | |
808 | if (lsn != (xfs_lsn_t)-1) { |
809 | if (!uuid_equal(u1: &mp->m_sb.sb_meta_uuid, u2: uuid)) |
810 | goto recover_immediately; |
811 | return lsn; |
812 | } |
813 | |
814 | magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); |
815 | switch (magicda) { |
816 | case XFS_DIR3_LEAF1_MAGIC: |
817 | case XFS_DIR3_LEAFN_MAGIC: |
818 | case XFS_ATTR3_LEAF_MAGIC: |
819 | case XFS_DA3_NODE_MAGIC: |
820 | lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); |
821 | uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; |
822 | break; |
823 | default: |
824 | break; |
825 | } |
826 | |
827 | if (lsn != (xfs_lsn_t)-1) { |
828 | if (!uuid_equal(u1: &mp->m_sb.sb_meta_uuid, u2: uuid)) |
829 | goto recover_immediately; |
830 | return lsn; |
831 | } |
832 | |
833 | /* |
834 | * We do individual object checks on dquot and inode buffers as they |
835 | * have their own individual LSN records. Also, we could have a stale |
836 | * buffer here, so we have to at least recognise these buffer types. |
837 | * |
838 | * A notd complexity here is inode unlinked list processing - it logs |
839 | * the inode directly in the buffer, but we don't know which inodes have |
840 | * been modified, and there is no global buffer LSN. Hence we need to |
841 | * recover all inode buffer types immediately. This problem will be |
842 | * fixed by logical logging of the unlinked list modifications. |
843 | */ |
844 | magic16 = be16_to_cpu(*(__be16 *)blk); |
845 | switch (magic16) { |
846 | case XFS_DQUOT_MAGIC: |
847 | case XFS_DINODE_MAGIC: |
848 | goto recover_immediately; |
849 | default: |
850 | break; |
851 | } |
852 | |
853 | /* unknown buffer contents, recover immediately */ |
854 | |
855 | recover_immediately: |
856 | return (xfs_lsn_t)-1; |
857 | |
858 | } |
859 | |
860 | /* |
861 | * This routine replays a modification made to a buffer at runtime. |
862 | * There are actually two types of buffer, regular and inode, which |
863 | * are handled differently. Inode buffers are handled differently |
864 | * in that we only recover a specific set of data from them, namely |
865 | * the inode di_next_unlinked fields. This is because all other inode |
866 | * data is actually logged via inode records and any data we replay |
867 | * here which overlaps that may be stale. |
868 | * |
869 | * When meta-data buffers are freed at run time we log a buffer item |
870 | * with the XFS_BLF_CANCEL bit set to indicate that previous copies |
871 | * of the buffer in the log should not be replayed at recovery time. |
872 | * This is so that if the blocks covered by the buffer are reused for |
873 | * file data before we crash we don't end up replaying old, freed |
874 | * meta-data into a user's file. |
875 | * |
876 | * To handle the cancellation of buffer log items, we make two passes |
877 | * over the log during recovery. During the first we build a table of |
878 | * those buffers which have been cancelled, and during the second we |
879 | * only replay those buffers which do not have corresponding cancel |
880 | * records in the table. See xlog_recover_buf_pass[1,2] above |
881 | * for more details on the implementation of the table of cancel records. |
882 | */ |
883 | STATIC int |
884 | xlog_recover_buf_commit_pass2( |
885 | struct xlog *log, |
886 | struct list_head *buffer_list, |
887 | struct xlog_recover_item *item, |
888 | xfs_lsn_t current_lsn) |
889 | { |
890 | struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; |
891 | struct xfs_mount *mp = log->l_mp; |
892 | struct xfs_buf *bp; |
893 | int error; |
894 | uint buf_flags; |
895 | xfs_lsn_t lsn; |
896 | |
897 | /* |
898 | * In this pass we only want to recover all the buffers which have |
899 | * not been cancelled and are not cancellation buffers themselves. |
900 | */ |
901 | if (buf_f->blf_flags & XFS_BLF_CANCEL) { |
902 | if (xlog_put_buffer_cancelled(log, blkno: buf_f->blf_blkno, |
903 | len: buf_f->blf_len)) |
904 | goto cancelled; |
905 | } else { |
906 | |
907 | if (xlog_is_buffer_cancelled(log, blkno: buf_f->blf_blkno, |
908 | len: buf_f->blf_len)) |
909 | goto cancelled; |
910 | } |
911 | |
912 | trace_xfs_log_recover_buf_recover(log, buf_f); |
913 | |
914 | buf_flags = 0; |
915 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) |
916 | buf_flags |= XBF_UNMAPPED; |
917 | |
918 | error = xfs_buf_read(target: mp->m_ddev_targp, blkno: buf_f->blf_blkno, numblks: buf_f->blf_len, |
919 | flags: buf_flags, bpp: &bp, NULL); |
920 | if (error) |
921 | return error; |
922 | |
923 | /* |
924 | * Recover the buffer only if we get an LSN from it and it's less than |
925 | * the lsn of the transaction we are replaying. |
926 | * |
927 | * Note that we have to be extremely careful of readahead here. |
928 | * Readahead does not attach verfiers to the buffers so if we don't |
929 | * actually do any replay after readahead because of the LSN we found |
930 | * in the buffer if more recent than that current transaction then we |
931 | * need to attach the verifier directly. Failure to do so can lead to |
932 | * future recovery actions (e.g. EFI and unlinked list recovery) can |
933 | * operate on the buffers and they won't get the verifier attached. This |
934 | * can lead to blocks on disk having the correct content but a stale |
935 | * CRC. |
936 | * |
937 | * It is safe to assume these clean buffers are currently up to date. |
938 | * If the buffer is dirtied by a later transaction being replayed, then |
939 | * the verifier will be reset to match whatever recover turns that |
940 | * buffer into. |
941 | */ |
942 | lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f); |
943 | if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { |
944 | trace_xfs_log_recover_buf_skip(log, buf_f); |
945 | xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); |
946 | |
947 | /* |
948 | * We're skipping replay of this buffer log item due to the log |
949 | * item LSN being behind the ondisk buffer. Verify the buffer |
950 | * contents since we aren't going to run the write verifier. |
951 | */ |
952 | if (bp->b_ops) { |
953 | bp->b_ops->verify_read(bp); |
954 | error = bp->b_error; |
955 | } |
956 | goto out_release; |
957 | } |
958 | |
959 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { |
960 | error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); |
961 | if (error) |
962 | goto out_release; |
963 | } else if (buf_f->blf_flags & |
964 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { |
965 | bool dirty; |
966 | |
967 | dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); |
968 | if (!dirty) |
969 | goto out_release; |
970 | } else { |
971 | xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); |
972 | } |
973 | |
974 | /* |
975 | * Perform delayed write on the buffer. Asynchronous writes will be |
976 | * slower when taking into account all the buffers to be flushed. |
977 | * |
978 | * Also make sure that only inode buffers with good sizes stay in |
979 | * the buffer cache. The kernel moves inodes in buffers of 1 block |
980 | * or inode_cluster_size bytes, whichever is bigger. The inode |
981 | * buffers in the log can be a different size if the log was generated |
982 | * by an older kernel using unclustered inode buffers or a newer kernel |
983 | * running with a different inode cluster size. Regardless, if |
984 | * the inode buffer size isn't max(blocksize, inode_cluster_size) |
985 | * for *our* value of inode_cluster_size, then we need to keep |
986 | * the buffer out of the buffer cache so that the buffer won't |
987 | * overlap with future reads of those inodes. |
988 | */ |
989 | if (XFS_DINODE_MAGIC == |
990 | be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && |
991 | (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { |
992 | xfs_buf_stale(bp); |
993 | error = xfs_bwrite(bp); |
994 | } else { |
995 | ASSERT(bp->b_mount == mp); |
996 | bp->b_flags |= _XBF_LOGRECOVERY; |
997 | xfs_buf_delwri_queue(bp, buffer_list); |
998 | } |
999 | |
1000 | out_release: |
1001 | xfs_buf_relse(bp); |
1002 | return error; |
1003 | cancelled: |
1004 | trace_xfs_log_recover_buf_cancel(log, buf_f); |
1005 | return 0; |
1006 | } |
1007 | |
1008 | const struct xlog_recover_item_ops xlog_buf_item_ops = { |
1009 | .item_type = XFS_LI_BUF, |
1010 | .reorder = xlog_recover_buf_reorder, |
1011 | .ra_pass2 = xlog_recover_buf_ra_pass2, |
1012 | .commit_pass1 = xlog_recover_buf_commit_pass1, |
1013 | .commit_pass2 = xlog_recover_buf_commit_pass2, |
1014 | }; |
1015 | |
#ifdef DEBUG
/*
 * Debug-only sanity check: assert that every bucket of the buffer cancel
 * table is an empty list.
 */
void
xlog_check_buf_cancel_table(
	struct xlog	*log)
{
	int		i = 0;

	while (i < XLOG_BC_TABLE_SIZE) {
		ASSERT(list_empty(&log->l_buf_cancel_table[i]));
		i++;
	}
}
#endif
1027 | |
1028 | int |
1029 | xlog_alloc_buf_cancel_table( |
1030 | struct xlog *log) |
1031 | { |
1032 | void *p; |
1033 | int i; |
1034 | |
1035 | ASSERT(log->l_buf_cancel_table == NULL); |
1036 | |
1037 | p = kmalloc_array(XLOG_BC_TABLE_SIZE, size: sizeof(struct list_head), |
1038 | GFP_KERNEL); |
1039 | if (!p) |
1040 | return -ENOMEM; |
1041 | |
1042 | log->l_buf_cancel_table = p; |
1043 | for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) |
1044 | INIT_LIST_HEAD(list: &log->l_buf_cancel_table[i]); |
1045 | |
1046 | return 0; |
1047 | } |
1048 | |
1049 | void |
1050 | xlog_free_buf_cancel_table( |
1051 | struct xlog *log) |
1052 | { |
1053 | int i; |
1054 | |
1055 | if (!log->l_buf_cancel_table) |
1056 | return; |
1057 | |
1058 | for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { |
1059 | struct xfs_buf_cancel *bc; |
1060 | |
1061 | while ((bc = list_first_entry_or_null( |
1062 | &log->l_buf_cancel_table[i], |
1063 | struct xfs_buf_cancel, bc_list))) { |
1064 | list_del(entry: &bc->bc_list); |
1065 | kfree(objp: bc); |
1066 | } |
1067 | } |
1068 | |
1069 | kfree(objp: log->l_buf_cancel_table); |
1070 | log->l_buf_cancel_table = NULL; |
1071 | } |
1072 | |