// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <linux/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "uuid-tree.h"
#include "relocation.h"
#include "scrub.h"
#include "super.h"

#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)

static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);

static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
	if (fs_info->csum_shash)
		crypto_free_shash(fs_info->csum_shash);
}

/*
 * Compute the csum of a btree block and store the result to provided buffer.
 */
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	int num_pages;
	u32 first_page_part;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	char *kaddr;
	int i;

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);

	if (buf->addr) {
		/* Pages are contiguous, handle them as a big one. */
		kaddr = buf->addr;
		first_page_part = fs_info->nodesize;
		num_pages = 1;
	} else {
		kaddr = folio_address(buf->folios[0]);
		first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
		num_pages = num_extent_pages(buf);
	}

	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
			    first_page_part - BTRFS_CSUM_SIZE);

	/*
	 * Only the case of multiple single-page folios reaches here.
	 *
	 * The nodesize <= PAGE_SIZE and large folio cases have already been
	 * fully handled by the crypto_shash_update() above.
	 */
	for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
		kaddr = folio_address(buf->folios[i]);
		crypto_shash_update(shash, kaddr, PAGE_SIZE);
	}
	memset(result, 0, BTRFS_CSUM_SIZE);
	crypto_shash_final(shash, result);
}

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
{
	if (!extent_buffer_uptodate(eb))
		return 0;

	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 1;

	if (atomic)
		return -EAGAIN;

	if (!extent_buffer_uptodate(eb) ||
	    btrfs_header_generation(eb) != parent_transid) {
		btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
			     eb->start, eb->read_mirror,
			     parent_transid, btrfs_header_generation(eb));
		clear_extent_buffer_uptodate(eb);
		return 0;
	}
	return 1;
}

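/* Return true if @csum_type is a checksum algorithm supported for the super block. */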
static bool btrfs_supported_super_csum(u16 csum_type)
{
	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
	case BTRFS_CSUM_TYPE_XXHASH:
	case BTRFS_CSUM_TYPE_SHA256:
	case BTRFS_CSUM_TYPE_BLAKE2:
		return true;
	default:
		return false;
	}
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
			   const struct btrfs_super_block *disk_sb)
{
	char result[BTRFS_CSUM_SIZE];
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	shash->tfm = fs_info->csum_shash;

	/*
	 * The super_block structure does not span the whole
	 * BTRFS_SUPER_INFO_SIZE range; the unused space is expected to be
	 * filled with zeros and is included in the checksum.
	 */
	crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

	if (memcmp(disk_sb->csum, result, fs_info->csum_size))
		return 1;

	return 0;
}

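/*
 * Rewrite the content of @eb to the given mirror to repair a bad copy,
 * folio by folio. Not possible on a read-only filesystem.
 */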
static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
				      int mirror_num)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int ret = 0;

	if (sb_rdonly(fs_info->sb))
		return -EROFS;

	for (int i = 0; i < num_extent_folios(eb); i++) {
		struct folio *folio = eb->folios[i];
		u64 start = max_t(u64, eb->start, folio_pos(folio));
		u64 end = min_t(u64, eb->start + eb->len,
				folio_pos(folio) + eb->folio_size);
		u32 len = end - start;
		phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) +
				    offset_in_folio(folio, start);

		ret = btrfs_repair_io_failure(fs_info, 0, start, len, start,
					      paddr, mirror_num);
		if (ret)
			break;
	}

	return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @check:	expected tree parentness check, see the comments of the
 *		structure for details.
 */
int btrfs_read_extent_buffer(struct extent_buffer *eb,
			     const struct btrfs_tree_parent_check *check)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int failed = 0;
	int ret;
	int num_copies = 0;
	int mirror_num = 0;
	int failed_mirror = 0;

	ASSERT(check);

	while (1) {
		ret = read_extent_buffer_pages(eb, mirror_num, check);
		if (!ret)
			break;

		num_copies = btrfs_num_copies(fs_info,
					      eb->start, eb->len);
		if (num_copies == 1)
			break;

		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

		mirror_num++;
		if (mirror_num == failed_mirror)
			mirror_num++;

		if (mirror_num > num_copies)
			break;
	}

	if (failed && !ret && failed_mirror)
		btrfs_repair_eb_io_failure(eb, failed_mirror);

	return ret;
}

/*
 * Checksum a dirty tree block before IO.
 */
int btree_csum_one_bio(struct btrfs_bio *bbio)
{
	struct extent_buffer *eb = bbio->private;
	struct btrfs_fs_info *fs_info = eb->fs_info;
	u64 found_start = btrfs_header_bytenr(eb);
	u64 last_trans;
	u8 result[BTRFS_CSUM_SIZE];
	int ret;

	/* Btree blocks are always contiguous on disk. */
	if (WARN_ON_ONCE(bbio->file_offset != eb->start))
		return -EIO;
	if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
		return -EIO;

	/*
	 * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
	 * checksum it but zero-out its content. This is done to preserve
	 * ordering of I/O without unnecessarily writing out data.
	 */
	if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
		memzero_extent_buffer(eb, 0, eb->len);
		return 0;
	}

	if (WARN_ON_ONCE(found_start != eb->start))
		return -EIO;
	if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
		return -EIO;

	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
				    offsetof(struct btrfs_header, fsid),
				    BTRFS_FSID_SIZE) == 0);
	csum_tree_block(eb, result);

	if (btrfs_header_level(eb))
		ret = btrfs_check_node(eb);
	else
		ret = btrfs_check_leaf(eb);

	if (ret < 0)
		goto error;

	/*
	 * Also check the generation: any eb reaching here must be newer than
	 * the last committed transaction, otherwise something is seriously
	 * wrong.
	 */
	last_trans = btrfs_get_last_trans_committed(fs_info);
	if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
			  "block=%llu bad generation, have %llu expect > %llu",
			  eb->start, btrfs_header_generation(eb), last_trans);
		goto error;
	}
	write_extent_buffer(eb, result, 0, fs_info->csum_size);
	return 0;

error:
	btrfs_print_tree(eb, 0);
	btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
		  eb->start);
	/*
	 * Be noisy if this is an extent buffer from a log tree. We don't abort
	 * a transaction in case there's a bad log tree extent buffer, we just
	 * fall back to a transaction commit. Still we want to know when there
	 * is a bad log tree extent buffer, as that may signal a bug somewhere.
	 */
	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
		btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
	return ret;
}

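/*
 * Return true if the fsid in the tree block header matches neither this
 * filesystem's metadata_uuid nor any of its seed devices, i.e. the block
 * does not belong to this filesystem.
 */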
static bool check_tree_block_fsid(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	u8 fsid[BTRFS_FSID_SIZE];

	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
			   BTRFS_FSID_SIZE);

	/*
	 * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
	 * This is then overwritten by metadata_uuid if it is present in the
	 * device_list_add(). The same is true for a seed device as well. So
	 * use of fs_devices::metadata_uuid is appropriate here.
	 */
	if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
		return false;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
		if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
			return false;

	return true;
}

/* Do basic extent buffer checks at read time */
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
				 const struct btrfs_tree_parent_check *check)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	u64 found_start;
	const u32 csum_size = fs_info->csum_size;
	u8 found_level;
	u8 result[BTRFS_CSUM_SIZE];
	const u8 *header_csum;
	int ret = 0;
	const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS);

	ASSERT(check);

	found_start = btrfs_header_bytenr(eb);
	if (found_start != eb->start) {
		btrfs_err_rl(fs_info,
			     "bad tree block start, mirror %u want %llu have %llu",
			     eb->read_mirror, eb->start, found_start);
		ret = -EIO;
		goto out;
	}
	if (check_tree_block_fsid(eb)) {
		btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
			     eb->start, eb->read_mirror);
		ret = -EIO;
		goto out;
	}
	found_level = btrfs_header_level(eb);
	if (found_level >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info,
			  "bad tree block level, mirror %u level %d on logical %llu",
			  eb->read_mirror, btrfs_header_level(eb), eb->start);
		ret = -EIO;
		goto out;
	}

	csum_tree_block(eb, result);
	header_csum = folio_address(eb->folios[0]) +
		get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));

	if (memcmp(result, header_csum, csum_size) != 0) {
		btrfs_warn_rl(fs_info,
"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s",
			      eb->start, eb->read_mirror,
			      CSUM_FMT_VALUE(csum_size, header_csum),
			      CSUM_FMT_VALUE(csum_size, result),
			      btrfs_header_level(eb),
			      ignore_csum ? ", ignored" : "");
		if (!ignore_csum) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	if (found_level != check->level) {
		btrfs_err(fs_info,
			  "level verify failed on logical %llu mirror %u wanted %u found %u",
			  eb->start, eb->read_mirror, check->level, found_level);
		ret = -EIO;
		goto out;
	}
	if (unlikely(check->transid &&
		     btrfs_header_generation(eb) != check->transid)) {
		btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
			     eb->start, eb->read_mirror, check->transid,
			     btrfs_header_generation(eb));
		ret = -EIO;
		goto out;
	}
	if (check->has_first_key) {
		const struct btrfs_key *expect_key = &check->first_key;
		struct btrfs_key found_key;

		if (found_level)
			btrfs_node_key_to_cpu(eb, &found_key, 0);
		else
			btrfs_item_key_to_cpu(eb, &found_key, 0);
		if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
			btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
				  eb->start, check->transid,
				  expect_key->objectid,
				  expect_key->type, expect_key->offset,
				  found_key.objectid, found_key.type,
				  found_key.offset);
			ret = -EUCLEAN;
			goto out;
		}
	}
	if (check->owner_root) {
		ret = btrfs_check_eb_owner(eb, check->owner_root);
		if (ret < 0)
			goto out;
	}

	/* If this is a leaf block and it is corrupt, just return -EIO. */
	if (found_level == 0 && btrfs_check_leaf(eb))
		ret = -EIO;

	if (found_level > 0 && btrfs_check_node(eb))
		ret = -EIO;

	if (ret)
		btrfs_err(fs_info,
			  "read time tree block corruption detected on logical %llu mirror %u",
			  eb->start, eb->read_mirror);
out:
	return ret;
}

#ifdef CONFIG_MIGRATION
static int btree_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	/*
	 * We can't safely write a btree page from here; we haven't done the
	 * locking hook.
	 */
	if (folio_test_dirty(src))
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (folio_get_private(src) &&
	    !filemap_release_folio(src, GFP_KERNEL))
		return -EAGAIN;
	return migrate_folio(mapping, dst, src, mode);
}
#else
#define btree_migrate_folio NULL
#endif

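/*
 * Write back dirty btree folios. For background (non-sync) writeback, skip
 * the work until enough dirty metadata has accumulated.
 */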
static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	int ret;

	if (wbc->sync_mode == WB_SYNC_NONE) {
		struct btrfs_fs_info *fs_info;

		if (wbc->for_kupdate)
			return 0;

		fs_info = inode_to_fs_info(mapping->host);
		/* this is a bit racy, but that's ok */
		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					       BTRFS_DIRTY_METADATA_THRESH,
					       fs_info->dirty_metadata_batch);
		if (ret < 0)
			return 0;
	}
	return btree_write_cache_pages(mapping, wbc);
}

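/* Let the VM release a btree folio only if its extent buffer is no longer in use. */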
static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return false;

	return try_release_extent_buffer(folio);
}

static void btree_invalidate_folio(struct folio *folio, size_t offset,
				   size_t length)
{
	struct extent_io_tree *tree;

	tree = &folio_to_inode(folio)->io_tree;
	extent_invalidate_folio(tree, folio, offset);
	btree_release_folio(folio, GFP_NOFS);
	if (folio_get_private(folio)) {
		btrfs_warn(folio_to_fs_info(folio),
			   "folio private not zero on folio %llu",
			   (unsigned long long)folio_pos(folio));
		folio_detach_private(folio);
	}
}

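/*
 * Debug-only dirty_folio hook: assert that every extent buffer backed by this
 * folio is dirty, referenced and write locked before marking the folio dirty.
 */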
#ifdef DEBUG
static bool btree_dirty_folio(struct address_space *mapping,
			      struct folio *folio)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
	struct btrfs_subpage_info *spi = fs_info->subpage_info;
	struct btrfs_subpage *subpage;
	struct extent_buffer *eb;
	int cur_bit = 0;
	u64 page_start = folio_pos(folio);

	if (fs_info->sectorsize == PAGE_SIZE) {
		eb = folio_get_private(folio);
		BUG_ON(!eb);
		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		BUG_ON(!atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		return filemap_dirty_folio(mapping, folio);
	}

	ASSERT(spi);
	subpage = folio_get_private(folio);

	for (cur_bit = spi->dirty_offset;
	     cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
	     cur_bit++) {
		unsigned long flags;
		u64 cur;

		spin_lock_irqsave(&subpage->lock, flags);
		if (!test_bit(cur_bit, subpage->bitmaps)) {
			spin_unlock_irqrestore(&subpage->lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&subpage->lock, flags);
		cur = page_start + cur_bit * fs_info->sectorsize;

		eb = find_extent_buffer(fs_info, cur);
		ASSERT(eb);
		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		ASSERT(atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		free_extent_buffer(eb);

		cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
	}
	return filemap_dirty_folio(mapping, folio);
}
#else
#define btree_dirty_folio filemap_dirty_folio
#endif

static const struct address_space_operations btree_aops = {
	.writepages	= btree_writepages,
	.release_folio	= btree_release_folio,
	.invalidate_folio = btree_invalidate_folio,
	.migrate_folio	= btree_migrate_folio,
	.dirty_folio	= btree_dirty_folio,
};

struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
						u64 bytenr, u64 owner_root,
						int level)
{
	if (btrfs_is_testing(fs_info))
		return alloc_test_extent_buffer(fs_info, bytenr);
	return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}

/*
 * Read tree block at logical address @bytenr and do basic but critical
 * verification.
 *
 * @check:	expected tree parentness check, see comments of the
 *		structure for details.
 */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
				      struct btrfs_tree_parent_check *check)
{
	struct extent_buffer *buf = NULL;
	int ret;

	ASSERT(check);

	buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
					   check->level);
	if (IS_ERR(buf))
		return buf;

	ret = btrfs_read_extent_buffer(buf, check);
	if (ret) {
		free_extent_buffer_stale(buf);
		return ERR_PTR(ret);
	}
	return buf;
}

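/*
 * Allocate and initialize an in-memory root structure. No tree blocks are
 * read or allocated here, the caller sets up ->node and ->commit_root.
 */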
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
					   u64 objectid, gfp_t flags)
{
	struct btrfs_root *root;
	bool dummy = btrfs_is_testing(fs_info);

	root = kzalloc(sizeof(*root), flags);
	if (!root)
		return NULL;

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	root->fs_info = fs_info;
	root->root_key.objectid = objectid;
	root->node = NULL;
	root->commit_root = NULL;
	root->state = 0;
	RB_CLEAR_NODE(&root->rb_node);

	btrfs_set_root_last_trans(root, 0);
	root->free_objectid = 0;
	root->nr_delalloc_inodes = 0;
	root->nr_ordered_extents = 0;
	xa_init(&root->inodes);
	xa_init(&root->delayed_nodes);

	btrfs_init_root_block_rsv(root);

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->root_list);
	INIT_LIST_HEAD(&root->delalloc_inodes);
	INIT_LIST_HEAD(&root->delalloc_root);
	INIT_LIST_HEAD(&root->ordered_extents);
	INIT_LIST_HEAD(&root->ordered_root);
	INIT_LIST_HEAD(&root->reloc_dirty_list);
	spin_lock_init(&root->delalloc_lock);
	spin_lock_init(&root->ordered_extent_lock);
	spin_lock_init(&root->accounting_lock);
	spin_lock_init(&root->qgroup_meta_rsv_lock);
	mutex_init(&root->objectid_mutex);
	mutex_init(&root->log_mutex);
	mutex_init(&root->ordered_extent_mutex);
	mutex_init(&root->delalloc_mutex);
	init_waitqueue_head(&root->qgroup_flush_wait);
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
	INIT_LIST_HEAD(&root->log_ctxs[0]);
	INIT_LIST_HEAD(&root->log_ctxs[1]);
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
	atomic_set(&root->log_batch, 0);
	refcount_set(&root->refs, 1);
	atomic_set(&root->snapshot_force_cow, 0);
	atomic_set(&root->nr_swapfiles, 0);
	btrfs_set_root_log_transid(root, 0);
	root->log_transid_committed = -1;
	btrfs_set_root_last_log_commit(root, 0);
	root->anon_dev = 0;
	if (!dummy) {
		btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
					  IO_TREE_ROOT_DIRTY_LOG_PAGES);
		btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
					  IO_TREE_LOG_CSUM_RANGE);
	}

	spin_lock_init(&root->root_item_lock);
	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&root->leak_list);
	spin_lock(&fs_info->fs_roots_radix_lock);
	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
	spin_unlock(&fs_info->fs_roots_radix_lock);
#endif

	return root;
}

718#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
719/* Should only be used by the testing infrastructure */
720struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_root *root;
723
724 if (!fs_info)
725 return ERR_PTR(error: -EINVAL);
726
727 root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
728 if (!root)
729 return ERR_PTR(error: -ENOMEM);
730
731 /* We don't use the stripesize in selftest, set it as sectorsize */
732 root->alloc_bytenr = 0;
733
734 return root;
735}
736#endif
737
738static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
739{
740 const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
741 const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
742
743 return btrfs_comp_cpu_keys(k1: &a->root_key, k2: &b->root_key);
744}
745
746static int global_root_key_cmp(const void *k, const struct rb_node *node)
747{
748 const struct btrfs_key *key = k;
749 const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
750
751 return btrfs_comp_cpu_keys(k1: key, k2: &root->root_key);
752}
753
754int btrfs_global_root_insert(struct btrfs_root *root)
755{
756 struct btrfs_fs_info *fs_info = root->fs_info;
757 struct rb_node *tmp;
758 int ret = 0;
759
760 write_lock(&fs_info->global_root_lock);
761 tmp = rb_find_add(node: &root->rb_node, tree: &fs_info->global_root_tree, cmp: global_root_cmp);
762 write_unlock(&fs_info->global_root_lock);
763
764 if (tmp) {
765 ret = -EEXIST;
766 btrfs_warn(fs_info, "global root %llu %llu already exists",
767 btrfs_root_id(root), root->root_key.offset);
768 }
769 return ret;
770}
771
772void btrfs_global_root_delete(struct btrfs_root *root)
773{
774 struct btrfs_fs_info *fs_info = root->fs_info;
775
776 write_lock(&fs_info->global_root_lock);
777 rb_erase(&root->rb_node, &fs_info->global_root_tree);
778 write_unlock(&fs_info->global_root_lock);
779}
780
781struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
782 struct btrfs_key *key)
783{
784 struct rb_node *node;
785 struct btrfs_root *root = NULL;
786
787 read_lock(&fs_info->global_root_lock);
788 node = rb_find(key, tree: &fs_info->global_root_tree, cmp: global_root_key_cmp);
789 if (node)
790 root = container_of(node, struct btrfs_root, rb_node);
791 read_unlock(&fs_info->global_root_lock);
792
793 return root;
794}
795
796static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
797{
798 struct btrfs_block_group *block_group;
799 u64 ret;
800
801 if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
802 return 0;
803
804 if (bytenr)
805 block_group = btrfs_lookup_block_group(info: fs_info, bytenr);
806 else
807 block_group = btrfs_lookup_first_block_group(info: fs_info, bytenr);
808 ASSERT(block_group);
809 if (!block_group)
810 return 0;
811 ret = block_group->global_root_id;
812 btrfs_put_block_group(cache: block_group);
813
814 return ret;
815}
816
817struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
818{
819 struct btrfs_key key = {
820 .objectid = BTRFS_CSUM_TREE_OBJECTID,
821 .type = BTRFS_ROOT_ITEM_KEY,
822 .offset = btrfs_global_root_id(fs_info, bytenr),
823 };
824
825 return btrfs_global_root(fs_info, key: &key);
826}
827
828struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
829{
830 struct btrfs_key key = {
831 .objectid = BTRFS_EXTENT_TREE_OBJECTID,
832 .type = BTRFS_ROOT_ITEM_KEY,
833 .offset = btrfs_global_root_id(fs_info, bytenr),
834 };
835
836 return btrfs_global_root(fs_info, key: &key);
837}
838
839struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
840 u64 objectid)
841{
842 struct btrfs_fs_info *fs_info = trans->fs_info;
843 struct extent_buffer *leaf;
844 struct btrfs_root *tree_root = fs_info->tree_root;
845 struct btrfs_root *root;
846 struct btrfs_key key;
847 unsigned int nofs_flag;
848 int ret = 0;
849
850 /*
851 * We're holding a transaction handle, so use a NOFS memory allocation
852 * context to avoid deadlock if reclaim happens.
853 */
854 nofs_flag = memalloc_nofs_save();
855 root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
856 memalloc_nofs_restore(flags: nofs_flag);
857 if (!root)
858 return ERR_PTR(error: -ENOMEM);
859
860 root->root_key.objectid = objectid;
861 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
862 root->root_key.offset = 0;
863
864 leaf = btrfs_alloc_tree_block(trans, root, parent: 0, root_objectid: objectid, NULL, level: 0, hint: 0, empty_size: 0,
865 reloc_src_root: 0, nest: BTRFS_NESTING_NORMAL);
866 if (IS_ERR(ptr: leaf)) {
867 ret = PTR_ERR(ptr: leaf);
868 leaf = NULL;
869 goto fail;
870 }
871
872 root->node = leaf;
873 btrfs_mark_buffer_dirty(trans, buf: leaf);
874
875 root->commit_root = btrfs_root_node(root);
876 set_bit(nr: BTRFS_ROOT_TRACK_DIRTY, addr: &root->state);
877
878 btrfs_set_root_flags(s: &root->root_item, val: 0);
879 btrfs_set_root_limit(s: &root->root_item, val: 0);
880 btrfs_set_root_bytenr(s: &root->root_item, val: leaf->start);
881 btrfs_set_root_generation(s: &root->root_item, val: trans->transid);
882 btrfs_set_root_level(s: &root->root_item, val: 0);
883 btrfs_set_root_refs(s: &root->root_item, val: 1);
884 btrfs_set_root_used(s: &root->root_item, val: leaf->len);
885 btrfs_set_root_last_snapshot(s: &root->root_item, val: 0);
886 btrfs_set_root_dirid(s: &root->root_item, val: 0);
887 if (is_fstree(rootid: objectid))
888 generate_random_guid(guid: root->root_item.uuid);
889 else
890 export_guid(dst: root->root_item.uuid, src: &guid_null);
891 btrfs_set_root_drop_level(s: &root->root_item, val: 0);
892
893 btrfs_tree_unlock(eb: leaf);
894
895 key.objectid = objectid;
896 key.type = BTRFS_ROOT_ITEM_KEY;
897 key.offset = 0;
898 ret = btrfs_insert_root(trans, root: tree_root, key: &key, item: &root->root_item);
899 if (ret)
900 goto fail;
901
902 return root;
903
904fail:
905 btrfs_put_root(root);
906
907 return ERR_PTR(error: ret);
908}
909
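/*
 * Allocate an in-memory root for a log tree. Its key uses
 * BTRFS_TREE_LOG_OBJECTID for both objectid and offset; the tree node itself
 * is allocated separately by btrfs_alloc_log_tree_node().
 */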
910static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info)
911{
912 struct btrfs_root *root;
913
914 root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
915 if (!root)
916 return ERR_PTR(error: -ENOMEM);
917
918 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
919 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
920 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
921
922 return root;
923}
924
925int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
926 struct btrfs_root *root)
927{
928 struct extent_buffer *leaf;
929
930 /*
931 * DON'T set SHAREABLE bit for log trees.
932 *
933 * Log trees are not exposed to user space thus can't be snapshotted,
934 * and they go away before a real commit is actually done.
935 *
936 * They do store pointers to file data extents, and those reference
937 * counts still get updated (along with back refs to the log tree).
938 */
939
940 leaf = btrfs_alloc_tree_block(trans, root, parent: 0, BTRFS_TREE_LOG_OBJECTID,
941 NULL, level: 0, hint: 0, empty_size: 0, reloc_src_root: 0, nest: BTRFS_NESTING_NORMAL);
942 if (IS_ERR(ptr: leaf))
943 return PTR_ERR(ptr: leaf);
944
945 root->node = leaf;
946
947 btrfs_mark_buffer_dirty(trans, buf: root->node);
948 btrfs_tree_unlock(eb: root->node);
949
950 return 0;
951}
952
953int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
954 struct btrfs_fs_info *fs_info)
955{
956 struct btrfs_root *log_root;
957
958 log_root = alloc_log_tree(fs_info);
959 if (IS_ERR(ptr: log_root))
960 return PTR_ERR(ptr: log_root);
961
962 if (!btrfs_is_zoned(fs_info)) {
963 int ret = btrfs_alloc_log_tree_node(trans, root: log_root);
964
965 if (ret) {
966 btrfs_put_root(root: log_root);
967 return ret;
968 }
969 }
970
971 WARN_ON(fs_info->log_root_tree);
972 fs_info->log_root_tree = log_root;
973 return 0;
974}
975
976int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
977 struct btrfs_root *root)
978{
979 struct btrfs_fs_info *fs_info = root->fs_info;
980 struct btrfs_root *log_root;
981 struct btrfs_inode_item *inode_item;
982 int ret;
983
984 log_root = alloc_log_tree(fs_info);
985 if (IS_ERR(ptr: log_root))
986 return PTR_ERR(ptr: log_root);
987
988 ret = btrfs_alloc_log_tree_node(trans, root: log_root);
989 if (ret) {
990 btrfs_put_root(root: log_root);
991 return ret;
992 }
993
994 btrfs_set_root_last_trans(root: log_root, transid: trans->transid);
995 log_root->root_key.offset = btrfs_root_id(root);
996
997 inode_item = &log_root->root_item.inode;
998 btrfs_set_stack_inode_generation(s: inode_item, val: 1);
999 btrfs_set_stack_inode_size(s: inode_item, val: 3);
1000 btrfs_set_stack_inode_nlink(s: inode_item, val: 1);
1001 btrfs_set_stack_inode_nbytes(s: inode_item,
1002 val: fs_info->nodesize);
1003 btrfs_set_stack_inode_mode(s: inode_item, S_IFDIR | 0755);
1004
1005 btrfs_set_root_node(item: &log_root->root_item, node: log_root->node);
1006
1007 WARN_ON(root->log_root);
1008 root->log_root = log_root;
1009 btrfs_set_root_log_transid(root, log_transid: 0);
1010 root->log_transid_committed = -1;
1011 btrfs_set_root_last_log_commit(root, commit_id: 0);
1012 return 0;
1013}
1014
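/*
 * Read a root from disk: look up its root item in @tree_root, then read and
 * verify the tree block it points to. Returns an ERR_PTR on failure.
 */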
1015static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
1016 struct btrfs_path *path,
1017 const struct btrfs_key *key)
1018{
1019 struct btrfs_root *root;
1020 struct btrfs_tree_parent_check check = { 0 };
1021 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1022 u64 generation;
1023 int ret;
1024 int level;
1025
1026 root = btrfs_alloc_root(fs_info, objectid: key->objectid, GFP_NOFS);
1027 if (!root)
1028 return ERR_PTR(error: -ENOMEM);
1029
1030 ret = btrfs_find_root(root: tree_root, search_key: key, path,
1031 root_item: &root->root_item, root_key: &root->root_key);
1032 if (ret) {
1033 if (ret > 0)
1034 ret = -ENOENT;
1035 goto fail;
1036 }
1037
1038 generation = btrfs_root_generation(s: &root->root_item);
1039 level = btrfs_root_level(s: &root->root_item);
1040 check.level = level;
1041 check.transid = generation;
1042 check.owner_root = key->objectid;
1043 root->node = read_tree_block(fs_info, bytenr: btrfs_root_bytenr(s: &root->root_item),
1044 check: &check);
1045 if (IS_ERR(ptr: root->node)) {
1046 ret = PTR_ERR(ptr: root->node);
1047 root->node = NULL;
1048 goto fail;
1049 }
1050 if (!btrfs_buffer_uptodate(eb: root->node, parent_transid: generation, atomic: 0)) {
1051 ret = -EIO;
1052 goto fail;
1053 }
1054
1055 /*
1056 * For real fs, and not log/reloc trees, root owner must
1057 * match its root node owner
1058 */
1059 if (!btrfs_is_testing(fs_info) &&
1060 btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
1061 btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
1062 btrfs_root_id(root) != btrfs_header_owner(eb: root->node)) {
1063 btrfs_crit(fs_info,
1064"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
1065 btrfs_root_id(root), root->node->start,
1066 btrfs_header_owner(root->node),
1067 btrfs_root_id(root));
1068 ret = -EUCLEAN;
1069 goto fail;
1070 }
1071 root->commit_root = btrfs_root_node(root);
1072 return root;
1073fail:
1074 btrfs_put_root(root);
1075 return ERR_PTR(error: ret);
1076}
1077
1078struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1079 const struct btrfs_key *key)
1080{
1081 struct btrfs_root *root;
1082 BTRFS_PATH_AUTO_FREE(path);
1083
1084 path = btrfs_alloc_path();
1085 if (!path)
1086 return ERR_PTR(error: -ENOMEM);
1087 root = read_tree_root_path(tree_root, path, key);
1088
1089 return root;
1090}
1091
/*
 * Initialize subvolume root in-memory structure.
 *
 * @anon_dev:	anonymous device to attach to the root, if zero, allocate a new one
 *
 * In case of failure the caller is responsible for calling btrfs_free_fs_root().
 */
1099static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1100{
1101 int ret;
1102
1103 btrfs_drew_lock_init(lock: &root->snapshot_lock);
1104
1105 if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
1106 !btrfs_is_data_reloc_root(root) &&
1107 is_fstree(rootid: btrfs_root_id(root))) {
1108 set_bit(nr: BTRFS_ROOT_SHAREABLE, addr: &root->state);
1109 btrfs_check_and_init_root_item(item: &root->root_item);
1110 }
1111
1112 /*
1113 * Don't assign anonymous block device to roots that are not exposed to
1114 * userspace, the id pool is limited to 1M
1115 */
1116 if (is_fstree(rootid: btrfs_root_id(root)) &&
1117 btrfs_root_refs(s: &root->root_item) > 0) {
1118 if (!anon_dev) {
1119 ret = get_anon_bdev(&root->anon_dev);
1120 if (ret)
1121 return ret;
1122 } else {
1123 root->anon_dev = anon_dev;
1124 }
1125 }
1126
1127 mutex_lock(&root->objectid_mutex);
1128 ret = btrfs_init_root_free_objectid(root);
1129 if (ret) {
1130 mutex_unlock(lock: &root->objectid_mutex);
1131 return ret;
1132 }
1133
1134 ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1135
1136 mutex_unlock(lock: &root->objectid_mutex);
1137
1138 return 0;
1139}
1140
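/* Look up a cached root in the fs_roots radix tree and grab a reference. */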
1141static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1142 u64 root_id)
1143{
1144 struct btrfs_root *root;
1145
1146 spin_lock(lock: &fs_info->fs_roots_radix_lock);
1147 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1148 (unsigned long)root_id);
1149 root = btrfs_grab_root(root);
1150 spin_unlock(lock: &fs_info->fs_roots_radix_lock);
1151 return root;
1152}
1153
1154static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
1155 u64 objectid)
1156{
1157 struct btrfs_key key = {
1158 .objectid = objectid,
1159 .type = BTRFS_ROOT_ITEM_KEY,
1160 .offset = 0,
1161 };
1162
1163 switch (objectid) {
1164 case BTRFS_ROOT_TREE_OBJECTID:
1165 return btrfs_grab_root(root: fs_info->tree_root);
1166 case BTRFS_EXTENT_TREE_OBJECTID:
1167 return btrfs_grab_root(root: btrfs_global_root(fs_info, key: &key));
1168 case BTRFS_CHUNK_TREE_OBJECTID:
1169 return btrfs_grab_root(root: fs_info->chunk_root);
1170 case BTRFS_DEV_TREE_OBJECTID:
1171 return btrfs_grab_root(root: fs_info->dev_root);
1172 case BTRFS_CSUM_TREE_OBJECTID:
1173 return btrfs_grab_root(root: btrfs_global_root(fs_info, key: &key));
1174 case BTRFS_QUOTA_TREE_OBJECTID:
1175 return btrfs_grab_root(root: fs_info->quota_root);
1176 case BTRFS_UUID_TREE_OBJECTID:
1177 return btrfs_grab_root(root: fs_info->uuid_root);
1178 case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
1179 return btrfs_grab_root(root: fs_info->block_group_root);
1180 case BTRFS_FREE_SPACE_TREE_OBJECTID:
1181 return btrfs_grab_root(root: btrfs_global_root(fs_info, key: &key));
1182 case BTRFS_RAID_STRIPE_TREE_OBJECTID:
1183 return btrfs_grab_root(root: fs_info->stripe_root);
1184 default:
1185 return NULL;
1186 }
1187}
1188
1189int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1190 struct btrfs_root *root)
1191{
1192 int ret;
1193
1194 ret = radix_tree_preload(GFP_NOFS);
1195 if (ret)
1196 return ret;
1197
1198 spin_lock(lock: &fs_info->fs_roots_radix_lock);
1199 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1200 index: (unsigned long)btrfs_root_id(root),
1201 root);
1202 if (ret == 0) {
1203 btrfs_grab_root(root);
1204 set_bit(nr: BTRFS_ROOT_IN_RADIX, addr: &root->state);
1205 }
1206 spin_unlock(lock: &fs_info->fs_roots_radix_lock);
1207 radix_tree_preload_end();
1208
1209 return ret;
1210}
1211
1212void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info)
1213{
1214#ifdef CONFIG_BTRFS_DEBUG
1215 struct btrfs_root *root;
1216
1217 while (!list_empty(head: &fs_info->allocated_roots)) {
1218 char buf[BTRFS_ROOT_NAME_BUF_LEN];
1219
1220 root = list_first_entry(&fs_info->allocated_roots,
1221 struct btrfs_root, leak_list);
1222 btrfs_err(fs_info, "leaked root %s refcount %d",
1223 btrfs_root_name(&root->root_key, buf),
1224 refcount_read(&root->refs));
1225 WARN_ON_ONCE(1);
1226 while (refcount_read(r: &root->refs) > 1)
1227 btrfs_put_root(root);
1228 btrfs_put_root(root);
1229 }
1230#endif
1231}
1232
1233static void free_global_roots(struct btrfs_fs_info *fs_info)
1234{
1235 struct btrfs_root *root;
1236 struct rb_node *node;
1237
1238 while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
1239 root = rb_entry(node, struct btrfs_root, rb_node);
1240 rb_erase(&root->rb_node, &fs_info->global_root_tree);
1241 btrfs_put_root(root);
1242 }
1243}
1244
1245void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1246{
1247 struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
1248
1249 percpu_counter_destroy(fbc: &fs_info->stats_read_blocks);
1250 percpu_counter_destroy(fbc: &fs_info->dirty_metadata_bytes);
1251 percpu_counter_destroy(fbc: &fs_info->delalloc_bytes);
1252 percpu_counter_destroy(fbc: &fs_info->ordered_bytes);
1253 if (percpu_counter_initialized(fbc: em_counter))
1254 ASSERT(percpu_counter_sum_positive(em_counter) == 0);
1255 percpu_counter_destroy(fbc: em_counter);
1256 percpu_counter_destroy(fbc: &fs_info->dev_replace.bio_counter);
1257 btrfs_free_csum_hash(fs_info);
1258 btrfs_free_stripe_hash_table(info: fs_info);
1259 btrfs_free_ref_cache(fs_info);
1260 kfree(objp: fs_info->balance_ctl);
1261 kfree(objp: fs_info->delayed_root);
1262 free_global_roots(fs_info);
1263 btrfs_put_root(root: fs_info->tree_root);
1264 btrfs_put_root(root: fs_info->chunk_root);
1265 btrfs_put_root(root: fs_info->dev_root);
1266 btrfs_put_root(root: fs_info->quota_root);
1267 btrfs_put_root(root: fs_info->uuid_root);
1268 btrfs_put_root(root: fs_info->fs_root);
1269 btrfs_put_root(root: fs_info->data_reloc_root);
1270 btrfs_put_root(root: fs_info->block_group_root);
1271 btrfs_put_root(root: fs_info->stripe_root);
1272 btrfs_check_leaked_roots(fs_info);
1273 btrfs_extent_buffer_leak_debug_check(fs_info);
1274 kfree(objp: fs_info->super_copy);
1275 kfree(objp: fs_info->super_for_commit);
1276 kvfree(addr: fs_info);
1277}
1278
1279
1280/*
1281 * Get an in-memory reference of a root structure.
1282 *
1283 * For essential trees like root/extent tree, we grab it from fs_info directly.
1284 * For subvolume trees, we check the cached filesystem roots first. If not
1285 * found, then read it from disk and add it to cached fs roots.
1286 *
1287 * Caller should release the root by calling btrfs_put_root() after the usage.
1288 *
1289 * NOTE: Reloc and log trees can't be read by this function as they share the
1290 * same root objectid.
1291 *
1292 * @objectid: root id
1293 * @anon_dev: preallocated anonymous block device number for new roots,
1294 * pass NULL for a new allocation.
 * @check_ref: whether to check root item references. If true, return -ENOENT
 *             for orphan roots
1297 */
1298static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1299 u64 objectid, dev_t *anon_dev,
1300 bool check_ref)
1301{
1302 struct btrfs_root *root;
1303 struct btrfs_path *path;
1304 struct btrfs_key key;
1305 int ret;
1306
1307 root = btrfs_get_global_root(fs_info, objectid);
1308 if (root)
1309 return root;
1310
1311 /*
1312 * If we're called for non-subvolume trees, and above function didn't
1313 * find one, do not try to read it from disk.
1314 *
1315 * This is namely for free-space-tree and quota tree, which can change
1316 * at runtime and should only be grabbed from fs_info.
1317 */
1318 if (!is_fstree(rootid: objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1319 return ERR_PTR(error: -ENOENT);
1320again:
1321 root = btrfs_lookup_fs_root(fs_info, root_id: objectid);
1322 if (root) {
1323 /*
1324 * Some other caller may have read out the newly inserted
1325 * subvolume already (for things like backref walk etc). Not
1326 * that common but still possible. In that case, we just need
1327 * to free the anon_dev.
1328 */
1329 if (unlikely(anon_dev && *anon_dev)) {
1330 free_anon_bdev(*anon_dev);
1331 *anon_dev = 0;
1332 }
1333
1334 if (check_ref && btrfs_root_refs(s: &root->root_item) == 0) {
1335 btrfs_put_root(root);
1336 return ERR_PTR(error: -ENOENT);
1337 }
1338 return root;
1339 }
1340
1341 key.objectid = objectid;
1342 key.type = BTRFS_ROOT_ITEM_KEY;
1343 key.offset = (u64)-1;
1344 root = btrfs_read_tree_root(tree_root: fs_info->tree_root, key: &key);
1345 if (IS_ERR(ptr: root))
1346 return root;
1347
1348 if (check_ref && btrfs_root_refs(s: &root->root_item) == 0) {
1349 ret = -ENOENT;
1350 goto fail;
1351 }
1352
1353 ret = btrfs_init_fs_root(root, anon_dev: anon_dev ? *anon_dev : 0);
1354 if (ret)
1355 goto fail;
1356
1357 path = btrfs_alloc_path();
1358 if (!path) {
1359 ret = -ENOMEM;
1360 goto fail;
1361 }
1362 key.objectid = BTRFS_ORPHAN_OBJECTID;
1363 key.type = BTRFS_ORPHAN_ITEM_KEY;
1364 key.offset = objectid;
1365
1366 ret = btrfs_search_slot(NULL, root: fs_info->tree_root, key: &key, p: path, ins_len: 0, cow: 0);
1367 btrfs_free_path(p: path);
1368 if (ret < 0)
1369 goto fail;
1370 if (ret == 0)
1371 set_bit(nr: BTRFS_ROOT_ORPHAN_ITEM_INSERTED, addr: &root->state);
1372
1373 ret = btrfs_insert_fs_root(fs_info, root);
1374 if (ret) {
1375 if (ret == -EEXIST) {
1376 btrfs_put_root(root);
1377 goto again;
1378 }
1379 goto fail;
1380 }
1381 return root;
1382fail:
	/*
	 * If our caller provided us an anonymous device, then it is the
	 * caller's responsibility to free it in case we fail. So we have to
	 * set our root's anon_dev to 0 to avoid a double free, once by
	 * btrfs_put_root() and once again by our caller.
	 */
1389 if (anon_dev && *anon_dev)
1390 root->anon_dev = 0;
1391 btrfs_put_root(root);
1392 return ERR_PTR(error: ret);
1393}
1394
1395/*
1396 * Get in-memory reference of a root structure
1397 *
1398 * @objectid: tree objectid
1399 * @check_ref: if set, verify that the tree exists and the item has at least
1400 * one reference
1401 */
1402struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1403 u64 objectid, bool check_ref)
1404{
1405 return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
1406}
1407
/*
 * Get an in-memory reference of a root structure that is created as new,
 * optionally passing a preallocated anonymous block device id.
 *
 * @objectid:	tree objectid
 * @anon_dev:	if NULL, allocate a new anonymous block device, otherwise use
 *		the value it points to
 */
1416struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1417 u64 objectid, dev_t *anon_dev)
1418{
1419 return btrfs_get_root_ref(fs_info, objectid, anon_dev, check_ref: true);
1420}
1421
1422/*
1423 * Return a root for the given objectid.
1424 *
1425 * @fs_info: the fs_info
1426 * @objectid: the objectid we need to lookup
1427 *
1428 * This is exclusively used for backref walking, and exists specifically because
1429 * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
1430 * creation time, which means we may have to read the tree_root in order to look
1431 * up a fs root that is not in memory. If the root is not in memory we will
1432 * read the tree root commit root and look up the fs root from there. This is a
1433 * temporary root, it will not be inserted into the radix tree as it doesn't
1434 * have the most uptodate information, it'll simply be discarded once the
1435 * backref code is finished using the root.
1436 */
1437struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1438 struct btrfs_path *path,
1439 u64 objectid)
1440{
1441 struct btrfs_root *root;
1442 struct btrfs_key key;
1443
1444 ASSERT(path->search_commit_root && path->skip_locking);
1445
1446 /*
1447 * This can return -ENOENT if we ask for a root that doesn't exist, but
1448 * since this is called via the backref walking code we won't be looking
1449 * up a root that doesn't exist, unless there's corruption. So if root
1450 * != NULL just return it.
1451 */
1452 root = btrfs_get_global_root(fs_info, objectid);
1453 if (root)
1454 return root;
1455
1456 root = btrfs_lookup_fs_root(fs_info, root_id: objectid);
1457 if (root)
1458 return root;
1459
1460 key.objectid = objectid;
1461 key.type = BTRFS_ROOT_ITEM_KEY;
1462 key.offset = (u64)-1;
1463 root = read_tree_root_path(tree_root: fs_info->tree_root, path, key: &key);
1464 btrfs_release_path(p: path);
1465
1466 return root;
1467}
1468
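/*
 * Background cleaner thread: runs delayed iputs, drops one deleted snapshot
 * at a time, kicks inode defrag and deletes/reclaims unused block groups,
 * then goes back to sleep until woken again.
 */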
1469static int cleaner_kthread(void *arg)
1470{
1471 struct btrfs_fs_info *fs_info = arg;
1472 int again;
1473
1474 while (1) {
1475 again = 0;
1476
1477 set_bit(nr: BTRFS_FS_CLEANER_RUNNING, addr: &fs_info->flags);
1478
1479 /* Make the cleaner go to sleep early. */
1480 if (btrfs_need_cleaner_sleep(fs_info))
1481 goto sleep;
1482
1483 /*
1484 * Do not do anything if we might cause open_ctree() to block
1485 * before we have finished mounting the filesystem.
1486 */
1487 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1488 goto sleep;
1489
1490 if (!mutex_trylock(&fs_info->cleaner_mutex))
1491 goto sleep;
1492
1493 /*
1494 * Avoid the problem that we change the status of the fs
1495 * during the above check and trylock.
1496 */
1497 if (btrfs_need_cleaner_sleep(fs_info)) {
1498 mutex_unlock(lock: &fs_info->cleaner_mutex);
1499 goto sleep;
1500 }
1501
1502 if (test_and_clear_bit(nr: BTRFS_FS_FEATURE_CHANGED, addr: &fs_info->flags))
1503 btrfs_sysfs_feature_update(fs_info);
1504
1505 btrfs_run_delayed_iputs(fs_info);
1506
1507 again = btrfs_clean_one_deleted_snapshot(fs_info);
1508 mutex_unlock(lock: &fs_info->cleaner_mutex);
1509
		/*
		 * The defragger has dealt with the R/O remount and umount,
		 * so we needn't do anything special here.
		 */
1514 btrfs_run_defrag_inodes(fs_info);
1515
1516 /*
1517 * Acquires fs_info->reclaim_bgs_lock to avoid racing
1518 * with relocation (btrfs_relocate_chunk) and relocation
1519 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1520 * after acquiring fs_info->reclaim_bgs_lock. So we
1521 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
1522 * unused block groups.
1523 */
1524 btrfs_delete_unused_bgs(fs_info);
1525
1526 /*
1527 * Reclaim block groups in the reclaim_bgs list after we deleted
1528 * all unused block_groups. This possibly gives us some more free
1529 * space.
1530 */
1531 btrfs_reclaim_bgs(fs_info);
1532sleep:
1533 clear_and_wake_up_bit(bit: BTRFS_FS_CLEANER_RUNNING, word: &fs_info->flags);
1534 if (kthread_should_park())
1535 kthread_parkme();
1536 if (kthread_should_stop())
1537 return 0;
1538 if (!again) {
1539 set_current_state(TASK_INTERRUPTIBLE);
1540 schedule();
1541 __set_current_state(TASK_RUNNING);
1542 }
1543 }
1544}
1545
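/*
 * Background transaction thread: periodically commits the running transaction
 * once it is older than fs_info->commit_interval, or sooner if a commit was
 * explicitly requested.
 */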
1546static int transaction_kthread(void *arg)
1547{
1548 struct btrfs_root *root = arg;
1549 struct btrfs_fs_info *fs_info = root->fs_info;
1550 struct btrfs_trans_handle *trans;
1551 struct btrfs_transaction *cur;
1552 u64 transid;
1553 time64_t delta;
1554 unsigned long delay;
1555 bool cannot_commit;
1556
1557 do {
1558 cannot_commit = false;
1559 delay = secs_to_jiffies(fs_info->commit_interval);
1560 mutex_lock(&fs_info->transaction_kthread_mutex);
1561
1562 spin_lock(lock: &fs_info->trans_lock);
1563 cur = fs_info->running_transaction;
1564 if (!cur) {
1565 spin_unlock(lock: &fs_info->trans_lock);
1566 goto sleep;
1567 }
1568
1569 delta = ktime_get_seconds() - cur->start_time;
1570 if (!test_and_clear_bit(nr: BTRFS_FS_COMMIT_TRANS, addr: &fs_info->flags) &&
1571 cur->state < TRANS_STATE_COMMIT_PREP &&
1572 delta < fs_info->commit_interval) {
1573 spin_unlock(lock: &fs_info->trans_lock);
1574 delay -= secs_to_jiffies(delta - 1);
1575 delay = min(delay,
1576 secs_to_jiffies(fs_info->commit_interval));
1577 goto sleep;
1578 }
1579 transid = cur->transid;
1580 spin_unlock(lock: &fs_info->trans_lock);
1581
1582 /* If the file system is aborted, this will always fail. */
1583 trans = btrfs_attach_transaction(root);
1584 if (IS_ERR(ptr: trans)) {
1585 if (PTR_ERR(ptr: trans) != -ENOENT)
1586 cannot_commit = true;
1587 goto sleep;
1588 }
1589 if (transid == trans->transid) {
1590 btrfs_commit_transaction(trans);
1591 } else {
1592 btrfs_end_transaction(trans);
1593 }
1594sleep:
1595 wake_up_process(tsk: fs_info->cleaner_kthread);
1596 mutex_unlock(lock: &fs_info->transaction_kthread_mutex);
1597
1598 if (BTRFS_FS_ERROR(fs_info))
1599 btrfs_cleanup_transaction(fs_info);
1600 if (!kthread_should_stop() &&
1601 (!btrfs_transaction_blocked(info: fs_info) ||
1602 cannot_commit))
1603 schedule_timeout_interruptible(timeout: delay);
1604 } while (!kthread_should_stop());
1605 return 0;
1606}
1607
/*
 * This will find the highest generation in the array of root backups. The
 * index of that backup slot is returned, or -EINVAL if we can't find
 * anything.
 *
 * We check to make sure the array is valid by comparing the
 * generation of the latest root in the array with the generation
 * in the super block. If they don't match we discard it.
 */
1617static int find_newest_super_backup(struct btrfs_fs_info *info)
1618{
1619 const u64 newest_gen = btrfs_super_generation(s: info->super_copy);
1620 u64 cur;
1621 struct btrfs_root_backup *root_backup;
1622 int i;
1623
1624 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1625 root_backup = info->super_copy->super_roots + i;
1626 cur = btrfs_backup_tree_root_gen(s: root_backup);
1627 if (cur == newest_gen)
1628 return i;
1629 }
1630
1631 return -EINVAL;
1632}
1633
1634/*
1635 * copy all the root pointers into the super backup array.
1636 * this will bump the backup pointer by one when it is
1637 * done
1638 */
1639static void backup_super_roots(struct btrfs_fs_info *info)
1640{
1641 const int next_backup = info->backup_root_index;
1642 struct btrfs_root_backup *root_backup;
1643
1644 root_backup = info->super_for_commit->super_roots + next_backup;
1645
1646 /*
1647 * make sure all of our padding and empty slots get zero filled
1648 * regardless of which ones we use today
1649 */
1650 memset(root_backup, 0, sizeof(*root_backup));
1651
1652 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1653
1654 btrfs_set_backup_tree_root(s: root_backup, val: info->tree_root->node->start);
1655 btrfs_set_backup_tree_root_gen(s: root_backup,
1656 val: btrfs_header_generation(eb: info->tree_root->node));
1657
1658 btrfs_set_backup_tree_root_level(s: root_backup,
1659 val: btrfs_header_level(eb: info->tree_root->node));
1660
1661 btrfs_set_backup_chunk_root(s: root_backup, val: info->chunk_root->node->start);
1662 btrfs_set_backup_chunk_root_gen(s: root_backup,
1663 val: btrfs_header_generation(eb: info->chunk_root->node));
1664 btrfs_set_backup_chunk_root_level(s: root_backup,
1665 val: btrfs_header_level(eb: info->chunk_root->node));
1666
1667 if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
1668 struct btrfs_root *extent_root = btrfs_extent_root(fs_info: info, bytenr: 0);
1669 struct btrfs_root *csum_root = btrfs_csum_root(fs_info: info, bytenr: 0);
1670
1671 btrfs_set_backup_extent_root(s: root_backup,
1672 val: extent_root->node->start);
1673 btrfs_set_backup_extent_root_gen(s: root_backup,
1674 val: btrfs_header_generation(eb: extent_root->node));
1675 btrfs_set_backup_extent_root_level(s: root_backup,
1676 val: btrfs_header_level(eb: extent_root->node));
1677
1678 btrfs_set_backup_csum_root(s: root_backup, val: csum_root->node->start);
1679 btrfs_set_backup_csum_root_gen(s: root_backup,
1680 val: btrfs_header_generation(eb: csum_root->node));
1681 btrfs_set_backup_csum_root_level(s: root_backup,
1682 val: btrfs_header_level(eb: csum_root->node));
1683 }
1684
1685 /*
1686 * we might commit during log recovery, which happens before we set
1687 * the fs_root. Make sure it is valid before we fill it in.
1688 */
1689 if (info->fs_root && info->fs_root->node) {
1690 btrfs_set_backup_fs_root(s: root_backup,
1691 val: info->fs_root->node->start);
1692 btrfs_set_backup_fs_root_gen(s: root_backup,
1693 val: btrfs_header_generation(eb: info->fs_root->node));
1694 btrfs_set_backup_fs_root_level(s: root_backup,
1695 val: btrfs_header_level(eb: info->fs_root->node));
1696 }
1697
1698 btrfs_set_backup_dev_root(s: root_backup, val: info->dev_root->node->start);
1699 btrfs_set_backup_dev_root_gen(s: root_backup,
1700 val: btrfs_header_generation(eb: info->dev_root->node));
1701 btrfs_set_backup_dev_root_level(s: root_backup,
1702 val: btrfs_header_level(eb: info->dev_root->node));
1703
1704 btrfs_set_backup_total_bytes(s: root_backup,
1705 val: btrfs_super_total_bytes(s: info->super_copy));
1706 btrfs_set_backup_bytes_used(s: root_backup,
1707 val: btrfs_super_bytes_used(s: info->super_copy));
1708 btrfs_set_backup_num_devices(s: root_backup,
1709 val: btrfs_super_num_devices(s: info->super_copy));
1710
1711 /*
1712 * if we don't copy this out to the super_copy, it won't get remembered
1713 * for the next commit
1714 */
1715 memcpy(&info->super_copy->super_roots,
1716 &info->super_for_commit->super_roots,
1717 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1718}
1719
1720/*
1721 * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
1722 * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
1723 *
1724 * @fs_info: filesystem whose backup roots need to be read
1725 * @priority: priority of backup root required
1726 *
1727 * Returns backup root index on success and -EINVAL otherwise.
1728 */
1729static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
1730{
1731 int backup_index = find_newest_super_backup(info: fs_info);
1732 struct btrfs_super_block *super = fs_info->super_copy;
1733 struct btrfs_root_backup *root_backup;
1734
1735 if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
1736 if (priority == 0)
1737 return backup_index;
1738
1739 backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
1740 backup_index %= BTRFS_NUM_BACKUP_ROOTS;
1741 } else {
1742 return -EINVAL;
1743 }
1744
1745 root_backup = super->super_roots + backup_index;
1746
1747 btrfs_set_super_generation(s: super,
1748 val: btrfs_backup_tree_root_gen(s: root_backup));
1749 btrfs_set_super_root(s: super, val: btrfs_backup_tree_root(s: root_backup));
1750 btrfs_set_super_root_level(s: super,
1751 val: btrfs_backup_tree_root_level(s: root_backup));
1752 btrfs_set_super_bytes_used(s: super, val: btrfs_backup_bytes_used(s: root_backup));
1753
	/*
	 * FIXME: the total bytes and num_devices need to match, otherwise we
	 * would need an fsck.
	 */
1758 btrfs_set_super_total_bytes(s: super, val: btrfs_backup_total_bytes(s: root_backup));
1759 btrfs_set_super_num_devices(s: super, val: btrfs_backup_num_devices(s: root_backup));
1760
1761 return backup_index;
1762}
1763
1764/* helper to cleanup workers */
1765static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1766{
1767 btrfs_destroy_workqueue(wq: fs_info->fixup_workers);
1768 btrfs_destroy_workqueue(wq: fs_info->delalloc_workers);
1769 btrfs_destroy_workqueue(wq: fs_info->workers);
1770 if (fs_info->endio_workers)
1771 destroy_workqueue(wq: fs_info->endio_workers);
1772 if (fs_info->rmw_workers)
1773 destroy_workqueue(wq: fs_info->rmw_workers);
1774 if (fs_info->compressed_write_workers)
1775 destroy_workqueue(wq: fs_info->compressed_write_workers);
1776 btrfs_destroy_workqueue(wq: fs_info->endio_write_workers);
1777 btrfs_destroy_workqueue(wq: fs_info->endio_freespace_worker);
1778 btrfs_destroy_workqueue(wq: fs_info->delayed_workers);
1779 btrfs_destroy_workqueue(wq: fs_info->caching_workers);
1780 btrfs_destroy_workqueue(wq: fs_info->flush_workers);
1781 btrfs_destroy_workqueue(wq: fs_info->qgroup_rescan_workers);
1782 if (fs_info->discard_ctl.discard_workers)
1783 destroy_workqueue(wq: fs_info->discard_ctl.discard_workers);
1784 /*
1785 * Now that all other work queues are destroyed, we can safely destroy
1786 * the queues used for metadata I/O, since tasks from those other work
1787 * queues can do metadata I/O operations.
1788 */
1789 if (fs_info->endio_meta_workers)
1790 destroy_workqueue(wq: fs_info->endio_meta_workers);
1791}
1792
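/*
 * Release a root's current node and commit root extent buffers and clear the
 * pointers, so the root no longer references any tree blocks.
 */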
1793static void free_root_extent_buffers(struct btrfs_root *root)
1794{
1795 if (root) {
1796 free_extent_buffer(eb: root->node);
1797 free_extent_buffer(eb: root->commit_root);
1798 root->node = NULL;
1799 root->commit_root = NULL;
1800 }
1801}
1802
1803static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
1804{
1805 struct btrfs_root *root, *tmp;
1806
1807 rbtree_postorder_for_each_entry_safe(root, tmp,
1808 &fs_info->global_root_tree,
1809 rb_node)
1810 free_root_extent_buffers(root);
1811}
1812
1813/* Helper to clean up tree roots. */
1814static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
1815{
1816 free_root_extent_buffers(root: info->tree_root);
1817
1818 free_global_root_pointers(fs_info: info);
1819 free_root_extent_buffers(root: info->dev_root);
1820 free_root_extent_buffers(root: info->quota_root);
1821 free_root_extent_buffers(root: info->uuid_root);
1822 free_root_extent_buffers(root: info->fs_root);
1823 free_root_extent_buffers(root: info->data_reloc_root);
1824 free_root_extent_buffers(root: info->block_group_root);
1825 free_root_extent_buffers(root: info->stripe_root);
1826 if (free_chunk_root)
1827 free_root_extent_buffers(root: info->chunk_root);
1828}
1829
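/*
 * Drop a reference on a root. Once the last reference is gone, do a few
 * sanity checks on its state, release its resources and free it.
 */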
1830void btrfs_put_root(struct btrfs_root *root)
1831{
1832 if (!root)
1833 return;
1834
1835 if (refcount_dec_and_test(r: &root->refs)) {
1836 if (WARN_ON(!xa_empty(&root->inodes)))
1837 xa_destroy(&root->inodes);
1838 WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
1839 if (root->anon_dev)
1840 free_anon_bdev(root->anon_dev);
1841 free_root_extent_buffers(root);
1842#ifdef CONFIG_BTRFS_DEBUG
1843 spin_lock(lock: &root->fs_info->fs_roots_radix_lock);
1844 list_del_init(entry: &root->leak_list);
1845 spin_unlock(lock: &root->fs_info->fs_roots_radix_lock);
1846#endif
1847 kfree(objp: root);
1848 }
1849}
1850
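/*
 * Release all roots on the dead roots list and drop every fs root still
 * tracked in the fs_roots_radix tree.
 */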
1851void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
1852{
1853 int ret;
1854 struct btrfs_root *gang[8];
1855 int i;
1856
1857 while (!list_empty(head: &fs_info->dead_roots)) {
1858 gang[0] = list_first_entry(&fs_info->dead_roots,
1859 struct btrfs_root, root_list);
1860 list_del(entry: &gang[0]->root_list);
1861
1862 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
1863 btrfs_drop_and_free_fs_root(fs_info, root: gang[0]);
1864 btrfs_put_root(root: gang[0]);
1865 }
1866
1867 while (1) {
1868 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1869 results: (void **)gang, first_index: 0,
1870 ARRAY_SIZE(gang));
1871 if (!ret)
1872 break;
1873 for (i = 0; i < ret; i++)
1874 btrfs_drop_and_free_fs_root(fs_info, root: gang[i]);
1875 }
1876}
1877
1878static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
1879{
1880 mutex_init(&fs_info->scrub_lock);
1881 atomic_set(v: &fs_info->scrubs_running, i: 0);
1882 atomic_set(v: &fs_info->scrub_pause_req, i: 0);
1883 atomic_set(v: &fs_info->scrubs_paused, i: 0);
1884 atomic_set(v: &fs_info->scrub_cancel_req, i: 0);
1885 init_waitqueue_head(&fs_info->scrub_pause_wait);
1886 refcount_set(r: &fs_info->scrub_workers_refcnt, n: 0);
1887}
1888
1889static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
1890{
1891 spin_lock_init(&fs_info->balance_lock);
1892 mutex_init(&fs_info->balance_mutex);
1893 atomic_set(v: &fs_info->balance_pause_req, i: 0);
1894 atomic_set(v: &fs_info->balance_cancel_req, i: 0);
1895 fs_info->balance_ctl = NULL;
1896 init_waitqueue_head(&fs_info->balance_wait_q);
1897 atomic_set(v: &fs_info->reloc_cancel_req, i: 0);
1898}
1899
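/*
 * Set up the dummy btree inode whose address space backs the metadata
 * (extent buffer) pages.
 */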
1900static int btrfs_init_btree_inode(struct super_block *sb)
1901{
1902 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1903 unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
1904 root: fs_info->tree_root);
1905 struct inode *inode;
1906
1907 inode = new_inode(sb);
1908 if (!inode)
1909 return -ENOMEM;
1910
1911 btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID);
1912 set_nlink(inode, nlink: 1);
1913 /*
1914	 * We set the i_size on the btree inode to the maximum possible offset.
1915	 * The real end of the address space is determined by all of
1916	 * the devices in the system.
1917 */
1918 inode->i_size = OFFSET_MAX;
1919 inode->i_mapping->a_ops = &btree_aops;
1920 mapping_set_gfp_mask(m: inode->i_mapping, GFP_NOFS);
1921
1922 btrfs_extent_io_tree_init(fs_info, tree: &BTRFS_I(inode)->io_tree,
1923 owner: IO_TREE_BTREE_INODE_IO);
1924 btrfs_extent_map_tree_init(tree: &BTRFS_I(inode)->extent_tree);
1925
1926 BTRFS_I(inode)->root = btrfs_grab_root(root: fs_info->tree_root);
1927 set_bit(nr: BTRFS_INODE_DUMMY, addr: &BTRFS_I(inode)->runtime_flags);
1928 __insert_inode_hash(inode, hashval: hash);
1929 fs_info->btree_inode = inode;
1930
1931 return 0;
1932}
1933
1934static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
1935{
1936 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
1937 init_rwsem(&fs_info->dev_replace.rwsem);
1938 init_waitqueue_head(&fs_info->dev_replace.replace_wait);
1939}
1940
1941static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
1942{
1943 spin_lock_init(&fs_info->qgroup_lock);
1944 mutex_init(&fs_info->qgroup_ioctl_lock);
1945 fs_info->qgroup_tree = RB_ROOT;
1946 INIT_LIST_HEAD(list: &fs_info->dirty_qgroups);
1947 fs_info->qgroup_seq = 1;
1948 fs_info->qgroup_ulist = NULL;
1949 fs_info->qgroup_rescan_running = false;
1950 fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
1951 mutex_init(&fs_info->qgroup_rescan_lock);
1952}
1953
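/*
 * Allocate all the work queues used by the filesystem. Returns -ENOMEM if
 * any of the allocations fails.
 */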
1954static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
1955{
1956 u32 max_active = fs_info->thread_pool_size;
1957 unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
1958 unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;
1959
1960 fs_info->workers =
1961 btrfs_alloc_workqueue(fs_info, name: "worker", flags, limit_active: max_active, thresh: 16);
1962
1963 fs_info->delalloc_workers =
1964 btrfs_alloc_workqueue(fs_info, name: "delalloc",
1965 flags, limit_active: max_active, thresh: 2);
1966
1967 fs_info->flush_workers =
1968 btrfs_alloc_workqueue(fs_info, name: "flush_delalloc",
1969 flags, limit_active: max_active, thresh: 0);
1970
1971 fs_info->caching_workers =
1972 btrfs_alloc_workqueue(fs_info, name: "cache", flags, limit_active: max_active, thresh: 0);
1973
1974 fs_info->fixup_workers =
1975 btrfs_alloc_ordered_workqueue(fs_info, name: "fixup", flags: ordered_flags);
1976
1977 fs_info->endio_workers =
1978 alloc_workqueue(fmt: "btrfs-endio", flags, max_active);
1979 fs_info->endio_meta_workers =
1980 alloc_workqueue(fmt: "btrfs-endio-meta", flags, max_active);
1981 fs_info->rmw_workers = alloc_workqueue(fmt: "btrfs-rmw", flags, max_active);
1982 fs_info->endio_write_workers =
1983 btrfs_alloc_workqueue(fs_info, name: "endio-write", flags,
1984 limit_active: max_active, thresh: 2);
1985 fs_info->compressed_write_workers =
1986 alloc_workqueue(fmt: "btrfs-compressed-write", flags, max_active);
1987 fs_info->endio_freespace_worker =
1988 btrfs_alloc_workqueue(fs_info, name: "freespace-write", flags,
1989 limit_active: max_active, thresh: 0);
1990 fs_info->delayed_workers =
1991 btrfs_alloc_workqueue(fs_info, name: "delayed-meta", flags,
1992 limit_active: max_active, thresh: 0);
1993 fs_info->qgroup_rescan_workers =
1994 btrfs_alloc_ordered_workqueue(fs_info, name: "qgroup-rescan",
1995 flags: ordered_flags);
1996 fs_info->discard_ctl.discard_workers =
1997 alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE);
1998
1999 if (!(fs_info->workers &&
2000 fs_info->delalloc_workers && fs_info->flush_workers &&
2001 fs_info->endio_workers && fs_info->endio_meta_workers &&
2002 fs_info->compressed_write_workers &&
2003 fs_info->endio_write_workers &&
2004 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2005 fs_info->caching_workers && fs_info->fixup_workers &&
2006 fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
2007 fs_info->discard_ctl.discard_workers)) {
2008 return -ENOMEM;
2009 }
2010
2011 return 0;
2012}
2013
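/*
 * Allocate the checksum shash for the given csum type and note whether the
 * implementation is a fast (accelerated) one.
 */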
2014static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2015{
2016 struct crypto_shash *csum_shash;
2017 const char *csum_driver = btrfs_super_csum_driver(csum_type);
2018
2019 csum_shash = crypto_alloc_shash(alg_name: csum_driver, type: 0, mask: 0);
2020
2021 if (IS_ERR(ptr: csum_shash)) {
2022 btrfs_err(fs_info, "error allocating %s hash for checksum",
2023 csum_driver);
2024 return PTR_ERR(ptr: csum_shash);
2025 }
2026
2027 fs_info->csum_shash = csum_shash;
2028
2029 /*
2030 * Check if the checksum implementation is a fast accelerated one.
2031 * As-is this is a bit of a hack and should be replaced once the csum
2032 * implementations provide that information themselves.
2033 */
2034 switch (csum_type) {
2035 case BTRFS_CSUM_TYPE_CRC32:
2036 if (!strstr(crypto_shash_driver_name(tfm: csum_shash), "generic"))
2037 set_bit(nr: BTRFS_FS_CSUM_IMPL_FAST, addr: &fs_info->flags);
2038 break;
2039 case BTRFS_CSUM_TYPE_XXHASH:
2040 set_bit(nr: BTRFS_FS_CSUM_IMPL_FAST, addr: &fs_info->flags);
2041 break;
2042 default:
2043 break;
2044 }
2045
2046 btrfs_info(fs_info, "using %s (%s) checksum algorithm",
2047 btrfs_super_csum_name(csum_type),
2048 crypto_shash_driver_name(csum_shash));
2049 return 0;
2050}
2051
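/*
 * Read the log tree root referenced by the super block and replay the tree
 * log. Requires at least one writable device.
 */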
2052static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2053 struct btrfs_fs_devices *fs_devices)
2054{
2055 int ret;
2056 struct btrfs_tree_parent_check check = { 0 };
2057 struct btrfs_root *log_tree_root;
2058 struct btrfs_super_block *disk_super = fs_info->super_copy;
2059 u64 bytenr = btrfs_super_log_root(s: disk_super);
2060 int level = btrfs_super_log_root_level(s: disk_super);
2061
2062 if (fs_devices->rw_devices == 0) {
2063 btrfs_warn(fs_info, "log replay required on RO media");
2064 return -EIO;
2065 }
2066
2067 log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2068 GFP_KERNEL);
2069 if (!log_tree_root)
2070 return -ENOMEM;
2071
2072 check.level = level;
2073 check.transid = fs_info->generation + 1;
2074 check.owner_root = BTRFS_TREE_LOG_OBJECTID;
2075 log_tree_root->node = read_tree_block(fs_info, bytenr, check: &check);
2076 if (IS_ERR(ptr: log_tree_root->node)) {
2077 btrfs_warn(fs_info, "failed to read log tree");
2078 ret = PTR_ERR(ptr: log_tree_root->node);
2079 log_tree_root->node = NULL;
2080 btrfs_put_root(root: log_tree_root);
2081 return ret;
2082 }
2083 if (!extent_buffer_uptodate(eb: log_tree_root->node)) {
2084 btrfs_err(fs_info, "failed to read log tree");
2085 btrfs_put_root(root: log_tree_root);
2086 return -EIO;
2087 }
2088
2089 /* returns with log_tree_root freed on success */
2090 ret = btrfs_recover_log_trees(tree_root: log_tree_root);
2091 if (ret) {
2092 btrfs_handle_fs_error(fs_info, ret,
2093 "Failed to recover log tree");
2094 btrfs_put_root(root: log_tree_root);
2095 return ret;
2096 }
2097
2098 if (sb_rdonly(sb: fs_info->sb)) {
2099 ret = btrfs_commit_super(fs_info);
2100 if (ret)
2101 return ret;
2102 }
2103
2104 return 0;
2105}
2106
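/*
 * Load all global roots with the given objectid (one per global root id) and
 * insert them into the global root tree.
 */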
2107static int load_global_roots_objectid(struct btrfs_root *tree_root,
2108 struct btrfs_path *path, u64 objectid,
2109 const char *name)
2110{
2111 struct btrfs_fs_info *fs_info = tree_root->fs_info;
2112 struct btrfs_root *root;
2113 u64 max_global_id = 0;
2114 int ret;
2115 struct btrfs_key key = {
2116 .objectid = objectid,
2117 .type = BTRFS_ROOT_ITEM_KEY,
2118 .offset = 0,
2119 };
2120 bool found = false;
2121
2122 /* If we have IGNOREDATACSUMS skip loading these roots. */
2123 if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
2124 btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2125 set_bit(nr: BTRFS_FS_STATE_NO_DATA_CSUMS, addr: &fs_info->fs_state);
2126 return 0;
2127 }
2128
2129 while (1) {
2130 ret = btrfs_search_slot(NULL, root: tree_root, key: &key, p: path, ins_len: 0, cow: 0);
2131 if (ret < 0)
2132 break;
2133
2134 if (path->slots[0] >= btrfs_header_nritems(eb: path->nodes[0])) {
2135 ret = btrfs_next_leaf(root: tree_root, path);
2136 if (ret) {
2137 if (ret > 0)
2138 ret = 0;
2139 break;
2140 }
2141 }
2142 ret = 0;
2143
2144 btrfs_item_key_to_cpu(eb: path->nodes[0], cpu_key: &key, nr: path->slots[0]);
2145 if (key.objectid != objectid)
2146 break;
2147 btrfs_release_path(p: path);
2148
2149 /*
2150		 * Only track this for the extent tree; it'll be the same for
2151		 * every other global root type.
2152 */
2153 if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2154 max_global_id = max(max_global_id, key.offset);
2155
2156 found = true;
2157 root = read_tree_root_path(tree_root, path, key: &key);
2158 if (IS_ERR(ptr: root)) {
2159 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2160 ret = PTR_ERR(ptr: root);
2161 break;
2162 }
2163 set_bit(nr: BTRFS_ROOT_TRACK_DIRTY, addr: &root->state);
2164 ret = btrfs_global_root_insert(root);
2165 if (ret) {
2166 btrfs_put_root(root);
2167 break;
2168 }
2169 key.offset++;
2170 }
2171 btrfs_release_path(p: path);
2172
2173 if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2174 fs_info->nr_global_roots = max_global_id + 1;
2175
2176 if (!found || ret) {
2177 if (objectid == BTRFS_CSUM_TREE_OBJECTID)
2178 set_bit(nr: BTRFS_FS_STATE_NO_DATA_CSUMS, addr: &fs_info->fs_state);
2179
2180 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2181 ret = ret ? ret : -ENOENT;
2182 else
2183 ret = 0;
2184 btrfs_err(fs_info, "failed to load root %s", name);
2185 }
2186 return ret;
2187}
2188
2189static int load_global_roots(struct btrfs_root *tree_root)
2190{
2191 BTRFS_PATH_AUTO_FREE(path);
2192 int ret;
2193
2194 path = btrfs_alloc_path();
2195 if (!path)
2196 return -ENOMEM;
2197
2198 ret = load_global_roots_objectid(tree_root, path,
2199 BTRFS_EXTENT_TREE_OBJECTID, name: "extent");
2200 if (ret)
2201 return ret;
2202 ret = load_global_roots_objectid(tree_root, path,
2203 BTRFS_CSUM_TREE_OBJECTID, name: "csum");
2204 if (ret)
2205 return ret;
2206 if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
2207 return ret;
2208 ret = load_global_roots_objectid(tree_root, path,
2209 BTRFS_FREE_SPACE_TREE_OBJECTID,
2210 name: "free space");
2211
2212 return ret;
2213}
2214
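/*
 * Read the global roots and the other tree roots hanging off the tree root
 * (block group, dev, data reloc, quota, uuid and raid stripe roots). Failures
 * for some of them are tolerated when the IGNOREBADROOTS option is set.
 */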
2215static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2216{
2217 struct btrfs_root *tree_root = fs_info->tree_root;
2218 struct btrfs_root *root;
2219 struct btrfs_key location;
2220 int ret;
2221
2222 ASSERT(fs_info->tree_root);
2223
2224 ret = load_global_roots(tree_root);
2225 if (ret)
2226 return ret;
2227
2228 location.type = BTRFS_ROOT_ITEM_KEY;
2229 location.offset = 0;
2230
2231 if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
2232 location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
2233 root = btrfs_read_tree_root(tree_root, key: &location);
2234 if (IS_ERR(ptr: root)) {
2235 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2236 ret = PTR_ERR(ptr: root);
2237 goto out;
2238 }
2239 } else {
2240 set_bit(nr: BTRFS_ROOT_TRACK_DIRTY, addr: &root->state);
2241 fs_info->block_group_root = root;
2242 }
2243 }
2244
2245 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2246 root = btrfs_read_tree_root(tree_root, key: &location);
2247 if (IS_ERR(ptr: root)) {
2248 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2249 ret = PTR_ERR(ptr: root);
2250 goto out;
2251 }
2252 } else {
2253 set_bit(nr: BTRFS_ROOT_TRACK_DIRTY, addr: &root->state);
2254 fs_info->dev_root = root;
2255 }
2256 /* Initialize fs_info for all devices in any case */
2257 ret = btrfs_init_devices_late(fs_info);
2258 if (ret)
2259 goto out;
2260
2261 /*
2262 * This tree can share blocks with some other fs tree during relocation
2263 * and we need a proper setup by btrfs_get_fs_root
2264 */
2265 root = btrfs_get_fs_root(fs_info: tree_root->fs_info,
2266 BTRFS_DATA_RELOC_TREE_OBJECTID, check_ref: true);
2267 if (IS_ERR(ptr: root)) {
2268 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2269 ret = PTR_ERR(ptr: root);
2270 goto out;
2271 }
2272 } else {
2273 set_bit(nr: BTRFS_ROOT_TRACK_DIRTY, addr: &root->state);
2274 fs_info->data_reloc_root = root;
2275 }
2276
2277 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2278 root = btrfs_read_tree_root(tree_root, key: &location);
2279 if (!IS_ERR(ptr: root)) {
2280 set_bit(nr: BTRFS_ROOT_TRACK_DIRTY, addr: &root->state);
2281 fs_info->quota_root = root;
2282 }
2283
2284 location.objectid = BTRFS_UUID_TREE_OBJECTID;
2285 root = btrfs_read_tree_root(tree_root, key: &location);
2286 if (IS_ERR(ptr: root)) {
2287 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2288 ret = PTR_ERR(ptr: root);
2289 if (ret != -ENOENT)
2290 goto out;
2291 }
2292 } else {
2293 set_bit(nr: BTRFS_ROOT_TRACK_DIRTY, addr: &root->state);
2294 fs_info->uuid_root = root;
2295 }
2296
2297 if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
2298 location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
2299 root = btrfs_read_tree_root(tree_root, key: &location);
2300 if (IS_ERR(ptr: root)) {
2301 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2302 ret = PTR_ERR(ptr: root);
2303 goto out;
2304 }
2305 } else {
2306 set_bit(nr: BTRFS_ROOT_TRACK_DIRTY, addr: &root->state);
2307 fs_info->stripe_root = root;
2308 }
2309 }
2310
2311 return 0;
2312out:
2313 btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2314 location.objectid, ret);
2315 return ret;
2316}
2317
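/*
 * Basic validation of the sys_chunk_array in the super block: every entry
 * must be a SYSTEM chunk item and must fit within the declared array size.
 */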
2318static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
2319 const struct btrfs_super_block *sb)
2320{
2321 unsigned int cur = 0; /* Offset inside the sys chunk array */
2322 /*
2323 * At sb read time, fs_info is not fully initialized. Thus we have
2324 * to use super block sectorsize, which should have been validated.
2325 */
2326 const u32 sectorsize = btrfs_super_sectorsize(s: sb);
2327 u32 sys_array_size = btrfs_super_sys_array_size(s: sb);
2328
2329 if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2330 btrfs_err(fs_info, "system chunk array too big %u > %u",
2331 sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2332 return -EUCLEAN;
2333 }
2334
2335 while (cur < sys_array_size) {
2336 struct btrfs_disk_key *disk_key;
2337 struct btrfs_chunk *chunk;
2338 struct btrfs_key key;
2339 u64 type;
2340 u16 num_stripes;
2341 u32 len;
2342 int ret;
2343
2344 disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
2345 len = sizeof(*disk_key);
2346
2347 if (cur + len > sys_array_size)
2348 goto short_read;
2349 cur += len;
2350
2351 btrfs_disk_key_to_cpu(cpu_key: &key, disk_key);
2352 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
2353 btrfs_err(fs_info,
2354 "unexpected item type %u in sys_array at offset %u",
2355 key.type, cur);
2356 return -EUCLEAN;
2357 }
2358 chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
2359 num_stripes = btrfs_stack_chunk_num_stripes(s: chunk);
2360 if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)
2361 goto short_read;
2362 type = btrfs_stack_chunk_type(s: chunk);
2363 if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
2364 btrfs_err(fs_info,
2365 "invalid chunk type %llu in sys_array at offset %u",
2366 type, cur);
2367 return -EUCLEAN;
2368 }
2369 ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, logical: key.offset,
2370 sectorsize);
2371 if (ret < 0)
2372 return ret;
2373 cur += btrfs_chunk_item_size(num_stripes);
2374 }
2375 return 0;
2376short_read:
2377 btrfs_err(fs_info,
2378 "super block sys chunk array short read, cur=%u sys_array_size=%u",
2379 cur, sys_array_size);
2380 return -EUCLEAN;
2381}
2382
2383/*
2384 * Real super block validation
2385 * NOTE: super csum type and incompat features will not be checked here.
2386 *
2387 * @sb: super block to check
2388 * @mirror_num:	the super block copy whose bytenr should be checked:
2389 * 0 the primary (1st) sb
2390 * 1, 2 2nd and 3rd backup copy
2391 * -1 skip bytenr check
2392 */
2393int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
2394 const struct btrfs_super_block *sb, int mirror_num)
2395{
2396 u64 nodesize = btrfs_super_nodesize(s: sb);
2397 u64 sectorsize = btrfs_super_sectorsize(s: sb);
2398 int ret = 0;
2399 const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS);
2400
2401 if (btrfs_super_magic(s: sb) != BTRFS_MAGIC) {
2402 btrfs_err(fs_info, "no valid FS found");
2403 ret = -EINVAL;
2404 }
2405 if ((btrfs_super_flags(s: sb) & ~BTRFS_SUPER_FLAG_SUPP)) {
2406 if (!ignore_flags) {
2407 btrfs_err(fs_info,
2408 "unrecognized or unsupported super flag 0x%llx",
2409 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2410 ret = -EINVAL;
2411 } else {
2412 btrfs_info(fs_info,
2413 "unrecognized or unsupported super flags: 0x%llx, ignored",
2414 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2415 }
2416 }
2417 if (btrfs_super_root_level(s: sb) >= BTRFS_MAX_LEVEL) {
2418 btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2419 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2420 ret = -EINVAL;
2421 }
2422 if (btrfs_super_chunk_root_level(s: sb) >= BTRFS_MAX_LEVEL) {
2423 btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2424 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2425 ret = -EINVAL;
2426 }
2427 if (btrfs_super_log_root_level(s: sb) >= BTRFS_MAX_LEVEL) {
2428 btrfs_err(fs_info, "log_root level too big: %d >= %d",
2429 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2430 ret = -EINVAL;
2431 }
2432
2433 /*
2434	 * Check sectorsize and nodesize first; other checks will need them.
2435	 * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
2436 */
2437 if (!is_power_of_2(n: sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
2438 sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2439 btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2440 ret = -EINVAL;
2441 }
2442
2443 /*
2444 * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
2445 *
2446	 * For 4K page sized systems with non-debug builds, all three match (4K).
2447	 * For 4K page sized systems with debug builds, two block sizes are
2448	 * supported (4K and 2K).
2449 *
2450	 * We can support a 16K sectorsize with a 64K page size without problems,
2451	 * but such a sectorsize/pagesize combination doesn't make much sense.
2452	 * 4K will be our future standard; PAGE_SIZE has been supported from the
2453	 * very beginning.
2454 */
2455 if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
2456 sectorsize != PAGE_SIZE &&
2457 sectorsize != BTRFS_MIN_BLOCKSIZE)) {
2458 btrfs_err(fs_info,
2459 "sectorsize %llu not yet supported for page size %lu",
2460 sectorsize, PAGE_SIZE);
2461 ret = -EINVAL;
2462 }
2463
2464 if (!is_power_of_2(n: nodesize) || nodesize < sectorsize ||
2465 nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2466 btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2467 ret = -EINVAL;
2468 }
2469 if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2470 btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2471 le32_to_cpu(sb->__unused_leafsize), nodesize);
2472 ret = -EINVAL;
2473 }
2474
2475 /* Root alignment check */
2476 if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2477 btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2478 btrfs_super_root(sb));
2479 ret = -EINVAL;
2480 }
2481 if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2482 btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2483 btrfs_super_chunk_root(sb));
2484 ret = -EINVAL;
2485 }
2486 if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2487 btrfs_warn(fs_info, "log_root block unaligned: %llu",
2488 btrfs_super_log_root(sb));
2489 ret = -EINVAL;
2490 }
2491
2492 if (!fs_info->fs_devices->temp_fsid &&
2493 memcmp(p: fs_info->fs_devices->fsid, q: sb->fsid, BTRFS_FSID_SIZE) != 0) {
2494 btrfs_err(fs_info,
2495 "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2496 sb->fsid, fs_info->fs_devices->fsid);
2497 ret = -EINVAL;
2498 }
2499
2500 if (memcmp(p: fs_info->fs_devices->metadata_uuid, q: btrfs_sb_fsid_ptr(sb),
2501 BTRFS_FSID_SIZE) != 0) {
2502 btrfs_err(fs_info,
2503"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2504 btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
2505 ret = -EINVAL;
2506 }
2507
2508 if (memcmp(p: fs_info->fs_devices->metadata_uuid, q: sb->dev_item.fsid,
2509 BTRFS_FSID_SIZE) != 0) {
2510 btrfs_err(fs_info,
2511 "dev_item UUID does not match metadata fsid: %pU != %pU",
2512 fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2513 ret = -EINVAL;
2514 }
2515
2516 /*
2517 * Artificial requirement for block-group-tree to force newer features
2518 * (free-space-tree, no-holes) so the test matrix is smaller.
2519 */
2520 if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
2521 (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
2522 !btrfs_fs_incompat(fs_info, NO_HOLES))) {
2523 btrfs_err(fs_info,
2524 "block-group-tree feature requires free-space-tree and no-holes");
2525 ret = -EINVAL;
2526 }
2527
2528 /*
2529	 * Hint to catch really bogus numbers, bitflips or so; more exact checks
2530	 * are done later.
2531 */
2532 if (btrfs_super_bytes_used(s: sb) < 6 * btrfs_super_nodesize(s: sb)) {
2533 btrfs_err(fs_info, "bytes_used is too small %llu",
2534 btrfs_super_bytes_used(sb));
2535 ret = -EINVAL;
2536 }
2537 if (!is_power_of_2(n: btrfs_super_stripesize(s: sb))) {
2538 btrfs_err(fs_info, "invalid stripesize %u",
2539 btrfs_super_stripesize(sb));
2540 ret = -EINVAL;
2541 }
2542 if (btrfs_super_num_devices(s: sb) > (1UL << 31))
2543 btrfs_warn(fs_info, "suspicious number of devices: %llu",
2544 btrfs_super_num_devices(sb));
2545 if (btrfs_super_num_devices(s: sb) == 0) {
2546 btrfs_err(fs_info, "number of devices is 0");
2547 ret = -EINVAL;
2548 }
2549
2550 if (mirror_num >= 0 &&
2551 btrfs_super_bytenr(s: sb) != btrfs_sb_offset(mirror: mirror_num)) {
2552 btrfs_err(fs_info, "super offset mismatch %llu != %u",
2553 btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2554 ret = -EINVAL;
2555 }
2556
2557 if (ret)
2558 return ret;
2559
2560 ret = validate_sys_chunk_array(fs_info, sb);
2561
2562 /*
2563	 * Obvious sys_chunk_array corruptions: it must hold at least one key
2564	 * and one chunk.
2565 */
2566 if (btrfs_super_sys_array_size(s: sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2567 btrfs_err(fs_info, "system chunk array too big %u > %u",
2568 btrfs_super_sys_array_size(sb),
2569 BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2570 ret = -EINVAL;
2571 }
2572 if (btrfs_super_sys_array_size(s: sb) < sizeof(struct btrfs_disk_key)
2573 + sizeof(struct btrfs_chunk)) {
2574 btrfs_err(fs_info, "system chunk array too small %u < %zu",
2575 btrfs_super_sys_array_size(sb),
2576 sizeof(struct btrfs_disk_key)
2577 + sizeof(struct btrfs_chunk));
2578 ret = -EINVAL;
2579 }
2580
2581 /*
2582	 * The generation is a global counter. We'll trust it more than the others,
2583	 * but it's still possible that it's the one that's wrong.
2584 */
2585 if (btrfs_super_generation(s: sb) < btrfs_super_chunk_root_generation(s: sb))
2586 btrfs_warn(fs_info,
2587 "suspicious: generation < chunk_root_generation: %llu < %llu",
2588 btrfs_super_generation(sb),
2589 btrfs_super_chunk_root_generation(sb));
2590 if (btrfs_super_generation(s: sb) < btrfs_super_cache_generation(s: sb)
2591 && btrfs_super_cache_generation(s: sb) != (u64)-1)
2592 btrfs_warn(fs_info,
2593 "suspicious: generation < cache_generation: %llu < %llu",
2594 btrfs_super_generation(sb),
2595 btrfs_super_cache_generation(sb));
2596
2597 return ret;
2598}
2599
2600/*
2601 * Validation of super block at mount time.
2602 * Some checks that were already done early at mount time, like csum type
2603 * and incompat flags, will be skipped.
2604 */
2605static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2606{
2607 return btrfs_validate_super(fs_info, sb: fs_info->super_copy, mirror_num: 0);
2608}
2609
2610/*
2611 * Validation of super block at write time.
2612 * Some checks like bytenr check will be skipped as their values will be
2613 * overwritten soon.
2614 * Extra checks like csum type and incompat flags will be done here.
2615 */
2616static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2617 struct btrfs_super_block *sb)
2618{
2619 int ret;
2620
2621 ret = btrfs_validate_super(fs_info, sb, mirror_num: -1);
2622 if (ret < 0)
2623 goto out;
2624 if (!btrfs_supported_super_csum(csum_type: btrfs_super_csum_type(s: sb))) {
2625 ret = -EUCLEAN;
2626 btrfs_err(fs_info, "invalid csum type, has %u want %u",
2627 btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2628 goto out;
2629 }
2630 if (btrfs_super_incompat_flags(s: sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2631 ret = -EUCLEAN;
2632 btrfs_err(fs_info,
2633 "invalid incompat flags, has 0x%llx valid mask 0x%llx",
2634 btrfs_super_incompat_flags(sb),
2635 (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2636 goto out;
2637 }
2638out:
2639 if (ret < 0)
2640 btrfs_err(fs_info,
2641 "super block corruption detected before writing it to disk");
2642 return ret;
2643}
2644
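/*
 * Read the tree block at @bytenr, verifying the expected @gen and @level, and
 * set it up as both the current node and the commit root of @root.
 */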
2645static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
2646{
2647 struct btrfs_tree_parent_check check = {
2648 .level = level,
2649 .transid = gen,
2650 .owner_root = btrfs_root_id(root)
2651 };
2652 int ret = 0;
2653
2654 root->node = read_tree_block(fs_info: root->fs_info, bytenr, check: &check);
2655 if (IS_ERR(ptr: root->node)) {
2656 ret = PTR_ERR(ptr: root->node);
2657 root->node = NULL;
2658 return ret;
2659 }
2660 if (!extent_buffer_uptodate(eb: root->node)) {
2661 free_extent_buffer(eb: root->node);
2662 root->node = NULL;
2663 return -EIO;
2664 }
2665
2666 btrfs_set_root_node(item: &root->root_item, node: root->node);
2667 root->commit_root = btrfs_root_node(root);
2668 btrfs_set_root_refs(s: &root->root_item, val: 1);
2669 return ret;
2670}
2671
2672static int load_important_roots(struct btrfs_fs_info *fs_info)
2673{
2674 struct btrfs_super_block *sb = fs_info->super_copy;
2675 u64 gen, bytenr;
2676 int level, ret;
2677
2678 bytenr = btrfs_super_root(s: sb);
2679 gen = btrfs_super_generation(s: sb);
2680 level = btrfs_super_root_level(s: sb);
2681 ret = load_super_root(root: fs_info->tree_root, bytenr, gen, level);
2682 if (ret) {
2683 btrfs_warn(fs_info, "couldn't read tree root");
2684 return ret;
2685 }
2686 return 0;
2687}
2688
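/*
 * Load the tree root and everything reachable from it. On failure, and only
 * if the usebackuproot option is set, retry with progressively older backup
 * roots.
 */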
2689static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2690{
2691 int backup_index = find_newest_super_backup(info: fs_info);
2692 struct btrfs_super_block *sb = fs_info->super_copy;
2693 struct btrfs_root *tree_root = fs_info->tree_root;
2694 bool handle_error = false;
2695 int ret = 0;
2696 int i;
2697
2698 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2699 if (handle_error) {
2700 if (!IS_ERR(ptr: tree_root->node))
2701 free_extent_buffer(eb: tree_root->node);
2702 tree_root->node = NULL;
2703
2704 if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2705 break;
2706
2707			free_root_pointers(info: fs_info, free_chunk_root: false);
2708
2709 /*
2710			 * Don't use the log in recovery mode, as it won't be
2711			 * valid.
2712 */
2713 btrfs_set_super_log_root(s: sb, val: 0);
2714
2715 btrfs_warn(fs_info, "try to load backup roots slot %d", i);
2716 ret = read_backup_root(fs_info, priority: i);
2717 backup_index = ret;
2718 if (ret < 0)
2719 return ret;
2720 }
2721
2722 ret = load_important_roots(fs_info);
2723 if (ret) {
2724 handle_error = true;
2725 continue;
2726 }
2727
2728 /*
2729 * No need to hold btrfs_root::objectid_mutex since the fs
2730 * hasn't been fully initialised and we are the only user
2731 */
2732 ret = btrfs_init_root_free_objectid(root: tree_root);
2733 if (ret < 0) {
2734 handle_error = true;
2735 continue;
2736 }
2737
2738 ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
2739
2740 ret = btrfs_read_roots(fs_info);
2741 if (ret < 0) {
2742 handle_error = true;
2743 continue;
2744 }
2745
2746 /* All successful */
2747 fs_info->generation = btrfs_header_generation(eb: tree_root->node);
2748 btrfs_set_last_trans_committed(fs_info, gen: fs_info->generation);
2749 fs_info->last_reloc_trans = 0;
2750
2751 /* Always begin writing backup roots after the one being used */
2752 if (backup_index < 0) {
2753 fs_info->backup_root_index = 0;
2754 } else {
2755 fs_info->backup_root_index = backup_index + 1;
2756 fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
2757 }
2758 break;
2759 }
2760
2761 return ret;
2762}
2763
2764/*
2765 * Lockdep gets confused between our buffer_tree which requires IRQ locking because
2766 * we modify marks in the IRQ context, and our delayed inode xarray which doesn't
2767 * have these requirements. Use a class key so lockdep doesn't get them mixed up.
2768 */
2769static struct lock_class_key buffer_xa_class;
2770
2771void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
2772{
2773 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2774
2775 /* Use the same flags as mapping->i_pages. */
2776 xa_init_flags(xa: &fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
2777 lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
2778
2779 INIT_LIST_HEAD(list: &fs_info->trans_list);
2780 INIT_LIST_HEAD(list: &fs_info->dead_roots);
2781 INIT_LIST_HEAD(list: &fs_info->delayed_iputs);
2782 INIT_LIST_HEAD(list: &fs_info->delalloc_roots);
2783 INIT_LIST_HEAD(list: &fs_info->caching_block_groups);
2784 spin_lock_init(&fs_info->delalloc_root_lock);
2785 spin_lock_init(&fs_info->trans_lock);
2786 spin_lock_init(&fs_info->fs_roots_radix_lock);
2787 spin_lock_init(&fs_info->delayed_iput_lock);
2788 spin_lock_init(&fs_info->defrag_inodes_lock);
2789 spin_lock_init(&fs_info->super_lock);
2790 spin_lock_init(&fs_info->unused_bgs_lock);
2791 spin_lock_init(&fs_info->treelog_bg_lock);
2792 spin_lock_init(&fs_info->zone_active_bgs_lock);
2793 spin_lock_init(&fs_info->relocation_bg_lock);
2794 rwlock_init(&fs_info->tree_mod_log_lock);
2795 rwlock_init(&fs_info->global_root_lock);
2796 mutex_init(&fs_info->unused_bg_unpin_mutex);
2797 mutex_init(&fs_info->reclaim_bgs_lock);
2798 mutex_init(&fs_info->reloc_mutex);
2799 mutex_init(&fs_info->delalloc_root_mutex);
2800 mutex_init(&fs_info->zoned_meta_io_lock);
2801 mutex_init(&fs_info->zoned_data_reloc_io_lock);
2802 seqlock_init(&fs_info->profiles_lock);
2803
2804 btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
2805 btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
2806 btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
2807 btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
2808 btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep,
2809 BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2810 btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
2811 BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2812 btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
2813 BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2814 btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
2815 BTRFS_LOCKDEP_TRANS_COMPLETED);
2816
2817 INIT_LIST_HEAD(list: &fs_info->dirty_cowonly_roots);
2818 INIT_LIST_HEAD(list: &fs_info->space_info);
2819 INIT_LIST_HEAD(list: &fs_info->tree_mod_seq_list);
2820 INIT_LIST_HEAD(list: &fs_info->unused_bgs);
2821 INIT_LIST_HEAD(list: &fs_info->reclaim_bgs);
2822 INIT_LIST_HEAD(list: &fs_info->zone_active_bgs);
2823#ifdef CONFIG_BTRFS_DEBUG
2824 INIT_LIST_HEAD(list: &fs_info->allocated_roots);
2825 INIT_LIST_HEAD(list: &fs_info->allocated_ebs);
2826 spin_lock_init(&fs_info->eb_leak_lock);
2827#endif
2828 fs_info->mapping_tree = RB_ROOT_CACHED;
2829 rwlock_init(&fs_info->mapping_tree_lock);
2830 btrfs_init_block_rsv(rsv: &fs_info->global_block_rsv,
2831 type: BTRFS_BLOCK_RSV_GLOBAL);
2832 btrfs_init_block_rsv(rsv: &fs_info->trans_block_rsv, type: BTRFS_BLOCK_RSV_TRANS);
2833 btrfs_init_block_rsv(rsv: &fs_info->chunk_block_rsv, type: BTRFS_BLOCK_RSV_CHUNK);
2834 btrfs_init_block_rsv(rsv: &fs_info->treelog_rsv, type: BTRFS_BLOCK_RSV_TREELOG);
2835 btrfs_init_block_rsv(rsv: &fs_info->empty_block_rsv, type: BTRFS_BLOCK_RSV_EMPTY);
2836 btrfs_init_block_rsv(rsv: &fs_info->delayed_block_rsv,
2837 type: BTRFS_BLOCK_RSV_DELOPS);
2838 btrfs_init_block_rsv(rsv: &fs_info->delayed_refs_rsv,
2839 type: BTRFS_BLOCK_RSV_DELREFS);
2840
2841 atomic_set(v: &fs_info->async_delalloc_pages, i: 0);
2842 atomic_set(v: &fs_info->defrag_running, i: 0);
2843 atomic_set(v: &fs_info->nr_delayed_iputs, i: 0);
2844 atomic64_set(v: &fs_info->tree_mod_seq, i: 0);
2845 fs_info->global_root_tree = RB_ROOT;
2846 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2847 fs_info->metadata_ratio = 0;
2848 fs_info->defrag_inodes = RB_ROOT;
2849 atomic64_set(v: &fs_info->free_chunk_space, i: 0);
2850 fs_info->tree_mod_log = RB_ROOT;
2851 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2852 btrfs_init_ref_verify(fs_info);
2853
2854 fs_info->thread_pool_size = min_t(unsigned long,
2855 num_online_cpus() + 2, 8);
2856
2857 INIT_LIST_HEAD(list: &fs_info->ordered_roots);
2858 spin_lock_init(&fs_info->ordered_root_lock);
2859
2860 btrfs_init_scrub(fs_info);
2861 btrfs_init_balance(fs_info);
2862 btrfs_init_async_reclaim_work(fs_info);
2863 btrfs_init_extent_map_shrinker_work(fs_info);
2864
2865 rwlock_init(&fs_info->block_group_cache_lock);
2866 fs_info->block_group_cache_tree = RB_ROOT_CACHED;
2867
2868 btrfs_extent_io_tree_init(fs_info, tree: &fs_info->excluded_extents,
2869 owner: IO_TREE_FS_EXCLUDED_EXTENTS);
2870
2871 mutex_init(&fs_info->ordered_operations_mutex);
2872 mutex_init(&fs_info->tree_log_mutex);
2873 mutex_init(&fs_info->chunk_mutex);
2874 mutex_init(&fs_info->transaction_kthread_mutex);
2875 mutex_init(&fs_info->cleaner_mutex);
2876 mutex_init(&fs_info->ro_block_group_mutex);
2877 init_rwsem(&fs_info->commit_root_sem);
2878 init_rwsem(&fs_info->cleanup_work_sem);
2879 init_rwsem(&fs_info->subvol_sem);
2880 sema_init(sem: &fs_info->uuid_tree_rescan_sem, val: 1);
2881
2882 btrfs_init_dev_replace_locks(fs_info);
2883 btrfs_init_qgroup(fs_info);
2884 btrfs_discard_init(fs_info);
2885
2886 btrfs_init_free_cluster(cluster: &fs_info->meta_alloc_cluster);
2887 btrfs_init_free_cluster(cluster: &fs_info->data_alloc_cluster);
2888
2889 init_waitqueue_head(&fs_info->transaction_throttle);
2890 init_waitqueue_head(&fs_info->transaction_wait);
2891 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2892 init_waitqueue_head(&fs_info->async_submit_wait);
2893 init_waitqueue_head(&fs_info->delayed_iputs_wait);
2894
2895 /* Usable values until the real ones are cached from the superblock */
2896 fs_info->nodesize = 4096;
2897 fs_info->sectorsize = 4096;
2898 fs_info->sectorsize_bits = ilog2(4096);
2899 fs_info->stripesize = 4096;
2900
2901 /* Default compress algorithm when user does -o compress */
2902 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2903
2904 fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
2905
2906 spin_lock_init(&fs_info->swapfile_pins_lock);
2907 fs_info->swapfile_pins = RB_ROOT;
2908
2909 fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
2910 INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
2911}
2912
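/*
 * Initialize the parts of fs_info that need memory allocations (per-cpu
 * counters, the delayed root and the stripe hash table) for a filesystem
 * that is about to be mounted.
 */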
2913static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
2914{
2915 int ret;
2916
2917 fs_info->sb = sb;
2918 /* Temporary fixed values for block size until we read the superblock. */
2919 sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
2920 sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
2921
2922 ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
2923 if (ret)
2924 return ret;
2925
2926 ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL);
2927 if (ret)
2928 return ret;
2929
2930 ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2931 if (ret)
2932 return ret;
2933
2934 ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL);
2935 if (ret)
2936 return ret;
2937
2938 fs_info->dirty_metadata_batch = PAGE_SIZE *
2939 (1 + ilog2(nr_cpu_ids));
2940
2941 ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2942 if (ret)
2943 return ret;
2944
2945 ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
2946 GFP_KERNEL);
2947 if (ret)
2948 return ret;
2949
2950 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2951 GFP_KERNEL);
2952 if (!fs_info->delayed_root)
2953 return -ENOMEM;
2954 btrfs_init_delayed_root(delayed_root: fs_info->delayed_root);
2955
2956 if (sb_rdonly(sb))
2957 set_bit(nr: BTRFS_FS_STATE_RO, addr: &fs_info->fs_state);
2958 if (btrfs_test_opt(fs_info, IGNOREMETACSUMS))
2959 set_bit(nr: BTRFS_FS_STATE_SKIP_META_CSUMS, addr: &fs_info->fs_state);
2960
2961 return btrfs_alloc_stripe_hash_table(info: fs_info);
2962}
2963
2964static int btrfs_uuid_rescan_kthread(void *data)
2965{
2966 struct btrfs_fs_info *fs_info = data;
2967 int ret;
2968
2969 /*
2970 * 1st step is to iterate through the existing UUID tree and
2971 * to delete all entries that contain outdated data.
2972 * 2nd step is to add all missing entries to the UUID tree.
2973 */
2974 ret = btrfs_uuid_tree_iterate(fs_info);
2975 if (ret < 0) {
2976 if (ret != -EINTR)
2977 btrfs_warn(fs_info, "iterating uuid_tree failed %d",
2978 ret);
2979 up(sem: &fs_info->uuid_tree_rescan_sem);
2980 return ret;
2981 }
2982 return btrfs_uuid_scan_kthread(data);
2983}
2984
2985static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
2986{
2987 struct task_struct *task;
2988
2989 down(sem: &fs_info->uuid_tree_rescan_sem);
2990 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
2991 if (IS_ERR(ptr: task)) {
2992		/* fs_info->update_uuid_tree_gen remains 0 in all error cases. */
2993 btrfs_warn(fs_info, "failed to start uuid_rescan task");
2994 up(sem: &fs_info->uuid_tree_rescan_sem);
2995 return PTR_ERR(ptr: task);
2996 }
2997
2998 return 0;
2999}
3000
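/*
 * Run orphan cleanup on every fs root currently cached in the fs_roots_radix
 * tree, returning the first error encountered.
 */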
3001static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3002{
3003 u64 root_objectid = 0;
3004 struct btrfs_root *gang[8];
3005 int ret = 0;
3006
3007 while (1) {
3008 unsigned int found;
3009
3010 spin_lock(lock: &fs_info->fs_roots_radix_lock);
3011 found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
3012 results: (void **)gang, first_index: root_objectid,
3013 ARRAY_SIZE(gang));
3014 if (!found) {
3015 spin_unlock(lock: &fs_info->fs_roots_radix_lock);
3016 break;
3017 }
3018 root_objectid = btrfs_root_id(root: gang[found - 1]) + 1;
3019
3020 for (int i = 0; i < found; i++) {
3021			/* Avoid grabbing roots in dead_roots. */
3022 if (btrfs_root_refs(s: &gang[i]->root_item) == 0) {
3023 gang[i] = NULL;
3024 continue;
3025 }
3026			/* Grab all the search results for later use. */
3027 gang[i] = btrfs_grab_root(root: gang[i]);
3028 }
3029 spin_unlock(lock: &fs_info->fs_roots_radix_lock);
3030
3031 for (int i = 0; i < found; i++) {
3032 if (!gang[i])
3033 continue;
3034 root_objectid = btrfs_root_id(root: gang[i]);
3035 /*
3036 * Continue to release the remaining roots after the first
3037 * error without cleanup and preserve the first error
3038 * for the return.
3039 */
3040 if (!ret)
3041 ret = btrfs_orphan_cleanup(root: gang[i]);
3042 btrfs_put_root(root: gang[i]);
3043 }
3044 if (ret)
3045 break;
3046
3047 root_objectid++;
3048 }
3049 return ret;
3050}
3051
3052/*
3053 * Mounting logic specific to read-write file systems. Shared by open_ctree
3054 * and btrfs_remount when remounting from read-only to read-write.
3055 */
3056int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
3057{
3058 int ret;
3059 const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
3060 bool rebuild_free_space_tree = false;
3061
3062 if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3063 btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3064 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
3065 btrfs_warn(fs_info,
3066 "'clear_cache' option is ignored with extent tree v2");
3067 else
3068 rebuild_free_space_tree = true;
3069 } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3070 !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3071 btrfs_warn(fs_info, "free space tree is invalid");
3072 rebuild_free_space_tree = true;
3073 }
3074
3075 if (rebuild_free_space_tree) {
3076 btrfs_info(fs_info, "rebuilding free space tree");
3077 ret = btrfs_rebuild_free_space_tree(fs_info);
3078 if (ret) {
3079 btrfs_warn(fs_info,
3080 "failed to rebuild free space tree: %d", ret);
3081 goto out;
3082 }
3083 }
3084
3085 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3086 !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
3087 btrfs_info(fs_info, "disabling free space tree");
3088 ret = btrfs_delete_free_space_tree(fs_info);
3089 if (ret) {
3090 btrfs_warn(fs_info,
3091 "failed to disable free space tree: %d", ret);
3092 goto out;
3093 }
3094 }
3095
3096 /*
3097 * btrfs_find_orphan_roots() is responsible for finding all the dead
3098 * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
3099 * them into the fs_info->fs_roots_radix tree. This must be done before
3100 * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
3101 * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
3102 * item before the root's tree is deleted - this means that if we unmount
3103 * or crash before the deletion completes, on the next mount we will not
3104	 * delete what remains of the tree because the orphan item no longer
3105	 * exists, which is what tells us we have a pending deletion.
3106 */
3107 ret = btrfs_find_orphan_roots(fs_info);
3108 if (ret)
3109 goto out;
3110
3111 ret = btrfs_cleanup_fs_roots(fs_info);
3112 if (ret)
3113 goto out;
3114
3115 down_read(sem: &fs_info->cleanup_work_sem);
3116 if ((ret = btrfs_orphan_cleanup(root: fs_info->fs_root)) ||
3117 (ret = btrfs_orphan_cleanup(root: fs_info->tree_root))) {
3118 up_read(sem: &fs_info->cleanup_work_sem);
3119 goto out;
3120 }
3121 up_read(sem: &fs_info->cleanup_work_sem);
3122
3123 mutex_lock(&fs_info->cleaner_mutex);
3124 ret = btrfs_recover_relocation(fs_info);
3125 mutex_unlock(lock: &fs_info->cleaner_mutex);
3126 if (ret < 0) {
3127 btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
3128 goto out;
3129 }
3130
3131 if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3132 !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3133 btrfs_info(fs_info, "creating free space tree");
3134 ret = btrfs_create_free_space_tree(fs_info);
3135 if (ret) {
3136 btrfs_warn(fs_info,
3137 "failed to create free space tree: %d", ret);
3138 goto out;
3139 }
3140 }
3141
3142 if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
3143 ret = btrfs_set_free_space_cache_v1_active(fs_info, active: cache_opt);
3144 if (ret)
3145 goto out;
3146 }
3147
3148 ret = btrfs_resume_balance_async(fs_info);
3149 if (ret)
3150 goto out;
3151
3152 ret = btrfs_resume_dev_replace_async(fs_info);
3153 if (ret) {
3154 btrfs_warn(fs_info, "failed to resume dev_replace");
3155 goto out;
3156 }
3157
3158 btrfs_qgroup_rescan_resume(fs_info);
3159
3160 if (!fs_info->uuid_root) {
3161 btrfs_info(fs_info, "creating UUID tree");
3162 ret = btrfs_create_uuid_tree(fs_info);
3163 if (ret) {
3164 btrfs_warn(fs_info,
3165 "failed to create the UUID tree %d", ret);
3166 goto out;
3167 }
3168 }
3169
3170out:
3171 return ret;
3172}
3173
3174/*
3175 * Do various sanity and dependency checks of different features.
3176 *
3177 * @is_rw_mount: If the mount is read-write.
3178 *
3179 * This is the place for less strict checks (like for subpage or artificial
3180 * feature dependencies).
3181 *
3182 * For strict checks or possible corruption detection, see
3183 * btrfs_validate_super().
3184 *
3185 * This should be called after btrfs_parse_options(), as some mount options
3186 * (space cache related) can modify on-disk format like free space tree and
3187 * screw up certain feature dependencies.
3188 */
3189int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
3190{
3191 struct btrfs_super_block *disk_super = fs_info->super_copy;
3192 u64 incompat = btrfs_super_incompat_flags(s: disk_super);
3193 const u64 compat_ro = btrfs_super_compat_ro_flags(s: disk_super);
3194 const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
3195
3196 if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
3197 btrfs_err(fs_info,
3198 "cannot mount because of unknown incompat features (0x%llx)",
3199 incompat);
3200 return -EINVAL;
3201 }
3202
3203 /* Runtime limitation for mixed block groups. */
3204 if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3205 (fs_info->sectorsize != fs_info->nodesize)) {
3206 btrfs_err(fs_info,
3207"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3208 fs_info->nodesize, fs_info->sectorsize);
3209 return -EINVAL;
3210 }
3211
3212 /* Mixed backref is an always-enabled feature. */
3213 incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3214
3215 /* Set compression related flags just in case. */
3216 if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3217 incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3218 else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3219 incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3220
3221 /*
3222 * An ancient flag, which should really be marked deprecated.
3223	 * Such a runtime limitation doesn't really need an incompat flag.
3224 */
3225 if (btrfs_super_nodesize(s: disk_super) > PAGE_SIZE)
3226 incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3227
3228 if (compat_ro_unsupp && is_rw_mount) {
3229 btrfs_err(fs_info,
3230 "cannot mount read-write because of unknown compat_ro features (0x%llx)",
3231 compat_ro);
3232 return -EINVAL;
3233 }
3234
3235 /*
3236	 * We have unsupported RO compat features; although we are RO mounted, we
3237	 * should not cause any metadata writes, including log replay, or we could
3238	 * screw up whatever the new feature requires.
3239 */
3240 if (compat_ro_unsupp && btrfs_super_log_root(s: disk_super) &&
3241 !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3242 btrfs_err(fs_info,
3243"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
3244 compat_ro);
3245 return -EINVAL;
3246 }
3247
3248 /*
3249 * Artificial limitations for block group tree, to force
3250 * block-group-tree to rely on no-holes and free-space-tree.
3251 */
3252 if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
3253 (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
3254 !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
3255 btrfs_err(fs_info,
3256"block-group-tree feature requires no-holes and free-space-tree features");
3257 return -EINVAL;
3258 }
3259
3260 /*
3261 * Subpage runtime limitation on v1 cache.
3262 *
3263	 * V1 space cache still has some hard-coded PAGE_SIZE usage, and since
3264	 * we're already defaulting to the v2 cache there is no need to bother
3265	 * with v1 as it's going to be deprecated anyway.
3266 */
3267 if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
3268 btrfs_warn(fs_info,
3269 "v1 space cache is not supported for page size %lu with sectorsize %u",
3270 PAGE_SIZE, fs_info->sectorsize);
3271 return -EINVAL;
3272 }
3273
3274 /* This can be called by remount, we need to protect the super block. */
3275 spin_lock(lock: &fs_info->super_lock);
3276 btrfs_set_super_incompat_flags(s: disk_super, val: incompat);
3277 spin_unlock(lock: &fs_info->super_lock);
3278
3279 return 0;
3280}
3281
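/*
 * Main mount-time setup: read and validate the super block, initialize all
 * the in-memory state, read the chunk tree and the other tree roots, and get
 * the filesystem ready for use.
 */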
3282int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
3283{
3284 u32 sectorsize;
3285 u32 nodesize;
3286 u32 stripesize;
3287 u64 generation;
3288 u16 csum_type;
3289 struct btrfs_super_block *disk_super;
3290 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3291 struct btrfs_root *tree_root;
3292 struct btrfs_root *chunk_root;
3293 int ret;
3294 int level;
3295
3296 ret = init_mount_fs_info(fs_info, sb);
3297 if (ret)
3298 goto fail;
3299
3300 /* These need to be init'ed before we start creating inodes and such. */
3301 tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3302 GFP_KERNEL);
3303 fs_info->tree_root = tree_root;
3304 chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3305 GFP_KERNEL);
3306 fs_info->chunk_root = chunk_root;
3307 if (!tree_root || !chunk_root) {
3308 ret = -ENOMEM;
3309 goto fail;
3310 }
3311
3312 ret = btrfs_init_btree_inode(sb);
3313 if (ret)
3314 goto fail;
3315
3316 invalidate_bdev(bdev: fs_devices->latest_dev->bdev);
3317
3318 /*
3319 * Read super block and check the signature bytes only
3320 */
3321 disk_super = btrfs_read_disk_super(bdev: fs_devices->latest_dev->bdev, copy_num: 0, drop_cache: false);
3322 if (IS_ERR(ptr: disk_super)) {
3323 ret = PTR_ERR(ptr: disk_super);
3324 goto fail_alloc;
3325 }
3326
3327 btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
3328 /*
3329	 * Verify the checksum type first; if that or the checksum value is
3330	 * corrupted, we'll find out.
3331 */
3332 csum_type = btrfs_super_csum_type(s: disk_super);
3333 if (!btrfs_supported_super_csum(csum_type)) {
3334 btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3335 csum_type);
3336 ret = -EINVAL;
3337 btrfs_release_disk_super(super: disk_super);
3338 goto fail_alloc;
3339 }
3340
3341 fs_info->csum_size = btrfs_super_csum_size(s: disk_super);
3342
3343 ret = btrfs_init_csum_hash(fs_info, csum_type);
3344 if (ret) {
3345 btrfs_release_disk_super(super: disk_super);
3346 goto fail_alloc;
3347 }
3348
3349 /*
3350	 * We want to check the superblock checksum; the type is stored inside.
3351 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3352 */
3353 if (btrfs_check_super_csum(fs_info, disk_sb: disk_super)) {
3354 btrfs_err(fs_info, "superblock checksum mismatch");
3355 ret = -EINVAL;
3356 btrfs_release_disk_super(super: disk_super);
3357 goto fail_alloc;
3358 }
3359
3360 /*
3361 * super_copy is zeroed at allocation time and we never touch the
3362	 * following bytes up to INFO_SIZE; the checksum is calculated from
3363	 * the whole block of INFO_SIZE.
3364 */
3365 memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3366 btrfs_release_disk_super(super: disk_super);
3367
3368 disk_super = fs_info->super_copy;
3369
3370 memcpy(fs_info->super_for_commit, fs_info->super_copy,
3371 sizeof(*fs_info->super_for_commit));
3372
3373 ret = btrfs_validate_mount_super(fs_info);
3374 if (ret) {
3375 btrfs_err(fs_info, "superblock contains fatal errors");
3376 ret = -EINVAL;
3377 goto fail_alloc;
3378 }
3379
3380 if (!btrfs_super_root(s: disk_super)) {
3381 btrfs_err(fs_info, "invalid superblock tree root bytenr");
3382 ret = -EINVAL;
3383 goto fail_alloc;
3384 }
3385
3386	/* Check the FS state, i.e. whether the FS is broken. */
3387 if (btrfs_super_flags(s: disk_super) & BTRFS_SUPER_FLAG_ERROR)
3388 WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
3389
3390 /* Set up fs_info before parsing mount options */
3391 nodesize = btrfs_super_nodesize(s: disk_super);
3392 sectorsize = btrfs_super_sectorsize(s: disk_super);
3393 stripesize = sectorsize;
3394 fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3395 fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3396
3397 fs_info->nodesize = nodesize;
3398 fs_info->sectorsize = sectorsize;
3399 fs_info->sectorsize_bits = ilog2(sectorsize);
3400 fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(info: fs_info) / fs_info->csum_size;
3401 fs_info->stripesize = stripesize;
3402 fs_info->fs_devices->fs_info = fs_info;
3403
3404 /*
3405 * Handle the space caching options appropriately now that we have the
3406 * super block loaded and validated.
3407 */
3408 btrfs_set_free_space_cache_settings(fs_info);
3409
3410 if (!btrfs_check_options(info: fs_info, mount_opt: &fs_info->mount_opt, flags: sb->s_flags)) {
3411 ret = -EINVAL;
3412 goto fail_alloc;
3413 }
3414
3415 ret = btrfs_check_features(fs_info, is_rw_mount: !sb_rdonly(sb));
3416 if (ret < 0)
3417 goto fail_alloc;
3418
3419 /*
3420	 * At this point our mount options are validated; if we set ->max_inline
3421	 * to something non-standard, make sure we truncate it to sectorsize.
3422 */
3423 fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
3424
3425 ret = btrfs_init_workqueues(fs_info);
3426 if (ret)
3427 goto fail_sb_buffer;
3428
3429 sb->s_bdi->ra_pages *= btrfs_super_num_devices(s: disk_super);
3430 sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3431
3432 /* Update the values for the current filesystem. */
3433 sb->s_blocksize = sectorsize;
3434 sb->s_blocksize_bits = blksize_bits(size: sectorsize);
3435 memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3436
3437 mutex_lock(&fs_info->chunk_mutex);
3438 ret = btrfs_read_sys_array(fs_info);
3439 mutex_unlock(lock: &fs_info->chunk_mutex);
3440 if (ret) {
3441 btrfs_err(fs_info, "failed to read the system array: %d", ret);
3442 goto fail_sb_buffer;
3443 }
3444
3445 generation = btrfs_super_chunk_root_generation(s: disk_super);
3446 level = btrfs_super_chunk_root_level(s: disk_super);
3447 ret = load_super_root(root: chunk_root, bytenr: btrfs_super_chunk_root(s: disk_super),
3448 gen: generation, level);
3449 if (ret) {
3450 btrfs_err(fs_info, "failed to read chunk root");
3451 goto fail_tree_roots;
3452 }
3453
3454 read_extent_buffer(eb: chunk_root->node, dst: fs_info->chunk_tree_uuid,
3455 offsetof(struct btrfs_header, chunk_tree_uuid),
3456 BTRFS_UUID_SIZE);
3457
3458 ret = btrfs_read_chunk_tree(fs_info);
3459 if (ret) {
3460 btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3461 goto fail_tree_roots;
3462 }
3463
3464 /*
3465 * At this point we know all the devices that make this filesystem,
3466 * including the seed devices but we don't know yet if the replace
3467 * target is required. So free devices that are not part of this
3468 * filesystem but skip the replace target device which is checked
3469 * below in btrfs_init_dev_replace().
3470 */
3471 btrfs_free_extra_devids(fs_devices);
3472 if (!fs_devices->latest_dev->bdev) {
3473 btrfs_err(fs_info, "failed to read devices");
3474 ret = -EIO;
3475 goto fail_tree_roots;
3476 }
3477
3478 ret = init_tree_roots(fs_info);
3479 if (ret)
3480 goto fail_tree_roots;
3481
3482 /*
3483 * Get zone type information of zoned block devices. This will also
3484 * handle emulation of a zoned filesystem if a regular device has the
3485 * zoned incompat feature flag set.
3486 */
3487 ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3488 if (ret) {
3489 btrfs_err(fs_info,
3490 "zoned: failed to read device zone info: %d", ret);
3491 goto fail_block_groups;
3492 }
3493
3494 /*
3495 * If we have a uuid root and we're not being told to rescan we need to
3496 * check the generation here so we can set the
3497 * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
3498 * transaction during a balance or the log replay without updating the
3499 * uuid generation, and then if we crash we would rescan the uuid tree,
3500 * even though it was perfectly fine.
3501 */
3502 if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3503 fs_info->generation == btrfs_super_uuid_tree_generation(s: disk_super))
3504 set_bit(nr: BTRFS_FS_UPDATE_UUID_TREE_GEN, addr: &fs_info->flags);
3505
3506 ret = btrfs_verify_dev_extents(fs_info);
3507 if (ret) {
3508 btrfs_err(fs_info,
3509 "failed to verify dev extents against chunks: %d",
3510 ret);
3511 goto fail_block_groups;
3512 }
3513 ret = btrfs_recover_balance(fs_info);
3514 if (ret) {
3515 btrfs_err(fs_info, "failed to recover balance: %d", ret);
3516 goto fail_block_groups;
3517 }
3518
3519 ret = btrfs_init_dev_stats(fs_info);
3520 if (ret) {
3521 btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3522 goto fail_block_groups;
3523 }
3524
3525 ret = btrfs_init_dev_replace(fs_info);
3526 if (ret) {
3527 btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3528 goto fail_block_groups;
3529 }
3530
3531 ret = btrfs_check_zoned_mode(fs_info);
3532 if (ret) {
3533 btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3534 ret);
3535 goto fail_block_groups;
3536 }
3537
3538 ret = btrfs_sysfs_add_fsid(fs_devs: fs_devices);
3539 if (ret) {
3540 btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3541 ret);
3542 goto fail_block_groups;
3543 }
3544
3545 ret = btrfs_sysfs_add_mounted(fs_info);
3546 if (ret) {
3547 btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3548 goto fail_fsdev_sysfs;
3549 }
3550
3551 ret = btrfs_init_space_info(fs_info);
3552 if (ret) {
3553 btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3554 goto fail_sysfs;
3555 }
3556
3557 ret = btrfs_read_block_groups(info: fs_info);
3558 if (ret) {
3559 btrfs_err(fs_info, "failed to read block groups: %d", ret);
3560 goto fail_sysfs;
3561 }
3562
3563 btrfs_free_zone_cache(fs_info);
3564
3565 btrfs_check_active_zone_reservation(fs_info);
3566
3567 if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
3568 !btrfs_check_rw_degradable(fs_info, NULL)) {
3569 btrfs_warn(fs_info,
3570 "writable mount is not allowed due to too many missing devices");
3571 ret = -EINVAL;
3572 goto fail_sysfs;
3573 }
3574
3575 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
3576 "btrfs-cleaner");
3577 if (IS_ERR(ptr: fs_info->cleaner_kthread)) {
3578 ret = PTR_ERR(ptr: fs_info->cleaner_kthread);
3579 goto fail_sysfs;
3580 }
3581
3582 fs_info->transaction_kthread = kthread_run(transaction_kthread,
3583 tree_root,
3584 "btrfs-transaction");
3585 if (IS_ERR(ptr: fs_info->transaction_kthread)) {
3586 ret = PTR_ERR(ptr: fs_info->transaction_kthread);
3587 goto fail_cleaner;
3588 }
3589
3590 ret = btrfs_read_qgroup_config(fs_info);
3591 if (ret)
3592 goto fail_trans_kthread;
3593
3594 if (btrfs_build_ref_tree(fs_info))
3595 btrfs_err(fs_info, "couldn't build ref tree");
3596
3597 /* Do not make disk changes if the FS is broken or nologreplay is given. */
3598 if (btrfs_super_log_root(s: disk_super) != 0 &&
3599 !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3600 btrfs_info(fs_info, "start tree-log replay");
3601 ret = btrfs_replay_log(fs_info, fs_devices);
3602 if (ret)
3603 goto fail_qgroup;
3604 }
3605
3606 fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, check_ref: true);
3607 if (IS_ERR(ptr: fs_info->fs_root)) {
3608 ret = PTR_ERR(ptr: fs_info->fs_root);
3609 btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
3610 fs_info->fs_root = NULL;
3611 goto fail_qgroup;
3612 }
3613
3614 if (sb_rdonly(sb))
3615 return 0;
3616
3617 ret = btrfs_start_pre_rw_mount(fs_info);
3618 if (ret) {
3619 close_ctree(fs_info);
3620 return ret;
3621 }
3622 btrfs_discard_resume(fs_info);
3623
3624 if (fs_info->uuid_root &&
3625 (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3626 fs_info->generation != btrfs_super_uuid_tree_generation(s: disk_super))) {
3627 btrfs_info(fs_info, "checking UUID tree");
3628 ret = btrfs_check_uuid_tree(fs_info);
3629 if (ret) {
3630 btrfs_warn(fs_info,
3631 "failed to check the UUID tree: %d", ret);
3632 close_ctree(fs_info);
3633 return ret;
3634 }
3635 }
3636
3637 set_bit(nr: BTRFS_FS_OPEN, addr: &fs_info->flags);
3638
3639 /* Kick the cleaner thread so it'll start deleting snapshots. */
3640 if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3641 wake_up_process(tsk: fs_info->cleaner_kthread);
3642
3643 return 0;
3644
3645fail_qgroup:
3646 btrfs_free_qgroup_config(fs_info);
3647fail_trans_kthread:
3648 kthread_stop(k: fs_info->transaction_kthread);
3649 btrfs_cleanup_transaction(fs_info);
3650 btrfs_free_fs_roots(fs_info);
3651fail_cleaner:
3652 kthread_stop(k: fs_info->cleaner_kthread);
3653
3654 /*
3655 * make sure we're done with the btree inode before we stop our
3656 * kthreads
3657 */
3658 filemap_write_and_wait(mapping: fs_info->btree_inode->i_mapping);
3659
3660fail_sysfs:
3661 btrfs_sysfs_remove_mounted(fs_info);
3662
3663fail_fsdev_sysfs:
3664 btrfs_sysfs_remove_fsid(fs_devs: fs_info->fs_devices);
3665
3666fail_block_groups:
3667 btrfs_put_block_group_cache(info: fs_info);
3668
3669fail_tree_roots:
3670 if (fs_info->data_reloc_root)
3671 btrfs_drop_and_free_fs_root(fs_info, root: fs_info->data_reloc_root);
3672 free_root_pointers(info: fs_info, free_chunk_root: true);
3673 invalidate_inode_pages2(mapping: fs_info->btree_inode->i_mapping);
3674
3675fail_sb_buffer:
3676 btrfs_stop_all_workers(fs_info);
3677 btrfs_free_block_groups(info: fs_info);
3678fail_alloc:
3679 btrfs_mapping_tree_free(fs_info);
3680
3681 iput(fs_info->btree_inode);
3682fail:
3683 btrfs_close_devices(fs_devices: fs_info->fs_devices);
3684 ASSERT(ret < 0);
3685 return ret;
3686}
3687ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3688
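/*
 * Completion callback for super block write bios.
 *
 * Account any write error against the device (with a larger penalty for the
 * primary copy, which is the only one that may be submitted with REQ_FUA),
 * then unlock and release the folios that carried the super block data.
 */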
3689static void btrfs_end_super_write(struct bio *bio)
3690{
3691 struct btrfs_device *device = bio->bi_private;
3692 struct folio_iter fi;
3693
3694 bio_for_each_folio_all(fi, bio) {
3695 if (bio->bi_status) {
3696 btrfs_warn_rl_in_rcu(device->fs_info,
3697 "lost super block write due to IO error on %s (%d)",
3698 btrfs_dev_name(device),
3699 blk_status_to_errno(bio->bi_status));
3700 btrfs_dev_stat_inc_and_print(dev: device,
3701 index: BTRFS_DEV_STAT_WRITE_ERRS);
3702 /* Ensure failure if the primary sb fails. */
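 /*
  * Only the primary copy is written with REQ_FUA, so a failed
  * FUA write bumps the error count by the large
  * BTRFS_SUPER_PRIMARY_WRITE_ERROR sentinel, letting
  * wait_dev_supers() tell a primary failure apart from failures
  * of the other copies.
  */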
3703 if (bio->bi_opf & REQ_FUA)
3704 atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
3705 v: &device->sb_write_errors);
3706 else
3707 atomic_inc(v: &device->sb_write_errors);
3708 }
3709 folio_unlock(folio: fi.folio);
3710 folio_put(folio: fi.folio);
3711 }
3712
3713 bio_put(bio);
3714}
3715
3716/*
3717 * Write superblock @sb to the @device. Do not wait for completion, all the
3718 * folios we use for writing are locked.
3719 *
3720 * Write @max_mirrors copies of the superblock, where 0 means the default,
3721 * i.e. all copies that fit the expected device size at commit time. Note
3722 * that max_mirrors must be the same for the write and wait phases.
3723 *
3724 * Return number of errors when folio is not found or submission fails.
3725 */
3726static int write_dev_supers(struct btrfs_device *device,
3727 struct btrfs_super_block *sb, int max_mirrors)
3728{
3729 struct btrfs_fs_info *fs_info = device->fs_info;
3730 struct address_space *mapping = device->bdev->bd_mapping;
3731 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3732 int i;
3733 int ret;
3734 u64 bytenr, bytenr_orig;
3735
3736 atomic_set(v: &device->sb_write_errors, i: 0);
3737
3738 if (max_mirrors == 0)
3739 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3740
3741 shash->tfm = fs_info->csum_shash;
3742
3743 for (i = 0; i < max_mirrors; i++) {
3744 struct folio *folio;
3745 struct bio *bio;
3746 struct btrfs_super_block *disk_super;
3747 size_t offset;
3748
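 /*
  * Super block copies live at fixed logical offsets on every
  * device: 64KiB for the primary and, if the device is large
  * enough, 64MiB and 256GiB for the mirrors. On zoned devices
  * the actual write location may differ, which is what
  * btrfs_sb_log_location() resolves below.
  */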
3749 bytenr_orig = btrfs_sb_offset(mirror: i);
3750 ret = btrfs_sb_log_location(device, mirror: i, WRITE, bytenr_ret: &bytenr);
3751 if (ret == -ENOENT) {
3752 continue;
3753 } else if (ret < 0) {
3754 btrfs_err(device->fs_info,
3755 "couldn't get super block location for mirror %d error %d",
3756 i, ret);
3757 atomic_inc(v: &device->sb_write_errors);
3758 continue;
3759 }
3760 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3761 device->commit_total_bytes)
3762 break;
3763
3764 btrfs_set_super_bytenr(s: sb, val: bytenr_orig);
3765
3766 crypto_shash_digest(desc: shash, data: (const char *)sb + BTRFS_CSUM_SIZE,
3767 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
3768 out: sb->csum);
3769
3770 folio = __filemap_get_folio(mapping, index: bytenr >> PAGE_SHIFT,
3771 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
3772 GFP_NOFS);
3773 if (IS_ERR(ptr: folio)) {
3774 btrfs_err(device->fs_info,
3775 "couldn't get super block page for bytenr %llu error %ld",
3776 bytenr, PTR_ERR(folio));
3777 atomic_inc(v: &device->sb_write_errors);
3778 continue;
3779 }
3780
3781 offset = offset_in_folio(folio, bytenr);
3782 disk_super = folio_address(folio) + offset;
3783 memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
3784
3785 /*
3786 * Directly use bios here instead of relying on the page cache
3787 * to do I/O, so we don't lose the ability to do integrity
3788 * checking.
3789 */
3790 bio = bio_alloc(bdev: device->bdev, nr_vecs: 1,
3791 opf: REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
3792 GFP_NOFS);
3793 bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
3794 bio->bi_private = device;
3795 bio->bi_end_io = btrfs_end_super_write;
3796 bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, off: offset);
3797
3798 /*
3799 * We FUA only the first super block. The others we allow to
3800 * go down lazily and there's a short window where the on-disk
3801 * copies might still contain the older version.
3802 */
3803 if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
3804 bio->bi_opf |= REQ_FUA;
3805 submit_bio(bio);
3806
3807 if (btrfs_advance_sb_log(device, mirror: i))
3808 atomic_inc(v: &device->sb_write_errors);
3809 }
3810 return atomic_read(v: &device->sb_write_errors) < i ? 0 : -1;
3811}
3812
3813/*
3814 * Wait for write completion of superblocks done by write_dev_supers,
3815 * @max_mirrors same for write and wait phases.
3816 *
3817 * Return -1 if the primary super block write failed or no super block
3818 * copies were written. Otherwise 0.
3819 */
3820static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
3821{
3822 int i;
3823 int errors = 0;
3824 bool primary_failed = false;
3825 int ret;
3826 u64 bytenr;
3827
3828 if (max_mirrors == 0)
3829 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3830
3831 for (i = 0; i < max_mirrors; i++) {
3832 struct folio *folio;
3833
3834 ret = btrfs_sb_log_location(device, mirror: i, READ, bytenr_ret: &bytenr);
3835 if (ret == -ENOENT) {
3836 break;
3837 } else if (ret < 0) {
3838 errors++;
3839 if (i == 0)
3840 primary_failed = true;
3841 continue;
3842 }
3843 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3844 device->commit_total_bytes)
3845 break;
3846
3847 folio = filemap_get_folio(mapping: device->bdev->bd_mapping,
3848 index: bytenr >> PAGE_SHIFT);
3849 /* If the folio has been removed, then we know it completed. */
3850 if (IS_ERR(ptr: folio))
3851 continue;
3852
3853 /* Folio will be unlocked once the write completes. */
3854 folio_wait_locked(folio);
3855 folio_put(folio);
3856 }
3857
3858 errors += atomic_read(v: &device->sb_write_errors);
3859 if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)
3860 primary_failed = true;
3861 if (primary_failed) {
3862 btrfs_err(device->fs_info, "error writing primary super block to device %llu",
3863 device->devid);
3864 return -1;
3865 }
3866
3867 return errors < i ? 0 : -1;
3868}
3869
3870/*
3871 * Endio for write_dev_flush; this will wake anyone waiting
3872 * for the barrier when it is done.
3873 */
3874static void btrfs_end_empty_barrier(struct bio *bio)
3875{
3876 bio_uninit(bio);
3877 complete(bio->bi_private);
3878}
3879
3880/*
3881 * Submit a flush request to the device if it supports it. Error handling is
3882 * done in the waiting counterpart.
3883 */
3884static void write_dev_flush(struct btrfs_device *device)
3885{
3886 struct bio *bio = &device->flush_bio;
3887
3888 device->last_flush_error = BLK_STS_OK;
3889
3890 bio_init(bio, bdev: device->bdev, NULL, max_vecs: 0,
3891 opf: REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
3892 bio->bi_end_io = btrfs_end_empty_barrier;
3893 init_completion(x: &device->flush_wait);
3894 bio->bi_private = &device->flush_wait;
3895 submit_bio(bio);
3896 set_bit(BTRFS_DEV_STATE_FLUSH_SENT, addr: &device->dev_state);
3897}
3898
3899/*
3900 * If the flush bio has been submitted by write_dev_flush, wait for it.
3901 * Return true for any error, and false otherwise.
3902 */
3903static bool wait_dev_flush(struct btrfs_device *device)
3904{
3905 struct bio *bio = &device->flush_bio;
3906
3907 if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, addr: &device->dev_state))
3908 return false;
3909
3910 wait_for_completion_io(&device->flush_wait);
3911
3912 if (bio->bi_status) {
3913 device->last_flush_error = bio->bi_status;
3914 btrfs_dev_stat_inc_and_print(dev: device, index: BTRFS_DEV_STAT_FLUSH_ERRS);
3915 return true;
3916 }
3917
3918 return false;
3919}
3920
3921/*
3922 * send an empty flush down to each device in parallel,
3923 * then wait for them
3924 */
3925static int barrier_all_devices(struct btrfs_fs_info *info)
3926{
3927 struct list_head *head;
3928 struct btrfs_device *dev;
3929 int errors_wait = 0;
3930
3931 lockdep_assert_held(&info->fs_devices->device_list_mutex);
3932 /* send down all the barriers */
3933 head = &info->fs_devices->devices;
3934 list_for_each_entry(dev, head, dev_list) {
3935 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3936 continue;
3937 if (!dev->bdev)
3938 continue;
3939 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3940 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3941 continue;
3942
3943 write_dev_flush(device: dev);
3944 }
3945
3946 /* wait for all the barriers */
3947 list_for_each_entry(dev, head, dev_list) {
3948 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3949 continue;
3950 if (!dev->bdev) {
3951 errors_wait++;
3952 continue;
3953 }
3954 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3955 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3956 continue;
3957
3958 if (wait_dev_flush(device: dev))
3959 errors_wait++;
3960 }
3961
3962 /*
3963 * Checks last_flush_error of disks in order to determine the device
3964 * state.
3965 */
3966 if (errors_wait && !btrfs_check_rw_degradable(fs_info: info, NULL))
3967 return -EIO;
3968
3969 return 0;
3970}
3971
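/*
 * Return the number of device flush failures that can be tolerated for the
 * block group profiles selected by @flags, i.e. the smallest tolerated_failures
 * value among them (e.g. 0 for SINGLE/DUP, 1 for RAID1/RAID10/RAID5, 2 for
 * RAID6).
 */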
3972int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3973{
3974 int raid_type;
3975 int min_tolerated = INT_MAX;
3976
3977 if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3978 (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3979 min_tolerated = min_t(int, min_tolerated,
3980 btrfs_raid_array[BTRFS_RAID_SINGLE].
3981 tolerated_failures);
3982
3983 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3984 if (raid_type == BTRFS_RAID_SINGLE)
3985 continue;
3986 if (!(flags & btrfs_raid_array[raid_type].bg_flag))
3987 continue;
3988 min_tolerated = min_t(int, min_tolerated,
3989 btrfs_raid_array[raid_type].
3990 tolerated_failures);
3991 }
3992
3993 if (min_tolerated == INT_MAX) {
3994 pr_warn("BTRFS: unknown raid flag: %llu", flags);
3995 min_tolerated = 0;
3996 }
3997
3998 return min_tolerated;
3999}
4000
4001int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
4002{
4003 struct list_head *head;
4004 struct btrfs_device *dev;
4005 struct btrfs_super_block *sb;
4006 struct btrfs_dev_item *dev_item;
4007 int ret;
4008 int do_barriers;
4009 int max_errors;
4010 int total_errors = 0;
4011 u64 flags;
4012
4013 do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
4014
4015 /*
4016 * max_mirrors == 0 indicates we're from commit_transaction,
4017 * not from fsync where the tree roots in fs_info have not
4018 * been consistent on disk.
4019 */
4020 if (max_mirrors == 0)
4021 backup_super_roots(info: fs_info);
4022
4023 sb = fs_info->super_for_commit;
4024 dev_item = &sb->dev_item;
4025
4026 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4027 head = &fs_info->fs_devices->devices;
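 /*
  * The super block write only has to succeed on one device, so we can
  * tolerate errors on all but the last one.
  */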
4028 max_errors = btrfs_super_num_devices(s: fs_info->super_copy) - 1;
4029
4030 if (do_barriers) {
4031 ret = barrier_all_devices(info: fs_info);
4032 if (ret) {
4033 mutex_unlock(
4034 lock: &fs_info->fs_devices->device_list_mutex);
4035 btrfs_handle_fs_error(fs_info, ret,
4036 "errors while submitting device barriers.");
4037 return ret;
4038 }
4039 }
4040
4041 list_for_each_entry(dev, head, dev_list) {
4042 if (!dev->bdev) {
4043 total_errors++;
4044 continue;
4045 }
4046 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4047 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4048 continue;
4049
4050 btrfs_set_stack_device_generation(s: dev_item, val: 0);
4051 btrfs_set_stack_device_type(s: dev_item, val: dev->type);
4052 btrfs_set_stack_device_id(s: dev_item, val: dev->devid);
4053 btrfs_set_stack_device_total_bytes(s: dev_item,
4054 val: dev->commit_total_bytes);
4055 btrfs_set_stack_device_bytes_used(s: dev_item,
4056 val: dev->commit_bytes_used);
4057 btrfs_set_stack_device_io_align(s: dev_item, val: dev->io_align);
4058 btrfs_set_stack_device_io_width(s: dev_item, val: dev->io_width);
4059 btrfs_set_stack_device_sector_size(s: dev_item, val: dev->sector_size);
4060 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4061 memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4062 BTRFS_FSID_SIZE);
4063
4064 flags = btrfs_super_flags(s: sb);
4065 btrfs_set_super_flags(s: sb, val: flags | BTRFS_HEADER_FLAG_WRITTEN);
4066
4067 ret = btrfs_validate_write_super(fs_info, sb);
4068 if (ret < 0) {
4069 mutex_unlock(lock: &fs_info->fs_devices->device_list_mutex);
4070 btrfs_handle_fs_error(fs_info, -EUCLEAN,
4071 "unexpected superblock corruption detected");
4072 return -EUCLEAN;
4073 }
4074
4075 ret = write_dev_supers(device: dev, sb, max_mirrors);
4076 if (ret)
4077 total_errors++;
4078 }
4079 if (total_errors > max_errors) {
4080 btrfs_err(fs_info, "%d errors while writing supers",
4081 total_errors);
4082 mutex_unlock(lock: &fs_info->fs_devices->device_list_mutex);
4083
4084 /* FUA is masked off if unsupported and can't be the reason */
4085 btrfs_handle_fs_error(fs_info, -EIO,
4086 "%d errors while writing supers",
4087 total_errors);
4088 return -EIO;
4089 }
4090
4091 total_errors = 0;
4092 list_for_each_entry(dev, head, dev_list) {
4093 if (!dev->bdev)
4094 continue;
4095 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4096 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4097 continue;
4098
4099 ret = wait_dev_supers(device: dev, max_mirrors);
4100 if (ret)
4101 total_errors++;
4102 }
4103 mutex_unlock(lock: &fs_info->fs_devices->device_list_mutex);
4104 if (total_errors > max_errors) {
4105 btrfs_handle_fs_error(fs_info, -EIO,
4106 "%d errors while writing supers",
4107 total_errors);
4108 return -EIO;
4109 }
4110 return 0;
4111}
4112
4113/* Drop a fs root from the radix tree and free it. */
4114void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4115 struct btrfs_root *root)
4116{
4117 bool drop_ref = false;
4118
4119 spin_lock(lock: &fs_info->fs_roots_radix_lock);
4120 radix_tree_delete(&fs_info->fs_roots_radix,
4121 (unsigned long)btrfs_root_id(root));
4122 if (test_and_clear_bit(nr: BTRFS_ROOT_IN_RADIX, addr: &root->state))
4123 drop_ref = true;
4124 spin_unlock(lock: &fs_info->fs_roots_radix_lock);
4125
4126 if (BTRFS_FS_ERROR(fs_info)) {
4127 ASSERT(root->log_root == NULL);
4128 if (root->reloc_root) {
4129 btrfs_put_root(root: root->reloc_root);
4130 root->reloc_root = NULL;
4131 }
4132 }
4133
4134 if (drop_ref)
4135 btrfs_put_root(root);
4136}
4137
4138int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4139{
4140 mutex_lock(&fs_info->cleaner_mutex);
4141 btrfs_run_delayed_iputs(fs_info);
4142 mutex_unlock(lock: &fs_info->cleaner_mutex);
4143 wake_up_process(tsk: fs_info->cleaner_kthread);
4144
4145 /* Wait until ongoing cleanup work is done. */
4146 down_write(sem: &fs_info->cleanup_work_sem);
4147 up_write(sem: &fs_info->cleanup_work_sem);
4148
4149 return btrfs_commit_current_transaction(root: fs_info->tree_root);
4150}
4151
4152static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
4153{
4154 struct btrfs_transaction *trans;
4155 struct btrfs_transaction *tmp;
4156 bool found = false;
4157
4158 /*
4159 * This function is only called at the very end of close_ctree(),
4160 * thus there is no other running transaction and no need to take trans_lock.
4161 */
4162 ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
4163 list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
4164 struct extent_state *cached = NULL;
4165 u64 dirty_bytes = 0;
4166 u64 cur = 0;
4167 u64 found_start;
4168 u64 found_end;
4169
4170 found = true;
4171 while (btrfs_find_first_extent_bit(tree: &trans->dirty_pages, start: cur,
4172 start_ret: &found_start, end_ret: &found_end,
4173 bits: EXTENT_DIRTY, cached_state: &cached)) {
4174 dirty_bytes += found_end + 1 - found_start;
4175 cur = found_end + 1;
4176 }
4177 btrfs_warn(fs_info,
4178 "transaction %llu (with %llu dirty metadata bytes) is not committed",
4179 trans->transid, dirty_bytes);
4180 btrfs_cleanup_one_transaction(trans);
4181
4182 if (trans == fs_info->running_transaction)
4183 fs_info->running_transaction = NULL;
4184 list_del_init(entry: &trans->list);
4185
4186 btrfs_put_transaction(transaction: trans);
4187 trace_btrfs_transaction_commit(fs_info);
4188 }
4189 ASSERT(!found);
4190}
4191
4192void __cold close_ctree(struct btrfs_fs_info *fs_info)
4193{
4194 int ret;
4195
4196 set_bit(nr: BTRFS_FS_CLOSING_START, addr: &fs_info->flags);
4197
4198 /*
4199 * If we had UNFINISHED_DROPS we could still be processing them, so
4200 * clear that bit and wake up relocation so it can stop.
4201 * We must do this before stopping the block group reclaim task, because
4202 * at btrfs_relocate_block_group() we wait for this bit, and after the
4203 * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
4204 * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
4205 * return 1.
4206 */
4207 btrfs_wake_unfinished_drop(fs_info);
4208
4209 /*
4210 * We may have the reclaim task running and relocating a data block group,
4211 * in which case it may create delayed iputs. So stop it before we park
4212 * the cleaner kthread, otherwise we can get new delayed iputs after
4213 * parking the cleaner, and that can make the async reclaim task hang
4214 * if it's waiting for delayed iputs to complete, since the cleaner is
4215 * parked and can not run delayed iputs - this will make us hang when
4216 * trying to stop the async reclaim task.
4217 */
4218 cancel_work_sync(work: &fs_info->reclaim_bgs_work);
4219 /*
4220 * We don't want the cleaner to start new transactions, add more delayed
4221 * iputs, etc. while we're closing. We can't use kthread_stop() yet
4222 * because that frees the task_struct, and the transaction kthread might
4223 * still try to wake up the cleaner.
4224 */
4225 kthread_park(k: fs_info->cleaner_kthread);
4226
4227 /* wait for the qgroup rescan worker to stop */
4228 btrfs_qgroup_wait_for_completion(fs_info, interruptible: false);
4229
4230 /* wait for the uuid_scan task to finish */
4231 down(sem: &fs_info->uuid_tree_rescan_sem);
4232 /* avoid complaints from lockdep et al., set sem back to initial state */
4233 up(sem: &fs_info->uuid_tree_rescan_sem);
4234
4235 /* pause restriper - we want to resume on mount */
4236 btrfs_pause_balance(fs_info);
4237
4238 btrfs_dev_replace_suspend_for_unmount(fs_info);
4239
4240 btrfs_scrub_cancel(info: fs_info);
4241
4242 /* wait for any defraggers to finish */
4243 wait_event(fs_info->transaction_wait,
4244 (atomic_read(&fs_info->defrag_running) == 0));
4245
4246 /* clear out the rbtree of defraggable inodes */
4247 btrfs_cleanup_defrag_inodes(fs_info);
4248
4249 /*
4250 * Handle the error fs first, as it will flush and wait for all ordered
4251 * extents. This will generate delayed iputs, thus we want to handle
4252 * it first.
4253 */
4254 if (unlikely(BTRFS_FS_ERROR(fs_info)))
4255 btrfs_error_commit_super(fs_info);
4256
4257 /*
4258 * Wait for any fixup workers to complete.
4259 * If we don't wait for them here and they are still running by the time
4260 * we call kthread_stop() against the cleaner kthread further below, we
4261 * get a use-after-free on the cleaner because the fixup worker adds an
4262 * inode to the list of delayed iputs and then attempts to wake up the
4263 * cleaner kthread, which was already stopped and destroyed. We already
4264 * parked the cleaner, but below we run all pending delayed iputs.
4265 */
4266 btrfs_flush_workqueue(wq: fs_info->fixup_workers);
4267 /*
4268 * Similar case here: we have to wait for the delalloc workers before we
4269 * proceed below and stop the cleaner kthread, otherwise we trigger a
4270 * use-after-free on the cleaner kthread task_struct when a delalloc
4271 * worker running submit_compressed_extents() adds a delayed iput, which
4272 * does a wake up on the cleaner kthread, which was already freed below
4273 * when we call kthread_stop().
4274 */
4275 btrfs_flush_workqueue(wq: fs_info->delalloc_workers);
4276
4277 /*
4278 * We can have ordered extents getting their last reference dropped from
4279 * the fs_info->workers queue because for async writes for data bios we
4280 * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
4281 * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
4282 * has an error, and that later function can do the final
4283 * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
4284 * which adds a delayed iput for the inode. So we must flush the queue
4285 * so that we don't have delayed iputs after committing the current
4286 * transaction below and stopping the cleaner and transaction kthreads.
4287 */
4288 btrfs_flush_workqueue(wq: fs_info->workers);
4289
4290 /*
4291 * When finishing a compressed write bio we schedule a work queue item
4292 * to finish an ordered extent - btrfs_finish_compressed_write_work()
4293 * calls btrfs_finish_ordered_extent() which in turns does a call to
4294 * btrfs_queue_ordered_fn(), and that queues the ordered extent
4295 * completion either in the endio_write_workers work queue or in the
4296 * fs_info->endio_freespace_worker work queue. We flush those queues
4297 * below, so before we flush them we must flush this queue for the
4298 * workers of compressed writes.
4299 */
4300 flush_workqueue(fs_info->compressed_write_workers);
4301
4302 /*
4303 * After we parked the cleaner kthread, ordered extents may have
4304 * completed and created new delayed iputs. If one of the async reclaim
4305 * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
4306 * can hang forever trying to stop it, because if a delayed iput is
4307 * added after it ran btrfs_run_delayed_iputs() and before it called
4308 * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
4309 * no one else to run iputs.
4310 *
4311 * So wait for all ongoing ordered extents to complete and then run
4312 * delayed iputs. This works because once we reach this point no one
4313 * can either create new ordered extents nor create delayed iputs
4314 * through some other means.
4315 *
4316 * Also note that btrfs_wait_ordered_roots() is not safe here, because
4317 * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
4318 * but the delayed iput for the respective inode is made only when doing
4319 * the final btrfs_put_ordered_extent() (which must happen at
4320 * btrfs_finish_ordered_io() when we are unmounting).
4321 */
4322 btrfs_flush_workqueue(wq: fs_info->endio_write_workers);
4323 /* Ordered extents for free space inodes. */
4324 btrfs_flush_workqueue(wq: fs_info->endio_freespace_worker);
4325 btrfs_run_delayed_iputs(fs_info);
4326 /* There should be no more workload to generate new delayed iputs. */
4327 set_bit(nr: BTRFS_FS_STATE_NO_DELAYED_IPUT, addr: &fs_info->fs_state);
4328
4329 cancel_work_sync(work: &fs_info->async_reclaim_work);
4330 cancel_work_sync(work: &fs_info->async_data_reclaim_work);
4331 cancel_work_sync(work: &fs_info->preempt_reclaim_work);
4332 cancel_work_sync(work: &fs_info->em_shrinker_work);
4333
4334 /* Cancel or finish ongoing discard work */
4335 btrfs_discard_cleanup(fs_info);
4336
4337 if (!sb_rdonly(sb: fs_info->sb)) {
4338 /*
4339 * The cleaner kthread is stopped, so do one final pass over
4340 * unused block groups.
4341 */
4342 btrfs_delete_unused_bgs(fs_info);
4343
4344 /*
4345 * There might be existing delayed inode workers still running
4346 * and holding an empty delayed inode item. We must wait for
4347 * them to complete first because they can create a transaction.
4348 * This happens when someone calls btrfs_balance_delayed_items()
4349 * and then a transaction commit runs the same delayed nodes
4350 * before any delayed worker has done something with the nodes.
4351 * We must wait for any worker here and not at transaction
4352 * commit time since that could cause a deadlock.
4353 * This is a very rare case.
4354 */
4355 btrfs_flush_workqueue(wq: fs_info->delayed_workers);
4356
4357 ret = btrfs_commit_super(fs_info);
4358 if (ret)
4359 btrfs_err(fs_info, "commit super ret %d", ret);
4360 }
4361
4362 kthread_stop(k: fs_info->transaction_kthread);
4363 kthread_stop(k: fs_info->cleaner_kthread);
4364
4365 ASSERT(list_empty(&fs_info->delayed_iputs));
4366 set_bit(nr: BTRFS_FS_CLOSING_DONE, addr: &fs_info->flags);
4367
4368 if (btrfs_check_quota_leak(fs_info)) {
4369 DEBUG_WARN("qgroup reserved space leaked");
4370 btrfs_err(fs_info, "qgroup reserved space leaked");
4371 }
4372
4373 btrfs_free_qgroup_config(fs_info);
4374 ASSERT(list_empty(&fs_info->delalloc_roots));
4375
4376 if (percpu_counter_sum(fbc: &fs_info->delalloc_bytes)) {
4377 btrfs_info(fs_info, "at unmount delalloc count %lld",
4378 percpu_counter_sum(&fs_info->delalloc_bytes));
4379 }
4380
4381 if (percpu_counter_sum(fbc: &fs_info->ordered_bytes))
4382 btrfs_info(fs_info, "at unmount dio bytes count %lld",
4383 percpu_counter_sum(&fs_info->ordered_bytes));
4384
4385 btrfs_sysfs_remove_mounted(fs_info);
4386 btrfs_sysfs_remove_fsid(fs_devs: fs_info->fs_devices);
4387
4388 btrfs_put_block_group_cache(info: fs_info);
4389
4390 /*
4391 * We must make sure there is no read request to
4392 * submit after we stop all the workers.
4393 */
4394 invalidate_inode_pages2(mapping: fs_info->btree_inode->i_mapping);
4395 btrfs_stop_all_workers(fs_info);
4396
4397 /* We shouldn't have any transaction open at this point */
4398 warn_about_uncommitted_trans(fs_info);
4399
4400 clear_bit(nr: BTRFS_FS_OPEN, addr: &fs_info->flags);
4401 free_root_pointers(info: fs_info, free_chunk_root: true);
4402 btrfs_free_fs_roots(fs_info);
4403
4404 /*
4405 * We must free the block groups after dropping the fs_roots as we could
4406 * have had an IO error and have left over tree log blocks that aren't
4407 * cleaned up until the fs roots are freed. This makes the block group
4408 * accounting appear to be wrong because there's pending reserved bytes,
4409 * so make sure we do the block group cleanup afterwards.
4410 */
4411 btrfs_free_block_groups(info: fs_info);
4412
4413 iput(fs_info->btree_inode);
4414
4415 btrfs_mapping_tree_free(fs_info);
4416 btrfs_close_devices(fs_devices: fs_info->fs_devices);
4417}
4418
4419void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
4420 struct extent_buffer *buf)
4421{
4422 struct btrfs_fs_info *fs_info = buf->fs_info;
4423 u64 transid = btrfs_header_generation(eb: buf);
4424
4425#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4426 /*
4427 * This is a fast path so only do this check if we have sanity tests
4428 * enabled. Normal people shouldn't be using unmapped buffers as dirty
4429 * outside of the sanity tests.
4430 */
4431 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4432 return;
4433#endif
4434 /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
4435 ASSERT(trans->transid == fs_info->generation);
4436 btrfs_assert_tree_write_locked(eb: buf);
4437 if (unlikely(transid != fs_info->generation)) {
4438 btrfs_abort_transaction(trans, -EUCLEAN);
4439 btrfs_crit(fs_info,
4440"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
4441 buf->start, transid, fs_info->generation);
4442 }
4443 set_extent_buffer_dirty(buf);
4444}
4445
4446static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4447 int flush_delayed)
4448{
4449 /*
4450 * Looks as though older kernels can get into trouble with
4451 * this code; they end up stuck in balance_dirty_pages forever.
4452 */
4453 int ret;
4454
4455 if (current->flags & PF_MEMALLOC)
4456 return;
4457
4458 if (flush_delayed)
4459 btrfs_balance_delayed_items(fs_info);
4460
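 /*
  * Only throttle the caller once the amount of dirty btree metadata
  * crosses BTRFS_DIRTY_METADATA_THRESH; the batched percpu compare
  * keeps this check cheap in the common case.
  */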
4461 ret = __percpu_counter_compare(fbc: &fs_info->dirty_metadata_bytes,
4462 BTRFS_DIRTY_METADATA_THRESH,
4463 batch: fs_info->dirty_metadata_batch);
4464 if (ret > 0) {
4465 balance_dirty_pages_ratelimited(mapping: fs_info->btree_inode->i_mapping);
4466 }
4467}
4468
4469void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
4470{
4471 __btrfs_btree_balance_dirty(fs_info, flush_delayed: 1);
4472}
4473
4474void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
4475{
4476 __btrfs_btree_balance_dirty(fs_info, flush_delayed: 0);
4477}
4478
4479static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4480{
4481 /* cleanup FS via transaction */
4482 btrfs_cleanup_transaction(fs_info);
4483
4484 down_write(sem: &fs_info->cleanup_work_sem);
4485 up_write(sem: &fs_info->cleanup_work_sem);
4486}
4487
4488static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4489{
4490 struct btrfs_root *gang[8];
4491 u64 root_objectid = 0;
4492 int ret;
4493
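 /*
  * Walk the fs_roots radix tree in batches of up to 8 roots, taking a
  * reference on each one so the radix lock can be dropped while the
  * per-root log trees are freed.
  */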
4494 spin_lock(lock: &fs_info->fs_roots_radix_lock);
4495 while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4496 results: (void **)gang, first_index: root_objectid,
4497 ARRAY_SIZE(gang))) != 0) {
4498 int i;
4499
4500 for (i = 0; i < ret; i++)
4501 gang[i] = btrfs_grab_root(root: gang[i]);
4502 spin_unlock(lock: &fs_info->fs_roots_radix_lock);
4503
4504 for (i = 0; i < ret; i++) {
4505 if (!gang[i])
4506 continue;
4507 root_objectid = btrfs_root_id(root: gang[i]);
4508 btrfs_free_log(NULL, root: gang[i]);
4509 btrfs_put_root(root: gang[i]);
4510 }
4511 root_objectid++;
4512 spin_lock(lock: &fs_info->fs_roots_radix_lock);
4513 }
4514 spin_unlock(lock: &fs_info->fs_roots_radix_lock);
4515 btrfs_free_log_root_tree(NULL, fs_info);
4516}
4517
4518static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4519{
4520 struct btrfs_ordered_extent *ordered;
4521
4522 spin_lock(lock: &root->ordered_extent_lock);
4523 /*
4524 * This will just short circuit the ordered completion stuff which will
4525 * make sure the ordered extent gets properly cleaned up.
4526 */
4527 list_for_each_entry(ordered, &root->ordered_extents,
4528 root_extent_list)
4529 set_bit(nr: BTRFS_ORDERED_IOERR, addr: &ordered->flags);
4530 spin_unlock(lock: &root->ordered_extent_lock);
4531}
4532
4533static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4534{
4535 struct btrfs_root *root;
4536 LIST_HEAD(splice);
4537
4538 spin_lock(lock: &fs_info->ordered_root_lock);
4539 list_splice_init(list: &fs_info->ordered_roots, head: &splice);
4540 while (!list_empty(head: &splice)) {
4541 root = list_first_entry(&splice, struct btrfs_root,
4542 ordered_root);
4543 list_move_tail(list: &root->ordered_root,
4544 head: &fs_info->ordered_roots);
4545
4546 spin_unlock(lock: &fs_info->ordered_root_lock);
4547 btrfs_destroy_ordered_extents(root);
4548
4549 cond_resched();
4550 spin_lock(lock: &fs_info->ordered_root_lock);
4551 }
4552 spin_unlock(lock: &fs_info->ordered_root_lock);
4553
4554 /*
4555 * We need this here because if we've been flipped read-only we won't
4556 * get sync() from the umount, so we need to make sure any ordered
4557 * extents that haven't had their dirty pages IO start writeout yet
4558 * actually get run and error out properly.
4559 */
4560 btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
4561}
4562
4563static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4564{
4565 struct btrfs_inode *btrfs_inode;
4566 LIST_HEAD(splice);
4567
4568 spin_lock(lock: &root->delalloc_lock);
4569 list_splice_init(list: &root->delalloc_inodes, head: &splice);
4570
4571 while (!list_empty(head: &splice)) {
4572 struct inode *inode = NULL;
4573 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4574 delalloc_inodes);
4575 btrfs_del_delalloc_inode(inode: btrfs_inode);
4576 spin_unlock(lock: &root->delalloc_lock);
4577
4578 /*
4579 * Make sure we get a live inode and that it'll not disappear
4580 * meanwhile.
4581 */
4582 inode = igrab(&btrfs_inode->vfs_inode);
4583 if (inode) {
4584 unsigned int nofs_flag;
4585
4586 nofs_flag = memalloc_nofs_save();
4587 invalidate_inode_pages2(mapping: inode->i_mapping);
4588 memalloc_nofs_restore(flags: nofs_flag);
4589 iput(inode);
4590 }
4591 spin_lock(lock: &root->delalloc_lock);
4592 }
4593 spin_unlock(lock: &root->delalloc_lock);
4594}
4595
4596static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4597{
4598 struct btrfs_root *root;
4599 LIST_HEAD(splice);
4600
4601 spin_lock(lock: &fs_info->delalloc_root_lock);
4602 list_splice_init(list: &fs_info->delalloc_roots, head: &splice);
4603 while (!list_empty(head: &splice)) {
4604 root = list_first_entry(&splice, struct btrfs_root,
4605 delalloc_root);
4606 root = btrfs_grab_root(root);
4607 BUG_ON(!root);
4608 spin_unlock(lock: &fs_info->delalloc_root_lock);
4609
4610 btrfs_destroy_delalloc_inodes(root);
4611 btrfs_put_root(root);
4612
4613 spin_lock(lock: &fs_info->delalloc_root_lock);
4614 }
4615 spin_unlock(lock: &fs_info->delalloc_root_lock);
4616}
4617
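/*
 * Clear @mark from @dirty_pages and drop the dirty bit on any extent buffers
 * in those ranges, so none of that metadata is written back during cleanup.
 */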
4618static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
4619 struct extent_io_tree *dirty_pages,
4620 int mark)
4621{
4622 struct extent_buffer *eb;
4623 u64 start = 0;
4624 u64 end;
4625
4626 while (btrfs_find_first_extent_bit(tree: dirty_pages, start, start_ret: &start, end_ret: &end,
4627 bits: mark, NULL)) {
4628 btrfs_clear_extent_bits(tree: dirty_pages, start, end, bits: mark);
4629 while (start <= end) {
4630 eb = find_extent_buffer(fs_info, start);
4631 start += fs_info->nodesize;
4632 if (!eb)
4633 continue;
4634
4635 btrfs_tree_lock(eb);
4636 wait_on_extent_buffer_writeback(eb);
4637 btrfs_clear_buffer_dirty(NULL, buf: eb);
4638 btrfs_tree_unlock(eb);
4639
4640 free_extent_buffer_stale(eb);
4641 }
4642 }
4643}
4644
4645static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
4646 struct extent_io_tree *unpin)
4647{
4648 u64 start;
4649 u64 end;
4650
4651 while (1) {
4652 struct extent_state *cached_state = NULL;
4653
4654 /*
4655 * The btrfs_finish_extent_commit() may get the same range as
4656 * ours between find_first_extent_bit and clear_extent_dirty.
4657 * Hence, hold the unused_bg_unpin_mutex to avoid double unpinning
4658 * of the same extent range.
4659 */
4660 mutex_lock(&fs_info->unused_bg_unpin_mutex);
4661 if (!btrfs_find_first_extent_bit(tree: unpin, start: 0, start_ret: &start, end_ret: &end,
4662 bits: EXTENT_DIRTY, cached_state: &cached_state)) {
4663 mutex_unlock(lock: &fs_info->unused_bg_unpin_mutex);
4664 break;
4665 }
4666
4667 btrfs_clear_extent_dirty(tree: unpin, start, end, cached: &cached_state);
4668 btrfs_free_extent_state(state: cached_state);
4669 btrfs_error_unpin_extent_range(fs_info, start, end);
4670 mutex_unlock(lock: &fs_info->unused_bg_unpin_mutex);
4671 cond_resched();
4672 }
4673}
4674
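/*
 * Abort an in-flight free space cache write for @cache: invalidate the cache
 * inode's pages, reset the io_ctl and drop the block group reference that was
 * held for the IO.
 */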
4675static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
4676{
4677 struct inode *inode;
4678
4679 inode = cache->io_ctl.inode;
4680 if (inode) {
4681 unsigned int nofs_flag;
4682
4683 nofs_flag = memalloc_nofs_save();
4684 invalidate_inode_pages2(mapping: inode->i_mapping);
4685 memalloc_nofs_restore(flags: nofs_flag);
4686
4687 BTRFS_I(inode)->generation = 0;
4688 cache->io_ctl.inode = NULL;
4689 iput(inode);
4690 }
4691 ASSERT(cache->io_ctl.pages == NULL);
4692 btrfs_put_block_group(cache);
4693}
4694
4695void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
4696 struct btrfs_fs_info *fs_info)
4697{
4698 struct btrfs_block_group *cache;
4699
4700 spin_lock(lock: &cur_trans->dirty_bgs_lock);
4701 while (!list_empty(head: &cur_trans->dirty_bgs)) {
4702 cache = list_first_entry(&cur_trans->dirty_bgs,
4703 struct btrfs_block_group,
4704 dirty_list);
4705
4706 if (!list_empty(head: &cache->io_list)) {
4707 spin_unlock(lock: &cur_trans->dirty_bgs_lock);
4708 list_del_init(entry: &cache->io_list);
4709 btrfs_cleanup_bg_io(cache);
4710 spin_lock(lock: &cur_trans->dirty_bgs_lock);
4711 }
4712
4713 list_del_init(entry: &cache->dirty_list);
4714 spin_lock(lock: &cache->lock);
4715 cache->disk_cache_state = BTRFS_DC_ERROR;
4716 spin_unlock(lock: &cache->lock);
4717
4718 spin_unlock(lock: &cur_trans->dirty_bgs_lock);
4719 btrfs_put_block_group(cache);
4720 btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
4721 spin_lock(lock: &cur_trans->dirty_bgs_lock);
4722 }
4723 spin_unlock(lock: &cur_trans->dirty_bgs_lock);
4724
4725 /*
4726 * Refer to the definition of the io_bgs member for details on why it's
4727 * safe to use it without any locking.
4728 */
4729 while (!list_empty(head: &cur_trans->io_bgs)) {
4730 cache = list_first_entry(&cur_trans->io_bgs,
4731 struct btrfs_block_group,
4732 io_list);
4733
4734 list_del_init(entry: &cache->io_list);
4735 spin_lock(lock: &cache->lock);
4736 cache->disk_cache_state = BTRFS_DC_ERROR;
4737 spin_unlock(lock: &cache->lock);
4738 btrfs_cleanup_bg_io(cache);
4739 }
4740}
4741
4742static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
4743{
4744 struct btrfs_root *gang[8];
4745 int i;
4746 int ret;
4747
4748 spin_lock(lock: &fs_info->fs_roots_radix_lock);
4749 while (1) {
4750 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
4751 results: (void **)gang, first_index: 0,
4752 ARRAY_SIZE(gang),
4753 BTRFS_ROOT_TRANS_TAG);
4754 if (ret == 0)
4755 break;
4756 for (i = 0; i < ret; i++) {
4757 struct btrfs_root *root = gang[i];
4758
4759 btrfs_qgroup_free_meta_all_pertrans(root);
4760 radix_tree_tag_clear(&fs_info->fs_roots_radix,
4761 index: (unsigned long)btrfs_root_id(root),
4762 BTRFS_ROOT_TRANS_TAG);
4763 }
4764 }
4765 spin_unlock(lock: &fs_info->fs_roots_radix_lock);
4766}
4767
4768void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
4769{
4770 struct btrfs_fs_info *fs_info = cur_trans->fs_info;
4771 struct btrfs_device *dev, *tmp;
4772
4773 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
4774 ASSERT(list_empty(&cur_trans->dirty_bgs));
4775 ASSERT(list_empty(&cur_trans->io_bgs));
4776
4777 list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
4778 post_commit_list) {
4779 list_del_init(entry: &dev->post_commit_list);
4780 }
4781
4782 btrfs_destroy_delayed_refs(trans: cur_trans);
4783
4784 cur_trans->state = TRANS_STATE_COMMIT_START;
4785 wake_up(&fs_info->transaction_blocked_wait);
4786
4787 cur_trans->state = TRANS_STATE_UNBLOCKED;
4788 wake_up(&fs_info->transaction_wait);
4789
4790 btrfs_destroy_marked_extents(fs_info, dirty_pages: &cur_trans->dirty_pages,
4791 mark: EXTENT_DIRTY);
4792 btrfs_destroy_pinned_extent(fs_info, unpin: &cur_trans->pinned_extents);
4793
4794 cur_trans->state = TRANS_STATE_COMPLETED;
4795 wake_up(&cur_trans->commit_wait);
4796}
4797
4798static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
4799{
4800 struct btrfs_transaction *t;
4801
4802 mutex_lock(&fs_info->transaction_kthread_mutex);
4803
4804 spin_lock(lock: &fs_info->trans_lock);
4805 while (!list_empty(head: &fs_info->trans_list)) {
4806 t = list_first_entry(&fs_info->trans_list,
4807 struct btrfs_transaction, list);
4808 if (t->state >= TRANS_STATE_COMMIT_PREP) {
4809 refcount_inc(r: &t->use_count);
4810 spin_unlock(lock: &fs_info->trans_lock);
4811 btrfs_wait_for_commit(fs_info, transid: t->transid);
4812 btrfs_put_transaction(transaction: t);
4813 spin_lock(lock: &fs_info->trans_lock);
4814 continue;
4815 }
4816 if (t == fs_info->running_transaction) {
4817 t->state = TRANS_STATE_COMMIT_DOING;
4818 spin_unlock(lock: &fs_info->trans_lock);
4819 /*
4820 * We wait for 0 num_writers since we don't hold a trans
4821 * handle open currently for this transaction.
4822 */
4823 wait_event(t->writer_wait,
4824 atomic_read(&t->num_writers) == 0);
4825 } else {
4826 spin_unlock(lock: &fs_info->trans_lock);
4827 }
4828 btrfs_cleanup_one_transaction(cur_trans: t);
4829
4830 spin_lock(lock: &fs_info->trans_lock);
4831 if (t == fs_info->running_transaction)
4832 fs_info->running_transaction = NULL;
4833 list_del_init(entry: &t->list);
4834 spin_unlock(lock: &fs_info->trans_lock);
4835
4836 btrfs_put_transaction(transaction: t);
4837 trace_btrfs_transaction_commit(fs_info);
4838 spin_lock(lock: &fs_info->trans_lock);
4839 }
4840 spin_unlock(lock: &fs_info->trans_lock);
4841 btrfs_destroy_all_ordered_extents(fs_info);
4842 btrfs_destroy_delayed_inodes(fs_info);
4843 btrfs_assert_delayed_root_empty(fs_info);
4844 btrfs_destroy_all_delalloc_inodes(fs_info);
4845 btrfs_drop_all_logs(fs_info);
4846 btrfs_free_all_qgroup_pertrans(fs_info);
4847 mutex_unlock(lock: &fs_info->transaction_kthread_mutex);
4848
4849 return 0;
4850}
4851
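/*
 * Initialize root->free_objectid from the highest objectid currently used in
 * @root, so newly created items get ids past the existing ones (but never
 * below BTRFS_FIRST_FREE_OBJECTID).
 */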
4852int btrfs_init_root_free_objectid(struct btrfs_root *root)
4853{
4854 BTRFS_PATH_AUTO_FREE(path);
4855 int ret;
4856 struct extent_buffer *l;
4857 struct btrfs_key search_key;
4858 struct btrfs_key found_key;
4859 int slot;
4860
4861 path = btrfs_alloc_path();
4862 if (!path)
4863 return -ENOMEM;
4864
4865 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
4866 search_key.type = -1;
4867 search_key.offset = (u64)-1;
4868 ret = btrfs_search_slot(NULL, root, key: &search_key, p: path, ins_len: 0, cow: 0);
4869 if (ret < 0)
4870 return ret;
4871 if (ret == 0) {
4872 /*
4873 * Key with offset -1 found; there would have to exist a root
4874 * with such an id, but this is out of the valid range.
4875 */
4876 return -EUCLEAN;
4877 }
4878 if (path->slots[0] > 0) {
4879 slot = path->slots[0] - 1;
4880 l = path->nodes[0];
4881 btrfs_item_key_to_cpu(eb: l, cpu_key: &found_key, nr: slot);
4882 root->free_objectid = max_t(u64, found_key.objectid + 1,
4883 BTRFS_FIRST_FREE_OBJECTID);
4884 } else {
4885 root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
4886 }
4887
4888 return 0;
4889}
4890
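/*
 * Hand out the next unused objectid of @root under objectid_mutex. Returns
 * -ENOSPC once the counter reaches BTRFS_LAST_FREE_OBJECTID.
 */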
4891int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
4892{
4893 int ret;
4894 mutex_lock(&root->objectid_mutex);
4895
4896 if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
4897 btrfs_warn(root->fs_info,
4898 "the objectid of root %llu reaches its highest value",
4899 btrfs_root_id(root));
4900 ret = -ENOSPC;
4901 goto out;
4902 }
4903
4904 *objectid = root->free_objectid++;
4905 ret = 0;
4906out:
4907 mutex_unlock(lock: &root->objectid_mutex);
4908 return ret;
4909}
4910
