1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | |
3 | #ifndef BTRFS_FS_H |
4 | #define BTRFS_FS_H |
5 | |
6 | #include <linux/blkdev.h> |
7 | #include <linux/sizes.h> |
8 | #include <linux/time64.h> |
9 | #include <linux/compiler.h> |
10 | #include <linux/math.h> |
11 | #include <linux/atomic.h> |
12 | #include <linux/blkdev.h> |
13 | #include <linux/percpu_counter.h> |
14 | #include <linux/completion.h> |
15 | #include <linux/lockdep.h> |
16 | #include <linux/spinlock.h> |
17 | #include <linux/mutex.h> |
18 | #include <linux/rwlock_types.h> |
19 | #include <linux/rwsem.h> |
20 | #include <linux/semaphore.h> |
21 | #include <linux/list.h> |
22 | #include <linux/radix-tree.h> |
23 | #include <linux/workqueue.h> |
24 | #include <linux/wait.h> |
25 | #include <linux/wait_bit.h> |
26 | #include <linux/sched.h> |
27 | #include <linux/rbtree.h> |
28 | #include <uapi/linux/btrfs.h> |
29 | #include <uapi/linux/btrfs_tree.h> |
30 | #include "extent-io-tree.h" |
31 | #include "async-thread.h" |
32 | #include "block-rsv.h" |
33 | #include "fs.h" |
34 | |
35 | struct inode; |
36 | struct super_block; |
37 | struct kobject; |
38 | struct reloc_control; |
39 | struct crypto_shash; |
40 | struct ulist; |
41 | struct btrfs_device; |
42 | struct btrfs_block_group; |
43 | struct btrfs_root; |
44 | struct btrfs_fs_devices; |
45 | struct btrfs_transaction; |
46 | struct btrfs_delayed_root; |
47 | struct btrfs_balance_control; |
48 | struct btrfs_subpage_info; |
49 | struct btrfs_stripe_hash_table; |
50 | struct btrfs_space_info; |
51 | |
52 | #define BTRFS_MAX_EXTENT_SIZE SZ_128M |
53 | |
54 | #define BTRFS_OLDEST_GENERATION 0ULL |
55 | |
56 | #define BTRFS_EMPTY_DIR_SIZE 0 |
57 | |
58 | #define BTRFS_DIRTY_METADATA_THRESH SZ_32M |
59 | |
60 | #define BTRFS_SUPER_INFO_OFFSET SZ_64K |
61 | #define BTRFS_SUPER_INFO_SIZE 4096 |
62 | static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); |
63 | |
64 | /* |
65 | * Number of metadata items necessary for an unlink operation: |
66 | * |
67 | * 1 for the possible orphan item |
68 | * 1 for the dir item |
69 | * 1 for the dir index |
70 | * 1 for the inode ref |
71 | * 1 for the inode |
72 | * 1 for the parent inode |
73 | */ |
74 | #define BTRFS_UNLINK_METADATA_UNITS 6 |
75 | |
76 | /* |
77 | * The reserved space at the beginning of each device. It covers the primary |
78 | * super block and leaves space for potential use by other tools like |
79 | * bootloaders or to lower potential damage of accidental overwrite. |
80 | */ |
81 | #define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) |
/*
 * Runtime (in-memory only) states of a filesystem.
 *
 * NOTE(review): presumably bit numbers for btrfs_fs_info::fs_state - confirm.
 */
enum {
	/*
	 * A remount is in progress; some operations (defrag for example) may
	 * be skipped while this is set.
	 */
	BTRFS_FS_STATE_REMOUNTING,

	/* The filesystem is in RO mode. */
	BTRFS_FS_STATE_RO,

	/* A transaction abort has already been reported on this filesystem. */
	BTRFS_FS_STATE_TRANS_ABORTED,

	/*
	 * Bio operations must be blocked because a device replace is
	 * destroying its source or target device.
	 */
	BTRFS_FS_STATE_DEV_REPLACING,

	/* This btrfs_fs_info was created for the self-tests. */
	BTRFS_FS_STATE_DUMMY_FS_INFO,

	/*
	 * NOTE(review): not documented in this header; the name suggests that
	 * checksums are not used/verified - confirm.
	 */
	BTRFS_FS_STATE_NO_CSUMS,

	/* Cleaning up a log tree hit an error. */
	BTRFS_FS_STATE_LOG_CLEANUP_ERROR,

	/* Number of states, keep last. */
	BTRFS_FS_STATE_COUNT
};
110 | |
/*
 * Filesystem-wide runtime flags.
 *
 * NOTE(review): presumably bit numbers for btrfs_fs_info::flags - confirm.
 */
enum {
	BTRFS_FS_CLOSING_START,
	BTRFS_FS_CLOSING_DONE,
	BTRFS_FS_LOG_RECOVERING,
	BTRFS_FS_OPEN,
	BTRFS_FS_QUOTA_ENABLED,
	BTRFS_FS_UPDATE_UUID_TREE_GEN,
	BTRFS_FS_CREATING_FREE_SPACE_TREE,
	BTRFS_FS_BTREE_ERR,
	BTRFS_FS_LOG1_ERR,
	BTRFS_FS_LOG2_ERR,
	BTRFS_FS_QUOTA_OVERRIDE,

	/* Internal record of whether the fs has been frozen. */
	BTRFS_FS_FROZEN,

	/*
	 * Balance was set up through the ioctl and has reached its main
	 * phase; fs_info::balance_ctl is initialized.
	 */
	BTRFS_FS_BALANCE_RUNNING,

	/*
	 * Relocation of a chunk has started. Set per chunk, i.e. toggled
	 * between chunks.
	 */
	BTRFS_FS_RELOC_RUNNING,

	/* The cleaner thread is awake and doing something. */
	BTRFS_FS_CLEANER_RUNNING,

	/*
	 * The checksum implementation is an optimized one and considered
	 * fast, so checksumming need not be offloaded to workqueues.
	 */
	BTRFS_FS_CSUM_IMPL_FAST,

	/* The discard workqueue can service discards. */
	BTRFS_FS_DISCARD_RUNNING,

	/* Space cache v1 needs to be cleaned up. */
	BTRFS_FS_CLEANUP_SPACE_CACHE_V1,

	/* The free space tree can't be trusted for caching yet. */
	BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,

	/* There are tree modification log users. */
	BTRFS_FS_TREE_MOD_LOG_USERS,

	/* The transaction kthread should commit right now. */
	BTRFS_FS_COMMIT_TRANS,

	/* Half completed snapshot deletions are pending. */
	BTRFS_FS_UNFINISHED_DROPS,

	/* A zone has to be finished before the next allocation. */
	BTRFS_FS_NEED_ZONE_FINISH,

	/* The transaction should be committed. */
	BTRFS_FS_NEED_TRANS_COMMIT,

	/* Active zone tracking is needed. */
	BTRFS_FS_ACTIVE_ZONE_TRACKING,

	/*
	 * Some features changed; mostly a hint for the cleaner thread to
	 * update the sysfs interface.
	 */
	BTRFS_FS_FEATURE_CHANGED,

	/*
	 * A tree block was found that is aligned to sectorsize only, not to
	 * nodesize. This should be rare nowadays.
	 */
	BTRFS_FS_UNALIGNED_TREE_BLOCK,

#if BITS_PER_LONG == 32
	/* An error/warning message was already printed on a 32bit system. */
	BTRFS_FS_32BIT_ERROR,
	BTRFS_FS_32BIT_WARN,
#endif
};
191 | |
/*
 * Mount option flags, one bit per option.
 *
 * Note: every new option must also be added to btrfs_show_options().
 */
enum {
	BTRFS_MOUNT_NODATASUM			= 1UL << 0,
	BTRFS_MOUNT_NODATACOW			= 1UL << 1,
	BTRFS_MOUNT_NOBARRIER			= 1UL << 2,
	BTRFS_MOUNT_SSD				= 1UL << 3,
	BTRFS_MOUNT_DEGRADED			= 1UL << 4,
	BTRFS_MOUNT_COMPRESS			= 1UL << 5,
	BTRFS_MOUNT_NOTREELOG			= 1UL << 6,
	BTRFS_MOUNT_FLUSHONCOMMIT		= 1UL << 7,
	BTRFS_MOUNT_SSD_SPREAD			= 1UL << 8,
	BTRFS_MOUNT_NOSSD			= 1UL << 9,
	BTRFS_MOUNT_DISCARD_SYNC		= 1UL << 10,
	BTRFS_MOUNT_FORCE_COMPRESS		= 1UL << 11,
	BTRFS_MOUNT_SPACE_CACHE			= 1UL << 12,
	BTRFS_MOUNT_CLEAR_CACHE			= 1UL << 13,
	BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED	= 1UL << 14,
	BTRFS_MOUNT_ENOSPC_DEBUG		= 1UL << 15,
	BTRFS_MOUNT_AUTO_DEFRAG			= 1UL << 16,
	BTRFS_MOUNT_USEBACKUPROOT		= 1UL << 17,
	BTRFS_MOUNT_SKIP_BALANCE		= 1UL << 18,
	BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	= 1UL << 19,
	BTRFS_MOUNT_RESCAN_UUID_TREE		= 1UL << 20,
	BTRFS_MOUNT_FRAGMENT_DATA		= 1UL << 21,
	BTRFS_MOUNT_FRAGMENT_METADATA		= 1UL << 22,
	BTRFS_MOUNT_FREE_SPACE_TREE		= 1UL << 23,
	BTRFS_MOUNT_NOLOGREPLAY			= 1UL << 24,
	BTRFS_MOUNT_REF_VERIFY			= 1UL << 25,
	BTRFS_MOUNT_DISCARD_ASYNC		= 1UL << 26,
	BTRFS_MOUNT_IGNOREBADROOTS		= 1UL << 27,
	BTRFS_MOUNT_IGNOREDATACSUMS		= 1UL << 28,
	BTRFS_MOUNT_NODISCARD			= 1UL << 29,
	BTRFS_MOUNT_NOSPACECACHE		= 1UL << 30,
};
230 | |
/*
 * The feature flags this implementation supports. A filesystem that has any
 * incompat flag set which is not listed below will fail to mount.
 */
#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET		0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR		0ULL

#define BTRFS_FEATURE_COMPAT_RO_SUPP			\
	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
	 BTRFS_FEATURE_COMPAT_RO_VERITY |		\
	 BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)

#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL

/* Incompat features that are supported regardless of the debug config. */
#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE		\
	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD |		\
	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
	 BTRFS_FEATURE_INCOMPAT_NO_HOLES |		\
	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID |		\
	 BTRFS_FEATURE_INCOMPAT_RAID1C34 |		\
	 BTRFS_FEATURE_INCOMPAT_ZONED |			\
	 BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * Features still under development, such as extent tree v2 support,
	 * are enabled only under CONFIG_BTRFS_DEBUG.
	 */
#define BTRFS_FEATURE_INCOMPAT_SUPP		\
	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE |	\
	 BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
	 BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)

#else

#define BTRFS_FEATURE_INCOMPAT_SUPP		\
	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE)

#endif

#define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\
	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR		0ULL
284 | |
/*
 * Default tunables.
 *
 * NOTE(review): units are not stated in this header; presumably seconds for
 * the commit interval and bytes for the inline limit - confirm.
 */
#define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
#define BTRFS_DEFAULT_MAX_INLINE	(2048)
287 | |
288 | struct btrfs_dev_replace { |
289 | /* See #define above */ |
290 | u64 replace_state; |
291 | /* Seconds since 1-Jan-1970 */ |
292 | time64_t time_started; |
293 | /* Seconds since 1-Jan-1970 */ |
294 | time64_t time_stopped; |
295 | atomic64_t num_write_errors; |
296 | atomic64_t num_uncorrectable_read_errors; |
297 | |
298 | u64 cursor_left; |
299 | u64 committed_cursor_left; |
300 | u64 cursor_left_last_write_of_item; |
301 | u64 cursor_right; |
302 | |
303 | /* See #define above */ |
304 | u64 cont_reading_from_srcdev_mode; |
305 | |
306 | int is_valid; |
307 | int item_needs_writeback; |
308 | struct btrfs_device *srcdev; |
309 | struct btrfs_device *tgtdev; |
310 | |
311 | struct mutex lock_finishing_cancel_unmount; |
312 | struct rw_semaphore rwsem; |
313 | |
314 | struct btrfs_scrub_progress scrub_progress; |
315 | |
316 | struct percpu_counter bio_counter; |
317 | wait_queue_head_t replace_wait; |
318 | }; |
319 | |
320 | /* |
321 | * Free clusters are used to claim free space in relatively large chunks, |
322 | * allowing us to do less seeky writes. They are used for all metadata |
323 | * allocations. In ssd_spread mode they are also used for data allocations. |
324 | */ |
325 | struct btrfs_free_cluster { |
326 | spinlock_t lock; |
327 | spinlock_t refill_lock; |
328 | struct rb_root root; |
329 | |
330 | /* Largest extent in this cluster */ |
331 | u64 max_size; |
332 | |
333 | /* First extent starting offset */ |
334 | u64 window_start; |
335 | |
336 | /* We did a full search and couldn't create a cluster */ |
337 | bool fragmented; |
338 | |
339 | struct btrfs_block_group *block_group; |
340 | /* |
341 | * When a cluster is allocated from a block group, we put the cluster |
342 | * onto a list in the block group so that it can be freed before the |
343 | * block group is freed. |
344 | */ |
345 | struct list_head block_group_list; |
346 | }; |
347 | |
/*
 * Discard control.
 *
 * Async discard keeps several lists, one per set of discard filter
 * parameters. List 0 holds completely free block groups, which must be
 * trimmed in their entirety without being lossy. The following lists stand
 * for monotonically decreasing discard filter sizes and so define the
 * priority of what gets discarded next.
 */
#define BTRFS_NR_DISCARD_LISTS		3
#define BTRFS_DISCARD_INDEX_UNUSED	0
#define BTRFS_DISCARD_INDEX_START	1
359 | |
360 | struct btrfs_discard_ctl { |
361 | struct workqueue_struct *discard_workers; |
362 | struct delayed_work work; |
363 | spinlock_t lock; |
364 | struct btrfs_block_group *block_group; |
365 | struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; |
366 | u64 prev_discard; |
367 | u64 prev_discard_time; |
368 | atomic_t discardable_extents; |
369 | atomic64_t discardable_bytes; |
370 | u64 max_discard_size; |
371 | u64 delay_ms; |
372 | u32 iops_limit; |
373 | u32 kbps_limit; |
374 | u64 discard_extent_bytes; |
375 | u64 discard_bitmap_bytes; |
376 | atomic64_t discard_bytes_saved; |
377 | }; |
378 | |
/*
 * The exclusive operations: device replace, resize, device add/remove and
 * balance.
 */
enum btrfs_exclusive_operation {
	BTRFS_EXCLOP_NONE,
	BTRFS_EXCLOP_BALANCE_PAUSED,
	BTRFS_EXCLOP_BALANCE,
	BTRFS_EXCLOP_DEV_ADD,
	BTRFS_EXCLOP_DEV_REMOVE,
	BTRFS_EXCLOP_DEV_REPLACE,
	BTRFS_EXCLOP_RESIZE,
	BTRFS_EXCLOP_SWAP_ACTIVATE,
};
392 | |
393 | /* Store data about transaction commits, exported via sysfs. */ |
394 | struct btrfs_commit_stats { |
395 | /* Total number of commits */ |
396 | u64 commit_count; |
397 | /* The maximum commit duration so far in ns */ |
398 | u64 max_commit_dur; |
399 | /* The last commit duration in ns */ |
400 | u64 last_commit_dur; |
401 | /* The total commit duration in ns */ |
402 | u64 total_commit_dur; |
403 | }; |
404 | |
405 | struct btrfs_fs_info { |
406 | u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; |
407 | unsigned long flags; |
408 | struct btrfs_root *tree_root; |
409 | struct btrfs_root *chunk_root; |
410 | struct btrfs_root *dev_root; |
411 | struct btrfs_root *fs_root; |
412 | struct btrfs_root *quota_root; |
413 | struct btrfs_root *uuid_root; |
414 | struct btrfs_root *data_reloc_root; |
415 | struct btrfs_root *block_group_root; |
416 | struct btrfs_root *stripe_root; |
417 | |
418 | /* The log root tree is a directory of all the other log roots */ |
419 | struct btrfs_root *log_root_tree; |
420 | |
421 | /* The tree that holds the global roots (csum, extent, etc) */ |
422 | rwlock_t global_root_lock; |
423 | struct rb_root global_root_tree; |
424 | |
425 | spinlock_t fs_roots_radix_lock; |
426 | struct radix_tree_root fs_roots_radix; |
427 | |
428 | /* Block group cache stuff */ |
429 | rwlock_t block_group_cache_lock; |
430 | struct rb_root_cached block_group_cache_tree; |
431 | |
432 | /* Keep track of unallocated space */ |
433 | atomic64_t free_chunk_space; |
434 | |
435 | /* Track ranges which are used by log trees blocks/logged data extents */ |
436 | struct extent_io_tree excluded_extents; |
437 | |
438 | /* logical->physical extent mapping */ |
439 | struct rb_root_cached mapping_tree; |
440 | rwlock_t mapping_tree_lock; |
441 | |
442 | /* |
443 | * Block reservation for extent, checksum, root tree and delayed dir |
444 | * index item. |
445 | */ |
446 | struct btrfs_block_rsv global_block_rsv; |
447 | /* Block reservation for metadata operations */ |
448 | struct btrfs_block_rsv trans_block_rsv; |
449 | /* Block reservation for chunk tree */ |
450 | struct btrfs_block_rsv chunk_block_rsv; |
451 | /* Block reservation for delayed operations */ |
452 | struct btrfs_block_rsv delayed_block_rsv; |
453 | /* Block reservation for delayed refs */ |
454 | struct btrfs_block_rsv delayed_refs_rsv; |
455 | |
456 | struct btrfs_block_rsv empty_block_rsv; |
457 | |
458 | /* |
459 | * Updated while holding the lock 'trans_lock'. Due to the life cycle of |
460 | * a transaction, it can be directly read while holding a transaction |
461 | * handle, everywhere else must be read with btrfs_get_fs_generation(). |
462 | * Should always be updated using btrfs_set_fs_generation(). |
463 | */ |
464 | u64 generation; |
465 | /* |
466 | * Always use btrfs_get_last_trans_committed() and |
467 | * btrfs_set_last_trans_committed() to read and update this field. |
468 | */ |
469 | u64 last_trans_committed; |
470 | /* |
471 | * Generation of the last transaction used for block group relocation |
472 | * since the filesystem was last mounted (or 0 if none happened yet). |
473 | * Must be written and read while holding btrfs_fs_info::commit_root_sem. |
474 | */ |
475 | u64 last_reloc_trans; |
476 | |
477 | /* |
478 | * This is updated to the current trans every time a full commit is |
479 | * required instead of the faster short fsync log commits |
480 | */ |
481 | u64 last_trans_log_full_commit; |
482 | unsigned long mount_opt; |
483 | |
484 | unsigned long compress_type:4; |
485 | unsigned int compress_level; |
486 | u32 commit_interval; |
487 | /* |
488 | * It is a suggestive number, the read side is safe even it gets a |
489 | * wrong number because we will write out the data into a regular |
490 | * extent. The write side(mount/remount) is under ->s_umount lock, |
491 | * so it is also safe. |
492 | */ |
493 | u64 max_inline; |
494 | |
495 | struct btrfs_transaction *running_transaction; |
496 | wait_queue_head_t transaction_throttle; |
497 | wait_queue_head_t transaction_wait; |
498 | wait_queue_head_t transaction_blocked_wait; |
499 | wait_queue_head_t async_submit_wait; |
500 | |
501 | /* |
502 | * Used to protect the incompat_flags, compat_flags, compat_ro_flags |
503 | * when they are updated. |
504 | * |
505 | * Because we do not clear the flags for ever, so we needn't use |
506 | * the lock on the read side. |
507 | * |
508 | * We also needn't use the lock when we mount the fs, because |
509 | * there is no other task which will update the flag. |
510 | */ |
511 | spinlock_t super_lock; |
512 | struct btrfs_super_block *super_copy; |
513 | struct btrfs_super_block *super_for_commit; |
514 | struct super_block *sb; |
515 | struct inode *btree_inode; |
516 | struct mutex tree_log_mutex; |
517 | struct mutex transaction_kthread_mutex; |
518 | struct mutex cleaner_mutex; |
519 | struct mutex chunk_mutex; |
520 | |
521 | /* |
522 | * This is taken to make sure we don't set block groups ro after the |
523 | * free space cache has been allocated on them. |
524 | */ |
525 | struct mutex ro_block_group_mutex; |
526 | |
527 | /* |
528 | * This is used during read/modify/write to make sure no two ios are |
529 | * trying to mod the same stripe at the same time. |
530 | */ |
531 | struct btrfs_stripe_hash_table *stripe_hash_table; |
532 | |
533 | /* |
534 | * This protects the ordered operations list only while we are |
535 | * processing all of the entries on it. This way we make sure the |
536 | * commit code doesn't find the list temporarily empty because another |
537 | * function happens to be doing non-waiting preflush before jumping |
538 | * into the main commit. |
539 | */ |
540 | struct mutex ordered_operations_mutex; |
541 | |
542 | struct rw_semaphore commit_root_sem; |
543 | |
544 | struct rw_semaphore cleanup_work_sem; |
545 | |
546 | struct rw_semaphore subvol_sem; |
547 | |
548 | spinlock_t trans_lock; |
549 | /* |
550 | * The reloc mutex goes with the trans lock, it is taken during commit |
551 | * to protect us from the relocation code. |
552 | */ |
553 | struct mutex reloc_mutex; |
554 | |
555 | struct list_head trans_list; |
556 | struct list_head dead_roots; |
557 | struct list_head caching_block_groups; |
558 | |
559 | spinlock_t delayed_iput_lock; |
560 | struct list_head delayed_iputs; |
561 | atomic_t nr_delayed_iputs; |
562 | wait_queue_head_t delayed_iputs_wait; |
563 | |
564 | atomic64_t tree_mod_seq; |
565 | |
566 | /* This protects tree_mod_log and tree_mod_seq_list */ |
567 | rwlock_t tree_mod_log_lock; |
568 | struct rb_root tree_mod_log; |
569 | struct list_head tree_mod_seq_list; |
570 | |
571 | atomic_t async_delalloc_pages; |
572 | |
573 | /* This is used to protect the following list -- ordered_roots. */ |
574 | spinlock_t ordered_root_lock; |
575 | |
576 | /* |
577 | * All fs/file tree roots in which there are data=ordered extents |
578 | * pending writeback are added into this list. |
579 | * |
580 | * These can span multiple transactions and basically include every |
581 | * dirty data page that isn't from nodatacow. |
582 | */ |
583 | struct list_head ordered_roots; |
584 | |
585 | struct mutex delalloc_root_mutex; |
586 | spinlock_t delalloc_root_lock; |
587 | /* All fs/file tree roots that have delalloc inodes. */ |
588 | struct list_head delalloc_roots; |
589 | |
590 | /* |
591 | * There is a pool of worker threads for checksumming during writes and |
592 | * a pool for checksumming after reads. This is because readers can |
593 | * run with FS locks held, and the writers may be waiting for those |
594 | * locks. We don't want ordering in the pending list to cause |
595 | * deadlocks, and so the two are serviced separately. |
596 | * |
597 | * A third pool does submit_bio to avoid deadlocking with the other two. |
598 | */ |
599 | struct btrfs_workqueue *workers; |
600 | struct btrfs_workqueue *delalloc_workers; |
601 | struct btrfs_workqueue *flush_workers; |
602 | struct workqueue_struct *endio_workers; |
603 | struct workqueue_struct *endio_meta_workers; |
604 | struct workqueue_struct *rmw_workers; |
605 | struct workqueue_struct *compressed_write_workers; |
606 | struct btrfs_workqueue *endio_write_workers; |
607 | struct btrfs_workqueue *endio_freespace_worker; |
608 | struct btrfs_workqueue *caching_workers; |
609 | |
610 | /* |
611 | * Fixup workers take dirty pages that didn't properly go through the |
612 | * cow mechanism and make them safe to write. It happens for the |
613 | * sys_munmap function call path. |
614 | */ |
615 | struct btrfs_workqueue *fixup_workers; |
616 | struct btrfs_workqueue *delayed_workers; |
617 | |
618 | struct task_struct *transaction_kthread; |
619 | struct task_struct *cleaner_kthread; |
620 | u32 thread_pool_size; |
621 | |
622 | struct kobject *space_info_kobj; |
623 | struct kobject *qgroups_kobj; |
624 | struct kobject *discard_kobj; |
625 | |
626 | /* Used to keep from writing metadata until there is a nice batch */ |
627 | struct percpu_counter dirty_metadata_bytes; |
628 | struct percpu_counter delalloc_bytes; |
629 | struct percpu_counter ordered_bytes; |
630 | s32 dirty_metadata_batch; |
631 | s32 delalloc_batch; |
632 | |
633 | /* Protected by 'trans_lock'. */ |
634 | struct list_head dirty_cowonly_roots; |
635 | |
636 | struct btrfs_fs_devices *fs_devices; |
637 | |
638 | /* |
639 | * The space_info list is effectively read only after initial setup. |
640 | * It is populated at mount time and cleaned up after all block groups |
641 | * are removed. RCU is used to protect it. |
642 | */ |
643 | struct list_head space_info; |
644 | |
645 | struct btrfs_space_info *data_sinfo; |
646 | |
647 | struct reloc_control *reloc_ctl; |
648 | |
649 | /* data_alloc_cluster is only used in ssd_spread mode */ |
650 | struct btrfs_free_cluster data_alloc_cluster; |
651 | |
652 | /* All metadata allocations go through this cluster. */ |
653 | struct btrfs_free_cluster meta_alloc_cluster; |
654 | |
655 | /* Auto defrag inodes go here. */ |
656 | spinlock_t defrag_inodes_lock; |
657 | struct rb_root defrag_inodes; |
658 | atomic_t defrag_running; |
659 | |
660 | /* Used to protect avail_{data, metadata, system}_alloc_bits */ |
661 | seqlock_t profiles_lock; |
662 | /* |
663 | * These three are in extended format (availability of single chunks is |
664 | * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted |
665 | * by corresponding BTRFS_BLOCK_GROUP_* bits) |
666 | */ |
667 | u64 avail_data_alloc_bits; |
668 | u64 avail_metadata_alloc_bits; |
669 | u64 avail_system_alloc_bits; |
670 | |
671 | /* Balance state */ |
672 | spinlock_t balance_lock; |
673 | struct mutex balance_mutex; |
674 | atomic_t balance_pause_req; |
675 | atomic_t balance_cancel_req; |
676 | struct btrfs_balance_control *balance_ctl; |
677 | wait_queue_head_t balance_wait_q; |
678 | |
679 | /* Cancellation requests for chunk relocation */ |
680 | atomic_t reloc_cancel_req; |
681 | |
682 | u32 data_chunk_allocations; |
683 | u32 metadata_ratio; |
684 | |
685 | void *bdev_holder; |
686 | |
687 | /* Private scrub information */ |
688 | struct mutex scrub_lock; |
689 | atomic_t scrubs_running; |
690 | atomic_t scrub_pause_req; |
691 | atomic_t scrubs_paused; |
692 | atomic_t scrub_cancel_req; |
693 | wait_queue_head_t scrub_pause_wait; |
694 | /* |
695 | * The worker pointers are NULL iff the refcount is 0, ie. scrub is not |
696 | * running. |
697 | */ |
698 | refcount_t scrub_workers_refcnt; |
699 | struct workqueue_struct *scrub_workers; |
700 | struct btrfs_subpage_info *subpage_info; |
701 | |
702 | struct btrfs_discard_ctl discard_ctl; |
703 | |
704 | /* Is qgroup tracking in a consistent state? */ |
705 | u64 qgroup_flags; |
706 | |
707 | /* Holds configuration and tracking. Protected by qgroup_lock. */ |
708 | struct rb_root qgroup_tree; |
709 | spinlock_t qgroup_lock; |
710 | |
711 | /* |
712 | * Used to avoid frequently calling ulist_alloc()/ulist_free() |
713 | * when doing qgroup accounting, it must be protected by qgroup_lock. |
714 | */ |
715 | struct ulist *qgroup_ulist; |
716 | |
717 | /* |
718 | * Protect user change for quota operations. If a transaction is needed, |
719 | * it must be started before locking this lock. |
720 | */ |
721 | struct mutex qgroup_ioctl_lock; |
722 | |
723 | /* List of dirty qgroups to be written at next commit. */ |
724 | struct list_head dirty_qgroups; |
725 | |
726 | /* Used by qgroup for an efficient tree traversal. */ |
727 | u64 qgroup_seq; |
728 | |
729 | /* Qgroup rescan items. */ |
730 | /* Protects the progress item */ |
731 | struct mutex qgroup_rescan_lock; |
732 | struct btrfs_key qgroup_rescan_progress; |
733 | struct btrfs_workqueue *qgroup_rescan_workers; |
734 | struct completion qgroup_rescan_completion; |
735 | struct btrfs_work qgroup_rescan_work; |
736 | /* Protected by qgroup_rescan_lock */ |
737 | bool qgroup_rescan_running; |
738 | u8 qgroup_drop_subtree_thres; |
739 | u64 qgroup_enable_gen; |
740 | |
741 | /* |
742 | * If this is not 0, then it indicates a serious filesystem error has |
743 | * happened and it contains that error (negative errno value). |
744 | */ |
745 | int fs_error; |
746 | |
747 | /* Filesystem state */ |
748 | unsigned long fs_state; |
749 | |
750 | struct btrfs_delayed_root *delayed_root; |
751 | |
752 | /* Extent buffer radix tree */ |
753 | spinlock_t buffer_lock; |
754 | /* Entries are eb->start / sectorsize */ |
755 | struct radix_tree_root buffer_radix; |
756 | |
757 | /* Next backup root to be overwritten */ |
758 | int backup_root_index; |
759 | |
760 | /* Device replace state */ |
761 | struct btrfs_dev_replace dev_replace; |
762 | |
763 | struct semaphore uuid_tree_rescan_sem; |
764 | |
765 | /* Used to reclaim the metadata space in the background. */ |
766 | struct work_struct async_reclaim_work; |
767 | struct work_struct async_data_reclaim_work; |
768 | struct work_struct preempt_reclaim_work; |
769 | |
770 | /* Reclaim partially filled block groups in the background */ |
771 | struct work_struct reclaim_bgs_work; |
772 | /* Protected by unused_bgs_lock. */ |
773 | struct list_head reclaim_bgs; |
774 | int bg_reclaim_threshold; |
775 | |
776 | /* Protects the lists unused_bgs and reclaim_bgs. */ |
777 | spinlock_t unused_bgs_lock; |
778 | /* Protected by unused_bgs_lock. */ |
779 | struct list_head unused_bgs; |
780 | struct mutex unused_bg_unpin_mutex; |
781 | /* Protect block groups that are going to be deleted */ |
782 | struct mutex reclaim_bgs_lock; |
783 | |
784 | /* Cached block sizes */ |
785 | u32 nodesize; |
786 | u32 sectorsize; |
787 | /* ilog2 of sectorsize, use to avoid 64bit division */ |
788 | u32 sectorsize_bits; |
789 | u32 csum_size; |
790 | u32 csums_per_leaf; |
791 | u32 stripesize; |
792 | |
793 | /* |
794 | * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular |
795 | * filesystem, on zoned it depends on the device constraints. |
796 | */ |
797 | u64 max_extent_size; |
798 | |
799 | /* Block groups and devices containing active swapfiles. */ |
800 | spinlock_t swapfile_pins_lock; |
801 | struct rb_root swapfile_pins; |
802 | |
803 | struct crypto_shash *csum_shash; |
804 | |
805 | /* Type of exclusive operation running, protected by super_lock */ |
806 | enum btrfs_exclusive_operation exclusive_operation; |
807 | |
808 | /* |
809 | * Zone size > 0 when in ZONED mode, otherwise it's used for a check |
810 | * if the mode is enabled |
811 | */ |
812 | u64 zone_size; |
813 | |
814 | /* Constraints for ZONE_APPEND commands: */ |
815 | struct queue_limits limits; |
816 | u64 max_zone_append_size; |
817 | |
818 | struct mutex zoned_meta_io_lock; |
819 | spinlock_t treelog_bg_lock; |
820 | u64 treelog_bg; |
821 | |
822 | /* |
823 | * Start of the dedicated data relocation block group, protected by |
824 | * relocation_bg_lock. |
825 | */ |
826 | spinlock_t relocation_bg_lock; |
827 | u64 data_reloc_bg; |
828 | struct mutex zoned_data_reloc_io_lock; |
829 | |
830 | struct btrfs_block_group *active_meta_bg; |
831 | struct btrfs_block_group *active_system_bg; |
832 | |
833 | u64 nr_global_roots; |
834 | |
835 | spinlock_t zone_active_bgs_lock; |
836 | struct list_head zone_active_bgs; |
837 | |
838 | /* Updates are not protected by any lock */ |
839 | struct btrfs_commit_stats commit_stats; |
840 | |
841 | /* |
842 | * Last generation where we dropped a non-relocation root. |
843 | * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() |
844 | * to change it and to read it, respectively. |
845 | */ |
846 | u64 last_root_drop_gen; |
847 | |
848 | /* |
849 | * Annotations for transaction events (structures are empty when |
850 | * compiled without lockdep). |
851 | */ |
852 | struct lockdep_map btrfs_trans_num_writers_map; |
853 | struct lockdep_map btrfs_trans_num_extwriters_map; |
854 | struct lockdep_map btrfs_state_change_map[4]; |
855 | struct lockdep_map btrfs_trans_pending_ordered_map; |
856 | struct lockdep_map btrfs_ordered_extent_map; |
857 | |
858 | #ifdef CONFIG_BTRFS_FS_REF_VERIFY |
859 | spinlock_t ref_verify_lock; |
860 | struct rb_root block_tree; |
861 | #endif |
862 | |
863 | #ifdef CONFIG_BTRFS_DEBUG |
864 | struct kobject *debug_kobj; |
865 | struct list_head allocated_roots; |
866 | |
867 | spinlock_t eb_leak_lock; |
868 | struct list_head allocated_ebs; |
869 | #endif |
870 | }; |
871 | |
/*
 * Helpers that walk from a page, folio or VFS inode to the btrfs inode or
 * to the owning fs_info.
 *
 * Each _Generic() selection lists exactly one accepted type, so handing in
 * an argument of any other type is a compile-time error.
 */
#define page_to_inode(_page)						\
	(BTRFS_I(_Generic((_page),					\
			  struct page *: (_page))->mapping->host))
#define folio_to_inode(_folio)						\
	(BTRFS_I(_Generic((_folio),					\
			  struct folio *: (_folio))->mapping->host))

#define page_to_fs_info(_page)	 (page_to_inode(_page)->root->fs_info)
#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info)

#define inode_to_fs_info(_inode)					\
	(BTRFS_I(_Generic((_inode),					\
			  struct inode *: (_inode)))->root->fs_info)
882 | |
883 | static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) |
884 | { |
885 | return READ_ONCE(fs_info->generation); |
886 | } |
887 | |
888 | static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen) |
889 | { |
890 | WRITE_ONCE(fs_info->generation, gen); |
891 | } |
892 | |
893 | static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info) |
894 | { |
895 | return READ_ONCE(fs_info->last_trans_committed); |
896 | } |
897 | |
898 | static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen) |
899 | { |
900 | WRITE_ONCE(fs_info->last_trans_committed, gen); |
901 | } |
902 | |
903 | static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, |
904 | u64 gen) |
905 | { |
906 | WRITE_ONCE(fs_info->last_root_drop_gen, gen); |
907 | } |
908 | |
909 | static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) |
910 | { |
911 | return READ_ONCE(fs_info->last_root_drop_gen); |
912 | } |
913 | |
914 | /* |
915 | * Take the number of bytes to be checksummed and figure out how many leaves |
916 | * it would require to store the csums for that many bytes. |
917 | */ |
918 | static inline u64 btrfs_csum_bytes_to_leaves( |
919 | const struct btrfs_fs_info *fs_info, u64 csum_bytes) |
920 | { |
921 | const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; |
922 | |
923 | return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); |
924 | } |
925 | |
926 | /* |
927 | * Use this if we would be adding new items, as we could split nodes as we cow |
928 | * down the tree. |
929 | */ |
930 | static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info, |
931 | unsigned num_items) |
932 | { |
933 | return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; |
934 | } |
935 | |
936 | /* |
937 | * Doing a truncate or a modification won't result in new nodes or leaves, just |
938 | * what we need for COW. |
939 | */ |
940 | static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, |
941 | unsigned num_items) |
942 | { |
943 | return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; |
944 | } |
945 | |
/*
 * Upper bound for an extent item: one sixteenth of the leaf data area
 * (BTRFS_LEAF_DATA_SIZE() >> 4) minus one struct btrfs_item.
 *
 * @r: pointer to an object with an ->fs_info member (presumably a
 *     struct btrfs_root). The argument is parenthesized so that any pointer
 *     expression, e.g. 'p + 1', expands correctly.
 */
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r)					\
	((BTRFS_LEAF_DATA_SIZE((r)->fs_info) >> 4) - sizeof(struct btrfs_item))
948 | |
949 | static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) |
950 | { |
951 | return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; |
952 | } |
953 | |
954 | /* |
955 | * Count how many fs_info->max_extent_size cover the @size |
956 | */ |
957 | static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) |
958 | { |
959 | #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS |
960 | if (!fs_info) |
961 | return div_u64(dividend: size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); |
962 | #endif |
963 | |
964 | return div_u64(dividend: size + fs_info->max_extent_size - 1, divisor: fs_info->max_extent_size); |
965 | } |
966 | |
/*
 * Exclusive operation helpers. Only the prototypes are given here; the
 * definitions are not part of this header.
 */
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
				 enum btrfs_exclusive_operation type);
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation op);

/* Check of the path carried in a struct btrfs_ioctl_vol_args (defined elsewhere). */
int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);
977 | |
978 | /* Compatibility and incompatibility defines */ |
979 | void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, |
980 | const char *name); |
981 | void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, |
982 | const char *name); |
983 | void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, |
984 | const char *name); |
985 | void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, |
986 | const char *name); |
987 | |
/*
 * Raw feature tests: evaluate to 1 if any bit of @flags is set in the
 * incompat (respectively compat_ro) flags read from fs_info->super_copy,
 * and to 0 otherwise.
 */
#define __btrfs_fs_incompat(fs_info, flags)				\
	(!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags)))
#define __btrfs_fs_compat_ro(fs_info, flags)				\
	(!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags)))
993 | |
/*
 * Token-based wrappers around the __btrfs_*() feature helpers.
 *
 * @opt is the bare feature name: it is pasted onto BTRFS_FEATURE_INCOMPAT_ or
 * BTRFS_FEATURE_COMPAT_RO_ to form the flag and, for the set/clear variants,
 * stringified to form the name argument. Because @opt is only used with '##'
 * and '#', it is never macro-expanded itself.
 */

/* Incompat features. */
#define btrfs_set_fs_incompat(fs_info, opt)				\
	__btrfs_set_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)
#define btrfs_clear_fs_incompat(fs_info, opt)				\
	__btrfs_clear_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)
#define btrfs_fs_incompat(fs_info, opt)					\
	__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)

/* Read-only compatible features. */
#define btrfs_set_fs_compat_ro(fs_info, opt)				\
	__btrfs_set_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)
#define btrfs_clear_fs_compat_ro(fs_info, opt)				\
	__btrfs_clear_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)
#define btrfs_fs_compat_ro(fs_info, opt)				\
	__btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
1011 | |
/*
 * Mount option helpers. @opt is the bare option name and is pasted onto
 * BTRFS_MOUNT_ to form the bit mask.
 *
 * btrfs_clear_opt() and btrfs_set_opt() modify the lvalue @o in place.
 * btrfs_raw_test_opt() and btrfs_test_opt() evaluate to the masked value (not
 * normalized to 0/1), taken from @o or from fs_info->mount_opt respectively.
 */
#define btrfs_clear_opt(o, opt)						\
	((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt)						\
	((o) |= BTRFS_MOUNT_##opt)
#define btrfs_raw_test_opt(o, opt)					\
	((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(fs_info, opt)					\
	((fs_info)->mount_opt & BTRFS_MOUNT_##opt)
1017 | |
1018 | static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) |
1019 | { |
1020 | /* Do it this way so we only ever do one test_bit in the normal case. */ |
1021 | if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { |
1022 | if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) |
1023 | return 2; |
1024 | return 1; |
1025 | } |
1026 | return 0; |
1027 | } |
1028 | |
1029 | /* |
1030 | * If we remount the fs to be R/O or umount the fs, the cleaner needn't do |
1031 | * anything except sleeping. This function is used to check the status of |
1032 | * the fs. |
1033 | * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, |
1034 | * since setting and checking for SB_RDONLY in the superblock's flags is not |
1035 | * atomic. |
1036 | */ |
1037 | static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) |
1038 | { |
1039 | return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || |
1040 | btrfs_fs_closing(fs_info); |
1041 | } |
1042 | |
1043 | static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) |
1044 | { |
1045 | clear_and_wake_up_bit(bit: BTRFS_FS_UNFINISHED_DROPS, word: &fs_info->flags); |
1046 | } |
1047 | |
/* Evaluates to fs_info->fs_error, loaded with READ_ONCE(). */
#define BTRFS_FS_ERROR(fs_info)						\
	(READ_ONCE((fs_info)->fs_error))

/*
 * Tests BTRFS_FS_STATE_LOG_CLEANUP_ERROR in fs_info->fs_state; the result is
 * wrapped in unlikely().
 */
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info)				\
	(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,		\
			   &(fs_info)->fs_state)))
1053 | |
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS

/*
 * Sanity-test builds: EXPORT_FOR_TESTS expands to nothing, so a definition
 * prefixed with it is not made static.
 */
#define EXPORT_FOR_TESTS

/*
 * Return non-zero if BTRFS_FS_STATE_DUMMY_FS_INFO is set in
 * fs_info->fs_state. @fs_info is only read, hence the const qualifier.
 */
static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
	return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
}

void btrfs_test_destroy_inode(struct inode *inode);

#else

/* Regular builds: EXPORT_FOR_TESTS expands to 'static'. */
#define EXPORT_FOR_TESTS static

/* Without the sanity tests this always returns 0; @fs_info is not examined. */
static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
	return 0;
}
#endif
1074 | |
1075 | #endif |
1076 | |