// SPDX-License-Identifier: GPL-2.0

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_map_cache;

int __init extent_map_init(void)
{
	extent_map_cache = kmem_cache_create("btrfs_extent_map",
					     sizeof(struct extent_map), 0, 0, NULL);
	if (!extent_map_cache)
		return -ENOMEM;
	return 0;
}

void __cold extent_map_exit(void)
{
	kmem_cache_destroy(extent_map_cache);
}

/*
 * Initialize the extent tree @tree. Should be called for each new inode or
 * other user of the extent_map interface.
 */
void extent_map_tree_init(struct extent_map_tree *tree)
{
	tree->map = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&tree->modified_extents);
	rwlock_init(&tree->lock);
}

/*
 * Allocate a new extent_map structure. The new structure is returned with a
 * reference count of one and needs to be freed using free_extent_map().
 */
struct extent_map *alloc_extent_map(void)
{
	struct extent_map *em;

	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
	if (!em)
		return NULL;
	RB_CLEAR_NODE(&em->rb_node);
	refcount_set(&em->refs, 1);
	INIT_LIST_HEAD(&em->list);
	return em;
}

/*
 * Drop the reference count on @em by one and free the structure if the
 * reference count hits zero.
 */
void free_extent_map(struct extent_map *em)
{
	if (!em)
		return;
	if (refcount_dec_and_test(&em->refs)) {
		WARN_ON(extent_map_in_tree(em));
		WARN_ON(!list_empty(&em->list));
		kmem_cache_free(extent_map_cache, em);
	}
}
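
/*
 * Reference lifecycle sketch (editor's example, not part of the original
 * file): adding a map to a tree takes an extra reference, so the allocating
 * task must still drop its own reference when done:
 *
 *	em = alloc_extent_map();		// refs == 1 (ours)
 *	add_extent_mapping(tree, em, 0);	// refs == 2 (ours + tree's)
 *	free_extent_map(em);			// refs == 1, tree's remains
 */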

/* Do the math around the end of an extent, handling wrapping. */
static u64 range_end(u64 start, u64 len)
{
	if (start + len < start)
		return (u64)-1;
	return start + len;
}
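
/*
 * Editor's example (not part of the original file): a length that would
 * overflow u64 is clamped to the maximum offset instead of wrapping around:
 *
 *	range_end(0, 4096)		== 4096
 *	range_end((u64)-4096, 8192)	== (u64)-1	(sum would wrap)
 */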

static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_map *entry = NULL;
	struct rb_node *orig_parent = NULL;
	u64 end = range_end(em->start, em->len);
	bool leftmost = true;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct extent_map, rb_node);

		if (em->start < entry->start) {
			p = &(*p)->rb_left;
		} else if (em->start >= extent_map_end(entry)) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return -EEXIST;
		}
	}

	orig_parent = parent;
	while (parent && em->start >= extent_map_end(entry)) {
		parent = rb_next(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	parent = orig_parent;
	entry = rb_entry(parent, struct extent_map, rb_node);
	while (parent && em->start < entry->start) {
		parent = rb_prev(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	rb_link_node(&em->rb_node, orig_parent, p);
	rb_insert_color_cached(&em->rb_node, root, leftmost);
	return 0;
}
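
/*
 * Overlap semantics sketch (editor's example): tree_insert() rejects any
 * overlap rather than trimming the new entry. With an existing map covering
 * [4K, 8K):
 *
 *	insert [0K, 4K)		-> 0		(adjacent on the left)
 *	insert [6K, 10K)	-> -EEXIST	(overlaps [4K, 8K))
 *	insert [8K, 12K)	-> 0		(starts exactly at the end)
 */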

/*
 * Search through the tree for an extent_map with a given offset. If it can't
 * be found, try to find some neighboring extents.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_or_next_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct extent_map *entry;
	struct extent_map *prev_entry = NULL;

	ASSERT(prev_or_next_ret);

	while (n) {
		entry = rb_entry(n, struct extent_map, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset >= extent_map_end(entry))
			n = n->rb_right;
		else
			return n;
	}

	orig_prev = prev;
	while (prev && offset >= extent_map_end(prev_entry)) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}

	/*
	 * Found the closest extent map that ends after @offset (the "next"
	 * one), return it, as in this case the caller does not care about the
	 * previous one.
	 */
	if (prev) {
		*prev_or_next_ret = prev;
		return NULL;
	}

	prev = orig_prev;
	prev_entry = rb_entry(prev, struct extent_map, rb_node);
	while (prev && offset < prev_entry->start) {
		prev = rb_prev(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}
	*prev_or_next_ret = prev;

	return NULL;
}

static inline u64 extent_map_block_end(const struct extent_map *em)
{
	if (em->block_start + em->block_len < em->block_start)
		return (u64)-1;
	return em->block_start + em->block_len;
}

static bool can_merge_extent_map(const struct extent_map *em)
{
	if (em->flags & EXTENT_FLAG_PINNED)
		return false;

	/* Don't merge compressed extents, we need to know their actual size. */
	if (extent_map_is_compressed(em))
		return false;

	if (em->flags & EXTENT_FLAG_LOGGING)
		return false;

	/*
	 * We don't want to merge stuff that hasn't been written to the log yet
	 * since it may not reflect exactly what is on disk, and that would be
	 * bad.
	 */
	if (!list_empty(&em->list))
		return false;

	return true;
}

/* Check to see if two extent_map structs are adjacent and safe to merge. */
static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
{
	if (extent_map_end(prev) != next->start)
		return false;

	if (prev->flags != next->flags)
		return false;

	if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
		return next->block_start == extent_map_block_end(prev);

	/* HOLES and INLINE extents. */
	return next->block_start == prev->block_start;
}
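
/*
 * Merge example (editor's sketch, not part of the original file): two maps
 * merge only when both the file range and the disk range are contiguous and
 * the flags match:
 *
 *	prev: start = 0,  len = 4K, block_start = 1M
 *	next: start = 4K, len = 4K, block_start = 1M + 4K	-> mergeable
 *
 * Holes and inline extents carry special block_start values, so for those
 * the check is instead that both maps share the same special value.
 */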

static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
{
	struct extent_map *merge = NULL;
	struct rb_node *rb;

	/*
	 * We can't modify an extent map that is in the tree and that is being
	 * used by another task, as it can cause that other task to see it in
	 * inconsistent state during the merging. We always have 1 reference for
	 * the tree and 1 for this task (which is unpinning the extent map or
	 * clearing the logging flag), so anything > 2 means it's being used by
	 * other tasks too.
	 */
	if (refcount_read(&em->refs) > 2)
		return;

	if (!can_merge_extent_map(em))
		return;

	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			merge = rb_entry(rb, struct extent_map, rb_node);
		if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
			em->start = merge->start;
			em->orig_start = merge->orig_start;
			em->len += merge->len;
			em->block_len += merge->block_len;
			em->block_start = merge->block_start;
			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
			em->mod_start = merge->mod_start;
			em->generation = max(em->generation, merge->generation);
			em->flags |= EXTENT_FLAG_MERGED;

			rb_erase_cached(&merge->rb_node, &tree->map);
			RB_CLEAR_NODE(&merge->rb_node);
			free_extent_map(merge);
		}
	}

	rb = rb_next(&em->rb_node);
	if (rb)
		merge = rb_entry(rb, struct extent_map, rb_node);
	if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
		em->len += merge->len;
		em->block_len += merge->block_len;
		rb_erase_cached(&merge->rb_node, &tree->map);
		RB_CLEAR_NODE(&merge->rb_node);
		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
		em->generation = max(em->generation, merge->generation);
		em->flags |= EXTENT_FLAG_MERGED;
		free_extent_map(merge);
	}
}

/*
 * Unpin an extent from the cache.
 *
 * @inode: the inode from which we are unpinning an extent range
 * @start: logical offset in the file
 * @len:   length of the extent
 * @gen:   generation that this extent has been modified in
 *
 * Called after an extent has been written to disk properly. Set the generation
 * to the generation that actually added the file item to the inode so we know
 * we need to sync this extent when we call fsync().
 *
 * Returns: 0	     on success
 *	    -ENOENT  when the extent is not found in the tree
 *	    -EUCLEAN if the found extent does not match the expected start
 */
int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret = 0;
	struct extent_map *em;
	bool prealloc = false;

	write_lock(&tree->lock);
	em = lookup_extent_mapping(tree, start, len);

	if (WARN_ON(!em)) {
		btrfs_warn(fs_info,
"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
			   btrfs_ino(inode), btrfs_root_id(inode->root),
			   start, start + len, gen);
		ret = -ENOENT;
		goto out;
	}

	if (WARN_ON(em->start != start)) {
		btrfs_warn(fs_info,
"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
			   btrfs_ino(inode), btrfs_root_id(inode->root),
			   em->start, start, start + len, gen);
		ret = -EUCLEAN;
		goto out;
	}

	em->generation = gen;
	em->flags &= ~EXTENT_FLAG_PINNED;
	em->mod_start = em->start;
	em->mod_len = em->len;

	if (em->flags & EXTENT_FLAG_FILLING) {
		prealloc = true;
		em->flags &= ~EXTENT_FLAG_FILLING;
	}

	try_merge_map(tree, em);

	if (prealloc) {
		em->mod_start = em->start;
		em->mod_len = em->len;
	}

out:
	write_unlock(&tree->lock);
	free_extent_map(em);
	return ret;
}
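
/*
 * Usage sketch (editor's example; the caller context and field names below
 * are illustrative, not taken from this file): an ordered extent completion
 * path would unpin the range it just wrote, stamping it with the committing
 * transaction's generation so a later fsync knows to log it:
 *
 *	ret = unpin_extent_cache(inode, ordered->file_offset,
 *				 ordered->num_bytes, trans->transid);
 */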

void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	em->flags &= ~EXTENT_FLAG_LOGGING;
	if (extent_map_in_tree(em))
		try_merge_map(tree, em);
}

static inline void setup_extent_mapping(struct extent_map_tree *tree,
					struct extent_map *em,
					int modified)
{
	refcount_inc(&em->refs);
	em->mod_start = em->start;
	em->mod_len = em->len;

	ASSERT(list_empty(&em->list));

	if (modified)
		list_add(&em->list, &tree->modified_extents);
	else
		try_merge_map(tree, em);
}

/*
 * Add new extent map to the extent tree
 *
 * @tree:	tree to insert new map in
 * @em:		map to insert
 * @modified:	indicate whether the given @em should be added to the
 *		modified list, which indicates the extent needs to be logged
 *
 * Insert @em into @tree or perform a simple forward/backward merge with
 * existing mappings. The extent_map struct passed in will be inserted
 * into the tree directly, with an additional reference taken, or a
 * reference dropped if the merge attempt was successful.
 */
static int add_extent_mapping(struct extent_map_tree *tree,
			      struct extent_map *em, int modified)
{
	int ret = 0;

	lockdep_assert_held_write(&tree->lock);

	ret = tree_insert(&tree->map, em);
	if (ret)
		goto out;

	setup_extent_mapping(tree, em, modified);
out:
	return ret;
}

static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
			u64 start, u64 len, int strict)
{
	struct extent_map *em;
	struct rb_node *rb_node;
	struct rb_node *prev_or_next = NULL;
	u64 end = range_end(start, len);

	rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
	if (!rb_node) {
		if (prev_or_next)
			rb_node = prev_or_next;
		else
			return NULL;
	}

	em = rb_entry(rb_node, struct extent_map, rb_node);

	if (strict && !(end > em->start && start < extent_map_end(em)))
		return NULL;

	refcount_inc(&em->refs);
	return em;
}

/*
 * Lookup extent_map that intersects the range [start, start + len).
 *
 * @tree:	tree to lookup in
 * @start:	byte offset to start the search
 * @len:	length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, start + len) range. There may be additional objects in the tree
 * that intersect, so check the object returned carefully to make sure that no
 * additional lookups are needed.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	return __lookup_extent_mapping(tree, start, len, 1);
}
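
/*
 * Usage sketch (editor's example): the lookup takes a reference on the
 * returned map, so a caller only needs to hold the tree lock around the
 * lookup itself and must drop the reference when done:
 *
 *	read_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	read_unlock(&tree->lock);
 *	if (em) {
 *		...use em->start, em->len, em->block_start...
 *		free_extent_map(em);
 *	}
 */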

/*
 * Find a nearby extent map intersecting [start, start + len) (not an exact
 * search).
 *
 * @tree:	tree to lookup in
 * @start:	byte offset to start the search
 * @len:	length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, start + len) range.
 *
 * If one can't be found, any nearby extent may be returned.
 */
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	return __lookup_extent_mapping(tree, start, len, 0);
}

/*
 * Remove an extent_map from the extent tree.
 *
 * @tree:	extent tree to remove from
 * @em:		extent map being removed
 *
 * Remove @em from @tree. No reference counts are dropped, and no checks
 * are done to see if the range is in use.
 */
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(em->flags & EXTENT_FLAG_PINNED);
	rb_erase_cached(&em->rb_node, &tree->map);
	if (!(em->flags & EXTENT_FLAG_LOGGING))
		list_del_init(&em->list);
	RB_CLEAR_NODE(&em->rb_node);
}
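
/*
 * Usage sketch (editor's example): since no reference is dropped here, a
 * caller evicting a map must drop the reference the tree was holding:
 *
 *	write_lock(&tree->lock);
 *	remove_extent_mapping(tree, em);
 *	write_unlock(&tree->lock);
 *	free_extent_map(em);	// drop the tree's reference
 */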

static void replace_extent_mapping(struct extent_map_tree *tree,
				   struct extent_map *cur,
				   struct extent_map *new,
				   int modified)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
	ASSERT(extent_map_in_tree(cur));
	if (!(cur->flags & EXTENT_FLAG_LOGGING))
		list_del_init(&cur->list);
	rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
	RB_CLEAR_NODE(&cur->rb_node);

	setup_extent_mapping(tree, new, modified);
}

static struct extent_map *next_extent_map(const struct extent_map *em)
{
	struct rb_node *next;

	next = rb_next(&em->rb_node);
	if (!next)
		return NULL;
	return container_of(next, struct extent_map, rb_node);
}

static struct extent_map *prev_extent_map(struct extent_map *em)
{
	struct rb_node *prev;

	prev = rb_prev(&em->rb_node);
	if (!prev)
		return NULL;
	return container_of(prev, struct extent_map, rb_node);
}

/*
 * Helper for btrfs_get_extent(). Given an existing extent in the tree (the
 * nearest one to @map_start) and a new extent @em to insert, trim @em so that
 * it does not overlap its neighbors and insert the best fitting remainder
 * into the tree.
 */
static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
					 struct extent_map *existing,
					 struct extent_map *em,
					 u64 map_start)
{
	struct extent_map *prev;
	struct extent_map *next;
	u64 start;
	u64 end;
	u64 start_diff;

	if (map_start < em->start || map_start >= extent_map_end(em))
		return -EINVAL;

	if (existing->start > map_start) {
		next = existing;
		prev = prev_extent_map(next);
	} else {
		prev = existing;
		next = next_extent_map(prev);
	}

	start = prev ? extent_map_end(prev) : em->start;
	start = max_t(u64, start, em->start);
	end = next ? next->start : extent_map_end(em);
	end = min_t(u64, end, extent_map_end(em));
	start_diff = start - em->start;
	em->start = start;
	em->len = end - start;
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !extent_map_is_compressed(em)) {
		em->block_start += start_diff;
		em->block_len = em->len;
	}
	return add_extent_mapping(em_tree, em, 0);
}

/*
 * Add extent mapping into em_tree.
 *
 * @fs_info:	the filesystem
 * @em_tree:	extent tree into which we want to insert the extent mapping
 * @em_in:	extent we are inserting
 * @start:	start of the logical range btrfs_get_extent() is requesting
 * @len:	length of the logical range btrfs_get_extent() is requesting
 *
 * Note that @em_in's range may be different from [start, start+len),
 * but they must be overlapping.
 *
 * Insert @em_in into @em_tree. In case there is an overlapping range, handle
 * the -EEXIST by either:
 * a) Returning the existing extent in @em_in if @start is within the
 *    existing em.
 * b) Merge the existing extent with @em_in passed in.
 *
 * Return 0 on success, otherwise -EEXIST.
 */
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
			     struct extent_map_tree *em_tree,
			     struct extent_map **em_in, u64 start, u64 len)
{
	int ret;
	struct extent_map *em = *em_in;

	/*
	 * Tree-checker should have rejected any inline extent with non-zero
	 * file offset. Here just do a sanity check.
	 */
	if (em->block_start == EXTENT_MAP_INLINE)
		ASSERT(em->start == 0);

	ret = add_extent_mapping(em_tree, em, 0);
	/*
	 * It is possible that someone inserted the extent into the tree while
	 * we had the lock dropped. It is also possible that an overlapping map
	 * exists in the tree.
	 */
	if (ret == -EEXIST) {
		struct extent_map *existing;

		existing = search_extent_mapping(em_tree, start, len);

		trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);

		/*
		 * existing will always be non-NULL, since there must be an
		 * extent causing the -EEXIST.
		 */
		if (start >= existing->start &&
		    start < extent_map_end(existing)) {
			free_extent_map(em);
			*em_in = existing;
			ret = 0;
		} else {
			u64 orig_start = em->start;
			u64 orig_len = em->len;

			/*
			 * The existing extent map is the one nearest to
			 * the [start, start + len) range which overlaps.
			 */
			ret = merge_extent_mapping(em_tree, existing,
						   em, start);
			if (WARN_ON(ret)) {
				free_extent_map(em);
				*em_in = NULL;
				btrfs_warn(fs_info,
"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
					   existing->start, extent_map_end(existing),
					   orig_start, orig_start + orig_len, start);
			}
			free_extent_map(existing);
		}
	}

	ASSERT(ret == 0 || ret == -EEXIST);
	return ret;
}
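
/*
 * Caller pattern sketch (editor's example): on an overlap, @em_in may be
 * replaced with the map that is already in the tree, so callers must use
 * *em_in after the call rather than the pointer they originally passed:
 *
 *	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
 *	if (ret == 0) {
 *		// em now points at whichever map covers @start, which may
 *		// not be the map that was originally allocated.
 *	}
 */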

/*
 * Drop all extent maps from a tree in the fastest possible way, rescheduling
 * if needed. This avoids searching the tree, from the root down to the first
 * extent map, before each deletion.
 */
static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
{
	write_lock(&tree->lock);
	while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
		struct extent_map *em;
		struct rb_node *node;

		node = rb_first_cached(&tree->map);
		em = rb_entry(node, struct extent_map, rb_node);
		em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
		remove_extent_mapping(tree, em);
		free_extent_map(em);
		cond_resched_rwlock_write(&tree->lock);
	}
	write_unlock(&tree->lock);
}

/*
 * Drop all extent maps in a given range.
 *
 * @inode:	The target inode.
 * @start:	Start offset of the range.
 * @end:	End offset of the range (inclusive value).
 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
 *
 * This drops all the extent maps that intersect the given range [@start, @end].
 * Extent maps that partially overlap the range and extend before or beyond it
 * are split.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
				 bool skip_pinned)
{
	struct extent_map *split;
	struct extent_map *split2;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	u64 len = end - start + 1;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		if (start == 0 && !skip_pinned) {
			drop_all_extent_maps_fast(em_tree);
			return;
		}
		len = (u64)-1;
	} else {
		/* Make end offset exclusive for use in the loop below. */
		end++;
	}

	/*
	 * It's ok if we fail to allocate the extent maps, see the comment near
	 * the bottom of the loop below. We only need two spare extent maps in
	 * the worst case, where the first extent map that intersects our range
	 * starts before the range and the last extent map that intersects our
	 * range ends after our range (and they might be the same extent map),
	 * because we need to split those two extent maps at the boundaries.
	 */
	split = alloc_extent_map();
	split2 = alloc_extent_map();

	write_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);

	while (em) {
		/* extent_map_end() returns exclusive value (last byte + 1). */
		const u64 em_end = extent_map_end(em);
		struct extent_map *next_em = NULL;
		u64 gen;
		unsigned long flags;
		bool modified;
		bool compressed;

		if (em_end < end) {
			next_em = next_extent_map(em);
			if (next_em) {
				if (next_em->start < end)
					refcount_inc(&next_em->refs);
				else
					next_em = NULL;
			}
		}

		if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
			start = em_end;
			goto next;
		}

		flags = em->flags;
		/*
		 * In case we split the extent map, we want to preserve the
		 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
		 * it on the new extent maps.
		 */
		em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
		modified = !list_empty(&em->list);

		/*
		 * The extent map does not cross our target range, so no need to
		 * split it, we can remove it directly.
		 */
		if (em->start >= start && em_end <= end)
			goto remove_em;

		gen = em->generation;
		compressed = extent_map_is_compressed(em);

		if (em->start < start) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = em->start;
			split->len = start - em->start;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
							    em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

			split->generation = gen;
			split->flags = flags;
			replace_extent_mapping(em_tree, em, split, modified);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em_end > end) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = end;
			split->len = em_end - end;
			split->block_start = em->block_start;
			split->flags = flags;
			split->generation = gen;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
							    em->orig_block_len);

				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->orig_start = em->orig_start;
				} else {
					const u64 diff = start + len - em->start;

					split->block_len = split->len;
					split->block_start += diff;
					split->orig_start = em->orig_start;
				}
			} else {
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->orig_block_len = 0;
			}

			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				int ret;

				ret = add_extent_mapping(em_tree, split,
							 modified);
				/* Logic error, shouldn't happen. */
				ASSERT(ret == 0);
				if (WARN_ON(ret != 0) && modified)
					btrfs_set_inode_full_sync(inode);
			}
			free_extent_map(split);
			split = NULL;
		}
remove_em:
		if (extent_map_in_tree(em)) {
			/*
			 * If the extent map is still in the tree it means that
			 * either of the following is true:
			 *
			 * 1) It fits entirely in our range (doesn't end beyond
			 *    it or starts before it);
			 *
			 * 2) It starts before our range and/or ends after our
			 *    range, and we were not able to allocate the extent
			 *    maps for split operations, @split and @split2.
			 *
			 * If we are at case 2) then we just remove the entire
			 * extent map - this is fine since if anyone needs it to
			 * access the subranges outside our range, it will just
			 * load it again from the subvolume tree's file extent
			 * item. However if the extent map was in the list of
			 * modified extents, then we must mark the inode for a
			 * full fsync, otherwise a fast fsync will miss this
			 * extent if it's new and needs to be logged.
			 */
			if ((em->start < start || em_end > end) && modified) {
				ASSERT(!split);
				btrfs_set_inode_full_sync(inode);
			}
			remove_extent_mapping(em_tree, em);
		}

		/*
		 * Once for the tree reference (we replaced or removed the
		 * extent map from the tree).
		 */
		free_extent_map(em);
next:
		/* Once for us (for our lookup reference). */
		free_extent_map(em);

		em = next_em;
	}

	write_unlock(&em_tree->lock);

	free_extent_map(split);
	free_extent_map(split2);
}
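
/*
 * Worked example (editor's sketch): dropping the range [4K, 8K - 1] from a
 * tree holding a single map that covers [0, 16K) leaves two maps, built from
 * the preallocated @split and @split2:
 *
 *	before:	[0 ................ 16K)
 *	drop:	      [4K ... 8K)
 *	after:	[0 .. 4K)   [8K ... 16K)
 *
 * If both split allocations had failed, the whole [0, 16K) map would be
 * removed instead, and the inode flagged for a full fsync if the map was in
 * the modified list.
 */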

/*
 * Replace a range in the inode's extent map tree with a new extent map.
 *
 * @inode:	The target inode.
 * @new_em:	The new extent map to add to the inode's extent map tree.
 * @modified:	Indicate if the new extent map should be added to the list of
 *		modified extents (for fast fsync tracking).
 *
 * Drops all the extent maps in the inode's extent map tree that intersect the
 * range of the new extent map and adds the new extent map to the tree.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
				   struct extent_map *new_em,
				   bool modified)
{
	const u64 end = new_em->start + new_em->len - 1;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret;

	ASSERT(!extent_map_in_tree(new_em));

	/*
	 * The caller has locked an appropriate file range in the inode's io
	 * tree, but getting -EEXIST when adding the new extent map can still
	 * happen in case there are extents that partially cover the range, and
	 * this is due to two tasks operating on different parts of the extent.
	 * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
	 * btrfs_get_extent") for an example and details.
	 */
	do {
		btrfs_drop_extent_map_range(inode, new_em->start, end, false);
		write_lock(&tree->lock);
		ret = add_extent_mapping(tree, new_em, modified);
		write_unlock(&tree->lock);
	} while (ret == -EEXIST);

	return ret;
}

/*
 * Split off the first @pre bytes from the extent_map at [start, start + len),
 * and set the block_start for it to @new_logical.
 *
 * This function is used when an ordered_extent needs to be split.
 */
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
		     u64 new_logical)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	struct extent_map *split_pre = NULL;
	struct extent_map *split_mid = NULL;
	int ret = 0;
	unsigned long flags;

	ASSERT(pre != 0);
	ASSERT(pre < len);

	split_pre = alloc_extent_map();
	if (!split_pre)
		return -ENOMEM;
	split_mid = alloc_extent_map();
	if (!split_mid) {
		ret = -ENOMEM;
		goto out_free_pre;
	}

	lock_extent(&inode->io_tree, start, start + len - 1, NULL);
	write_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (!em) {
		ret = -EIO;
		goto out_unlock;
	}

	ASSERT(em->len == len);
	ASSERT(!extent_map_is_compressed(em));
	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
	ASSERT(em->flags & EXTENT_FLAG_PINNED);
	ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
	ASSERT(!list_empty(&em->list));

	flags = em->flags;
	em->flags &= ~EXTENT_FLAG_PINNED;

	/* First, replace the em with a new extent_map starting from em->start. */
	split_pre->start = em->start;
	split_pre->len = pre;
	split_pre->orig_start = split_pre->start;
	split_pre->block_start = new_logical;
	split_pre->block_len = split_pre->len;
	split_pre->orig_block_len = split_pre->block_len;
	split_pre->ram_bytes = split_pre->len;
	split_pre->flags = flags;
	split_pre->generation = em->generation;

	replace_extent_mapping(em_tree, em, split_pre, 1);

	/*
	 * Now we only have an extent_map at:
	 *     [em->start, em->start + pre)
	 */

	/* Insert the middle extent_map. */
	split_mid->start = em->start + pre;
	split_mid->len = em->len - pre;
	split_mid->orig_start = split_mid->start;
	split_mid->block_start = em->block_start + pre;
	split_mid->block_len = split_mid->len;
	split_mid->orig_block_len = split_mid->block_len;
	split_mid->ram_bytes = split_mid->len;
	split_mid->flags = flags;
	split_mid->generation = em->generation;
	add_extent_mapping(em_tree, split_mid, 1);

	/* Once for us */
	free_extent_map(em);
	/* Once for the tree */
	free_extent_map(em);

out_unlock:
	write_unlock(&em_tree->lock);
	unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
	free_extent_map(split_mid);
out_free_pre:
	free_extent_map(split_pre);
	return ret;
}
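
/*
 * Worked example (editor's sketch): splitting a pinned 12K map at pre = 4K,
 * where the first 4K were written to a new location @new_logical:
 *
 *	before:	[start, start + 12K)		block_start = B
 *	after:	[start, start + 4K)		block_start = new_logical
 *		[start + 4K, start + 12K)	block_start = B + 4K
 *
 * Both halves inherit the original flags (including EXTENT_FLAG_PINNED) and
 * generation, and both are added to the modified list.
 */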