// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

/*
 * HOW DOES SPACE RESERVATION WORK
 *
 * If you want to know about delalloc specifically, there is a separate comment
 * for that with the delalloc code. This comment is about how the whole system
 * works generally.
 *
 * BASIC CONCEPTS
 *
 * 1) space_info. This is the ultimate arbiter of how much space we can use.
 *    There's a description of the bytes_ fields with the struct declaration,
 *    refer to that for specifics on each field. Suffice it to say that for
 *    reservations we care about total_bytes - SUM(space_info->bytes_) when
 *    determining if there is space to make an allocation. There is a
 *    space_info for METADATA, SYSTEM, and DATA areas.
 *
 * 2) block_rsv's. These are basically buckets for every different type of
 *    metadata reservation we have. You can see the comment in the block_rsv
 *    code on the rules for each type, but generally block_rsv->reserved is
 *    how much space is accounted for in space_info->bytes_may_use.
 *
 * 3) btrfs_calc*_size. These are the worst case calculations we make based
 *    on the number of items we will want to modify. We have one for changing
 *    items, and one for inserting new items. Generally we use these helpers
 *    to determine the size of the block reserves, and then use the actual
 *    bytes values to adjust the space_info counters.
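 *
 *    As an illustrative sketch (helper signatures vary between kernel
 *    versions), sizing and backing a reservation for changing one item
 *    could look like:
 *
 *      u64 bytes = btrfs_calc_metadata_size(fs_info, 1);
 *      int ret = btrfs_block_rsv_add(fs_info, rsv, bytes,
 *                                    BTRFS_RESERVE_FLUSH_ALL);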
 *
 * MAKING RESERVATIONS, THE NORMAL CASE
 *
 * We call into either btrfs_reserve_data_bytes() or
 * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
 * num_bytes we want to reserve.
 *
 * ->reserve
 *   space_info->bytes_may_use += num_bytes
 *
 * ->extent allocation
 *   Call btrfs_add_reserved_bytes() which does
 *   space_info->bytes_may_use -= num_bytes
 *   space_info->bytes_reserved += extent_bytes
 *
 * ->insert reference
 *   Call btrfs_update_block_group() which does
 *   space_info->bytes_reserved -= extent_bytes
 *   space_info->bytes_used += extent_bytes
 *
 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
 *
 * Assume we are unable to simply make the reservation because we do not have
 * enough space
 *
 * -> __reserve_bytes
 *    create a reserve_ticket with ->bytes set to our reservation, add it to
 *    the tail of space_info->tickets, kick async flush thread
 *
 * -> handle_reserve_ticket
 *    wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be
 *    set on the ticket.
 *
 * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
 *    Flushes various things attempting to free up space.
 *
 * -> btrfs_try_granting_tickets()
 *    This is called by anything that either subtracts space from
 *    space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
 *    space_info->total_bytes. This loops through the ->priority_tickets and
 *    then the ->tickets list checking to see if the reservation can be
 *    completed. If it can the space is added to space_info->bytes_may_use
 *    and the ticket is woken up.
 *
 * -> ticket wakeup
 *    Check if ->bytes == 0; if so we got our reservation and we can carry
 *    on, if not return the appropriate error (ENOSPC, but can be EINTR if
 *    we were interrupted).
 *
 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
 *
 * Same as the above, except we add ourselves to the
 * space_info->priority_tickets, and we do not use ticket->wait, we simply
 * call flush_space() ourselves for the states that are safe for us to call
 * without deadlocking and hope for the best.
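 *
 * As a sketch, the priority path boils down to (see
 * priority_reclaim_metadata_space() later in this file):
 *
 *   for each state in priority_flush_states:
 *     flush_space(fs_info, space_info, to_reclaim, state)
 *     if (ticket->bytes == 0)
 *       the reservation was granted, stop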
 *
 * THE FLUSHING STATES
 *
 * Generally speaking we will have two cases for each state, a "nice" state
 * and an "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
 * reduce the locking overhead on the various trees, and even to keep from
 * doing any work at all in the case of delayed refs. Each of these delayed
 * things however hold reservations, and so letting them run allows us to
 * reclaim space so we can make new reservations.
 *
 * FLUSH_DELAYED_ITEMS
 *   Every inode has a delayed item to update the inode. Take a simple write
 *   for example, we would update the inode item at write time to update the
 *   mtime, and then again at finish_ordered_io() time in order to update the
 *   isize or bytes. We keep these delayed items to coalesce these operations
 *   into a single operation done on demand. These are an easy way to reclaim
 *   metadata space.
 *
 * FLUSH_DELALLOC
 *   Look at the delalloc comment to get an idea of how much space is reserved
 *   for delayed allocation. We can reclaim some of this space simply by
 *   running delalloc, but usually we need to wait for ordered extents to
 *   reclaim the bulk of this space.
 *
 * FLUSH_DELAYED_REFS
 *   We have a block reserve for the outstanding delayed refs space, and every
 *   delayed ref operation holds a reservation. Running these is a quick way
 *   to reclaim space, but we want to hold this until the end because COW can
 *   churn a lot and we can avoid making some extent tree modifications if we
 *   are able to delay for as long as possible.
 *
 * ALLOC_CHUNK
 *   We will skip this the first time through space reservation, because of
 *   overcommit and because we don't want to have a lot of useless metadata
 *   space when our worst case reservations will likely never come true.
 *
 * RUN_DELAYED_IPUTS
 *   If we're freeing inodes we're likely freeing checksums, file extent
 *   items, and extent tree items. Loads of space could be freed up by these
 *   operations, however they won't be usable until the transaction commits.
 *
 * COMMIT_TRANS
 *   This will commit the transaction. Historically we had a lot of logic
 *   surrounding whether or not we'd commit the transaction, but this was
 *   born out of a pre-tickets era where we could end up committing the
 *   transaction thousands of times in a row without making progress. Now
 *   thanks to our ticketing system we know if we're not making progress and
 *   can error everybody out after a few commits rather than burning the disk
 *   hoping for a different answer.
 *
 * OVERCOMMIT
 *
 * Because we hold so many reservations for metadata we will allow you to
 * reserve more space than is currently free in the currently allocated
 * metadata space. This only happens with metadata, data does not allow
 * overcommitting.
 *
 * You can see the current logic for when we allow overcommit in
 * btrfs_can_overcommit(), but it only applies to unallocated space. If there
 * is no unallocated space to be had, all reservations are kept within the
 * free space in the allocated metadata chunks.
 *
 * Because of overcommitting, you generally want to use the
 * btrfs_can_overcommit() logic for metadata allocations, as it does the right
 * thing with or without extra unallocated space.
 */

u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
				 bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		s_info->bytes_zone_unusable +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	list_for_each_entry(found, head, list)
		found->full = 0;
}

/*
 * Block groups with more than this value (percent) of unusable space will be
 * scheduled for background reclaim.
 */
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH	(75)

/*
 * Calculate chunk size depending on volume type (regular or zoned).
 */
static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
{
	if (btrfs_is_zoned(fs_info))
		return fs_info->zone_size;

	ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);

	if (flags & BTRFS_BLOCK_GROUP_DATA)
		return BTRFS_MAX_DATA_CHUNK_SIZE;
	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return SZ_32M;

	/* Handle BTRFS_BLOCK_GROUP_METADATA */
	if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
		return SZ_1G;

	return SZ_256M;
}

/*
 * Update default chunk size.
 */
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
					u64 chunk_size)
{
	WRITE_ONCE(space_info->chunk_size, chunk_size);
}

static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);
	space_info->clamp = 1;
	btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));

	if (btrfs_is_zoned(info))
		space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
				struct btrfs_block_group *block_group)
{
	struct btrfs_space_info *found;
	int factor, index;

	factor = btrfs_bg_type_to_factor(block_group->flags);

	found = btrfs_find_space_info(info, block_group->flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += block_group->length;
	found->disk_total += block_group->length * factor;
	found->bytes_used += block_group->used;
	found->disk_used += block_group->used * factor;
	found->bytes_readonly += block_group->bytes_super;
	found->bytes_zone_unusable += block_group->zone_unusable;
	if (block_group->length > 0)
		found->full = 0;
	btrfs_try_granting_tickets(info, found);
	spin_unlock(&found->lock);

	block_group->space_info = found;

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	down_write(&found->groups_sem);
	list_add_tail(&block_group->list, &found->block_groups[index]);
	up_write(&found->groups_sem);
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	list_for_each_entry(found, head, list) {
		if (found->flags & flags)
			return found;
	}
	return NULL;
}

static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_space_info *data_sinfo;
	u64 profile;
	u64 avail;
	u64 data_chunk_size;
	int factor;

	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable. For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);
	if (avail == 0)
		return 0;

	/*
	 * Calculate the data_chunk_size. space_info->chunk_size is the
	 * "optimal" chunk size based on the fs size. However when we actually
	 * allocate the chunk we will strip this down further, making it no
	 * more than 10% of the disk or 1G, whichever is smaller.
	 */
	data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
	data_chunk_size = min(data_sinfo->chunk_size,
			      mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
	data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);

	/*
	 * Since data allocations immediately use block groups as part of the
	 * reservation, because we assume that data reservations will == actual
	 * usage, we could potentially overcommit and then immediately have that
	 * available space used by a data allocation, which could put us in a
	 * bind when we get close to filling the file system.
	 *
	 * To handle this simply remove the data_chunk_size from the available
	 * space. If we are relatively empty this won't affect our ability to
	 * overcommit much, and if we're very close to full it'll keep us from
	 * getting into a position where we've given ourselves very little
	 * metadata wiggle room.
	 */
	if (avail <= data_chunk_size)
		return 0;
	avail -= data_chunk_size;

	/*
	 * If we aren't flushing all things, let us overcommit up to half of
	 * the space. If we can flush, don't let us overcommit too much, let
	 * it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;
	return avail;
}

int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
			 struct btrfs_space_info *space_info, u64 bytes,
			 enum btrfs_reserve_flush_enum flush)
{
	u64 avail;
	u64 used;

	/* Don't overcommit when in mixed mode */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	used = btrfs_space_info_used(space_info, true);
	avail = calc_available_free_space(fs_info, space_info, flush);

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}
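
/*
 * Illustrative caller pattern (a sketch only; the real reservation fast
 * path lives in __reserve_bytes(), which is not part of this excerpt):
 *
 *	used = btrfs_space_info_used(space_info, true);
 *	if (used + bytes <= space_info->total_bytes ||
 *	    btrfs_can_overcommit(fs_info, space_info, bytes, flush)) {
 *		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
 *						      bytes);
 *		ret = 0;
 *	}
 */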

static void remove_ticket(struct btrfs_space_info *space_info,
			  struct reserve_ticket *ticket)
{
	if (!list_empty(&ticket->list)) {
		list_del_init(&ticket->list);
		ASSERT(space_info->reclaim_size >= ticket->bytes);
		space_info->reclaim_size -= ticket->bytes;
	}
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	lockdep_assert_held(&space_info->lock);

	head = &space_info->priority_tickets;
again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
					 flush)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			remove_ticket(space_info, ticket);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
{
	switch (space_info->flags) {
	case BTRFS_BLOCK_GROUP_SYSTEM:
		return "SYSTEM";
	case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
		return "DATA+METADATA";
	case BTRFS_BLOCK_GROUP_DATA:
		return "DATA";
	case BTRFS_BLOCK_GROUP_METADATA:
		return "METADATA";
	default:
		return "UNKNOWN";
	}
}

static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}

static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *info)
{
	const char *flag_str = space_info_flag_to_str(info);

	lockdep_assert_held(&info->lock);

	/* The free space could be negative in case of overcommit */
	btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
		   flag_str,
		   (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
		   info->total_bytes, info->bytes_used, info->bytes_pinned,
		   info->bytes_reserved, info->bytes_may_use,
		   info->bytes_readonly, info->bytes_zone_unusable);
}

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group *cache;
	u64 total_avail = 0;
	int index = 0;

	spin_lock(&info->lock);
	__btrfs_dump_space_info(fs_info, info);
	dump_global_block_rsv(fs_info);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		u64 avail;

		spin_lock(&cache->lock);
		avail = cache->length - cache->used - cache->pinned -
			cache->reserved - cache->delalloc_bytes -
			cache->bytes_super - cache->zone_unusable;
		btrfs_info(fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
			   cache->start, cache->length, cache->used, cache->pinned,
			   cache->reserved, cache->delalloc_bytes,
			   cache->bytes_super, cache->zone_unusable,
			   avail, cache->ro ? "[readonly]" : "");
		spin_unlock(&cache->lock);
		btrfs_dump_free_space(cache, bytes);
		total_avail += avail;
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);

	btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail);
}

static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}
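
/*
 * Worked example, assuming the common 16KiB nodesize: one inserted item is
 * costed at nodesize * BTRFS_MAX_LEVEL * 2 = 16KiB * 8 * 2 = 256KiB of
 * worst case metadata, so a request to reclaim 1MiB maps to 4 items.
 */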

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *space_info,
			    u64 to_reclaim, bool wait_ordered,
			    bool for_preempt)
{
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 ordered_bytes;
	u64 items;
	long time_left;
	int loops;

	delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
	if (delalloc_bytes == 0 && ordered_bytes == 0)
		return;

	/* Calculate the number of pages we need to flush for space reservation */
	if (to_reclaim == U64_MAX) {
		items = U64_MAX;
	} else {
		/*
		 * to_reclaim is set to however much metadata we need to
		 * reclaim, but reclaiming that much data doesn't really track
		 * exactly. What we really want to do is reclaim a full
		 * inode's worth of reservations, however that's not available
		 * to us here. We will take a fraction of the delalloc bytes
		 * for our flushing loops and hope for the best. Delalloc will
		 * expand the amount we write to cover an entire dirty extent,
		 * which will reclaim the metadata reservation for that range.
		 * If it's not enough subsequent flush stages will be more
		 * aggressive.
		 */
		to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
	}

	trans = current->journal_info;

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (ordered_bytes > delalloc_bytes && !for_preempt)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || ordered_bytes) && loops < 3) {
		u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
		long nr_pages = min_t(u64, temp, LONG_MAX);
		int async_pages;

		btrfs_start_delalloc_roots(fs_info, nr_pages, true);

		/*
		 * We need to make sure any outstanding async pages are now
		 * processed before we continue. This is because things like
		 * sync_inode() try to be smart and skip writing if the inode
		 * is marked clean. We don't use filemap_fdatawrite() for
		 * flushing because we want to control how many pages we write
		 * out at a time, thus this is the only safe way to make sure
		 * we've waited for outstanding compressed workers to have
		 * started their jobs and thus have ordered extents set up
		 * properly.
		 *
		 * This exists because we do not want to wait for each
		 * individual inode to finish its async work, we simply want to
		 * start the IO on everybody, and then come back here and wait
		 * for all of the async work to catch up. Once we're done with
		 * that we know we'll have ordered extents for everything and we
		 * can decide if we wait for that or not.
		 *
		 * If we choose to replace this in the future, make absolutely
		 * sure that the proper waiting is being done in the async case,
		 * as there have been bugs in that area before.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * We don't want to wait forever, if we wrote fewer pages in
		 * this loop than we have outstanding, only wait for that
		 * number of pages, otherwise we can wait for all async pages
		 * to finish before continuing.
		 */
		if (async_pages > nr_pages)
			async_pages -= nr_pages;
		else
			async_pages = 0;
		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   async_pages);
skip_async:
		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}

		/*
		 * If we are flushing for preemption we just want a one-shot
		 * of delalloc flushing so we can stop flushing if we decide
		 * we don't need to anymore.
		 */
		if (for_preempt)
			break;

		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		ordered_bytes = percpu_counter_sum_positive(
						&fs_info->ordered_bytes);
	}
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			enum btrfs_flush_state state, bool for_preempt)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction_nostart(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			if (ret == -ENOENT)
				ret = 0;
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
	case FLUSH_DELALLOC_FULL:
		if (state == FLUSH_DELALLOC_FULL)
			num_bytes = U64_MAX;
		shrink_delalloc(fs_info, space_info, num_bytes,
				state != FLUSH_DELALLOC, for_preempt);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction_nostart(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			if (ret == -ENOENT)
				ret = 0;
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			btrfs_run_delayed_refs(trans, num_bytes);
		else
			btrfs_run_delayed_refs(trans, 0);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_get_alloc_profile(fs_info, space_info->flags),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);

		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ASSERT(current->journal_info == NULL);
		/*
		 * We don't want to start a new transaction, just attach to the
		 * current one or wait it fully commits in case its commit is
		 * happening at the moment. Note: we don't use a nostart join
		 * because that does not wait for a transaction to fully commit
		 * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
		 */
		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			if (ret == -ENOENT)
				ret = 0;
			break;
		}
		ret = btrfs_commit_transaction(trans);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret, for_preempt);
	return;
}

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info)
{
	u64 used;
	u64 avail;
	u64 to_reclaim = space_info->reclaim_size;

	lockdep_assert_held(&space_info->lock);

	avail = calc_available_free_space(fs_info, space_info,
					  BTRFS_RESERVE_FLUSH_ALL);
	used = btrfs_space_info_used(space_info, true);

	/*
	 * We may be flushing because suddenly we have less space than we had
	 * before, and now we're well over-committed based on our current free
	 * space. If that's the case add in our overage so we make sure to put
	 * appropriate pressure on the flushing state machine.
	 */
	if (space_info->total_bytes + avail < used)
		to_reclaim += used - (space_info->total_bytes + avail);

	return to_reclaim;
}

static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info)
{
	const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
	u64 ordered, delalloc;
	u64 thresh;
	u64 used;

	thresh = mult_perc(space_info->total_bytes, 90);

	lockdep_assert_held(&space_info->lock);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved +
	     global_rsv_size) >= thresh)
		return false;

	used = space_info->bytes_may_use + space_info->bytes_pinned;

	/* The total flushable belongs to the global rsv, don't flush. */
	if (global_rsv_size >= used)
		return false;

	/*
	 * 128MiB is 1/4 of the maximum global rsv size. If we have less than
	 * that devoted to other reservations then there's no sense in flushing,
	 * we don't have a lot of things that need flushing.
	 */
	if (used - global_rsv_size <= SZ_128M)
		return false;

	/*
	 * We have tickets queued, bail so we don't compete with the async
	 * flushers.
	 */
	if (space_info->reclaim_size)
		return false;

	/*
	 * If we have over half of the free space occupied by reservations or
	 * pinned then we want to start flushing.
	 *
	 * We do not do the traditional thing here, which is to say
	 *
	 *   if (used >= ((total_bytes + avail) / 2))
	 *     return 1;
	 *
	 * because this doesn't quite work how we want. If we had more than 50%
	 * of the space_info used by bytes_used and we had 0 available we'd just
	 * constantly run the background flusher. Instead we want it to kick in
	 * if our reclaimable space exceeds our clamped free space.
	 *
	 * Our clamping range is 2^1 -> 2^8. Practically speaking that means
	 * the following:
	 *
	 *   Amount of free space    Minimum threshold    Maximum threshold
	 *
	 *   256GiB                  1GiB                 128GiB
	 *   128GiB                  512MiB               64GiB
	 *   64GiB                   256MiB               32GiB
	 *   32GiB                   128MiB               16GiB
	 *   16GiB                   64MiB                8GiB
	 *
	 * These are the range our thresholds will fall in, corresponding to how
	 * much delalloc we need for the background flusher to kick in.
	 */

	thresh = calc_available_free_space(fs_info, space_info,
					   BTRFS_RESERVE_FLUSH_ALL);
	used = space_info->bytes_used + space_info->bytes_reserved +
	       space_info->bytes_readonly + global_rsv_size;
	if (used < space_info->total_bytes)
		thresh += space_info->total_bytes - used;
	thresh >>= space_info->clamp;

	used = space_info->bytes_pinned;

	/*
	 * If we have more ordered bytes than delalloc bytes then we're either
	 * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
	 * around. Preemptive flushing is only useful in that it can free up
	 * space before tickets need to wait for things to finish. In the case
	 * of ordered extents, preemptively waiting on ordered extents gets us
	 * nothing, if our reservations are tied up in ordered extents we'll
	 * simply have to slow down writers by forcing them to wait on ordered
	 * extents.
	 *
	 * In the case that ordered is larger than delalloc, only include the
	 * block reserves that we would actually be able to directly reclaim
	 * from. In this case if we're heavy on metadata operations this will
	 * clearly be heavy enough to warrant preemptive flushing. In the case
	 * of heavy DIO or ordered reservations, preemptive flushing will just
	 * waste time and cause us to slow down.
	 *
	 * We want to make sure we truly are maxed out on ordered however, so
	 * cut ordered in half, and if it's still higher than delalloc then we
	 * can keep flushing. This is to avoid the case where we start
	 * flushing, and now delalloc == ordered and we stop preemptively
	 * flushing when we could still have several gigs of delalloc to flush.
	 */
	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
	if (ordered >= delalloc)
		used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
			btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
	else
		used += space_info->bytes_may_use - global_rsv_size;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info,
				  struct reserve_ticket *ticket)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 min_bytes;

	if (!ticket->steal)
		return false;

	if (global_rsv->space_info != space_info)
		return false;

	spin_lock(&global_rsv->lock);
	min_bytes = mult_perc(global_rsv->size, 10);
	if (global_rsv->reserved < min_bytes + ticket->bytes) {
		spin_unlock(&global_rsv->lock);
		return false;
	}
	global_rsv->reserved -= ticket->bytes;
	remove_ticket(space_info, ticket);
	ticket->bytes = 0;
	wake_up(&ticket->wait);
	space_info->tickets_id++;
	if (global_rsv->reserved < global_rsv->size)
		global_rsv->full = 0;
	spin_unlock(&global_rsv->lock);

	return true;
}
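
/*
 * Example with round numbers for the function above: if the global rsv size
 * is 512MiB, min_bytes is ~51MiB, so a 4MiB ticket is stolen only while the
 * global rsv still has at least ~55MiB reserved.
 */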

/*
 * We've exhausted our flushing, start failing tickets.
 *
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets. The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets. This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
	const bool aborted = BTRFS_FS_ERROR(fs_info);

	trace_btrfs_fail_all_tickets(fs_info, space_info);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
	}

	while (!list_empty(&space_info->tickets) &&
	       tickets_id == space_info->tickets_id) {
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);

		if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
			return true;

		if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_info(fs_info, "failing ticket with %llu bytes",
				   ticket->bytes);

		remove_ticket(space_info, ticket);
		if (aborted)
			ticket->error = -EIO;
		else
			ticket->error = -ENOSPC;
		wake_up(&ticket->wait);

		/*
		 * We're just throwing tickets away, so more flushing may not
		 * trip over btrfs_try_granting_tickets, so we need to call it
		 * here to see if we can make progress with the next ticket in
		 * the list.
		 */
		if (!aborted)
			btrfs_try_granting_tickets(fs_info, space_info);
	}
	return (tickets_id != space_info->tickets_id);
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	enum btrfs_flush_state flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state, false);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We do not want to empty the system of delalloc unless we're
		 * under heavy pressure, so allow one trip through the flushing
		 * logic before we start doing a FLUSH_DELALLOC_FULL.
		 */
		if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
			flush_state++;

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space. Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim. We would rather use that than possibly create an
		 * underutilized metadata chunk. So if this is our first run
		 * through the flushing state machine skip ALLOC_CHUNK_FORCE
		 * and commit the transaction. If nothing has changed the next
		 * go around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (maybe_fail_all_tickets(fs_info, space_info)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

/*
 * This handles pre-flushing of metadata space before we get to the point that
 * we need to start blocking threads on tickets. The logic here is different
 * from the other flush paths because it doesn't rely on tickets to tell us how
 * much we need to flush, instead it attempts to keep us below the 80% full
 * watermark of space by flushing whichever reservation pool is currently the
 * largest.
 */
static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv;
	struct btrfs_block_rsv *global_rsv;
	struct btrfs_block_rsv *trans_rsv;
	int loops = 0;

	fs_info = container_of(work, struct btrfs_fs_info,
			       preempt_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	delayed_block_rsv = &fs_info->delayed_block_rsv;
	delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	global_rsv = &fs_info->global_block_rsv;
	trans_rsv = &fs_info->trans_block_rsv;

	spin_lock(&space_info->lock);
	while (need_preemptive_reclaim(fs_info, space_info)) {
		enum btrfs_flush_state flush;
		u64 delalloc_size = 0;
		u64 to_reclaim, block_rsv_size;
		const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);

		loops++;

		/*
		 * We don't have a precise counter for the metadata being
		 * reserved for delalloc, so we'll approximate it by subtracting
		 * out the block rsv's space from the bytes_may_use. If that
		 * amount is higher than the individual reserves, then we can
		 * assume it's tied up in delalloc reservations.
		 */
		block_rsv_size = global_rsv_size +
			btrfs_block_rsv_reserved(delayed_block_rsv) +
			btrfs_block_rsv_reserved(delayed_refs_rsv) +
			btrfs_block_rsv_reserved(trans_rsv);
		if (block_rsv_size < space_info->bytes_may_use)
			delalloc_size = space_info->bytes_may_use - block_rsv_size;

		/*
		 * We don't want to include the global_rsv in our calculation,
		 * because that's space we can't touch. Subtract it from the
		 * block_rsv_size for the next checks.
		 */
		block_rsv_size -= global_rsv_size;

		/*
		 * We really want to avoid flushing delalloc too much, as it
		 * could result in poor allocation patterns, so only flush it if
		 * it's larger than the rest of the pools combined.
		 */
		if (delalloc_size > block_rsv_size) {
			to_reclaim = delalloc_size;
			flush = FLUSH_DELALLOC;
		} else if (space_info->bytes_pinned >
			   (btrfs_block_rsv_reserved(delayed_block_rsv) +
			    btrfs_block_rsv_reserved(delayed_refs_rsv))) {
			to_reclaim = space_info->bytes_pinned;
			flush = COMMIT_TRANS;
		} else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
			   btrfs_block_rsv_reserved(delayed_refs_rsv)) {
			to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
			flush = FLUSH_DELAYED_ITEMS_NR;
		} else {
			to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
			flush = FLUSH_DELAYED_REFS_NR;
		}

		spin_unlock(&space_info->lock);

		/*
		 * We don't want to reclaim everything, just a portion, so scale
		 * down the to_reclaim by 1/4. If it takes us down to 0,
		 * reclaim one item's worth.
		 */
		to_reclaim >>= 2;
		if (!to_reclaim)
			to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
		flush_space(fs_info, space_info, to_reclaim, flush, true);
		cond_resched();
		spin_lock(&space_info->lock);
	}

	/* We only went through once, back off our clamping. */
	if (loops == 1 && !space_info->reclaim_size)
		space_info->clamp = max(1, space_info->clamp - 1);
	trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
	spin_unlock(&space_info->lock);
}

/*
 * FLUSH_DELALLOC_WAIT:
 *   Space is freed from flushing delalloc in one of two ways.
 *
 *   1) compression is on and we allocate less space than we reserved
 *   2) we are overwriting existing space
 *
 *   For #1 that extra space is reclaimed as soon as the delalloc pages are
 *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
 *   length to ->bytes_reserved, and subtracts the reserved space from
 *   ->bytes_may_use.
 *
 *   For #2 this is trickier. Once the ordered extent runs we will drop the
 *   extent in the range we are overwriting, which creates a delayed ref for
 *   that freed extent. This however is not reclaimed until the transaction
 *   commits, thus the next stages.
 *
 * RUN_DELAYED_IPUTS
 *   If we are freeing inodes, we want to make sure all delayed iputs have
 *   completed, because they could have been on an inode with i_nlink == 0,
 *   and thus have been truncated and freed up space. But again this space is
 *   not immediately reusable, it comes in the form of a delayed ref, which
 *   must be run and then the transaction must be committed.
 *
 * COMMIT_TRANS
 *   This is where we reclaim all of the pinned space generated by running
 *   the iputs.
 *
 * ALLOC_CHUNK_FORCE
 *   For data we start with alloc chunk force, however we could have been full
 *   before, and then the transaction commit could have freed new block
 *   groups, so if we now have space to allocate do the force chunk
 *   allocation.
 */
static const enum btrfs_flush_state data_flush_states[] = {
	FLUSH_DELALLOC_FULL,
	RUN_DELAYED_IPUTS,
	COMMIT_TRANS,
	ALLOC_CHUNK_FORCE,
};

static void btrfs_async_reclaim_data_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 last_tickets_id;
	enum btrfs_flush_state flush_state = 0;

	fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
	space_info = fs_info->data_sinfo;

	spin_lock(&space_info->lock);
	if (list_empty(&space_info->tickets)) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	while (!space_info->full) {
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}

		/* Something happened, fail everything and bail. */
		if (BTRFS_FS_ERROR(fs_info))
			goto aborted_fs;
		last_tickets_id = space_info->tickets_id;
		spin_unlock(&space_info->lock);
	}

	while (flush_state < ARRAY_SIZE(data_flush_states)) {
		flush_space(fs_info, space_info, U64_MAX,
			    data_flush_states[flush_state], false);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}

		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = 0;
		}

		if (flush_state >= ARRAY_SIZE(data_flush_states)) {
			if (space_info->full) {
				if (maybe_fail_all_tickets(fs_info, space_info))
					flush_state = 0;
				else
					space_info->flush = 0;
			} else {
				flush_state = 0;
			}

			/* Something happened, fail everything and bail. */
			if (BTRFS_FS_ERROR(fs_info))
				goto aborted_fs;
		}
		spin_unlock(&space_info->lock);
	}
	return;

aborted_fs:
	maybe_fail_all_tickets(fs_info, space_info);
	space_info->flush = 0;
	spin_unlock(&space_info->lock);
}

void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
	INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
	INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
	INIT_WORK(&fs_info->preempt_reclaim_work,
		  btrfs_preempt_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	FLUSH_DELALLOC_FULL,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info,
					    struct reserve_ticket *ticket,
					    const enum btrfs_flush_state *states,
					    int states_nr)
{
	u64 to_reclaim;
	int flush_state = 0;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	/*
	 * This is the priority reclaim path, so to_reclaim could be >0 still
	 * because we may have only satisfied the priority tickets and still
	 * left non priority tickets on the list. We would then have
	 * to_reclaim but ->bytes == 0.
	 */
	if (ticket->bytes == 0) {
		spin_unlock(&space_info->lock);
		return;
	}

	while (flush_state < states_nr) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, to_reclaim, states[flush_state],
			    false);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
	}

	/*
	 * Attempt to steal from the global rsv if we can, except if the fs was
	 * turned into error mode due to a transaction abort when flushing space
	 * above. In that case fail with the abort error instead of returning
	 * success to the caller if we can steal from the global rsv - this is
	 * just to have the caller fail immediately instead of later when trying
	 * to modify the fs, making it easier to debug -ENOSPC problems.
	 */
1429 | if (BTRFS_FS_ERROR(fs_info)) { |
1430 | ticket->error = BTRFS_FS_ERROR(fs_info); |
1431 | remove_ticket(space_info, ticket); |
1432 | } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { |
1433 | ticket->error = -ENOSPC; |
1434 | remove_ticket(space_info, ticket); |
1435 | } |
1436 | |
1437 | /* |
1438 | * We must run try_granting_tickets here because we could be a large |
1439 | * ticket in front of a smaller ticket that can now be satisfied with |
1440 | * the available space. |
1441 | */ |
1442 | btrfs_try_granting_tickets(fs_info, space_info); |
	spin_unlock(&space_info->lock);
1444 | } |
1445 | |
1446 | static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, |
1447 | struct btrfs_space_info *space_info, |
1448 | struct reserve_ticket *ticket) |
1449 | { |
	spin_lock(&space_info->lock);
1451 | |
1452 | /* We could have been granted before we got here. */ |
1453 | if (ticket->bytes == 0) { |
		spin_unlock(&space_info->lock);
1455 | return; |
1456 | } |
1457 | |
	while (!space_info->full) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
	}
1467 | |
1468 | ticket->error = -ENOSPC; |
1469 | remove_ticket(space_info, ticket); |
1470 | btrfs_try_granting_tickets(fs_info, space_info); |
	spin_unlock(&space_info->lock);
1472 | } |
1473 | |
1474 | static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, |
1475 | struct btrfs_space_info *space_info, |
1476 | struct reserve_ticket *ticket) |
1477 | |
1478 | { |
1479 | DEFINE_WAIT(wait); |
1480 | int ret = 0; |
1481 | |
	spin_lock(&space_info->lock);
1483 | while (ticket->bytes > 0 && ticket->error == 0) { |
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
1485 | if (ret) { |
1486 | /* |
1487 | * Delete us from the list. After we unlock the space |
1488 | * info, we don't want the async reclaim job to reserve |
1489 | * space for this ticket. If that would happen, then the |
1490 | * ticket's task would not known that space was reserved |
1491 | * despite getting an error, resulting in a space leak |
1492 | * (bytes_may_use counter of our space_info). |
1493 | */ |
1494 | remove_ticket(space_info, ticket); |
1495 | ticket->error = -EINTR; |
1496 | break; |
1497 | } |
		spin_unlock(&space_info->lock);
1499 | |
1500 | schedule(); |
1501 | |
		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
1504 | } |
	spin_unlock(&space_info->lock);
1506 | } |
1507 | |
1508 | /* |
1509 | * Do the appropriate flushing and waiting for a ticket. |
1510 | * |
1511 | * @fs_info: the filesystem |
1512 | * @space_info: space info for the reservation |
1513 | * @ticket: ticket for the reservation |
1514 | * @start_ns: timestamp when the reservation started |
1515 | * @orig_bytes: amount of bytes originally reserved |
 * @flush:      how we are allowed to flush
1517 | * |
1518 | * This does the work of figuring out how to flush for the ticket, waiting for |
1519 | * the reservation, and returning the appropriate error if there is one. |
1520 | */ |
1521 | static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, |
1522 | struct btrfs_space_info *space_info, |
1523 | struct reserve_ticket *ticket, |
1524 | u64 start_ns, u64 orig_bytes, |
1525 | enum btrfs_reserve_flush_enum flush) |
1526 | { |
1527 | int ret; |
1528 | |
1529 | switch (flush) { |
1530 | case BTRFS_RESERVE_FLUSH_DATA: |
1531 | case BTRFS_RESERVE_FLUSH_ALL: |
1532 | case BTRFS_RESERVE_FLUSH_ALL_STEAL: |
1533 | wait_reserve_ticket(fs_info, space_info, ticket); |
1534 | break; |
1535 | case BTRFS_RESERVE_FLUSH_LIMIT: |
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
1539 | break; |
1540 | case BTRFS_RESERVE_FLUSH_EVICT: |
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
1544 | break; |
1545 | case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: |
1546 | priority_reclaim_data_space(fs_info, space_info, ticket); |
1547 | break; |
1548 | default: |
1549 | ASSERT(0); |
1550 | break; |
1551 | } |
1552 | |
1553 | ret = ticket->error; |
1554 | ASSERT(list_empty(&ticket->list)); |
1555 | /* |
1556 | * Check that we can't have an error set if the reservation succeeded, |
1557 | * as that would confuse tasks and lead them to error out without |
1558 | * releasing reserved space (if an error happens the expectation is that |
1559 | * space wasn't reserved at all). |
1560 | */ |
1561 | ASSERT(!(ticket->bytes == 0 && ticket->error)); |
	trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
				   start_ns, flush, ticket->error);
1564 | return ret; |
1565 | } |
1566 | |
1567 | /* |
1568 | * This returns true if this flush state will go through the ordinary flushing |
1569 | * code. |
1570 | */ |
1571 | static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) |
1572 | { |
1573 | return (flush == BTRFS_RESERVE_FLUSH_ALL) || |
1574 | (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); |
1575 | } |
1576 | |
1577 | static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, |
1578 | struct btrfs_space_info *space_info) |
1579 | { |
	u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
	u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
1582 | |
1583 | /* |
1584 | * If we're heavy on ordered operations then clamping won't help us. We |
1585 | * need to clamp specifically to keep up with dirty'ing buffered |
1586 | * writers, because there's not a 1:1 correlation of writing delalloc |
1587 | * and freeing space, like there is with flushing delayed refs or |
1588 | * delayed nodes. If we're already more ordered than delalloc then |
1589 | * we're keeping up, otherwise we aren't and should probably clamp. |
1590 | */ |
1591 | if (ordered < delalloc) |
1592 | space_info->clamp = min(space_info->clamp + 1, 8); |
1593 | } |
1594 | |
1595 | static inline bool can_steal(enum btrfs_reserve_flush_enum flush) |
1596 | { |
1597 | return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || |
1598 | flush == BTRFS_RESERVE_FLUSH_EVICT); |
1599 | } |
1600 | |
1601 | /* |
1602 | * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to |
1603 | * fail as quickly as possible. |
1604 | */ |
1605 | static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) |
1606 | { |
1607 | return (flush != BTRFS_RESERVE_NO_FLUSH && |
1608 | flush != BTRFS_RESERVE_FLUSH_EMERGENCY); |
1609 | } |
1610 | |
1611 | /* |
1612 | * Try to reserve bytes from the block_rsv's space. |
1613 | * |
1614 | * @fs_info: the filesystem |
1615 | * @space_info: space info we want to allocate from |
1616 | * @orig_bytes: number of bytes we want |
1617 | * @flush: whether or not we can flush to make our reservation |
1618 | * |
1619 | * This will reserve orig_bytes number of bytes from the space info associated |
1620 | * with the block_rsv. If there is not enough space it will make an attempt to |
1621 | * flush out space to make room. It will do this by flushing delalloc if |
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
1625 | */ |
1626 | static int __reserve_bytes(struct btrfs_fs_info *fs_info, |
1627 | struct btrfs_space_info *space_info, u64 orig_bytes, |
1628 | enum btrfs_reserve_flush_enum flush) |
1629 | { |
1630 | struct work_struct *async_work; |
1631 | struct reserve_ticket ticket; |
1632 | u64 start_ns = 0; |
1633 | u64 used; |
1634 | int ret = -ENOSPC; |
1635 | bool pending_tickets; |
1636 | |
1637 | ASSERT(orig_bytes); |
1638 | /* |
1639 | * If have a transaction handle (current->journal_info != NULL), then |
1640 | * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor |
1641 | * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those |
1642 | * flushing methods can trigger transaction commits. |
1643 | */ |
1644 | if (current->journal_info) { |
1645 | /* One assert per line for easier debugging. */ |
1646 | ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); |
1647 | ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); |
1648 | ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); |
1649 | } |
1650 | |
1651 | if (flush == BTRFS_RESERVE_FLUSH_DATA) |
1652 | async_work = &fs_info->async_data_reclaim_work; |
1653 | else |
1654 | async_work = &fs_info->async_reclaim_work; |
1655 | |
	spin_lock(&space_info->lock);
	used = btrfs_space_info_used(space_info, true);
1658 | |
1659 | /* |
1660 | * We don't want NO_FLUSH allocations to jump everybody, they can |
1661 | * generally handle ENOSPC in a different way, so treat them the same as |
1662 | * normal flushers when it comes to skipping pending tickets. |
1663 | */ |
1664 | if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH)) |
		pending_tickets = !list_empty(&space_info->tickets) ||
			!list_empty(&space_info->priority_tickets);
	else
		pending_tickets = !list_empty(&space_info->priority_tickets);
1669 | |
1670 | /* |
1671 | * Carry on if we have enough space (short-circuit) OR call |
1672 | * can_overcommit() to ensure we can overcommit to continue. |
1673 | */ |
1674 | if (!pending_tickets && |
1675 | ((used + orig_bytes <= space_info->total_bytes) || |
	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
1679 | ret = 0; |
1680 | } |
1681 | |
1682 | /* |
1683 | * Things are dire, we need to make a reservation so we don't abort. We |
1684 | * will let this reservation go through as long as we have actual space |
1685 | * left to allocate for the block. |
1686 | */ |
1687 | if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { |
		used = btrfs_space_info_used(space_info, false);
		if (used + orig_bytes <= space_info->total_bytes) {
			btrfs_space_info_update_bytes_may_use(fs_info, space_info,
							      orig_bytes);
1692 | ret = 0; |
1693 | } |
1694 | } |
1695 | |
1696 | /* |
1697 | * If we couldn't make a reservation then setup our reservation ticket |
1698 | * and kick the async worker if it's not already running. |
1699 | * |
1700 | * If we are a priority flusher then we just need to add our ticket to |
1701 | * the list and we will do our own flushing further down. |
1702 | */ |
1703 | if (ret && can_ticket(flush)) { |
1704 | ticket.bytes = orig_bytes; |
1705 | ticket.error = 0; |
1706 | space_info->reclaim_size += ticket.bytes; |
1707 | init_waitqueue_head(&ticket.wait); |
1708 | ticket.steal = can_steal(flush); |
1709 | if (trace_btrfs_reserve_ticket_enabled()) |
1710 | start_ns = ktime_get_ns(); |
1711 | |
1712 | if (flush == BTRFS_RESERVE_FLUSH_ALL || |
1713 | flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || |
1714 | flush == BTRFS_RESERVE_FLUSH_DATA) { |
			list_add_tail(&ticket.list, &space_info->tickets);
1716 | if (!space_info->flush) { |
1717 | /* |
1718 | * We were forced to add a reserve ticket, so |
1719 | * our preemptive flushing is unable to keep |
1720 | * up. Clamp down on the threshold for the |
1721 | * preemptive flushing in order to keep up with |
1722 | * the workload. |
1723 | */ |
1724 | maybe_clamp_preempt(fs_info, space_info); |
1725 | |
1726 | space_info->flush = 1; |
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq, async_work);
1732 | } |
1733 | } else { |
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
1736 | } |
1737 | } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { |
1738 | /* |
1739 | * We will do the space reservation dance during log replay, |
1740 | * which means we won't have fs_info->fs_root set, so don't do |
1741 | * the async reclaim as we will panic. |
1742 | */ |
1743 | if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && |
		    !work_busy(&fs_info->preempt_reclaim_work) &&
1745 | need_preemptive_reclaim(fs_info, space_info)) { |
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->preempt_reclaim_work);
1750 | } |
1751 | } |
	spin_unlock(&space_info->lock);
1753 | if (!ret || !can_ticket(flush)) |
1754 | return ret; |
1755 | |
	return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
				     orig_bytes, flush);
1758 | } |
1759 | |
1760 | /* |
1761 | * Try to reserve metadata bytes from the block_rsv's space. |
1762 | * |
1763 | * @fs_info: the filesystem |
1764 | * @space_info: the space_info we're allocating for |
1765 | * @orig_bytes: number of bytes we want |
1766 | * @flush: whether or not we can flush to make our reservation |
1767 | * |
1768 | * This will reserve orig_bytes number of bytes from the space info associated |
1769 | * with the block_rsv. If there is not enough space it will make an attempt to |
1770 | * flush out space to make room. It will do this by flushing delalloc if |
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
1774 | */ |
1775 | int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, |
1776 | struct btrfs_space_info *space_info, |
1777 | u64 orig_bytes, |
1778 | enum btrfs_reserve_flush_enum flush) |
1779 | { |
1780 | int ret; |
1781 | |
1782 | ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); |
1783 | if (ret == -ENOSPC) { |
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      space_info->flags, orig_bytes, 1);
1786 | |
1787 | if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) |
			btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
1789 | } |
1790 | return ret; |
1791 | } |
1792 | |
1793 | /* |
1794 | * Try to reserve data bytes for an allocation. |
1795 | * |
1796 | * @fs_info: the filesystem |
1797 | * @bytes: number of bytes we need |
1798 | * @flush: how we are allowed to flush |
1799 | * |
1800 | * This will reserve bytes from the data space info. If there is not enough |
1801 | * space then we will attempt to flush space as specified by flush. |
1802 | */ |
1803 | int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, |
1804 | enum btrfs_reserve_flush_enum flush) |
1805 | { |
1806 | struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; |
1807 | int ret; |
1808 | |
1809 | ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || |
1810 | flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || |
1811 | flush == BTRFS_RESERVE_NO_FLUSH); |
1812 | ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); |
1813 | |
	ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
1815 | if (ret == -ENOSPC) { |
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      data_sinfo->flags, bytes, 1);
1818 | if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) |
			btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
1820 | } |
1821 | return ret; |
1822 | } |
1823 | |
1824 | /* Dump all the space infos when we abort a transaction due to ENOSPC. */ |
1825 | __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) |
1826 | { |
1827 | struct btrfs_space_info *space_info; |
1828 | |
	btrfs_info(fs_info, "dumping space info:");
1830 | list_for_each_entry(space_info, &fs_info->space_info, list) { |
		spin_lock(&space_info->lock);
		__btrfs_dump_space_info(fs_info, space_info);
		spin_unlock(&space_info->lock);
1834 | } |
1835 | dump_global_block_rsv(fs_info); |
1836 | } |
1837 | |
1838 | /* |
1839 | * Account the unused space of all the readonly block group in the space_info. |
1840 | * takes mirrors into account. |
1841 | */ |
1842 | u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) |
1843 | { |
1844 | struct btrfs_block_group *block_group; |
1845 | u64 free_bytes = 0; |
1846 | int factor; |
1847 | |
1848 | /* It's df, we don't care if it's racy */ |
	if (list_empty(&sinfo->ro_bgs))
1850 | return 0; |
1851 | |
	spin_lock(&sinfo->lock);
1853 | list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { |
		spin_lock(&block_group->lock);
1855 | |
1856 | if (!block_group->ro) { |
			spin_unlock(&block_group->lock);
1858 | continue; |
1859 | } |
1860 | |
		factor = btrfs_bg_type_to_factor(block_group->flags);
1862 | free_bytes += (block_group->length - |
1863 | block_group->used) * factor; |
1864 | |
		spin_unlock(&block_group->lock);
1866 | } |
	spin_unlock(&sinfo->lock);
1868 | |
1869 | return free_bytes; |
1870 | } |
1871 | |