1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include <linux/jiffies.h> |
4 | #include <linux/kernel.h> |
5 | #include <linux/ktime.h> |
6 | #include <linux/list.h> |
7 | #include <linux/math64.h> |
8 | #include <linux/sizes.h> |
9 | #include <linux/workqueue.h> |
10 | #include "ctree.h" |
11 | #include "block-group.h" |
12 | #include "discard.h" |
13 | #include "free-space-cache.h" |
14 | #include "fs.h" |
15 | |
16 | /* |
17 | * This contains the logic to handle async discard. |
18 | * |
19 | * Async discard manages trimming of free space outside of transaction commit. |
 * Discarding is done by managing the block_groups on an LRU list based on
 * free space recency. Two passes are used: the first prioritizes discarding
 * extents, and the second trims the bitmaps, which gives the bitmap free
 * space the best opportunity to coalesce.
23 | * The block_groups are maintained on multiple lists to allow for multiple |
24 | * passes with different discard filter requirements. A delayed work item is |
25 | * used to manage discarding with timeout determined by a max of the delay |
26 | * incurred by the iops rate limit, the byte rate limit, and the max delay of |
27 | * BTRFS_DISCARD_MAX_DELAY. |
28 | * |
29 | * Note, this only keeps track of block_groups that are explicitly for data. |
30 | * Mixed block_groups are not supported. |
31 | * |
32 | * The first list is special to manage discarding of fully free block groups. |
33 | * This is necessary because we issue a final trim for a full free block group |
34 | * after forgetting it. When a block group becomes unused, instead of directly |
35 | * being added to the unused_bgs list, we add it to this first list. Then |
36 | * from there, if it becomes fully discarded, we place it onto the unused_bgs |
37 | * list. |
38 | * |
39 | * The in-memory free space cache serves as the backing state for discard. |
40 | * Consequently this means there is no persistence. We opt to load all the |
41 | * block groups in as not discarded, so the mount case degenerates to the |
42 | * crashing case. |
43 | * |
44 | * As the free space cache uses bitmaps, there exists a tradeoff between |
45 | * ease/efficiency for find_free_extent() and the accuracy of discard state. |
46 | * Here we opt to let untrimmed regions merge with everything while only letting |
47 | * trimmed regions merge with other trimmed regions. This can cause |
48 | * overtrimming, but the coalescing benefit seems to be worth it. Additionally, |
49 | * bitmap state is tracked as a whole. If we're able to fully trim a bitmap, |
50 | * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in, |
51 | * this resets the state and we will retry trimming the whole bitmap. This is a |
52 | * tradeoff between discard state accuracy and the cost of accounting. |
53 | */ |
54 | |
55 | /* This is an initial delay to give some chance for block reuse */ |
56 | #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) |
57 | #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) |
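/*
 * A fully unused block group only waits BTRFS_DISCARD_UNUSED_DELAY before it
 * becomes eligible for discard, see add_to_discard_unused_list().
 */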
58 | |
59 | #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) |
60 | #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) |
61 | #define BTRFS_DISCARD_MAX_IOPS (1000U) |
62 | |
63 | /* Monotonically decreasing minimum length filters after index 0 */ |
64 | static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { |
65 | 0, |
66 | BTRFS_ASYNC_DISCARD_MAX_FILTER, |
67 | BTRFS_ASYNC_DISCARD_MIN_FILTER |
68 | }; |
69 | |
70 | static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, |
71 | const struct btrfs_block_group *block_group) |
72 | { |
73 | return &discard_ctl->discard_list[block_group->discard_index]; |
74 | } |
75 | |
76 | /* |
77 | * Determine if async discard should be running. |
78 | * |
79 | * @discard_ctl: discard control |
80 | * |
81 | * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. |
82 | */ |
83 | static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl) |
84 | { |
85 | struct btrfs_fs_info *fs_info = container_of(discard_ctl, |
86 | struct btrfs_fs_info, |
87 | discard_ctl); |
88 | |
89 | return (!(fs_info->sb->s_flags & SB_RDONLY) && |
90 | test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); |
91 | } |
92 | |
93 | static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, |
94 | struct btrfs_block_group *block_group) |
95 | { |
96 | lockdep_assert_held(&discard_ctl->lock); |
97 | |
	if (list_empty(&block_group->discard_list) ||
99 | block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { |
100 | if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) |
101 | block_group->discard_index = BTRFS_DISCARD_INDEX_START; |
102 | block_group->discard_eligible_time = (ktime_get_ns() + |
103 | BTRFS_DISCARD_DELAY); |
104 | block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; |
105 | } |
	if (list_empty(&block_group->discard_list))
		btrfs_get_block_group(block_group);

	list_move_tail(&block_group->discard_list,
		       get_discard_list(discard_ctl, block_group));
111 | } |
112 | |
113 | static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, |
114 | struct btrfs_block_group *block_group) |
115 | { |
116 | if (!btrfs_is_block_group_data_only(block_group)) |
117 | return; |
118 | |
119 | if (!btrfs_run_discard_work(discard_ctl)) |
120 | return; |
121 | |
	spin_lock(&discard_ctl->lock);
	__add_to_discard_list(discard_ctl, block_group);
	spin_unlock(&discard_ctl->lock);
125 | } |
126 | |
127 | static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, |
128 | struct btrfs_block_group *block_group) |
129 | { |
130 | bool queued; |
131 | |
	spin_lock(&discard_ctl->lock);

	queued = !list_empty(&block_group->discard_list);

	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	list_del_init(&block_group->discard_list);

	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
	block_group->discard_eligible_time = (ktime_get_ns() +
					      BTRFS_DISCARD_UNUSED_DELAY);
	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	if (!queued)
		btrfs_get_block_group(block_group);
	list_add_tail(&block_group->discard_list,
		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

	spin_unlock(&discard_ctl->lock);
153 | } |
154 | |
155 | static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, |
156 | struct btrfs_block_group *block_group) |
157 | { |
158 | bool running = false; |
159 | bool queued = false; |
160 | |
	spin_lock(&discard_ctl->lock);

	if (block_group == discard_ctl->block_group) {
		running = true;
		discard_ctl->block_group = NULL;
	}

	block_group->discard_eligible_time = 0;
	queued = !list_empty(&block_group->discard_list);
	list_del_init(&block_group->discard_list);
	if (queued)
		btrfs_put_block_group(block_group);

	spin_unlock(&discard_ctl->lock);
175 | |
176 | return running; |
177 | } |
178 | |
179 | /* |
180 | * Find block_group that's up next for discarding. |
181 | * |
182 | * @discard_ctl: discard control |
183 | * @now: current time |
184 | * |
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking the discard_eligible_time of the block_group.
187 | */ |
188 | static struct btrfs_block_group *find_next_block_group( |
189 | struct btrfs_discard_ctl *discard_ctl, |
190 | u64 now) |
191 | { |
192 | struct btrfs_block_group *ret_block_group = NULL, *block_group; |
193 | int i; |
194 | |
195 | for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { |
196 | struct list_head *discard_list = &discard_ctl->discard_list[i]; |
197 | |
		if (!list_empty(discard_list)) {
199 | block_group = list_first_entry(discard_list, |
200 | struct btrfs_block_group, |
201 | discard_list); |
202 | |
203 | if (!ret_block_group) |
204 | ret_block_group = block_group; |
205 | |
206 | if (ret_block_group->discard_eligible_time < now) |
207 | break; |
208 | |
209 | if (ret_block_group->discard_eligible_time > |
210 | block_group->discard_eligible_time) |
211 | ret_block_group = block_group; |
212 | } |
213 | } |
214 | |
215 | return ret_block_group; |
216 | } |
217 | |
218 | /* |
219 | * Look up next block group and set it for use. |
220 | * |
221 | * @discard_ctl: discard control |
222 | * @discard_state: the discard_state of the block_group after state management |
223 | * @discard_index: the discard_index of the block_group after state management |
224 | * @now: time when discard was invoked, in ns |
225 | * |
226 | * Wrap find_next_block_group() and set the block_group to be in use. |
227 | * @discard_state's control flow is managed here. Variables related to |
 * @discard_state are reset here as needed (e.g. @discard_cursor). @discard_state
 * and @discard_index are remembered as they may change while we're discarding,
230 | * but we want the discard to execute in the context determined here. |
231 | */ |
232 | static struct btrfs_block_group *peek_discard_list( |
233 | struct btrfs_discard_ctl *discard_ctl, |
234 | enum btrfs_discard_state *discard_state, |
235 | int *discard_index, u64 now) |
236 | { |
237 | struct btrfs_block_group *block_group; |
238 | |
	spin_lock(&discard_ctl->lock);
240 | again: |
241 | block_group = find_next_block_group(discard_ctl, now); |
242 | |
243 | if (block_group && now >= block_group->discard_eligible_time) { |
244 | if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && |
245 | block_group->used != 0) { |
246 | if (btrfs_is_block_group_data_only(block_group)) { |
247 | __add_to_discard_list(discard_ctl, block_group); |
248 | /* |
249 | * The block group must have been moved to other |
250 | * discard list even if discard was disabled in |
251 | * the meantime or a transaction abort happened, |
252 | * otherwise we can end up in an infinite loop, |
253 | * always jumping into the 'again' label and |
254 | * keep getting this block group over and over |
255 | * in case there are no other block groups in |
256 | * the discard lists. |
257 | */ |
258 | ASSERT(block_group->discard_index != |
259 | BTRFS_DISCARD_INDEX_UNUSED, |
260 | "discard_index=%d" , |
261 | block_group->discard_index); |
262 | } else { |
				list_del_init(&block_group->discard_list);
				btrfs_put_block_group(block_group);
265 | } |
266 | goto again; |
267 | } |
268 | if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { |
269 | block_group->discard_cursor = block_group->start; |
270 | block_group->discard_state = BTRFS_DISCARD_EXTENTS; |
271 | } |
272 | } |
	if (block_group) {
		btrfs_get_block_group(block_group);
		discard_ctl->block_group = block_group;
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}
	spin_unlock(&discard_ctl->lock);
280 | |
281 | return block_group; |
282 | } |
283 | |
284 | /* |
285 | * Update a block group's filters. |
286 | * |
287 | * @block_group: block group of interest |
288 | * @bytes: recently freed region size after coalescing |
289 | * |
290 | * Async discard maintains multiple lists with progressively smaller filters |
291 | * to prioritize discarding based on size. Should a free space that matches |
292 | * a larger filter be returned to the free_space_cache, prioritize that discard |
293 | * by moving @block_group to the proper filter. |
294 | */ |
295 | void btrfs_discard_check_filter(struct btrfs_block_group *block_group, |
296 | u64 bytes) |
297 | { |
298 | struct btrfs_discard_ctl *discard_ctl; |
299 | |
300 | if (!block_group || |
301 | !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) |
302 | return; |
303 | |
304 | discard_ctl = &block_group->fs_info->discard_ctl; |
305 | |
306 | if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && |
307 | bytes >= discard_minlen[block_group->discard_index - 1]) { |
308 | int i; |
309 | |
310 | remove_from_discard_list(discard_ctl, block_group); |
311 | |
312 | for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; |
313 | i++) { |
314 | if (bytes >= discard_minlen[i]) { |
315 | block_group->discard_index = i; |
316 | add_to_discard_list(discard_ctl, block_group); |
317 | break; |
318 | } |
319 | } |
320 | } |
321 | } |
322 | |
323 | /* |
324 | * Move a block group along the discard lists. |
325 | * |
326 | * @discard_ctl: discard control |
327 | * @block_group: block_group of interest |
328 | * |
 * Increment @block_group's discard_index. If it falls off the list, let it be.
330 | * Otherwise add it back to the appropriate list. |
331 | */ |
332 | static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, |
333 | struct btrfs_block_group *block_group) |
334 | { |
335 | block_group->discard_index++; |
336 | if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) { |
337 | block_group->discard_index = 1; |
338 | return; |
339 | } |
340 | |
341 | add_to_discard_list(discard_ctl, block_group); |
342 | } |
343 | |
344 | /* |
345 | * Remove a block_group from the discard lists. |
346 | * |
347 | * @discard_ctl: discard control |
348 | * @block_group: block_group of interest |
349 | * |
350 | * Remove @block_group from the discard lists. If necessary, wait on the |
351 | * current work and then reschedule the delayed work. |
352 | */ |
353 | void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, |
354 | struct btrfs_block_group *block_group) |
355 | { |
356 | if (remove_from_discard_list(discard_ctl, block_group)) { |
		cancel_delayed_work_sync(&discard_ctl->work);
		btrfs_discard_schedule_work(discard_ctl, true);
359 | } |
360 | } |
361 | |
362 | /* |
 * Handle queuing of the block_groups.
364 | * |
365 | * @discard_ctl: discard control |
366 | * @block_group: block_group of interest |
367 | * |
368 | * Maintain the LRU order of the discard lists. |
369 | */ |
370 | void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, |
371 | struct btrfs_block_group *block_group) |
372 | { |
373 | if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) |
374 | return; |
375 | |
376 | if (block_group->used == 0) |
377 | add_to_discard_unused_list(discard_ctl, block_group); |
378 | else |
379 | add_to_discard_list(discard_ctl, block_group); |
380 | |
381 | if (!delayed_work_pending(&discard_ctl->work)) |
		btrfs_discard_schedule_work(discard_ctl, false);
383 | } |
384 | |
385 | static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, |
386 | u64 now, bool override) |
387 | { |
388 | struct btrfs_block_group *block_group; |
389 | |
390 | if (!btrfs_run_discard_work(discard_ctl)) |
391 | return; |
392 | if (!override && delayed_work_pending(&discard_ctl->work)) |
393 | return; |
394 | |
395 | block_group = find_next_block_group(discard_ctl, now); |
396 | if (block_group) { |
397 | u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC; |
398 | u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); |
399 | |
400 | /* |
401 | * A single delayed workqueue item is responsible for |
402 | * discarding, so we can manage the bytes rate limit by keeping |
403 | * track of the previous discard. |
404 | */ |
405 | if (kbps_limit && discard_ctl->prev_discard) { |
406 | u64 bps_limit = ((u64)kbps_limit) * SZ_1K; |
			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
						  NSEC_PER_SEC, bps_limit);
409 | |
410 | delay = max(delay, bps_delay); |
411 | } |
412 | |
413 | /* |
414 | * This timeout is to hopefully prevent immediate discarding |
415 | * in a recently allocated block group. |
416 | */ |
417 | if (now < block_group->discard_eligible_time) { |
418 | u64 bg_timeout = block_group->discard_eligible_time - now; |
419 | |
420 | delay = max(delay, bg_timeout); |
421 | } |
422 | |
423 | if (override && discard_ctl->prev_discard) { |
424 | u64 elapsed = now - discard_ctl->prev_discard_time; |
425 | |
426 | if (delay > elapsed) |
427 | delay -= elapsed; |
428 | else |
429 | delay = 0; |
430 | } |
431 | |
		mod_delayed_work(discard_ctl->discard_workers,
				 &discard_ctl->work, nsecs_to_jiffies(delay));
434 | } |
435 | } |
436 | |
437 | /* |
438 | * Responsible for scheduling the discard work. |
439 | * |
440 | * @discard_ctl: discard control |
441 | * @override: override the current timer |
442 | * |
443 | * Discards are issued by a delayed workqueue item. @override is used to |
444 | * update the current delay as the baseline delay interval is reevaluated on |
445 | * transaction commit. This is also maxed with any other rate limit. |
446 | */ |
447 | void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, |
448 | bool override) |
449 | { |
450 | const u64 now = ktime_get_ns(); |
451 | |
	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
455 | } |
456 | |
457 | /* |
458 | * Determine next step of a block_group. |
459 | * |
460 | * @discard_ctl: discard control |
461 | * @block_group: block_group of interest |
462 | * |
463 | * Determine the next step for a block group after it's finished going through |
464 | * a pass on a discard list. If it is unused and fully trimmed, we can mark it |
465 | * unused and send it to the unused_bgs path. Otherwise, pass it onto the |
466 | * appropriate filter list or let it fall off. |
467 | */ |
468 | static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, |
469 | struct btrfs_block_group *block_group) |
470 | { |
471 | remove_from_discard_list(discard_ctl, block_group); |
472 | |
473 | if (block_group->used == 0) { |
474 | if (btrfs_is_free_space_trimmed(block_group)) |
			btrfs_mark_bg_unused(block_group);
476 | else |
477 | add_to_discard_unused_list(discard_ctl, block_group); |
478 | } else { |
479 | btrfs_update_discard_index(discard_ctl, block_group); |
480 | } |
481 | } |
482 | |
483 | /* |
484 | * Discard work queue callback |
485 | * |
486 | * @work: work |
487 | * |
488 | * Find the next block_group to start discarding and then discard a single |
489 | * region. It does this in a two-pass fashion: first extents and second |
490 | * bitmaps. Completely discarded block groups are sent to the unused_bgs path. |
491 | */ |
492 | static void btrfs_discard_workfn(struct work_struct *work) |
493 | { |
494 | struct btrfs_discard_ctl *discard_ctl; |
495 | struct btrfs_block_group *block_group; |
496 | enum btrfs_discard_state discard_state; |
497 | int discard_index = 0; |
498 | u64 trimmed = 0; |
499 | u64 minlen = 0; |
500 | u64 now = ktime_get_ns(); |
501 | |
502 | discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); |
503 | |
	block_group = peek_discard_list(discard_ctl, &discard_state,
					&discard_index, now);
506 | if (!block_group) |
507 | return; |
	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		return;
	}
	if (now < block_group->discard_eligible_time) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		btrfs_discard_schedule_work(discard_ctl, false);
		return;
	}
523 | |
524 | /* Perform discarding */ |
525 | minlen = discard_minlen[discard_index]; |
526 | |
527 | if (discard_state == BTRFS_DISCARD_BITMAPS) { |
528 | u64 maxlen = 0; |
529 | |
530 | /* |
531 | * Use the previous levels minimum discard length as the max |
532 | * length filter. In the case something is added to make a |
533 | * region go beyond the max filter, the entire bitmap is set |
534 | * back to BTRFS_TRIM_STATE_UNTRIMMED. |
535 | */ |
536 | if (discard_index != BTRFS_DISCARD_INDEX_UNUSED) |
537 | maxlen = discard_minlen[discard_index - 1]; |
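		/*
		 * E.g. with the discard_minlen table above, the
		 * BTRFS_DISCARD_INDEX_START pass trims bitmap regions of at
		 * least BTRFS_ASYNC_DISCARD_MAX_FILTER with no upper bound
		 * (maxlen stays 0), while the last pass only trims regions
		 * between BTRFS_ASYNC_DISCARD_MIN_FILTER and
		 * BTRFS_ASYNC_DISCARD_MAX_FILTER.
		 */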
538 | |
		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, maxlen, true);
		discard_ctl->discard_bitmap_bytes += trimmed;
	} else {
		btrfs_trim_block_group_extents(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, true);
		discard_ctl->discard_extent_bytes += trimmed;
	}
551 | |
552 | /* Determine next steps for a block_group */ |
553 | if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { |
554 | if (discard_state == BTRFS_DISCARD_BITMAPS) { |
555 | btrfs_finish_discard_pass(discard_ctl, block_group); |
556 | } else { |
557 | block_group->discard_cursor = block_group->start; |
			spin_lock(&discard_ctl->lock);
559 | if (block_group->discard_state != |
560 | BTRFS_DISCARD_RESET_CURSOR) |
561 | block_group->discard_state = |
562 | BTRFS_DISCARD_BITMAPS; |
			spin_unlock(&discard_ctl->lock);
564 | } |
565 | } |
566 | |
567 | now = ktime_get_ns(); |
	spin_lock(&discard_ctl->lock);
	discard_ctl->prev_discard = trimmed;
	discard_ctl->prev_discard_time = now;
	btrfs_put_block_group(block_group);
	discard_ctl->block_group = NULL;
	__btrfs_discard_schedule_work(discard_ctl, now, false);
	spin_unlock(&discard_ctl->lock);
575 | } |
576 | |
577 | /* |
578 | * Recalculate the base delay. |
579 | * |
580 | * @discard_ctl: discard control |
581 | * |
 * Recalculate the base delay, which is derived from the iops_limit
 * (MSEC_PER_SEC / iops_limit, or 0 when no iops_limit is set). Clamp this
 * between the lower limit (BTRFS_DISCARD_MIN_DELAY_MSEC, or 0 when unset)
 * and the upper limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
585 | */ |
586 | void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) |
587 | { |
588 | s32 discardable_extents; |
589 | s64 discardable_bytes; |
590 | u32 iops_limit; |
591 | unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC; |
592 | unsigned long delay; |
593 | |
	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
595 | if (!discardable_extents) |
596 | return; |
597 | |
	spin_lock(&discard_ctl->lock);
599 | |
600 | /* |
601 | * The following is to fix a potential -1 discrepancy that we're not |
602 | * sure how to reproduce. But given that this is the only place that |
 * utilizes these numbers and this is only called from
604 | * btrfs_finish_extent_commit() which is synchronized, we can correct |
605 | * here. |
606 | */ |
	if (discardable_extents < 0)
		atomic_add(-discardable_extents,
			   &discard_ctl->discardable_extents);

	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (discardable_bytes < 0)
		atomic64_add(-discardable_bytes,
			     &discard_ctl->discardable_bytes);
615 | |
616 | if (discardable_extents <= 0) { |
		spin_unlock(&discard_ctl->lock);
618 | return; |
619 | } |
620 | |
621 | iops_limit = READ_ONCE(discard_ctl->iops_limit); |
622 | |
623 | if (iops_limit) { |
624 | delay = MSEC_PER_SEC / iops_limit; |
625 | } else { |
626 | /* |
627 | * Unset iops_limit means go as fast as possible, so allow a |
628 | * delay of 0. |
629 | */ |
630 | delay = 0; |
631 | min_delay = 0; |
632 | } |
633 | |
634 | delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC); |
635 | discard_ctl->delay_ms = delay; |
636 | |
	spin_unlock(&discard_ctl->lock);
638 | } |
639 | |
640 | /* |
641 | * Propagate discard counters. |
642 | * |
643 | * @block_group: block_group of interest |
644 | * |
645 | * Propagate deltas of counters up to the discard_ctl. It maintains a current |
646 | * counter and a previous counter passing the delta up to the global stat. |
647 | * Then the current counter value becomes the previous counter value. |
648 | */ |
649 | void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) |
650 | { |
651 | struct btrfs_free_space_ctl *ctl; |
652 | struct btrfs_discard_ctl *discard_ctl; |
653 | s32 extents_delta; |
654 | s64 bytes_delta; |
655 | |
656 | if (!block_group || |
657 | !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) || |
658 | !btrfs_is_block_group_data_only(block_group)) |
659 | return; |
660 | |
661 | ctl = block_group->free_space_ctl; |
662 | discard_ctl = &block_group->fs_info->discard_ctl; |
663 | |
664 | lockdep_assert_held(&ctl->tree_lock); |
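	/*
	 * For example, if this block group's CURR extent count moved from 5
	 * to 8 since the last update, a delta of 3 is added to the global
	 * counter and PREV is advanced to 8.
	 */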
665 | extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - |
666 | ctl->discardable_extents[BTRFS_STAT_PREV]; |
667 | if (extents_delta) { |
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
669 | ctl->discardable_extents[BTRFS_STAT_PREV] = |
670 | ctl->discardable_extents[BTRFS_STAT_CURR]; |
671 | } |
672 | |
673 | bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] - |
674 | ctl->discardable_bytes[BTRFS_STAT_PREV]; |
675 | if (bytes_delta) { |
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
677 | ctl->discardable_bytes[BTRFS_STAT_PREV] = |
678 | ctl->discardable_bytes[BTRFS_STAT_CURR]; |
679 | } |
680 | } |
681 | |
682 | /* |
683 | * Punt unused_bgs list to discard lists. |
684 | * |
685 | * @fs_info: fs_info of interest |
686 | * |
687 | * The unused_bgs list needs to be punted to the discard lists because the |
688 | * order of operations is changed. In the normal synchronous discard path, the |
689 | * block groups are trimmed via a single large trim in transaction commit. This |
690 | * is ultimately what we are trying to avoid with asynchronous discard. Thus, |
691 | * it must be done before going down the unused_bgs path. |
692 | */ |
693 | void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) |
694 | { |
695 | struct btrfs_block_group *block_group, *next; |
696 | |
	spin_lock(&fs_info->unused_bgs_lock);
698 | /* We enabled async discard, so punt all to the queue */ |
699 | list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, |
700 | bg_list) { |
		list_del_init(&block_group->bg_list);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
		/*
		 * This put is for the get done by btrfs_mark_bg_unused.
		 * Queueing discard incremented it for discard's reference.
		 */
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
710 | } |
711 | |
712 | /* |
713 | * Purge discard lists. |
714 | * |
715 | * @discard_ctl: discard control |
716 | * |
717 | * If we are disabling async discard, we may have intercepted block groups that |
718 | * are completely free and ready for the unused_bgs path. As discarding will |
719 | * now happen in transaction commit or not at all, we can safely mark the |
720 | * corresponding block groups as unused and they will be sent on their merry |
721 | * way to the unused_bgs list. |
722 | */ |
723 | static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) |
724 | { |
725 | struct btrfs_block_group *block_group, *next; |
726 | int i; |
727 | |
	spin_lock(&discard_ctl->lock);
	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		list_for_each_entry_safe(block_group, next,
					 &discard_ctl->discard_list[i],
					 discard_list) {
			list_del_init(&block_group->discard_list);
			spin_unlock(&discard_ctl->lock);
			if (block_group->used == 0)
				btrfs_mark_bg_unused(block_group);
			spin_lock(&discard_ctl->lock);
			btrfs_put_block_group(block_group);
		}
	}
	spin_unlock(&discard_ctl->lock);
742 | } |
743 | |
744 | void btrfs_discard_resume(struct btrfs_fs_info *fs_info) |
745 | { |
746 | if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { |
747 | btrfs_discard_cleanup(fs_info); |
748 | return; |
749 | } |
750 | |
751 | btrfs_discard_punt_unused_bgs_list(fs_info); |
752 | |
	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
754 | } |
755 | |
756 | void btrfs_discard_stop(struct btrfs_fs_info *fs_info) |
757 | { |
	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
759 | } |
760 | |
761 | void btrfs_discard_init(struct btrfs_fs_info *fs_info) |
762 | { |
763 | struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; |
764 | int i; |
765 | |
766 | spin_lock_init(&discard_ctl->lock); |
767 | INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); |
768 | |
769 | for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) |
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
771 | |
772 | discard_ctl->prev_discard = 0; |
773 | discard_ctl->prev_discard_time = 0; |
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
783 | } |
784 | |
785 | void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) |
786 | { |
787 | btrfs_discard_stop(fs_info); |
	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
	btrfs_discard_purge_list(&fs_info->discard_ctl);
790 | } |
791 | |