// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010, 2023 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_health.h"

/*
 * Notes on an efficient, low latency fstrim algorithm
 *
 * We need to walk the filesystem free space and issue discards on the free
 * space that meets the search criteria (size and location). We cannot issue
 * discards on extents that might be in use, or are so recently in use they are
 * still marked as busy. To serialise against extent state changes whilst we
 * are gathering extents to trim, we must hold the AGF lock to lock out other
 * allocations and extent free operations that might change extent state.
 *
 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 * we issue discards on each free space that is found. Storage devices can have
 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 * couple of million free extents and issuing synchronous discards on each
 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 * can access the AGF, and we can stall transactions and hence the log whilst
 * modifications wait for the AGF lock to be released. This can lead to hung
 * tasks kicking the hung task timer and rebooting the system. This is bad.
 *
 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 * AGI lock and then reads all the inode cluster buffers and processes them. It
 * loops doing this, using a cursor to keep track of where it is up to in the AG
 * for each iteration to restart the INOBT lookup from.
 *
 * We can't do this exactly with free space - once we drop the AGF lock, the
 * state of the free extent is out of our control and we cannot run a discard
 * safely on it in this situation. Unless, of course, we've marked the free
 * extent as busy and undergoing a discard operation whilst we held the AGF
 * locked.
 *
 * This is exactly how online discard works - free extents are marked busy when
 * they are freed, and once the extent free has been committed to the journal,
 * the busy extent record is marked as "undergoing discard" and the discard is
 * then issued on the free extent. Once the discard completes, the busy extent
 * record is removed and the extent is able to be allocated again.
 *
 * In the context of fstrim, if we find a free extent we need to discard, we
 * don't have to discard it immediately. All we need to do is record that free
 * extent as being busy and under discard, and all the allocation routines will
 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 * the AGF lock, we can safely discard it without holding the AGF lock because
 * nothing will attempt to allocate that free space until the discard completes.
 *
 * This also allows us to issue discards asynchronously like we do with online
 * discard, and so for fast devices fstrim will run much faster as we can have
 * multiple discard operations in flight at once, as well as pipeline the free
 * extent search so that it overlaps with in-flight discard IO.
 */

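/*
 * In outline, the implementation below is (a sketch of the control flow,
 * not literal code):
 *
 *	for each AG in the trim range:
 *		do:
 *			take the AGF lock, walk the by-size (cntbt) free
 *			space tree, mark matching free extents as busy and
 *			under discard, drop the AGF lock
 *			issue asynchronous discards on the gathered extents
 *		while the cursor indicates more extents remain and no fatal
 *		signal or freeze event is pending
 */
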
struct workqueue_struct *xfs_discard_wq;

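/*
 * Discard completion work: clear the busy state of the extents that were
 * under discard and free the owner structure that embeds the extent list.
 * This runs from a workqueue because clearing busy extents takes the
 * pagb_lock, which is not safe to take from IRQ context.
 */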
static void
xfs_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_busy_extents	*extents =
		container_of(work, struct xfs_busy_extents, endio_work);

	xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
	kfree(extents->owner);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * pagb_lock.
 */
static void
xfs_discard_endio(
	struct bio		*bio)
{
	struct xfs_busy_extents	*extents = bio->bi_private;

	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
	queue_work(xfs_discard_wq, &extents->endio_work);
	bio_put(bio);
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
int
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
				busyp->length);

		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_KERNEL, &bio);
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
	"discard failed for extent [0x%llx,%u], error %d",
				(unsigned long long)busyp->bno,
				busyp->length,
				error);
			break;
		}
	}

	if (bio) {
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);

	return error;
}
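/*
 * Gather a batch of free extents from the by-size (cntbt) tree that match
 * the trim criteria, marking each one busy and under discard so that it can
 * be discarded safely after the AGF lock is dropped. The caller-provided
 * cursor (tcur) records where to restart the search on the next call;
 * tcur->ar_blockcount is set to zero once the walk of this AG is complete.
 */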
static int
xfs_trim_gather_extents(
	struct xfs_perag	*pag,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen,
	struct xfs_alloc_rec_incore *tcur,
	struct xfs_busy_extents	*extents,
	uint64_t		*blocks_trimmed)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_trans	*tp;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;
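	/* bound the number of extents gathered per AGF lock hold */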
	int			batch = 100;

	/*
	 * Force out the log. This means any transactions that might have freed
	 * space before we take the AGF buffer lock are now on disk, and the
	 * volatile disk cache is flushed.
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		goto out_trans_cancel;

	cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);

	/*
	 * Look up the cursor's extent length in the by-size btree and start
	 * the walk from there.
	 */
	if (tcur->ar_startblock == NULLAGBLOCK)
		error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
	else
		error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
				tcur->ar_blockcount, &i);
	if (error)
		goto out_del_cursor;
	if (i == 0) {
		/* nothing of that length left in the AG, we are done */
		tcur->ar_blockcount = 0;
		goto out_del_cursor;
	}

	/*
	 * Loop until we are done with all extents that are large
	 * enough to be worth discarding or we hit batch limits.
	 */
	while (i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;
		xfs_daddr_t	dbno;
		xfs_extlen_t	dlen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error)
			break;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			break;
		}

		if (--batch <= 0) {
			/*
			 * Update the cursor to point at this extent so we
			 * restart the next batch from this extent.
			 */
			tcur->ar_startblock = fbno;
			tcur->ar_blockcount = flen;
			break;
		}

		/*
		 * Use daddr format for all range/len calculations as that is
		 * the format the range/len variables are supplied in by
		 * userspace.
		 */
		dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno);
		dlen = XFS_FSB_TO_BB(mp, flen);

		/*
		 * Too small? Give up.
		 */
		if (dlen < minlen) {
			trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
			tcur->ar_blockcount = 0;
			break;
		}

		/*
		 * If the extent is entirely outside of the range we are
		 * supposed to discard, skip it. Do not bother to trim
		 * down partially overlapping ranges for now.
		 */
		if (dbno + dlen < start || dbno > end) {
			trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
			goto next_extent;
		}

		/*
		 * If any blocks in the range are still busy, skip the
		 * discard and try again the next time.
		 */
		if (xfs_extent_busy_search(mp, pag, fbno, flen)) {
			trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen);
			goto next_extent;
		}

		xfs_extent_busy_insert_discard(pag, fbno, flen,
				&extents->extent_list);
		*blocks_trimmed += flen;
next_extent:
		error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			break;

		/*
		 * If there are no more records in the tree, we are done. Set
		 * the cursor block count to 0 to indicate to the caller that
		 * there are no more extents to search.
		 */
		if (i == 0)
			tcur->ar_blockcount = 0;
	}

	/*
	 * If there was an error, release all the gathered busy extents because
	 * we aren't going to issue a discard on them any more.
	 */
	if (error)
		xfs_extent_busy_clear(mp, &extents->extent_list, false);
out_del_cursor:
	xfs_btree_del_cursor(cur, error);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

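/*
 * Check whether the trim loop should bail out early: a fatal signal or a
 * filesystem freeze must not be held off by a long-running trim.
 */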
static bool
xfs_trim_should_stop(void)
{
	return fatal_signal_pending(current) || freezing(current);
}

/*
 * Iterate the free list gathering extents and discarding them. We need a
 * cursor for the repeated iterations of the gather/discard loop, so use the
 * longest extent we found in the last batch as the key to start the next.
 */
static int
xfs_trim_extents(
	struct xfs_perag	*pag,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen,
	uint64_t		*blocks_trimmed)
{
	struct xfs_alloc_rec_incore tcur = {
		.ar_blockcount = pag->pagf_longest,
		.ar_startblock = NULLAGBLOCK,
	};
	int			error = 0;

	do {
		struct xfs_busy_extents	*extents;

		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->mount = pag->pag_mount;
		extents->owner = extents;
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, start, end, minlen,
				&tcur, extents, blocks_trimmed);
		if (error) {
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with gathering
		 * the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(pag->pag_mount, extents);
		if (error)
			break;

		if (xfs_trim_should_stop())
			break;

	} while (tcur.ar_blockcount != 0);

	return error;
}

/*
 * Trim a range of the filesystem.
 *
 * Note: the parameters passed from userspace are byte ranges into the
 * filesystem, which do not match the format we use for filesystem block
 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
 * is a linear address range. Hence we need to use DADDR based conversions and
 * comparisons for determining the correct offset and regions to trim.
 */
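/*
 * For reference, a sketch of how userspace (e.g. the fstrim utility) would
 * reach this function through the generic FITRIM ioctl; the field values
 * shown are merely the common defaults, not requirements:
 *
 *	struct fstrim_range r = {
 *		.start	= 0,
 *		.len	= ULLONG_MAX,
 *		.minlen	= 0,
 *	};
 *	ioctl(fd, FITRIM, &r);
 *
 * On success, r.len is updated to the number of bytes actually trimmed.
 */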
int
xfs_ioc_trim(
	struct xfs_mount		*mp,
	struct fstrim_range __user	*urange)
{
	struct xfs_perag	*pag;
	unsigned int		granularity =
		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
	struct fstrim_range	range;
	xfs_daddr_t		start, end, minlen;
	xfs_agnumber_t		agno;
	uint64_t		blocks_trimmed = 0;
	int			error, last_error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev))
		return -EOPNOTSUPP;

	/*
	 * We haven't recovered the log, so we cannot use our bnobt-guided
	 * storage zapping commands.
	 */
	if (xfs_has_norecovery(mp))
		return -EROFS;

	if (copy_from_user(&range, urange, sizeof(range)))
		return -EFAULT;

	range.minlen = max_t(u64, granularity, range.minlen);
	minlen = BTOBB(range.minlen);

	/*
	 * Truncating down the len isn't actually quite correct, but using
	 * BBTOB would mean we trivially get overflows for values
	 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
	 * used by the fstrim application. In the end it really doesn't
	 * matter as trimming blocks is an advisory interface.
	 */
	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
	    range.len < mp->m_sb.sb_blocksize)
		return -EINVAL;

	start = BTOBB(range.start);
	end = start + BTOBBT(range.len) - 1;

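	/* Clamp the end of the range to the end of the data device. */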
	if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
		end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1;

	agno = xfs_daddr_to_agno(mp, start);
	for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
		error = xfs_trim_extents(pag, start, end, minlen,
				&blocks_trimmed);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_perag_rele(pag);
			break;
		}
	}

	if (last_error)
		return last_error;

	range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
	if (copy_to_user(urange, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}