1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (C) 2019 Oracle. All Rights Reserved. |
4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_log_format.h" |
11 | #include "xfs_trans_resv.h" |
12 | #include "xfs_mount.h" |
13 | #include "xfs_inode.h" |
14 | #include "xfs_btree.h" |
15 | #include "xfs_ialloc.h" |
16 | #include "xfs_ialloc_btree.h" |
17 | #include "xfs_iwalk.h" |
18 | #include "xfs_error.h" |
19 | #include "xfs_trace.h" |
20 | #include "xfs_icache.h" |
21 | #include "xfs_health.h" |
22 | #include "xfs_trans.h" |
23 | #include "xfs_pwork.h" |
24 | #include "xfs_ag.h" |
25 | #include "xfs_bit.h" |
26 | |
27 | /* |
28 | * Walking Inodes in the Filesystem |
29 | * ================================ |
30 | * |
31 | * This iterator function walks a subset of filesystem inodes in increasing |
32 | * order from @startino until there are no more inodes. For each allocated |
33 | * inode it finds, it calls a walk function with the relevant inode number and |
34 | * a pointer to caller-provided data. The walk function can return the usual |
35 | * negative error code to stop the iteration; 0 to continue the iteration; or |
36 | * -ECANCELED to stop the iteration. This return value is returned to the |
37 | * caller. |
38 | * |
39 | * Internally, we allow the walk function to do anything, which means that we |
40 | * cannot maintain the inobt cursor or our lock on the AGI buffer. We |
41 | * therefore cache the inobt records in kernel memory and only call the walk |
42 | * function when our memory buffer is full. @nr_recs is the number of records |
43 | * that we've cached, and @sz_recs is the size of our cache. |
44 | * |
45 | * It is the responsibility of the walk function to ensure it accesses |
46 | * allocated inodes, as the inobt records may be stale by the time they are |
47 | * acted upon. |
48 | */ |
49 | |
50 | struct xfs_iwalk_ag { |
51 | /* parallel work control data; will be null if single threaded */ |
52 | struct xfs_pwork pwork; |
53 | |
54 | struct xfs_mount *mp; |
55 | struct xfs_trans *tp; |
56 | struct xfs_perag *pag; |
57 | |
58 | /* Where do we start the traversal? */ |
59 | xfs_ino_t startino; |
60 | |
61 | /* What was the last inode number we saw when iterating the inobt? */ |
62 | xfs_ino_t lastino; |
63 | |
64 | /* Array of inobt records we cache. */ |
65 | struct xfs_inobt_rec_incore *recs; |
66 | |
67 | /* Number of entries allocated for the @recs array. */ |
68 | unsigned int sz_recs; |
69 | |
70 | /* Number of entries in the @recs array that are in use. */ |
71 | unsigned int nr_recs; |
72 | |
73 | /* Inode walk function and data pointer. */ |
74 | xfs_iwalk_fn iwalk_fn; |
75 | xfs_inobt_walk_fn inobt_walk_fn; |
76 | void *data; |
77 | |
78 | /* |
79 | * Make it look like the inodes up to startino are free so that |
80 | * bulkstat can start its inode iteration at the correct place without |
81 | * needing to special case everywhere. |
82 | */ |
83 | unsigned int trim_start:1; |
84 | |
85 | /* Skip empty inobt records? */ |
86 | unsigned int skip_empty:1; |
87 | |
88 | /* Drop the (hopefully empty) transaction when calling iwalk_fn. */ |
89 | unsigned int drop_trans:1; |
90 | }; |
91 | |
92 | /* |
93 | * Loop over all clusters in a chunk for a given incore inode allocation btree |
94 | * record. Do a readahead if there are any allocated inodes in that cluster. |
95 | */ |
96 | STATIC void |
97 | xfs_iwalk_ichunk_ra( |
98 | struct xfs_mount *mp, |
99 | struct xfs_perag *pag, |
100 | struct xfs_inobt_rec_incore *irec) |
101 | { |
102 | struct xfs_ino_geometry *igeo = M_IGEO(mp); |
103 | xfs_agnumber_t agno = pag->pag_agno; |
104 | xfs_agblock_t agbno; |
105 | struct blk_plug plug; |
106 | int i; /* inode chunk index */ |
107 | |
108 | agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); |
109 | |
110 | blk_start_plug(&plug); |
111 | for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) { |
112 | xfs_inofree_t imask; |
113 | |
114 | imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); |
115 | if (imask & ~irec->ir_free) { |
116 | xfs_buf_readahead(mp->m_ddev_targp, |
117 | XFS_AGB_TO_DADDR(mp, agno, agbno), |
118 | igeo->blocks_per_cluster * mp->m_bsize, |
119 | &xfs_inode_buf_ops); |
120 | } |
121 | agbno += igeo->blocks_per_cluster; |
122 | } |
123 | blk_finish_plug(&plug); |
124 | } |
125 | |
126 | /* |
127 | * Set the bits in @irec's free mask that correspond to the inodes before |
128 | * @agino so that we skip them. This is how we restart an inode walk that was |
129 | * interrupted in the middle of an inode record. |
130 | */ |
131 | STATIC void |
132 | xfs_iwalk_adjust_start( |
133 | xfs_agino_t agino, /* starting inode of chunk */ |
134 | struct xfs_inobt_rec_incore *irec) /* btree record */ |
135 | { |
136 | int idx; /* index into inode chunk */ |
137 | |
138 | idx = agino - irec->ir_startino; |
139 | |
140 | irec->ir_free |= xfs_inobt_maskn(0, idx); |
141 | irec->ir_freecount = hweight64(irec->ir_free); |
142 | } |
143 | |
144 | /* Allocate memory for a walk. */ |
145 | STATIC int |
146 | xfs_iwalk_alloc( |
147 | struct xfs_iwalk_ag *iwag) |
148 | { |
149 | size_t size; |
150 | |
151 | ASSERT(iwag->recs == NULL); |
152 | iwag->nr_recs = 0; |
153 | |
154 | /* Allocate a prefetch buffer for inobt records. */ |
155 | size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); |
156 | iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); |
157 | if (iwag->recs == NULL) |
158 | return -ENOMEM; |
159 | |
160 | return 0; |
161 | } |
162 | |
163 | /* Free memory we allocated for a walk. */ |
164 | STATIC void |
165 | xfs_iwalk_free( |
166 | struct xfs_iwalk_ag *iwag) |
167 | { |
168 | kfree(objp: iwag->recs); |
169 | iwag->recs = NULL; |
170 | } |
171 | |
172 | /* For each inuse inode in each cached inobt record, call our function. */ |
173 | STATIC int |
174 | xfs_iwalk_ag_recs( |
175 | struct xfs_iwalk_ag *iwag) |
176 | { |
177 | struct xfs_mount *mp = iwag->mp; |
178 | struct xfs_trans *tp = iwag->tp; |
179 | struct xfs_perag *pag = iwag->pag; |
180 | xfs_ino_t ino; |
181 | unsigned int i, j; |
182 | int error; |
183 | |
184 | for (i = 0; i < iwag->nr_recs; i++) { |
185 | struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; |
186 | |
187 | trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec); |
188 | |
189 | if (xfs_pwork_want_abort(pwork: &iwag->pwork)) |
190 | return 0; |
191 | |
192 | if (iwag->inobt_walk_fn) { |
193 | error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec, |
194 | iwag->data); |
195 | if (error) |
196 | return error; |
197 | } |
198 | |
199 | if (!iwag->iwalk_fn) |
200 | continue; |
201 | |
202 | for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { |
203 | if (xfs_pwork_want_abort(&iwag->pwork)) |
204 | return 0; |
205 | |
206 | /* Skip if this inode is free */ |
207 | if (XFS_INOBT_MASK(j) & irec->ir_free) |
208 | continue; |
209 | |
210 | /* Otherwise call our function. */ |
211 | ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, |
212 | irec->ir_startino + j); |
213 | error = iwag->iwalk_fn(mp, tp, ino, iwag->data); |
214 | if (error) |
215 | return error; |
216 | } |
217 | } |
218 | |
219 | return 0; |
220 | } |
221 | |
222 | /* Delete cursor and let go of AGI. */ |
223 | static inline void |
224 | xfs_iwalk_del_inobt( |
225 | struct xfs_trans *tp, |
226 | struct xfs_btree_cur **curpp, |
227 | struct xfs_buf **agi_bpp, |
228 | int error) |
229 | { |
230 | if (*curpp) { |
231 | xfs_btree_del_cursor(*curpp, error); |
232 | *curpp = NULL; |
233 | } |
234 | if (*agi_bpp) { |
235 | xfs_trans_brelse(tp, *agi_bpp); |
236 | *agi_bpp = NULL; |
237 | } |
238 | } |
239 | |
240 | /* |
241 | * Set ourselves up for walking inobt records starting from a given point in |
242 | * the filesystem. |
243 | * |
244 | * If caller passed in a nonzero start inode number, load the record from the |
245 | * inobt and make the record look like all the inodes before agino are free so |
246 | * that we skip them, and then move the cursor to the next inobt record. This |
247 | * is how we support starting an iwalk in the middle of an inode chunk. |
248 | * |
249 | * If the caller passed in a start number of zero, move the cursor to the first |
250 | * inobt record. |
251 | * |
252 | * The caller is responsible for cleaning up the cursor and buffer pointer |
253 | * regardless of the error status. |
254 | */ |
255 | STATIC int |
256 | xfs_iwalk_ag_start( |
257 | struct xfs_iwalk_ag *iwag, |
258 | xfs_agino_t agino, |
259 | struct xfs_btree_cur **curpp, |
260 | struct xfs_buf **agi_bpp, |
261 | int *has_more) |
262 | { |
263 | struct xfs_mount *mp = iwag->mp; |
264 | struct xfs_trans *tp = iwag->tp; |
265 | struct xfs_perag *pag = iwag->pag; |
266 | struct xfs_inobt_rec_incore *irec; |
267 | int error; |
268 | |
269 | /* Set up a fresh cursor and empty the inobt cache. */ |
270 | iwag->nr_recs = 0; |
271 | error = xfs_ialloc_read_agi(pag, tp, agi_bpp); |
272 | if (error) |
273 | return error; |
274 | *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp); |
275 | |
276 | /* Starting at the beginning of the AG? That's easy! */ |
277 | if (agino == 0) |
278 | return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more); |
279 | |
280 | /* |
281 | * Otherwise, we have to grab the inobt record where we left off, stuff |
282 | * the record into our cache, and then see if there are more records. |
283 | * We require a lookup cache of at least two elements so that the |
284 | * caller doesn't have to deal with tearing down the cursor to walk the |
285 | * records. |
286 | */ |
287 | error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more); |
288 | if (error) |
289 | return error; |
290 | |
291 | /* |
292 | * If the LE lookup at @agino yields no records, jump ahead to the |
293 | * inobt cursor increment to see if there are more records to process. |
294 | */ |
295 | if (!*has_more) |
296 | goto out_advance; |
297 | |
298 | /* Get the record, should always work */ |
299 | irec = &iwag->recs[iwag->nr_recs]; |
300 | error = xfs_inobt_get_rec(*curpp, irec, has_more); |
301 | if (error) |
302 | return error; |
303 | if (XFS_IS_CORRUPT(mp, *has_more != 1)) { |
304 | xfs_btree_mark_sick(*curpp); |
305 | return -EFSCORRUPTED; |
306 | } |
307 | |
308 | iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno, |
309 | irec->ir_startino + XFS_INODES_PER_CHUNK - 1); |
310 | |
311 | /* |
312 | * If the LE lookup yielded an inobt record before the cursor position, |
313 | * skip it and see if there's another one after it. |
314 | */ |
315 | if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) |
316 | goto out_advance; |
317 | |
318 | /* |
319 | * If agino fell in the middle of the inode record, make it look like |
320 | * the inodes up to agino are free so that we don't return them again. |
321 | */ |
322 | if (iwag->trim_start) |
323 | xfs_iwalk_adjust_start(agino, irec); |
324 | |
325 | /* |
326 | * The prefetch calculation is supposed to give us a large enough inobt |
327 | * record cache that grab_ichunk can stage a partial first record and |
328 | * the loop body can cache a record without having to check for cache |
329 | * space until after it reads an inobt record. |
330 | */ |
331 | iwag->nr_recs++; |
332 | ASSERT(iwag->nr_recs < iwag->sz_recs); |
333 | |
334 | out_advance: |
335 | return xfs_btree_increment(*curpp, 0, has_more); |
336 | } |
337 | |
338 | /* |
339 | * The inobt record cache is full, so preserve the inobt cursor state and |
340 | * run callbacks on the cached inobt records. When we're done, restore the |
341 | * cursor state to wherever the cursor would have been had the cache not been |
342 | * full (and therefore we could've just incremented the cursor) if *@has_more |
343 | * is true. On exit, *@has_more will indicate whether or not the caller should |
344 | * try for more inode records. |
345 | */ |
346 | STATIC int |
347 | xfs_iwalk_run_callbacks( |
348 | struct xfs_iwalk_ag *iwag, |
349 | struct xfs_btree_cur **curpp, |
350 | struct xfs_buf **agi_bpp, |
351 | int *has_more) |
352 | { |
353 | struct xfs_mount *mp = iwag->mp; |
354 | struct xfs_inobt_rec_incore *irec; |
355 | xfs_agino_t next_agino; |
356 | int error; |
357 | |
358 | next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1; |
359 | |
360 | ASSERT(iwag->nr_recs > 0); |
361 | |
362 | /* Delete cursor but remember the last record we cached... */ |
363 | xfs_iwalk_del_inobt(tp: iwag->tp, curpp, agi_bpp, error: 0); |
364 | irec = &iwag->recs[iwag->nr_recs - 1]; |
365 | ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK); |
366 | |
367 | if (iwag->drop_trans) { |
368 | xfs_trans_cancel(iwag->tp); |
369 | iwag->tp = NULL; |
370 | } |
371 | |
372 | error = xfs_iwalk_ag_recs(iwag); |
373 | if (error) |
374 | return error; |
375 | |
376 | /* ...empty the cache... */ |
377 | iwag->nr_recs = 0; |
378 | |
379 | if (!has_more) |
380 | return 0; |
381 | |
382 | if (iwag->drop_trans) { |
383 | error = xfs_trans_alloc_empty(mp, tpp: &iwag->tp); |
384 | if (error) |
385 | return error; |
386 | } |
387 | |
388 | /* ...and recreate the cursor just past where we left off. */ |
389 | error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp); |
390 | if (error) |
391 | return error; |
392 | *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp); |
393 | return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); |
394 | } |
395 | |
396 | /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ |
397 | STATIC int |
398 | xfs_iwalk_ag( |
399 | struct xfs_iwalk_ag *iwag) |
400 | { |
401 | struct xfs_mount *mp = iwag->mp; |
402 | struct xfs_perag *pag = iwag->pag; |
403 | struct xfs_buf *agi_bp = NULL; |
404 | struct xfs_btree_cur *cur = NULL; |
405 | xfs_agino_t agino; |
406 | int has_more; |
407 | int error = 0; |
408 | |
409 | /* Set up our cursor at the right place in the inode btree. */ |
410 | ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino)); |
411 | agino = XFS_INO_TO_AGINO(mp, iwag->startino); |
412 | error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more); |
413 | |
414 | while (!error && has_more) { |
415 | struct xfs_inobt_rec_incore *irec; |
416 | xfs_ino_t rec_fsino; |
417 | |
418 | cond_resched(); |
419 | if (xfs_pwork_want_abort(pwork: &iwag->pwork)) |
420 | goto out; |
421 | |
422 | /* Fetch the inobt record. */ |
423 | irec = &iwag->recs[iwag->nr_recs]; |
424 | error = xfs_inobt_get_rec(cur, irec, &has_more); |
425 | if (error || !has_more) |
426 | break; |
427 | |
428 | /* Make sure that we always move forward. */ |
429 | rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino); |
430 | if (iwag->lastino != NULLFSINO && |
431 | XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { |
432 | xfs_btree_mark_sick(cur); |
433 | error = -EFSCORRUPTED; |
434 | goto out; |
435 | } |
436 | iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1; |
437 | |
438 | /* No allocated inodes in this chunk; skip it. */ |
439 | if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) { |
440 | error = xfs_btree_increment(cur, 0, &has_more); |
441 | if (error) |
442 | break; |
443 | continue; |
444 | } |
445 | |
446 | /* |
447 | * Start readahead for this inode chunk in anticipation of |
448 | * walking the inodes. |
449 | */ |
450 | if (iwag->iwalk_fn) |
451 | xfs_iwalk_ichunk_ra(mp, pag, irec); |
452 | |
453 | /* |
454 | * If there's space in the buffer for more records, increment |
455 | * the btree cursor and grab more. |
456 | */ |
457 | if (++iwag->nr_recs < iwag->sz_recs) { |
458 | error = xfs_btree_increment(cur, 0, &has_more); |
459 | if (error || !has_more) |
460 | break; |
461 | continue; |
462 | } |
463 | |
464 | /* |
465 | * Otherwise, we need to save cursor state and run the callback |
466 | * function on the cached records. The run_callbacks function |
467 | * is supposed to return a cursor pointing to the record where |
468 | * we would be if we had been able to increment like above. |
469 | */ |
470 | ASSERT(has_more); |
471 | error = xfs_iwalk_run_callbacks(iwag, curpp: &cur, agi_bpp: &agi_bp, has_more: &has_more); |
472 | } |
473 | |
474 | if (iwag->nr_recs == 0 || error) |
475 | goto out; |
476 | |
477 | /* Walk the unprocessed records in the cache. */ |
478 | error = xfs_iwalk_run_callbacks(iwag, curpp: &cur, agi_bpp: &agi_bp, has_more: &has_more); |
479 | |
480 | out: |
481 | xfs_iwalk_del_inobt(tp: iwag->tp, curpp: &cur, agi_bpp: &agi_bp, error); |
482 | return error; |
483 | } |
484 | |
485 | /* |
486 | * We experimentally determined that the reduction in ioctl call overhead |
487 | * diminishes when userspace asks for more than 2048 inodes, so we'll cap |
488 | * prefetch at this point. |
489 | */ |
490 | #define IWALK_MAX_INODE_PREFETCH (2048U) |
491 | |
492 | /* |
493 | * Given the number of inodes to prefetch, set the number of inobt records that |
494 | * we cache in memory, which controls the number of inodes we try to read |
495 | * ahead. Set the maximum if @inodes == 0. |
496 | */ |
497 | static inline unsigned int |
498 | xfs_iwalk_prefetch( |
499 | unsigned int inodes) |
500 | { |
501 | unsigned int inobt_records; |
502 | |
503 | /* |
504 | * If the caller didn't tell us the number of inodes they wanted, |
505 | * assume the maximum prefetch possible for best performance. |
506 | * Otherwise, cap prefetch at that maximum so that we don't start an |
507 | * absurd amount of prefetch. |
508 | */ |
509 | if (inodes == 0) |
510 | inodes = IWALK_MAX_INODE_PREFETCH; |
511 | inodes = min(inodes, IWALK_MAX_INODE_PREFETCH); |
512 | |
513 | /* Round the inode count up to a full chunk. */ |
514 | inodes = round_up(inodes, XFS_INODES_PER_CHUNK); |
515 | |
516 | /* |
517 | * In order to convert the number of inodes to prefetch into an |
518 | * estimate of the number of inobt records to cache, we require a |
519 | * conversion factor that reflects our expectations of the average |
520 | * loading factor of an inode chunk. Based on data gathered, most |
521 | * (but not all) filesystems manage to keep the inode chunks totally |
522 | * full, so we'll underestimate slightly so that our readahead will |
523 | * still deliver the performance we want on aging filesystems: |
524 | * |
525 | * inobt = inodes / (INODES_PER_CHUNK * (4 / 5)); |
526 | * |
527 | * The funny math is to avoid integer division. |
528 | */ |
529 | inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK); |
530 | |
531 | /* |
532 | * Allocate enough space to prefetch at least two inobt records so that |
533 | * we can cache both the record where the iwalk started and the next |
534 | * record. This simplifies the AG inode walk loop setup code. |
535 | */ |
536 | return max(inobt_records, 2U); |
537 | } |
538 | |
539 | /* |
540 | * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn |
541 | * will be called for each allocated inode, being passed the inode's number and |
542 | * @data. @max_prefetch controls how many inobt records' worth of inodes we |
543 | * try to readahead. |
544 | */ |
545 | int |
546 | xfs_iwalk( |
547 | struct xfs_mount *mp, |
548 | struct xfs_trans *tp, |
549 | xfs_ino_t startino, |
550 | unsigned int flags, |
551 | xfs_iwalk_fn iwalk_fn, |
552 | unsigned int inode_records, |
553 | void *data) |
554 | { |
555 | struct xfs_iwalk_ag iwag = { |
556 | .mp = mp, |
557 | .tp = tp, |
558 | .iwalk_fn = iwalk_fn, |
559 | .data = data, |
560 | .startino = startino, |
561 | .sz_recs = xfs_iwalk_prefetch(inode_records), |
562 | .trim_start = 1, |
563 | .skip_empty = 1, |
564 | .pwork = XFS_PWORK_SINGLE_THREADED, |
565 | .lastino = NULLFSINO, |
566 | }; |
567 | struct xfs_perag *pag; |
568 | xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); |
569 | int error; |
570 | |
571 | ASSERT(agno < mp->m_sb.sb_agcount); |
572 | ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); |
573 | |
574 | error = xfs_iwalk_alloc(iwag: &iwag); |
575 | if (error) |
576 | return error; |
577 | |
578 | for_each_perag_from(mp, agno, pag) { |
579 | iwag.pag = pag; |
580 | error = xfs_iwalk_ag(&iwag); |
581 | if (error) |
582 | break; |
583 | iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); |
584 | if (flags & XFS_INOBT_WALK_SAME_AG) |
585 | break; |
586 | iwag.pag = NULL; |
587 | } |
588 | |
589 | if (iwag.pag) |
590 | xfs_perag_rele(pag); |
591 | xfs_iwalk_free(iwag: &iwag); |
592 | return error; |
593 | } |
594 | |
595 | /* Run per-thread iwalk work. */ |
596 | static int |
597 | xfs_iwalk_ag_work( |
598 | struct xfs_mount *mp, |
599 | struct xfs_pwork *pwork) |
600 | { |
601 | struct xfs_iwalk_ag *iwag; |
602 | int error = 0; |
603 | |
604 | iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); |
605 | if (xfs_pwork_want_abort(pwork)) |
606 | goto out; |
607 | |
608 | error = xfs_iwalk_alloc(iwag); |
609 | if (error) |
610 | goto out; |
611 | /* |
612 | * Grab an empty transaction so that we can use its recursive buffer |
613 | * locking abilities to detect cycles in the inobt without deadlocking. |
614 | */ |
615 | error = xfs_trans_alloc_empty(mp, tpp: &iwag->tp); |
616 | if (error) |
617 | goto out; |
618 | iwag->drop_trans = 1; |
619 | |
620 | error = xfs_iwalk_ag(iwag); |
621 | if (iwag->tp) |
622 | xfs_trans_cancel(iwag->tp); |
623 | xfs_iwalk_free(iwag); |
624 | out: |
625 | xfs_perag_put(iwag->pag); |
626 | kfree(objp: iwag); |
627 | return error; |
628 | } |
629 | |
630 | /* |
631 | * Walk all the inodes in the filesystem using multiple threads to process each |
632 | * AG. |
633 | */ |
634 | int |
635 | xfs_iwalk_threaded( |
636 | struct xfs_mount *mp, |
637 | xfs_ino_t startino, |
638 | unsigned int flags, |
639 | xfs_iwalk_fn iwalk_fn, |
640 | unsigned int inode_records, |
641 | bool polled, |
642 | void *data) |
643 | { |
644 | struct xfs_pwork_ctl pctl; |
645 | struct xfs_perag *pag; |
646 | xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); |
647 | int error; |
648 | |
649 | ASSERT(agno < mp->m_sb.sb_agcount); |
650 | ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); |
651 | |
652 | error = xfs_pwork_init(mp, pctl: &pctl, work_fn: xfs_iwalk_ag_work, tag: "xfs_iwalk" ); |
653 | if (error) |
654 | return error; |
655 | |
656 | for_each_perag_from(mp, agno, pag) { |
657 | struct xfs_iwalk_ag *iwag; |
658 | |
659 | if (xfs_pwork_ctl_want_abort(&pctl)) |
660 | break; |
661 | |
662 | iwag = kzalloc(sizeof(struct xfs_iwalk_ag), |
663 | GFP_KERNEL | __GFP_NOFAIL); |
664 | iwag->mp = mp; |
665 | |
666 | /* |
667 | * perag is being handed off to async work, so take a passive |
668 | * reference for the async work to release. |
669 | */ |
670 | iwag->pag = xfs_perag_hold(pag); |
671 | iwag->iwalk_fn = iwalk_fn; |
672 | iwag->data = data; |
673 | iwag->startino = startino; |
674 | iwag->sz_recs = xfs_iwalk_prefetch(inode_records); |
675 | iwag->lastino = NULLFSINO; |
676 | xfs_pwork_queue(&pctl, &iwag->pwork); |
677 | startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); |
678 | if (flags & XFS_INOBT_WALK_SAME_AG) |
679 | break; |
680 | } |
681 | if (pag) |
682 | xfs_perag_rele(pag); |
683 | if (polled) |
684 | xfs_pwork_poll(pctl: &pctl); |
685 | return xfs_pwork_destroy(pctl: &pctl); |
686 | } |
687 | |
688 | /* |
689 | * Allow callers to cache up to a page's worth of inobt records. This reflects |
690 | * the existing inumbers prefetching behavior. Since the inobt walk does not |
691 | * itself do anything with the inobt records, we can set a fairly high limit |
692 | * here. |
693 | */ |
694 | #define MAX_INOBT_WALK_PREFETCH \ |
695 | (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore)) |
696 | |
697 | /* |
698 | * Given the number of records that the user wanted, set the number of inobt |
699 | * records that we buffer in memory. Set the maximum if @inobt_records == 0. |
700 | */ |
701 | static inline unsigned int |
702 | xfs_inobt_walk_prefetch( |
703 | unsigned int inobt_records) |
704 | { |
705 | /* |
706 | * If the caller didn't tell us the number of inobt records they |
707 | * wanted, assume the maximum prefetch possible for best performance. |
708 | */ |
709 | if (inobt_records == 0) |
710 | inobt_records = MAX_INOBT_WALK_PREFETCH; |
711 | |
712 | /* |
713 | * Allocate enough space to prefetch at least two inobt records so that |
714 | * we can cache both the record where the iwalk started and the next |
715 | * record. This simplifies the AG inode walk loop setup code. |
716 | */ |
717 | inobt_records = max(inobt_records, 2U); |
718 | |
719 | /* |
720 | * Cap prefetch at that maximum so that we don't use an absurd amount |
721 | * of memory. |
722 | */ |
723 | return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH); |
724 | } |
725 | |
726 | /* |
727 | * Walk all inode btree records in the filesystem starting from @startino. The |
728 | * @inobt_walk_fn will be called for each btree record, being passed the incore |
729 | * record and @data. @max_prefetch controls how many inobt records we try to |
730 | * cache ahead of time. |
731 | */ |
732 | int |
733 | xfs_inobt_walk( |
734 | struct xfs_mount *mp, |
735 | struct xfs_trans *tp, |
736 | xfs_ino_t startino, |
737 | unsigned int flags, |
738 | xfs_inobt_walk_fn inobt_walk_fn, |
739 | unsigned int inobt_records, |
740 | void *data) |
741 | { |
742 | struct xfs_iwalk_ag iwag = { |
743 | .mp = mp, |
744 | .tp = tp, |
745 | .inobt_walk_fn = inobt_walk_fn, |
746 | .data = data, |
747 | .startino = startino, |
748 | .sz_recs = xfs_inobt_walk_prefetch(inobt_records), |
749 | .pwork = XFS_PWORK_SINGLE_THREADED, |
750 | .lastino = NULLFSINO, |
751 | }; |
752 | struct xfs_perag *pag; |
753 | xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); |
754 | int error; |
755 | |
756 | ASSERT(agno < mp->m_sb.sb_agcount); |
757 | ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL)); |
758 | |
759 | error = xfs_iwalk_alloc(iwag: &iwag); |
760 | if (error) |
761 | return error; |
762 | |
763 | for_each_perag_from(mp, agno, pag) { |
764 | iwag.pag = pag; |
765 | error = xfs_iwalk_ag(&iwag); |
766 | if (error) |
767 | break; |
768 | iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); |
769 | if (flags & XFS_INOBT_WALK_SAME_AG) |
770 | break; |
771 | iwag.pag = NULL; |
772 | } |
773 | |
774 | if (iwag.pag) |
775 | xfs_perag_rele(pag); |
776 | xfs_iwalk_free(iwag: &iwag); |
777 | return error; |
778 | } |
779 | |