1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (C) 2020 Oracle. All Rights Reserved. |
4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_log_format.h" |
11 | #include "xfs_trans_resv.h" |
12 | #include "xfs_bit.h" |
13 | #include "xfs_mount.h" |
14 | #include "xfs_inode.h" |
15 | #include "xfs_trans.h" |
16 | #include "xfs_btree.h" |
17 | #include "xfs_trace.h" |
18 | #include "xfs_btree_staging.h" |
19 | |
20 | /* |
21 | * Staging Cursors and Fake Roots for Btrees |
22 | * ========================================= |
23 | * |
24 | * A staging btree cursor is a special type of btree cursor that callers must |
25 | * use to construct a new btree index using the btree bulk loader code. The |
26 | * bulk loading code uses the staging btree cursor to abstract the details of |
27 | * initializing new btree blocks and filling them with records or key/ptr |
28 | * pairs. Regular btree operations (e.g. queries and modifications) are not |
29 | * supported with staging cursors, and callers must not invoke them. |
30 | * |
31 | * Fake root structures contain all the information about a btree that is under |
32 | * construction by the bulk loading code. Staging btree cursors point to fake |
33 | * root structures instead of the usual AG header or inode structure. |
34 | * |
35 | * Callers are expected to initialize a fake root structure and pass it into |
36 | * the _stage_cursor function for a specific btree type. When bulk loading is |
37 | * complete, callers should call the _commit_staged_btree function for that |
38 | * specific btree type to commit the new btree into the filesystem. |
39 | */ |
40 | |
41 | /* |
42 | * Bulk Loading for AG Btrees |
43 | * ========================== |
44 | * |
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor.  Callers should initialize this to zero.
47 | * |
48 | * The _stage_cursor() function for a specific btree type should call |
49 | * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging |
50 | * cursor. The corresponding _commit_staged_btree() function should log the |
51 | * new root and call xfs_btree_commit_afakeroot() to transform the staging |
52 | * cursor into a regular btree cursor. |
53 | */ |
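
/*
 * As a sketch of the two wrappers for an AG btree (the xfs_xxxbt_* names are
 * hypothetical and error handling is elided):
 *
 *	struct xfs_btree_cur *
 *	xfs_xxxbt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_xxxbt_init_cursor(mp, NULL, NULL);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_xxxbt_commit_staged_btree(
 *		struct xfs_btree_cur	*cur,
 *		struct xfs_trans	*tp,
 *		struct xfs_buf		*agbp)
 *	{
 *		(log the new root from cur->bc_ag.afake into agbp here)
 *		xfs_btree_commit_afakeroot(cur, tp, agbp);
 *	}
 */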
54 | |
55 | /* |
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
57 | */ |
58 | void |
59 | xfs_btree_stage_afakeroot( |
60 | struct xfs_btree_cur *cur, |
61 | struct xbtree_afakeroot *afake) |
62 | { |
63 | ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); |
64 | ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE); |
65 | ASSERT(cur->bc_tp == NULL); |
66 | |
67 | cur->bc_ag.afake = afake; |
68 | cur->bc_nlevels = afake->af_levels; |
69 | cur->bc_flags |= XFS_BTREE_STAGING; |
70 | } |
71 | |
72 | /* |
73 | * Transform an AG-rooted staging btree cursor back into a regular cursor by |
74 | * substituting a real btree root for the fake one and restoring normal btree |
75 | * cursor ops. The caller must log the btree root change prior to calling |
76 | * this. |
77 | */ |
78 | void |
79 | xfs_btree_commit_afakeroot( |
80 | struct xfs_btree_cur *cur, |
81 | struct xfs_trans *tp, |
82 | struct xfs_buf *agbp) |
83 | { |
84 | ASSERT(cur->bc_flags & XFS_BTREE_STAGING); |
85 | ASSERT(cur->bc_tp == NULL); |
86 | |
87 | trace_xfs_btree_commit_afakeroot(cur); |
88 | |
89 | cur->bc_ag.afake = NULL; |
90 | cur->bc_ag.agbp = agbp; |
91 | cur->bc_flags &= ~XFS_BTREE_STAGING; |
92 | cur->bc_tp = tp; |
93 | } |
94 | |
95 | /* |
96 | * Bulk Loading for Inode-Rooted Btrees |
97 | * ==================================== |
98 | * |
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
100 | * the staging cursor. This structure should be initialized as follows: |
101 | * |
102 | * - if_fork_size field should be set to the number of bytes available to the |
103 | * fork in the inode. |
104 | * |
105 | * - if_fork should point to a freshly allocated struct xfs_ifork. |
106 | * |
107 | * - if_format should be set to the appropriate fork type (e.g. |
108 | * XFS_DINODE_FMT_BTREE). |
109 | * |
110 | * All other fields must be zero. |
111 | * |
112 | * The _stage_cursor() function for a specific btree type should call |
113 | * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging |
114 | * cursor. The corresponding _commit_staged_btree() function should log the |
115 | * new root and call xfs_btree_commit_ifakeroot() to transform the staging |
116 | * cursor into a regular btree cursor. |
117 | */ |
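
/*
 * A minimal sketch of the setup described above; the xfs_xxxbt_* wrappers are
 * hypothetical and the fork allocation details may vary:
 *
 *	struct xbtree_ifakeroot	ifake = {
 *		.if_fork	= kmem_cache_zalloc(xfs_ifork_cache, GFP_KERNEL),
 *		.if_fork_size	= xfs_inode_fork_size(ip, whichfork),
 *		.if_format	= XFS_DINODE_FMT_BTREE,
 *	};
 *
 *	cur = xfs_xxxbt_stage_cursor(mp, ip, &ifake);
 *	(bulk load the new btree, log the new root, then:)
 *	xfs_xxxbt_commit_staged_btree(cur, tp, whichfork);
 */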
118 | |
/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.
 */
125 | void |
126 | xfs_btree_stage_ifakeroot( |
127 | struct xfs_btree_cur *cur, |
128 | struct xbtree_ifakeroot *ifake) |
129 | { |
130 | ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); |
131 | ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); |
132 | ASSERT(cur->bc_tp == NULL); |
133 | |
134 | cur->bc_ino.ifake = ifake; |
135 | cur->bc_nlevels = ifake->if_levels; |
136 | cur->bc_ino.forksize = ifake->if_fork_size; |
137 | cur->bc_flags |= XFS_BTREE_STAGING; |
138 | } |
139 | |
140 | /* |
141 | * Transform an inode-rooted staging btree cursor back into a regular cursor by |
142 | * substituting a real btree root for the fake one and restoring normal btree |
143 | * cursor ops. The caller must log the btree root change prior to calling |
144 | * this. |
145 | */ |
146 | void |
147 | xfs_btree_commit_ifakeroot( |
148 | struct xfs_btree_cur *cur, |
149 | struct xfs_trans *tp, |
150 | int whichfork) |
151 | { |
152 | ASSERT(cur->bc_flags & XFS_BTREE_STAGING); |
153 | ASSERT(cur->bc_tp == NULL); |
154 | |
155 | trace_xfs_btree_commit_ifakeroot(cur); |
156 | |
157 | cur->bc_ino.ifake = NULL; |
158 | cur->bc_ino.whichfork = whichfork; |
159 | cur->bc_flags &= ~XFS_BTREE_STAGING; |
160 | cur->bc_tp = tp; |
161 | } |
162 | |
163 | /* |
164 | * Bulk Loading of Staged Btrees |
165 | * ============================= |
166 | * |
167 | * This interface is used with a staged btree cursor to create a totally new |
168 | * btree with a large number of records (i.e. more than what would fit in a |
169 | * single root block). When the creation is complete, the new root can be |
170 | * linked atomically into the filesystem by committing the staged cursor. |
171 | * |
172 | * Creation of a new btree proceeds roughly as follows: |
173 | * |
174 | * The first step is to initialize an appropriate fake btree root structure and |
175 | * then construct a staged btree cursor. Refer to the block comments about |
176 | * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for |
177 | * more information about how to do this. |
178 | * |
179 | * The second step is to initialize a struct xfs_btree_bload context as |
180 | * documented in the structure definition. |
181 | * |
182 | * The third step is to call xfs_btree_bload_compute_geometry to compute the |
183 | * height of and the number of blocks needed to construct the btree. See the |
184 | * section "Computing the Geometry of the New Btree" for details about this |
185 | * computation. |
186 | * |
187 | * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and |
188 | * save them for later use by ->claim_block(). Bulk loading requires all |
189 | * blocks to be allocated beforehand to avoid ENOSPC failures midway through a |
190 | * rebuild, and to minimize seek distances of the new btree. |
191 | * |
192 | * Step five is to call xfs_btree_bload() to start constructing the btree. |
193 | * |
194 | * The final step is to commit the staging btree cursor, which logs the new |
195 | * btree root and turns the staging cursor into a regular cursor. The caller |
196 | * is responsible for cleaning up the previous btree blocks, if any. |
197 | * |
198 | * Computing the Geometry of the New Btree |
199 | * ======================================= |
200 | * |
201 | * The number of items placed in each btree block is computed via the following |
202 | * algorithm: For leaf levels, the number of items for the level is nr_records |
203 | * in the bload structure. For node levels, the number of items for the level |
204 | * is the number of blocks in the next lower level of the tree. For each |
205 | * level, the desired number of items per block is defined as: |
206 | * |
207 | * desired = max(minrecs, maxrecs - slack factor) |
208 | * |
209 | * The number of blocks for the level is defined to be: |
210 | * |
211 | * blocks = floor(nr_items / desired) |
212 | * |
213 | * Note this is rounded down so that the npb calculation below will never fall |
214 | * below minrecs. The number of items that will actually be loaded into each |
215 | * btree block is defined as: |
216 | * |
217 | * npb = nr_items / blocks |
218 | * |
219 | * Some of the leftmost blocks in the level will contain one extra record as |
220 | * needed to handle uneven division. If the number of records in any block |
221 | * would exceed maxrecs for that level, blocks is incremented and npb is |
222 | * recalculated. |
223 | * |
224 | * In other words, we compute the number of blocks needed to satisfy a given |
225 | * loading level, then spread the items as evenly as possible. |
226 | * |
227 | * The height and number of fs blocks required to create the btree are computed |
228 | * and returned via btree_height and nr_blocks. |
229 | */ |
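
/*
 * Putting the steps above together, a caller might drive a bulk load roughly
 * as follows; the xrep_xxx_* callbacks and xfs_xxxbt_* wrappers are
 * hypothetical, and error handling is elided:
 *
 *	struct xfs_btree_bload	bbl = {
 *		.get_records	= xrep_xxx_get_records,
 *		.claim_block	= xrep_xxx_claim_block,
 *		.leaf_slack	= -1,	(negative: let bload choose the slack)
 *		.node_slack	= -1,
 *	};
 *
 *	cur = xfs_xxxbt_stage_cursor(mp, &afake);
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	(allocate bbl.nr_blocks blocks for ->claim_block to hand out)
 *	error = xfs_btree_bload(cur, &bbl, priv);
 *	(log the new root, then commit the staged cursor)
 *
 * As a worked example of the geometry computation, take 1000 records with
 * maxrecs = 10, minrecs = 4, and a leaf slack of 1.  Then desired =
 * max(4, 10 - 1) = 9, blocks = floor(1000 / 9) = 111, and npb = 1000 / 111
 * = 9 remainder 1, so one leftmost leaf is loaded with 10 records and the
 * remaining 110 leaves with 9 records each.
 */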
230 | |
231 | /* |
232 | * Put a btree block that we're loading onto the ordered list and release it. |
233 | * The btree blocks will be written to disk when bulk loading is finished. |
234 | * If we reach the dirty buffer threshold, flush them to disk before |
235 | * continuing. |
236 | */ |
237 | static int |
238 | xfs_btree_bload_drop_buf( |
239 | struct xfs_btree_bload *bbl, |
240 | struct list_head *buffers_list, |
241 | struct xfs_buf **bpp) |
242 | { |
243 | struct xfs_buf *bp = *bpp; |
244 | int error; |
245 | |
246 | if (!bp) |
247 | return 0; |
248 | |
249 | /* |
250 | * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent |
251 | * xfs_buf_read will not pointlessly reread the contents from the disk. |
252 | */ |
253 | bp->b_flags |= XBF_DONE; |
254 | |
255 | xfs_buf_delwri_queue_here(bp, buffers_list); |
256 | xfs_buf_relse(bp); |
257 | *bpp = NULL; |
258 | bbl->nr_dirty++; |
259 | |
260 | if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty) |
261 | return 0; |
262 | |
263 | error = xfs_buf_delwri_submit(buffers_list); |
264 | if (error) |
265 | return error; |
266 | |
267 | bbl->nr_dirty = 0; |
268 | return 0; |
269 | } |
270 | |
271 | /* |
272 | * Allocate and initialize one btree block for bulk loading. |
273 | * |
274 | * The new btree block will have its level and numrecs fields set to the values |
275 | * of the level and nr_this_block parameters, respectively. |
276 | * |
277 | * The caller should ensure that ptrp, bpp, and blockp refer to the left |
278 | * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp |
279 | * will all point to the new block. |
280 | */ |
281 | STATIC int |
282 | xfs_btree_bload_prep_block( |
283 | struct xfs_btree_cur *cur, |
284 | struct xfs_btree_bload *bbl, |
285 | struct list_head *buffers_list, |
286 | unsigned int level, |
287 | unsigned int nr_this_block, |
288 | union xfs_btree_ptr *ptrp, /* in/out */ |
289 | struct xfs_buf **bpp, /* in/out */ |
290 | struct xfs_btree_block **blockp, /* in/out */ |
291 | void *priv) |
292 | { |
293 | union xfs_btree_ptr new_ptr; |
294 | struct xfs_buf *new_bp; |
295 | struct xfs_btree_block *new_block; |
296 | int ret; |
297 | |
298 | if (xfs_btree_at_iroot(cur, level)) { |
299 | struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); |
300 | size_t new_size; |
301 | |
302 | ASSERT(*bpp == NULL); |
303 | |
304 | /* Allocate a new incore btree root block. */ |
305 | new_size = bbl->iroot_size(cur, level, nr_this_block, priv); |
306 | ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); |
307 | ifp->if_broot_bytes = (int)new_size; |
308 | |
309 | /* Initialize it and send it out. */ |
310 | xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops, |
311 | level, nr_this_block, cur->bc_ino.ip->i_ino); |
312 | |
313 | *bpp = NULL; |
314 | *blockp = ifp->if_broot; |
		xfs_btree_set_ptr_null(cur, ptrp);
316 | return 0; |
317 | } |
318 | |
319 | /* Claim one of the caller's preallocated blocks. */ |
	xfs_btree_set_ptr_null(cur, &new_ptr);
321 | ret = bbl->claim_block(cur, &new_ptr, priv); |
322 | if (ret) |
323 | return ret; |
324 | |
325 | ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr)); |
326 | |
	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
328 | if (ret) |
329 | return ret; |
330 | |
331 | /* |
332 | * The previous block (if any) is the left sibling of the new block, |
333 | * so set its right sibling pointer to the new block and drop it. |
334 | */ |
335 | if (*blockp) |
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
337 | |
338 | ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp); |
339 | if (ret) |
340 | return ret; |
341 | |
342 | /* Initialize the new btree block. */ |
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
345 | |
346 | /* Set the out parameters. */ |
347 | *bpp = new_bp; |
348 | *blockp = new_block; |
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
350 | return 0; |
351 | } |
352 | |
353 | /* Load one leaf block. */ |
354 | STATIC int |
355 | xfs_btree_bload_leaf( |
356 | struct xfs_btree_cur *cur, |
357 | unsigned int recs_this_block, |
358 | xfs_btree_bload_get_records_fn get_records, |
359 | struct xfs_btree_block *block, |
360 | void *priv) |
361 | { |
362 | unsigned int j = 1; |
363 | int ret; |
364 | |
365 | /* Fill the leaf block with records. */ |
366 | while (j <= recs_this_block) { |
367 | ret = get_records(cur, j, block, recs_this_block - j + 1, priv); |
368 | if (ret < 0) |
369 | return ret; |
370 | j += ret; |
371 | } |
372 | |
373 | return 0; |
374 | } |
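
/*
 * A get_records callback typically formats staged records directly into the
 * new block.  A minimal sketch, assuming a hypothetical staging area that
 * xrep_xxx_next_record() walks, copying each record into cur->bc_rec:
 *
 *	STATIC int
 *	xrep_xxx_get_records(
 *		struct xfs_btree_cur	*cur,
 *		unsigned int		idx,
 *		struct xfs_btree_block	*block,
 *		unsigned int		nr_wanted,
 *		void			*priv)
 *	{
 *		union xfs_btree_rec	*block_rec;
 *		unsigned int		loaded;
 *
 *		for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
 *			xrep_xxx_next_record(cur, priv);
 *			block_rec = xfs_btree_rec_addr(cur, idx, block);
 *			cur->bc_ops->init_rec_from_cur(cur, block_rec);
 *		}
 *		return loaded;
 *	}
 */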
375 | |
376 | /* |
377 | * Load one node block with key/ptr pairs. |
378 | * |
379 | * child_ptr must point to a block within the next level down in the tree. A |
380 | * key/ptr entry will be created in the new node block to the block pointed to |
381 | * by child_ptr. On exit, child_ptr points to the next block on the child |
382 | * level that needs processing. |
383 | */ |
384 | STATIC int |
385 | xfs_btree_bload_node( |
386 | struct xfs_btree_cur *cur, |
387 | unsigned int recs_this_block, |
388 | union xfs_btree_ptr *child_ptr, |
389 | struct xfs_btree_block *block) |
390 | { |
391 | unsigned int j; |
392 | int ret; |
393 | |
394 | /* Fill the node block with keys and pointers. */ |
395 | for (j = 1; j <= recs_this_block; j++) { |
396 | union xfs_btree_key child_key; |
397 | union xfs_btree_ptr *block_ptr; |
398 | union xfs_btree_key *block_key; |
399 | struct xfs_btree_block *child_block; |
400 | struct xfs_buf *child_bp; |
401 | |
402 | ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); |
403 | |
404 | /* |
405 | * Read the lower-level block in case the buffer for it has |
406 | * been reclaimed. LRU refs will be set on the block, which is |
407 | * desirable if the new btree commits. |
408 | */ |
		ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
				&child_bp);
411 | if (ret) |
412 | return ret; |
413 | |
		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
423 | xfs_buf_relse(child_bp); |
424 | } |
425 | |
426 | return 0; |
427 | } |
428 | |
429 | /* |
430 | * Compute the maximum number of records (or keyptrs) per block that we want to |
431 | * install at this level in the btree. Caller is responsible for having set |
432 | * @cur->bc_ino.forksize to the desired fork size, if appropriate. |
433 | */ |
434 | STATIC unsigned int |
435 | xfs_btree_bload_max_npb( |
436 | struct xfs_btree_cur *cur, |
437 | struct xfs_btree_bload *bbl, |
438 | unsigned int level) |
439 | { |
440 | unsigned int ret; |
441 | |
442 | if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs) |
443 | return cur->bc_ops->get_dmaxrecs(cur, level); |
444 | |
445 | ret = cur->bc_ops->get_maxrecs(cur, level); |
446 | if (level == 0) |
447 | ret -= bbl->leaf_slack; |
448 | else |
449 | ret -= bbl->node_slack; |
450 | return ret; |
451 | } |
452 | |
453 | /* |
454 | * Compute the desired number of records (or keyptrs) per block that we want to |
455 | * install at this level in the btree, which must be somewhere between minrecs |
456 | * and max_npb. The caller is free to install fewer records per block. |
457 | */ |
458 | STATIC unsigned int |
459 | xfs_btree_bload_desired_npb( |
460 | struct xfs_btree_cur *cur, |
461 | struct xfs_btree_bload *bbl, |
462 | unsigned int level) |
463 | { |
464 | unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level); |
465 | |
466 | /* Root blocks are not subject to minrecs rules. */ |
467 | if (level == cur->bc_nlevels - 1) |
468 | return max(1U, npb); |
469 | |
470 | return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb); |
471 | } |
472 | |
473 | /* |
474 | * Compute the number of records to be stored in each block at this level and |
475 | * the number of blocks for this level. For leaf levels, we must populate an |
476 | * empty root block even if there are no records, so we have to have at least |
477 | * one block. |
478 | */ |
479 | STATIC void |
480 | xfs_btree_bload_level_geometry( |
481 | struct xfs_btree_cur *cur, |
482 | struct xfs_btree_bload *bbl, |
483 | unsigned int level, |
484 | uint64_t nr_this_level, |
485 | unsigned int *avg_per_block, |
486 | uint64_t *blocks, |
	uint64_t		*blocks_with_extra)
488 | { |
489 | uint64_t npb; |
490 | uint64_t dontcare; |
491 | unsigned int desired_npb; |
492 | unsigned int maxnr; |
493 | |
494 | /* |
495 | * Compute the absolute maximum number of records that we can store in |
496 | * the ondisk block or inode root. |
497 | */ |
498 | if (cur->bc_ops->get_dmaxrecs) |
499 | maxnr = cur->bc_ops->get_dmaxrecs(cur, level); |
500 | else |
501 | maxnr = cur->bc_ops->get_maxrecs(cur, level); |
502 | |
503 | /* |
504 | * Compute the number of blocks we need to fill each block with the |
505 | * desired number of records/keyptrs per block. Because desired_npb |
506 | * could be minrecs, we use regular integer division (which rounds |
507 | * the block count down) so that in the next step the effective # of |
508 | * items per block will never be less than desired_npb. |
509 | */ |
510 | desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level); |
511 | *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare); |
512 | *blocks = max(1ULL, *blocks); |
513 | |
514 | /* |
515 | * Compute the number of records that we will actually put in each |
516 | * block, assuming that we want to spread the records evenly between |
517 | * the blocks. Take care that the effective # of items per block (npb) |
518 | * won't exceed maxrecs even for the blocks that get an extra record, |
519 | * since desired_npb could be maxrecs, and in the previous step we |
520 | * rounded the block count down. |
521 | */ |
522 | npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); |
523 | if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) { |
524 | (*blocks)++; |
525 | npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); |
526 | } |
527 | |
528 | *avg_per_block = min_t(uint64_t, npb, nr_this_level); |
529 | |
530 | trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level, |
531 | *avg_per_block, desired_npb, *blocks, |
532 | *blocks_with_extra); |
533 | } |
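
/*
 * A worked example of the overflow recalculation above: with maxnr = 10, a
 * desired_npb of 10, and 101 items, the first division yields 10 blocks and
 * npb = 10 with one block due an extra item.  Since that block would exceed
 * maxnr, blocks is bumped to 11 and npb recomputed as 9 remainder 2: two
 * blocks load 10 items and the other nine blocks load 9 items each.
 */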
534 | |
535 | /* |
536 | * Ensure a slack value is appropriate for the btree. |
537 | * |
538 | * If the slack value is negative, set slack so that we fill the block to |
539 | * halfway between minrecs and maxrecs. Make sure the slack is never so large |
540 | * that we can underflow minrecs. |
541 | */ |
542 | static void |
543 | xfs_btree_bload_ensure_slack( |
544 | struct xfs_btree_cur *cur, |
545 | int *slack, |
546 | int level) |
547 | { |
548 | int maxr; |
549 | int minr; |
550 | |
551 | maxr = cur->bc_ops->get_maxrecs(cur, level); |
552 | minr = cur->bc_ops->get_minrecs(cur, level); |
553 | |
554 | /* |
555 | * If slack is negative, automatically set slack so that we load the |
556 | * btree block approximately halfway between minrecs and maxrecs. |
	 * Generally, this will net us 75% loading; for example, maxr = 100 and
	 * minr = 50 yield slack = 25 and blocks loaded with ~75 records each.
558 | */ |
559 | if (*slack < 0) |
560 | *slack = maxr - ((maxr + minr) >> 1); |
561 | |
562 | *slack = min(*slack, maxr - minr); |
563 | } |
564 | |
565 | /* |
566 | * Prepare a btree cursor for a bulk load operation by computing the geometry |
567 | * fields in bbl. Caller must ensure that the btree cursor is a staging |
568 | * cursor. This function can be called multiple times. |
569 | */ |
570 | int |
571 | xfs_btree_bload_compute_geometry( |
572 | struct xfs_btree_cur *cur, |
573 | struct xfs_btree_bload *bbl, |
574 | uint64_t nr_records) |
575 | { |
576 | uint64_t nr_blocks = 0; |
577 | uint64_t nr_this_level; |
578 | |
579 | ASSERT(cur->bc_flags & XFS_BTREE_STAGING); |
580 | |
581 | /* |
582 | * Make sure that the slack values make sense for traditional leaf and |
583 | * node blocks. Inode-rooted btrees will return different minrecs and |
	 * maxrecs values for the root block (level == bc_nlevels - 1).  We're
585 | * checking levels 0 and 1 here, so set bc_nlevels such that the btree |
586 | * code doesn't interpret either as the root level. |
587 | */ |
588 | cur->bc_nlevels = cur->bc_maxlevels - 1; |
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
591 | |
592 | bbl->nr_records = nr_this_level = nr_records; |
593 | for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) { |
594 | uint64_t level_blocks; |
595 | uint64_t dontcare64; |
596 | unsigned int level = cur->bc_nlevels - 1; |
597 | unsigned int avg_per_block; |
598 | |
599 | xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, |
600 | &avg_per_block, &level_blocks, &dontcare64); |
601 | |
602 | if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { |
603 | /* |
604 | * If all the items we want to store at this level |
605 | * would fit in the inode root block, then we have our |
606 | * btree root and are done. |
607 | * |
608 | * Note that bmap btrees forbid records in the root. |
609 | */ |
610 | if (level != 0 && nr_this_level <= avg_per_block) { |
611 | nr_blocks++; |
612 | break; |
613 | } |
614 | |
615 | /* |
616 | * Otherwise, we have to store all the items for this |
617 | * level in traditional btree blocks and therefore need |
618 | * another level of btree to point to those blocks. |
619 | * |
620 | * We have to re-compute the geometry for each level of |
621 | * an inode-rooted btree because the geometry differs |
622 | * between a btree root in an inode fork and a |
623 | * traditional btree block. |
624 | * |
625 | * This distinction is made in the btree code based on |
626 | * whether level == bc_nlevels - 1. Based on the |
627 | * previous root block size check against the root |
628 | * block geometry, we know that we aren't yet ready to |
			 * populate the root.  Increment bc_nlevels and
630 | * recalculate the geometry for a traditional |
631 | * block-based btree level. |
632 | */ |
633 | cur->bc_nlevels++; |
634 | ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); |
635 | xfs_btree_bload_level_geometry(cur, bbl, level, |
636 | nr_this_level, &avg_per_block, |
637 | &level_blocks, &dontcare64); |
638 | } else { |
639 | /* |
640 | * If all the items we want to store at this level |
641 | * would fit in a single root block, we're done. |
642 | */ |
643 | if (nr_this_level <= avg_per_block) { |
644 | nr_blocks++; |
645 | break; |
646 | } |
647 | |
648 | /* Otherwise, we need another level of btree. */ |
649 | cur->bc_nlevels++; |
650 | ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); |
651 | } |
652 | |
653 | nr_blocks += level_blocks; |
654 | nr_this_level = level_blocks; |
655 | } |
656 | |
657 | if (cur->bc_nlevels > cur->bc_maxlevels) |
658 | return -EOVERFLOW; |
659 | |
660 | bbl->btree_height = cur->bc_nlevels; |
661 | if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) |
662 | bbl->nr_blocks = nr_blocks - 1; |
663 | else |
664 | bbl->nr_blocks = nr_blocks; |
665 | return 0; |
666 | } |
667 | |
668 | /* Bulk load a btree given the parameters and geometry established in bbl. */ |
669 | int |
670 | xfs_btree_bload( |
671 | struct xfs_btree_cur *cur, |
672 | struct xfs_btree_bload *bbl, |
673 | void *priv) |
674 | { |
675 | struct list_head buffers_list; |
676 | union xfs_btree_ptr child_ptr; |
677 | union xfs_btree_ptr ptr; |
678 | struct xfs_buf *bp = NULL; |
679 | struct xfs_btree_block *block = NULL; |
680 | uint64_t nr_this_level = bbl->nr_records; |
681 | uint64_t blocks; |
682 | uint64_t i; |
683 | uint64_t blocks_with_extra; |
684 | uint64_t total_blocks = 0; |
685 | unsigned int avg_per_block; |
686 | unsigned int level = 0; |
687 | int ret; |
688 | |
689 | ASSERT(cur->bc_flags & XFS_BTREE_STAGING); |
690 | |
691 | INIT_LIST_HEAD(&buffers_list); |
692 | cur->bc_nlevels = bbl->btree_height; |
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);
695 | bbl->nr_dirty = 0; |
696 | |
697 | xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, |
698 | &avg_per_block, &blocks, &blocks_with_extra); |
699 | |
700 | /* Load each leaf block. */ |
701 | for (i = 0; i < blocks; i++) { |
702 | unsigned int nr_this_block = avg_per_block; |
703 | |
704 | /* |
705 | * Due to rounding, btree blocks will not be evenly populated |
706 | * in most cases. blocks_with_extra tells us how many blocks |
707 | * will receive an extra record to distribute the excess across |
708 | * the current level as evenly as possible. |
709 | */ |
710 | if (i < blocks_with_extra) |
711 | nr_this_block++; |
712 | |
713 | ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level, |
714 | nr_this_block, &ptr, &bp, &block, priv); |
715 | if (ret) |
716 | goto out; |
717 | |
718 | trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, |
719 | nr_this_block); |
720 | |
721 | ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records, |
722 | block, priv); |
723 | if (ret) |
724 | goto out; |
725 | |
726 | /* |
727 | * Record the leftmost leaf pointer so we know where to start |
728 | * with the first node level. |
729 | */ |
730 | if (i == 0) |
731 | xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); |
732 | } |
733 | total_blocks += blocks; |
734 | |
	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
736 | if (ret) |
737 | goto out; |
738 | |
739 | /* Populate the internal btree nodes. */ |
740 | for (level = 1; level < cur->bc_nlevels; level++) { |
741 | union xfs_btree_ptr first_ptr; |
742 | |
743 | nr_this_level = blocks; |
744 | block = NULL; |
		xfs_btree_set_ptr_null(cur, &ptr);
746 | |
747 | xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, |
748 | &avg_per_block, &blocks, &blocks_with_extra); |
749 | |
750 | /* Load each node block. */ |
751 | for (i = 0; i < blocks; i++) { |
752 | unsigned int nr_this_block = avg_per_block; |
753 | |
754 | if (i < blocks_with_extra) |
755 | nr_this_block++; |
756 | |
757 | ret = xfs_btree_bload_prep_block(cur, bbl, |
758 | &buffers_list, level, nr_this_block, |
759 | &ptr, &bp, &block, priv); |
760 | if (ret) |
761 | goto out; |
762 | |
763 | trace_xfs_btree_bload_block(cur, level, i, blocks, |
764 | &ptr, nr_this_block); |
765 | |
766 | ret = xfs_btree_bload_node(cur, nr_this_block, |
767 | &child_ptr, block); |
768 | if (ret) |
769 | goto out; |
770 | |
771 | /* |
772 | * Record the leftmost node pointer so that we know |
773 | * where to start the next node level above this one. |
774 | */ |
775 | if (i == 0) |
776 | xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); |
777 | } |
778 | total_blocks += blocks; |
779 | |
		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
781 | if (ret) |
782 | goto out; |
783 | |
		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
785 | } |
786 | |
787 | /* Initialize the new root. */ |
788 | if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { |
789 | ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); |
790 | cur->bc_ino.ifake->if_levels = cur->bc_nlevels; |
791 | cur->bc_ino.ifake->if_blocks = total_blocks - 1; |
792 | } else { |
793 | cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s); |
794 | cur->bc_ag.afake->af_levels = cur->bc_nlevels; |
795 | cur->bc_ag.afake->af_blocks = total_blocks; |
796 | } |
797 | |
798 | /* |
799 | * Write the new blocks to disk. If the ordered list isn't empty after |
800 | * that, then something went wrong and we have to fail. This should |
801 | * never happen, but we'll check anyway. |
802 | */ |
803 | ret = xfs_buf_delwri_submit(&buffers_list); |
804 | if (ret) |
805 | goto out; |
806 | if (!list_empty(&buffers_list)) { |
807 | ASSERT(list_empty(&buffers_list)); |
808 | ret = -EIO; |
809 | } |
810 | |
811 | out: |
812 | xfs_buf_delwri_cancel(&buffers_list); |
813 | if (bp) |
814 | xfs_buf_relse(bp); |
815 | return ret; |
816 | } |
817 | |