1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (C) 2020 Oracle. All Rights Reserved. |
4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_log_format.h" |
11 | #include "xfs_trans_resv.h" |
12 | #include "xfs_bit.h" |
13 | #include "xfs_mount.h" |
14 | #include "xfs_inode.h" |
15 | #include "xfs_trans.h" |
16 | #include "xfs_btree.h" |
17 | #include "xfs_trace.h" |
18 | #include "xfs_btree_staging.h" |
19 | |
20 | /* |
21 | * Staging Cursors and Fake Roots for Btrees |
22 | * ========================================= |
23 | * |
24 | * A staging btree cursor is a special type of btree cursor that callers must |
25 | * use to construct a new btree index using the btree bulk loader code. The |
26 | * bulk loading code uses the staging btree cursor to abstract the details of |
27 | * initializing new btree blocks and filling them with records or key/ptr |
28 | * pairs. Regular btree operations (e.g. queries and modifications) are not |
29 | * supported with staging cursors, and callers must not invoke them. |
30 | * |
31 | * Fake root structures contain all the information about a btree that is under |
32 | * construction by the bulk loading code. Staging btree cursors point to fake |
33 | * root structures instead of the usual AG header or inode structure. |
34 | * |
35 | * Callers are expected to initialize a fake root structure and pass it into |
36 | * the _stage_cursor function for a specific btree type. When bulk loading is |
37 | * complete, callers should call the _commit_staged_btree function for that |
38 | * specific btree type to commit the new btree into the filesystem. |
39 | */ |
40 | |
41 | /* |
42 | * Bulk Loading for AG Btrees |
43 | * ========================== |
44 | * |
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor.  Callers should initialize this to zero.
47 | * |
48 | * The _stage_cursor() function for a specific btree type should call |
49 | * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging |
50 | * cursor. The corresponding _commit_staged_btree() function should log the |
51 | * new root and call xfs_btree_commit_afakeroot() to transform the staging |
52 | * cursor into a regular btree cursor. |
53 | */ |
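
/*
 * As a sketch of the two wrappers for an AG btree (the xfs_xxxbt_* names are
 * hypothetical and error handling is elided):
 *
 *	struct xfs_btree_cur *
 *	xfs_xxxbt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_xxxbt_init_cursor(mp, NULL, NULL);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_xxxbt_commit_staged_btree(
 *		struct xfs_btree_cur	*cur,
 *		struct xfs_trans	*tp,
 *		struct xfs_buf		*agbp)
 *	{
 *		(log the new root from cur->bc_ag.afake into agbp here)
 *		xfs_btree_commit_afakeroot(cur, tp, agbp);
 *	}
 */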
54 | |
55 | /* |
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
57 | */ |
58 | void |
59 | xfs_btree_stage_afakeroot( |
60 | struct xfs_btree_cur *cur, |
61 | struct xbtree_afakeroot *afake) |
62 | { |
63 | ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); |
64 | ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE); |
65 | ASSERT(cur->bc_tp == NULL); |
66 | |
67 | cur->bc_ag.afake = afake; |
68 | cur->bc_nlevels = afake->af_levels; |
69 | cur->bc_flags |= XFS_BTREE_STAGING; |
70 | } |
71 | |
72 | /* |
73 | * Transform an AG-rooted staging btree cursor back into a regular cursor by |
74 | * substituting a real btree root for the fake one and restoring normal btree |
75 | * cursor ops. The caller must log the btree root change prior to calling |
76 | * this. |
77 | */ |
78 | void |
79 | xfs_btree_commit_afakeroot( |
80 | struct xfs_btree_cur *cur, |
81 | struct xfs_trans *tp, |
82 | struct xfs_buf *agbp) |
83 | { |
84 | ASSERT(cur->bc_flags & XFS_BTREE_STAGING); |
85 | ASSERT(cur->bc_tp == NULL); |
86 | |
87 | trace_xfs_btree_commit_afakeroot(cur); |
88 | |
89 | cur->bc_ag.afake = NULL; |
90 | cur->bc_ag.agbp = agbp; |
91 | cur->bc_flags &= ~XFS_BTREE_STAGING; |
92 | cur->bc_tp = tp; |
93 | } |
94 | |
95 | /* |
96 | * Bulk Loading for Inode-Rooted Btrees |
97 | * ==================================== |
98 | * |
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
100 | * the staging cursor. This structure should be initialized as follows: |
101 | * |
102 | * - if_fork_size field should be set to the number of bytes available to the |
103 | * fork in the inode. |
104 | * |
105 | * - if_fork should point to a freshly allocated struct xfs_ifork. |
106 | * |
107 | * - if_format should be set to the appropriate fork type (e.g. |
108 | * XFS_DINODE_FMT_BTREE). |
109 | * |
110 | * All other fields must be zero. |
111 | * |
112 | * The _stage_cursor() function for a specific btree type should call |
113 | * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging |
114 | * cursor. The corresponding _commit_staged_btree() function should log the |
115 | * new root and call xfs_btree_commit_ifakeroot() to transform the staging |
116 | * cursor into a regular btree cursor. |
117 | */ |
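
/*
 * A minimal sketch of the setup described above; the xfs_xxxbt_* wrappers are
 * hypothetical and the fork allocation details may vary:
 *
 *	struct xbtree_ifakeroot	ifake = {
 *		.if_fork	= kmem_cache_zalloc(xfs_ifork_cache, GFP_KERNEL),
 *		.if_fork_size	= xfs_inode_fork_size(ip, whichfork),
 *		.if_format	= XFS_DINODE_FMT_BTREE,
 *	};
 *
 *	cur = xfs_xxxbt_stage_cursor(mp, ip, &ifake);
 *	(bulk load the new btree, log the new root, then:)
 *	xfs_xxxbt_commit_staged_btree(cur, tp, whichfork);
 */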
118 | |
/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.
 */
125 | void |
126 | xfs_btree_stage_ifakeroot( |
127 | struct xfs_btree_cur *cur, |
128 | struct xbtree_ifakeroot *ifake) |
129 | { |
130 | ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); |
131 | ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); |
132 | ASSERT(cur->bc_tp == NULL); |
133 | |
134 | cur->bc_ino.ifake = ifake; |
135 | cur->bc_nlevels = ifake->if_levels; |
136 | cur->bc_ino.forksize = ifake->if_fork_size; |
137 | cur->bc_flags |= XFS_BTREE_STAGING; |
138 | } |
139 | |
140 | /* |
141 | * Transform an inode-rooted staging btree cursor back into a regular cursor by |
142 | * substituting a real btree root for the fake one and restoring normal btree |
143 | * cursor ops. The caller must log the btree root change prior to calling |
144 | * this. |
145 | */ |
146 | void |
147 | xfs_btree_commit_ifakeroot( |
148 | struct xfs_btree_cur *cur, |
149 | struct xfs_trans *tp, |
150 | int whichfork) |
151 | { |
152 | ASSERT(cur->bc_flags & XFS_BTREE_STAGING); |
153 | ASSERT(cur->bc_tp == NULL); |
154 | |
155 | trace_xfs_btree_commit_ifakeroot(cur); |
156 | |
157 | cur->bc_ino.ifake = NULL; |
158 | cur->bc_ino.whichfork = whichfork; |
159 | cur->bc_flags &= ~XFS_BTREE_STAGING; |
160 | cur->bc_tp = tp; |
161 | } |
162 | |
163 | /* |
164 | * Bulk Loading of Staged Btrees |
165 | * ============================= |
166 | * |
167 | * This interface is used with a staged btree cursor to create a totally new |
168 | * btree with a large number of records (i.e. more than what would fit in a |
169 | * single root block). When the creation is complete, the new root can be |
170 | * linked atomically into the filesystem by committing the staged cursor. |
171 | * |
172 | * Creation of a new btree proceeds roughly as follows: |
173 | * |
174 | * The first step is to initialize an appropriate fake btree root structure and |
175 | * then construct a staged btree cursor. Refer to the block comments about |
176 | * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for |
177 | * more information about how to do this. |
178 | * |
179 | * The second step is to initialize a struct xfs_btree_bload context as |
180 | * documented in the structure definition. |
181 | * |
182 | * The third step is to call xfs_btree_bload_compute_geometry to compute the |
183 | * height of and the number of blocks needed to construct the btree. See the |
184 | * section "Computing the Geometry of the New Btree" for details about this |
185 | * computation. |
186 | * |
187 | * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and |
188 | * save them for later use by ->claim_block(). Bulk loading requires all |
189 | * blocks to be allocated beforehand to avoid ENOSPC failures midway through a |
190 | * rebuild, and to minimize seek distances of the new btree. |
191 | * |
192 | * Step five is to call xfs_btree_bload() to start constructing the btree. |
193 | * |
194 | * The final step is to commit the staging btree cursor, which logs the new |
195 | * btree root and turns the staging cursor into a regular cursor. The caller |
196 | * is responsible for cleaning up the previous btree blocks, if any. |
197 | * |
198 | * Computing the Geometry of the New Btree |
199 | * ======================================= |
200 | * |
201 | * The number of items placed in each btree block is computed via the following |
202 | * algorithm: For leaf levels, the number of items for the level is nr_records |
203 | * in the bload structure. For node levels, the number of items for the level |
204 | * is the number of blocks in the next lower level of the tree. For each |
205 | * level, the desired number of items per block is defined as: |
206 | * |
207 | * desired = max(minrecs, maxrecs - slack factor) |
208 | * |
209 | * The number of blocks for the level is defined to be: |
210 | * |
211 | * blocks = floor(nr_items / desired) |
212 | * |
213 | * Note this is rounded down so that the npb calculation below will never fall |
214 | * below minrecs. The number of items that will actually be loaded into each |
215 | * btree block is defined as: |
216 | * |
217 | * npb = nr_items / blocks |
218 | * |
219 | * Some of the leftmost blocks in the level will contain one extra record as |
220 | * needed to handle uneven division. If the number of records in any block |
221 | * would exceed maxrecs for that level, blocks is incremented and npb is |
222 | * recalculated. |
223 | * |
224 | * In other words, we compute the number of blocks needed to satisfy a given |
225 | * loading level, then spread the items as evenly as possible. |
226 | * |
227 | * The height and number of fs blocks required to create the btree are computed |
228 | * and returned via btree_height and nr_blocks. |
229 | */ |
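
/*
 * Putting the steps above together, a caller might drive a bulk load roughly
 * as follows; the xrep_xxx_* callbacks and xfs_xxxbt_* wrappers are
 * hypothetical, and error handling is elided:
 *
 *	struct xfs_btree_bload	bbl = {
 *		.get_records	= xrep_xxx_get_records,
 *		.claim_block	= xrep_xxx_claim_block,
 *		.leaf_slack	= -1,	(negative: let bload choose the slack)
 *		.node_slack	= -1,
 *	};
 *
 *	cur = xfs_xxxbt_stage_cursor(mp, &afake);
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	(allocate bbl.nr_blocks blocks for ->claim_block to hand out)
 *	error = xfs_btree_bload(cur, &bbl, priv);
 *	(log the new root, then commit the staged cursor)
 *
 * As a worked example of the geometry computation, take 1000 records with
 * maxrecs = 10, minrecs = 4, and a leaf slack of 1.  Then desired =
 * max(4, 10 - 1) = 9, blocks = floor(1000 / 9) = 111, and npb = 1000 / 111
 * = 9 remainder 1, so one leftmost leaf is loaded with 10 records and the
 * remaining 110 leaves with 9 records each.
 */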
230 | |
231 | /* |
232 | * Put a btree block that we're loading onto the ordered list and release it. |
233 | * The btree blocks will be written to disk when bulk loading is finished. |
234 | * If we reach the dirty buffer threshold, flush them to disk before |
235 | * continuing. |
236 | */ |
237 | static int |
238 | xfs_btree_bload_drop_buf( |
239 | struct xfs_btree_bload *bbl, |
240 | struct list_head *buffers_list, |
241 | struct xfs_buf **bpp) |
242 | { |
243 | struct xfs_buf *bp = *bpp; |
244 | int error; |
245 | |
246 | if (!bp) |
247 | return 0; |
248 | |
249 | /* |
250 | * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent |
251 | * xfs_buf_read will not pointlessly reread the contents from the disk. |
252 | */ |
253 | bp->b_flags |= XBF_DONE; |
254 | |
255 | xfs_buf_delwri_queue_here(bp, buffers_list); |
256 | xfs_buf_relse(bp); |
257 | *bpp = NULL; |
258 | bbl->nr_dirty++; |
259 | |
260 | if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty) |
261 | return 0; |
262 | |
263 | error = xfs_buf_delwri_submit(buffers_list); |
264 | if (error) |
265 | return error; |
266 | |
267 | bbl->nr_dirty = 0; |
268 | return 0; |
269 | } |
270 | |
271 | /* |
272 | * Allocate and initialize one btree block for bulk loading. |
273 | * |
274 | * The new btree block will have its level and numrecs fields set to the values |
275 | * of the level and nr_this_block parameters, respectively. |
276 | * |
277 | * The caller should ensure that ptrp, bpp, and blockp refer to the left |
278 | * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp |
279 | * will all point to the new block. |
280 | */ |
281 | STATIC int |
282 | xfs_btree_bload_prep_block( |
283 | struct xfs_btree_cur *cur, |
284 | struct xfs_btree_bload *bbl, |
285 | struct list_head *buffers_list, |
286 | unsigned int level, |
287 | unsigned int nr_this_block, |
288 | union xfs_btree_ptr *ptrp, /* in/out */ |
289 | struct xfs_buf **bpp, /* in/out */ |
290 | struct xfs_btree_block **blockp, /* in/out */ |
291 | void *priv) |
292 | { |
293 | union xfs_btree_ptr new_ptr; |
294 | struct xfs_buf *new_bp; |
295 | struct xfs_btree_block *new_block; |
296 | int ret; |
297 | |
298 | if (xfs_btree_at_iroot(cur, level)) { |
299 | struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); |
300 | size_t new_size; |
301 | |
302 | ASSERT(*bpp == NULL); |
303 | |
304 | /* Allocate a new incore btree root block. */ |
305 | new_size = bbl->iroot_size(cur, level, nr_this_block, priv); |
306 | ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); |
307 | ifp->if_broot_bytes = (int)new_size; |
308 | |
309 | /* Initialize it and send it out. */ |
310 | xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops, |
311 | level, nr_this_block, cur->bc_ino.ip->i_ino); |
312 | |
313 | *bpp = NULL; |
314 | *blockp = ifp->if_broot; |
		xfs_btree_set_ptr_null(cur, ptrp);
316 | return 0; |
317 | } |
318 | |
319 | /* Claim one of the caller's preallocated blocks. */ |
	xfs_btree_set_ptr_null(cur, &new_ptr);
321 | ret = bbl->claim_block(cur, &new_ptr, priv); |
322 | if (ret) |
323 | return ret; |
324 | |
325 | ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr)); |
326 | |
	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
328 | if (ret) |
329 | return ret; |
330 | |
331 | /* |
332 | * The previous block (if any) is the left sibling of the new block, |
333 | * so set its right sibling pointer to the new block and drop it. |
334 | */ |
335 | if (*blockp) |
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
337 | |
338 | ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp); |
339 | if (ret) |
340 | return ret; |
341 | |
342 | /* Initialize the new btree block. */ |
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
345 | |
346 | /* Set the out parameters. */ |
347 | *bpp = new_bp; |
348 | *blockp = new_block; |
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
350 | return 0; |
351 | } |
352 | |
353 | /* Load one leaf block. */ |
354 | STATIC int |
355 | xfs_btree_bload_leaf( |
356 | struct xfs_btree_cur *cur, |
357 | unsigned int recs_this_block, |
358 | xfs_btree_bload_get_records_fn get_records, |
359 | struct xfs_btree_block *block, |
360 | void *priv) |
361 | { |
362 | unsigned int j = 1; |
363 | int ret; |
364 | |
365 | /* Fill the leaf block with records. */ |
366 | while (j <= recs_this_block) { |
367 | ret = get_records(cur, j, block, recs_this_block - j + 1, priv); |
368 | if (ret < 0) |
369 | return ret; |
370 | j += ret; |
371 | } |
372 | |
373 | return 0; |
374 | } |
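
/*
 * A get_records callback typically formats staged records directly into the
 * new block.  A minimal sketch, assuming a hypothetical staging area that
 * xrep_xxx_next_record() walks, copying each record into cur->bc_rec:
 *
 *	STATIC int
 *	xrep_xxx_get_records(
 *		struct xfs_btree_cur	*cur,
 *		unsigned int		idx,
 *		struct xfs_btree_block	*block,
 *		unsigned int		nr_wanted,
 *		void			*priv)
 *	{
 *		union xfs_btree_rec	*block_rec;
 *		unsigned int		loaded;
 *
 *		for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
 *			xrep_xxx_next_record(cur, priv);
 *			block_rec = xfs_btree_rec_addr(cur, idx, block);
 *			cur->bc_ops->init_rec_from_cur(cur, block_rec);
 *		}
 *		return loaded;
 *	}
 */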
375 | |
376 | /* |
377 | * Load one node block with key/ptr pairs. |
378 | * |
379 | * child_ptr must point to a block within the next level down in the tree. A |
380 | * key/ptr entry will be created in the new node block to the block pointed to |
381 | * by child_ptr. On exit, child_ptr points to the next block on the child |
382 | * level that needs processing. |
383 | */ |
384 | STATIC int |
385 | xfs_btree_bload_node( |
386 | struct xfs_btree_cur *cur, |
387 | unsigned int recs_this_block, |
388 | union xfs_btree_ptr *child_ptr, |
389 | struct xfs_btree_block *block) |
390 | { |
391 | unsigned int j; |
392 | int ret; |
393 | |
394 | /* Fill the node block with keys and pointers. */ |
395 | for (j = 1; j <= recs_this_block; j++) { |
396 | union xfs_btree_key child_key; |
397 | union xfs_btree_ptr *block_ptr; |
398 | union xfs_btree_key *block_key; |
399 | struct xfs_btree_block *child_block; |
400 | struct xfs_buf *child_bp; |
401 | |
402 | ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); |
403 | |
404 | /* |
405 | * Read the lower-level block in case the buffer for it has |
406 | * been reclaimed. LRU refs will be set on the block, which is |
407 | * desirable if the new btree commits. |
408 | */ |
		ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
				&child_bp);
411 | if (ret) |
412 | return ret; |
413 | |
		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
423 | xfs_buf_relse(child_bp); |
424 | } |
425 | |
426 | return 0; |
427 | } |
428 | |
429 | /* |
430 | * Compute the maximum number of records (or keyptrs) per block that we want to |
431 | * install at this level in the btree. Caller is responsible for having set |
432 | * @cur->bc_ino.forksize to the desired fork size, if appropriate. |
433 | */ |
434 | STATIC unsigned int |
435 | xfs_btree_bload_max_npb( |
436 | struct xfs_btree_cur *cur, |
437 | struct xfs_btree_bload *bbl, |
438 | unsigned int level) |
439 | { |
440 | unsigned int ret; |
441 | |
442 | if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs) |
443 | return cur->bc_ops->get_dmaxrecs(cur, level); |
444 | |
445 | ret = cur->bc_ops->get_maxrecs(cur, level); |
446 | if (level == 0) |
447 | ret -= bbl->leaf_slack; |
448 | else |
449 | ret -= bbl->node_slack; |
450 | return ret; |
451 | } |
452 | |
453 | /* |
454 | * Compute the desired number of records (or keyptrs) per block that we want to |
455 | * install at this level in the btree, which must be somewhere between minrecs |
456 | * and max_npb. The caller is free to install fewer records per block. |
457 | */ |
458 | STATIC unsigned int |
459 | xfs_btree_bload_desired_npb( |
460 | struct xfs_btree_cur *cur, |
461 | struct xfs_btree_bload *bbl, |
462 | unsigned int level) |
463 | { |
464 | unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level); |
465 | |
466 | /* Root blocks are not subject to minrecs rules. */ |
467 | if (level == cur->bc_nlevels - 1) |
468 | return max(1U, npb); |
469 | |
470 | return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb); |
471 | } |
472 | |
473 | /* |
474 | * Compute the number of records to be stored in each block at this level and |
475 | * the number of blocks for this level. For leaf levels, we must populate an |
476 | * empty root block even if there are no records, so we have to have at least |
477 | * one block. |
478 | */ |
479 | STATIC void |
480 | xfs_btree_bload_level_geometry( |
481 | struct xfs_btree_cur *cur, |
482 | struct xfs_btree_bload *bbl, |
483 | unsigned int level, |
484 | uint64_t nr_this_level, |
485 | unsigned int *avg_per_block, |
486 | uint64_t *blocks, |
	uint64_t		*blocks_with_extra)
488 | { |
489 | uint64_t npb; |
490 | uint64_t dontcare; |
491 | unsigned int desired_npb; |
492 | unsigned int maxnr; |
493 | |
494 | /* |
495 | * Compute the absolute maximum number of records that we can store in |
496 | * the ondisk block or inode root. |
497 | */ |
498 | if (cur->bc_ops->get_dmaxrecs) |
499 | maxnr = cur->bc_ops->get_dmaxrecs(cur, level); |
500 | else |
501 | maxnr = cur->bc_ops->get_maxrecs(cur, level); |
502 | |
503 | /* |
504 | * Compute the number of blocks we need to fill each block with the |
505 | * desired number of records/keyptrs per block. Because desired_npb |
506 | * could be minrecs, we use regular integer division (which rounds |
507 | * the block count down) so that in the next step the effective # of |
508 | * items per block will never be less than desired_npb. |
509 | */ |
510 | desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level); |
511 | *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare); |
512 | *blocks = max(1ULL, *blocks); |
513 | |
514 | /* |
515 | * Compute the number of records that we will actually put in each |
516 | * block, assuming that we want to spread the records evenly between |
517 | * the blocks. Take care that the effective # of items per block (npb) |
518 | * won't exceed maxrecs even for the blocks that get an extra record, |
519 | * since desired_npb could be maxrecs, and in the previous step we |
520 | * rounded the block count down. |
521 | */ |
522 | npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); |
523 | if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) { |
524 | (*blocks)++; |
525 | npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); |
526 | } |
527 | |
528 | *avg_per_block = min_t(uint64_t, npb, nr_this_level); |
529 | |
530 | trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level, |
531 | *avg_per_block, desired_npb, *blocks, |
532 | *blocks_with_extra); |
533 | } |
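
/*
 * A worked example of the overflow recalculation above: with maxnr = 10, a
 * desired_npb of 10, and 101 items, the first division yields 10 blocks and
 * npb = 10 with one block due an extra item.  Since that block would exceed
 * maxnr, blocks is bumped to 11 and npb recomputed as 9 remainder 2: two
 * blocks load 10 items and the other nine blocks load 9 items each.
 */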
534 | |
535 | /* |
536 | * Ensure a slack value is appropriate for the btree. |
537 | * |
538 | * If the slack value is negative, set slack so that we fill the block to |
539 | * halfway between minrecs and maxrecs. Make sure the slack is never so large |
540 | * that we can underflow minrecs. |
541 | */ |
542 | static void |
543 | xfs_btree_bload_ensure_slack( |
544 | struct xfs_btree_cur *cur, |
545 | int *slack, |
546 | int level) |
547 | { |
548 | int maxr; |
549 | int minr; |
550 | |
551 | maxr = cur->bc_ops->get_maxrecs(cur, level); |
552 | minr = cur->bc_ops->get_minrecs(cur, level); |
553 | |
554 | /* |
555 | * If slack is negative, automatically set slack so that we load the |
556 | * btree block approximately halfway between minrecs and maxrecs. |
	 * Generally, this will net us 75% loading; for example, maxr = 100 and
	 * minr = 50 yield slack = 25 and blocks loaded with ~75 records each.
558 | */ |
559 | if (*slack < 0) |
560 | *slack = maxr - ((maxr + minr) >> 1); |
561 | |
562 | *slack = min(*slack, maxr - minr); |
563 | } |
564 | |
565 | /* |
566 | * Prepare a btree cursor for a bulk load operation by computing the geometry |
567 | * fields in bbl. Caller must ensure that the btree cursor is a staging |
568 | * cursor. This function can be called multiple times. |
569 | */ |
570 | int |
571 | xfs_btree_bload_compute_geometry( |
572 | struct xfs_btree_cur *cur, |
573 | struct xfs_btree_bload *bbl, |
574 | uint64_t nr_records) |
575 | { |
576 | uint64_t nr_blocks = 0; |
577 | uint64_t nr_this_level; |
578 | |
579 | ASSERT(cur->bc_flags & XFS_BTREE_STAGING); |
580 | |
581 | /* |
582 | * Make sure that the slack values make sense for traditional leaf and |
583 | * node blocks. Inode-rooted btrees will return different minrecs and |
	 * maxrecs values for the root block (level == bc_nlevels - 1).  We're
585 | * checking levels 0 and 1 here, so set bc_nlevels such that the btree |
586 | * code doesn't interpret either as the root level. |
587 | */ |
588 | cur->bc_nlevels = cur->bc_maxlevels - 1; |
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
591 | |
592 | bbl->nr_records = nr_this_level = nr_records; |
593 | for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) { |
594 | uint64_t level_blocks; |
595 | uint64_t dontcare64; |
596 | unsigned int level = cur->bc_nlevels - 1; |
597 | unsigned int avg_per_block; |
598 | |
599 | xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, |
600 | &avg_per_block, &level_blocks, &dontcare64); |
601 | |
602 | if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { |
603 | /* |
604 | * If all the items we want to store at this level |
605 | * would fit in the inode root block, then we have our |
606 | * btree root and are done. |
607 | * |
608 | * Note that bmap btrees forbid records in the root. |
609 | */ |
610 | if (level != 0 && nr_this_level <= avg_per_block) { |
611 | nr_blocks++; |
612 | break; |
613 | } |
614 | |
615 | /* |
616 | * Otherwise, we have to store all the items for this |
617 | * level in traditional btree blocks and therefore need |
618 | * another level of btree to point to those blocks. |
619 | * |
620 | * We have to re-compute the geometry for each level of |
621 | * an inode-rooted btree because the geometry differs |
622 | * between a btree root in an inode fork and a |
623 | * traditional btree block. |
624 | * |
625 | * This distinction is made in the btree code based on |
626 | * whether level == bc_nlevels - 1. Based on the |
627 | * previous root block size check against the root |
628 | * block geometry, we know that we aren't yet ready to |
			 * populate the root.  Increment bc_nlevels and
630 | * recalculate the geometry for a traditional |
631 | * block-based btree level. |
632 | */ |
633 | cur->bc_nlevels++; |
634 | ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); |
635 | xfs_btree_bload_level_geometry(cur, bbl, level, |
636 | nr_this_level, &avg_per_block, |
637 | &level_blocks, &dontcare64); |
638 | } else { |
639 | /* |
640 | * If all the items we want to store at this level |
641 | * would fit in a single root block, we're done. |
642 | */ |
643 | if (nr_this_level <= avg_per_block) { |
644 | nr_blocks++; |
645 | break; |
646 | } |
647 | |
648 | /* Otherwise, we need another level of btree. */ |
649 | cur->bc_nlevels++; |
650 | ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); |
651 | } |
652 | |
653 | nr_blocks += level_blocks; |
654 | nr_this_level = level_blocks; |
655 | } |
656 | |
657 | if (cur->bc_nlevels > cur->bc_maxlevels) |
658 | return -EOVERFLOW; |
659 | |
660 | bbl->btree_height = cur->bc_nlevels; |
661 | if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) |
662 | bbl->nr_blocks = nr_blocks - 1; |
663 | else |
664 | bbl->nr_blocks = nr_blocks; |
665 | return 0; |
666 | } |
667 | |
668 | /* Bulk load a btree given the parameters and geometry established in bbl. */ |
669 | int |
670 | xfs_btree_bload( |
671 | struct xfs_btree_cur *cur, |
672 | struct xfs_btree_bload *bbl, |
673 | void *priv) |
674 | { |
675 | struct list_head buffers_list; |
676 | union xfs_btree_ptr child_ptr; |
677 | union xfs_btree_ptr ptr; |
678 | struct xfs_buf *bp = NULL; |
679 | struct xfs_btree_block *block = NULL; |
680 | uint64_t nr_this_level = bbl->nr_records; |
681 | uint64_t blocks; |
682 | uint64_t i; |
683 | uint64_t blocks_with_extra; |
684 | uint64_t total_blocks = 0; |
685 | unsigned int avg_per_block; |
686 | unsigned int level = 0; |
687 | int ret; |
688 | |
689 | ASSERT(cur->bc_flags & XFS_BTREE_STAGING); |
690 | |
691 | INIT_LIST_HEAD(&buffers_list); |
692 | cur->bc_nlevels = bbl->btree_height; |
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);
695 | bbl->nr_dirty = 0; |
696 | |
697 | xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, |
698 | &avg_per_block, &blocks, &blocks_with_extra); |
699 | |
700 | /* Load each leaf block. */ |
701 | for (i = 0; i < blocks; i++) { |
702 | unsigned int nr_this_block = avg_per_block; |
703 | |
704 | /* |
705 | * Due to rounding, btree blocks will not be evenly populated |
706 | * in most cases. blocks_with_extra tells us how many blocks |
707 | * will receive an extra record to distribute the excess across |
708 | * the current level as evenly as possible. |
709 | */ |
710 | if (i < blocks_with_extra) |
711 | nr_this_block++; |
712 | |
713 | ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level, |
714 | nr_this_block, &ptr, &bp, &block, priv); |
715 | if (ret) |
716 | goto out; |
717 | |
718 | trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, |
719 | nr_this_block); |
720 | |
721 | ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records, |
722 | block, priv); |
723 | if (ret) |
724 | goto out; |
725 | |
726 | /* |
727 | * Record the leftmost leaf pointer so we know where to start |
728 | * with the first node level. |
729 | */ |
730 | if (i == 0) |
731 | xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); |
732 | } |
733 | total_blocks += blocks; |
734 | |
	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
736 | if (ret) |
737 | goto out; |
738 | |
739 | /* Populate the internal btree nodes. */ |
740 | for (level = 1; level < cur->bc_nlevels; level++) { |
741 | union xfs_btree_ptr first_ptr; |
742 | |
743 | nr_this_level = blocks; |
744 | block = NULL; |
		xfs_btree_set_ptr_null(cur, &ptr);
746 | |
747 | xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, |
748 | &avg_per_block, &blocks, &blocks_with_extra); |
749 | |
750 | /* Load each node block. */ |
751 | for (i = 0; i < blocks; i++) { |
752 | unsigned int nr_this_block = avg_per_block; |
753 | |
754 | if (i < blocks_with_extra) |
755 | nr_this_block++; |
756 | |
757 | ret = xfs_btree_bload_prep_block(cur, bbl, |
758 | &buffers_list, level, nr_this_block, |
759 | &ptr, &bp, &block, priv); |
760 | if (ret) |
761 | goto out; |
762 | |
763 | trace_xfs_btree_bload_block(cur, level, i, blocks, |
764 | &ptr, nr_this_block); |
765 | |
766 | ret = xfs_btree_bload_node(cur, nr_this_block, |
767 | &child_ptr, block); |
768 | if (ret) |
769 | goto out; |
770 | |
771 | /* |
772 | * Record the leftmost node pointer so that we know |
773 | * where to start the next node level above this one. |
774 | */ |
775 | if (i == 0) |
776 | xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); |
777 | } |
778 | total_blocks += blocks; |
779 | |
		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
781 | if (ret) |
782 | goto out; |
783 | |
		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
785 | } |
786 | |
787 | /* Initialize the new root. */ |
788 | if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { |
789 | ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); |
790 | cur->bc_ino.ifake->if_levels = cur->bc_nlevels; |
791 | cur->bc_ino.ifake->if_blocks = total_blocks - 1; |
792 | } else { |
793 | cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s); |
794 | cur->bc_ag.afake->af_levels = cur->bc_nlevels; |
795 | cur->bc_ag.afake->af_blocks = total_blocks; |
796 | } |
797 | |
798 | /* |
799 | * Write the new blocks to disk. If the ordered list isn't empty after |
800 | * that, then something went wrong and we have to fail. This should |
801 | * never happen, but we'll check anyway. |
802 | */ |
803 | ret = xfs_buf_delwri_submit(&buffers_list); |
804 | if (ret) |
805 | goto out; |
806 | if (!list_empty(&buffers_list)) { |
807 | ASSERT(list_empty(&buffers_list)); |
808 | ret = -EIO; |
809 | } |
810 | |
811 | out: |
812 | xfs_buf_delwri_cancel(&buffers_list); |
813 | if (bp) |
814 | xfs_buf_relse(bp); |
815 | return ret; |
816 | } |
817 | |