xfs_rmap_btree.c source code [linux/fs/xfs/libxfs/xfs_rmap_btree.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2014 Red Hat, Inc.
4	* All Rights Reserved.
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_log_format.h"
11	#include "xfs_trans_resv.h"
12	#include "xfs_mount.h"
13	#include "xfs_trans.h"
14	#include "xfs_alloc.h"
15	#include "xfs_btree.h"
16	#include "xfs_btree_staging.h"
17	#include "xfs_rmap.h"
18	#include "xfs_rmap_btree.h"
19	#include "xfs_health.h"
20	#include "xfs_trace.h"
21	#include "xfs_error.h"
22	#include "xfs_extent_busy.h"
23	#include "xfs_ag.h"
24	#include "xfs_ag_resv.h"
25	#include "xfs_buf_mem.h"
26	#include "xfs_btree_mem.h"
27
/*
 * Slab cache that rmap btree cursors are allocated from.  Created by
 * xfs_rmapbt_init_cur_cache() and released by xfs_rmapbt_destroy_cur_cache().
 */
static struct kmem_cache	*xfs_rmapbt_cur_cache;
29
30	/*
31	* Reverse map btree.
32	*
33	* This is a per-ag tree used to track the owner(s) of a given extent. With
34	* reflink it is possible for there to be multiple owners, which is a departure
35	* from classic XFS. Owner records for data extents are inserted when the
36	* extent is mapped and removed when an extent is unmapped. Owner records for
37	* all other block types (i.e. metadata) are inserted when an extent is
38	* allocated and removed when an extent is freed. There can only be one owner
39	* of a metadata extent, usually an inode or some other metadata structure like
40	* an AG btree.
41	*
42	* The rmap btree is part of the free space management, so blocks for the tree
43	* are sourced from the agfl. Hence we need transaction reservation support for
44	* this tree so that the freelist is always large enough. This also impacts on
45	* the minimum space we need to leave free in the AG.
46	*
47	* The tree is ordered by [ag block, owner, offset]. This is a large key size,
48	* but it is the only way to enforce unique keys when a block can be owned by
49	* multiple files at any offset. There's no need to order/search by extent
50	* size for online updating/management of the tree. It is intended that most
51	* reverse lookups will be to find the owner(s) of a particular block, or to
52	* try to recover tree and file data from corrupt primary metadata.
53	*/
54
55	static struct xfs_btree_cur *
56	xfs_rmapbt_dup_cursor(
57	struct xfs_btree_cur *cur)
58	{
59	return xfs_rmapbt_init_cursor(mp: cur->bc_mp, tp: cur->bc_tp,
60	bp: cur->bc_ag.agbp, pag: cur->bc_ag.pag);
61	}
62
63	STATIC void
64	xfs_rmapbt_set_root(
65	struct xfs_btree_cur *cur,
66	const union xfs_btree_ptr *ptr,
67	int inc)
68	{
69	struct xfs_buf *agbp = cur->bc_ag.agbp;
70	struct xfs_agf *agf = agbp->b_addr;
71
72	ASSERT(ptr->s != `0`);
73
74	agf->agf_rmap_root = ptr->s;
75	be32_add_cpu(&agf->agf_rmap_level, inc);
76	cur->bc_ag.pag->pagf_rmap_level += inc;
77
78	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS \| XFS_AGF_LEVELS);
79	}
80
81	STATIC int
82	xfs_rmapbt_alloc_block(
83	struct xfs_btree_cur *cur,
84	const union xfs_btree_ptr *start,
85	union xfs_btree_ptr *new,
86	int *stat)
87	{
88	struct xfs_buf *agbp = cur->bc_ag.agbp;
89	struct xfs_agf *agf = agbp->b_addr;
90	struct xfs_perag *pag = cur->bc_ag.pag;
91	int error;
92	xfs_agblock_t bno;
93
94	/ Allocate the new block from the freelist. If we can't, give up. /
95	error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
96	&bno, `1`);
97	if (error)
98	return error;
99	if (bno == NULLAGBLOCK) {
100	*stat = `0`;
101	return `0`;
102	}
103
104	xfs_extent_busy_reuse(cur->bc_mp, pag, bno, `1`, false);
105
106	new->s = cpu_to_be32(bno);
107	be32_add_cpu(&agf->agf_rmap_blocks, `1`);
108	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
109
110	xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);
111
112	*stat = `1`;
113	return `0`;
114	}
115
116	STATIC int
117	xfs_rmapbt_free_block(
118	struct xfs_btree_cur *cur,
119	struct xfs_buf *bp)
120	{
121	struct xfs_buf *agbp = cur->bc_ag.agbp;
122	struct xfs_agf *agf = agbp->b_addr;
123	struct xfs_perag *pag = cur->bc_ag.pag;
124	xfs_agblock_t bno;
125	int error;
126
127	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
128	be32_add_cpu(&agf->agf_rmap_blocks, -`1`);
129	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
130	error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, `1`);
131	if (error)
132	return error;
133
134	xfs_extent_busy_insert(cur->bc_tp, pag, bno, `1`,
135	XFS_EXTENT_BUSY_SKIP_DISCARD);
136
137	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, `1`);
138	return `0`;
139	}
140
141	STATIC int
142	xfs_rmapbt_get_minrecs(
143	struct xfs_btree_cur *cur,
144	int level)
145	{
146	return cur->bc_mp->m_rmap_mnr[level != `0`];
147	}
148
149	STATIC int
150	xfs_rmapbt_get_maxrecs(
151	struct xfs_btree_cur *cur,
152	int level)
153	{
154	return cur->bc_mp->m_rmap_mxr[level != `0`];
155	}
156
157	/*
158	* Convert the ondisk record's offset field into the ondisk key's offset field.
159	* Fork and bmbt are significant parts of the rmap record key, but written
160	* status is merely a record attribute.
161	*/
162	static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
163	{
164	return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
165	}
166
167	STATIC void
168	xfs_rmapbt_init_key_from_rec(
169	union xfs_btree_key *key,
170	const union xfs_btree_rec *rec)
171	{
172	key->rmap.rm_startblock = rec->rmap.rm_startblock;
173	key->rmap.rm_owner = rec->rmap.rm_owner;
174	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
175	}
176
177	/*
178	* The high key for a reverse mapping record can be computed by shifting
179	* the startblock and offset to the highest value that would still map
180	* to that record. In practice this means that we add blockcount-1 to
181	* the startblock for all records, and if the record is for a data/attr
182	* fork mapping, we add blockcount-1 to the offset too.
183	*/
184	STATIC void
185	xfs_rmapbt_init_high_key_from_rec(
186	union xfs_btree_key *key,
187	const union xfs_btree_rec *rec)
188	{
189	uint64_t off;
190	int adj;
191
192	adj = be32_to_cpu(rec->rmap.rm_blockcount) - `1`;
193
194	key->rmap.rm_startblock = rec->rmap.rm_startblock;
195	be32_add_cpu(&key->rmap.rm_startblock, adj);
196	key->rmap.rm_owner = rec->rmap.rm_owner;
197	key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
198	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) \|\|
199	XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
200	return;
201	off = be64_to_cpu(key->rmap.rm_offset);
202	off = (XFS_RMAP_OFF(off) + adj) \| (off & ~XFS_RMAP_OFF_MASK);
203	key->rmap.rm_offset = cpu_to_be64(off);
204	}
205
206	STATIC void
207	xfs_rmapbt_init_rec_from_cur(
208	struct xfs_btree_cur *cur,
209	union xfs_btree_rec *rec)
210	{
211	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
212	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
213	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
214	rec->rmap.rm_offset = cpu_to_be64(
215	xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
216	}
217
218	STATIC void
219	xfs_rmapbt_init_ptr_from_cur(
220	struct xfs_btree_cur *cur,
221	union xfs_btree_ptr *ptr)
222	{
223	struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
224
225	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
226
227	ptr->s = agf->agf_rmap_root;
228	}
229
230	/*
231	* Mask the appropriate parts of the ondisk key field for a key comparison.
232	* Fork and bmbt are significant parts of the rmap record key, but written
233	* status is merely a record attribute.
234	*/
235	static inline uint64_t offset_keymask(uint64_t offset)
236	{
237	return offset & ~XFS_RMAP_OFF_UNWRITTEN;
238	}
239
240	STATIC int64_t
241	xfs_rmapbt_key_diff(
242	struct xfs_btree_cur *cur,
243	const union xfs_btree_key *key)
244	{
245	struct xfs_rmap_irec *rec = &cur->bc_rec.r;
246	const struct xfs_rmap_key *kp = &key->rmap;
247	__u64 x, y;
248	int64_t d;
249
250	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
251	if (d)
252	return d;
253
254	x = be64_to_cpu(kp->rm_owner);
255	y = rec->rm_owner;
256	if (x > y)
257	return `1`;
258	else if (y > x)
259	return -`1`;
260
261	x = offset_keymask(be64_to_cpu(kp->rm_offset));
262	y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
263	if (x > y)
264	return `1`;
265	else if (y > x)
266	return -`1`;
267	return `0`;
268	}
269
270	STATIC int64_t
271	xfs_rmapbt_diff_two_keys(
272	struct xfs_btree_cur *cur,
273	const union xfs_btree_key *k1,
274	const union xfs_btree_key *k2,
275	const union xfs_btree_key *mask)
276	{
277	const struct xfs_rmap_key *kp1 = &k1->rmap;
278	const struct xfs_rmap_key *kp2 = &k2->rmap;
279	int64_t d;
280	__u64 x, y;
281
282	/ Doesn't make sense to mask off the physical space part /
283	ASSERT(!mask \|\| mask->rmap.rm_startblock);
284
285	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
286	be32_to_cpu(kp2->rm_startblock);
287	if (d)
288	return d;
289
290	if (!mask \|\| mask->rmap.rm_owner) {
291	x = be64_to_cpu(kp1->rm_owner);
292	y = be64_to_cpu(kp2->rm_owner);
293	if (x > y)
294	return `1`;
295	else if (y > x)
296	return -`1`;
297	}
298
299	if (!mask \|\| mask->rmap.rm_offset) {
300	/ Doesn't make sense to allow offset but not owner /
301	ASSERT(!mask \|\| mask->rmap.rm_owner);
302
303	x = offset_keymask(be64_to_cpu(kp1->rm_offset));
304	y = offset_keymask(be64_to_cpu(kp2->rm_offset));
305	if (x > y)
306	return `1`;
307	else if (y > x)
308	return -`1`;
309	}
310
311	return `0`;
312	}
313
314	static xfs_failaddr_t
315	xfs_rmapbt_verify(
316	struct xfs_buf *bp)
317	{
318	struct xfs_mount *mp = bp->b_mount;
319	struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
320	struct xfs_perag *pag = bp->b_pag;
321	xfs_failaddr_t fa;
322	unsigned int level;
323
324	/*
325	* magic number and level verification
326	*
327	* During growfs operations, we can't verify the exact level or owner as
328	* the perag is not fully initialised and hence not attached to the
329	* buffer. In this case, check against the maximum tree depth.
330	*
331	* Similarly, during log recovery we will have a perag structure
332	* attached, but the agf information will not yet have been initialised
333	* from the on disk AGF. Again, we can only check against maximum limits
334	* in this case.
335	*/
336	if (!xfs_verify_magic(bp, block->bb_magic))
337	return __this_address;
338
339	if (!xfs_has_rmapbt(mp))
340	return __this_address;
341	fa = xfs_btree_agblock_v5hdr_verify(bp);
342	if (fa)
343	return fa;
344
345	level = be16_to_cpu(block->bb_level);
346	if (pag && xfs_perag_initialised_agf(pag)) {
347	unsigned int maxlevel = pag->pagf_rmap_level;
348
349	#ifdef CONFIG_XFS_ONLINE_REPAIR
350	/*
351	* Online repair could be rewriting the free space btrees, so
352	* we'll validate against the larger of either tree while this
353	* is going on.
354	*/
355	maxlevel = max_t(unsigned int, maxlevel,
356	pag->pagf_repair_rmap_level);
357	#endif
358	if (level >= maxlevel)
359	return __this_address;
360	} else if (level >= mp->m_rmap_maxlevels)
361	return __this_address;
362
363	return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != `0`]);
364	}
365
366	static void
367	xfs_rmapbt_read_verify(
368	struct xfs_buf *bp)
369	{
370	xfs_failaddr_t fa;
371
372	if (!xfs_btree_agblock_verify_crc(bp))
373	xfs_verifier_error(bp, -EFSBADCRC, __this_address);
374	else {
375	fa = xfs_rmapbt_verify(bp);
376	if (fa)
377	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
378	}
379
380	if (bp->b_error)
381	trace_xfs_btree_corrupt(bp, _RET_IP_);
382	}
383
384	static void
385	xfs_rmapbt_write_verify(
386	struct xfs_buf *bp)
387	{
388	xfs_failaddr_t fa;
389
390	fa = xfs_rmapbt_verify(bp);
391	if (fa) {
392	trace_xfs_btree_corrupt(bp, _RET_IP_);
393	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
394	return;
395	}
396	xfs_btree_agblock_calc_crc(bp);
397
398	}
399
400	const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
401	.name = "xfs_rmapbt",
402	.magic = { `0`, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
403	.verify_read = xfs_rmapbt_read_verify,
404	.verify_write = xfs_rmapbt_write_verify,
405	.verify_struct = xfs_rmapbt_verify,
406	};
407
408	STATIC int
409	xfs_rmapbt_keys_inorder(
410	struct xfs_btree_cur *cur,
411	const union xfs_btree_key *k1,
412	const union xfs_btree_key *k2)
413	{
414	uint32_t x;
415	uint32_t y;
416	uint64_t a;
417	uint64_t b;
418
419	x = be32_to_cpu(k1->rmap.rm_startblock);
420	y = be32_to_cpu(k2->rmap.rm_startblock);
421	if (x < y)
422	return `1`;
423	else if (x > y)
424	return `0`;
425	a = be64_to_cpu(k1->rmap.rm_owner);
426	b = be64_to_cpu(k2->rmap.rm_owner);
427	if (a < b)
428	return `1`;
429	else if (a > b)
430	return `0`;
431	a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
432	b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
433	if (a <= b)
434	return `1`;
435	return `0`;
436	}
437
438	STATIC int
439	xfs_rmapbt_recs_inorder(
440	struct xfs_btree_cur *cur,
441	const union xfs_btree_rec *r1,
442	const union xfs_btree_rec *r2)
443	{
444	uint32_t x;
445	uint32_t y;
446	uint64_t a;
447	uint64_t b;
448
449	x = be32_to_cpu(r1->rmap.rm_startblock);
450	y = be32_to_cpu(r2->rmap.rm_startblock);
451	if (x < y)
452	return `1`;
453	else if (x > y)
454	return `0`;
455	a = be64_to_cpu(r1->rmap.rm_owner);
456	b = be64_to_cpu(r2->rmap.rm_owner);
457	if (a < b)
458	return `1`;
459	else if (a > b)
460	return `0`;
461	a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
462	b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
463	if (a <= b)
464	return `1`;
465	return `0`;
466	}
467
468	STATIC enum xbtree_key_contig
469	xfs_rmapbt_keys_contiguous(
470	struct xfs_btree_cur *cur,
471	const union xfs_btree_key *key1,
472	const union xfs_btree_key *key2,
473	const union xfs_btree_key *mask)
474	{
475	ASSERT(!mask \|\| mask->rmap.rm_startblock);
476
477	/*
478	* We only support checking contiguity of the physical space component.
479	* If any callers ever need more specificity than that, they'll have to
480	* implement it here.
481	*/
482	ASSERT(!mask \|\| (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
483
484	return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
485	be32_to_cpu(key2->rmap.rm_startblock));
486	}
487
488	const struct xfs_btree_ops xfs_rmapbt_ops = {
489	.name = "rmap",
490	.type = XFS_BTREE_TYPE_AG,
491	.geom_flags = XFS_BTGEO_OVERLAPPING,
492
493	.rec_len = sizeof(struct xfs_rmap_rec),
494	/ Overlapping btree; 2 keys per pointer. /
495	.key_len = `2` * sizeof(struct xfs_rmap_key),
496	.ptr_len = XFS_BTREE_SHORT_PTR_LEN,
497
498	.lru_refs = XFS_RMAP_BTREE_REF,
499	.statoff = XFS_STATS_CALC_INDEX(xs_rmap_2),
500	.sick_mask = XFS_SICK_AG_RMAPBT,
501
502	.dup_cursor = xfs_rmapbt_dup_cursor,
503	.set_root = xfs_rmapbt_set_root,
504	.alloc_block = xfs_rmapbt_alloc_block,
505	.free_block = xfs_rmapbt_free_block,
506	.get_minrecs = xfs_rmapbt_get_minrecs,
507	.get_maxrecs = xfs_rmapbt_get_maxrecs,
508	.init_key_from_rec = xfs_rmapbt_init_key_from_rec,
509	.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
510	.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
511	.init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
512	.key_diff = xfs_rmapbt_key_diff,
513	.buf_ops = &xfs_rmapbt_buf_ops,
514	.diff_two_keys = xfs_rmapbt_diff_two_keys,
515	.keys_inorder = xfs_rmapbt_keys_inorder,
516	.recs_inorder = xfs_rmapbt_recs_inorder,
517	.keys_contiguous = xfs_rmapbt_keys_contiguous,
518	};
519
520	/*
521	* Create a new reverse mapping btree cursor.
522	*
523	* For staging cursors tp and agbp are NULL.
524	*/
525	struct xfs_btree_cur *
526	xfs_rmapbt_init_cursor(
527	struct xfs_mount *mp,
528	struct xfs_trans *tp,
529	struct xfs_buf *agbp,
530	struct xfs_perag *pag)
531	{
532	struct xfs_btree_cur *cur;
533
534	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
535	mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
536	cur->bc_ag.pag = xfs_perag_hold(pag);
537	cur->bc_ag.agbp = agbp;
538	if (agbp) {
539	struct xfs_agf *agf = agbp->b_addr;
540
541	cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
542	}
543	return cur;
544	}
545
546	#ifdef CONFIG_XFS_BTREE_IN_MEM
547	static inline unsigned int
548	xfs_rmapbt_mem_block_maxrecs(
549	unsigned int blocklen,
550	bool leaf)
551	{
552	if (leaf)
553	return blocklen / sizeof(struct xfs_rmap_rec);
554	return blocklen /
555	(`2` * sizeof(struct xfs_rmap_key) + sizeof(__be64));
556	}
557
558	/*
559	* Validate an in-memory rmap btree block. Callers are allowed to generate an
560	* in-memory btree even if the ondisk feature is not enabled.
561	*/
562	static xfs_failaddr_t
563	xfs_rmapbt_mem_verify(
564	struct xfs_buf *bp)
565	{
566	struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
567	xfs_failaddr_t fa;
568	unsigned int level;
569	unsigned int maxrecs;
570
571	if (!xfs_verify_magic(bp, block->bb_magic))
572	return __this_address;
573
574	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
575	if (fa)
576	return fa;
577
578	level = be16_to_cpu(block->bb_level);
579	if (level >= xfs_rmapbt_maxlevels_ondisk())
580	return __this_address;
581
582	maxrecs = xfs_rmapbt_mem_block_maxrecs(
583	XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == `0`);
584	return xfs_btree_memblock_verify(bp, maxrecs);
585	}
586
587	static void
588	xfs_rmapbt_mem_rw_verify(
589	struct xfs_buf *bp)
590	{
591	xfs_failaddr_t fa = xfs_rmapbt_mem_verify(bp);
592
593	if (fa)
594	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
595	}
596
597	/ skip crc checks on in-memory btrees to save time /
598	static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
599	.name = "xfs_rmapbt_mem",
600	.magic = { `0`, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
601	.verify_read = xfs_rmapbt_mem_rw_verify,
602	.verify_write = xfs_rmapbt_mem_rw_verify,
603	.verify_struct = xfs_rmapbt_mem_verify,
604	};
605
606	const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
607	.name = "mem_rmap",
608	.type = XFS_BTREE_TYPE_MEM,
609	.geom_flags = XFS_BTGEO_OVERLAPPING,
610
611	.rec_len = sizeof(struct xfs_rmap_rec),
612	/ Overlapping btree; 2 keys per pointer. /
613	.key_len = `2` * sizeof(struct xfs_rmap_key),
614	.ptr_len = XFS_BTREE_LONG_PTR_LEN,
615
616	.lru_refs = XFS_RMAP_BTREE_REF,
617	.statoff = XFS_STATS_CALC_INDEX(xs_rmap_mem_2),
618
619	.dup_cursor = xfbtree_dup_cursor,
620	.set_root = xfbtree_set_root,
621	.alloc_block = xfbtree_alloc_block,
622	.free_block = xfbtree_free_block,
623	.get_minrecs = xfbtree_get_minrecs,
624	.get_maxrecs = xfbtree_get_maxrecs,
625	.init_key_from_rec = xfs_rmapbt_init_key_from_rec,
626	.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
627	.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
628	.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
629	.key_diff = xfs_rmapbt_key_diff,
630	.buf_ops = &xfs_rmapbt_mem_buf_ops,
631	.diff_two_keys = xfs_rmapbt_diff_two_keys,
632	.keys_inorder = xfs_rmapbt_keys_inorder,
633	.recs_inorder = xfs_rmapbt_recs_inorder,
634	.keys_contiguous = xfs_rmapbt_keys_contiguous,
635	};
636
637	/ Create a cursor for an in-memory btree. /
638	struct xfs_btree_cur *
639	xfs_rmapbt_mem_cursor(
640	struct xfs_perag *pag,
641	struct xfs_trans *tp,
642	struct xfbtree *xfbt)
643	{
644	struct xfs_btree_cur *cur;
645	struct xfs_mount *mp = pag->pag_mount;
646
647	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
648	xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
649	cur->bc_mem.xfbtree = xfbt;
650	cur->bc_nlevels = xfbt->nlevels;
651
652	cur->bc_mem.pag = xfs_perag_hold(pag);
653	return cur;
654	}
655
656	/ Create an in-memory rmap btree. /
657	int
658	xfs_rmapbt_mem_init(
659	struct xfs_mount *mp,
660	struct xfbtree *xfbt,
661	struct xfs_buftarg *btp,
662	xfs_agnumber_t agno)
663	{
664	xfbt->owner = agno;
665	return xfbtree_init(mp, xfbt, btp, ops: &xfs_rmapbt_mem_ops);
666	}
667
668	/ Compute the max possible height for reverse mapping btrees in memory. /
669	static unsigned int
670	xfs_rmapbt_mem_maxlevels(void)
671	{
672	unsigned int minrecs[`2`];
673	unsigned int blocklen;
674
675	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
676
677	minrecs[`0`] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / `2`;
678	minrecs[`1`] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / `2`;
679
680	/*
681	* How tall can an in-memory rmap btree become if we filled the entire
682	* AG with rmap records?
683	*/
684	return xfs_btree_compute_maxlevels(limits: minrecs,
685	XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
686	}
687	#else
688	# define xfs_rmapbt_mem_maxlevels() (0)
689	#endif /* CONFIG_XFS_BTREE_IN_MEM */
690
691	/*
692	* Install a new reverse mapping btree root. Caller is responsible for
693	* invalidating and freeing the old btree blocks.
694	*/
695	void
696	xfs_rmapbt_commit_staged_btree(
697	struct xfs_btree_cur *cur,
698	struct xfs_trans *tp,
699	struct xfs_buf *agbp)
700	{
701	struct xfs_agf *agf = agbp->b_addr;
702	struct xbtree_afakeroot *afake = cur->bc_ag.afake;
703
704	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
705
706	agf->agf_rmap_root = cpu_to_be32(afake->af_root);
707	agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
708	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
709	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS \| XFS_AGF_LEVELS \|
710	XFS_AGF_RMAP_BLOCKS);
711	xfs_btree_commit_afakeroot(cur, tp, agbp);
712	}
713
714	/ Calculate number of records in a reverse mapping btree block. /
715	static inline unsigned int
716	xfs_rmapbt_block_maxrecs(
717	unsigned int blocklen,
718	bool leaf)
719	{
720	if (leaf)
721	return blocklen / sizeof(struct xfs_rmap_rec);
722	return blocklen /
723	(`2` * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
724	}
725
726	/*
727	* Calculate number of records in an rmap btree block.
728	*/
729	int
730	xfs_rmapbt_maxrecs(
731	int blocklen,
732	int leaf)
733	{
734	blocklen -= XFS_RMAP_BLOCK_LEN;
735	return xfs_rmapbt_block_maxrecs(blocklen, leaf);
736	}
737
738	/ Compute the max possible height for reverse mapping btrees. /
739	unsigned int
740	xfs_rmapbt_maxlevels_ondisk(void)
741	{
742	unsigned int minrecs[`2`];
743	unsigned int blocklen;
744
745	blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
746
747	minrecs[`0`] = xfs_rmapbt_block_maxrecs(blocklen, true) / `2`;
748	minrecs[`1`] = xfs_rmapbt_block_maxrecs(blocklen, false) / `2`;
749
750	/*
751	* Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
752	*
753	* On a reflink filesystem, each AG block can have up to 2^32 (per the
754	* refcount record format) owners, which means that theoretically we
755	* could face up to 2^64 rmap records. However, we're likely to run
756	* out of blocks in the AG long before that happens, which means that
757	* we must compute the max height based on what the btree will look
758	* like if it consumes almost all the blocks in the AG due to maximal
759	* sharing factor.
760	*/
761	return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
762	xfs_rmapbt_mem_maxlevels());
763	}
764
765	/ Compute the maximum height of an rmap btree. /
766	void
767	xfs_rmapbt_compute_maxlevels(
768	struct xfs_mount *mp)
769	{
770	if (!xfs_has_rmapbt(mp)) {
771	mp->m_rmap_maxlevels = `0`;
772	return;
773	}
774
775	if (xfs_has_reflink(mp)) {
776	/*
777	* Compute the asymptotic maxlevels for an rmap btree on a
778	* filesystem that supports reflink.
779	*
780	* On a reflink filesystem, each AG block can have up to 2^32
781	* (per the refcount record format) owners, which means that
782	* theoretically we could face up to 2^64 rmap records.
783	* However, we're likely to run out of blocks in the AG long
784	* before that happens, which means that we must compute the
785	* max height based on what the btree will look like if it
786	* consumes almost all the blocks in the AG due to maximal
787	* sharing factor.
788	*/
789	mp->m_rmap_maxlevels = xfs_btree_space_to_height(limits: mp->m_rmap_mnr,
790	blocks: mp->m_sb.sb_agblocks);
791	} else {
792	/*
793	* If there's no block sharing, compute the maximum rmapbt
794	* height assuming one rmap record per AG block.
795	*/
796	mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
797	limits: mp->m_rmap_mnr, records: mp->m_sb.sb_agblocks);
798	}
799	ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
800	}
801
802	/ Calculate the refcount btree size for some records. /
803	xfs_extlen_t
804	xfs_rmapbt_calc_size(
805	struct xfs_mount *mp,
806	unsigned long long len)
807	{
808	return xfs_btree_calc_size(limits: mp->m_rmap_mnr, records: len);
809	}
810
811	/*
812	* Calculate the maximum refcount btree size.
813	*/
814	xfs_extlen_t
815	xfs_rmapbt_max_size(
816	struct xfs_mount *mp,
817	xfs_agblock_t agblocks)
818	{
819	/ Bail out if we're uninitialized, which can happen in mkfs. /
820	if (mp->m_rmap_mxr[`0`] == `0`)
821	return `0`;
822
823	return xfs_rmapbt_calc_size(mp, agblocks);
824	}
825
826	/*
827	* Figure out how many blocks to reserve and how many are used by this btree.
828	*/
829	int
830	xfs_rmapbt_calc_reserves(
831	struct xfs_mount *mp,
832	struct xfs_trans *tp,
833	struct xfs_perag *pag,
834	xfs_extlen_t *ask,
835	xfs_extlen_t *used)
836	{
837	struct xfs_buf *agbp;
838	struct xfs_agf *agf;
839	xfs_agblock_t agblocks;
840	xfs_extlen_t tree_len;
841	int error;
842
843	if (!xfs_has_rmapbt(mp))
844	return `0`;
845
846	error = xfs_alloc_read_agf(pag, tp, flags: `0`, agfbpp: &agbp);
847	if (error)
848	return error;
849
850	agf = agbp->b_addr;
851	agblocks = be32_to_cpu(agf->agf_length);
852	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
853	xfs_trans_brelse(tp, agbp);
854
855	/*
856	* The log is permanently allocated, so the space it occupies will
857	* never be available for the kinds of things that would require btree
858	* expansion. We therefore can pretend the space isn't there.
859	*/
860	if (xfs_ag_contains_log(mp, pag->pag_agno))
861	agblocks -= mp->m_sb.sb_logblocks;
862
863	/ Reserve 1% of the AG or enough for 1 block per record. /
864	*ask += max(agblocks / `100`, xfs_rmapbt_max_size(mp, agblocks));
865	*used += tree_len;
866
867	return error;
868	}
869
870	int __init
871	xfs_rmapbt_init_cur_cache(void)
872	{
873	xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
874	xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
875	`0`, `0`, NULL);
876
877	if (!xfs_rmapbt_cur_cache)
878	return -ENOMEM;
879	return `0`;
880	}
881
882	void
883	xfs_rmapbt_destroy_cur_cache(void)
884	{
885	kmem_cache_destroy(xfs_rmapbt_cur_cache);
886	xfs_rmapbt_cur_cache = NULL;
887	}
888

source code of linux/fs/xfs/libxfs/xfs_rmap_btree.c