// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

static struct kmem_cache *btrfs_path_cachep;

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32]  = { .size = 4,  .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8,  .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

/*
 * The leaf data grows from end-to-front in the node. This returns the address
 * of the start of the last item, which is the stop of the leaf data stack.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
	u32 nr = btrfs_header_nritems(leaf);

	if (nr == 0)
		return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
	return btrfs_item_offset(leaf, nr - 1);
}

/*
 * Move data in a @leaf (using memmove, safe for overlapping ranges).
 *
 * @leaf:	leaf that we're doing a memmove on
 * @dst_offset:	item data offset we're moving to
 * @src_offset:	item data offset we're moving from
 * @len:	length of the data we're moving
 *
 * Wrapper around memmove_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
				     unsigned long dst_offset,
				     unsigned long src_offset,
				     unsigned long len)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
			      btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}

/*
 * Copy item data from @src into @dst at the given offsets.
 *
 * @dst:	destination leaf that we're copying into
 * @src:	source leaf that we're copying from
 * @dst_offset:	item data offset we're copying to
 * @src_offset:	item data offset we're copying from
 * @len:	length of the data we're copying
 *
 * Wrapper around copy_extent_buffer() that takes into account the header on
 * the leaf. The btrfs_item offsets start directly after the header, so we
 * have to adjust any offsets to account for the header in the leaf. This
 * handles that math to simplify the callers.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
				  const struct extent_buffer *src,
				  unsigned long dst_offset,
				  unsigned long src_offset, unsigned long len)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
			   btrfs_item_nr_offset(src, 0) + src_offset, len);
}

/*
 * Move items in a @leaf (using memmove).
 *
 * @leaf:	leaf that we're moving items in
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around memmove_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
				      int dst_item, int src_item, int nr_items)
{
	memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
			      btrfs_item_nr_offset(leaf, src_item),
			      nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy items from @src into @dst at the given item numbers.
 *
 * @dst:	destination leaf for the items
 * @src:	source leaf for the items
 * @dst_item:	the item nr we're copying into
 * @src_item:	the item nr we're copying from
 * @nr_items:	the number of items to copy
 *
 * Wrapper around copy_extent_buffer() that does the math to get the
 * appropriate offsets into the leaf from the item numbers.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
				   const struct extent_buffer *src,
				   int dst_item, int src_item, int nr_items)
{
	copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
			   btrfs_item_nr_offset(src, src_item),
			   nr_items * sizeof(struct btrfs_item));
}

/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
	return btrfs_csums[type].size;
}

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csum_type_size(t);
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

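/*
 * Allocate a new path, with the btrfs_path fields zeroed. May sleep due to
 * the GFP_NOFS slab allocation.
 */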
struct btrfs_path *btrfs_alloc_path(void)
{
	might_sleep();

	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * We want the transaction abort to print stack trace only for errors where the
 * cause could be a bug, e.g. due to ENOSPC, and not for common errors that are
 * caused by external factors.
 */
bool __cold abort_should_print_stack(int error)
{
	switch (error) {
	case -EIO:
	case -EROFS:
	case -ENOMEM:
		return false;
	}
	return true;
}

/*
 * safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/*
 * COW-only roots (not-shareable trees, everything that is not a subvolume or
 * reloc root) just get put onto a simple dirty list. The transaction walks
 * this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root,
		    struct extent_buffer *buf,
		    struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;
	u64 reloc_src_root = 0;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		reloc_src_root = btrfs_header_owner(buf);
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     reloc_src_root, BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

/*
 * check if the tree block can be shared by multiple trees
 */
bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct extent_buffer *buf)
{
	const u64 buf_gen = btrfs_header_generation(buf);

	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	if (buf == root->node)
		return false;

	if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
		return false;

	if (buf != root->commit_root)
		return true;

	/*
	 * An extent buffer that used to be the commit root may still be shared
	 * because the tree height may have increased and it became a child of a
	 * higher level root. This can happen when snapshotting a subvolume
	 * created in the current transaction.
	 */
	if (buf_gen == trans->transid)
		return true;

	return false;
}

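/*
 * Update the extent backrefs when COWing @buf into @cow, following the rules
 * described in the comment below. Sets *last_ref to 1 when the COW drops the
 * last reference to @buf, so the caller knows the old block can be freed.
 */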
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(trans, root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags, NULL);
		if (ret)
			return ret;
		if (unlikely(refs == 0)) {
			btrfs_crit(fs_info,
		"found 0 references for tree block at bytenr %llu level %d root %llu",
				   buf->start, btrfs_header_level(buf),
				   btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clear_buffer_dirty(trans, buf);
		*last_ref = 1;
	}
	return 0;
}

/*
 * Does the dirty work of COWing a single block. The parent block (if
 * supplied) is updated to point to the new COW copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more COW. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *buf,
			  struct extent_buffer *parent, int parent_slot,
			  struct extent_buffer **cow_ret,
			  u64 search_start, u64 empty_size,
			  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;
	u64 reloc_src_root = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_write_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		reloc_src_root = btrfs_header_owner(buf);
	}
	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, reloc_src_root, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		if (ret < 0) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		atomic_inc(&cow->refs);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
		btrfs_set_node_blockptr(parent, parent_slot, cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(trans, parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(trans, cow);
	*cow_ret = cow;
	return 0;
}

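/*
 * Return 1 if @buf has to be COWed before it can be modified in the current
 * transaction, 0 if it can be modified in place.
 */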
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * COWs a single block, see btrfs_force_cow_block() for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
		   "attempt to COW block %llu on root %llu that is being deleted",
			   buf->start, btrfs_root_id(root));
		return -EUCLEAN;
	}

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
			   buf->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	search_start = round_down(buf->start, SZ_1G);

	/*
	 * Before COWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also, we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
				    cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * Same as btrfs_comp_keys(), but taking two CPU keys (struct btrfs_key)
 * instead of a disk key and a CPU key.
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer. Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
		     const struct btrfs_key *key, int *slot)
{
	unsigned long p;
	int item_size;
	/*
	 * Use unsigned types for the low and high slots, so that we get a more
	 * efficient division in the search loop below.
	 */
	u32 low = first_slot;
	u32 high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (unlikely(low > high)) {
		btrfs_err(eb->fs_info,
		 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	if (btrfs_header_level(eb) == 0) {
		p = offsetof(struct btrfs_leaf, items);
		item_size = sizeof(struct btrfs_item);
	} else {
		p = offsetof(struct btrfs_node, ptrs);
		item_size = sizeof(struct btrfs_key_ptr);
	}

	while (low < high) {
		const int unit_size = eb->folio_size;
		unsigned long oil;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oil = get_eb_offset_in_folio(eb, offset);

		if (oil + key_size <= unit_size) {
			const unsigned long idx = get_eb_folio_index(eb, offset);
			char *kaddr = folio_address(eb->folios[idx]);

			oil = get_eb_offset_in_folio(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oil);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = btrfs_comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

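/* Account one extra tree block (nodesize bytes) of usage to the root item. */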
static void root_add_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

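/* Remove one tree block (nodesize bytes) of usage from the root item. */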
static void root_sub_used_bytes(struct btrfs_root *root)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to. The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_tree_parent_check check = { 0 };
	struct extent_buffer *eb;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	ASSERT(level);

	check.level = level - 1;
	check.transid = btrfs_node_ptr_generation(parent, slot);
	check.owner_root = btrfs_header_owner(parent);
	check.has_first_key = true;
	btrfs_node_key_to_cpu(parent, &check.first_key, slot);

	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     &check);
	if (IS_ERR(eb))
		return eb;
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion. We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			goto out;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto out;
		}

		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		if (ret < 0) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used_bytes(root);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	if (pslot) {
		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left)) {
			ret = PTR_ERR(left);
			left = NULL;
			goto out;
		}

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	if (pslot + 1 < btrfs_header_nritems(parent)) {
		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right)) {
			ret = PTR_ERR(right);
			right = NULL;
			goto out;
		}

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto out;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the rightmost buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clear_buffer_dirty(trans, right);
			btrfs_tree_unlock(right);
			ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
			if (ret < 0) {
				free_extent_buffer_stale(right);
				right = NULL;
				goto out;
			}
			root_sub_used_bytes(root);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;

			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete. A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right.
		 */
		if (unlikely(!left)) {
			btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
				   parent->start, btrfs_header_level(parent),
				   mid->start, btrfs_root_id(root));
			ret = -EUCLEAN;
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto out;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clear_buffer_dirty(trans, mid);
		btrfs_tree_unlock(mid);
		ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
		if (ret < 0) {
			free_extent_buffer_stale(mid);
			mid = NULL;
			goto out;
		}
		root_sub_used_bytes(root);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;

		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
						    BTRFS_MOD_LOG_KEY_REPLACE);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(trans, parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
out:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion. Here we only split or push nodes around
 * when they are completely full. This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	/* first, try to make some room in the middle buffer */
	if (pslot) {
		u32 left_nr;

		left = btrfs_read_node_slot(parent, pslot - 1);
		if (IS_ERR(left))
			return PTR_ERR(left);

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(trans, parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -= btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}

	/*
	 * then try to empty the rightmost buffer into the middle
	 */
	if (pslot + 1 < btrfs_header_nritems(parent)) {
		u32 right_nr;

		right = btrfs_read_node_slot(parent, pslot + 1);
		if (IS_ERR(right))
			return PTR_ERR(right);

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE);
			if (ret < 0) {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(trans, parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering readahead on them.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		if (nread > nread_max || nscan > 32)
			break;
	}
}

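/*
 * Readahead the two siblings (slot - 1 and slot + 1 in the parent) of the
 * node the path points to at @level, in preparation for balancing that node.
 */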
static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
	struct extent_buffer *parent;
	int slot;
	int nritems;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0)
		btrfs_readahead_node_child(parent, slot - 1);
	if (slot + 1 < nritems)
		btrfs_readahead_node_child(parent, slot + 1);
}

/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree. The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block. This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock, so
 * if lowest_unlock is 1, level 0 won't be unlocked.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	bool check_skip = true;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;

		if (check_skip) {
			if (path->slots[i] == 0) {
				skip_level = i + 1;
				continue;
			}

			if (path->keep_locks) {
				u32 nritems;

				nritems = btrfs_header_nritems(path->nodes[i]);
				if (nritems < 1 || path->slots[i] >= nritems - 1) {
					skip_level = i + 1;
					continue;
				}
			}
		}

		if (i >= lowest_unlock && i > skip_level) {
			check_skip = false;
			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	int ret;
	int parent_level;
	bool unlock_up;

	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
	check.has_first_key = true;
	check.level = parent_level - 1;
	check.transid = gen;
	check.owner_root = root->root_key.objectid;

	/*
	 * If we need to read an extent buffer from disk and we are holding locks
	 * on upper level nodes, we unlock all the upper nodes before reading the
	 * extent buffer, and then return -EAGAIN to the caller as it needs to
	 * restart the search. We don't release the lock on the current level
	 * because we need to walk this node to figure out which blocks to read.
	 */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &check.first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		if (p->nowait) {
			free_extent_buffer(tmp);
			return -EAGAIN;
		}

		if (unlock_up)
			btrfs_unlock_up_safe(p, level + 1);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_extent_buffer(tmp, &check);
		if (ret) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
		if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EUCLEAN;
		}

		if (unlock_up)
			ret = -EAGAIN;

		goto out;
	} else if (p->nowait) {
		return -EAGAIN;
	}

	if (unlock_up) {
		btrfs_unlock_up_safe(p, level + 1);
		ret = -EAGAIN;
	} else {
		ret = 0;
	}

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	tmp = read_tree_block(fs_info, blocknr, &check);
	if (IS_ERR(tmp)) {
		btrfs_release_path(p);
		return PTR_ERR(tmp);
	}
	/*
	 * If the read above didn't mark this buffer up to date,
	 * it will never end up being up to date. Set ret to EIO now
	 * and give up so that our caller doesn't loop forever
	 * on our EAGAINs.
	 */
	if (!extent_buffer_uptodate(tmp))
		ret = -EIO;

out:
	if (ret == 0) {
		*eb_ret = tmp;
	} else {
		free_extent_buffer(tmp);
		btrfs_release_path(p);
	}

	return ret;
}

/*
 * helper function for btrfs_search_slot. This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned. If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over.
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}

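/*
 * Look up the item with the given objectid, type and offset, returning the
 * key of the item actually found in @found_key. Returns 0 if an item with a
 * matching objectid and type was found, 1 if not, and a negative errno on
 * error.
 */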
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		    u64 iobjectid, u64 ioff, u8 key_type,
		    struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
	    found_key->objectid != key.objectid)
		return 1;

	return 0;
}

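/*
 * Get the root node of @root for a search, locked according to
 * @write_lock_level (no lock at all for commit root or skip_locking
 * searches). Returns the node with an extra reference that the caller must
 * drop, or an ERR_PTR on failure.
 */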
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		if (p->nowait) {
			b = btrfs_try_read_lock_root_node(root);
			if (IS_ERR(b))
				return b;
		} else {
			b = btrfs_read_lock_root_node(root);
		}
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
	const int i = path->lowest_level;
	const int slot = path->slots[i];
	struct extent_buffer *lowest = path->nodes[i];
	struct extent_buffer *clone;

	ASSERT(path->need_commit_sem);

	if (!lowest)
		return 0;

	lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

	clone = btrfs_clone_extent_buffer(lowest);
	if (!clone)
		return -ENOMEM;

	btrfs_release_path(path);
	path->nodes[i] = clone;
	path->slots[i] = slot;

	return 0;
}

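/*
 * Find the slot for @key in @eb, skipping the binary search when a previous
 * search at an upper level already matched the key exactly (prev_cmp == 0).
 */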
static inline int search_for_key_slot(struct extent_buffer *eb,
				      int search_low_slot,
				      const struct btrfs_key *key,
				      int prev_cmp,
				      int *slot)
{
	/*
	 * If a previous call to btrfs_bin_search() on a parent node returned an
	 * exact match (prev_cmp == 0), we can safely assume the target key will
	 * always be at slot 0 on lower levels, since each key pointer
	 * (struct btrfs_key_ptr) refers to the lowest key accessible from the
	 * subtree it points to. Thus we can skip searching lower levels.
	 */
	if (prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	return btrfs_bin_search(eb, search_low_slot, key, slot);
}

1838static int search_leaf(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *root,
1840 const struct btrfs_key *key,
1841 struct btrfs_path *path,
1842 int ins_len,
1843 int prev_cmp)
1844{
1845 struct extent_buffer *leaf = path->nodes[0];
1846 int leaf_free_space = -1;
1847 int search_low_slot = 0;
1848 int ret;
1849 bool do_bin_search = true;
1850
1851 /*
1852 * If we are doing an insertion, the leaf has enough free space and the
1853 * destination slot for the key is not slot 0, then we can unlock our
1854 * write lock on the parent, and any other upper nodes, before doing the
1855 * binary search on the leaf (with search_for_key_slot()), allowing other
1856 * tasks to lock the parent and any other upper nodes.
1857 */
1858 if (ins_len > 0) {
1859 /*
1860 * Cache the leaf free space, since we will need it later and it
1861 * will not change until then.
1862 */
1863 leaf_free_space = btrfs_leaf_free_space(leaf);
1864
1865 /*
1866 * !path->locks[1] means we have a single node tree, the leaf is
1867 * the root of the tree.
1868 */
1869 if (path->locks[1] && leaf_free_space >= ins_len) {
1870 struct btrfs_disk_key first_key;
1871
1872 ASSERT(btrfs_header_nritems(leaf) > 0);
1873 btrfs_item_key(eb: leaf, disk_key: &first_key, nr: 0);
1874
1875 /*
1876 * Doing the extra comparison with the first key is cheap,
1877 * taking into account that the first key is very likely
1878 * already in a cache line because it immediately follows
1879 * the extent buffer's header and we have recently accessed
1880 * the header's level field.
1881 */
1882 ret = btrfs_comp_keys(disk_key: &first_key, k2: key);
1883 if (ret < 0) {
1884 /*
1885 * The first key is smaller than the key we want
1886 * to insert, so we are safe to unlock all upper
1887 * nodes and we have to do the binary search.
1888 *
1889 * We do use btrfs_unlock_up_safe() and not
 * unlock_up() because the latter does not unlock
1891 * nodes with a slot of 0 - we can safely unlock
1892 * any node even if its slot is 0 since in this
1893 * case the key does not end up at slot 0 of the
1894 * leaf and there's no need to split the leaf.
1895 */
1896 btrfs_unlock_up_safe(path, level: 1);
1897 search_low_slot = 1;
1898 } else {
1899 /*
 * The first key is >= the key we want to
1901 * insert, so we can skip the binary search as
1902 * the target key will be at slot 0.
1903 *
 * We cannot unlock upper nodes when the key is
1905 * less than the first key, because we will need
1906 * to update the key at slot 0 of the parent node
1907 * and possibly of other upper nodes too.
1908 * If the key matches the first key, then we can
1909 * unlock all the upper nodes, using
1910 * btrfs_unlock_up_safe() instead of unlock_up()
1911 * as stated above.
1912 */
1913 if (ret == 0)
1914 btrfs_unlock_up_safe(path, level: 1);
1915 /*
1916 * ret is already 0 or 1, matching the result of
1917 * a btrfs_bin_search() call, so there is no need
1918 * to adjust it.
1919 */
1920 do_bin_search = false;
1921 path->slots[0] = 0;
1922 }
1923 }
1924 }
1925
1926 if (do_bin_search) {
1927 ret = search_for_key_slot(eb: leaf, search_low_slot, key,
1928 prev_cmp, slot: &path->slots[0]);
1929 if (ret < 0)
1930 return ret;
1931 }
1932
1933 if (ins_len > 0) {
1934 /*
1935 * Item key already exists. In this case, if we are allowed to
1936 * insert the item (for example, in dir_item case, item key
1937 * collision is allowed), it will be merged with the original
1938 * item. Only the item size grows, no new btrfs item will be
 * added. If search_for_extension is not set, ins_len already
 * accounts for the size of struct btrfs_item, so deduct it here
 * to keep the leaf free space check correct.
1942 */
1943 if (ret == 0 && !path->search_for_extension) {
1944 ASSERT(ins_len >= sizeof(struct btrfs_item));
1945 ins_len -= sizeof(struct btrfs_item);
1946 }
1947
1948 ASSERT(leaf_free_space >= 0);
1949
1950 if (leaf_free_space < ins_len) {
1951 int err;
1952
1953 err = split_leaf(trans, root, ins_key: key, path, data_size: ins_len,
1954 extend: (ret == 0));
1955 ASSERT(err <= 0);
1956 if (WARN_ON(err > 0))
1957 err = -EUCLEAN;
1958 if (err)
1959 ret = err;
1960 }
1961 }
1962
1963 return ret;
1964}
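
/*
 * Worked example for the ins_len accounting above (illustrative numbers):
 * a caller inserting a new item with 100 bytes of data passes
 * ins_len = 100 + sizeof(struct btrfs_item). If the key already exists
 * (ret == 0), the data is merged into the existing item and no new item
 * header is created, so sizeof(struct btrfs_item) is deducted before the
 * free space check. A caller that only extends an existing item sets
 * path->search_for_extension and passes the extra data size alone.
 */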
1965
1966/*
1967 * Look for a key in a tree and perform necessary modifications to preserve
1968 * tree invariants.
1969 *
1970 * @trans: Handle of transaction, used when modifying the tree
1971 * @p: Holds all btree nodes along the search path
1972 * @root: The root node of the tree
1973 * @key: The key we are looking for
1974 * @ins_len: Indicates purpose of search:
 * >0 for inserts it's the size of the item inserted (*)
1976 * <0 for deletions
1977 * 0 for plain searches, not modifying the tree
1978 *
1979 * (*) If size of item inserted doesn't include
1980 * sizeof(struct btrfs_item), then p->search_for_extension must
1981 * be set.
 * @cow: Boolean, whether CoW operations should be performed. Must
 * always be 1 when modifying the tree.
1984 *
1985 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1986 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1987 *
1988 * If @key is found, 0 is returned and you can find the item in the leaf level
1989 * of the path (level 0)
1990 *
1991 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1992 * points to the slot where it should be inserted
1993 *
1994 * If an error is encountered while searching the tree a negative error number
1995 * is returned
1996 */
1997int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1998 const struct btrfs_key *key, struct btrfs_path *p,
1999 int ins_len, int cow)
2000{
2001 struct btrfs_fs_info *fs_info = root->fs_info;
2002 struct extent_buffer *b;
2003 int slot;
2004 int ret;
2005 int err;
2006 int level;
2007 int lowest_unlock = 1;
2008 /* everything at write_lock_level or lower must be write locked */
2009 int write_lock_level = 0;
2010 u8 lowest_level = 0;
2011 int min_write_lock_level;
2012 int prev_cmp;
2013
2014 might_sleep();
2015
2016 lowest_level = p->lowest_level;
2017 WARN_ON(lowest_level && ins_len > 0);
2018 WARN_ON(p->nodes[0] != NULL);
2019 BUG_ON(!cow && ins_len);
2020
2021 /*
 * For now only allow nowait for read only operations. There's no
 * strict reason why we couldn't allow it for writes, we just only
 * need it for reads, so it's only implemented for reads.
2025 */
2026 ASSERT(!p->nowait || !cow);
2027
2028 if (ins_len < 0) {
2029 lowest_unlock = 2;
2030
/*
 * When we are removing items, we might have to go up to level
 * two as we update tree pointers. Make sure we keep write locks
 * for those levels as well.
 */
2035 write_lock_level = 2;
2036 } else if (ins_len > 0) {
2037 /*
2038 * for inserting items, make sure we have a write lock on
2039 * level 1 so we can update keys
2040 */
2041 write_lock_level = 1;
2042 }
2043
2044 if (!cow)
2045 write_lock_level = -1;
2046
2047 if (cow && (p->keep_locks || p->lowest_level))
2048 write_lock_level = BTRFS_MAX_LEVEL;
2049
2050 min_write_lock_level = write_lock_level;
2051
2052 if (p->need_commit_sem) {
2053 ASSERT(p->search_commit_root);
2054 if (p->nowait) {
2055 if (!down_read_trylock(sem: &fs_info->commit_root_sem))
2056 return -EAGAIN;
2057 } else {
2058 down_read(sem: &fs_info->commit_root_sem);
2059 }
2060 }
2061
2062again:
2063 prev_cmp = -1;
2064 b = btrfs_search_slot_get_root(root, p, write_lock_level);
2065 if (IS_ERR(ptr: b)) {
2066 ret = PTR_ERR(ptr: b);
2067 goto done;
2068 }
2069
2070 while (b) {
2071 int dec = 0;
2072
2073 level = btrfs_header_level(eb: b);
2074
2075 if (cow) {
2076 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2077
2078 /*
2079 * if we don't really need to cow this block
2080 * then we don't want to set the path blocking,
2081 * so we test it here
2082 */
2083 if (!should_cow_block(trans, root, buf: b))
2084 goto cow_done;
2085
2086 /*
2087 * must have write locks on this node and the
2088 * parent
2089 */
2090 if (level > write_lock_level ||
2091 (level + 1 > write_lock_level &&
2092 level + 1 < BTRFS_MAX_LEVEL &&
2093 p->nodes[level + 1])) {
2094 write_lock_level = level + 1;
2095 btrfs_release_path(p);
2096 goto again;
2097 }
2098
2099 if (last_level)
2100 err = btrfs_cow_block(trans, root, buf: b, NULL, parent_slot: 0,
2101 cow_ret: &b,
2102 nest: BTRFS_NESTING_COW);
2103 else
2104 err = btrfs_cow_block(trans, root, buf: b,
2105 parent: p->nodes[level + 1],
2106 parent_slot: p->slots[level + 1], cow_ret: &b,
2107 nest: BTRFS_NESTING_COW);
2108 if (err) {
2109 ret = err;
2110 goto done;
2111 }
2112 }
2113cow_done:
2114 p->nodes[level] = b;
2115
2116 /*
2117 * we have a lock on b and as long as we aren't changing
 * the tree, there is no way for the items in b to change.
2119 * It is safe to drop the lock on our parent before we
2120 * go through the expensive btree search on b.
2121 *
2122 * If we're inserting or deleting (ins_len != 0), then we might
2123 * be changing slot zero, which may require changing the parent.
2124 * So, we can't drop the lock until after we know which slot
2125 * we're operating on.
2126 */
2127 if (!ins_len && !p->keep_locks) {
2128 int u = level + 1;
2129
2130 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2131 btrfs_tree_unlock_rw(eb: p->nodes[u], rw: p->locks[u]);
2132 p->locks[u] = 0;
2133 }
2134 }
2135
2136 if (level == 0) {
2137 if (ins_len > 0)
2138 ASSERT(write_lock_level >= 1);
2139
2140 ret = search_leaf(trans, root, key, path: p, ins_len, prev_cmp);
2141 if (!p->search_for_split)
2142 unlock_up(path: p, level, lowest_unlock,
2143 min_write_lock_level, NULL);
2144 goto done;
2145 }
2146
2147 ret = search_for_key_slot(eb: b, search_low_slot: 0, key, prev_cmp, slot: &slot);
2148 if (ret < 0)
2149 goto done;
2150 prev_cmp = ret;
2151
2152 if (ret && slot > 0) {
2153 dec = 1;
2154 slot--;
2155 }
2156 p->slots[level] = slot;
2157 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2158 write_lock_level: &write_lock_level);
2159 if (err == -EAGAIN)
2160 goto again;
2161 if (err) {
2162 ret = err;
2163 goto done;
2164 }
2165 b = p->nodes[level];
2166 slot = p->slots[level];
2167
2168 /*
2169 * Slot 0 is special, if we change the key we have to update
2170 * the parent pointer which means we must have a write lock on
2171 * the parent
2172 */
2173 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2174 write_lock_level = level + 1;
2175 btrfs_release_path(p);
2176 goto again;
2177 }
2178
2179 unlock_up(path: p, level, lowest_unlock, min_write_lock_level,
2180 write_lock_level: &write_lock_level);
2181
2182 if (level == lowest_level) {
2183 if (dec)
2184 p->slots[level]++;
2185 goto done;
2186 }
2187
2188 err = read_block_for_search(root, p, eb_ret: &b, level, slot, key);
2189 if (err == -EAGAIN)
2190 goto again;
2191 if (err) {
2192 ret = err;
2193 goto done;
2194 }
2195
2196 if (!p->skip_locking) {
2197 level = btrfs_header_level(eb: b);
2198
2199 btrfs_maybe_reset_lockdep_class(root, eb: b);
2200
2201 if (level <= write_lock_level) {
2202 btrfs_tree_lock(eb: b);
2203 p->locks[level] = BTRFS_WRITE_LOCK;
2204 } else {
2205 if (p->nowait) {
2206 if (!btrfs_try_tree_read_lock(eb: b)) {
2207 free_extent_buffer(eb: b);
2208 ret = -EAGAIN;
2209 goto done;
2210 }
2211 } else {
2212 btrfs_tree_read_lock(eb: b);
2213 }
2214 p->locks[level] = BTRFS_READ_LOCK;
2215 }
2216 p->nodes[level] = b;
2217 }
2218 }
2219 ret = 1;
2220done:
2221 if (ret < 0 && !p->skip_release_on_error)
2222 btrfs_release_path(p);
2223
2224 if (p->need_commit_sem) {
2225 int ret2;
2226
2227 ret2 = finish_need_commit_sem_search(path: p);
2228 up_read(sem: &fs_info->commit_root_sem);
2229 if (ret2)
2230 ret = ret2;
2231 }
2232
2233 return ret;
2234}
2235ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
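
/*
 * Illustrative example only, not called anywhere: the canonical read-only
 * lookup pattern built on btrfs_search_slot(). The key fields are
 * placeholders.
 */
static int __maybe_unused example_lookup_item(struct btrfs_root *root,
					      u64 objectid, u8 type, u64 offset)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = type;
	key.offset = offset;

	/* ins_len == 0 and cow == 0: plain search, no tree modification. */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* Exact match: the item is at path->nodes[0], path->slots[0]. */
	} else if (ret == 1) {
		/* Not found: the slot is where the key would be inserted. */
	}

	btrfs_free_path(path);
	return ret;
}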
2236
2237/*
2238 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2239 * current state of the tree together with the operations recorded in the tree
2240 * modification log to search for the key in a previous version of this tree, as
2241 * denoted by the time_seq parameter.
2242 *
2243 * Naturally, there is no support for insert, delete or cow operations.
2244 *
2245 * The resulting path and return value will be set up as if we called
2246 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2247 */
2248int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2249 struct btrfs_path *p, u64 time_seq)
2250{
2251 struct btrfs_fs_info *fs_info = root->fs_info;
2252 struct extent_buffer *b;
2253 int slot;
2254 int ret;
2255 int err;
2256 int level;
2257 int lowest_unlock = 1;
2258 u8 lowest_level = 0;
2259
2260 lowest_level = p->lowest_level;
2261 WARN_ON(p->nodes[0] != NULL);
2262 ASSERT(!p->nowait);
2263
2264 if (p->search_commit_root) {
2265 BUG_ON(time_seq);
2266 return btrfs_search_slot(NULL, root, key, p, ins_len: 0, cow: 0);
2267 }
2268
2269again:
2270 b = btrfs_get_old_root(root, time_seq);
2271 if (!b) {
2272 ret = -EIO;
2273 goto done;
2274 }
2275 level = btrfs_header_level(eb: b);
2276 p->locks[level] = BTRFS_READ_LOCK;
2277
2278 while (b) {
2279 int dec = 0;
2280
2281 level = btrfs_header_level(eb: b);
2282 p->nodes[level] = b;
2283
2284 /*
2285 * we have a lock on b and as long as we aren't changing
 * the tree, there is no way for the items in b to change.
2287 * It is safe to drop the lock on our parent before we
2288 * go through the expensive btree search on b.
2289 */
2290 btrfs_unlock_up_safe(path: p, level: level + 1);
2291
2292 ret = btrfs_bin_search(eb: b, first_slot: 0, key, slot: &slot);
2293 if (ret < 0)
2294 goto done;
2295
2296 if (level == 0) {
2297 p->slots[level] = slot;
2298 unlock_up(path: p, level, lowest_unlock, min_write_lock_level: 0, NULL);
2299 goto done;
2300 }
2301
2302 if (ret && slot > 0) {
2303 dec = 1;
2304 slot--;
2305 }
2306 p->slots[level] = slot;
2307 unlock_up(path: p, level, lowest_unlock, min_write_lock_level: 0, NULL);
2308
2309 if (level == lowest_level) {
2310 if (dec)
2311 p->slots[level]++;
2312 goto done;
2313 }
2314
2315 err = read_block_for_search(root, p, eb_ret: &b, level, slot, key);
2316 if (err == -EAGAIN)
2317 goto again;
2318 if (err) {
2319 ret = err;
2320 goto done;
2321 }
2322
2323 level = btrfs_header_level(eb: b);
2324 btrfs_tree_read_lock(eb: b);
2325 b = btrfs_tree_mod_log_rewind(fs_info, path: p, eb: b, time_seq);
2326 if (!b) {
2327 ret = -ENOMEM;
2328 goto done;
2329 }
2330 p->locks[level] = BTRFS_READ_LOCK;
2331 p->nodes[level] = b;
2332 }
2333 ret = 1;
2334done:
2335 if (ret < 0)
2336 btrfs_release_path(p);
2337
2338 return ret;
2339}
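
/*
 * Illustrative sketch only, assuming the btrfs_get_tree_mod_seq() /
 * btrfs_put_tree_mod_seq() pairing from tree-mod-log.h: pin a point in
 * time so that tree mod log entries are preserved, search that version of
 * the tree, then drop the pin. Error handling is minimal.
 */
static int __maybe_unused example_old_lookup(struct btrfs_root *root,
					     const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	btrfs_get_tree_mod_seq(fs_info, &elem);
	ret = btrfs_search_old_slot(root, key, path, elem.seq);
	btrfs_put_tree_mod_seq(fs_info, &elem);

	btrfs_free_path(path);
	return ret;
}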
2340
2341/*
2342 * Search the tree again to find a leaf with smaller keys.
2343 * Returns 0 if it found something.
2344 * Returns 1 if there are no smaller keys.
2345 * Returns < 0 on error.
2346 *
2347 * This may release the path, and so you may lose any locks held at the
2348 * time you call it.
2349 */
2350static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2351{
2352 struct btrfs_key key;
2353 struct btrfs_key orig_key;
2354 struct btrfs_disk_key found_key;
2355 int ret;
2356
2357 btrfs_item_key_to_cpu(eb: path->nodes[0], cpu_key: &key, nr: 0);
2358 orig_key = key;
2359
2360 if (key.offset > 0) {
2361 key.offset--;
2362 } else if (key.type > 0) {
2363 key.type--;
2364 key.offset = (u64)-1;
2365 } else if (key.objectid > 0) {
2366 key.objectid--;
2367 key.type = (u8)-1;
2368 key.offset = (u64)-1;
2369 } else {
2370 return 1;
2371 }
2372
2373 btrfs_release_path(p: path);
2374 ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: 0, cow: 0);
2375 if (ret <= 0)
2376 return ret;
2377
2378 /*
2379 * Previous key not found. Even if we were at slot 0 of the leaf we had
2380 * before releasing the path and calling btrfs_search_slot(), we now may
2381 * be in a slot pointing to the same original key - this can happen if
 * after we released the path, one or more items were moved from a
2383 * sibling leaf into the front of the leaf we had due to an insertion
2384 * (see push_leaf_right()).
 * If we hit this case and our slot is > 0, just decrement the slot so
 * that the caller does not process the same key again - doing so may
 * or may not break the caller, depending on its logic.
2388 */
2389 if (path->slots[0] < btrfs_header_nritems(eb: path->nodes[0])) {
2390 btrfs_item_key(eb: path->nodes[0], disk_key: &found_key, nr: path->slots[0]);
2391 ret = btrfs_comp_keys(disk_key: &found_key, k2: &orig_key);
2392 if (ret == 0) {
2393 if (path->slots[0] > 0) {
2394 path->slots[0]--;
2395 return 0;
2396 }
2397 /*
2398 * At slot 0, same key as before, it means orig_key is
2399 * the lowest, leftmost, key in the tree. We're done.
2400 */
2401 return 1;
2402 }
2403 }
2404
2405 btrfs_item_key(eb: path->nodes[0], disk_key: &found_key, nr: 0);
2406 ret = btrfs_comp_keys(disk_key: &found_key, k2: &key);
2407 /*
2408 * We might have had an item with the previous key in the tree right
2409 * before we released our path. And after we released our path, that
2410 * item might have been pushed to the first slot (0) of the leaf we
2411 * were holding due to a tree balance. Alternatively, an item with the
2412 * previous key can exist as the only element of a leaf (big fat item).
2413 * Therefore account for these 2 cases, so that our callers (like
2414 * btrfs_previous_item) don't miss an existing item with a key matching
2415 * the previous key we computed above.
2416 */
2417 if (ret <= 0)
2418 return 0;
2419 return 1;
2420}
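
/*
 * Worked example for the key decrement above (illustrative values): keys
 * sort as the tuple (objectid, type, offset). Starting from (256, 108, 0),
 * the offset is already 0, so the type is decremented and the offset is
 * set to (u64)-1, giving (256, 107, 18446744073709551615) - the largest
 * possible key that sorts strictly before the original one.
 */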
2421
2422/*
 * Helper to use instead of btrfs_search_slot() if no exact match is
 * needed but instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, otherwise
 * the next lower one is.
2427 * When return_any and find_higher are both true, and no higher item is found,
2428 * return the next lower instead.
2429 * When return_any is true and find_higher is false, and no lower item is found,
2430 * return the next higher instead.
2431 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2432 * < 0 on error
2433 */
2434int btrfs_search_slot_for_read(struct btrfs_root *root,
2435 const struct btrfs_key *key,
2436 struct btrfs_path *p, int find_higher,
2437 int return_any)
2438{
2439 int ret;
2440 struct extent_buffer *leaf;
2441
2442again:
2443 ret = btrfs_search_slot(NULL, root, key, p, ins_len: 0, cow: 0);
2444 if (ret <= 0)
2445 return ret;
2446 /*
2447 * a return value of 1 means the path is at the position where the
2448 * item should be inserted. Normally this is the next bigger item,
2449 * but in case the previous item is the last in a leaf, path points
2450 * to the first free slot in the previous leaf, i.e. at an invalid
2451 * item.
2452 */
2453 leaf = p->nodes[0];
2454
2455 if (find_higher) {
2456 if (p->slots[0] >= btrfs_header_nritems(eb: leaf)) {
2457 ret = btrfs_next_leaf(root, path: p);
2458 if (ret <= 0)
2459 return ret;
2460 if (!return_any)
2461 return 1;
2462 /*
2463 * no higher item found, return the next
2464 * lower instead
2465 */
2466 return_any = 0;
2467 find_higher = 0;
2468 btrfs_release_path(p);
2469 goto again;
2470 }
2471 } else {
2472 if (p->slots[0] == 0) {
2473 ret = btrfs_prev_leaf(root, path: p);
2474 if (ret < 0)
2475 return ret;
2476 if (!ret) {
2477 leaf = p->nodes[0];
2478 if (p->slots[0] == btrfs_header_nritems(eb: leaf))
2479 p->slots[0]--;
2480 return 0;
2481 }
2482 if (!return_any)
2483 return 1;
2484 /*
2485 * no lower item found, return the next
2486 * higher instead
2487 */
2488 return_any = 0;
2489 find_higher = 1;
2490 btrfs_release_path(p);
2491 goto again;
2492 } else {
2493 --p->slots[0];
2494 }
2495 }
2496 return 0;
2497}
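
/*
 * Illustrative example only, not called anywhere: find the item nearest to
 * @key, preferring the next higher one but accepting the next lower one if
 * nothing higher exists. On success the found key is copied back into @key.
 */
static int __maybe_unused example_find_nearest(struct btrfs_root *root,
					       struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* find_higher = 1, return_any = 1: 1 is returned only for an empty tree. */
	ret = btrfs_search_slot_for_read(root, key, path, 1, 1);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

	btrfs_free_path(path);
	return ret;
}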
2498
2499/*
2500 * Execute search and call btrfs_previous_item to traverse backwards if the item
2501 * was not found.
2502 *
2503 * Return 0 if found, 1 if not found and < 0 if error.
2504 */
2505int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2506 struct btrfs_path *path)
2507{
2508 int ret;
2509
2510 ret = btrfs_search_slot(NULL, root, key, p: path, ins_len: 0, cow: 0);
2511 if (ret > 0)
2512 ret = btrfs_previous_item(root, path, min_objectid: key->objectid, type: key->type);
2513
2514 if (ret == 0)
2515 btrfs_item_key_to_cpu(eb: path->nodes[0], cpu_key: key, nr: path->slots[0]);
2516
2517 return ret;
2518}
2519
2520/*
2521 * Search for a valid slot for the given path.
2522 *
2523 * @root: The root node of the tree.
2524 * @key: Will contain a valid item if found.
2525 * @path: The starting point to validate the slot.
2526 *
2527 * Return: 0 if the item is valid
2528 * 1 if not found
2529 * <0 if error.
2530 */
2531int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2532 struct btrfs_path *path)
2533{
2534 if (path->slots[0] >= btrfs_header_nritems(eb: path->nodes[0])) {
2535 int ret;
2536
2537 ret = btrfs_next_leaf(root, path);
2538 if (ret)
2539 return ret;
2540 }
2541
2542 btrfs_item_key_to_cpu(eb: path->nodes[0], cpu_key: key, nr: path->slots[0]);
2543 return 0;
2544}
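
/*
 * Illustrative example only, not called anywhere: the typical forward
 * iteration loop built on top of btrfs_get_next_valid_item(), visiting
 * every item belonging to @objectid. The per-item processing is a
 * placeholder.
 */
static int __maybe_unused example_iterate_objectid(struct btrfs_root *root,
						   u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		/* Moves to the next leaf when the current one is exhausted. */
		ret = btrfs_get_next_valid_item(root, &key, path);
		if (ret)	/* 1: no more items, < 0: error */
			break;
		if (key.objectid != objectid)
			break;	/* Walked past the range of interest. */

		/* ... process path->nodes[0] / path->slots[0] here ... */

		path->slots[0]++;
	}
out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}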
2545
2546/*
 * Adjust the pointers going up the tree, starting at @level, making sure
 * the key of each node points to 'key'. This is used after shifting
 * pointers to the left, so it stops fixing up pointers when a given
 * leaf/node is not in slot 0 of the higher levels.
2553 */
2554static void fixup_low_keys(struct btrfs_trans_handle *trans,
2555 struct btrfs_path *path,
2556 struct btrfs_disk_key *key, int level)
2557{
2558 int i;
2559 struct extent_buffer *t;
2560 int ret;
2561
2562 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2563 int tslot = path->slots[i];
2564
2565 if (!path->nodes[i])
2566 break;
2567 t = path->nodes[i];
2568 ret = btrfs_tree_mod_log_insert_key(eb: t, slot: tslot,
2569 op: BTRFS_MOD_LOG_KEY_REPLACE);
2570 BUG_ON(ret < 0);
2571 btrfs_set_node_key(eb: t, disk_key: key, nr: tslot);
2572 btrfs_mark_buffer_dirty(trans, buf: path->nodes[i]);
2573 if (tslot != 0)
2574 break;
2575 }
2576}
2577
2578/*
2579 * update item key.
2580 *
 * This function isn't completely safe. It's the caller's responsibility
 * to ensure that the new key won't break the key ordering.
2583 */
2584void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2585 struct btrfs_path *path,
2586 const struct btrfs_key *new_key)
2587{
2588 struct btrfs_fs_info *fs_info = trans->fs_info;
2589 struct btrfs_disk_key disk_key;
2590 struct extent_buffer *eb;
2591 int slot;
2592
2593 eb = path->nodes[0];
2594 slot = path->slots[0];
2595 if (slot > 0) {
2596 btrfs_item_key(eb, disk_key: &disk_key, nr: slot - 1);
2597 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
2598 btrfs_print_leaf(l: eb);
2599 btrfs_crit(fs_info,
2600 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2601 slot, btrfs_disk_key_objectid(&disk_key),
2602 btrfs_disk_key_type(&disk_key),
2603 btrfs_disk_key_offset(&disk_key),
2604 new_key->objectid, new_key->type,
2605 new_key->offset);
2606 BUG();
2607 }
2608 }
2609 if (slot < btrfs_header_nritems(eb) - 1) {
2610 btrfs_item_key(eb, disk_key: &disk_key, nr: slot + 1);
2611 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
2612 btrfs_print_leaf(l: eb);
2613 btrfs_crit(fs_info,
2614 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2615 slot, btrfs_disk_key_objectid(&disk_key),
2616 btrfs_disk_key_type(&disk_key),
2617 btrfs_disk_key_offset(&disk_key),
2618 new_key->objectid, new_key->type,
2619 new_key->offset);
2620 BUG();
2621 }
2622 }
2623
2624 btrfs_cpu_key_to_disk(disk_key: &disk_key, cpu_key: new_key);
2625 btrfs_set_item_key(eb, disk_key: &disk_key, nr: slot);
2626 btrfs_mark_buffer_dirty(trans, buf: eb);
2627 if (slot == 0)
2628 fixup_low_keys(trans, path, key: &disk_key, level: 1);
2629}
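
/*
 * Worked example for the safety checks above (illustrative keys): with
 * neighboring item keys (256, 1, 0) at slot - 1 and (256, 1, 8192) at
 * slot + 1, changing the item in between to (256, 1, 4096) preserves the
 * ordering, while (256, 1, 0) or (256, 1, 8192) would trip one of the two
 * comparisons and BUG().
 */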
2630
2631/*
2632 * Check key order of two sibling extent buffers.
2633 *
2634 * Return true if something is wrong.
2635 * Return false if everything is fine.
2636 *
2637 * Tree-checker only works inside one tree block, thus the following
2638 * corruption can not be detected by tree-checker:
2639 *
2640 * Leaf @left | Leaf @right
2641 * --------------------------------------------------------------
2642 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2643 *
2644 * Key f6 in leaf @left itself is valid, but not valid when the next
2645 * key in leaf @right is 7.
2646 * This can only be checked at tree block merge time.
2647 * And since tree checker has ensured all key order in each tree block
2648 * is correct, we only need to bother the last key of @left and the first
2649 * key of @right.
2650 */
2651static bool check_sibling_keys(struct extent_buffer *left,
2652 struct extent_buffer *right)
2653{
2654 struct btrfs_key left_last;
2655 struct btrfs_key right_first;
2656 int level = btrfs_header_level(eb: left);
2657 int nr_left = btrfs_header_nritems(eb: left);
2658 int nr_right = btrfs_header_nritems(eb: right);
2659
2660 /* No key to check in one of the tree blocks */
2661 if (!nr_left || !nr_right)
2662 return false;
2663
2664 if (level) {
2665 btrfs_node_key_to_cpu(eb: left, cpu_key: &left_last, nr: nr_left - 1);
2666 btrfs_node_key_to_cpu(eb: right, cpu_key: &right_first, nr: 0);
2667 } else {
2668 btrfs_item_key_to_cpu(eb: left, cpu_key: &left_last, nr: nr_left - 1);
2669 btrfs_item_key_to_cpu(eb: right, cpu_key: &right_first, nr: 0);
2670 }
2671
2672 if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2673 btrfs_crit(left->fs_info, "left extent buffer:");
2674 btrfs_print_tree(c: left, follow: false);
2675 btrfs_crit(left->fs_info, "right extent buffer:");
2676 btrfs_print_tree(c: right, follow: false);
2677 btrfs_crit(left->fs_info,
2678"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2679 left_last.objectid, left_last.type,
2680 left_last.offset, right_first.objectid,
2681 right_first.type, right_first.offset);
2682 return true;
2683 }
2684 return false;
2685}
2686
2687/*
2688 * try to push data from one node into the next node left in the
2689 * tree.
2690 *
2691 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2692 * error, and > 0 if there was no room in the left hand block.
2693 */
2694static int push_node_left(struct btrfs_trans_handle *trans,
2695 struct extent_buffer *dst,
2696 struct extent_buffer *src, int empty)
2697{
2698 struct btrfs_fs_info *fs_info = trans->fs_info;
2699 int push_items = 0;
2700 int src_nritems;
2701 int dst_nritems;
2702 int ret = 0;
2703
2704 src_nritems = btrfs_header_nritems(eb: src);
2705 dst_nritems = btrfs_header_nritems(eb: dst);
2706 push_items = BTRFS_NODEPTRS_PER_BLOCK(info: fs_info) - dst_nritems;
2707 WARN_ON(btrfs_header_generation(src) != trans->transid);
2708 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2709
2710 if (!empty && src_nritems <= 8)
2711 return 1;
2712
2713 if (push_items <= 0)
2714 return 1;
2715
2716 if (empty) {
2717 push_items = min(src_nritems, push_items);
2718 if (push_items < src_nritems) {
/*
 * Leave at least 8 pointers in the node if we aren't going
 * to empty it.
 */
2722 if (src_nritems - push_items < 8) {
2723 if (push_items <= 8)
2724 return 1;
2725 push_items -= 8;
2726 }
2727 }
2728 } else
2729 push_items = min(src_nritems - 8, push_items);
2730
2731 /* dst is the left eb, src is the middle eb */
2732 if (check_sibling_keys(left: dst, right: src)) {
2733 ret = -EUCLEAN;
2734 btrfs_abort_transaction(trans, ret);
2735 return ret;
2736 }
2737 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_offset: dst_nritems, src_offset: 0, nr_items: push_items);
2738 if (ret) {
2739 btrfs_abort_transaction(trans, ret);
2740 return ret;
2741 }
2742 copy_extent_buffer(dst, src,
2743 dst_offset: btrfs_node_key_ptr_offset(eb: dst, nr: dst_nritems),
2744 src_offset: btrfs_node_key_ptr_offset(eb: src, nr: 0),
2745 len: push_items * sizeof(struct btrfs_key_ptr));
2746
2747 if (push_items < src_nritems) {
2748 /*
2749 * btrfs_tree_mod_log_eb_copy handles logging the move, so we
2750 * don't need to do an explicit tree mod log operation for it.
2751 */
2752 memmove_extent_buffer(dst: src, dst_offset: btrfs_node_key_ptr_offset(eb: src, nr: 0),
2753 src_offset: btrfs_node_key_ptr_offset(eb: src, nr: push_items),
2754 len: (src_nritems - push_items) *
2755 sizeof(struct btrfs_key_ptr));
2756 }
2757 btrfs_set_header_nritems(eb: src, val: src_nritems - push_items);
2758 btrfs_set_header_nritems(eb: dst, val: dst_nritems + push_items);
2759 btrfs_mark_buffer_dirty(trans, buf: src);
2760 btrfs_mark_buffer_dirty(trans, buf: dst);
2761
2762 return ret;
2763}
2764
2765/*
2766 * try to push data from one node into the next node right in the
2767 * tree.
2768 *
2769 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2770 * error, and > 0 if there was no room in the right hand block.
2771 *
2772 * this will only push up to 1/2 the contents of the left node over
2773 */
2774static int balance_node_right(struct btrfs_trans_handle *trans,
2775 struct extent_buffer *dst,
2776 struct extent_buffer *src)
2777{
2778 struct btrfs_fs_info *fs_info = trans->fs_info;
2779 int push_items = 0;
2780 int max_push;
2781 int src_nritems;
2782 int dst_nritems;
2783 int ret = 0;
2784
2785 WARN_ON(btrfs_header_generation(src) != trans->transid);
2786 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2787
2788 src_nritems = btrfs_header_nritems(eb: src);
2789 dst_nritems = btrfs_header_nritems(eb: dst);
2790 push_items = BTRFS_NODEPTRS_PER_BLOCK(info: fs_info) - dst_nritems;
2791 if (push_items <= 0)
2792 return 1;
2793
2794 if (src_nritems < 4)
2795 return 1;
2796
2797 max_push = src_nritems / 2 + 1;
2798 /* don't try to empty the node */
2799 if (max_push >= src_nritems)
2800 return 1;
2801
2802 if (max_push < push_items)
2803 push_items = max_push;
2804
2805 /* dst is the right eb, src is the middle eb */
2806 if (check_sibling_keys(left: src, right: dst)) {
2807 ret = -EUCLEAN;
2808 btrfs_abort_transaction(trans, ret);
2809 return ret;
2810 }
2811
2812 /*
2813 * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
2814 * need to do an explicit tree mod log operation for it.
2815 */
2816 memmove_extent_buffer(dst, dst_offset: btrfs_node_key_ptr_offset(eb: dst, nr: push_items),
2817 src_offset: btrfs_node_key_ptr_offset(eb: dst, nr: 0),
2818 len: (dst_nritems) *
2819 sizeof(struct btrfs_key_ptr));
2820
2821 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_offset: 0, src_offset: src_nritems - push_items,
2822 nr_items: push_items);
2823 if (ret) {
2824 btrfs_abort_transaction(trans, ret);
2825 return ret;
2826 }
2827 copy_extent_buffer(dst, src,
2828 dst_offset: btrfs_node_key_ptr_offset(eb: dst, nr: 0),
2829 src_offset: btrfs_node_key_ptr_offset(eb: src, nr: src_nritems - push_items),
2830 len: push_items * sizeof(struct btrfs_key_ptr));
2831
2832 btrfs_set_header_nritems(eb: src, val: src_nritems - push_items);
2833 btrfs_set_header_nritems(eb: dst, val: dst_nritems + push_items);
2834
2835 btrfs_mark_buffer_dirty(trans, buf: src);
2836 btrfs_mark_buffer_dirty(trans, buf: dst);
2837
2838 return ret;
2839}
2840
2841/*
2842 * helper function to insert a new root level in the tree.
2843 * A new node is allocated, and a single item is inserted to
2844 * point to the existing root
2845 *
2846 * returns zero on success or < 0 on failure.
2847 */
2848static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2849 struct btrfs_root *root,
2850 struct btrfs_path *path, int level)
2851{
2852 u64 lower_gen;
2853 struct extent_buffer *lower;
2854 struct extent_buffer *c;
2855 struct extent_buffer *old;
2856 struct btrfs_disk_key lower_key;
2857 int ret;
2858
2859 BUG_ON(path->nodes[level]);
2860 BUG_ON(path->nodes[level-1] != root->node);
2861
2862 lower = path->nodes[level-1];
2863 if (level == 1)
2864 btrfs_item_key(eb: lower, disk_key: &lower_key, nr: 0);
2865 else
2866 btrfs_node_key(eb: lower, disk_key: &lower_key, nr: 0);
2867
2868 c = btrfs_alloc_tree_block(trans, root, parent: 0, root_objectid: root->root_key.objectid,
2869 key: &lower_key, level, hint: root->node->start, empty_size: 0,
2870 reloc_src_root: 0, nest: BTRFS_NESTING_NEW_ROOT);
2871 if (IS_ERR(ptr: c))
2872 return PTR_ERR(ptr: c);
2873
2874 root_add_used_bytes(root);
2875
2876 btrfs_set_header_nritems(eb: c, val: 1);
2877 btrfs_set_node_key(eb: c, disk_key: &lower_key, nr: 0);
2878 btrfs_set_node_blockptr(eb: c, nr: 0, val: lower->start);
2879 lower_gen = btrfs_header_generation(eb: lower);
2880 WARN_ON(lower_gen != trans->transid);
2881
2882 btrfs_set_node_ptr_generation(eb: c, nr: 0, val: lower_gen);
2883
2884 btrfs_mark_buffer_dirty(trans, buf: c);
2885
2886 old = root->node;
2887 ret = btrfs_tree_mod_log_insert_root(old_root: root->node, new_root: c, log_removal: false);
2888 if (ret < 0) {
2889 btrfs_free_tree_block(trans, root_id: btrfs_root_id(root), buf: c, parent: 0, last_ref: 1);
2890 btrfs_tree_unlock(eb: c);
2891 free_extent_buffer(eb: c);
2892 return ret;
2893 }
2894 rcu_assign_pointer(root->node, c);
2895
2896 /* the super has an extra ref to root->node */
2897 free_extent_buffer(eb: old);
2898
2899 add_root_to_dirty_list(root);
2900 atomic_inc(v: &c->refs);
2901 path->nodes[level] = c;
2902 path->locks[level] = BTRFS_WRITE_LOCK;
2903 path->slots[level] = 0;
2904 return 0;
2905}
2906
2907/*
2908 * worker function to insert a single pointer in a node.
2909 * the node should have enough room for the pointer already
2910 *
2911 * slot and level indicate where you want the key to go, and
2912 * blocknr is the block the key points to.
2913 */
2914static int insert_ptr(struct btrfs_trans_handle *trans,
2915 struct btrfs_path *path,
2916 struct btrfs_disk_key *key, u64 bytenr,
2917 int slot, int level)
2918{
2919 struct extent_buffer *lower;
2920 int nritems;
2921 int ret;
2922
2923 BUG_ON(!path->nodes[level]);
2924 btrfs_assert_tree_write_locked(eb: path->nodes[level]);
2925 lower = path->nodes[level];
2926 nritems = btrfs_header_nritems(eb: lower);
2927 BUG_ON(slot > nritems);
2928 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2929 if (slot != nritems) {
2930 if (level) {
2931 ret = btrfs_tree_mod_log_insert_move(eb: lower, dst_slot: slot + 1,
2932 src_slot: slot, nr_items: nritems - slot);
2933 if (ret < 0) {
2934 btrfs_abort_transaction(trans, ret);
2935 return ret;
2936 }
2937 }
2938 memmove_extent_buffer(dst: lower,
2939 dst_offset: btrfs_node_key_ptr_offset(eb: lower, nr: slot + 1),
2940 src_offset: btrfs_node_key_ptr_offset(eb: lower, nr: slot),
2941 len: (nritems - slot) * sizeof(struct btrfs_key_ptr));
2942 }
2943 if (level) {
2944 ret = btrfs_tree_mod_log_insert_key(eb: lower, slot,
2945 op: BTRFS_MOD_LOG_KEY_ADD);
2946 if (ret < 0) {
2947 btrfs_abort_transaction(trans, ret);
2948 return ret;
2949 }
2950 }
2951 btrfs_set_node_key(eb: lower, disk_key: key, nr: slot);
2952 btrfs_set_node_blockptr(eb: lower, nr: slot, val: bytenr);
2953 WARN_ON(trans->transid == 0);
2954 btrfs_set_node_ptr_generation(eb: lower, nr: slot, val: trans->transid);
2955 btrfs_set_header_nritems(eb: lower, val: nritems + 1);
2956 btrfs_mark_buffer_dirty(trans, buf: lower);
2957
2958 return 0;
2959}
2960
2961/*
2962 * split the node at the specified level in path in two.
2963 * The path is corrected to point to the appropriate node after the split
2964 *
2965 * Before splitting this tries to make some room in the node by pushing
2966 * left and right, if either one works, it returns right away.
2967 *
2968 * returns 0 on success and < 0 on failure
2969 */
2970static noinline int split_node(struct btrfs_trans_handle *trans,
2971 struct btrfs_root *root,
2972 struct btrfs_path *path, int level)
2973{
2974 struct btrfs_fs_info *fs_info = root->fs_info;
2975 struct extent_buffer *c;
2976 struct extent_buffer *split;
2977 struct btrfs_disk_key disk_key;
2978 int mid;
2979 int ret;
2980 u32 c_nritems;
2981
2982 c = path->nodes[level];
2983 WARN_ON(btrfs_header_generation(c) != trans->transid);
2984 if (c == root->node) {
2985 /*
 * trying to split the root, let's make a new one
2987 *
 * Tree mod log: we don't log removal of the old root in
 * insert_new_root(), because that root buffer will be kept as a
2990 * normal node. We are going to log removal of half of the
2991 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2992 * holding a tree lock on the buffer, which is why we cannot
2993 * race with other tree_mod_log users.
2994 */
2995 ret = insert_new_root(trans, root, path, level: level + 1);
2996 if (ret)
2997 return ret;
2998 } else {
2999 ret = push_nodes_for_insert(trans, root, path, level);
3000 c = path->nodes[level];
3001 if (!ret && btrfs_header_nritems(eb: c) <
3002 BTRFS_NODEPTRS_PER_BLOCK(info: fs_info) - 3)
3003 return 0;
3004 if (ret < 0)
3005 return ret;
3006 }
3007
3008 c_nritems = btrfs_header_nritems(eb: c);
3009 mid = (c_nritems + 1) / 2;
3010 btrfs_node_key(eb: c, disk_key: &disk_key, nr: mid);
3011
3012 split = btrfs_alloc_tree_block(trans, root, parent: 0, root_objectid: root->root_key.objectid,
3013 key: &disk_key, level, hint: c->start, empty_size: 0,
3014 reloc_src_root: 0, nest: BTRFS_NESTING_SPLIT);
3015 if (IS_ERR(ptr: split))
3016 return PTR_ERR(ptr: split);
3017
3018 root_add_used_bytes(root);
3019 ASSERT(btrfs_header_level(c) == level);
3020
3021 ret = btrfs_tree_mod_log_eb_copy(dst: split, src: c, dst_offset: 0, src_offset: mid, nr_items: c_nritems - mid);
3022 if (ret) {
3023 btrfs_tree_unlock(eb: split);
3024 free_extent_buffer(eb: split);
3025 btrfs_abort_transaction(trans, ret);
3026 return ret;
3027 }
3028 copy_extent_buffer(dst: split, src: c,
3029 dst_offset: btrfs_node_key_ptr_offset(eb: split, nr: 0),
3030 src_offset: btrfs_node_key_ptr_offset(eb: c, nr: mid),
3031 len: (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3032 btrfs_set_header_nritems(eb: split, val: c_nritems - mid);
3033 btrfs_set_header_nritems(eb: c, val: mid);
3034
3035 btrfs_mark_buffer_dirty(trans, buf: c);
3036 btrfs_mark_buffer_dirty(trans, buf: split);
3037
3038 ret = insert_ptr(trans, path, key: &disk_key, bytenr: split->start,
3039 slot: path->slots[level + 1] + 1, level: level + 1);
3040 if (ret < 0) {
3041 btrfs_tree_unlock(eb: split);
3042 free_extent_buffer(eb: split);
3043 return ret;
3044 }
3045
3046 if (path->slots[level] >= mid) {
3047 path->slots[level] -= mid;
3048 btrfs_tree_unlock(eb: c);
3049 free_extent_buffer(eb: c);
3050 path->nodes[level] = split;
3051 path->slots[level + 1] += 1;
3052 } else {
3053 btrfs_tree_unlock(eb: split);
3054 free_extent_buffer(eb: split);
3055 }
3056 return 0;
3057}
3058
3059/*
 * How many bytes are required to store the items in a leaf. @start
 * and @nr indicate which items in the leaf to check. This totals up the
 * space used both by the item structs and the item data.
3063 */
3064static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3065{
3066 int data_len;
3067 int nritems = btrfs_header_nritems(eb: l);
3068 int end = min(nritems, start + nr) - 1;
3069
3070 if (!nr)
3071 return 0;
3072 data_len = btrfs_item_offset(eb: l, slot: start) + btrfs_item_size(eb: l, slot: start);
3073 data_len = data_len - btrfs_item_offset(eb: l, slot: end);
3074 data_len += sizeof(struct btrfs_item) * nr;
3075 WARN_ON(data_len < 0);
3076 return data_len;
3077}
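
/*
 * Worked example for leaf_space_used() (illustrative numbers): for
 * start = 2 and nr = 3 on a leaf with at least 5 items, end is item 4.
 * Item data grows from the end of the leaf towards the front, so item 2
 * has the highest data offset. btrfs_item_offset(l, 2) +
 * btrfs_item_size(l, 2) is the end of the data region covered by items
 * 2-4 and btrfs_item_offset(l, 4) is its start; the difference is the
 * data bytes, and 3 * sizeof(struct btrfs_item) adds the space taken by
 * the item headers themselves.
 */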
3078
3079/*
3080 * The space between the end of the leaf items and
3081 * the start of the leaf data. IOW, how much room
3082 * the leaf has left for both items and data
3083 */
3084int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3085{
3086 struct btrfs_fs_info *fs_info = leaf->fs_info;
3087 int nritems = btrfs_header_nritems(eb: leaf);
3088 int ret;
3089
3090 ret = BTRFS_LEAF_DATA_SIZE(info: fs_info) - leaf_space_used(l: leaf, start: 0, nr: nritems);
3091 if (ret < 0) {
3092 btrfs_crit(fs_info,
3093 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3094 ret,
3095 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3096 leaf_space_used(leaf, 0, nritems), nritems);
3097 }
3098 return ret;
3099}
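
/*
 * Illustrative sketch only, not called anywhere: the ins_len convention
 * implied above. A caller inserting one new item of @data_size bytes
 * needs the leaf to have room for the item header as well as the data,
 * since both are carved out of the same free area.
 */
static bool __maybe_unused example_item_fits(const struct extent_buffer *leaf,
					     u32 data_size)
{
	/* One new item consumes its header plus its data bytes. */
	return btrfs_leaf_free_space(leaf) >=
	       (int)(data_size + sizeof(struct btrfs_item));
}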
3100
3101/*
3102 * min slot controls the lowest index we're willing to push to the
3103 * right. We'll push up to and including min_slot, but no lower
3104 */
3105static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3106 struct btrfs_path *path,
3107 int data_size, int empty,
3108 struct extent_buffer *right,
3109 int free_space, u32 left_nritems,
3110 u32 min_slot)
3111{
3112 struct btrfs_fs_info *fs_info = right->fs_info;
3113 struct extent_buffer *left = path->nodes[0];
3114 struct extent_buffer *upper = path->nodes[1];
3115 struct btrfs_map_token token;
3116 struct btrfs_disk_key disk_key;
3117 int slot;
3118 u32 i;
3119 int push_space = 0;
3120 int push_items = 0;
3121 u32 nr;
3122 u32 right_nritems;
3123 u32 data_end;
3124 u32 this_item_size;
3125
3126 if (empty)
3127 nr = 0;
3128 else
3129 nr = max_t(u32, 1, min_slot);
3130
3131 if (path->slots[0] >= left_nritems)
3132 push_space += data_size;
3133
3134 slot = path->slots[1];
3135 i = left_nritems - 1;
3136 while (i >= nr) {
3137 if (!empty && push_items > 0) {
3138 if (path->slots[0] > i)
3139 break;
3140 if (path->slots[0] == i) {
3141 int space = btrfs_leaf_free_space(leaf: left);
3142
3143 if (space + push_space * 2 > free_space)
3144 break;
3145 }
3146 }
3147
3148 if (path->slots[0] == i)
3149 push_space += data_size;
3150
3151 this_item_size = btrfs_item_size(eb: left, slot: i);
3152 if (this_item_size + sizeof(struct btrfs_item) +
3153 push_space > free_space)
3154 break;
3155
3156 push_items++;
3157 push_space += this_item_size + sizeof(struct btrfs_item);
3158 if (i == 0)
3159 break;
3160 i--;
3161 }
3162
3163 if (push_items == 0)
3164 goto out_unlock;
3165
3166 WARN_ON(!empty && push_items == left_nritems);
3167
3168 /* push left to right */
3169 right_nritems = btrfs_header_nritems(eb: right);
3170
3171 push_space = btrfs_item_data_end(eb: left, nr: left_nritems - push_items);
3172 push_space -= leaf_data_end(leaf: left);
3173
3174 /* make room in the right data area */
3175 data_end = leaf_data_end(leaf: right);
3176 memmove_leaf_data(leaf: right, dst_offset: data_end - push_space, src_offset: data_end,
3177 len: BTRFS_LEAF_DATA_SIZE(info: fs_info) - data_end);
3178
3179 /* copy from the left data area */
3180 copy_leaf_data(dst: right, src: left, dst_offset: BTRFS_LEAF_DATA_SIZE(info: fs_info) - push_space,
3181 src_offset: leaf_data_end(leaf: left), len: push_space);
3182
3183 memmove_leaf_items(leaf: right, dst_item: push_items, src_item: 0, nr_items: right_nritems);
3184
3185 /* copy the items from left to right */
3186 copy_leaf_items(dst: right, src: left, dst_item: 0, src_item: left_nritems - push_items, nr_items: push_items);
3187
3188 /* update the item pointers */
3189 btrfs_init_map_token(token: &token, eb: right);
3190 right_nritems += push_items;
3191 btrfs_set_header_nritems(eb: right, val: right_nritems);
3192 push_space = BTRFS_LEAF_DATA_SIZE(info: fs_info);
3193 for (i = 0; i < right_nritems; i++) {
3194 push_space -= btrfs_token_item_size(token: &token, slot: i);
3195 btrfs_set_token_item_offset(token: &token, slot: i, val: push_space);
3196 }
3197
3198 left_nritems -= push_items;
3199 btrfs_set_header_nritems(eb: left, val: left_nritems);
3200
3201 if (left_nritems)
3202 btrfs_mark_buffer_dirty(trans, buf: left);
3203 else
3204 btrfs_clear_buffer_dirty(trans, buf: left);
3205
3206 btrfs_mark_buffer_dirty(trans, buf: right);
3207
3208 btrfs_item_key(eb: right, disk_key: &disk_key, nr: 0);
3209 btrfs_set_node_key(eb: upper, disk_key: &disk_key, nr: slot + 1);
3210 btrfs_mark_buffer_dirty(trans, buf: upper);
3211
3212 /* then fixup the leaf pointer in the path */
3213 if (path->slots[0] >= left_nritems) {
3214 path->slots[0] -= left_nritems;
3215 if (btrfs_header_nritems(eb: path->nodes[0]) == 0)
3216 btrfs_clear_buffer_dirty(trans, buf: path->nodes[0]);
3217 btrfs_tree_unlock(eb: path->nodes[0]);
3218 free_extent_buffer(eb: path->nodes[0]);
3219 path->nodes[0] = right;
3220 path->slots[1] += 1;
3221 } else {
3222 btrfs_tree_unlock(eb: right);
3223 free_extent_buffer(eb: right);
3224 }
3225 return 0;
3226
3227out_unlock:
3228 btrfs_tree_unlock(eb: right);
3229 free_extent_buffer(eb: right);
3230 return 1;
3231}
3232
3233/*
3234 * push some data in the path leaf to the right, trying to free up at
3235 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3236 *
3237 * returns 1 if the push failed because the other node didn't have enough
3238 * room, 0 if everything worked out and < 0 if there were major errors.
3239 *
3240 * this will push starting from min_slot to the end of the leaf. It won't
3241 * push any slot lower than min_slot
3242 */
3243static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3244 *root, struct btrfs_path *path,
3245 int min_data_size, int data_size,
3246 int empty, u32 min_slot)
3247{
3248 struct extent_buffer *left = path->nodes[0];
3249 struct extent_buffer *right;
3250 struct extent_buffer *upper;
3251 int slot;
3252 int free_space;
3253 u32 left_nritems;
3254 int ret;
3255
3256 if (!path->nodes[1])
3257 return 1;
3258
3259 slot = path->slots[1];
3260 upper = path->nodes[1];
3261 if (slot >= btrfs_header_nritems(eb: upper) - 1)
3262 return 1;
3263
3264 btrfs_assert_tree_write_locked(eb: path->nodes[1]);
3265
3266 right = btrfs_read_node_slot(parent: upper, slot: slot + 1);
3267 if (IS_ERR(ptr: right))
3268 return PTR_ERR(ptr: right);
3269
3270 __btrfs_tree_lock(eb: right, nest: BTRFS_NESTING_RIGHT);
3271
3272 free_space = btrfs_leaf_free_space(leaf: right);
3273 if (free_space < data_size)
3274 goto out_unlock;
3275
3276 ret = btrfs_cow_block(trans, root, buf: right, parent: upper,
3277 parent_slot: slot + 1, cow_ret: &right, nest: BTRFS_NESTING_RIGHT_COW);
3278 if (ret)
3279 goto out_unlock;
3280
3281 left_nritems = btrfs_header_nritems(eb: left);
3282 if (left_nritems == 0)
3283 goto out_unlock;
3284
3285 if (check_sibling_keys(left, right)) {
3286 ret = -EUCLEAN;
3287 btrfs_abort_transaction(trans, ret);
3288 btrfs_tree_unlock(eb: right);
3289 free_extent_buffer(eb: right);
3290 return ret;
3291 }
3292 if (path->slots[0] == left_nritems && !empty) {
/*
 * Key greater than all keys in the leaf, right neighbor has
 * enough room for it and we're not emptying our leaf to delete
 * it, therefore use right neighbor to insert the new item and
 * no need to touch/dirty our left leaf.
 */
3297 btrfs_tree_unlock(eb: left);
3298 free_extent_buffer(eb: left);
3299 path->nodes[0] = right;
3300 path->slots[0] = 0;
3301 path->slots[1]++;
3302 return 0;
3303 }
3304
3305 return __push_leaf_right(trans, path, data_size: min_data_size, empty, right,
3306 free_space, left_nritems, min_slot);
3307out_unlock:
3308 btrfs_tree_unlock(eb: right);
3309 free_extent_buffer(eb: right);
3310 return 1;
3311}
3312
3313/*
3314 * push some data in the path leaf to the left, trying to free up at
3315 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3316 *
3317 * max_slot can put a limit on how far into the leaf we'll push items. The
3318 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
3319 * items
3320 */
3321static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3322 struct btrfs_path *path, int data_size,
3323 int empty, struct extent_buffer *left,
3324 int free_space, u32 right_nritems,
3325 u32 max_slot)
3326{
3327 struct btrfs_fs_info *fs_info = left->fs_info;
3328 struct btrfs_disk_key disk_key;
3329 struct extent_buffer *right = path->nodes[0];
3330 int i;
3331 int push_space = 0;
3332 int push_items = 0;
3333 u32 old_left_nritems;
3334 u32 nr;
3335 int ret = 0;
3336 u32 this_item_size;
3337 u32 old_left_item_size;
3338 struct btrfs_map_token token;
3339
3340 if (empty)
3341 nr = min(right_nritems, max_slot);
3342 else
3343 nr = min(right_nritems - 1, max_slot);
3344
3345 for (i = 0; i < nr; i++) {
3346 if (!empty && push_items > 0) {
3347 if (path->slots[0] < i)
3348 break;
3349 if (path->slots[0] == i) {
3350 int space = btrfs_leaf_free_space(leaf: right);
3351
3352 if (space + push_space * 2 > free_space)
3353 break;
3354 }
3355 }
3356
3357 if (path->slots[0] == i)
3358 push_space += data_size;
3359
3360 this_item_size = btrfs_item_size(eb: right, slot: i);
3361 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3362 free_space)
3363 break;
3364
3365 push_items++;
3366 push_space += this_item_size + sizeof(struct btrfs_item);
3367 }
3368
3369 if (push_items == 0) {
3370 ret = 1;
3371 goto out;
3372 }
3373 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3374
3375 /* push data from right to left */
3376 copy_leaf_items(dst: left, src: right, dst_item: btrfs_header_nritems(eb: left), src_item: 0, nr_items: push_items);
3377
3378 push_space = BTRFS_LEAF_DATA_SIZE(info: fs_info) -
3379 btrfs_item_offset(eb: right, slot: push_items - 1);
3380
3381 copy_leaf_data(dst: left, src: right, dst_offset: leaf_data_end(leaf: left) - push_space,
3382 src_offset: btrfs_item_offset(eb: right, slot: push_items - 1), len: push_space);
3383 old_left_nritems = btrfs_header_nritems(eb: left);
3384 BUG_ON(old_left_nritems <= 0);
3385
3386 btrfs_init_map_token(token: &token, eb: left);
3387 old_left_item_size = btrfs_item_offset(eb: left, slot: old_left_nritems - 1);
3388 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3389 u32 ioff;
3390
3391 ioff = btrfs_token_item_offset(token: &token, slot: i);
3392 btrfs_set_token_item_offset(token: &token, slot: i,
3393 val: ioff - (BTRFS_LEAF_DATA_SIZE(info: fs_info) - old_left_item_size));
3394 }
3395 btrfs_set_header_nritems(eb: left, val: old_left_nritems + push_items);
3396
3397 /* fixup right node */
3398 if (push_items > right_nritems)
3399 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3400 right_nritems);
3401
3402 if (push_items < right_nritems) {
3403 push_space = btrfs_item_offset(eb: right, slot: push_items - 1) -
3404 leaf_data_end(leaf: right);
3405 memmove_leaf_data(leaf: right,
3406 dst_offset: BTRFS_LEAF_DATA_SIZE(info: fs_info) - push_space,
3407 src_offset: leaf_data_end(leaf: right), len: push_space);
3408
3409 memmove_leaf_items(leaf: right, dst_item: 0, src_item: push_items,
3410 nr_items: btrfs_header_nritems(eb: right) - push_items);
3411 }
3412
3413 btrfs_init_map_token(token: &token, eb: right);
3414 right_nritems -= push_items;
3415 btrfs_set_header_nritems(eb: right, val: right_nritems);
3416 push_space = BTRFS_LEAF_DATA_SIZE(info: fs_info);
3417 for (i = 0; i < right_nritems; i++) {
3418 push_space = push_space - btrfs_token_item_size(token: &token, slot: i);
3419 btrfs_set_token_item_offset(token: &token, slot: i, val: push_space);
3420 }
3421
3422 btrfs_mark_buffer_dirty(trans, buf: left);
3423 if (right_nritems)
3424 btrfs_mark_buffer_dirty(trans, buf: right);
3425 else
3426 btrfs_clear_buffer_dirty(trans, buf: right);
3427
3428 btrfs_item_key(eb: right, disk_key: &disk_key, nr: 0);
3429 fixup_low_keys(trans, path, key: &disk_key, level: 1);
3430
3431 /* then fixup the leaf pointer in the path */
3432 if (path->slots[0] < push_items) {
3433 path->slots[0] += old_left_nritems;
3434 btrfs_tree_unlock(eb: path->nodes[0]);
3435 free_extent_buffer(eb: path->nodes[0]);
3436 path->nodes[0] = left;
3437 path->slots[1] -= 1;
3438 } else {
3439 btrfs_tree_unlock(eb: left);
3440 free_extent_buffer(eb: left);
3441 path->slots[0] -= push_items;
3442 }
3443 BUG_ON(path->slots[0] < 0);
3444 return ret;
3445out:
3446 btrfs_tree_unlock(eb: left);
3447 free_extent_buffer(eb: left);
3448 return ret;
3449}
3450
3451/*
3452 * push some data in the path leaf to the left, trying to free up at
3453 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3454 *
3455 * max_slot can put a limit on how far into the leaf we'll push items. The
3456 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3457 * items
3458 */
3459static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3460 *root, struct btrfs_path *path, int min_data_size,
3461 int data_size, int empty, u32 max_slot)
3462{
3463 struct extent_buffer *right = path->nodes[0];
3464 struct extent_buffer *left;
3465 int slot;
3466 int free_space;
3467 u32 right_nritems;
3468 int ret = 0;
3469
3470 slot = path->slots[1];
3471 if (slot == 0)
3472 return 1;
3473 if (!path->nodes[1])
3474 return 1;
3475
3476 right_nritems = btrfs_header_nritems(eb: right);
3477 if (right_nritems == 0)
3478 return 1;
3479
3480 btrfs_assert_tree_write_locked(eb: path->nodes[1]);
3481
3482 left = btrfs_read_node_slot(parent: path->nodes[1], slot: slot - 1);
3483 if (IS_ERR(ptr: left))
3484 return PTR_ERR(ptr: left);
3485
3486 __btrfs_tree_lock(eb: left, nest: BTRFS_NESTING_LEFT);
3487
3488 free_space = btrfs_leaf_free_space(leaf: left);
3489 if (free_space < data_size) {
3490 ret = 1;
3491 goto out;
3492 }
3493
3494 ret = btrfs_cow_block(trans, root, buf: left,
3495 parent: path->nodes[1], parent_slot: slot - 1, cow_ret: &left,
3496 nest: BTRFS_NESTING_LEFT_COW);
3497 if (ret) {
3498 /* we hit -ENOSPC, but it isn't fatal here */
3499 if (ret == -ENOSPC)
3500 ret = 1;
3501 goto out;
3502 }
3503
3504 if (check_sibling_keys(left, right)) {
3505 ret = -EUCLEAN;
3506 btrfs_abort_transaction(trans, ret);
3507 goto out;
3508 }
3509 return __push_leaf_left(trans, path, data_size: min_data_size, empty, left,
3510 free_space, right_nritems, max_slot);
3511out:
3512 btrfs_tree_unlock(eb: left);
3513 free_extent_buffer(eb: left);
3514 return ret;
3515}
3516
3517/*
3518 * split the path's leaf in two, making sure there is at least data_size
3519 * available for the resulting leaf level of the path.
3520 */
3521static noinline int copy_for_split(struct btrfs_trans_handle *trans,
3522 struct btrfs_path *path,
3523 struct extent_buffer *l,
3524 struct extent_buffer *right,
3525 int slot, int mid, int nritems)
3526{
3527 struct btrfs_fs_info *fs_info = trans->fs_info;
3528 int data_copy_size;
3529 int rt_data_off;
3530 int i;
3531 int ret;
3532 struct btrfs_disk_key disk_key;
3533 struct btrfs_map_token token;
3534
3535 nritems = nritems - mid;
3536 btrfs_set_header_nritems(eb: right, val: nritems);
3537 data_copy_size = btrfs_item_data_end(eb: l, nr: mid) - leaf_data_end(leaf: l);
3538
3539 copy_leaf_items(dst: right, src: l, dst_item: 0, src_item: mid, nr_items: nritems);
3540
3541 copy_leaf_data(dst: right, src: l, dst_offset: BTRFS_LEAF_DATA_SIZE(info: fs_info) - data_copy_size,
3542 src_offset: leaf_data_end(leaf: l), len: data_copy_size);
3543
3544 rt_data_off = BTRFS_LEAF_DATA_SIZE(info: fs_info) - btrfs_item_data_end(eb: l, nr: mid);
3545
3546 btrfs_init_map_token(token: &token, eb: right);
3547 for (i = 0; i < nritems; i++) {
3548 u32 ioff;
3549
3550 ioff = btrfs_token_item_offset(token: &token, slot: i);
3551 btrfs_set_token_item_offset(token: &token, slot: i, val: ioff + rt_data_off);
3552 }
3553
3554 btrfs_set_header_nritems(eb: l, val: mid);
3555 btrfs_item_key(eb: right, disk_key: &disk_key, nr: 0);
3556 ret = insert_ptr(trans, path, key: &disk_key, bytenr: right->start, slot: path->slots[1] + 1, level: 1);
3557 if (ret < 0)
3558 return ret;
3559
3560 btrfs_mark_buffer_dirty(trans, buf: right);
3561 btrfs_mark_buffer_dirty(trans, buf: l);
3562 BUG_ON(path->slots[0] != slot);
3563
3564 if (mid <= slot) {
3565 btrfs_tree_unlock(eb: path->nodes[0]);
3566 free_extent_buffer(eb: path->nodes[0]);
3567 path->nodes[0] = right;
3568 path->slots[0] -= mid;
3569 path->slots[1] += 1;
3570 } else {
3571 btrfs_tree_unlock(eb: right);
3572 free_extent_buffer(eb: right);
3573 }
3574
3575 BUG_ON(path->slots[0] < 0);
3576
3577 return 0;
3578}
3579
3580/*
3581 * double splits happen when we need to insert a big item in the middle
3582 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3583 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3584 * A B C
3585 *
3586 * We avoid this by trying to push the items on either side of our target
3587 * into the adjacent leaves. If all goes well we can avoid the double split
3588 * completely.
3589 */
3590static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3591 struct btrfs_root *root,
3592 struct btrfs_path *path,
3593 int data_size)
3594{
3595 int ret;
3596 int progress = 0;
3597 int slot;
3598 u32 nritems;
3599 int space_needed = data_size;
3600
3601 slot = path->slots[0];
3602 if (slot < btrfs_header_nritems(eb: path->nodes[0]))
3603 space_needed -= btrfs_leaf_free_space(leaf: path->nodes[0]);
3604
3605 /*
3606 * try to push all the items after our slot into the
3607 * right leaf
3608 */
3609 ret = push_leaf_right(trans, root, path, min_data_size: 1, data_size: space_needed, empty: 0, min_slot: slot);
3610 if (ret < 0)
3611 return ret;
3612
3613 if (ret == 0)
3614 progress++;
3615
3616 nritems = btrfs_header_nritems(eb: path->nodes[0]);
3617 /*
3618 * our goal is to get our slot at the start or end of a leaf. If
3619 * we've done so we're done
3620 */
3621 if (path->slots[0] == 0 || path->slots[0] == nritems)
3622 return 0;
3623
3624 if (btrfs_leaf_free_space(leaf: path->nodes[0]) >= data_size)
3625 return 0;
3626
3627	/* try to push all the items before our slot into the left leaf */
3628 slot = path->slots[0];
3629 space_needed = data_size;
3630 if (slot > 0)
3631 space_needed -= btrfs_leaf_free_space(leaf: path->nodes[0]);
3632 ret = push_leaf_left(trans, root, path, min_data_size: 1, data_size: space_needed, empty: 0, max_slot: slot);
3633 if (ret < 0)
3634 return ret;
3635
3636 if (ret == 0)
3637 progress++;
3638
3639 if (progress)
3640 return 0;
3641 return 1;
3642}
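
/*
 * push_for_double_split() above returns 0 when the target slot ended up at
 * the start or end of the leaf, when enough free space appeared, or when at
 * least one of the pushes made progress; it returns 1 when nothing could be
 * moved, and a negative errno on error.
 */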
3643
3644/*
3645 * split the path's leaf in two, making sure there is at least data_size
3646 * available for the resulting leaf level of the path.
3647 *
3648 * returns 0 if all went well and < 0 on failure.
3649 */
3650static noinline int split_leaf(struct btrfs_trans_handle *trans,
3651 struct btrfs_root *root,
3652 const struct btrfs_key *ins_key,
3653 struct btrfs_path *path, int data_size,
3654 int extend)
3655{
3656 struct btrfs_disk_key disk_key;
3657 struct extent_buffer *l;
3658 u32 nritems;
3659 int mid;
3660 int slot;
3661 struct extent_buffer *right;
3662 struct btrfs_fs_info *fs_info = root->fs_info;
3663 int ret = 0;
3664 int wret;
3665 int split;
3666 int num_doubles = 0;
3667 int tried_avoid_double = 0;
3668
3669 l = path->nodes[0];
3670 slot = path->slots[0];
3671 if (extend && data_size + btrfs_item_size(eb: l, slot) +
3672 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(info: fs_info))
3673 return -EOVERFLOW;
3674
3675 /* first try to make some room by pushing left and right */
3676 if (data_size && path->nodes[1]) {
3677 int space_needed = data_size;
3678
3679 if (slot < btrfs_header_nritems(eb: l))
3680 space_needed -= btrfs_leaf_free_space(leaf: l);
3681
3682 wret = push_leaf_right(trans, root, path, min_data_size: space_needed,
3683 data_size: space_needed, empty: 0, min_slot: 0);
3684 if (wret < 0)
3685 return wret;
3686 if (wret) {
3687 space_needed = data_size;
3688 if (slot > 0)
3689 space_needed -= btrfs_leaf_free_space(leaf: l);
3690 wret = push_leaf_left(trans, root, path, min_data_size: space_needed,
3691 data_size: space_needed, empty: 0, max_slot: (u32)-1);
3692 if (wret < 0)
3693 return wret;
3694 }
3695 l = path->nodes[0];
3696
3697 /* did the pushes work? */
3698 if (btrfs_leaf_free_space(leaf: l) >= data_size)
3699 return 0;
3700 }
3701
3702 if (!path->nodes[1]) {
3703 ret = insert_new_root(trans, root, path, level: 1);
3704 if (ret)
3705 return ret;
3706 }
3707again:
3708 split = 1;
3709 l = path->nodes[0];
3710 slot = path->slots[0];
3711 nritems = btrfs_header_nritems(eb: l);
3712 mid = (nritems + 1) / 2;
3713
3714 if (mid <= slot) {
3715 if (nritems == 1 ||
3716 leaf_space_used(l, start: mid, nr: nritems - mid) + data_size >
3717 BTRFS_LEAF_DATA_SIZE(info: fs_info)) {
3718 if (slot >= nritems) {
3719 split = 0;
3720 } else {
3721 mid = slot;
3722 if (mid != nritems &&
3723 leaf_space_used(l, start: mid, nr: nritems - mid) +
3724 data_size > BTRFS_LEAF_DATA_SIZE(info: fs_info)) {
3725 if (data_size && !tried_avoid_double)
3726 goto push_for_double;
3727 split = 2;
3728 }
3729 }
3730 }
3731 } else {
3732 if (leaf_space_used(l, start: 0, nr: mid) + data_size >
3733 BTRFS_LEAF_DATA_SIZE(info: fs_info)) {
3734 if (!extend && data_size && slot == 0) {
3735 split = 0;
3736 } else if ((extend || !data_size) && slot == 0) {
3737 mid = 1;
3738 } else {
3739 mid = slot;
3740 if (mid != nritems &&
3741 leaf_space_used(l, start: mid, nr: nritems - mid) +
3742 data_size > BTRFS_LEAF_DATA_SIZE(info: fs_info)) {
3743 if (data_size && !tried_avoid_double)
3744 goto push_for_double;
3745 split = 2;
3746 }
3747 }
3748 }
3749 }
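
	/*
	 * At this point 'split' encodes the decision: 0 means no items are
	 * copied and the new item will go alone into a fresh right leaf,
	 * 1 means a regular split at 'mid', and 2 means even the chosen
	 * split point leaves too little room, so another split (a double
	 * split) will be needed on the next pass.
	 */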
3750
3751 if (split == 0)
3752 btrfs_cpu_key_to_disk(disk_key: &disk_key, cpu_key: ins_key);
3753 else
3754 btrfs_item_key(eb: l, disk_key: &disk_key, nr: mid);
3755
3756 /*
3757	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
3758 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3759 * subclasses, which is 8 at the time of this patch, and we've maxed it
3760 * out. In the future we could add a
3761 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3762 * use BTRFS_NESTING_NEW_ROOT.
3763 */
3764 right = btrfs_alloc_tree_block(trans, root, parent: 0, root_objectid: root->root_key.objectid,
3765 key: &disk_key, level: 0, hint: l->start, empty_size: 0, reloc_src_root: 0,
3766 nest: num_doubles ? BTRFS_NESTING_NEW_ROOT :
3767 BTRFS_NESTING_SPLIT);
3768 if (IS_ERR(ptr: right))
3769 return PTR_ERR(ptr: right);
3770
3771 root_add_used_bytes(root);
3772
3773 if (split == 0) {
3774 if (mid <= slot) {
3775 btrfs_set_header_nritems(eb: right, val: 0);
3776 ret = insert_ptr(trans, path, key: &disk_key,
3777 bytenr: right->start, slot: path->slots[1] + 1, level: 1);
3778 if (ret < 0) {
3779 btrfs_tree_unlock(eb: right);
3780 free_extent_buffer(eb: right);
3781 return ret;
3782 }
3783 btrfs_tree_unlock(eb: path->nodes[0]);
3784 free_extent_buffer(eb: path->nodes[0]);
3785 path->nodes[0] = right;
3786 path->slots[0] = 0;
3787 path->slots[1] += 1;
3788 } else {
3789 btrfs_set_header_nritems(eb: right, val: 0);
3790 ret = insert_ptr(trans, path, key: &disk_key,
3791 bytenr: right->start, slot: path->slots[1], level: 1);
3792 if (ret < 0) {
3793 btrfs_tree_unlock(eb: right);
3794 free_extent_buffer(eb: right);
3795 return ret;
3796 }
3797 btrfs_tree_unlock(eb: path->nodes[0]);
3798 free_extent_buffer(eb: path->nodes[0]);
3799 path->nodes[0] = right;
3800 path->slots[0] = 0;
3801 if (path->slots[1] == 0)
3802 fixup_low_keys(trans, path, key: &disk_key, level: 1);
3803 }
3804 /*
3805 * We create a new leaf 'right' for the required ins_len and
3806 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3807	 * the new item's data into 'right'.
3808 */
3809 return ret;
3810 }
3811
3812 ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
3813 if (ret < 0) {
3814 btrfs_tree_unlock(eb: right);
3815 free_extent_buffer(eb: right);
3816 return ret;
3817 }
3818
3819 if (split == 2) {
3820 BUG_ON(num_doubles != 0);
3821 num_doubles++;
3822 goto again;
3823 }
3824
3825 return 0;
3826
3827push_for_double:
3828 push_for_double_split(trans, root, path, data_size);
3829 tried_avoid_double = 1;
3830 if (btrfs_leaf_free_space(leaf: path->nodes[0]) >= data_size)
3831 return 0;
3832 goto again;
3833}
3834
3835static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3836 struct btrfs_root *root,
3837 struct btrfs_path *path, int ins_len)
3838{
3839 struct btrfs_key key;
3840 struct extent_buffer *leaf;
3841 struct btrfs_file_extent_item *fi;
3842 u64 extent_len = 0;
3843 u32 item_size;
3844 int ret;
3845
3846 leaf = path->nodes[0];
3847 btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[0]);
3848
3849 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3850 key.type != BTRFS_EXTENT_CSUM_KEY);
3851
3852 if (btrfs_leaf_free_space(leaf) >= ins_len)
3853 return 0;
3854
3855 item_size = btrfs_item_size(eb: leaf, slot: path->slots[0]);
3856 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3857 fi = btrfs_item_ptr(leaf, path->slots[0],
3858 struct btrfs_file_extent_item);
3859 extent_len = btrfs_file_extent_num_bytes(eb: leaf, s: fi);
3860 }
3861 btrfs_release_path(p: path);
3862
3863 path->keep_locks = 1;
3864 path->search_for_split = 1;
3865 ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: 0, cow: 1);
3866 path->search_for_split = 0;
3867 if (ret > 0)
3868 ret = -EAGAIN;
3869 if (ret < 0)
3870 goto err;
3871
3872 ret = -EAGAIN;
3873 leaf = path->nodes[0];
3874 /* if our item isn't there, return now */
3875 if (item_size != btrfs_item_size(eb: leaf, slot: path->slots[0]))
3876 goto err;
3877
3878 /* the leaf has changed, it now has room. return now */
3879 if (btrfs_leaf_free_space(leaf: path->nodes[0]) >= ins_len)
3880 goto err;
3881
3882 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3883 fi = btrfs_item_ptr(leaf, path->slots[0],
3884 struct btrfs_file_extent_item);
3885 if (extent_len != btrfs_file_extent_num_bytes(eb: leaf, s: fi))
3886 goto err;
3887 }
3888
3889 ret = split_leaf(trans, root, ins_key: &key, path, data_size: ins_len, extend: 1);
3890 if (ret)
3891 goto err;
3892
3893 path->keep_locks = 0;
3894 btrfs_unlock_up_safe(path, level: 1);
3895 return 0;
3896err:
3897 path->keep_locks = 0;
3898 return ret;
3899}
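
/*
 * setup_leaf_for_split() above returns 0 when the leaf already has room or
 * was successfully split, -EAGAIN when the item changed or the leaf gained
 * room while the path was dropped (the caller is expected to restart its
 * search), and a negative errno on other failures.
 */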
3900
3901static noinline int split_item(struct btrfs_trans_handle *trans,
3902 struct btrfs_path *path,
3903 const struct btrfs_key *new_key,
3904 unsigned long split_offset)
3905{
3906 struct extent_buffer *leaf;
3907 int orig_slot, slot;
3908 char *buf;
3909 u32 nritems;
3910 u32 item_size;
3911 u32 orig_offset;
3912 struct btrfs_disk_key disk_key;
3913
3914 leaf = path->nodes[0];
3915 /*
3916 * Shouldn't happen because the caller must have previously called
3917 * setup_leaf_for_split() to make room for the new item in the leaf.
3918 */
3919 if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
3920 return -ENOSPC;
3921
3922 orig_slot = path->slots[0];
3923 orig_offset = btrfs_item_offset(eb: leaf, slot: path->slots[0]);
3924 item_size = btrfs_item_size(eb: leaf, slot: path->slots[0]);
3925
3926 buf = kmalloc(size: item_size, GFP_NOFS);
3927 if (!buf)
3928 return -ENOMEM;
3929
3930 read_extent_buffer(eb: leaf, dst: buf, btrfs_item_ptr_offset(leaf,
3931 path->slots[0]), len: item_size);
3932
3933 slot = path->slots[0] + 1;
3934 nritems = btrfs_header_nritems(eb: leaf);
3935 if (slot != nritems) {
3936 /* shift the items */
3937 memmove_leaf_items(leaf, dst_item: slot + 1, src_item: slot, nr_items: nritems - slot);
3938 }
3939
3940 btrfs_cpu_key_to_disk(disk_key: &disk_key, cpu_key: new_key);
3941 btrfs_set_item_key(eb: leaf, disk_key: &disk_key, nr: slot);
3942
3943 btrfs_set_item_offset(eb: leaf, slot, val: orig_offset);
3944 btrfs_set_item_size(eb: leaf, slot, val: item_size - split_offset);
3945
3946 btrfs_set_item_offset(eb: leaf, slot: orig_slot,
3947 val: orig_offset + item_size - split_offset);
3948 btrfs_set_item_size(eb: leaf, slot: orig_slot, val: split_offset);
3949
3950 btrfs_set_header_nritems(eb: leaf, val: nritems + 1);
3951
3952 /* write the data for the start of the original item */
3953 write_extent_buffer(eb: leaf, src: buf,
3954 btrfs_item_ptr_offset(leaf, path->slots[0]),
3955 len: split_offset);
3956
3957 /* write the data for the new item */
3958 write_extent_buffer(eb: leaf, src: buf + split_offset,
3959 btrfs_item_ptr_offset(leaf, slot),
3960 len: item_size - split_offset);
3961 btrfs_mark_buffer_dirty(trans, buf: leaf);
3962
3963 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3964 kfree(objp: buf);
3965 return 0;
3966}
3967
3968/*
3969 * This function splits a single item into two items,
3970 * giving 'new_key' to the new item and splitting the
3971 * old one at split_offset (from the start of the item).
3972 *
3973 * The path may be released by this operation. After
3974 * the split, the path is pointing to the old item. The
3975 * new item is going to be in the same node as the old one.
3976 *
3977 * Note: the item being split must be small enough to live alone on
3978 * a tree block with room for one extra struct btrfs_item.
3979 *
3980 * This allows us to split the item in place, keeping a lock on the
3981 * leaf the entire time.
3982 */
3983int btrfs_split_item(struct btrfs_trans_handle *trans,
3984 struct btrfs_root *root,
3985 struct btrfs_path *path,
3986 const struct btrfs_key *new_key,
3987 unsigned long split_offset)
3988{
3989 int ret;
3990 ret = setup_leaf_for_split(trans, root, path,
3991 ins_len: sizeof(struct btrfs_item));
3992 if (ret)
3993 return ret;
3994
3995 ret = split_item(trans, path, new_key, split_offset);
3996 return ret;
3997}
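
/*
 * Resulting layout for btrfs_split_item() when splitting an item of size S
 * at split_offset X (illustrative):
 *
 *   before:  slot N              -> item data [0, S)
 *   after:   slot N   (old key)  -> item data [0, X)
 *            slot N+1 (new_key)  -> item data [X, S)
 *
 * Both halves still live in the original data area of the leaf; because leaf
 * data grows from the end of the block towards the front, the first X bytes
 * end up at the higher offsets of that area and the remaining S - X bytes at
 * the lower offsets.
 */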
3998
3999/*
4000 * make the item pointed to by the path smaller. new_size indicates
4001 * how small to make it, and from_end tells us if we just chop bytes
4002 * off the end of the item or if we shift the item to chop bytes off
4003 * the front.
4004 */
4005void btrfs_truncate_item(struct btrfs_trans_handle *trans,
4006 struct btrfs_path *path, u32 new_size, int from_end)
4007{
4008 int slot;
4009 struct extent_buffer *leaf;
4010 u32 nritems;
4011 unsigned int data_end;
4012 unsigned int old_data_start;
4013 unsigned int old_size;
4014 unsigned int size_diff;
4015 int i;
4016 struct btrfs_map_token token;
4017
4018 leaf = path->nodes[0];
4019 slot = path->slots[0];
4020
4021 old_size = btrfs_item_size(eb: leaf, slot);
4022 if (old_size == new_size)
4023 return;
4024
4025 nritems = btrfs_header_nritems(eb: leaf);
4026 data_end = leaf_data_end(leaf);
4027
4028 old_data_start = btrfs_item_offset(eb: leaf, slot);
4029
4030 size_diff = old_size - new_size;
4031
4032 BUG_ON(slot < 0);
4033 BUG_ON(slot >= nritems);
4034
4035 /*
4036 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4037 */
4038 /* first correct the data pointers */
4039 btrfs_init_map_token(token: &token, eb: leaf);
4040 for (i = slot; i < nritems; i++) {
4041 u32 ioff;
4042
4043 ioff = btrfs_token_item_offset(token: &token, slot: i);
4044 btrfs_set_token_item_offset(token: &token, slot: i, val: ioff + size_diff);
4045 }
4046
4047 /* shift the data */
4048 if (from_end) {
4049 memmove_leaf_data(leaf, dst_offset: data_end + size_diff, src_offset: data_end,
4050 len: old_data_start + new_size - data_end);
4051 } else {
4052 struct btrfs_disk_key disk_key;
4053 u64 offset;
4054
4055 btrfs_item_key(eb: leaf, disk_key: &disk_key, nr: slot);
4056
4057 if (btrfs_disk_key_type(s: &disk_key) == BTRFS_EXTENT_DATA_KEY) {
4058 unsigned long ptr;
4059 struct btrfs_file_extent_item *fi;
4060
4061 fi = btrfs_item_ptr(leaf, slot,
4062 struct btrfs_file_extent_item);
4063 fi = (struct btrfs_file_extent_item *)(
4064 (unsigned long)fi - size_diff);
4065
4066 if (btrfs_file_extent_type(eb: leaf, s: fi) ==
4067 BTRFS_FILE_EXTENT_INLINE) {
4068 ptr = btrfs_item_ptr_offset(leaf, slot);
4069 memmove_extent_buffer(dst: leaf, dst_offset: ptr,
4070 src_offset: (unsigned long)fi,
4071 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4072 }
4073 }
4074
4075 memmove_leaf_data(leaf, dst_offset: data_end + size_diff, src_offset: data_end,
4076 len: old_data_start - data_end);
4077
4078 offset = btrfs_disk_key_offset(s: &disk_key);
4079 btrfs_set_disk_key_offset(s: &disk_key, val: offset + size_diff);
4080 btrfs_set_item_key(eb: leaf, disk_key: &disk_key, nr: slot);
4081 if (slot == 0)
4082 fixup_low_keys(trans, path, key: &disk_key, level: 1);
4083 }
4084
4085 btrfs_set_item_size(eb: leaf, slot, val: new_size);
4086 btrfs_mark_buffer_dirty(trans, buf: leaf);
4087
4088 if (btrfs_leaf_free_space(leaf) < 0) {
4089 btrfs_print_leaf(l: leaf);
4090 BUG();
4091 }
4092}
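
/*
 * Worked example for btrfs_truncate_item() with from_end == 1 (the offsets
 * are illustrative only): an item whose data occupies [15000, 16000) in the
 * leaf data area, truncated to new_size = 600, gives size_diff = 400.  The
 * first 600 bytes are kept and shifted up to [15400, 16000), the item offset
 * is bumped from 15000 to 15400, and the data of any later items (which
 * lives at lower offsets) moves up by the same 400 bytes, so the leaf's free
 * space stays contiguous in the middle of the block.
 */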
4093
4094/*
4095 * make the item pointed to by the path bigger, data_size is the added size.
4096 */
4097void btrfs_extend_item(struct btrfs_trans_handle *trans,
4098 struct btrfs_path *path, u32 data_size)
4099{
4100 int slot;
4101 struct extent_buffer *leaf;
4102 u32 nritems;
4103 unsigned int data_end;
4104 unsigned int old_data;
4105 unsigned int old_size;
4106 int i;
4107 struct btrfs_map_token token;
4108
4109 leaf = path->nodes[0];
4110
4111 nritems = btrfs_header_nritems(eb: leaf);
4112 data_end = leaf_data_end(leaf);
4113
4114 if (btrfs_leaf_free_space(leaf) < data_size) {
4115 btrfs_print_leaf(l: leaf);
4116 BUG();
4117 }
4118 slot = path->slots[0];
4119 old_data = btrfs_item_data_end(eb: leaf, nr: slot);
4120
4121 BUG_ON(slot < 0);
4122 if (slot >= nritems) {
4123 btrfs_print_leaf(l: leaf);
4124 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4125 slot, nritems);
4126 BUG();
4127 }
4128
4129 /*
4130 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4131 */
4132 /* first correct the data pointers */
4133 btrfs_init_map_token(token: &token, eb: leaf);
4134 for (i = slot; i < nritems; i++) {
4135 u32 ioff;
4136
4137 ioff = btrfs_token_item_offset(token: &token, slot: i);
4138 btrfs_set_token_item_offset(token: &token, slot: i, val: ioff - data_size);
4139 }
4140
4141 /* shift the data */
4142 memmove_leaf_data(leaf, dst_offset: data_end - data_size, src_offset: data_end,
4143 len: old_data - data_end);
4144
4145 data_end = old_data;
4146 old_size = btrfs_item_size(eb: leaf, slot);
4147 btrfs_set_item_size(eb: leaf, slot, val: old_size + data_size);
4148 btrfs_mark_buffer_dirty(trans, buf: leaf);
4149
4150 if (btrfs_leaf_free_space(leaf) < 0) {
4151 btrfs_print_leaf(l: leaf);
4152 BUG();
4153 }
4154}
4155
4156/*
4157 * Make space in the node before inserting one or more items.
4158 *
4159 * @trans: transaction handle
4160 * @root: root we are inserting items to
4161 * @path: points to the leaf/slot where we are going to insert new items
4162 * @batch: information about the batch of items to insert
4163 *
4164 * Main purpose is to save stack depth by doing the bulk of the work in a
4165 * function that doesn't call btrfs_search_slot
4166 */
4167static void setup_items_for_insert(struct btrfs_trans_handle *trans,
4168 struct btrfs_root *root, struct btrfs_path *path,
4169 const struct btrfs_item_batch *batch)
4170{
4171 struct btrfs_fs_info *fs_info = root->fs_info;
4172 int i;
4173 u32 nritems;
4174 unsigned int data_end;
4175 struct btrfs_disk_key disk_key;
4176 struct extent_buffer *leaf;
4177 int slot;
4178 struct btrfs_map_token token;
4179 u32 total_size;
4180
4181 /*
4182 * Before anything else, update keys in the parent and other ancestors
4183 * if needed, then release the write locks on them, so that other tasks
4184 * can use them while we modify the leaf.
4185 */
4186 if (path->slots[0] == 0) {
4187 btrfs_cpu_key_to_disk(disk_key: &disk_key, cpu_key: &batch->keys[0]);
4188 fixup_low_keys(trans, path, key: &disk_key, level: 1);
4189 }
4190 btrfs_unlock_up_safe(path, level: 1);
4191
4192 leaf = path->nodes[0];
4193 slot = path->slots[0];
4194
4195 nritems = btrfs_header_nritems(eb: leaf);
4196 data_end = leaf_data_end(leaf);
4197 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4198
4199 if (btrfs_leaf_free_space(leaf) < total_size) {
4200 btrfs_print_leaf(l: leaf);
4201 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4202 total_size, btrfs_leaf_free_space(leaf));
4203 BUG();
4204 }
4205
4206 btrfs_init_map_token(token: &token, eb: leaf);
4207 if (slot != nritems) {
4208 unsigned int old_data = btrfs_item_data_end(eb: leaf, nr: slot);
4209
4210 if (old_data < data_end) {
4211 btrfs_print_leaf(l: leaf);
4212 btrfs_crit(fs_info,
4213 "item at slot %d with data offset %u beyond data end of leaf %u",
4214 slot, old_data, data_end);
4215 BUG();
4216 }
4217 /*
4218 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4219 */
4220 /* first correct the data pointers */
4221 for (i = slot; i < nritems; i++) {
4222 u32 ioff;
4223
4224 ioff = btrfs_token_item_offset(token: &token, slot: i);
4225 btrfs_set_token_item_offset(token: &token, slot: i,
4226 val: ioff - batch->total_data_size);
4227 }
4228 /* shift the items */
4229 memmove_leaf_items(leaf, dst_item: slot + batch->nr, src_item: slot, nr_items: nritems - slot);
4230
4231 /* shift the data */
4232 memmove_leaf_data(leaf, dst_offset: data_end - batch->total_data_size,
4233 src_offset: data_end, len: old_data - data_end);
4234 data_end = old_data;
4235 }
4236
4237 /* setup the item for the new data */
4238 for (i = 0; i < batch->nr; i++) {
4239 btrfs_cpu_key_to_disk(disk_key: &disk_key, cpu_key: &batch->keys[i]);
4240 btrfs_set_item_key(eb: leaf, disk_key: &disk_key, nr: slot + i);
4241 data_end -= batch->data_sizes[i];
4242 btrfs_set_token_item_offset(token: &token, slot: slot + i, val: data_end);
4243 btrfs_set_token_item_size(token: &token, slot: slot + i, val: batch->data_sizes[i]);
4244 }
4245
4246 btrfs_set_header_nritems(eb: leaf, val: nritems + batch->nr);
4247 btrfs_mark_buffer_dirty(trans, buf: leaf);
4248
4249 if (btrfs_leaf_free_space(leaf) < 0) {
4250 btrfs_print_leaf(l: leaf);
4251 BUG();
4252 }
4253}
4254
4255/*
4256 * Insert a new item into a leaf.
4257 *
4258 * @trans: Transaction handle.
4259 * @root: The root of the btree.
4260 * @path: A path pointing to the target leaf and slot.
4261 * @key: The key of the new item.
4262 * @data_size: The size of the data associated with the new key.
4263 */
4264void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
4265 struct btrfs_root *root,
4266 struct btrfs_path *path,
4267 const struct btrfs_key *key,
4268 u32 data_size)
4269{
4270 struct btrfs_item_batch batch;
4271
4272 batch.keys = key;
4273 batch.data_sizes = &data_size;
4274 batch.total_data_size = data_size;
4275 batch.nr = 1;
4276
4277 setup_items_for_insert(trans, root, path, batch: &batch);
4278}
4279
4280/*
4281 * Given a key and some data, insert items into the tree.
4282 * This does all the path init required, making room in the tree if needed.
4283 *
4284 * Returns: 0 on success
4285 * -EEXIST if the first key already exists
4286 * < 0 on other errors
4287 */
4288int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4289 struct btrfs_root *root,
4290 struct btrfs_path *path,
4291 const struct btrfs_item_batch *batch)
4292{
4293 int ret = 0;
4294 int slot;
4295 u32 total_size;
4296
4297 total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4298 ret = btrfs_search_slot(trans, root, key: &batch->keys[0], p: path, ins_len: total_size, cow: 1);
4299 if (ret == 0)
4300 return -EEXIST;
4301 if (ret < 0)
4302 return ret;
4303
4304 slot = path->slots[0];
4305 BUG_ON(slot < 0);
4306
4307 setup_items_for_insert(trans, root, path, batch);
4308 return 0;
4309}
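
/*
 * Typical use of btrfs_insert_empty_items() (a minimal sketch: 'trans',
 * 'root', 'data0'/'data1' and the keys and sizes below are assumptions made
 * up for illustration; keys[] must be filled in ascending key order and
 * error handling is omitted):
 *
 *	struct btrfs_key keys[2];
 *	u32 sizes[2] = { 16, 32 };
 *	struct btrfs_item_batch batch = {
 *		.keys = keys,
 *		.data_sizes = sizes,
 *		.total_data_size = 48,
 *		.nr = 2,
 *	};
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	int ret;
 *
 *	ret = btrfs_insert_empty_items(trans, root, path, &batch);
 *	if (ret == 0) {
 *		struct extent_buffer *leaf = path->nodes[0];
 *		unsigned long ptr;
 *
 *		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 *		write_extent_buffer(leaf, data0, ptr, sizes[0]);
 *		ptr = btrfs_item_ptr_offset(leaf, path->slots[0] + 1);
 *		write_extent_buffer(leaf, data1, ptr, sizes[1]);
 *		btrfs_mark_buffer_dirty(trans, leaf);
 *	}
 *	btrfs_free_path(path);
 */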
4310
4311/*
4312 * Given a key and some data, insert an item into the tree.
4313 * This does all the path init required, making room in the tree if needed.
4314 */
4315int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4316 const struct btrfs_key *cpu_key, void *data,
4317 u32 data_size)
4318{
4319 int ret = 0;
4320 struct btrfs_path *path;
4321 struct extent_buffer *leaf;
4322 unsigned long ptr;
4323
4324 path = btrfs_alloc_path();
4325 if (!path)
4326 return -ENOMEM;
4327 ret = btrfs_insert_empty_item(trans, root, path, key: cpu_key, data_size);
4328 if (!ret) {
4329 leaf = path->nodes[0];
4330 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4331 write_extent_buffer(eb: leaf, src: data, start: ptr, len: data_size);
4332 btrfs_mark_buffer_dirty(trans, buf: leaf);
4333 }
4334 btrfs_free_path(p: path);
4335 return ret;
4336}
4337
4338/*
4339 * This function duplicates an item, giving 'new_key' to the new item.
4340 * It guarantees both items live in the same tree leaf and the new item is
4341 * contiguous with the original item.
4342 *
4343 * This allows us to split a file extent in place, keeping a lock on the leaf
4344 * the entire time.
4345 */
4346int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4347 struct btrfs_root *root,
4348 struct btrfs_path *path,
4349 const struct btrfs_key *new_key)
4350{
4351 struct extent_buffer *leaf;
4352 int ret;
4353 u32 item_size;
4354
4355 leaf = path->nodes[0];
4356 item_size = btrfs_item_size(eb: leaf, slot: path->slots[0]);
4357 ret = setup_leaf_for_split(trans, root, path,
4358 ins_len: item_size + sizeof(struct btrfs_item));
4359 if (ret)
4360 return ret;
4361
4362 path->slots[0]++;
4363 btrfs_setup_item_for_insert(trans, root, path, key: new_key, data_size: item_size);
4364 leaf = path->nodes[0];
4365 memcpy_extent_buffer(dst: leaf,
4366 btrfs_item_ptr_offset(leaf, path->slots[0]),
4367 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4368 len: item_size);
4369 return 0;
4370}
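
/*
 * Note for btrfs_duplicate_item(): on return path->slots[0] points at the
 * new copy (the slot was advanced before inserting it), with the original
 * item immediately before it in the same leaf, so the caller can modify
 * either copy in place, e.g. when splitting a file extent item.
 */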
4371
4372/*
4373 * delete the pointer from a given node.
4374 *
4375 * the tree should have been previously balanced so the deletion does not
4376 * empty a node.
4377 *
4378 * This is exported for use inside btrfs-progs, don't un-export it.
4379 */
4380int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4381 struct btrfs_path *path, int level, int slot)
4382{
4383 struct extent_buffer *parent = path->nodes[level];
4384 u32 nritems;
4385 int ret;
4386
4387 nritems = btrfs_header_nritems(eb: parent);
4388 if (slot != nritems - 1) {
4389 if (level) {
4390 ret = btrfs_tree_mod_log_insert_move(eb: parent, dst_slot: slot,
4391 src_slot: slot + 1, nr_items: nritems - slot - 1);
4392 if (ret < 0) {
4393 btrfs_abort_transaction(trans, ret);
4394 return ret;
4395 }
4396 }
4397 memmove_extent_buffer(dst: parent,
4398 dst_offset: btrfs_node_key_ptr_offset(eb: parent, nr: slot),
4399 src_offset: btrfs_node_key_ptr_offset(eb: parent, nr: slot + 1),
4400 len: sizeof(struct btrfs_key_ptr) *
4401 (nritems - slot - 1));
4402 } else if (level) {
4403 ret = btrfs_tree_mod_log_insert_key(eb: parent, slot,
4404 op: BTRFS_MOD_LOG_KEY_REMOVE);
4405 if (ret < 0) {
4406 btrfs_abort_transaction(trans, ret);
4407 return ret;
4408 }
4409 }
4410
4411 nritems--;
4412 btrfs_set_header_nritems(eb: parent, val: nritems);
4413 if (nritems == 0 && parent == root->node) {
4414 BUG_ON(btrfs_header_level(root->node) != 1);
4415 /* just turn the root into a leaf and break */
4416 btrfs_set_header_level(eb: root->node, val: 0);
4417 } else if (slot == 0) {
4418 struct btrfs_disk_key disk_key;
4419
4420 btrfs_node_key(eb: parent, disk_key: &disk_key, nr: 0);
4421 fixup_low_keys(trans, path, key: &disk_key, level: level + 1);
4422 }
4423 btrfs_mark_buffer_dirty(trans, buf: parent);
4424 return 0;
4425}
4426
4427/*
4428 * a helper function to delete the leaf pointed to by path->slots[1] and
4429 * path->nodes[1].
4430 *
4431 * This deletes the pointer in path->nodes[1] and frees the leaf
4432 * block extent. zero is returned if it all worked out, < 0 otherwise.
4433 *
4434 * The path must have already been setup for deleting the leaf, including
4435 * all the proper balancing. path->nodes[1] must be locked.
4436 */
4437static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
4438 struct btrfs_root *root,
4439 struct btrfs_path *path,
4440 struct extent_buffer *leaf)
4441{
4442 int ret;
4443
4444 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4445 ret = btrfs_del_ptr(trans, root, path, level: 1, slot: path->slots[1]);
4446 if (ret < 0)
4447 return ret;
4448
4449 /*
4450	 * btrfs_free_extent is expensive, so we want to make sure we
4451	 * aren't holding any locks when we call it.
4452 */
4453 btrfs_unlock_up_safe(path, level: 0);
4454
4455 root_sub_used_bytes(root);
4456
4457 atomic_inc(v: &leaf->refs);
4458 btrfs_free_tree_block(trans, root_id: btrfs_root_id(root), buf: leaf, parent: 0, last_ref: 1);
4459 free_extent_buffer_stale(eb: leaf);
4460 return 0;
4461}
4462/*
4463 * delete the item at the leaf level in path. If that empties
4464 * the leaf, remove it from the tree
4465 */
4466int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4467 struct btrfs_path *path, int slot, int nr)
4468{
4469 struct btrfs_fs_info *fs_info = root->fs_info;
4470 struct extent_buffer *leaf;
4471 int ret = 0;
4472 int wret;
4473 u32 nritems;
4474
4475 leaf = path->nodes[0];
4476 nritems = btrfs_header_nritems(eb: leaf);
4477
4478 if (slot + nr != nritems) {
4479 const u32 last_off = btrfs_item_offset(eb: leaf, slot: slot + nr - 1);
4480 const int data_end = leaf_data_end(leaf);
4481 struct btrfs_map_token token;
4482 u32 dsize = 0;
4483 int i;
4484
4485 for (i = 0; i < nr; i++)
4486 dsize += btrfs_item_size(eb: leaf, slot: slot + i);
4487
4488 memmove_leaf_data(leaf, dst_offset: data_end + dsize, src_offset: data_end,
4489 len: last_off - data_end);
4490
4491 btrfs_init_map_token(token: &token, eb: leaf);
4492 for (i = slot + nr; i < nritems; i++) {
4493 u32 ioff;
4494
4495 ioff = btrfs_token_item_offset(token: &token, slot: i);
4496 btrfs_set_token_item_offset(token: &token, slot: i, val: ioff + dsize);
4497 }
4498
4499 memmove_leaf_items(leaf, dst_item: slot, src_item: slot + nr, nr_items: nritems - slot - nr);
4500 }
4501 btrfs_set_header_nritems(eb: leaf, val: nritems - nr);
4502 nritems -= nr;
4503
4504 /* delete the leaf if we've emptied it */
4505 if (nritems == 0) {
4506 if (leaf == root->node) {
4507 btrfs_set_header_level(eb: leaf, val: 0);
4508 } else {
4509 btrfs_clear_buffer_dirty(trans, buf: leaf);
4510 ret = btrfs_del_leaf(trans, root, path, leaf);
4511 if (ret < 0)
4512 return ret;
4513 }
4514 } else {
4515 int used = leaf_space_used(l: leaf, start: 0, nr: nritems);
4516 if (slot == 0) {
4517 struct btrfs_disk_key disk_key;
4518
4519 btrfs_item_key(eb: leaf, disk_key: &disk_key, nr: 0);
4520 fixup_low_keys(trans, path, key: &disk_key, level: 1);
4521 }
4522
4523 /*
4524 * Try to delete the leaf if it is mostly empty. We do this by
4525 * trying to move all its items into its left and right neighbours.
4526 * If we can't move all the items, then we don't delete it - it's
4527 * not ideal, but future insertions might fill the leaf with more
4528 * items, or items from other leaves might be moved later into our
4529 * leaf due to deletions on those leaves.
4530 */
4531 if (used < BTRFS_LEAF_DATA_SIZE(info: fs_info) / 3) {
4532 u32 min_push_space;
4533
4534 /* push_leaf_left fixes the path.
4535 * make sure the path still points to our leaf
4536 * for possible call to btrfs_del_ptr below
4537 */
4538 slot = path->slots[1];
4539 atomic_inc(v: &leaf->refs);
4540 /*
4541 * We want to be able to at least push one item to the
4542 * left neighbour leaf, and that's the first item.
4543 */
4544 min_push_space = sizeof(struct btrfs_item) +
4545 btrfs_item_size(eb: leaf, slot: 0);
4546 wret = push_leaf_left(trans, root, path, min_data_size: 0,
4547 data_size: min_push_space, empty: 1, max_slot: (u32)-1);
4548 if (wret < 0 && wret != -ENOSPC)
4549 ret = wret;
4550
4551 if (path->nodes[0] == leaf &&
4552 btrfs_header_nritems(eb: leaf)) {
4553 /*
4554 * If we were not able to push all items from our
4555 * leaf to its left neighbour, then attempt to
4556 * either push all the remaining items to the
4557 * right neighbour or none. There's no advantage
4558 * in pushing only some items, instead of all, as
4559 * it's pointless to end up with a leaf having
4560 * too few items while the neighbours can be full
4561 * or nearly full.
4562 */
4563 nritems = btrfs_header_nritems(eb: leaf);
4564 min_push_space = leaf_space_used(l: leaf, start: 0, nr: nritems);
4565 wret = push_leaf_right(trans, root, path, min_data_size: 0,
4566 data_size: min_push_space, empty: 1, min_slot: 0);
4567 if (wret < 0 && wret != -ENOSPC)
4568 ret = wret;
4569 }
4570
4571 if (btrfs_header_nritems(eb: leaf) == 0) {
4572 path->slots[1] = slot;
4573 ret = btrfs_del_leaf(trans, root, path, leaf);
4574 if (ret < 0)
4575 return ret;
4576 free_extent_buffer(eb: leaf);
4577 ret = 0;
4578 } else {
4579 /* if we're still in the path, make sure
4580 * we're dirty. Otherwise, one of the
4581 * push_leaf functions must have already
4582 * dirtied this buffer
4583 */
4584 if (path->nodes[0] == leaf)
4585 btrfs_mark_buffer_dirty(trans, buf: leaf);
4586 free_extent_buffer(eb: leaf);
4587 }
4588 } else {
4589 btrfs_mark_buffer_dirty(trans, buf: leaf);
4590 }
4591 }
4592 return ret;
4593}
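
/*
 * Deleting a single item typically looks like this (a minimal sketch; the
 * key, 'trans', 'root' and 'path' are assumed to be set up by the caller,
 * and ins_len == -1 asks btrfs_search_slot() to balance for a deletion):
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret == 0)
 *		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
 *	btrfs_release_path(path);
 */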
4594
4595/*
4596 * A helper function to walk down the tree starting at min_key, and looking
4597 * for nodes or leaves that have a minimum transaction id.
4598 * This is used by the btree defrag code and by tree logging.
4599 *
4600 * This does not cow, but it does stuff the starting key it finds back
4601 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4602 * key and get a writable path.
4603 *
4604 * This honors path->lowest_level to prevent descent past a given level
4605 * of the tree.
4606 *
4607 * min_trans indicates the oldest transaction that you are interested
4608 * in walking through. Any nodes or leaves older than min_trans are
4609 * skipped over (without reading them).
4610 *
4611 * returns zero if something useful was found, < 0 on error and 1 if there
4612 * was nothing in the tree that matched the search criteria.
4613 */
4614int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4615 struct btrfs_path *path,
4616 u64 min_trans)
4617{
4618 struct extent_buffer *cur;
4619 struct btrfs_key found_key;
4620 int slot;
4621 int sret;
4622 u32 nritems;
4623 int level;
4624 int ret = 1;
4625 int keep_locks = path->keep_locks;
4626
4627 ASSERT(!path->nowait);
4628 path->keep_locks = 1;
4629again:
4630 cur = btrfs_read_lock_root_node(root);
4631 level = btrfs_header_level(eb: cur);
4632 WARN_ON(path->nodes[level]);
4633 path->nodes[level] = cur;
4634 path->locks[level] = BTRFS_READ_LOCK;
4635
4636 if (btrfs_header_generation(eb: cur) < min_trans) {
4637 ret = 1;
4638 goto out;
4639 }
4640 while (1) {
4641 nritems = btrfs_header_nritems(eb: cur);
4642 level = btrfs_header_level(eb: cur);
4643 sret = btrfs_bin_search(eb: cur, first_slot: 0, key: min_key, slot: &slot);
4644 if (sret < 0) {
4645 ret = sret;
4646 goto out;
4647 }
4648
4649 /* at the lowest level, we're done, setup the path and exit */
4650 if (level == path->lowest_level) {
4651 if (slot >= nritems)
4652 goto find_next_key;
4653 ret = 0;
4654 path->slots[level] = slot;
4655 btrfs_item_key_to_cpu(eb: cur, cpu_key: &found_key, nr: slot);
4656 goto out;
4657 }
4658 if (sret && slot > 0)
4659 slot--;
4660 /*
4661 * check this node pointer against the min_trans parameters.
4662 * If it is too old, skip to the next one.
4663 */
4664 while (slot < nritems) {
4665 u64 gen;
4666
4667 gen = btrfs_node_ptr_generation(eb: cur, nr: slot);
4668 if (gen < min_trans) {
4669 slot++;
4670 continue;
4671 }
4672 break;
4673 }
4674find_next_key:
4675 /*
4676 * we didn't find a candidate key in this node, walk forward
4677 * and find another one
4678 */
4679 if (slot >= nritems) {
4680 path->slots[level] = slot;
4681 sret = btrfs_find_next_key(root, path, key: min_key, lowest_level: level,
4682 min_trans);
4683 if (sret == 0) {
4684 btrfs_release_path(p: path);
4685 goto again;
4686 } else {
4687 goto out;
4688 }
4689 }
4690 /* save our key for returning back */
4691 btrfs_node_key_to_cpu(eb: cur, cpu_key: &found_key, nr: slot);
4692 path->slots[level] = slot;
4693 if (level == path->lowest_level) {
4694 ret = 0;
4695 goto out;
4696 }
4697 cur = btrfs_read_node_slot(parent: cur, slot);
4698 if (IS_ERR(ptr: cur)) {
4699 ret = PTR_ERR(ptr: cur);
4700 goto out;
4701 }
4702
4703 btrfs_tree_read_lock(eb: cur);
4704
4705 path->locks[level - 1] = BTRFS_READ_LOCK;
4706 path->nodes[level - 1] = cur;
4707 unlock_up(path, level, lowest_unlock: 1, min_write_lock_level: 0, NULL);
4708 }
4709out:
4710 path->keep_locks = keep_locks;
4711 if (ret == 0) {
4712 btrfs_unlock_up_safe(path, level: path->lowest_level + 1);
4713 memcpy(min_key, &found_key, sizeof(found_key));
4714 }
4715 return ret;
4716}
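
/*
 * Callers usually iterate btrfs_search_forward() along these lines (a
 * minimal sketch; error handling, the per-item processing and a fully
 * correct key advance are left out):
 *
 *	min_key.objectid = 0;
 *	min_key.type = 0;
 *	min_key.offset = 0;
 *	while (btrfs_search_forward(root, &min_key, path, min_trans) == 0) {
 *		... process the item at path->slots[0] ...
 *		btrfs_release_path(path);
 *		if (min_key.offset == (u64)-1)
 *			break;
 *		min_key.offset++;
 *	}
 */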
4717
4718/*
4719 * this is similar to btrfs_next_leaf, but does not try to preserve
4720 * and fixup the path. It looks for and returns the next key in the
4721 * tree based on the current path and the min_trans parameters.
4722 *
4723 * 0 is returned if another key is found, < 0 if there are any errors
4724 * and 1 is returned if there are no higher keys in the tree
4725 *
4726 * path->keep_locks should be set to 1 on the search made before
4727 * calling this function.
4728 */
4729int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4730 struct btrfs_key *key, int level, u64 min_trans)
4731{
4732 int slot;
4733 struct extent_buffer *c;
4734
4735 WARN_ON(!path->keep_locks && !path->skip_locking);
4736 while (level < BTRFS_MAX_LEVEL) {
4737 if (!path->nodes[level])
4738 return 1;
4739
4740 slot = path->slots[level] + 1;
4741 c = path->nodes[level];
4742next:
4743 if (slot >= btrfs_header_nritems(eb: c)) {
4744 int ret;
4745 int orig_lowest;
4746 struct btrfs_key cur_key;
4747 if (level + 1 >= BTRFS_MAX_LEVEL ||
4748 !path->nodes[level + 1])
4749 return 1;
4750
4751 if (path->locks[level + 1] || path->skip_locking) {
4752 level++;
4753 continue;
4754 }
4755
4756 slot = btrfs_header_nritems(eb: c) - 1;
4757 if (level == 0)
4758 btrfs_item_key_to_cpu(eb: c, cpu_key: &cur_key, nr: slot);
4759 else
4760 btrfs_node_key_to_cpu(eb: c, cpu_key: &cur_key, nr: slot);
4761
4762 orig_lowest = path->lowest_level;
4763 btrfs_release_path(p: path);
4764 path->lowest_level = level;
4765 ret = btrfs_search_slot(NULL, root, key: &cur_key, p: path,
4766 ins_len: 0, cow: 0);
4767 path->lowest_level = orig_lowest;
4768 if (ret < 0)
4769 return ret;
4770
4771 c = path->nodes[level];
4772 slot = path->slots[level];
4773 if (ret == 0)
4774 slot++;
4775 goto next;
4776 }
4777
4778 if (level == 0)
4779 btrfs_item_key_to_cpu(eb: c, cpu_key: key, nr: slot);
4780 else {
4781 u64 gen = btrfs_node_ptr_generation(eb: c, nr: slot);
4782
4783 if (gen < min_trans) {
4784 slot++;
4785 goto next;
4786 }
4787 btrfs_node_key_to_cpu(eb: c, cpu_key: key, nr: slot);
4788 }
4789 return 0;
4790 }
4791 return 1;
4792}
4793
4794int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4795 u64 time_seq)
4796{
4797 int slot;
4798 int level;
4799 struct extent_buffer *c;
4800 struct extent_buffer *next;
4801 struct btrfs_fs_info *fs_info = root->fs_info;
4802 struct btrfs_key key;
4803 bool need_commit_sem = false;
4804 u32 nritems;
4805 int ret;
4806 int i;
4807
4808 /*
4809 * The nowait semantics are used only for write paths, where we don't
4810 * use the tree mod log and sequence numbers.
4811 */
4812 if (time_seq)
4813 ASSERT(!path->nowait);
4814
4815 nritems = btrfs_header_nritems(eb: path->nodes[0]);
4816 if (nritems == 0)
4817 return 1;
4818
4819 btrfs_item_key_to_cpu(eb: path->nodes[0], cpu_key: &key, nr: nritems - 1);
4820again:
4821 level = 1;
4822 next = NULL;
4823 btrfs_release_path(p: path);
4824
4825 path->keep_locks = 1;
4826
4827 if (time_seq) {
4828 ret = btrfs_search_old_slot(root, key: &key, p: path, time_seq);
4829 } else {
4830 if (path->need_commit_sem) {
4831 path->need_commit_sem = 0;
4832 need_commit_sem = true;
4833 if (path->nowait) {
4834 if (!down_read_trylock(sem: &fs_info->commit_root_sem)) {
4835 ret = -EAGAIN;
4836 goto done;
4837 }
4838 } else {
4839 down_read(sem: &fs_info->commit_root_sem);
4840 }
4841 }
4842 ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: 0, cow: 0);
4843 }
4844 path->keep_locks = 0;
4845
4846 if (ret < 0)
4847 goto done;
4848
4849 nritems = btrfs_header_nritems(eb: path->nodes[0]);
4850 /*
4851 * by releasing the path above we dropped all our locks. A balance
4852 * could have added more items next to the key that used to be
4853 * at the very end of the block. So, check again here and
4854 * advance the path if there are now more items available.
4855 */
4856 if (nritems > 0 && path->slots[0] < nritems - 1) {
4857 if (ret == 0)
4858 path->slots[0]++;
4859 ret = 0;
4860 goto done;
4861 }
4862 /*
4863 * So the above check misses one case:
4864 * - after releasing the path above, someone has removed the item that
4865	 * used to be at the very end of the block, and balance between leaves
4866 * gets another one with bigger key.offset to replace it.
4867 *
4868 * This one should be returned as well, or we can get leaf corruption
4869	 * later (esp. in __btrfs_drop_extents()).
4870 *
4871 * And a bit more explanation about this check,
4872 * with ret > 0, the key isn't found, the path points to the slot
4873 * where it should be inserted, so the path->slots[0] item must be the
4874 * bigger one.
4875 */
4876 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4877 ret = 0;
4878 goto done;
4879 }
4880
4881 while (level < BTRFS_MAX_LEVEL) {
4882 if (!path->nodes[level]) {
4883 ret = 1;
4884 goto done;
4885 }
4886
4887 slot = path->slots[level] + 1;
4888 c = path->nodes[level];
4889 if (slot >= btrfs_header_nritems(eb: c)) {
4890 level++;
4891 if (level == BTRFS_MAX_LEVEL) {
4892 ret = 1;
4893 goto done;
4894 }
4895 continue;
4896 }
4897
4898
4899 /*
4900 * Our current level is where we're going to start from, and to
4901 * make sure lockdep doesn't complain we need to drop our locks
4902 * and nodes from 0 to our current level.
4903 */
4904 for (i = 0; i < level; i++) {
4905 if (path->locks[level]) {
4906 btrfs_tree_read_unlock(eb: path->nodes[i]);
4907 path->locks[i] = 0;
4908 }
4909 free_extent_buffer(eb: path->nodes[i]);
4910 path->nodes[i] = NULL;
4911 }
4912
4913 next = c;
4914 ret = read_block_for_search(root, p: path, eb_ret: &next, level,
4915 slot, key: &key);
4916 if (ret == -EAGAIN && !path->nowait)
4917 goto again;
4918
4919 if (ret < 0) {
4920 btrfs_release_path(p: path);
4921 goto done;
4922 }
4923
4924 if (!path->skip_locking) {
4925 ret = btrfs_try_tree_read_lock(eb: next);
4926 if (!ret && path->nowait) {
4927 ret = -EAGAIN;
4928 goto done;
4929 }
4930 if (!ret && time_seq) {
4931 /*
4932 * If we don't get the lock, we may be racing
4933 * with push_leaf_left, holding that lock while
4934 * itself waiting for the leaf we've currently
4935 * locked. To solve this situation, we give up
4936 * on our lock and cycle.
4937 */
4938 free_extent_buffer(eb: next);
4939 btrfs_release_path(p: path);
4940 cond_resched();
4941 goto again;
4942 }
4943 if (!ret)
4944 btrfs_tree_read_lock(eb: next);
4945 }
4946 break;
4947 }
4948 path->slots[level] = slot;
4949 while (1) {
4950 level--;
4951 path->nodes[level] = next;
4952 path->slots[level] = 0;
4953 if (!path->skip_locking)
4954 path->locks[level] = BTRFS_READ_LOCK;
4955 if (!level)
4956 break;
4957
4958 ret = read_block_for_search(root, p: path, eb_ret: &next, level,
4959 slot: 0, key: &key);
4960 if (ret == -EAGAIN && !path->nowait)
4961 goto again;
4962
4963 if (ret < 0) {
4964 btrfs_release_path(p: path);
4965 goto done;
4966 }
4967
4968 if (!path->skip_locking) {
4969 if (path->nowait) {
4970 if (!btrfs_try_tree_read_lock(eb: next)) {
4971 ret = -EAGAIN;
4972 goto done;
4973 }
4974 } else {
4975 btrfs_tree_read_lock(eb: next);
4976 }
4977 }
4978 }
4979 ret = 0;
4980done:
4981 unlock_up(path, level: 0, lowest_unlock: 1, min_write_lock_level: 0, NULL);
4982 if (need_commit_sem) {
4983 int ret2;
4984
4985 path->need_commit_sem = 1;
4986 ret2 = finish_need_commit_sem_search(path);
4987 up_read(sem: &fs_info->commit_root_sem);
4988 if (ret2)
4989 ret = ret2;
4990 }
4991
4992 return ret;
4993}
4994
4995int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4996{
4997 path->slots[0]++;
4998 if (path->slots[0] >= btrfs_header_nritems(eb: path->nodes[0]))
4999 return btrfs_next_old_leaf(root, path, time_seq);
5000 return 0;
5001}
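
/*
 * Together with btrfs_search_slot() this gives the usual forward scan over
 * items (a minimal sketch that assumes the initial search landed on an
 * existing item; pass 0 as time_seq when not replaying the tree mod log):
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	while (ret == 0) {
 *		... examine the item at path->slots[0] of path->nodes[0] ...
 *		ret = btrfs_next_old_item(root, path, 0);
 *	}
 *
 * A return of 1 means the end of the tree was reached and < 0 is an error.
 */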
5002
5003/*
5004 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5005 * searching until it gets past min_objectid or finds an item of 'type'
5006 *
5007 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5008 */
5009int btrfs_previous_item(struct btrfs_root *root,
5010 struct btrfs_path *path, u64 min_objectid,
5011 int type)
5012{
5013 struct btrfs_key found_key;
5014 struct extent_buffer *leaf;
5015 u32 nritems;
5016 int ret;
5017
5018 while (1) {
5019 if (path->slots[0] == 0) {
5020 ret = btrfs_prev_leaf(root, path);
5021 if (ret != 0)
5022 return ret;
5023 } else {
5024 path->slots[0]--;
5025 }
5026 leaf = path->nodes[0];
5027 nritems = btrfs_header_nritems(eb: leaf);
5028 if (nritems == 0)
5029 return 1;
5030 if (path->slots[0] == nritems)
5031 path->slots[0]--;
5032
5033 btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: path->slots[0]);
5034 if (found_key.objectid < min_objectid)
5035 break;
5036 if (found_key.type == type)
5037 return 0;
5038 if (found_key.objectid == min_objectid &&
5039 found_key.type < type)
5040 break;
5041 }
5042 return 1;
5043}
5044
5045/*
5046 * search in extent tree to find a previous Metadata/Data extent item with
5047 * min objectid.
5048 *
5049 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5050 */
5051int btrfs_previous_extent_item(struct btrfs_root *root,
5052 struct btrfs_path *path, u64 min_objectid)
5053{
5054 struct btrfs_key found_key;
5055 struct extent_buffer *leaf;
5056 u32 nritems;
5057 int ret;
5058
5059 while (1) {
5060 if (path->slots[0] == 0) {
5061 ret = btrfs_prev_leaf(root, path);
5062 if (ret != 0)
5063 return ret;
5064 } else {
5065 path->slots[0]--;
5066 }
5067 leaf = path->nodes[0];
5068 nritems = btrfs_header_nritems(eb: leaf);
5069 if (nritems == 0)
5070 return 1;
5071 if (path->slots[0] == nritems)
5072 path->slots[0]--;
5073
5074 btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: path->slots[0]);
5075 if (found_key.objectid < min_objectid)
5076 break;
5077 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5078 found_key.type == BTRFS_METADATA_ITEM_KEY)
5079 return 0;
5080 if (found_key.objectid == min_objectid &&
5081 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5082 break;
5083 }
5084 return 1;
5085}
5086
5087int __init btrfs_ctree_init(void)
5088{
5089 btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);
5090 if (!btrfs_path_cachep)
5091 return -ENOMEM;
5092 return 0;
5093}
5094
5095void __cold btrfs_ctree_exit(void)
5096{
5097 kmem_cache_destroy(s: btrfs_path_cachep);
5098}
5099

source code of linux/fs/btrfs/ctree.c