send.c source code [linux/fs/btrfs/send.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2012 Alexander Block. All rights reserved.
4	*/
5
6	#include <linux/bsearch.h>
7	#include <linux/fs.h>
8	#include <linux/file.h>
9	#include <linux/sort.h>
10	#include <linux/mount.h>
11	#include <linux/xattr.h>
12	#include <linux/posix_acl_xattr.h>
13	#include <linux/radix-tree.h>
14	#include <linux/vmalloc.h>
15	#include <linux/string.h>
16	#include <linux/compat.h>
17	#include <linux/crc32c.h>
18	#include <linux/fsverity.h>
19
20	#include "send.h"
21	#include "ctree.h"
22	#include "backref.h"
23	#include "locking.h"
24	#include "disk-io.h"
25	#include "btrfs_inode.h"
26	#include "transaction.h"
27	#include "compression.h"
28	#include "print-tree.h"
29	#include "accessors.h"
30	#include "dir-item.h"
31	#include "file-item.h"
32	#include "ioctl.h"
33	#include "verity.h"
34	#include "lru_cache.h"
35
/*
 * Maximum number of references an extent can have in order for us to attempt to
 * issue clone operations instead of write operations. This currently exists to
 * avoid hitting limitations of the backreference walking code (taking a lot of
 * time and using too much memory for extents with large number of references).
 */
#define SEND_MAX_EXTENT_REFS	1024
43
/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 *
 * start/end delimit the current path string inside buf; buf points either at
 * inline_buf (initial state) or at a separately allocated buffer of buf_len
 * bytes.
 */
struct fs_path {
	union {
		struct {
			char *start;
			char *end;

			char *buf;
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];
		};
		/*
		 * Average path length does not exceed 200 bytes, we'll have
		 * better packing in the slab and higher chance to satisfy
		 * an allocation later during send.
		 */
		char pad[256];
	};
};
/* Bytes available in inline_buf: whatever the 256 byte pad leaves after the header. */
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
72
73
74	/ reused for each extent /
75	struct clone_root {
76	struct btrfs_root *root;
77	u64 ino;
78	u64 offset;
79	u64 num_bytes;
80	bool found_ref;
81	};
82
#define SEND_MAX_NAME_CACHE_SIZE			256

/*
 * Limit the root_ids array of struct backref_cache_entry to 17 elements.
 * This makes the size of a cache entry to be exactly 192 bytes on x86_64, which
 * can be satisfied from the kmalloc-192 slab, without wasting any space.
 * The most common case is to have a single root for cloning, which corresponds
 * to the send root. Having the user specify more than 16 clone roots is not
 * common, and in such rare cases we simply don't use caching if the number of
 * cloning roots that lead down to a leaf is more than 17.
 */
#define SEND_MAX_BACKREF_CACHE_ROOTS			17

/*
 * Max number of entries in the cache.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
 * maple tree's internal nodes, is 24K.
 */
#define SEND_MAX_BACKREF_CACHE_SIZE 128
102
103	/*
104	* A backref cache entry maps a leaf to a list of IDs of roots from which the
105	* leaf is accessible and we can use for clone operations.
106	* With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on
107	* x86_64).
108	*/
109	struct backref_cache_entry {
110	struct btrfs_lru_cache_entry entry;
111	u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
112	/ Number of valid elements in the root_ids array. /
113	int num_roots;
114	};
115
116	/ See the comment at lru_cache.h about struct btrfs_lru_cache_entry. /
117	static_assert(offsetof(struct backref_cache_entry, entry) == `0`);
118
/*
 * Max number of entries in the cache that stores directories that were already
 * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_CREATED_CACHE_SIZE			64

/*
 * Max number of entries in the directory utimes cache (presumably the
 * dir_utimes_cache member of struct send_ctx).
 * NOTE(review): the previous comment here was a verbatim copy of the one for
 * SEND_MAX_DIR_CREATED_CACHE_SIZE; confirm that this cache also uses raw
 * struct btrfs_lru_cache_entry entries. If so the same sizing applies: at most
 * 4096 bytes (64 entries served from the kmalloc-64 slab).
 */
#define SEND_MAX_DIR_UTIMES_CACHE_SIZE			64
134
135	struct send_ctx {
136	struct file *send_filp;
137	loff_t send_off;
138	char *send_buf;
139	u32 send_size;
140	u32 send_max_size;
141	/*
142	* Whether BTRFS_SEND_A_DATA attribute was already added to current
143	* command (since protocol v2, data must be the last attribute).
144	*/
145	bool put_data;
146	struct page **send_buf_pages;
147	u64 flags; / 'flags' member of btrfs_ioctl_send_args is u64 /
148	/ Protocol version compatibility requested /
149	u32 proto;
150
151	struct btrfs_root *send_root;
152	struct btrfs_root *parent_root;
153	struct clone_root *clone_roots;
154	int clone_roots_cnt;
155
156	/ current state of the compare_tree call /
157	struct btrfs_path *left_path;
158	struct btrfs_path *right_path;
159	struct btrfs_key *cmp_key;
160
161	/*
162	* Keep track of the generation of the last transaction that was used
163	* for relocating a block group. This is periodically checked in order
164	* to detect if a relocation happened since the last check, so that we
165	* don't operate on stale extent buffers for nodes (level >= 1) or on
166	* stale disk_bytenr values of file extent items.
167	*/
168	u64 last_reloc_trans;
169
170	/*
171	* infos of the currently processed inode. In case of deleted inodes,
172	* these are the values from the deleted inode.
173	*/
174	u64 cur_ino;
175	u64 cur_inode_gen;
176	u64 cur_inode_size;
177	u64 cur_inode_mode;
178	u64 cur_inode_rdev;
179	u64 cur_inode_last_extent;
180	u64 cur_inode_next_write_offset;
181	bool cur_inode_new;
182	bool cur_inode_new_gen;
183	bool cur_inode_deleted;
184	bool ignore_cur_inode;
185	bool cur_inode_needs_verity;
186	void *verity_descriptor;
187
188	u64 send_progress;
189
190	struct list_head new_refs;
191	struct list_head deleted_refs;
192
193	struct btrfs_lru_cache name_cache;
194
195	/*
196	* The inode we are currently processing. It's not NULL only when we
197	* need to issue write commands for data extents from this inode.
198	*/
199	struct inode *cur_inode;
200	struct file_ra_state ra;
201	u64 page_cache_clear_start;
202	bool clean_page_cache;
203
204	/*
205	* We process inodes by their increasing order, so if before an
206	* incremental send we reverse the parent/child relationship of
207	* directories such that a directory with a lower inode number was
208	* the parent of a directory with a higher inode number, and the one
209	* becoming the new parent got renamed too, we can't rename/move the
210	* directory with lower inode number when we finish processing it - we
211	* must process the directory with higher inode number first, then
212	* rename/move it and then rename/move the directory with lower inode
213	* number. Example follows.
214	*
215	* Tree state when the first send was performed:
216	*
217	* .
218	* \|-- a (ino 257)
219	* \|-- b (ino 258)
220	* \|
221	* \|
222	* \|-- c (ino 259)
223	* \| \|-- d (ino 260)
224	* \|
225	* \|-- c2 (ino 261)
226	*
227	* Tree state when the second (incremental) send is performed:
228	*
229	* .
230	* \|-- a (ino 257)
231	* \|-- b (ino 258)
232	* \|-- c2 (ino 261)
233	* \|-- d2 (ino 260)
234	* \|-- cc (ino 259)
235	*
236	* The sequence of steps that lead to the second state was:
237	*
238	* mv /a/b/c/d /a/b/c2/d2
239	* mv /a/b/c /a/b/c2/d2/cc
240	*
241	* "c" has lower inode number, but we can't move it (2nd mv operation)
242	* before we move "d", which has higher inode number.
243	*
244	* So we just memorize which move/rename operations must be performed
245	* later when their respective parent is processed and moved/renamed.
246	*/
247
248	/ Indexed by parent directory inode number. /
249	struct rb_root pending_dir_moves;
250
251	/*
252	* Reverse index, indexed by the inode number of a directory that
253	* is waiting for the move/rename of its immediate parent before its
254	* own move/rename can be performed.
255	*/
256	struct rb_root waiting_dir_moves;
257
258	/*
259	* A directory that is going to be rm'ed might have a child directory
260	* which is in the pending directory moves index above. In this case,
261	* the directory can only be removed after the move/rename of its child
262	* is performed. Example:
263	*
264	* Parent snapshot:
265	*
266	* . (ino 256)
267	* \|-- a/ (ino 257)
268	* \|-- b/ (ino 258)
269	* \|-- c/ (ino 259)
270	* \| \|-- x/ (ino 260)
271	* \|
272	* \|-- y/ (ino 261)
273	*
274	* Send snapshot:
275	*
276	* . (ino 256)
277	* \|-- a/ (ino 257)
278	* \|-- b/ (ino 258)
279	* \|-- YY/ (ino 261)
280	* \|-- x/ (ino 260)
281	*
282	* Sequence of steps that lead to the send snapshot:
283	* rm -f /a/b/c/foo.txt
284	* mv /a/b/y /a/b/YY
285	* mv /a/b/c/x /a/b/YY
286	* rmdir /a/b/c
287	*
288	* When the child is processed, its move/rename is delayed until its
289	* parent is processed (as explained above), but all other operations
290	* like update utimes, chown, chgrp, etc, are performed and the paths
291	* that it uses for those operations must use the orphanized name of
292	* its parent (the directory we're going to rm later), so we need to
293	* memorize that name.
294	*
295	* Indexed by the inode number of the directory to be deleted.
296	*/
297	struct rb_root orphan_dirs;
298
299	struct rb_root rbtree_new_refs;
300	struct rb_root rbtree_deleted_refs;
301
302	struct btrfs_lru_cache backref_cache;
303	u64 backref_cache_last_reloc_trans;
304
305	struct btrfs_lru_cache dir_created_cache;
306	struct btrfs_lru_cache dir_utimes_cache;
307	};
308
309	struct pending_dir_move {
310	struct rb_node node;
311	struct list_head list;
312	u64 parent_ino;
313	u64 ino;
314	u64 gen;
315	struct list_head update_refs;
316	};
317
318	struct waiting_dir_move {
319	struct rb_node node;
320	u64 ino;
321	/*
322	* There might be some directory that could not be removed because it
323	* was waiting for this directory inode to be moved first. Therefore
324	* after this directory is moved, we can try to rmdir the ino rmdir_ino.
325	*/
326	u64 rmdir_ino;
327	u64 rmdir_gen;
328	bool orphanized;
329	};
330
331	struct orphan_dir_info {
332	struct rb_node node;
333	u64 ino;
334	u64 gen;
335	u64 last_dir_index_offset;
336	u64 dir_high_seq_ino;
337	};
338
339	struct name_cache_entry {
340	/*
341	* The key in the entry is an inode number, and the generation matches
342	* the inode's generation.
343	*/
344	struct btrfs_lru_cache_entry entry;
345	u64 parent_ino;
346	u64 parent_gen;
347	int ret;
348	int need_later_update;
349	int name_len;
350	char name[];
351	};
352
353	/ See the comment at lru_cache.h about struct btrfs_lru_cache_entry. /
354	static_assert(offsetof(struct name_cache_entry, entry) == `0`);
355
/* NOTE(review): the users of ADVANCE/ADVANCE_ONLY_NEXT are outside this view. */
#define ADVANCE							1
#define ADVANCE_ONLY_NEXT					-1

/* Classification of an item when comparing the send root against the parent root. */
enum btrfs_compare_tree_result {
	BTRFS_COMPARE_TREE_NEW,
	BTRFS_COMPARE_TREE_DELETED,
	BTRFS_COMPARE_TREE_CHANGED,
	BTRFS_COMPARE_TREE_SAME,
};
365
366	__cold
367	static void inconsistent_snapshot_error(struct send_ctx *sctx,
368	enum btrfs_compare_tree_result result,
369	const char *what)
370	{
371	const char *result_string;
372
373	switch (result) {
374	case BTRFS_COMPARE_TREE_NEW:
375	result_string = "new";
376	break;
377	case BTRFS_COMPARE_TREE_DELETED:
378	result_string = "deleted";
379	break;
380	case BTRFS_COMPARE_TREE_CHANGED:
381	result_string = "updated";
382	break;
383	case BTRFS_COMPARE_TREE_SAME:
384	ASSERT(`0`);
385	result_string = "unchanged";
386	break;
387	default:
388	ASSERT(`0`);
389	result_string = "unexpected";
390	}
391
392	btrfs_err(sctx->send_root->fs_info,
393	"Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
394	result_string, what, sctx->cmp_key->objectid,
395	sctx->send_root->root_key.objectid,
396	(sctx->parent_root ?
397	sctx->parent_root->root_key.objectid : `0`));
398	}
399
400	__maybe_unused
401	static bool proto_cmd_ok(const struct send_ctx sctx, int* cmd)
402	{
403	switch (sctx->proto) {
404	case `1`: return cmd <= BTRFS_SEND_C_MAX_V1;
405	case `2`: return cmd <= BTRFS_SEND_C_MAX_V2;
406	case `3`: return cmd <= BTRFS_SEND_C_MAX_V3;
407	default: return false;
408	}
409	}
410
411	static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
412
413	static struct waiting_dir_move *
414	get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
415
416	static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
417
418	static int need_send_hole(struct send_ctx *sctx)
419	{
420	return (sctx->parent_root && !sctx->cur_inode_new &&
421	!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
422	S_ISREG(sctx->cur_inode_mode));
423	}
424
425	static void fs_path_reset(struct fs_path *p)
426	{
427	if (p->reversed) {
428	p->start = p->buf + p->buf_len - `1`;
429	p->end = p->start;
430	*p->start = `0`;
431	} else {
432	p->start = p->buf;
433	p->end = p->start;
434	*p->start = `0`;
435	}
436	}
437
438	static struct fs_path fs_path_alloc(void*)
439	{
440	struct fs_path *p;
441
442	p = kmalloc(size: sizeof(*p), GFP_KERNEL);
443	if (!p)
444	return NULL;
445	p->reversed = `0`;
446	p->buf = p->inline_buf;
447	p->buf_len = FS_PATH_INLINE_SIZE;
448	fs_path_reset(p);
449	return p;
450	}
451
452	static struct fs_path fs_path_alloc_reversed(void*)
453	{
454	struct fs_path *p;
455
456	p = fs_path_alloc();
457	if (!p)
458	return NULL;
459	p->reversed = `1`;
460	fs_path_reset(p);
461	return p;
462	}
463
464	static void fs_path_free(struct fs_path *p)
465	{
466	if (!p)
467	return;
468	if (p->buf != p->inline_buf)
469	kfree(objp: p->buf);
470	kfree(objp: p);
471	}
472
473	static int fs_path_len(struct fs_path *p)
474	{
475	return p->end - p->start;
476	}
477
478	static int fs_path_ensure_buf(struct fs_path p, int* len)
479	{
480	char *tmp_buf;
481	int path_len;
482	int old_buf_len;
483
484	len++;
485
486	if (p->buf_len >= len)
487	return `0`;
488
489	if (len > PATH_MAX) {
490	WARN_ON(`1`);
491	return -ENOMEM;
492	}
493
494	path_len = p->end - p->start;
495	old_buf_len = p->buf_len;
496
497	/*
498	* Allocate to the next largest kmalloc bucket size, to let
499	* the fast path happen most of the time.
500	*/
501	len = kmalloc_size_roundup(size: len);
502	/*
503	* First time the inline_buf does not suffice
504	*/
505	if (p->buf == p->inline_buf) {
506	tmp_buf = kmalloc(size: len, GFP_KERNEL);
507	if (tmp_buf)
508	memcpy(tmp_buf, p->buf, old_buf_len);
509	} else {
510	tmp_buf = krealloc(objp: p->buf, new_size: len, GFP_KERNEL);
511	}
512	if (!tmp_buf)
513	return -ENOMEM;
514	p->buf = tmp_buf;
515	p->buf_len = len;
516
517	if (p->reversed) {
518	tmp_buf = p->buf + old_buf_len - path_len - `1`;
519	p->end = p->buf + p->buf_len - `1`;
520	p->start = p->end - path_len;
521	memmove(p->start, tmp_buf, path_len + `1`);
522	} else {
523	p->start = p->buf;
524	p->end = p->start + path_len;
525	}
526	return `0`;
527	}
528
529	static int fs_path_prepare_for_add(struct fs_path p, int* name_len,
530	char **prepared)
531	{
532	int ret;
533	int new_len;
534
535	new_len = p->end - p->start + name_len;
536	if (p->start != p->end)
537	new_len++;
538	ret = fs_path_ensure_buf(p, len: new_len);
539	if (ret < `0`)
540	goto out;
541
542	if (p->reversed) {
543	if (p->start != p->end)
544	*--p->start = `'/'`;
545	p->start -= name_len;
546	*prepared = p->start;
547	} else {
548	if (p->start != p->end)
549	*p->end++ = `'/'`;
550	*prepared = p->end;
551	p->end += name_len;
552	*p->end = `0`;
553	}
554
555	out:
556	return ret;
557	}
558
/*
 * Add the element @name (@name_len bytes, no terminator needed) to @p.
 * Returns 0 on success or a negative errno.
 */
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, name_len, &prepared);
	if (ret < 0)
		goto out;
	memcpy(prepared, name, name_len);

out:
	return ret;
}
572
573	static int fs_path_add_path(struct fs_path p, struct* fs_path *p2)
574	{
575	int ret;
576	char *prepared;
577
578	ret = fs_path_prepare_for_add(p, name_len: p2->end - p2->start, prepared: &prepared);
579	if (ret < `0`)
580	goto out;
581	memcpy(prepared, p2->start, p2->end - p2->start);
582
583	out:
584	return ret;
585	}
586
/*
 * Add an element to @p whose @len name bytes are read from extent buffer @eb
 * at offset @off. Returns 0 on success or a negative errno.
 */
static int fs_path_add_from_extent_buffer(struct fs_path *p,
					  struct extent_buffer *eb,
					  unsigned long off, int len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, len, &prepared);
	if (ret < 0)
		goto out;

	read_extent_buffer(eb, prepared, off, len);

out:
	return ret;
}
603
604	static int fs_path_copy(struct fs_path p, struct* fs_path *from)
605	{
606	p->reversed = from->reversed;
607	fs_path_reset(p);
608
609	return fs_path_add_path(p, p2: from);
610	}
611
612	static void fs_path_unreverse(struct fs_path *p)
613	{
614	char *tmp;
615	int len;
616
617	if (!p->reversed)
618	return;
619
620	tmp = p->start;
621	len = p->end - p->start;
622	p->start = p->buf;
623	p->end = p->start + len;
624	memmove(p->start, tmp, len + `1`);
625	p->reversed = `0`;
626	}
627
628	static struct btrfs_path alloc_path_for_send(void*)
629	{
630	struct btrfs_path *path;
631
632	path = btrfs_alloc_path();
633	if (!path)
634	return NULL;
635	path->search_commit_root = `1`;
636	path->skip_locking = `1`;
637	path->need_commit_sem = `1`;
638	return path;
639	}
640
641	static int write_buf(struct file filp, const* void buf, u32 len, loff_t off)
642	{
643	int ret;
644	u32 pos = `0`;
645
646	while (pos < len) {
647	ret = kernel_write(filp, buf + pos, len - pos, off);
648	if (ret < `0`)
649	return ret;
650	if (ret == `0`)
651	return -EIO;
652	pos += ret;
653	}
654
655	return `0`;
656	}
657
658	static int tlv_put(struct send_ctx sctx, u16 attr, const* void data, int* len)
659	{
660	struct btrfs_tlv_header *hdr;
661	int total_len = sizeof(*hdr) + len;
662	int left = sctx->send_max_size - sctx->send_size;
663
664	if (WARN_ON_ONCE(sctx->put_data))
665	return -EINVAL;
666
667	if (unlikely(left < total_len))
668	return -EOVERFLOW;
669
670	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
671	put_unaligned_le16(val: attr, p: &hdr->tlv_type);
672	put_unaligned_le16(val: len, p: &hdr->tlv_len);
673	memcpy(hdr + `1`, data, len);
674	sctx->send_size += total_len;
675
676	return `0`;
677	}
678
679	#define TLV_PUT_DEFINE_INT(bits) \
680	static int tlv_put_u##bits(struct send_ctx *sctx, \
681	u##bits attr, u##bits value) \
682	{ \
683	__le##bits __tmp = cpu_to_le##bits(value); \
684	return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
685	}
686
687	TLV_PUT_DEFINE_INT(`8`)
688	TLV_PUT_DEFINE_INT(`32`)
689	TLV_PUT_DEFINE_INT(`64`)
690
691	static int tlv_put_string(struct send_ctx *sctx, u16 attr,
692	const char str, int* len)
693	{
694	if (len == -`1`)
695	len = strlen(str);
696	return tlv_put(sctx, attr, data: str, len);
697	}
698
699	static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
700	const u8 *uuid)
701	{
702	return tlv_put(sctx, attr, data: uuid, BTRFS_UUID_SIZE);
703	}
704
705	static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
706	struct extent_buffer *eb,
707	struct btrfs_timespec *ts)
708	{
709	struct btrfs_timespec bts;
710	read_extent_buffer(eb, dst: &bts, start: (unsigned long)ts, len: sizeof(bts));
711	return tlv_put(sctx, attr, data: &bts, len: sizeof(bts));
712	}
713
714
/*
 * Convenience wrappers around the tlv_put*() helpers. They all assign to a
 * local variable named 'ret' and jump to a label named 'tlv_put_failure' on
 * error, so both must exist in the calling function.
 *
 * NOTE(review): TLV_PUT_U16 expands to tlv_put_u16(), which is not generated
 * by the TLV_PUT_DEFINE_INT() instantiations (8, 32 and 64) in this file.
 */
#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
/* 'p' is parenthesized so that any pointer expression can be passed. */
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, (p)->start, \
			(p)->end - (p)->start); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while(0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
758
759	static int send_header(struct send_ctx *sctx)
760	{
761	struct btrfs_stream_header hdr;
762
763	strcpy(p: hdr.magic, BTRFS_SEND_STREAM_MAGIC);
764	hdr.version = cpu_to_le32(sctx->proto);
765	return write_buf(filp: sctx->send_filp, buf: &hdr, len: sizeof(hdr),
766	off: &sctx->send_off);
767	}
768
769	/*
770	* For each command/item we want to send to userspace, we call this function.
771	*/
772	static int begin_cmd(struct send_ctx sctx, int* cmd)
773	{
774	struct btrfs_cmd_header *hdr;
775
776	if (WARN_ON(!sctx->send_buf))
777	return -EINVAL;
778
779	if (unlikely(sctx->send_size != `0`)) {
780	btrfs_err(sctx->send_root->fs_info,
781	"send: command header buffer not empty cmd %d offset %llu",
782	cmd, sctx->send_off);
783	return -EINVAL;
784	}
785
786	sctx->send_size += sizeof(*hdr);
787	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
788	put_unaligned_le16(val: cmd, p: &hdr->cmd);
789
790	return `0`;
791	}
792
793	static int send_cmd(struct send_ctx *sctx)
794	{
795	int ret;
796	struct btrfs_cmd_header *hdr;
797	u32 crc;
798
799	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
800	put_unaligned_le32(val: sctx->send_size - sizeof(*hdr), p: &hdr->len);
801	put_unaligned_le32(val: `0`, p: &hdr->crc);
802
803	crc = crc32c(crc: `0`, address: (unsigned char *)sctx->send_buf, length: sctx->send_size);
804	put_unaligned_le32(val: crc, p: &hdr->crc);
805
806	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf, len: sctx->send_size,
807	off: &sctx->send_off);
808
809	sctx->send_size = `0`;
810	sctx->put_data = false;
811
812	return ret;
813	}
814
815	/*
816	* Sends a move instruction to user space
817	*/
818	static int send_rename(struct send_ctx *sctx,
819	struct fs_path from, struct* fs_path *to)
820	{
821	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
822	int ret;
823
824	btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
825
826	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_RENAME);
827	if (ret < `0`)
828	goto out;
829
830	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
831	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
832
833	ret = send_cmd(sctx);
834
835	tlv_put_failure:
836	out:
837	return ret;
838	}
839
840	/*
841	* Sends a link instruction to user space
842	*/
843	static int send_link(struct send_ctx *sctx,
844	struct fs_path path, struct* fs_path *lnk)
845	{
846	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
847	int ret;
848
849	btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
850
851	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_LINK);
852	if (ret < `0`)
853	goto out;
854
855	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
856	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
857
858	ret = send_cmd(sctx);
859
860	tlv_put_failure:
861	out:
862	return ret;
863	}
864
865	/*
866	* Sends an unlink instruction to user space
867	*/
868	static int send_unlink(struct send_ctx sctx, struct* fs_path *path)
869	{
870	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
871	int ret;
872
873	btrfs_debug(fs_info, "send_unlink %s", path->start);
874
875	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UNLINK);
876	if (ret < `0`)
877	goto out;
878
879	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
880
881	ret = send_cmd(sctx);
882
883	tlv_put_failure:
884	out:
885	return ret;
886	}
887
888	/*
889	* Sends a rmdir instruction to user space
890	*/
891	static int send_rmdir(struct send_ctx sctx, struct* fs_path *path)
892	{
893	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
894	int ret;
895
896	btrfs_debug(fs_info, "send_rmdir %s", path->start);
897
898	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_RMDIR);
899	if (ret < `0`)
900	goto out;
901
902	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
903
904	ret = send_cmd(sctx);
905
906	tlv_put_failure:
907	out:
908	return ret;
909	}
910
911	struct btrfs_inode_info {
912	u64 size;
913	u64 gen;
914	u64 mode;
915	u64 uid;
916	u64 gid;
917	u64 rdev;
918	u64 fileattr;
919	u64 nlink;
920	};
921
922	/*
923	* Helper function to retrieve some fields from an inode item.
924	*/
925	static int get_inode_info(struct btrfs_root *root, u64 ino,
926	struct btrfs_inode_info *info)
927	{
928	int ret;
929	struct btrfs_path *path;
930	struct btrfs_inode_item *ii;
931	struct btrfs_key key;
932
933	path = alloc_path_for_send();
934	if (!path)
935	return -ENOMEM;
936
937	key.objectid = ino;
938	key.type = BTRFS_INODE_ITEM_KEY;
939	key.offset = `0`;
940	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
941	if (ret) {
942	if (ret > `0`)
943	ret = -ENOENT;
944	goto out;
945	}
946
947	if (!info)
948	goto out;
949
950	ii = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
951	struct btrfs_inode_item);
952	info->size = btrfs_inode_size(eb: path->nodes[`0`], s: ii);
953	info->gen = btrfs_inode_generation(eb: path->nodes[`0`], s: ii);
954	info->mode = btrfs_inode_mode(eb: path->nodes[`0`], s: ii);
955	info->uid = btrfs_inode_uid(eb: path->nodes[`0`], s: ii);
956	info->gid = btrfs_inode_gid(eb: path->nodes[`0`], s: ii);
957	info->rdev = btrfs_inode_rdev(eb: path->nodes[`0`], s: ii);
958	info->nlink = btrfs_inode_nlink(eb: path->nodes[`0`], s: ii);
959	/*
960	* Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
961	* otherwise logically split to 32/32 parts.
962	*/
963	info->fileattr = btrfs_inode_flags(eb: path->nodes[`0`], s: ii);
964
965	out:
966	btrfs_free_path(p: path);
967	return ret;
968	}
969
970	static int get_inode_gen(struct btrfs_root root, u64 ino, u64 gen)
971	{
972	int ret;
973	struct btrfs_inode_info info = { `0` };
974
975	ASSERT(gen);
976
977	ret = get_inode_info(root, ino, info: &info);
978	*gen = info.gen;
979	return ret;
980	}
981
982	typedef int (iterate_inode_ref_t)(int* num, u64 dir, int index,
983	struct fs_path *p,
984	void *ctx);
985
986	/*
987	* Helper function to iterate the entries in ONE btrfs_inode_ref or
988	* btrfs_inode_extref.
989	* The iterate callback may return a non zero value to stop iteration. This can
990	* be a negative value for error codes or 1 to simply stop it.
991	*
992	* path must point to the INODE_REF or INODE_EXTREF when called.
993	*/
994	static int iterate_inode_ref(struct btrfs_root root, struct* btrfs_path *path,
995	struct btrfs_key found_key, int* resolve,
996	iterate_inode_ref_t iterate, void *ctx)
997	{
998	struct extent_buffer *eb = path->nodes[`0`];
999	struct btrfs_inode_ref *iref;
1000	struct btrfs_inode_extref *extref;
1001	struct btrfs_path *tmp_path;
1002	struct fs_path *p;
1003	u32 cur = `0`;
1004	u32 total;
1005	int slot = path->slots[`0`];
1006	u32 name_len;
1007	char *start;
1008	int ret = `0`;
1009	int num = `0`;
1010	int index;
1011	u64 dir;
1012	unsigned long name_off;
1013	unsigned long elem_size;
1014	unsigned long ptr;
1015
1016	p = fs_path_alloc_reversed();
1017	if (!p)
1018	return -ENOMEM;
1019
1020	tmp_path = alloc_path_for_send();
1021	if (!tmp_path) {
1022	fs_path_free(p);
1023	return -ENOMEM;
1024	}
1025
1026
1027	if (found_key->type == BTRFS_INODE_REF_KEY) {
1028	ptr = (unsigned long)btrfs_item_ptr(eb, slot,
1029	struct btrfs_inode_ref);
1030	total = btrfs_item_size(eb, slot);
1031	elem_size = sizeof(*iref);
1032	} else {
1033	ptr = btrfs_item_ptr_offset(eb, slot);
1034	total = btrfs_item_size(eb, slot);
1035	elem_size = sizeof(*extref);
1036	}
1037
1038	while (cur < total) {
1039	fs_path_reset(p);
1040
1041	if (found_key->type == BTRFS_INODE_REF_KEY) {
1042	iref = (struct btrfs_inode_ref *)(ptr + cur);
1043	name_len = btrfs_inode_ref_name_len(eb, s: iref);
1044	name_off = (unsigned long)(iref + `1`);
1045	index = btrfs_inode_ref_index(eb, s: iref);
1046	dir = found_key->offset;
1047	} else {
1048	extref = (struct btrfs_inode_extref *)(ptr + cur);
1049	name_len = btrfs_inode_extref_name_len(eb, s: extref);
1050	name_off = (unsigned long)&extref->name;
1051	index = btrfs_inode_extref_index(eb, s: extref);
1052	dir = btrfs_inode_extref_parent(eb, s: extref);
1053	}
1054
1055	if (resolve) {
1056	start = btrfs_ref_to_path(fs_root: root, path: tmp_path, name_len,
1057	name_off, eb_in: eb, parent: dir,
1058	dest: p->buf, size: p->buf_len);
1059	if (IS_ERR(ptr: start)) {
1060	ret = PTR_ERR(ptr: start);
1061	goto out;
1062	}
1063	if (start < p->buf) {
1064	/ overflow , try again with larger buffer /
1065	ret = fs_path_ensure_buf(p,
1066	len: p->buf_len + p->buf - start);
1067	if (ret < `0`)
1068	goto out;
1069	start = btrfs_ref_to_path(fs_root: root, path: tmp_path,
1070	name_len, name_off,
1071	eb_in: eb, parent: dir,
1072	dest: p->buf, size: p->buf_len);
1073	if (IS_ERR(ptr: start)) {
1074	ret = PTR_ERR(ptr: start);
1075	goto out;
1076	}
1077	if (unlikely(start < p->buf)) {
1078	btrfs_err(root->fs_info,
1079	"send: path ref buffer underflow for key (%llu %u %llu)",
1080	found_key->objectid,
1081	found_key->type,
1082	found_key->offset);
1083	ret = -EINVAL;
1084	goto out;
1085	}
1086	}
1087	p->start = start;
1088	} else {
1089	ret = fs_path_add_from_extent_buffer(p, eb, off: name_off,
1090	len: name_len);
1091	if (ret < `0`)
1092	goto out;
1093	}
1094
1095	cur += elem_size + name_len;
1096	ret = iterate(num, dir, index, p, ctx);
1097	if (ret)
1098	goto out;
1099	num++;
1100	}
1101
1102	out:
1103	btrfs_free_path(p: tmp_path);
1104	fs_path_free(p);
1105	return ret;
1106	}
1107
/*
 * Callback of iterate_dir_item(): @num is the zero based position of the entry
 * within the item, @di_key the entry's location key, @name/@name_len and
 * @data/@data_len the entry's name and data bytes (not NUL terminated), @ctx
 * the caller's cookie. A non-zero return value stops the iteration.
 */
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
				  const char *name, int name_len,
				  const char *data, int data_len,
				  void *ctx);
1112
1113	/*
1114	* Helper function to iterate the entries in ONE btrfs_dir_item.
1115	* The iterate callback may return a non zero value to stop iteration. This can
1116	* be a negative value for error codes or 1 to simply stop it.
1117	*
1118	* path must point to the dir item when called.
1119	*/
1120	static int iterate_dir_item(struct btrfs_root root, struct* btrfs_path *path,
1121	iterate_dir_item_t iterate, void *ctx)
1122	{
1123	int ret = `0`;
1124	struct extent_buffer *eb;
1125	struct btrfs_dir_item *di;
1126	struct btrfs_key di_key;
1127	char *buf = NULL;
1128	int buf_len;
1129	u32 name_len;
1130	u32 data_len;
1131	u32 cur;
1132	u32 len;
1133	u32 total;
1134	int slot;
1135	int num;
1136
1137	/*
1138	* Start with a small buffer (1 page). If later we end up needing more
1139	* space, which can happen for xattrs on a fs with a leaf size greater
1140	* then the page size, attempt to increase the buffer. Typically xattr
1141	* values are small.
1142	*/
1143	buf_len = PATH_MAX;
1144	buf = kmalloc(size: buf_len, GFP_KERNEL);
1145	if (!buf) {
1146	ret = -ENOMEM;
1147	goto out;
1148	}
1149
1150	eb = path->nodes[`0`];
1151	slot = path->slots[`0`];
1152	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1153	cur = `0`;
1154	len = `0`;
1155	total = btrfs_item_size(eb, slot);
1156
1157	num = `0`;
1158	while (cur < total) {
1159	name_len = btrfs_dir_name_len(eb, s: di);
1160	data_len = btrfs_dir_data_len(eb, s: di);
1161	btrfs_dir_item_key_to_cpu(eb, item: di, cpu_key: &di_key);
1162
1163	if (btrfs_dir_ftype(eb, item: di) == BTRFS_FT_XATTR) {
1164	if (name_len > XATTR_NAME_MAX) {
1165	ret = -ENAMETOOLONG;
1166	goto out;
1167	}
1168	if (name_len + data_len >
1169	BTRFS_MAX_XATTR_SIZE(info: root->fs_info)) {
1170	ret = -E2BIG;
1171	goto out;
1172	}
1173	} else {
1174	/*
1175	* Path too long
1176	*/
1177	if (name_len + data_len > PATH_MAX) {
1178	ret = -ENAMETOOLONG;
1179	goto out;
1180	}
1181	}
1182
1183	if (name_len + data_len > buf_len) {
1184	buf_len = name_len + data_len;
1185	if (is_vmalloc_addr(x: buf)) {
1186	vfree(addr: buf);
1187	buf = NULL;
1188	} else {
1189	char *tmp = krealloc(objp: buf, new_size: buf_len,
1190	GFP_KERNEL \| __GFP_NOWARN);
1191
1192	if (!tmp)
1193	kfree(objp: buf);
1194	buf = tmp;
1195	}
1196	if (!buf) {
1197	buf = kvmalloc(size: buf_len, GFP_KERNEL);
1198	if (!buf) {
1199	ret = -ENOMEM;
1200	goto out;
1201	}
1202	}
1203	}
1204
1205	read_extent_buffer(eb, dst: buf, start: (unsigned long)(di + `1`),
1206	len: name_len + data_len);
1207
1208	len = sizeof(*di) + name_len + data_len;
1209	di = (struct btrfs_dir_item )((char* *)di + len);
1210	cur += len;
1211
1212	ret = iterate(num, &di_key, buf, name_len, buf + name_len,
1213	data_len, ctx);
1214	if (ret < `0`)
1215	goto out;
1216	if (ret) {
1217	ret = `0`;
1218	goto out;
1219	}
1220
1221	num++;
1222	}
1223
1224	out:
1225	kvfree(addr: buf);
1226	return ret;
1227	}
1228
1229	static int __copy_first_ref(int num, u64 dir, int index,
1230	struct fs_path p, void* *ctx)
1231	{
1232	int ret;
1233	struct fs_path *pt = ctx;
1234
1235	ret = fs_path_copy(p: pt, from: p);
1236	if (ret < `0`)
1237	return ret;
1238
1239	/ we want the first only /
1240	return `1`;
1241	}
1242
1243	/*
1244	* Retrieve the first path of an inode. If an inode has more then one
1245	* ref/hardlink, this is ignored.
1246	*/
1247	static int get_inode_path(struct btrfs_root *root,
1248	u64 ino, struct fs_path *path)
1249	{
1250	int ret;
1251	struct btrfs_key key, found_key;
1252	struct btrfs_path *p;
1253
1254	p = alloc_path_for_send();
1255	if (!p)
1256	return -ENOMEM;
1257
1258	fs_path_reset(p: path);
1259
1260	key.objectid = ino;
1261	key.type = BTRFS_INODE_REF_KEY;
1262	key.offset = `0`;
1263
1264	ret = btrfs_search_slot_for_read(root, key: &key, p, find_higher: `1`, return_any: `0`);
1265	if (ret < `0`)
1266	goto out;
1267	if (ret) {
1268	ret = `1`;
1269	goto out;
1270	}
1271	btrfs_item_key_to_cpu(eb: p->nodes[`0`], cpu_key: &found_key, nr: p->slots[`0`]);
1272	if (found_key.objectid != ino \|\|
1273	(found_key.type != BTRFS_INODE_REF_KEY &&
1274	found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1275	ret = -ENOENT;
1276	goto out;
1277	}
1278
1279	ret = iterate_inode_ref(root, path: p, found_key: &found_key, resolve: `1`,
1280	iterate: __copy_first_ref, ctx: path);
1281	if (ret < `0`)
1282	goto out;
1283	ret = `0`;
1284
1285	out:
1286	btrfs_free_path(p);
1287	return ret;
1288	}
1289
1290	struct backref_ctx {
1291	struct send_ctx *sctx;
1292
1293	/ number of total found references /
1294	u64 found;
1295
1296	/*
1297	* used for clones found in send_root. clones found behind cur_objectid
1298	* and cur_offset are not considered as allowed clones.
1299	*/
1300	u64 cur_objectid;
1301	u64 cur_offset;
1302
1303	/ may be truncated in case it's the last extent in a file /
1304	u64 extent_len;
1305
1306	/ The bytenr the file extent item we are processing refers to. /
1307	u64 bytenr;
1308	/ The owner (root id) of the data backref for the current extent. /
1309	u64 backref_owner;
1310	/ The offset of the data backref for the current extent. /
1311	u64 backref_offset;
1312	};
1313
1314	static int __clone_root_cmp_bsearch(const void key, const* void *elt)
1315	{
1316	u64 root = (u64)(uintptr_t)key;
1317	const struct clone_root *cr = elt;
1318
1319	if (root < cr->root->root_key.objectid)
1320	return -`1`;
1321	if (root > cr->root->root_key.objectid)
1322	return `1`;
1323	return `0`;
1324	}
1325
1326	static int __clone_root_cmp_sort(const void e1, const* void *e2)
1327	{
1328	const struct clone_root *cr1 = e1;
1329	const struct clone_root *cr2 = e2;
1330
1331	if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
1332	return -`1`;
1333	if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
1334	return `1`;
1335	return `0`;
1336	}
1337
1338	/*
1339	* Called for every backref that is found for the current extent.
1340	* Results are collected in sctx->clone_roots->ino/offset.
1341	*/
1342	static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
1343	void *ctx_)
1344	{
1345	struct backref_ctx *bctx = ctx_;
1346	struct clone_root *clone_root;
1347
1348	/ First check if the root is in the list of accepted clone sources /
1349	clone_root = bsearch(key: (void *)(uintptr_t)root_id, base: bctx->sctx->clone_roots,
1350	num: bctx->sctx->clone_roots_cnt,
1351	size: sizeof(struct clone_root),
1352	cmp: __clone_root_cmp_bsearch);
1353	if (!clone_root)
1354	return `0`;
1355
1356	/ This is our own reference, bail out as we can't clone from it. /
1357	if (clone_root->root == bctx->sctx->send_root &&
1358	ino == bctx->cur_objectid &&
1359	offset == bctx->cur_offset)
1360	return `0`;
1361
1362	/*
1363	* Make sure we don't consider clones from send_root that are
1364	* behind the current inode/offset.
1365	*/
1366	if (clone_root->root == bctx->sctx->send_root) {
1367	/*
1368	* If the source inode was not yet processed we can't issue a
1369	* clone operation, as the source extent does not exist yet at
1370	* the destination of the stream.
1371	*/
1372	if (ino > bctx->cur_objectid)
1373	return `0`;
1374	/*
1375	* We clone from the inode currently being sent as long as the
1376	* source extent is already processed, otherwise we could try
1377	* to clone from an extent that does not exist yet at the
1378	* destination of the stream.
1379	*/
1380	if (ino == bctx->cur_objectid &&
1381	offset + bctx->extent_len >
1382	bctx->sctx->cur_inode_next_write_offset)
1383	return `0`;
1384	}
1385
1386	bctx->found++;
1387	clone_root->found_ref = true;
1388
1389	/*
1390	* If the given backref refers to a file extent item with a larger
1391	* number of bytes than what we found before, use the new one so that
1392	* we clone more optimally and end up doing less writes and getting
1393	* less exclusive, non-shared extents at the destination.
1394	*/
1395	if (num_bytes > clone_root->num_bytes) {
1396	clone_root->ino = ino;
1397	clone_root->offset = offset;
1398	clone_root->num_bytes = num_bytes;
1399
1400	/*
1401	* Found a perfect candidate, so there's no need to continue
1402	* backref walking.
1403	*/
1404	if (num_bytes >= bctx->extent_len)
1405	return BTRFS_ITERATE_EXTENT_INODES_STOP;
1406	}
1407
1408	return `0`;
1409	}
1410
1411	static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
1412	const u64 *root_ids_ret, int* *root_count_ret)
1413	{
1414	struct backref_ctx *bctx = ctx;
1415	struct send_ctx *sctx = bctx->sctx;
1416	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1417	const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
1418	struct btrfs_lru_cache_entry *raw_entry;
1419	struct backref_cache_entry *entry;
1420
1421	if (sctx->backref_cache.size == `0`)
1422	return false;
1423
1424	/*
1425	* If relocation happened since we first filled the cache, then we must
1426	* empty the cache and can not use it, because even though we operate on
1427	* read-only roots, their leaves and nodes may have been reallocated and
1428	* now be used for different nodes/leaves of the same tree or some other
1429	* tree.
1430	*
1431	* We are called from iterate_extent_inodes() while either holding a
1432	* transaction handle or holding fs_info->commit_root_sem, so no need
1433	* to take any lock here.
1434	*/
1435	if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) {
1436	btrfs_lru_cache_clear(cache: &sctx->backref_cache);
1437	return false;
1438	}
1439
1440	raw_entry = btrfs_lru_cache_lookup(cache: &sctx->backref_cache, key, gen: `0`);
1441	if (!raw_entry)
1442	return false;
1443
1444	entry = container_of(raw_entry, struct backref_cache_entry, entry);
1445	*root_ids_ret = entry->root_ids;
1446	*root_count_ret = entry->num_roots;
1447
1448	return true;
1449	}
1450
1451	static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
1452	void *ctx)
1453	{
1454	struct backref_ctx *bctx = ctx;
1455	struct send_ctx *sctx = bctx->sctx;
1456	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1457	struct backref_cache_entry *new_entry;
1458	struct ulist_iterator uiter;
1459	struct ulist_node *node;
1460	int ret;
1461
1462	/*
1463	* We're called while holding a transaction handle or while holding
1464	* fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
1465	* NOFS allocation.
1466	*/
1467	new_entry = kmalloc(size: sizeof(struct backref_cache_entry), GFP_NOFS);
1468	/ No worries, cache is optional. /
1469	if (!new_entry)
1470	return;
1471
1472	new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
1473	new_entry->entry.gen = `0`;
1474	new_entry->num_roots = `0`;
1475	ULIST_ITER_INIT(&uiter);
1476	while ((node = ulist_next(ulist: root_ids, uiter: &uiter)) != NULL) {
1477	const u64 root_id = node->val;
1478	struct clone_root *root;
1479
1480	root = bsearch(key: (void *)(uintptr_t)root_id, base: sctx->clone_roots,
1481	num: sctx->clone_roots_cnt, size: sizeof(struct clone_root),
1482	cmp: __clone_root_cmp_bsearch);
1483	if (!root)
1484	continue;
1485
1486	/ Too many roots, just exit, no worries as caching is optional. /
1487	if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
1488	kfree(objp: new_entry);
1489	return;
1490	}
1491
1492	new_entry->root_ids[new_entry->num_roots] = root_id;
1493	new_entry->num_roots++;
1494	}
1495
1496	/*
1497	* We may have not added any roots to the new cache entry, which means
1498	* none of the roots is part of the list of roots from which we are
1499	* allowed to clone. Cache the new entry as it's still useful to avoid
1500	* backref walking to determine which roots have a path to the leaf.
1501	*
1502	* Also use GFP_NOFS because we're called while holding a transaction
1503	* handle or while holding fs_info->commit_root_sem.
1504	*/
1505	ret = btrfs_lru_cache_store(cache: &sctx->backref_cache, new_entry: &new_entry->entry,
1506	GFP_NOFS);
1507	ASSERT(ret == `0` \|\| ret == -ENOMEM);
1508	if (ret) {
1509	/ Caching is optional, no worries. /
1510	kfree(objp: new_entry);
1511	return;
1512	}
1513
1514	/*
1515	* We are called from iterate_extent_inodes() while either holding a
1516	* transaction handle or holding fs_info->commit_root_sem, so no need
1517	* to take any lock here.
1518	*/
1519	if (sctx->backref_cache.size == `1`)
1520	sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
1521	}
1522
1523	static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
1524	const struct extent_buffer leaf, void* *ctx)
1525	{
1526	const u64 refs = btrfs_extent_refs(eb: leaf, s: ei);
1527	const struct backref_ctx *bctx = ctx;
1528	const struct send_ctx *sctx = bctx->sctx;
1529
1530	if (bytenr == bctx->bytenr) {
1531	const u64 flags = btrfs_extent_flags(eb: leaf, s: ei);
1532
1533	if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
1534	return -EUCLEAN;
1535
1536	/*
1537	* If we have only one reference and only the send root as a
1538	* clone source - meaning no clone roots were given in the
1539	* struct btrfs_ioctl_send_args passed to the send ioctl - then
1540	* it's our reference and there's no point in doing backref
1541	* walking which is expensive, so exit early.
1542	*/
1543	if (refs == `1` && sctx->clone_roots_cnt == `1`)
1544	return -ENOENT;
1545	}
1546
1547	/*
1548	* Backreference walking (iterate_extent_inodes() below) is currently
1549	* too expensive when an extent has a large number of references, both
1550	* in time spent and used memory. So for now just fallback to write
1551	* operations instead of clone operations when an extent has more than
1552	* a certain amount of references.
1553	*/
1554	if (refs > SEND_MAX_EXTENT_REFS)
1555	return -ENOENT;
1556
1557	return `0`;
1558	}
1559
1560	static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
1561	{
1562	const struct backref_ctx *bctx = ctx;
1563
1564	if (ino == bctx->cur_objectid &&
1565	root == bctx->backref_owner &&
1566	offset == bctx->backref_offset)
1567	return true;
1568
1569	return false;
1570	}
1571
1572	/*
1573	* Given an inode, offset and extent item, it finds a good clone for a clone
1574	* instruction. Returns -ENOENT when none could be found. The function makes
1575	* sure that the returned clone is usable at the point where sending is at the
1576	* moment. This means, that no clones are accepted which lie behind the current
1577	* inode+offset.
1578	*
1579	* path must point to the extent item when called.
1580	*/
1581	static int find_extent_clone(struct send_ctx *sctx,
1582	struct btrfs_path *path,
1583	u64 ino, u64 data_offset,
1584	u64 ino_size,
1585	struct clone_root **found)
1586	{
1587	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1588	int ret;
1589	int extent_type;
1590	u64 logical;
1591	u64 disk_byte;
1592	u64 num_bytes;
1593	struct btrfs_file_extent_item *fi;
1594	struct extent_buffer *eb = path->nodes[`0`];
1595	struct backref_ctx backref_ctx = { `0` };
1596	struct btrfs_backref_walk_ctx backref_walk_ctx = { `0` };
1597	struct clone_root *cur_clone_root;
1598	int compressed;
1599	u32 i;
1600
1601	/*
1602	* With fallocate we can get prealloc extents beyond the inode's i_size,
1603	* so we don't do anything here because clone operations can not clone
1604	* to a range beyond i_size without increasing the i_size of the
1605	* destination inode.
1606	*/
1607	if (data_offset >= ino_size)
1608	return `0`;
1609
1610	fi = btrfs_item_ptr(eb, path->slots[`0`], struct btrfs_file_extent_item);
1611	extent_type = btrfs_file_extent_type(eb, s: fi);
1612	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1613	return -ENOENT;
1614
1615	disk_byte = btrfs_file_extent_disk_bytenr(eb, s: fi);
1616	if (disk_byte == `0`)
1617	return -ENOENT;
1618
1619	compressed = btrfs_file_extent_compression(eb, s: fi);
1620	num_bytes = btrfs_file_extent_num_bytes(eb, s: fi);
1621	logical = disk_byte + btrfs_file_extent_offset(eb, s: fi);
1622
1623	/*
1624	* Setup the clone roots.
1625	*/
1626	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
1627	cur_clone_root = sctx->clone_roots + i;
1628	cur_clone_root->ino = (u64)-`1`;
1629	cur_clone_root->offset = `0`;
1630	cur_clone_root->num_bytes = `0`;
1631	cur_clone_root->found_ref = false;
1632	}
1633
1634	backref_ctx.sctx = sctx;
1635	backref_ctx.cur_objectid = ino;
1636	backref_ctx.cur_offset = data_offset;
1637	backref_ctx.bytenr = disk_byte;
1638	/*
1639	* Use the header owner and not the send root's id, because in case of a
1640	* snapshot we can have shared subtrees.
1641	*/
1642	backref_ctx.backref_owner = btrfs_header_owner(eb);
1643	backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, s: fi);
1644
1645	/*
1646	* The last extent of a file may be too large due to page alignment.
1647	* We need to adjust extent_len in this case so that the checks in
1648	* iterate_backrefs() work.
1649	*/
1650	if (data_offset + num_bytes >= ino_size)
1651	backref_ctx.extent_len = ino_size - data_offset;
1652	else
1653	backref_ctx.extent_len = num_bytes;
1654
1655	/*
1656	* Now collect all backrefs.
1657	*/
1658	backref_walk_ctx.bytenr = disk_byte;
1659	if (compressed == BTRFS_COMPRESS_NONE)
1660	backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, s: fi);
1661	backref_walk_ctx.fs_info = fs_info;
1662	backref_walk_ctx.cache_lookup = lookup_backref_cache;
1663	backref_walk_ctx.cache_store = store_backref_cache;
1664	backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
1665	backref_walk_ctx.check_extent_item = check_extent_item;
1666	backref_walk_ctx.user_ctx = &backref_ctx;
1667
1668	/*
1669	* If have a single clone root, then it's the send root and we can tell
1670	* the backref walking code to skip our own backref and not resolve it,
1671	* since we can not use it for cloning - the source and destination
1672	* ranges can't overlap and in case the leaf is shared through a subtree
1673	* due to snapshots, we can't use those other roots since they are not
1674	* in the list of clone roots.
1675	*/
1676	if (sctx->clone_roots_cnt == `1`)
1677	backref_walk_ctx.skip_data_ref = skip_self_data_ref;
1678
1679	ret = iterate_extent_inodes(ctx: &backref_walk_ctx, search_commit_root: true, iterate: iterate_backrefs,
1680	user_ctx: &backref_ctx);
1681	if (ret < `0`)
1682	return ret;
1683
1684	down_read(sem: &fs_info->commit_root_sem);
1685	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
1686	/*
1687	* A transaction commit for a transaction in which block group
1688	* relocation was done just happened.
1689	* The disk_bytenr of the file extent item we processed is
1690	* possibly stale, referring to the extent's location before
1691	* relocation. So act as if we haven't found any clone sources
1692	* and fallback to write commands, which will read the correct
1693	* data from the new extent location. Otherwise we will fail
1694	* below because we haven't found our own back reference or we
1695	* could be getting incorrect sources in case the old extent
1696	* was already reallocated after the relocation.
1697	*/
1698	up_read(sem: &fs_info->commit_root_sem);
1699	return -ENOENT;
1700	}
1701	up_read(sem: &fs_info->commit_root_sem);
1702
1703	btrfs_debug(fs_info,
1704	"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
1705	data_offset, ino, num_bytes, logical);
1706
1707	if (!backref_ctx.found) {
1708	btrfs_debug(fs_info, "no clones found");
1709	return -ENOENT;
1710	}
1711
1712	cur_clone_root = NULL;
1713	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
1714	struct clone_root *clone_root = &sctx->clone_roots[i];
1715
1716	if (!clone_root->found_ref)
1717	continue;
1718
1719	/*
1720	* Choose the root from which we can clone more bytes, to
1721	* minimize write operations and therefore have more extent
1722	* sharing at the destination (the same as in the source).
1723	*/
1724	if (!cur_clone_root \|\|
1725	clone_root->num_bytes > cur_clone_root->num_bytes) {
1726	cur_clone_root = clone_root;
1727
1728	/*
1729	* We found an optimal clone candidate (any inode from
1730	* any root is fine), so we're done.
1731	*/
1732	if (clone_root->num_bytes >= backref_ctx.extent_len)
1733	break;
1734	}
1735	}
1736
1737	if (cur_clone_root) {
1738	*found = cur_clone_root;
1739	ret = `0`;
1740	} else {
1741	ret = -ENOENT;
1742	}
1743
1744	return ret;
1745	}
1746
1747	static int read_symlink(struct btrfs_root *root,
1748	u64 ino,
1749	struct fs_path *dest)
1750	{
1751	int ret;
1752	struct btrfs_path *path;
1753	struct btrfs_key key;
1754	struct btrfs_file_extent_item *ei;
1755	u8 type;
1756	u8 compression;
1757	unsigned long off;
1758	int len;
1759
1760	path = alloc_path_for_send();
1761	if (!path)
1762	return -ENOMEM;
1763
1764	key.objectid = ino;
1765	key.type = BTRFS_EXTENT_DATA_KEY;
1766	key.offset = `0`;
1767	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
1768	if (ret < `0`)
1769	goto out;
1770	if (ret) {
1771	/*
1772	* An empty symlink inode. Can happen in rare error paths when
1773	* creating a symlink (transaction committed before the inode
1774	* eviction handler removed the symlink inode items and a crash
1775	* happened in between or the subvol was snapshoted in between).
1776	* Print an informative message to dmesg/syslog so that the user
1777	* can delete the symlink.
1778	*/
1779	btrfs_err(root->fs_info,
1780	"Found empty symlink inode %llu at root %llu",
1781	ino, root->root_key.objectid);
1782	ret = -EIO;
1783	goto out;
1784	}
1785
1786	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
1787	struct btrfs_file_extent_item);
1788	type = btrfs_file_extent_type(eb: path->nodes[`0`], s: ei);
1789	if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) {
1790	ret = -EUCLEAN;
1791	btrfs_crit(root->fs_info,
1792	"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
1793	ino, btrfs_root_id(root), type);
1794	goto out;
1795	}
1796	compression = btrfs_file_extent_compression(eb: path->nodes[`0`], s: ei);
1797	if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
1798	ret = -EUCLEAN;
1799	btrfs_crit(root->fs_info,
1800	"send: found symlink extent with compression, ino %llu root %llu compression type %d",
1801	ino, btrfs_root_id(root), compression);
1802	goto out;
1803	}
1804
1805	off = btrfs_file_extent_inline_start(e: ei);
1806	len = btrfs_file_extent_ram_bytes(eb: path->nodes[`0`], s: ei);
1807
1808	ret = fs_path_add_from_extent_buffer(p: dest, eb: path->nodes[`0`], off, len);
1809
1810	out:
1811	btrfs_free_path(p: path);
1812	return ret;
1813	}
1814
1815	/*
1816	* Helper function to generate a file name that is unique in the root of
1817	* send_root and parent_root. This is used to generate names for orphan inodes.
1818	*/
1819	static int gen_unique_name(struct send_ctx *sctx,
1820	u64 ino, u64 gen,
1821	struct fs_path *dest)
1822	{
1823	int ret = `0`;
1824	struct btrfs_path *path;
1825	struct btrfs_dir_item *di;
1826	char tmp[`64`];
1827	int len;
1828	u64 idx = `0`;
1829
1830	path = alloc_path_for_send();
1831	if (!path)
1832	return -ENOMEM;
1833
1834	while (`1`) {
1835	struct fscrypt_str tmp_name;
1836
1837	len = snprintf(buf: tmp, size: sizeof(tmp), fmt: "o%llu-%llu-%llu",
1838	ino, gen, idx);
1839	ASSERT(len < sizeof(tmp));
1840	tmp_name.name = tmp;
1841	tmp_name.len = strlen(tmp);
1842
1843	di = btrfs_lookup_dir_item(NULL, root: sctx->send_root,
1844	path, BTRFS_FIRST_FREE_OBJECTID,
1845	name: &tmp_name, mod: `0`);
1846	btrfs_release_path(p: path);
1847	if (IS_ERR(ptr: di)) {
1848	ret = PTR_ERR(ptr: di);
1849	goto out;
1850	}
1851	if (di) {
1852	/ not unique, try again /
1853	idx++;
1854	continue;
1855	}
1856
1857	if (!sctx->parent_root) {
1858	/ unique /
1859	ret = `0`;
1860	break;
1861	}
1862
1863	di = btrfs_lookup_dir_item(NULL, root: sctx->parent_root,
1864	path, BTRFS_FIRST_FREE_OBJECTID,
1865	name: &tmp_name, mod: `0`);
1866	btrfs_release_path(p: path);
1867	if (IS_ERR(ptr: di)) {
1868	ret = PTR_ERR(ptr: di);
1869	goto out;
1870	}
1871	if (di) {
1872	/ not unique, try again /
1873	idx++;
1874	continue;
1875	}
1876	/ unique /
1877	break;
1878	}
1879
1880	ret = fs_path_add(p: dest, name: tmp, strlen(tmp));
1881
1882	out:
1883	btrfs_free_path(p: path);
1884	return ret;
1885	}
1886
/*
 * State of an inode relative to the send progress, as computed by
 * get_cur_inode_state(). The "did" variants are reported when the inode
 * number is below sctx->send_progress, the "will" variants otherwise.
 */
enum inode_state {
	/* Same generation in both the send root and the parent root. */
	inode_state_no_change = 0,
	/* Matching generation found in the send root only. */
	inode_state_will_create = 1,
	inode_state_did_create = 2,
	/* Matching generation found in the parent root only. */
	inode_state_will_delete = 3,
	inode_state_did_delete = 4,
};
1894
1895	static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
1896	u64 send_gen, u64 parent_gen)
1897	{
1898	int ret;
1899	int left_ret;
1900	int right_ret;
1901	u64 left_gen;
1902	u64 right_gen = `0`;
1903	struct btrfs_inode_info info;
1904
1905	ret = get_inode_info(root: sctx->send_root, ino, info: &info);
1906	if (ret < `0` && ret != -ENOENT)
1907	goto out;
1908	left_ret = (info.nlink == `0`) ? -ENOENT : ret;
1909	left_gen = info.gen;
1910	if (send_gen)
1911	*send_gen = ((left_ret == -ENOENT) ? `0` : info.gen);
1912
1913	if (!sctx->parent_root) {
1914	right_ret = -ENOENT;
1915	} else {
1916	ret = get_inode_info(root: sctx->parent_root, ino, info: &info);
1917	if (ret < `0` && ret != -ENOENT)
1918	goto out;
1919	right_ret = (info.nlink == `0`) ? -ENOENT : ret;
1920	right_gen = info.gen;
1921	if (parent_gen)
1922	*parent_gen = ((right_ret == -ENOENT) ? `0` : info.gen);
1923	}
1924
1925	if (!left_ret && !right_ret) {
1926	if (left_gen == gen && right_gen == gen) {
1927	ret = inode_state_no_change;
1928	} else if (left_gen == gen) {
1929	if (ino < sctx->send_progress)
1930	ret = inode_state_did_create;
1931	else
1932	ret = inode_state_will_create;
1933	} else if (right_gen == gen) {
1934	if (ino < sctx->send_progress)
1935	ret = inode_state_did_delete;
1936	else
1937	ret = inode_state_will_delete;
1938	} else {
1939	ret = -ENOENT;
1940	}
1941	} else if (!left_ret) {
1942	if (left_gen == gen) {
1943	if (ino < sctx->send_progress)
1944	ret = inode_state_did_create;
1945	else
1946	ret = inode_state_will_create;
1947	} else {
1948	ret = -ENOENT;
1949	}
1950	} else if (!right_ret) {
1951	if (right_gen == gen) {
1952	if (ino < sctx->send_progress)
1953	ret = inode_state_did_delete;
1954	else
1955	ret = inode_state_will_delete;
1956	} else {
1957	ret = -ENOENT;
1958	}
1959	} else {
1960	ret = -ENOENT;
1961	}
1962
1963	out:
1964	return ret;
1965	}
1966
1967	static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
1968	u64 send_gen, u64 parent_gen)
1969	{
1970	int ret;
1971
1972	if (ino == BTRFS_FIRST_FREE_OBJECTID)
1973	return `1`;
1974
1975	ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
1976	if (ret < `0`)
1977	goto out;
1978
1979	if (ret == inode_state_no_change \|\|
1980	ret == inode_state_did_create \|\|
1981	ret == inode_state_will_delete)
1982	ret = `1`;
1983	else
1984	ret = `0`;
1985
1986	out:
1987	return ret;
1988	}
1989
1990	/*
1991	* Helper function to lookup a dir item in a dir.
1992	*/
1993	static int lookup_dir_item_inode(struct btrfs_root *root,
1994	u64 dir, const char name, int* name_len,
1995	u64 *found_inode)
1996	{
1997	int ret = `0`;
1998	struct btrfs_dir_item *di;
1999	struct btrfs_key key;
2000	struct btrfs_path *path;
2001	struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
2002
2003	path = alloc_path_for_send();
2004	if (!path)
2005	return -ENOMEM;
2006
2007	di = btrfs_lookup_dir_item(NULL, root, path, dir, name: &name_str, mod: `0`);
2008	if (IS_ERR_OR_NULL(ptr: di)) {
2009	ret = di ? PTR_ERR(ptr: di) : -ENOENT;
2010	goto out;
2011	}
2012	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &key);
2013	if (key.type == BTRFS_ROOT_ITEM_KEY) {
2014	ret = -ENOENT;
2015	goto out;
2016	}
2017	*found_inode = key.objectid;
2018
2019	out:
2020	btrfs_free_path(p: path);
2021	return ret;
2022	}
2023
2024	/*
2025	* Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
2026	* generation of the parent dir and the name of the dir entry.
2027	*/
2028	static int get_first_ref(struct btrfs_root *root, u64 ino,
2029	u64 dir, u64 dir_gen, struct fs_path *name)
2030	{
2031	int ret;
2032	struct btrfs_key key;
2033	struct btrfs_key found_key;
2034	struct btrfs_path *path;
2035	int len;
2036	u64 parent_dir;
2037
2038	path = alloc_path_for_send();
2039	if (!path)
2040	return -ENOMEM;
2041
2042	key.objectid = ino;
2043	key.type = BTRFS_INODE_REF_KEY;
2044	key.offset = `0`;
2045
2046	ret = btrfs_search_slot_for_read(root, key: &key, p: path, find_higher: `1`, return_any: `0`);
2047	if (ret < `0`)
2048	goto out;
2049	if (!ret)
2050	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &found_key,
2051	nr: path->slots[`0`]);
2052	if (ret \|\| found_key.objectid != ino \|\|
2053	(found_key.type != BTRFS_INODE_REF_KEY &&
2054	found_key.type != BTRFS_INODE_EXTREF_KEY)) {
2055	ret = -ENOENT;
2056	goto out;
2057	}
2058
2059	if (found_key.type == BTRFS_INODE_REF_KEY) {
2060	struct btrfs_inode_ref *iref;
2061	iref = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
2062	struct btrfs_inode_ref);
2063	len = btrfs_inode_ref_name_len(eb: path->nodes[`0`], s: iref);
2064	ret = fs_path_add_from_extent_buffer(p: name, eb: path->nodes[`0`],
2065	off: (unsigned long)(iref + `1`),
2066	len);
2067	parent_dir = found_key.offset;
2068	} else {
2069	struct btrfs_inode_extref *extref;
2070	extref = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
2071	struct btrfs_inode_extref);
2072	len = btrfs_inode_extref_name_len(eb: path->nodes[`0`], s: extref);
2073	ret = fs_path_add_from_extent_buffer(p: name, eb: path->nodes[`0`],
2074	off: (unsigned long)&extref->name, len);
2075	parent_dir = btrfs_inode_extref_parent(eb: path->nodes[`0`], s: extref);
2076	}
2077	if (ret < `0`)
2078	goto out;
2079	btrfs_release_path(p: path);
2080
2081	if (dir_gen) {
2082	ret = get_inode_gen(root, ino: parent_dir, gen: dir_gen);
2083	if (ret < `0`)
2084	goto out;
2085	}
2086
2087	*dir = parent_dir;
2088
2089	out:
2090	btrfs_free_path(p: path);
2091	return ret;
2092	}
2093
2094	static int is_first_ref(struct btrfs_root *root,
2095	u64 ino, u64 dir,
2096	const char name, int* name_len)
2097	{
2098	int ret;
2099	struct fs_path *tmp_name;
2100	u64 tmp_dir;
2101
2102	tmp_name = fs_path_alloc();
2103	if (!tmp_name)
2104	return -ENOMEM;
2105
2106	ret = get_first_ref(root, ino, dir: &tmp_dir, NULL, name: tmp_name);
2107	if (ret < `0`)
2108	goto out;
2109
2110	if (dir != tmp_dir \|\| name_len != fs_path_len(p: tmp_name)) {
2111	ret = `0`;
2112	goto out;
2113	}
2114
2115	ret = !memcmp(p: tmp_name->start, q: name, size: name_len);
2116
2117	out:
2118	fs_path_free(p: tmp_name);
2119	return ret;
2120	}
2121
2122	/*
2123	* Used by process_recorded_refs to determine if a new ref would overwrite an
2124	* already existing ref. In case it detects an overwrite, it returns the
2125	* inode/gen in who_ino/who_gen.
2126	* When an overwrite is detected, process_recorded_refs does proper orphanizing
2127	* to make sure later references to the overwritten inode are possible.
2128	* Orphanizing is however only required for the first ref of an inode.
2129	* process_recorded_refs does an additional is_first_ref check to see if
2130	* orphanizing is really required.
2131	*/
2132	static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2133	const char name, int* name_len,
2134	u64 who_ino, u64 who_gen, u64 *who_mode)
2135	{
2136	int ret;
2137	u64 parent_root_dir_gen;
2138	u64 other_inode = `0`;
2139	struct btrfs_inode_info info;
2140
2141	if (!sctx->parent_root)
2142	return `0`;
2143
2144	ret = is_inode_existent(sctx, ino: dir, gen: dir_gen, NULL, parent_gen: &parent_root_dir_gen);
2145	if (ret <= `0`)
2146	return `0`;
2147
2148	/*
2149	* If we have a parent root we need to verify that the parent dir was
2150	* not deleted and then re-created, if it was then we have no overwrite
2151	* and we can just unlink this entry.
2152	*
2153	* @parent_root_dir_gen was set to 0 if the inode does not exist in the
2154	* parent root.
2155	*/
2156	if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID &&
2157	parent_root_dir_gen != dir_gen)
2158	return `0`;
2159
2160	ret = lookup_dir_item_inode(root: sctx->parent_root, dir, name, name_len,
2161	found_inode: &other_inode);
2162	if (ret == -ENOENT)
2163	return `0`;
2164	else if (ret < `0`)
2165	return ret;
2166
2167	/*
2168	* Check if the overwritten ref was already processed. If yes, the ref
2169	* was already unlinked/moved, so we can safely assume that we will not
2170	* overwrite anything at this point in time.
2171	*/
2172	if (other_inode > sctx->send_progress \|\|
2173	is_waiting_for_move(sctx, ino: other_inode)) {
2174	ret = get_inode_info(root: sctx->parent_root, ino: other_inode, info: &info);
2175	if (ret < `0`)
2176	return ret;
2177
2178	*who_ino = other_inode;
2179	*who_gen = info.gen;
2180	*who_mode = info.mode;
2181	return `1`;
2182	}
2183
2184	return `0`;
2185	}
2186
2187	/*
2188	* Checks if the ref was overwritten by an already processed inode. This is
2189	* used by __get_cur_name_and_parent to find out if the ref was orphanized and
2190	* thus the orphan name needs be used.
2191	* process_recorded_refs also uses it to avoid unlinking of refs that were
2192	* overwritten.
2193	*/
2194	static int did_overwrite_ref(struct send_ctx *sctx,
2195	u64 dir, u64 dir_gen,
2196	u64 ino, u64 ino_gen,
2197	const char name, int* name_len)
2198	{
2199	int ret;
2200	u64 ow_inode;
2201	u64 ow_gen = `0`;
2202	u64 send_root_dir_gen;
2203
2204	if (!sctx->parent_root)
2205	return `0`;
2206
2207	ret = is_inode_existent(sctx, ino: dir, gen: dir_gen, send_gen: &send_root_dir_gen, NULL);
2208	if (ret <= `0`)
2209	return ret;
2210
2211	/*
2212	* @send_root_dir_gen was set to 0 if the inode does not exist in the
2213	* send root.
2214	*/
2215	if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen)
2216	return `0`;
2217
2218	/ check if the ref was overwritten by another ref /
2219	ret = lookup_dir_item_inode(root: sctx->send_root, dir, name, name_len,
2220	found_inode: &ow_inode);
2221	if (ret == -ENOENT) {
2222	/ was never and will never be overwritten /
2223	return `0`;
2224	} else if (ret < `0`) {
2225	return ret;
2226	}
2227
2228	if (ow_inode == ino) {
2229	ret = get_inode_gen(root: sctx->send_root, ino: ow_inode, gen: &ow_gen);
2230	if (ret < `0`)
2231	return ret;
2232
2233	/ It's the same inode, so no overwrite happened. /
2234	if (ow_gen == ino_gen)
2235	return `0`;
2236	}
2237
2238	/*
2239	* We know that it is or will be overwritten. Check this now.
2240	* The current inode being processed might have been the one that caused
2241	* inode 'ino' to be orphanized, therefore check if ow_inode matches
2242	* the current inode being processed.
2243	*/
2244	if (ow_inode < sctx->send_progress)
2245	return `1`;
2246
2247	if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) {
2248	if (ow_gen == `0`) {
2249	ret = get_inode_gen(root: sctx->send_root, ino: ow_inode, gen: &ow_gen);
2250	if (ret < `0`)
2251	return ret;
2252	}
2253	if (ow_gen == sctx->cur_inode_gen)
2254	return `1`;
2255	}
2256
2257	return `0`;
2258	}
2259
2260	/*
2261	* Same as did_overwrite_ref, but also checks if it is the first ref of an inode
2262	* that got overwritten. This is used by process_recorded_refs to determine
2263	* if it has to use the path as returned by get_cur_path or the orphan name.
2264	*/
2265	static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
2266	{
2267	int ret = `0`;
2268	struct fs_path *name = NULL;
2269	u64 dir;
2270	u64 dir_gen;
2271
2272	if (!sctx->parent_root)
2273	goto out;
2274
2275	name = fs_path_alloc();
2276	if (!name)
2277	return -ENOMEM;
2278
2279	ret = get_first_ref(root: sctx->parent_root, ino, dir: &dir, dir_gen: &dir_gen, name);
2280	if (ret < `0`)
2281	goto out;
2282
2283	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, ino_gen: gen,
2284	name: name->start, name_len: fs_path_len(p: name));
2285
2286	out:
2287	fs_path_free(p: name);
2288	return ret;
2289	}
2290
2291	static inline struct name_cache_entry name_cache_search(struct* send_ctx *sctx,
2292	u64 ino, u64 gen)
2293	{
2294	struct btrfs_lru_cache_entry *entry;
2295
2296	entry = btrfs_lru_cache_lookup(cache: &sctx->name_cache, key: ino, gen);
2297	if (!entry)
2298	return NULL;
2299
2300	return container_of(entry, struct name_cache_entry, entry);
2301	}
2302
2303	/*
2304	* Used by get_cur_path for each ref up to the root.
2305	* Returns 0 if it succeeded.
2306	* Returns 1 if the inode is not existent or got overwritten. In that case, the
2307	* name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2308	* is returned, parent_ino/parent_gen are not guaranteed to be valid.
2309	* Returns <0 in case of error.
2310	*/
2311	static int __get_cur_name_and_parent(struct send_ctx *sctx,
2312	u64 ino, u64 gen,
2313	u64 *parent_ino,
2314	u64 *parent_gen,
2315	struct fs_path *dest)
2316	{
2317	int ret;
2318	int nce_ret;
2319	struct name_cache_entry *nce;
2320
2321	/*
2322	* First check if we already did a call to this function with the same
2323	* ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2324	* return the cached result.
2325	*/
2326	nce = name_cache_search(sctx, ino, gen);
2327	if (nce) {
2328	if (ino < sctx->send_progress && nce->need_later_update) {
2329	btrfs_lru_cache_remove(cache: &sctx->name_cache, entry: &nce->entry);
2330	nce = NULL;
2331	} else {
2332	*parent_ino = nce->parent_ino;
2333	*parent_gen = nce->parent_gen;
2334	ret = fs_path_add(p: dest, name: nce->name, name_len: nce->name_len);
2335	if (ret < `0`)
2336	goto out;
2337	ret = nce->ret;
2338	goto out;
2339	}
2340	}
2341
2342	/*
2343	* If the inode is not existent yet, add the orphan name and return 1.
2344	* This should only happen for the parent dir that we determine in
2345	* record_new_ref_if_needed().
2346	*/
2347	ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
2348	if (ret < `0`)
2349	goto out;
2350
2351	if (!ret) {
2352	ret = gen_unique_name(sctx, ino, gen, dest);
2353	if (ret < `0`)
2354	goto out;
2355	ret = `1`;
2356	goto out_cache;
2357	}
2358
2359	/*
2360	* Depending on whether the inode was already processed or not, use
2361	* send_root or parent_root for ref lookup.
2362	*/
2363	if (ino < sctx->send_progress)
2364	ret = get_first_ref(root: sctx->send_root, ino,
2365	dir: parent_ino, dir_gen: parent_gen, name: dest);
2366	else
2367	ret = get_first_ref(root: sctx->parent_root, ino,
2368	dir: parent_ino, dir_gen: parent_gen, name: dest);
2369	if (ret < `0`)
2370	goto out;
2371
2372	/*
2373	* Check if the ref was overwritten by an inode's ref that was processed
2374	* earlier. If yes, treat as orphan and return 1.
2375	*/
2376	ret = did_overwrite_ref(sctx, dir: parent_ino, dir_gen: parent_gen, ino, ino_gen: gen,
2377	name: dest->start, name_len: dest->end - dest->start);
2378	if (ret < `0`)
2379	goto out;
2380	if (ret) {
2381	fs_path_reset(p: dest);
2382	ret = gen_unique_name(sctx, ino, gen, dest);
2383	if (ret < `0`)
2384	goto out;
2385	ret = `1`;
2386	}
2387
2388	out_cache:
2389	/*
2390	* Store the result of the lookup in the name cache.
2391	*/
2392	nce = kmalloc(size: sizeof(*nce) + fs_path_len(p: dest) + `1`, GFP_KERNEL);
2393	if (!nce) {
2394	ret = -ENOMEM;
2395	goto out;
2396	}
2397
2398	nce->entry.key = ino;
2399	nce->entry.gen = gen;
2400	nce->parent_ino = *parent_ino;
2401	nce->parent_gen = *parent_gen;
2402	nce->name_len = fs_path_len(p: dest);
2403	nce->ret = ret;
2404	strcpy(p: nce->name, q: dest->start);
2405
2406	if (ino < sctx->send_progress)
2407	nce->need_later_update = `0`;
2408	else
2409	nce->need_later_update = `1`;
2410
2411	nce_ret = btrfs_lru_cache_store(cache: &sctx->name_cache, new_entry: &nce->entry, GFP_KERNEL);
2412	if (nce_ret < `0`) {
2413	kfree(objp: nce);
2414	ret = nce_ret;
2415	}
2416
2417	out:
2418	return ret;
2419	}
2420
2421	/*
2422	* Magic happens here. This function returns the first ref to an inode as it
2423	* would look like while receiving the stream at this point in time.
2424	* We walk the path up to the root. For every inode in between, we check if it
2425	* was already processed/sent. If yes, we continue with the parent as found
2426	* in send_root. If not, we continue with the parent as found in parent_root.
2427	* If we encounter an inode that was deleted at this point in time, we use the
2428	* inodes "orphan" name instead of the real name and stop. Same with new inodes
2429	* that were not created yet and overwritten inodes/refs.
2430	*
2431	* When do we have orphan inodes:
2432	* 1. When an inode is freshly created and thus no valid refs are available yet
2433	* 2. When a directory lost all it's refs (deleted) but still has dir items
2434	* inside which were not processed yet (pending for move/delete). If anyone
2435	* tried to get the path to the dir items, it would get a path inside that
2436	* orphan directory.
2437	* 3. When an inode is moved around or gets new links, it may overwrite the ref
2438	* of an unprocessed inode. If in that case the first ref would be
2439	* overwritten, the overwritten inode gets "orphanized". Later when we
2440	* process this overwritten inode, it is restored at a new place by moving
2441	* the orphan inode.
2442	*
2443	* sctx->send_progress tells this function at which point in time receiving
2444	* would be.
2445	*/
2446	static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2447	struct fs_path *dest)
2448	{
2449	int ret = `0`;
2450	struct fs_path *name = NULL;
2451	u64 parent_inode = `0`;
2452	u64 parent_gen = `0`;
2453	int stop = `0`;
2454
2455	name = fs_path_alloc();
2456	if (!name) {
2457	ret = -ENOMEM;
2458	goto out;
2459	}
2460
2461	dest->reversed = `1`;
2462	fs_path_reset(p: dest);
2463
2464	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2465	struct waiting_dir_move *wdm;
2466
2467	fs_path_reset(p: name);
2468
2469	if (is_waiting_for_rm(sctx, dir_ino: ino, gen)) {
2470	ret = gen_unique_name(sctx, ino, gen, dest: name);
2471	if (ret < `0`)
2472	goto out;
2473	ret = fs_path_add_path(p: dest, p2: name);
2474	break;
2475	}
2476
2477	wdm = get_waiting_dir_move(sctx, ino);
2478	if (wdm && wdm->orphanized) {
2479	ret = gen_unique_name(sctx, ino, gen, dest: name);
2480	stop = `1`;
2481	} else if (wdm) {
2482	ret = get_first_ref(root: sctx->parent_root, ino,
2483	dir: &parent_inode, dir_gen: &parent_gen, name);
2484	} else {
2485	ret = __get_cur_name_and_parent(sctx, ino, gen,
2486	parent_ino: &parent_inode,
2487	parent_gen: &parent_gen, dest: name);
2488	if (ret)
2489	stop = `1`;
2490	}
2491
2492	if (ret < `0`)
2493	goto out;
2494
2495	ret = fs_path_add_path(p: dest, p2: name);
2496	if (ret < `0`)
2497	goto out;
2498
2499	ino = parent_inode;
2500	gen = parent_gen;
2501	}
2502
2503	out:
2504	fs_path_free(p: name);
2505	if (!ret)
2506	fs_path_unreverse(p: dest);
2507	return ret;
2508	}
2509
2510	/*
2511	* Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2512	*/
2513	static int send_subvol_begin(struct send_ctx *sctx)
2514	{
2515	int ret;
2516	struct btrfs_root *send_root = sctx->send_root;
2517	struct btrfs_root *parent_root = sctx->parent_root;
2518	struct btrfs_path *path;
2519	struct btrfs_key key;
2520	struct btrfs_root_ref *ref;
2521	struct extent_buffer *leaf;
2522	char *name = NULL;
2523	int namelen;
2524
2525	path = btrfs_alloc_path();
2526	if (!path)
2527	return -ENOMEM;
2528
2529	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2530	if (!name) {
2531	btrfs_free_path(p: path);
2532	return -ENOMEM;
2533	}
2534
2535	key.objectid = send_root->root_key.objectid;
2536	key.type = BTRFS_ROOT_BACKREF_KEY;
2537	key.offset = `0`;
2538
2539	ret = btrfs_search_slot_for_read(root: send_root->fs_info->tree_root,
2540	key: &key, p: path, find_higher: `1`, return_any: `0`);
2541	if (ret < `0`)
2542	goto out;
2543	if (ret) {
2544	ret = -ENOENT;
2545	goto out;
2546	}
2547
2548	leaf = path->nodes[`0`];
2549	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
2550	if (key.type != BTRFS_ROOT_BACKREF_KEY \|\|
2551	key.objectid != send_root->root_key.objectid) {
2552	ret = -ENOENT;
2553	goto out;
2554	}
2555	ref = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_root_ref);
2556	namelen = btrfs_root_ref_name_len(eb: leaf, s: ref);
2557	read_extent_buffer(eb: leaf, dst: name, start: (unsigned long)(ref + `1`), len: namelen);
2558	btrfs_release_path(p: path);
2559
2560	if (parent_root) {
2561	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SNAPSHOT);
2562	if (ret < `0`)
2563	goto out;
2564	} else {
2565	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SUBVOL);
2566	if (ret < `0`)
2567	goto out;
2568	}
2569
2570	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2571
2572	if (!btrfs_is_empty_uuid(uuid: sctx->send_root->root_item.received_uuid))
2573	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2574	sctx->send_root->root_item.received_uuid);
2575	else
2576	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2577	sctx->send_root->root_item.uuid);
2578
2579	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2580	btrfs_root_ctransid(&sctx->send_root->root_item));
2581	if (parent_root) {
2582	if (!btrfs_is_empty_uuid(uuid: parent_root->root_item.received_uuid))
2583	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2584	parent_root->root_item.received_uuid);
2585	else
2586	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2587	parent_root->root_item.uuid);
2588	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2589	btrfs_root_ctransid(&sctx->parent_root->root_item));
2590	}
2591
2592	ret = send_cmd(sctx);
2593
2594	tlv_put_failure:
2595	out:
2596	btrfs_free_path(p: path);
2597	kfree(objp: name);
2598	return ret;
2599	}
2600
2601	static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2602	{
2603	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2604	int ret = `0`;
2605	struct fs_path *p;
2606
2607	btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
2608
2609	p = fs_path_alloc();
2610	if (!p)
2611	return -ENOMEM;
2612
2613	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_TRUNCATE);
2614	if (ret < `0`)
2615	goto out;
2616
2617	ret = get_cur_path(sctx, ino, gen, dest: p);
2618	if (ret < `0`)
2619	goto out;
2620	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2621	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2622
2623	ret = send_cmd(sctx);
2624
2625	tlv_put_failure:
2626	out:
2627	fs_path_free(p);
2628	return ret;
2629	}
2630
2631	static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2632	{
2633	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2634	int ret = `0`;
2635	struct fs_path *p;
2636
2637	btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
2638
2639	p = fs_path_alloc();
2640	if (!p)
2641	return -ENOMEM;
2642
2643	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CHMOD);
2644	if (ret < `0`)
2645	goto out;
2646
2647	ret = get_cur_path(sctx, ino, gen, dest: p);
2648	if (ret < `0`)
2649	goto out;
2650	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2651	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & `07777`);
2652
2653	ret = send_cmd(sctx);
2654
2655	tlv_put_failure:
2656	out:
2657	fs_path_free(p);
2658	return ret;
2659	}
2660
2661	static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
2662	{
2663	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2664	int ret = `0`;
2665	struct fs_path *p;
2666
2667	if (sctx->proto < `2`)
2668	return `0`;
2669
2670	btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
2671
2672	p = fs_path_alloc();
2673	if (!p)
2674	return -ENOMEM;
2675
2676	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_FILEATTR);
2677	if (ret < `0`)
2678	goto out;
2679
2680	ret = get_cur_path(sctx, ino, gen, dest: p);
2681	if (ret < `0`)
2682	goto out;
2683	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2684	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
2685
2686	ret = send_cmd(sctx);
2687
2688	tlv_put_failure:
2689	out:
2690	fs_path_free(p);
2691	return ret;
2692	}
2693
2694	static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2695	{
2696	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2697	int ret = `0`;
2698	struct fs_path *p;
2699
2700	btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
2701	ino, uid, gid);
2702
2703	p = fs_path_alloc();
2704	if (!p)
2705	return -ENOMEM;
2706
2707	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CHOWN);
2708	if (ret < `0`)
2709	goto out;
2710
2711	ret = get_cur_path(sctx, ino, gen, dest: p);
2712	if (ret < `0`)
2713	goto out;
2714	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2715	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2716	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2717
2718	ret = send_cmd(sctx);
2719
2720	tlv_put_failure:
2721	out:
2722	fs_path_free(p);
2723	return ret;
2724	}
2725
2726	static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2727	{
2728	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2729	int ret = `0`;
2730	struct fs_path *p = NULL;
2731	struct btrfs_inode_item *ii;
2732	struct btrfs_path *path = NULL;
2733	struct extent_buffer *eb;
2734	struct btrfs_key key;
2735	int slot;
2736
2737	btrfs_debug(fs_info, "send_utimes %llu", ino);
2738
2739	p = fs_path_alloc();
2740	if (!p)
2741	return -ENOMEM;
2742
2743	path = alloc_path_for_send();
2744	if (!path) {
2745	ret = -ENOMEM;
2746	goto out;
2747	}
2748
2749	key.objectid = ino;
2750	key.type = BTRFS_INODE_ITEM_KEY;
2751	key.offset = `0`;
2752	ret = btrfs_search_slot(NULL, root: sctx->send_root, key: &key, p: path, ins_len: `0`, cow: `0`);
2753	if (ret > `0`)
2754	ret = -ENOENT;
2755	if (ret < `0`)
2756	goto out;
2757
2758	eb = path->nodes[`0`];
2759	slot = path->slots[`0`];
2760	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2761
2762	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UTIMES);
2763	if (ret < `0`)
2764	goto out;
2765
2766	ret = get_cur_path(sctx, ino, gen, dest: p);
2767	if (ret < `0`)
2768	goto out;
2769	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2770	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2771	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2772	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2773	if (sctx->proto >= `2`)
2774	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
2775
2776	ret = send_cmd(sctx);
2777
2778	tlv_put_failure:
2779	out:
2780	fs_path_free(p);
2781	btrfs_free_path(p: path);
2782	return ret;
2783	}
2784
2785	/*
2786	* If the cache is full, we can't remove entries from it and do a call to
2787	* send_utimes() for each respective inode, because we might be finishing
2788	* processing an inode that is a directory and it just got renamed, and existing
2789	* entries in the cache may refer to inodes that have the directory in their
2790	* full path - in which case we would generate outdated paths (pre-rename)
2791	* for the inodes that the cache entries point to. Instead of prunning the
2792	* cache when inserting, do it after we finish processing each inode at
2793	* finish_inode_if_needed().
2794	*/
2795	static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
2796	{
2797	struct btrfs_lru_cache_entry *entry;
2798	int ret;
2799
2800	entry = btrfs_lru_cache_lookup(cache: &sctx->dir_utimes_cache, key: dir, gen);
2801	if (entry != NULL)
2802	return `0`;
2803
2804	/ Caching is optional, don't fail if we can't allocate memory. /
2805	entry = kmalloc(size: sizeof(*entry), GFP_KERNEL);
2806	if (!entry)
2807	return send_utimes(sctx, ino: dir, gen);
2808
2809	entry->key = dir;
2810	entry->gen = gen;
2811
2812	ret = btrfs_lru_cache_store(cache: &sctx->dir_utimes_cache, new_entry: entry, GFP_KERNEL);
2813	ASSERT(ret != -EEXIST);
2814	if (ret) {
2815	kfree(objp: entry);
2816	return send_utimes(sctx, ino: dir, gen);
2817	}
2818
2819	return `0`;
2820	}
2821
2822	static int trim_dir_utimes_cache(struct send_ctx *sctx)
2823	{
2824	while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
2825	struct btrfs_lru_cache_entry *lru;
2826	int ret;
2827
2828	lru = btrfs_lru_cache_lru_entry(cache: &sctx->dir_utimes_cache);
2829	ASSERT(lru != NULL);
2830
2831	ret = send_utimes(sctx, ino: lru->key, gen: lru->gen);
2832	if (ret)
2833	return ret;
2834
2835	btrfs_lru_cache_remove(cache: &sctx->dir_utimes_cache, entry: lru);
2836	}
2837
2838	return `0`;
2839	}
2840
2841	/*
2842	* Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2843	* a valid path yet because we did not process the refs yet. So, the inode
2844	* is created as orphan.
2845	*/
2846	static int send_create_inode(struct send_ctx *sctx, u64 ino)
2847	{
2848	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2849	int ret = `0`;
2850	struct fs_path *p;
2851	int cmd;
2852	struct btrfs_inode_info info;
2853	u64 gen;
2854	u64 mode;
2855	u64 rdev;
2856
2857	btrfs_debug(fs_info, "send_create_inode %llu", ino);
2858
2859	p = fs_path_alloc();
2860	if (!p)
2861	return -ENOMEM;
2862
2863	if (ino != sctx->cur_ino) {
2864	ret = get_inode_info(root: sctx->send_root, ino, info: &info);
2865	if (ret < `0`)
2866	goto out;
2867	gen = info.gen;
2868	mode = info.mode;
2869	rdev = info.rdev;
2870	} else {
2871	gen = sctx->cur_inode_gen;
2872	mode = sctx->cur_inode_mode;
2873	rdev = sctx->cur_inode_rdev;
2874	}
2875
2876	if (S_ISREG(mode)) {
2877	cmd = BTRFS_SEND_C_MKFILE;
2878	} else if (S_ISDIR(mode)) {
2879	cmd = BTRFS_SEND_C_MKDIR;
2880	} else if (S_ISLNK(mode)) {
2881	cmd = BTRFS_SEND_C_SYMLINK;
2882	} else if (S_ISCHR(mode) \|\| S_ISBLK(mode)) {
2883	cmd = BTRFS_SEND_C_MKNOD;
2884	} else if (S_ISFIFO(mode)) {
2885	cmd = BTRFS_SEND_C_MKFIFO;
2886	} else if (S_ISSOCK(mode)) {
2887	cmd = BTRFS_SEND_C_MKSOCK;
2888	} else {
2889	btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
2890	(int)(mode & S_IFMT));
2891	ret = -EOPNOTSUPP;
2892	goto out;
2893	}
2894
2895	ret = begin_cmd(sctx, cmd);
2896	if (ret < `0`)
2897	goto out;
2898
2899	ret = gen_unique_name(sctx, ino, gen, dest: p);
2900	if (ret < `0`)
2901	goto out;
2902
2903	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2904	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2905
2906	if (S_ISLNK(mode)) {
2907	fs_path_reset(p);
2908	ret = read_symlink(root: sctx->send_root, ino, dest: p);
2909	if (ret < `0`)
2910	goto out;
2911	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2912	} else if (S_ISCHR(mode) \|\| S_ISBLK(mode) \|\|
2913	S_ISFIFO(mode) \|\| S_ISSOCK(mode)) {
2914	TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
2915	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
2916	}
2917
2918	ret = send_cmd(sctx);
2919	if (ret < `0`)
2920	goto out;
2921
2922
2923	tlv_put_failure:
2924	out:
2925	fs_path_free(p);
2926	return ret;
2927	}
2928
2929	static void cache_dir_created(struct send_ctx *sctx, u64 dir)
2930	{
2931	struct btrfs_lru_cache_entry *entry;
2932	int ret;
2933
2934	/ Caching is optional, ignore any failures. /
2935	entry = kmalloc(size: sizeof(*entry), GFP_KERNEL);
2936	if (!entry)
2937	return;
2938
2939	entry->key = dir;
2940	entry->gen = `0`;
2941	ret = btrfs_lru_cache_store(cache: &sctx->dir_created_cache, new_entry: entry, GFP_KERNEL);
2942	if (ret < `0`)
2943	kfree(objp: entry);
2944	}
2945
2946	/*
2947	* We need some special handling for inodes that get processed before the parent
2948	* directory got created. See process_recorded_refs for details.
2949	* This function does the check if we already created the dir out of order.
2950	*/
2951	static int did_create_dir(struct send_ctx *sctx, u64 dir)
2952	{
2953	int ret = `0`;
2954	int iter_ret = `0`;
2955	struct btrfs_path *path = NULL;
2956	struct btrfs_key key;
2957	struct btrfs_key found_key;
2958	struct btrfs_key di_key;
2959	struct btrfs_dir_item *di;
2960
2961	if (btrfs_lru_cache_lookup(cache: &sctx->dir_created_cache, key: dir, gen: `0`))
2962	return `1`;
2963
2964	path = alloc_path_for_send();
2965	if (!path)
2966	return -ENOMEM;
2967
2968	key.objectid = dir;
2969	key.type = BTRFS_DIR_INDEX_KEY;
2970	key.offset = `0`;
2971
2972	btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
2973	struct extent_buffer *eb = path->nodes[`0`];
2974
2975	if (found_key.objectid != key.objectid \|\|
2976	found_key.type != key.type) {
2977	ret = `0`;
2978	break;
2979	}
2980
2981	di = btrfs_item_ptr(eb, path->slots[`0`], struct btrfs_dir_item);
2982	btrfs_dir_item_key_to_cpu(eb, item: di, cpu_key: &di_key);
2983
2984	if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
2985	di_key.objectid < sctx->send_progress) {
2986	ret = `1`;
2987	cache_dir_created(sctx, dir);
2988	break;
2989	}
2990	}
2991	/ Catch error found during iteration /
2992	if (iter_ret < `0`)
2993	ret = iter_ret;
2994
2995	btrfs_free_path(p: path);
2996	return ret;
2997	}
2998
2999	/*
3000	* Only creates the inode if it is:
3001	* 1. Not a directory
3002	* 2. Or a directory which was not created already due to out of order
3003	* directories. See did_create_dir and process_recorded_refs for details.
3004	*/
3005	static int send_create_inode_if_needed(struct send_ctx *sctx)
3006	{
3007	int ret;
3008
3009	if (S_ISDIR(sctx->cur_inode_mode)) {
3010	ret = did_create_dir(sctx, dir: sctx->cur_ino);
3011	if (ret < `0`)
3012	return ret;
3013	else if (ret > `0`)
3014	return `0`;
3015	}
3016
3017	ret = send_create_inode(sctx, ino: sctx->cur_ino);
3018
3019	if (ret == `0` && S_ISDIR(sctx->cur_inode_mode))
3020	cache_dir_created(sctx, dir: sctx->cur_ino);
3021
3022	return ret;
3023	}
3024
3025	struct recorded_ref {
3026	struct list_head list;
3027	char *name;
3028	struct fs_path *full_path;
3029	u64 dir;
3030	u64 dir_gen;
3031	int name_len;
3032	struct rb_node node;
3033	struct rb_root *root;
3034	};
3035
3036	static struct recorded_ref recorded_ref_alloc(void*)
3037	{
3038	struct recorded_ref *ref;
3039
3040	ref = kzalloc(size: sizeof(*ref), GFP_KERNEL);
3041	if (!ref)
3042	return NULL;
3043	RB_CLEAR_NODE(&ref->node);
3044	INIT_LIST_HEAD(list: &ref->list);
3045	return ref;
3046	}
3047
3048	static void recorded_ref_free(struct recorded_ref *ref)
3049	{
3050	if (!ref)
3051	return;
3052	if (!RB_EMPTY_NODE(&ref->node))
3053	rb_erase(&ref->node, ref->root);
3054	list_del(entry: &ref->list);
3055	fs_path_free(p: ref->full_path);
3056	kfree(objp: ref);
3057	}
3058
3059	static void set_ref_path(struct recorded_ref ref, struct* fs_path *path)
3060	{
3061	ref->full_path = path;
3062	ref->name = (char *)kbasename(path: ref->full_path->start);
3063	ref->name_len = ref->full_path->end - ref->name;
3064	}
3065
3066	static int dup_ref(struct recorded_ref ref, struct* list_head *list)
3067	{
3068	struct recorded_ref *new;
3069
3070	new = recorded_ref_alloc();
3071	if (!new)
3072	return -ENOMEM;
3073
3074	new->dir = ref->dir;
3075	new->dir_gen = ref->dir_gen;
3076	list_add_tail(new: &new->list, head: list);
3077	return `0`;
3078	}
3079
3080	static void __free_recorded_refs(struct list_head *head)
3081	{
3082	struct recorded_ref *cur;
3083
3084	while (!list_empty(head)) {
3085	cur = list_entry(head->next, struct recorded_ref, list);
3086	recorded_ref_free(ref: cur);
3087	}
3088	}
3089
3090	static void free_recorded_refs(struct send_ctx *sctx)
3091	{
3092	__free_recorded_refs(head: &sctx->new_refs);
3093	__free_recorded_refs(head: &sctx->deleted_refs);
3094	}
3095
3096	/*
3097	* Renames/moves a file/dir to its orphan name. Used when the first
3098	* ref of an unprocessed inode gets overwritten and for all non empty
3099	* directories.
3100	*/
3101	static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
3102	struct fs_path *path)
3103	{
3104	int ret;
3105	struct fs_path *orphan;
3106
3107	orphan = fs_path_alloc();
3108	if (!orphan)
3109	return -ENOMEM;
3110
3111	ret = gen_unique_name(sctx, ino, gen, dest: orphan);
3112	if (ret < `0`)
3113	goto out;
3114
3115	ret = send_rename(sctx, from: path, to: orphan);
3116
3117	out:
3118	fs_path_free(p: orphan);
3119	return ret;
3120	}
3121
3122	static struct orphan_dir_info add_orphan_dir_info(struct* send_ctx *sctx,
3123	u64 dir_ino, u64 dir_gen)
3124	{
3125	struct rb_node **p = &sctx->orphan_dirs.rb_node;
3126	struct rb_node *parent = NULL;
3127	struct orphan_dir_info entry, odi;
3128
3129	while (*p) {
3130	parent = *p;
3131	entry = rb_entry(parent, struct orphan_dir_info, node);
3132	if (dir_ino < entry->ino)
3133	p = &(*p)->rb_left;
3134	else if (dir_ino > entry->ino)
3135	p = &(*p)->rb_right;
3136	else if (dir_gen < entry->gen)
3137	p = &(*p)->rb_left;
3138	else if (dir_gen > entry->gen)
3139	p = &(*p)->rb_right;
3140	else
3141	return entry;
3142	}
3143
3144	odi = kmalloc(size: sizeof(*odi), GFP_KERNEL);
3145	if (!odi)
3146	return ERR_PTR(error: -ENOMEM);
3147	odi->ino = dir_ino;
3148	odi->gen = dir_gen;
3149	odi->last_dir_index_offset = `0`;
3150	odi->dir_high_seq_ino = `0`;
3151
3152	rb_link_node(node: &odi->node, parent, rb_link: p);
3153	rb_insert_color(&odi->node, &sctx->orphan_dirs);
3154	return odi;
3155	}
3156
3157	static struct orphan_dir_info get_orphan_dir_info(struct* send_ctx *sctx,
3158	u64 dir_ino, u64 gen)
3159	{
3160	struct rb_node *n = sctx->orphan_dirs.rb_node;
3161	struct orphan_dir_info *entry;
3162
3163	while (n) {
3164	entry = rb_entry(n, struct orphan_dir_info, node);
3165	if (dir_ino < entry->ino)
3166	n = n->rb_left;
3167	else if (dir_ino > entry->ino)
3168	n = n->rb_right;
3169	else if (gen < entry->gen)
3170	n = n->rb_left;
3171	else if (gen > entry->gen)
3172	n = n->rb_right;
3173	else
3174	return entry;
3175	}
3176	return NULL;
3177	}
3178
3179	static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
3180	{
3181	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
3182
3183	return odi != NULL;
3184	}
3185
3186	static void free_orphan_dir_info(struct send_ctx *sctx,
3187	struct orphan_dir_info *odi)
3188	{
3189	if (!odi)
3190	return;
3191	rb_erase(&odi->node, &sctx->orphan_dirs);
3192	kfree(objp: odi);
3193	}
3194
3195	/*
3196	* Returns 1 if a directory can be removed at this point in time.
3197	* We check this by iterating all dir items and checking if the inode behind
3198	* the dir item was already processed.
3199	*/
3200	static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
3201	{
3202	int ret = `0`;
3203	int iter_ret = `0`;
3204	struct btrfs_root *root = sctx->parent_root;
3205	struct btrfs_path *path;
3206	struct btrfs_key key;
3207	struct btrfs_key found_key;
3208	struct btrfs_key loc;
3209	struct btrfs_dir_item *di;
3210	struct orphan_dir_info *odi = NULL;
3211	u64 dir_high_seq_ino = `0`;
3212	u64 last_dir_index_offset = `0`;
3213
3214	/*
3215	* Don't try to rmdir the top/root subvolume dir.
3216	*/
3217	if (dir == BTRFS_FIRST_FREE_OBJECTID)
3218	return `0`;
3219
3220	odi = get_orphan_dir_info(sctx, dir_ino: dir, gen: dir_gen);
3221	if (odi && sctx->cur_ino < odi->dir_high_seq_ino)
3222	return `0`;
3223
3224	path = alloc_path_for_send();
3225	if (!path)
3226	return -ENOMEM;
3227
3228	if (!odi) {
3229	/*
3230	* Find the inode number associated with the last dir index
3231	* entry. This is very likely the inode with the highest number
3232	* of all inodes that have an entry in the directory. We can
3233	* then use it to avoid future calls to can_rmdir(), when
3234	* processing inodes with a lower number, from having to search
3235	* the parent root b+tree for dir index keys.
3236	*/
3237	key.objectid = dir;
3238	key.type = BTRFS_DIR_INDEX_KEY;
3239	key.offset = (u64)-`1`;
3240
3241	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
3242	if (ret < `0`) {
3243	goto out;
3244	} else if (ret > `0`) {
3245	/ Can't happen, the root is never empty. /
3246	ASSERT(path->slots[`0`] > `0`);
3247	if (WARN_ON(path->slots[`0`] == `0`)) {
3248	ret = -EUCLEAN;
3249	goto out;
3250	}
3251	path->slots[`0`]--;
3252	}
3253
3254	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
3255	if (key.objectid != dir \|\| key.type != BTRFS_DIR_INDEX_KEY) {
3256	/ No index keys, dir can be removed. /
3257	ret = `1`;
3258	goto out;
3259	}
3260
3261	di = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
3262	struct btrfs_dir_item);
3263	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &loc);
3264	dir_high_seq_ino = loc.objectid;
3265	if (sctx->cur_ino < dir_high_seq_ino) {
3266	ret = `0`;
3267	goto out;
3268	}
3269
3270	btrfs_release_path(p: path);
3271	}
3272
3273	key.objectid = dir;
3274	key.type = BTRFS_DIR_INDEX_KEY;
3275	key.offset = (odi ? odi->last_dir_index_offset : `0`);
3276
3277	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
3278	struct waiting_dir_move *dm;
3279
3280	if (found_key.objectid != key.objectid \|\|
3281	found_key.type != key.type)
3282	break;
3283
3284	di = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
3285	struct btrfs_dir_item);
3286	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &loc);
3287
3288	dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid);
3289	last_dir_index_offset = found_key.offset;
3290
3291	dm = get_waiting_dir_move(sctx, ino: loc.objectid);
3292	if (dm) {
3293	dm->rmdir_ino = dir;
3294	dm->rmdir_gen = dir_gen;
3295	ret = `0`;
3296	goto out;
3297	}
3298
3299	if (loc.objectid > sctx->cur_ino) {
3300	ret = `0`;
3301	goto out;
3302	}
3303	}
3304	if (iter_ret < `0`) {
3305	ret = iter_ret;
3306	goto out;
3307	}
3308	free_orphan_dir_info(sctx, odi);
3309
3310	ret = `1`;
3311
3312	out:
3313	btrfs_free_path(p: path);
3314
3315	if (ret)
3316	return ret;
3317
3318	if (!odi) {
3319	odi = add_orphan_dir_info(sctx, dir_ino: dir, dir_gen);
3320	if (IS_ERR(ptr: odi))
3321	return PTR_ERR(ptr: odi);
3322
3323	odi->gen = dir_gen;
3324	}
3325
3326	odi->last_dir_index_offset = last_dir_index_offset;
3327	odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino);
3328
3329	return `0`;
3330	}
3331
3332	static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
3333	{
3334	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
3335
3336	return entry != NULL;
3337	}
3338
3339	static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
3340	{
3341	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
3342	struct rb_node *parent = NULL;
3343	struct waiting_dir_move entry, dm;
3344
3345	dm = kmalloc(size: sizeof(*dm), GFP_KERNEL);
3346	if (!dm)
3347	return -ENOMEM;
3348	dm->ino = ino;
3349	dm->rmdir_ino = `0`;
3350	dm->rmdir_gen = `0`;
3351	dm->orphanized = orphanized;
3352
3353	while (*p) {
3354	parent = *p;
3355	entry = rb_entry(parent, struct waiting_dir_move, node);
3356	if (ino < entry->ino) {
3357	p = &(*p)->rb_left;
3358	} else if (ino > entry->ino) {
3359	p = &(*p)->rb_right;
3360	} else {
3361	kfree(objp: dm);
3362	return -EEXIST;
3363	}
3364	}
3365
3366	rb_link_node(node: &dm->node, parent, rb_link: p);
3367	rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
3368	return `0`;
3369	}
3370
3371	static struct waiting_dir_move *
3372	get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
3373	{
3374	struct rb_node *n = sctx->waiting_dir_moves.rb_node;
3375	struct waiting_dir_move *entry;
3376
3377	while (n) {
3378	entry = rb_entry(n, struct waiting_dir_move, node);
3379	if (ino < entry->ino)
3380	n = n->rb_left;
3381	else if (ino > entry->ino)
3382	n = n->rb_right;
3383	else
3384	return entry;
3385	}
3386	return NULL;
3387	}
3388
3389	static void free_waiting_dir_move(struct send_ctx *sctx,
3390	struct waiting_dir_move *dm)
3391	{
3392	if (!dm)
3393	return;
3394	rb_erase(&dm->node, &sctx->waiting_dir_moves);
3395	kfree(objp: dm);
3396	}
3397
3398	static int add_pending_dir_move(struct send_ctx *sctx,
3399	u64 ino,
3400	u64 ino_gen,
3401	u64 parent_ino,
3402	struct list_head *new_refs,
3403	struct list_head *deleted_refs,
3404	const bool is_orphan)
3405	{
3406	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
3407	struct rb_node *parent = NULL;
3408	struct pending_dir_move entry = NULL, pm;
3409	struct recorded_ref *cur;
3410	int exists = `0`;
3411	int ret;
3412
3413	pm = kmalloc(size: sizeof(*pm), GFP_KERNEL);
3414	if (!pm)
3415	return -ENOMEM;
3416	pm->parent_ino = parent_ino;
3417	pm->ino = ino;
3418	pm->gen = ino_gen;
3419	INIT_LIST_HEAD(list: &pm->list);
3420	INIT_LIST_HEAD(list: &pm->update_refs);
3421	RB_CLEAR_NODE(&pm->node);
3422
3423	while (*p) {
3424	parent = *p;
3425	entry = rb_entry(parent, struct pending_dir_move, node);
3426	if (parent_ino < entry->parent_ino) {
3427	p = &(*p)->rb_left;
3428	} else if (parent_ino > entry->parent_ino) {
3429	p = &(*p)->rb_right;
3430	} else {
3431	exists = `1`;
3432	break;
3433	}
3434	}
3435
3436	list_for_each_entry(cur, deleted_refs, list) {
3437	ret = dup_ref(ref: cur, list: &pm->update_refs);
3438	if (ret < `0`)
3439	goto out;
3440	}
3441	list_for_each_entry(cur, new_refs, list) {
3442	ret = dup_ref(ref: cur, list: &pm->update_refs);
3443	if (ret < `0`)
3444	goto out;
3445	}
3446
3447	ret = add_waiting_dir_move(sctx, ino: pm->ino, orphanized: is_orphan);
3448	if (ret)
3449	goto out;
3450
3451	if (exists) {
3452	list_add_tail(new: &pm->list, head: &entry->list);
3453	} else {
3454	rb_link_node(node: &pm->node, parent, rb_link: p);
3455	rb_insert_color(&pm->node, &sctx->pending_dir_moves);
3456	}
3457	ret = `0`;
3458	out:
3459	if (ret) {
3460	__free_recorded_refs(head: &pm->update_refs);
3461	kfree(objp: pm);
3462	}
3463	return ret;
3464	}
3465
3466	static struct pending_dir_move get_pending_dir_moves(struct* send_ctx *sctx,
3467	u64 parent_ino)
3468	{
3469	struct rb_node *n = sctx->pending_dir_moves.rb_node;
3470	struct pending_dir_move *entry;
3471
3472	while (n) {
3473	entry = rb_entry(n, struct pending_dir_move, node);
3474	if (parent_ino < entry->parent_ino)
3475	n = n->rb_left;
3476	else if (parent_ino > entry->parent_ino)
3477	n = n->rb_right;
3478	else
3479	return entry;
3480	}
3481	return NULL;
3482	}
3483
3484	static int path_loop(struct send_ctx sctx, struct* fs_path *name,
3485	u64 ino, u64 gen, u64 *ancestor_ino)
3486	{
3487	int ret = `0`;
3488	u64 parent_inode = `0`;
3489	u64 parent_gen = `0`;
3490	u64 start_ino = ino;
3491
3492	*ancestor_ino = `0`;
3493	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3494	fs_path_reset(p: name);
3495
3496	if (is_waiting_for_rm(sctx, dir_ino: ino, gen))
3497	break;
3498	if (is_waiting_for_move(sctx, ino)) {
3499	if (*ancestor_ino == `0`)
3500	*ancestor_ino = ino;
3501	ret = get_first_ref(root: sctx->parent_root, ino,
3502	dir: &parent_inode, dir_gen: &parent_gen, name);
3503	} else {
3504	ret = __get_cur_name_and_parent(sctx, ino, gen,
3505	parent_ino: &parent_inode,
3506	parent_gen: &parent_gen, dest: name);
3507	if (ret > `0`) {
3508	ret = `0`;
3509	break;
3510	}
3511	}
3512	if (ret < `0`)
3513	break;
3514	if (parent_inode == start_ino) {
3515	ret = `1`;
3516	if (*ancestor_ino == `0`)
3517	*ancestor_ino = ino;
3518	break;
3519	}
3520	ino = parent_inode;
3521	gen = parent_gen;
3522	}
3523	return ret;
3524	}
3525
3526	static int apply_dir_move(struct send_ctx sctx, struct* pending_dir_move *pm)
3527	{
3528	struct fs_path *from_path = NULL;
3529	struct fs_path *to_path = NULL;
3530	struct fs_path *name = NULL;
3531	u64 orig_progress = sctx->send_progress;
3532	struct recorded_ref *cur;
3533	u64 parent_ino, parent_gen;
3534	struct waiting_dir_move *dm = NULL;
3535	u64 rmdir_ino = `0`;
3536	u64 rmdir_gen;
3537	u64 ancestor;
3538	bool is_orphan;
3539	int ret;
3540
3541	name = fs_path_alloc();
3542	from_path = fs_path_alloc();
3543	if (!name \|\| !from_path) {
3544	ret = -ENOMEM;
3545	goto out;
3546	}
3547
3548	dm = get_waiting_dir_move(sctx, ino: pm->ino);
3549	ASSERT(dm);
3550	rmdir_ino = dm->rmdir_ino;
3551	rmdir_gen = dm->rmdir_gen;
3552	is_orphan = dm->orphanized;
3553	free_waiting_dir_move(sctx, dm);
3554
3555	if (is_orphan) {
3556	ret = gen_unique_name(sctx, ino: pm->ino,
3557	gen: pm->gen, dest: from_path);
3558	} else {
3559	ret = get_first_ref(root: sctx->parent_root, ino: pm->ino,
3560	dir: &parent_ino, dir_gen: &parent_gen, name);
3561	if (ret < `0`)
3562	goto out;
3563	ret = get_cur_path(sctx, ino: parent_ino, gen: parent_gen,
3564	dest: from_path);
3565	if (ret < `0`)
3566	goto out;
3567	ret = fs_path_add_path(p: from_path, p2: name);
3568	}
3569	if (ret < `0`)
3570	goto out;
3571
3572	sctx->send_progress = sctx->cur_ino + `1`;
3573	ret = path_loop(sctx, name, ino: pm->ino, gen: pm->gen, ancestor_ino: &ancestor);
3574	if (ret < `0`)
3575	goto out;
3576	if (ret) {
3577	LIST_HEAD(deleted_refs);
3578	ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3579	ret = add_pending_dir_move(sctx, ino: pm->ino, ino_gen: pm->gen, parent_ino: ancestor,
3580	new_refs: &pm->update_refs, deleted_refs: &deleted_refs,
3581	is_orphan);
3582	if (ret < `0`)
3583	goto out;
3584	if (rmdir_ino) {
3585	dm = get_waiting_dir_move(sctx, ino: pm->ino);
3586	ASSERT(dm);
3587	dm->rmdir_ino = rmdir_ino;
3588	dm->rmdir_gen = rmdir_gen;
3589	}
3590	goto out;
3591	}
3592	fs_path_reset(p: name);
3593	to_path = name;
3594	name = NULL;
3595	ret = get_cur_path(sctx, ino: pm->ino, gen: pm->gen, dest: to_path);
3596	if (ret < `0`)
3597	goto out;
3598
3599	ret = send_rename(sctx, from: from_path, to: to_path);
3600	if (ret < `0`)
3601	goto out;
3602
3603	if (rmdir_ino) {
3604	struct orphan_dir_info *odi;
3605	u64 gen;
3606
3607	odi = get_orphan_dir_info(sctx, dir_ino: rmdir_ino, gen: rmdir_gen);
3608	if (!odi) {
3609	/ already deleted /
3610	goto finish;
3611	}
3612	gen = odi->gen;
3613
3614	ret = can_rmdir(sctx, dir: rmdir_ino, dir_gen: gen);
3615	if (ret < `0`)
3616	goto out;
3617	if (!ret)
3618	goto finish;
3619
3620	name = fs_path_alloc();
3621	if (!name) {
3622	ret = -ENOMEM;
3623	goto out;
3624	}
3625	ret = get_cur_path(sctx, ino: rmdir_ino, gen, dest: name);
3626	if (ret < `0`)
3627	goto out;
3628	ret = send_rmdir(sctx, path: name);
3629	if (ret < `0`)
3630	goto out;
3631	}
3632
3633	finish:
3634	ret = cache_dir_utimes(sctx, dir: pm->ino, gen: pm->gen);
3635	if (ret < `0`)
3636	goto out;
3637
3638	/*
3639	* After rename/move, need to update the utimes of both new parent(s)
3640	* and old parent(s).
3641	*/
3642	list_for_each_entry(cur, &pm->update_refs, list) {
3643	/*
3644	* The parent inode might have been deleted in the send snapshot
3645	*/
3646	ret = get_inode_info(root: sctx->send_root, ino: cur->dir, NULL);
3647	if (ret == -ENOENT) {
3648	ret = `0`;
3649	continue;
3650	}
3651	if (ret < `0`)
3652	goto out;
3653
3654	ret = cache_dir_utimes(sctx, dir: cur->dir, gen: cur->dir_gen);
3655	if (ret < `0`)
3656	goto out;
3657	}
3658
3659	out:
3660	fs_path_free(p: name);
3661	fs_path_free(p: from_path);
3662	fs_path_free(p: to_path);
3663	sctx->send_progress = orig_progress;
3664
3665	return ret;
3666	}
3667
3668	static void free_pending_move(struct send_ctx sctx, struct* pending_dir_move *m)
3669	{
3670	if (!list_empty(head: &m->list))
3671	list_del(entry: &m->list);
3672	if (!RB_EMPTY_NODE(&m->node))
3673	rb_erase(&m->node, &sctx->pending_dir_moves);
3674	__free_recorded_refs(head: &m->update_refs);
3675	kfree(objp: m);
3676	}
3677
3678	static void tail_append_pending_moves(struct send_ctx *sctx,
3679	struct pending_dir_move *moves,
3680	struct list_head *stack)
3681	{
3682	if (list_empty(head: &moves->list)) {
3683	list_add_tail(new: &moves->list, head: stack);
3684	} else {
3685	LIST_HEAD(list);
3686	list_splice_init(list: &moves->list, head: &list);
3687	list_add_tail(new: &moves->list, head: stack);
3688	list_splice_tail(list: &list, head: stack);
3689	}
3690	if (!RB_EMPTY_NODE(&moves->node)) {
3691	rb_erase(&moves->node, &sctx->pending_dir_moves);
3692	RB_CLEAR_NODE(&moves->node);
3693	}
3694	}
3695
3696	static int apply_children_dir_moves(struct send_ctx *sctx)
3697	{
3698	struct pending_dir_move *pm;
3699	LIST_HEAD(stack);
3700	u64 parent_ino = sctx->cur_ino;
3701	int ret = `0`;
3702
3703	pm = get_pending_dir_moves(sctx, parent_ino);
3704	if (!pm)
3705	return `0`;
3706
3707	tail_append_pending_moves(sctx, moves: pm, stack: &stack);
3708
3709	while (!list_empty(head: &stack)) {
3710	pm = list_first_entry(&stack, struct pending_dir_move, list);
3711	parent_ino = pm->ino;
3712	ret = apply_dir_move(sctx, pm);
3713	free_pending_move(sctx, m: pm);
3714	if (ret)
3715	goto out;
3716	pm = get_pending_dir_moves(sctx, parent_ino);
3717	if (pm)
3718	tail_append_pending_moves(sctx, moves: pm, stack: &stack);
3719	}
3720	return `0`;
3721
3722	out:
3723	while (!list_empty(head: &stack)) {
3724	pm = list_first_entry(&stack, struct pending_dir_move, list);
3725	free_pending_move(sctx, m: pm);
3726	}
3727	return ret;
3728	}
3729
3730	/*
3731	* We might need to delay a directory rename even when no ancestor directory
3732	* (in the send root) with a higher inode number than ours (sctx->cur_ino) was
3733	* renamed. This happens when we rename a directory to the old name (the name
3734	* in the parent root) of some other unrelated directory that got its rename
3735	* delayed due to some ancestor with higher number that got renamed.
3736	*
3737	* Example:
3738	*
3739	* Parent snapshot:
3740	* . (ino 256)
3741	* \|---- a/ (ino 257)
3742	* \| \|---- file (ino 260)
3743	* \|
3744	* \|---- b/ (ino 258)
3745	* \|---- c/ (ino 259)
3746	*
3747	* Send snapshot:
3748	* . (ino 256)
3749	* \|---- a/ (ino 258)
3750	* \|---- x/ (ino 259)
3751	* \|---- y/ (ino 257)
3752	* \|----- file (ino 260)
3753	*
3754	* Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
3755	* from 'a' to 'x/y' happening first, which in turn depends on the rename of
3756	* inode 259 from 'c' to 'x'. So the order of rename commands the send stream
3757	* must issue is:
3758	*
3759	* 1 - rename 259 from 'c' to 'x'
3760	* 2 - rename 257 from 'a' to 'x/y'
3761	* 3 - rename 258 from 'b' to 'a'
3762	*
3763	* Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
3764	* be done right away and < 0 on error.
3765	*/
3766	static int wait_for_dest_dir_move(struct send_ctx *sctx,
3767	struct recorded_ref *parent_ref,
3768	const bool is_orphan)
3769	{
3770	struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
3771	struct btrfs_path *path;
3772	struct btrfs_key key;
3773	struct btrfs_key di_key;
3774	struct btrfs_dir_item *di;
3775	u64 left_gen;
3776	u64 right_gen;
3777	int ret = `0`;
3778	struct waiting_dir_move *wdm;
3779
3780	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
3781	return `0`;
3782
3783	path = alloc_path_for_send();
3784	if (!path)
3785	return -ENOMEM;
3786
3787	key.objectid = parent_ref->dir;
3788	key.type = BTRFS_DIR_ITEM_KEY;
3789	key.offset = btrfs_name_hash(name: parent_ref->name, len: parent_ref->name_len);
3790
3791	ret = btrfs_search_slot(NULL, root: sctx->parent_root, key: &key, p: path, ins_len: `0`, cow: `0`);
3792	if (ret < `0`) {
3793	goto out;
3794	} else if (ret > `0`) {
3795	ret = `0`;
3796	goto out;
3797	}
3798
3799	di = btrfs_match_dir_item_name(fs_info, path, name: parent_ref->name,
3800	name_len: parent_ref->name_len);
3801	if (!di) {
3802	ret = `0`;
3803	goto out;
3804	}
3805	/*
3806	* di_key.objectid has the number of the inode that has a dentry in the
3807	* parent directory with the same name that sctx->cur_ino is being
3808	* renamed to. We need to check if that inode is in the send root as
3809	* well and if it is currently marked as an inode with a pending rename,
3810	* if it is, we need to delay the rename of sctx->cur_ino as well, so
3811	* that it happens after that other inode is renamed.
3812	*/
3813	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &di_key);
3814	if (di_key.type != BTRFS_INODE_ITEM_KEY) {
3815	ret = `0`;
3816	goto out;
3817	}
3818
3819	ret = get_inode_gen(root: sctx->parent_root, ino: di_key.objectid, gen: &left_gen);
3820	if (ret < `0`)
3821	goto out;
3822	ret = get_inode_gen(root: sctx->send_root, ino: di_key.objectid, gen: &right_gen);
3823	if (ret < `0`) {
3824	if (ret == -ENOENT)
3825	ret = `0`;
3826	goto out;
3827	}
3828
3829	/ Different inode, no need to delay the rename of sctx->cur_ino /
3830	if (right_gen != left_gen) {
3831	ret = `0`;
3832	goto out;
3833	}
3834
3835	wdm = get_waiting_dir_move(sctx, ino: di_key.objectid);
3836	if (wdm && !wdm->orphanized) {
3837	ret = add_pending_dir_move(sctx,
3838	ino: sctx->cur_ino,
3839	ino_gen: sctx->cur_inode_gen,
3840	parent_ino: di_key.objectid,
3841	new_refs: &sctx->new_refs,
3842	deleted_refs: &sctx->deleted_refs,
3843	is_orphan);
3844	if (!ret)
3845	ret = `1`;
3846	}
3847	out:
3848	btrfs_free_path(p: path);
3849	return ret;
3850	}
3851
3852	/*
3853	* Check if inode ino2, or any of its ancestors, is inode ino1.
3854	* Return 1 if true, 0 if false and < 0 on error.
3855	*/
3856	static int check_ino_in_path(struct btrfs_root *root,
3857	const u64 ino1,
3858	const u64 ino1_gen,
3859	const u64 ino2,
3860	const u64 ino2_gen,
3861	struct fs_path *fs_path)
3862	{
3863	u64 ino = ino2;
3864
3865	if (ino1 == ino2)
3866	return ino1_gen == ino2_gen;
3867
3868	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3869	u64 parent;
3870	u64 parent_gen;
3871	int ret;
3872
3873	fs_path_reset(p: fs_path);
3874	ret = get_first_ref(root, ino, dir: &parent, dir_gen: &parent_gen, name: fs_path);
3875	if (ret < `0`)
3876	return ret;
3877	if (parent == ino1)
3878	return parent_gen == ino1_gen;
3879	ino = parent;
3880	}
3881	return `0`;
3882	}
3883
3884	/*
3885	* Check if inode ino1 is an ancestor of inode ino2 in the given root for any
3886	* possible path (in case ino2 is not a directory and has multiple hard links).
3887	* Return 1 if true, 0 if false and < 0 on error.
3888	*/
3889	static int is_ancestor(struct btrfs_root *root,
3890	const u64 ino1,
3891	const u64 ino1_gen,
3892	const u64 ino2,
3893	struct fs_path *fs_path)
3894	{
3895	bool free_fs_path = false;
3896	int ret = `0`;
3897	int iter_ret = `0`;
3898	struct btrfs_path *path = NULL;
3899	struct btrfs_key key;
3900
3901	if (!fs_path) {
3902	fs_path = fs_path_alloc();
3903	if (!fs_path)
3904	return -ENOMEM;
3905	free_fs_path = true;
3906	}
3907
3908	path = alloc_path_for_send();
3909	if (!path) {
3910	ret = -ENOMEM;
3911	goto out;
3912	}
3913
3914	key.objectid = ino2;
3915	key.type = BTRFS_INODE_REF_KEY;
3916	key.offset = `0`;
3917
3918	btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
3919	struct extent_buffer *leaf = path->nodes[`0`];
3920	int slot = path->slots[`0`];
3921	u32 cur_offset = `0`;
3922	u32 item_size;
3923
3924	if (key.objectid != ino2)
3925	break;
3926	if (key.type != BTRFS_INODE_REF_KEY &&
3927	key.type != BTRFS_INODE_EXTREF_KEY)
3928	break;
3929
3930	item_size = btrfs_item_size(eb: leaf, slot);
3931	while (cur_offset < item_size) {
3932	u64 parent;
3933	u64 parent_gen;
3934
3935	if (key.type == BTRFS_INODE_EXTREF_KEY) {
3936	unsigned long ptr;
3937	struct btrfs_inode_extref *extref;
3938
3939	ptr = btrfs_item_ptr_offset(leaf, slot);
3940	extref = (struct btrfs_inode_extref *)
3941	(ptr + cur_offset);
3942	parent = btrfs_inode_extref_parent(eb: leaf,
3943	s: extref);
3944	cur_offset += sizeof(*extref);
3945	cur_offset += btrfs_inode_extref_name_len(eb: leaf,
3946	s: extref);
3947	} else {
3948	parent = key.offset;
3949	cur_offset = item_size;
3950	}
3951
3952	ret = get_inode_gen(root, ino: parent, gen: &parent_gen);
3953	if (ret < `0`)
3954	goto out;
3955	ret = check_ino_in_path(root, ino1, ino1_gen,
3956	ino2: parent, ino2_gen: parent_gen, fs_path);
3957	if (ret)
3958	goto out;
3959	}
3960	}
3961	ret = `0`;
3962	if (iter_ret < `0`)
3963	ret = iter_ret;
3964
3965	out:
3966	btrfs_free_path(p: path);
3967	if (free_fs_path)
3968	fs_path_free(p: fs_path);
3969	return ret;
3970	}
3971
3972	static int wait_for_parent_move(struct send_ctx *sctx,
3973	struct recorded_ref *parent_ref,
3974	const bool is_orphan)
3975	{
3976	int ret = `0`;
3977	u64 ino = parent_ref->dir;
3978	u64 ino_gen = parent_ref->dir_gen;
3979	u64 parent_ino_before, parent_ino_after;
3980	struct fs_path *path_before = NULL;
3981	struct fs_path *path_after = NULL;
3982	int len1, len2;
3983
3984	path_after = fs_path_alloc();
3985	path_before = fs_path_alloc();
3986	if (!path_after \|\| !path_before) {
3987	ret = -ENOMEM;
3988	goto out;
3989	}
3990
3991	/*
3992	* Our current directory inode may not yet be renamed/moved because some
3993	* ancestor (immediate or not) has to be renamed/moved first. So find if
3994	* such ancestor exists and make sure our own rename/move happens after
3995	* that ancestor is processed to avoid path build infinite loops (done
3996	* at get_cur_path()).
3997	*/
3998	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3999	u64 parent_ino_after_gen;
4000
4001	if (is_waiting_for_move(sctx, ino)) {
4002	/*
4003	* If the current inode is an ancestor of ino in the
4004	* parent root, we need to delay the rename of the
4005	* current inode, otherwise don't delayed the rename
4006	* because we can end up with a circular dependency
4007	* of renames, resulting in some directories never
4008	* getting the respective rename operations issued in
4009	* the send stream or getting into infinite path build
4010	* loops.
4011	*/
4012	ret = is_ancestor(root: sctx->parent_root,
4013	ino1: sctx->cur_ino, ino1_gen: sctx->cur_inode_gen,
4014	ino2: ino, fs_path: path_before);
4015	if (ret)
4016	break;
4017	}
4018
4019	fs_path_reset(p: path_before);
4020	fs_path_reset(p: path_after);
4021
4022	ret = get_first_ref(root: sctx->send_root, ino, dir: &parent_ino_after,
4023	dir_gen: &parent_ino_after_gen, name: path_after);
4024	if (ret < `0`)
4025	goto out;
4026	ret = get_first_ref(root: sctx->parent_root, ino, dir: &parent_ino_before,
4027	NULL, name: path_before);
4028	if (ret < `0` && ret != -ENOENT) {
4029	goto out;
4030	} else if (ret == -ENOENT) {
4031	ret = `0`;
4032	break;
4033	}
4034
4035	len1 = fs_path_len(p: path_before);
4036	len2 = fs_path_len(p: path_after);
4037	if (ino > sctx->cur_ino &&
4038	(parent_ino_before != parent_ino_after \|\| len1 != len2 \|\|
4039	memcmp(p: path_before->start, q: path_after->start, size: len1))) {
4040	u64 parent_ino_gen;
4041
4042	ret = get_inode_gen(root: sctx->parent_root, ino, gen: &parent_ino_gen);
4043	if (ret < `0`)
4044	goto out;
4045	if (ino_gen == parent_ino_gen) {
4046	ret = `1`;
4047	break;
4048	}
4049	}
4050	ino = parent_ino_after;
4051	ino_gen = parent_ino_after_gen;
4052	}
4053
4054	out:
4055	fs_path_free(p: path_before);
4056	fs_path_free(p: path_after);
4057
4058	if (ret == `1`) {
4059	ret = add_pending_dir_move(sctx,
4060	ino: sctx->cur_ino,
4061	ino_gen: sctx->cur_inode_gen,
4062	parent_ino: ino,
4063	new_refs: &sctx->new_refs,
4064	deleted_refs: &sctx->deleted_refs,
4065	is_orphan);
4066	if (!ret)
4067	ret = `1`;
4068	}
4069
4070	return ret;
4071	}
4072
4073	static int update_ref_path(struct send_ctx sctx, struct* recorded_ref *ref)
4074	{
4075	int ret;
4076	struct fs_path *new_path;
4077
4078	/*
4079	* Our reference's name member points to its full_path member string, so
4080	* we use here a new path.
4081	*/
4082	new_path = fs_path_alloc();
4083	if (!new_path)
4084	return -ENOMEM;
4085
4086	ret = get_cur_path(sctx, ino: ref->dir, gen: ref->dir_gen, dest: new_path);
4087	if (ret < `0`) {
4088	fs_path_free(p: new_path);
4089	return ret;
4090	}
4091	ret = fs_path_add(p: new_path, name: ref->name, name_len: ref->name_len);
4092	if (ret < `0`) {
4093	fs_path_free(p: new_path);
4094	return ret;
4095	}
4096
4097	fs_path_free(p: ref->full_path);
4098	set_ref_path(ref, path: new_path);
4099
4100	return `0`;
4101	}
4102
4103	/*
4104	* When processing the new references for an inode we may orphanize an existing
4105	* directory inode because its old name conflicts with one of the new references
4106	* of the current inode. Later, when processing another new reference of our
4107	* inode, we might need to orphanize another inode, but the path we have in the
4108	* reference reflects the pre-orphanization name of the directory we previously
4109	* orphanized. For example:
4110	*
4111	* parent snapshot looks like:
4112	*
4113	* . (ino 256)
4114	* \|----- f1 (ino 257)
4115	* \|----- f2 (ino 258)
4116	* \|----- d1/ (ino 259)
4117	* \|----- d2/ (ino 260)
4118	*
4119	* send snapshot looks like:
4120	*
4121	* . (ino 256)
4122	* \|----- d1 (ino 258)
4123	* \|----- f2/ (ino 259)
4124	* \|----- f2_link/ (ino 260)
4125	* \| \|----- f1 (ino 257)
4126	* \|
4127	* \|----- d2 (ino 258)
4128	*
4129	* When processing inode 257 we compute the name for inode 259 as "d1", and we
4130	* cache it in the name cache. Later when we start processing inode 258, when
4131	* collecting all its new references we set a full path of "d1/d2" for its new
4132	* reference with name "d2". When we start processing the new references we
4133	* start by processing the new reference with name "d1", and this results in
4134	* orphanizing inode 259, since its old reference causes a conflict. Then we
4135	* move on the next new reference, with name "d2", and we find out we must
4136	* orphanize inode 260, as its old reference conflicts with ours - but for the
4137	* orphanization we use a source path corresponding to the path we stored in the
4138	* new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
4139	* receiver fail since the path component "d1/" no longer exists, it was renamed
4140	* to "o259-6-0/" when processing the previous new reference. So in this case we
4141	* must recompute the path in the new reference and use it for the new
4142	* orphanization operation.
4143	*/
4144	static int refresh_ref_path(struct send_ctx sctx, struct* recorded_ref *ref)
4145	{
4146	char *name;
4147	int ret;
4148
4149	name = kmemdup(p: ref->name, size: ref->name_len, GFP_KERNEL);
4150	if (!name)
4151	return -ENOMEM;
4152
4153	fs_path_reset(p: ref->full_path);
4154	ret = get_cur_path(sctx, ino: ref->dir, gen: ref->dir_gen, dest: ref->full_path);
4155	if (ret < `0`)
4156	goto out;
4157
4158	ret = fs_path_add(p: ref->full_path, name, name_len: ref->name_len);
4159	if (ret < `0`)
4160	goto out;
4161
4162	/ Update the reference's base name pointer. /
4163	set_ref_path(ref, path: ref->full_path);
4164	out:
4165	kfree(objp: name);
4166	return ret;
4167	}
4168
4169	/*
4170	* This does all the move/link/unlink/rmdir magic.
4171	*/
4172	static int process_recorded_refs(struct send_ctx sctx, int* *pending_move)
4173	{
4174	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
4175	int ret = `0`;
4176	struct recorded_ref *cur;
4177	struct recorded_ref *cur2;
4178	LIST_HEAD(check_dirs);
4179	struct fs_path *valid_path = NULL;
4180	u64 ow_inode = `0`;
4181	u64 ow_gen;
4182	u64 ow_mode;
4183	int did_overwrite = `0`;
4184	int is_orphan = `0`;
4185	u64 last_dir_ino_rm = `0`;
4186	bool can_rename = true;
4187	bool orphanized_dir = false;
4188	bool orphanized_ancestor = false;
4189
4190	btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
4191
4192	/*
4193	* This should never happen as the root dir always has the same ref
4194	* which is always '..'
4195	*/
4196	if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) {
4197	btrfs_err(fs_info,
4198	"send: unexpected inode %llu in process_recorded_refs()",
4199	sctx->cur_ino);
4200	ret = -EINVAL;
4201	goto out;
4202	}
4203
4204	valid_path = fs_path_alloc();
4205	if (!valid_path) {
4206	ret = -ENOMEM;
4207	goto out;
4208	}
4209
4210	/*
4211	* First, check if the first ref of the current inode was overwritten
4212	* before. If yes, we know that the current inode was already orphanized
4213	* and thus use the orphan name. If not, we can use get_cur_path to
4214	* get the path of the first ref as it would like while receiving at
4215	* this point in time.
4216	* New inodes are always orphan at the beginning, so force to use the
4217	* orphan name in this case.
4218	* The first ref is stored in valid_path and will be updated if it
4219	* gets moved around.
4220	*/
4221	if (!sctx->cur_inode_new) {
4222	ret = did_overwrite_first_ref(sctx, ino: sctx->cur_ino,
4223	gen: sctx->cur_inode_gen);
4224	if (ret < `0`)
4225	goto out;
4226	if (ret)
4227	did_overwrite = `1`;
4228	}
4229	if (sctx->cur_inode_new \|\| did_overwrite) {
4230	ret = gen_unique_name(sctx, ino: sctx->cur_ino,
4231	gen: sctx->cur_inode_gen, dest: valid_path);
4232	if (ret < `0`)
4233	goto out;
4234	is_orphan = `1`;
4235	} else {
4236	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
4237	dest: valid_path);
4238	if (ret < `0`)
4239	goto out;
4240	}
4241
4242	/*
4243	* Before doing any rename and link operations, do a first pass on the
4244	* new references to orphanize any unprocessed inodes that may have a
4245	* reference that conflicts with one of the new references of the current
4246	* inode. This needs to happen first because a new reference may conflict
4247	* with the old reference of a parent directory, so we must make sure
4248	* that the path used for link and rename commands don't use an
4249	* orphanized name when an ancestor was not yet orphanized.
4250	*
4251	* Example:
4252	*
4253	* Parent snapshot:
4254	*
4255	* . (ino 256)
4256	* \|----- testdir/ (ino 259)
4257	* \| \|----- a (ino 257)
4258	* \|
4259	* \|----- b (ino 258)
4260	*
4261	* Send snapshot:
4262	*
4263	* . (ino 256)
4264	* \|----- testdir_2/ (ino 259)
4265	* \| \|----- a (ino 260)
4266	* \|
4267	* \|----- testdir (ino 257)
4268	* \|----- b (ino 257)
4269	* \|----- b2 (ino 258)
4270	*
4271	* Processing the new reference for inode 257 with name "b" may happen
4272	* before processing the new reference with name "testdir". If so, we
4273	* must make sure that by the time we send a link command to create the
4274	* hard link "b", inode 259 was already orphanized, since the generated
4275	* path in "valid_path" already contains the orphanized name for 259.
4276	* We are processing inode 257, so only later when processing 259 we do
4277	* the rename operation to change its temporary (orphanized) name to
4278	* "testdir_2".
4279	*/
4280	list_for_each_entry(cur, &sctx->new_refs, list) {
4281	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4282	if (ret < `0`)
4283	goto out;
4284	if (ret == inode_state_will_create)
4285	continue;
4286
4287	/*
4288	* Check if this new ref would overwrite the first ref of another
4289	* unprocessed inode. If yes, orphanize the overwritten inode.
4290	* If we find an overwritten ref that is not the first ref,
4291	* simply unlink it.
4292	*/
4293	ret = will_overwrite_ref(sctx, dir: cur->dir, dir_gen: cur->dir_gen,
4294	name: cur->name, name_len: cur->name_len,
4295	who_ino: &ow_inode, who_gen: &ow_gen, who_mode: &ow_mode);
4296	if (ret < `0`)
4297	goto out;
4298	if (ret) {
4299	ret = is_first_ref(root: sctx->parent_root,
4300	ino: ow_inode, dir: cur->dir, name: cur->name,
4301	name_len: cur->name_len);
4302	if (ret < `0`)
4303	goto out;
4304	if (ret) {
4305	struct name_cache_entry *nce;
4306	struct waiting_dir_move *wdm;
4307
4308	if (orphanized_dir) {
4309	ret = refresh_ref_path(sctx, ref: cur);
4310	if (ret < `0`)
4311	goto out;
4312	}
4313
4314	ret = orphanize_inode(sctx, ino: ow_inode, gen: ow_gen,
4315	path: cur->full_path);
4316	if (ret < `0`)
4317	goto out;
4318	if (S_ISDIR(ow_mode))
4319	orphanized_dir = true;
4320
4321	/*
4322	* If ow_inode has its rename operation delayed
4323	* make sure that its orphanized name is used in
4324	* the source path when performing its rename
4325	* operation.
4326	*/
4327	wdm = get_waiting_dir_move(sctx, ino: ow_inode);
4328	if (wdm)
4329	wdm->orphanized = true;
4330
4331	/*
4332	* Make sure we clear our orphanized inode's
4333	* name from the name cache. This is because the
4334	* inode ow_inode might be an ancestor of some
4335	* other inode that will be orphanized as well
4336	* later and has an inode number greater than
4337	* sctx->send_progress. We need to prevent
4338	* future name lookups from using the old name
4339	* and get instead the orphan name.
4340	*/
4341	nce = name_cache_search(sctx, ino: ow_inode, gen: ow_gen);
4342	if (nce)
4343	btrfs_lru_cache_remove(cache: &sctx->name_cache,
4344	entry: &nce->entry);
4345
4346	/*
4347	* ow_inode might currently be an ancestor of
4348	* cur_ino, therefore compute valid_path (the
4349	* current path of cur_ino) again because it
4350	* might contain the pre-orphanization name of
4351	* ow_inode, which is no longer valid.
4352	*/
4353	ret = is_ancestor(root: sctx->parent_root,
4354	ino1: ow_inode, ino1_gen: ow_gen,
4355	ino2: sctx->cur_ino, NULL);
4356	if (ret > `0`) {
4357	orphanized_ancestor = true;
4358	fs_path_reset(p: valid_path);
4359	ret = get_cur_path(sctx, ino: sctx->cur_ino,
4360	gen: sctx->cur_inode_gen,
4361	dest: valid_path);
4362	}
4363	if (ret < `0`)
4364	goto out;
4365	} else {
4366	/*
4367	* If we previously orphanized a directory that
4368	* collided with a new reference that we already
4369	* processed, recompute the current path because
4370	* that directory may be part of the path.
4371	*/
4372	if (orphanized_dir) {
4373	ret = refresh_ref_path(sctx, ref: cur);
4374	if (ret < `0`)
4375	goto out;
4376	}
4377	ret = send_unlink(sctx, path: cur->full_path);
4378	if (ret < `0`)
4379	goto out;
4380	}
4381	}
4382
4383	}
4384
4385	list_for_each_entry(cur, &sctx->new_refs, list) {
4386	/*
4387	* We may have refs where the parent directory does not exist
4388	* yet. This happens if the parent directories inum is higher
4389	* than the current inum. To handle this case, we create the
4390	* parent directory out of order. But we need to check if this
4391	* did already happen before due to other refs in the same dir.
4392	*/
4393	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4394	if (ret < `0`)
4395	goto out;
4396	if (ret == inode_state_will_create) {
4397	ret = `0`;
4398	/*
4399	* First check if any of the current inodes refs did
4400	* already create the dir.
4401	*/
4402	list_for_each_entry(cur2, &sctx->new_refs, list) {
4403	if (cur == cur2)
4404	break;
4405	if (cur2->dir == cur->dir) {
4406	ret = `1`;
4407	break;
4408	}
4409	}
4410
4411	/*
4412	* If that did not happen, check if a previous inode
4413	* did already create the dir.
4414	*/
4415	if (!ret)
4416	ret = did_create_dir(sctx, dir: cur->dir);
4417	if (ret < `0`)
4418	goto out;
4419	if (!ret) {
4420	ret = send_create_inode(sctx, ino: cur->dir);
4421	if (ret < `0`)
4422	goto out;
4423	cache_dir_created(sctx, dir: cur->dir);
4424	}
4425	}
4426
4427	if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
4428	ret = wait_for_dest_dir_move(sctx, parent_ref: cur, is_orphan);
4429	if (ret < `0`)
4430	goto out;
4431	if (ret == `1`) {
4432	can_rename = false;
4433	*pending_move = `1`;
4434	}
4435	}
4436
4437	if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
4438	can_rename) {
4439	ret = wait_for_parent_move(sctx, parent_ref: cur, is_orphan);
4440	if (ret < `0`)
4441	goto out;
4442	if (ret == `1`) {
4443	can_rename = false;
4444	*pending_move = `1`;
4445	}
4446	}
4447
4448	/*
4449	* link/move the ref to the new place. If we have an orphan
4450	* inode, move it and update valid_path. If not, link or move
4451	* it depending on the inode mode.
4452	*/
4453	if (is_orphan && can_rename) {
4454	ret = send_rename(sctx, from: valid_path, to: cur->full_path);
4455	if (ret < `0`)
4456	goto out;
4457	is_orphan = `0`;
4458	ret = fs_path_copy(p: valid_path, from: cur->full_path);
4459	if (ret < `0`)
4460	goto out;
4461	} else if (can_rename) {
4462	if (S_ISDIR(sctx->cur_inode_mode)) {
4463	/*
4464	* Dirs can't be linked, so move it. For moved
4465	* dirs, we always have one new and one deleted
4466	* ref. The deleted ref is ignored later.
4467	*/
4468	ret = send_rename(sctx, from: valid_path,
4469	to: cur->full_path);
4470	if (!ret)
4471	ret = fs_path_copy(p: valid_path,
4472	from: cur->full_path);
4473	if (ret < `0`)
4474	goto out;
4475	} else {
4476	/*
4477	* We might have previously orphanized an inode
4478	* which is an ancestor of our current inode,
4479	* so our reference's full path, which was
4480	* computed before any such orphanizations, must
4481	* be updated.
4482	*/
4483	if (orphanized_dir) {
4484	ret = update_ref_path(sctx, ref: cur);
4485	if (ret < `0`)
4486	goto out;
4487	}
4488	ret = send_link(sctx, path: cur->full_path,
4489	lnk: valid_path);
4490	if (ret < `0`)
4491	goto out;
4492	}
4493	}
4494	ret = dup_ref(ref: cur, list: &check_dirs);
4495	if (ret < `0`)
4496	goto out;
4497	}
4498
4499	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
4500	/*
4501	* Check if we can already rmdir the directory. If not,
4502	* orphanize it. For every dir item inside that gets deleted
4503	* later, we do this check again and rmdir it then if possible.
4504	* See the use of check_dirs for more details.
4505	*/
4506	ret = can_rmdir(sctx, dir: sctx->cur_ino, dir_gen: sctx->cur_inode_gen);
4507	if (ret < `0`)
4508	goto out;
4509	if (ret) {
4510	ret = send_rmdir(sctx, path: valid_path);
4511	if (ret < `0`)
4512	goto out;
4513	} else if (!is_orphan) {
4514	ret = orphanize_inode(sctx, ino: sctx->cur_ino,
4515	gen: sctx->cur_inode_gen, path: valid_path);
4516	if (ret < `0`)
4517	goto out;
4518	is_orphan = `1`;
4519	}
4520
4521	list_for_each_entry(cur, &sctx->deleted_refs, list) {
4522	ret = dup_ref(ref: cur, list: &check_dirs);
4523	if (ret < `0`)
4524	goto out;
4525	}
4526	} else if (S_ISDIR(sctx->cur_inode_mode) &&
4527	!list_empty(head: &sctx->deleted_refs)) {
4528	/*
4529	* We have a moved dir. Add the old parent to check_dirs
4530	*/
4531	cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
4532	list);
4533	ret = dup_ref(ref: cur, list: &check_dirs);
4534	if (ret < `0`)
4535	goto out;
4536	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
4537	/*
4538	* We have a non dir inode. Go through all deleted refs and
4539	* unlink them if they were not already overwritten by other
4540	* inodes.
4541	*/
4542	list_for_each_entry(cur, &sctx->deleted_refs, list) {
4543	ret = did_overwrite_ref(sctx, dir: cur->dir, dir_gen: cur->dir_gen,
4544	ino: sctx->cur_ino, ino_gen: sctx->cur_inode_gen,
4545	name: cur->name, name_len: cur->name_len);
4546	if (ret < `0`)
4547	goto out;
4548	if (!ret) {
4549	/*
4550	* If we orphanized any ancestor before, we need
4551	* to recompute the full path for deleted names,
4552	* since any such path was computed before we
4553	* processed any references and orphanized any
4554	* ancestor inode.
4555	*/
4556	if (orphanized_ancestor) {
4557	ret = update_ref_path(sctx, ref: cur);
4558	if (ret < `0`)
4559	goto out;
4560	}
4561	ret = send_unlink(sctx, path: cur->full_path);
4562	if (ret < `0`)
4563	goto out;
4564	}
4565	ret = dup_ref(ref: cur, list: &check_dirs);
4566	if (ret < `0`)
4567	goto out;
4568	}
4569	/*
4570	* If the inode is still orphan, unlink the orphan. This may
4571	* happen when a previous inode did overwrite the first ref
4572	* of this inode and no new refs were added for the current
4573	* inode. Unlinking does not mean that the inode is deleted in
4574	* all cases. There may still be links to this inode in other
4575	* places.
4576	*/
4577	if (is_orphan) {
4578	ret = send_unlink(sctx, path: valid_path);
4579	if (ret < `0`)
4580	goto out;
4581	}
4582	}
4583
4584	/*
4585	* We did collect all parent dirs where cur_inode was once located. We
4586	* now go through all these dirs and check if they are pending for
4587	* deletion and if it's finally possible to perform the rmdir now.
4588	* We also update the inode stats of the parent dirs here.
4589	*/
4590	list_for_each_entry(cur, &check_dirs, list) {
4591	/*
4592	* In case we had refs into dirs that were not processed yet,
4593	* we don't need to do the utime and rmdir logic for these dirs.
4594	* The dir will be processed later.
4595	*/
4596	if (cur->dir > sctx->cur_ino)
4597	continue;
4598
4599	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4600	if (ret < `0`)
4601	goto out;
4602
4603	if (ret == inode_state_did_create \|\|
4604	ret == inode_state_no_change) {
4605	ret = cache_dir_utimes(sctx, dir: cur->dir, gen: cur->dir_gen);
4606	if (ret < `0`)
4607	goto out;
4608	} else if (ret == inode_state_did_delete &&
4609	cur->dir != last_dir_ino_rm) {
4610	ret = can_rmdir(sctx, dir: cur->dir, dir_gen: cur->dir_gen);
4611	if (ret < `0`)
4612	goto out;
4613	if (ret) {
4614	ret = get_cur_path(sctx, ino: cur->dir,
4615	gen: cur->dir_gen, dest: valid_path);
4616	if (ret < `0`)
4617	goto out;
4618	ret = send_rmdir(sctx, path: valid_path);
4619	if (ret < `0`)
4620	goto out;
4621	last_dir_ino_rm = cur->dir;
4622	}
4623	}
4624	}
4625
4626	ret = `0`;
4627
4628	out:
4629	__free_recorded_refs(head: &check_dirs);
4630	free_recorded_refs(sctx);
4631	fs_path_free(p: valid_path);
4632	return ret;
4633	}
4634
4635	static int rbtree_ref_comp(const void k, const* struct rb_node *node)
4636	{
4637	const struct recorded_ref *data = k;
4638	const struct recorded_ref ref = rb_entry(node, struct* recorded_ref, node);
4639	int result;
4640
4641	if (data->dir > ref->dir)
4642	return `1`;
4643	if (data->dir < ref->dir)
4644	return -`1`;
4645	if (data->dir_gen > ref->dir_gen)
4646	return `1`;
4647	if (data->dir_gen < ref->dir_gen)
4648	return -`1`;
4649	if (data->name_len > ref->name_len)
4650	return `1`;
4651	if (data->name_len < ref->name_len)
4652	return -`1`;
4653	result = strcmp(data->name, ref->name);
4654	if (result > `0`)
4655	return `1`;
4656	if (result < `0`)
4657	return -`1`;
4658	return `0`;
4659	}
4660
4661	static bool rbtree_ref_less(struct rb_node node, const* struct rb_node *parent)
4662	{
4663	const struct recorded_ref entry = rb_entry(node, struct* recorded_ref, node);
4664
4665	return rbtree_ref_comp(k: entry, node: parent) < `0`;
4666	}
4667
4668	static int record_ref_in_tree(struct rb_root root, struct* list_head *refs,
4669	struct fs_path *name, u64 dir, u64 dir_gen,
4670	struct send_ctx *sctx)
4671	{
4672	int ret = `0`;
4673	struct fs_path *path = NULL;
4674	struct recorded_ref *ref = NULL;
4675
4676	path = fs_path_alloc();
4677	if (!path) {
4678	ret = -ENOMEM;
4679	goto out;
4680	}
4681
4682	ref = recorded_ref_alloc();
4683	if (!ref) {
4684	ret = -ENOMEM;
4685	goto out;
4686	}
4687
4688	ret = get_cur_path(sctx, ino: dir, gen: dir_gen, dest: path);
4689	if (ret < `0`)
4690	goto out;
4691	ret = fs_path_add_path(p: path, p2: name);
4692	if (ret < `0`)
4693	goto out;
4694
4695	ref->dir = dir;
4696	ref->dir_gen = dir_gen;
4697	set_ref_path(ref, path);
4698	list_add_tail(new: &ref->list, head: refs);
4699	rb_add(node: &ref->node, tree: root, less: rbtree_ref_less);
4700	ref->root = root;
4701	out:
4702	if (ret) {
4703	if (path && (!ref \|\| !ref->full_path))
4704	fs_path_free(p: path);
4705	recorded_ref_free(ref);
4706	}
4707	return ret;
4708	}
4709
4710	static int record_new_ref_if_needed(int num, u64 dir, int index,
4711	struct fs_path name, void* *ctx)
4712	{
4713	int ret = `0`;
4714	struct send_ctx *sctx = ctx;
4715	struct rb_node *node = NULL;
4716	struct recorded_ref data;
4717	struct recorded_ref *ref;
4718	u64 dir_gen;
4719
4720	ret = get_inode_gen(root: sctx->send_root, ino: dir, gen: &dir_gen);
4721	if (ret < `0`)
4722	goto out;
4723
4724	data.dir = dir;
4725	data.dir_gen = dir_gen;
4726	set_ref_path(ref: &data, path: name);
4727	node = rb_find(key: &data, tree: &sctx->rbtree_deleted_refs, cmp: rbtree_ref_comp);
4728	if (node) {
4729	ref = rb_entry(node, struct recorded_ref, node);
4730	recorded_ref_free(ref);
4731	} else {
4732	ret = record_ref_in_tree(root: &sctx->rbtree_new_refs,
4733	refs: &sctx->new_refs, name, dir, dir_gen,
4734	sctx);
4735	}
4736	out:
4737	return ret;
4738	}
4739
4740	static int record_deleted_ref_if_needed(int num, u64 dir, int index,
4741	struct fs_path name, void* *ctx)
4742	{
4743	int ret = `0`;
4744	struct send_ctx *sctx = ctx;
4745	struct rb_node *node = NULL;
4746	struct recorded_ref data;
4747	struct recorded_ref *ref;
4748	u64 dir_gen;
4749
4750	ret = get_inode_gen(root: sctx->parent_root, ino: dir, gen: &dir_gen);
4751	if (ret < `0`)
4752	goto out;
4753
4754	data.dir = dir;
4755	data.dir_gen = dir_gen;
4756	set_ref_path(ref: &data, path: name);
4757	node = rb_find(key: &data, tree: &sctx->rbtree_new_refs, cmp: rbtree_ref_comp);
4758	if (node) {
4759	ref = rb_entry(node, struct recorded_ref, node);
4760	recorded_ref_free(ref);
4761	} else {
4762	ret = record_ref_in_tree(root: &sctx->rbtree_deleted_refs,
4763	refs: &sctx->deleted_refs, name, dir,
4764	dir_gen, sctx);
4765	}
4766	out:
4767	return ret;
4768	}
4769
4770	static int record_new_ref(struct send_ctx *sctx)
4771	{
4772	int ret;
4773
4774	ret = iterate_inode_ref(root: sctx->send_root, path: sctx->left_path,
4775	found_key: sctx->cmp_key, resolve: `0`, iterate: record_new_ref_if_needed, ctx: sctx);
4776	if (ret < `0`)
4777	goto out;
4778	ret = `0`;
4779
4780	out:
4781	return ret;
4782	}
4783
4784	static int record_deleted_ref(struct send_ctx *sctx)
4785	{
4786	int ret;
4787
4788	ret = iterate_inode_ref(root: sctx->parent_root, path: sctx->right_path,
4789	found_key: sctx->cmp_key, resolve: `0`, iterate: record_deleted_ref_if_needed,
4790	ctx: sctx);
4791	if (ret < `0`)
4792	goto out;
4793	ret = `0`;
4794
4795	out:
4796	return ret;
4797	}
4798
4799	static int record_changed_ref(struct send_ctx *sctx)
4800	{
4801	int ret = `0`;
4802
4803	ret = iterate_inode_ref(root: sctx->send_root, path: sctx->left_path,
4804	found_key: sctx->cmp_key, resolve: `0`, iterate: record_new_ref_if_needed, ctx: sctx);
4805	if (ret < `0`)
4806	goto out;
4807	ret = iterate_inode_ref(root: sctx->parent_root, path: sctx->right_path,
4808	found_key: sctx->cmp_key, resolve: `0`, iterate: record_deleted_ref_if_needed, ctx: sctx);
4809	if (ret < `0`)
4810	goto out;
4811	ret = `0`;
4812
4813	out:
4814	return ret;
4815	}
4816
4817	/*
4818	* Record and process all refs at once. Needed when an inode changes the
4819	* generation number, which means that it was deleted and recreated.
4820	*/
4821	static int process_all_refs(struct send_ctx *sctx,
4822	enum btrfs_compare_tree_result cmd)
4823	{
4824	int ret = `0`;
4825	int iter_ret = `0`;
4826	struct btrfs_root *root;
4827	struct btrfs_path *path;
4828	struct btrfs_key key;
4829	struct btrfs_key found_key;
4830	iterate_inode_ref_t cb;
4831	int pending_move = `0`;
4832
4833	path = alloc_path_for_send();
4834	if (!path)
4835	return -ENOMEM;
4836
4837	if (cmd == BTRFS_COMPARE_TREE_NEW) {
4838	root = sctx->send_root;
4839	cb = record_new_ref_if_needed;
4840	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
4841	root = sctx->parent_root;
4842	cb = record_deleted_ref_if_needed;
4843	} else {
4844	btrfs_err(sctx->send_root->fs_info,
4845	"Wrong command %d in process_all_refs", cmd);
4846	ret = -EINVAL;
4847	goto out;
4848	}
4849
4850	key.objectid = sctx->cmp_key->objectid;
4851	key.type = BTRFS_INODE_REF_KEY;
4852	key.offset = `0`;
4853	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
4854	if (found_key.objectid != key.objectid \|\|
4855	(found_key.type != BTRFS_INODE_REF_KEY &&
4856	found_key.type != BTRFS_INODE_EXTREF_KEY))
4857	break;
4858
4859	ret = iterate_inode_ref(root, path, found_key: &found_key, resolve: `0`, iterate: cb, ctx: sctx);
4860	if (ret < `0`)
4861	goto out;
4862	}
4863	/ Catch error found during iteration /
4864	if (iter_ret < `0`) {
4865	ret = iter_ret;
4866	goto out;
4867	}
4868	btrfs_release_path(p: path);
4869
4870	/*
4871	* We don't actually care about pending_move as we are simply
4872	* re-creating this inode and will be rename'ing it into place once we
4873	* rename the parent directory.
4874	*/
4875	ret = process_recorded_refs(sctx, pending_move: &pending_move);
4876	out:
4877	btrfs_free_path(p: path);
4878	return ret;
4879	}
4880
4881	static int send_set_xattr(struct send_ctx *sctx,
4882	struct fs_path *path,
4883	const char name, int* name_len,
4884	const char data, int* data_len)
4885	{
4886	int ret = `0`;
4887
4888	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SET_XATTR);
4889	if (ret < `0`)
4890	goto out;
4891
4892	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4893	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4894	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
4895
4896	ret = send_cmd(sctx);
4897
4898	tlv_put_failure:
4899	out:
4900	return ret;
4901	}
4902
4903	static int send_remove_xattr(struct send_ctx *sctx,
4904	struct fs_path *path,
4905	const char name, int* name_len)
4906	{
4907	int ret = `0`;
4908
4909	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_REMOVE_XATTR);
4910	if (ret < `0`)
4911	goto out;
4912
4913	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4914	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4915
4916	ret = send_cmd(sctx);
4917
4918	tlv_put_failure:
4919	out:
4920	return ret;
4921	}
4922
4923	static int __process_new_xattr(int num, struct btrfs_key *di_key,
4924	const char name, int* name_len, const char *data,
4925	int data_len, void *ctx)
4926	{
4927	int ret;
4928	struct send_ctx *sctx = ctx;
4929	struct fs_path *p;
4930	struct posix_acl_xattr_header dummy_acl;
4931
4932	/ Capabilities are emitted by finish_inode_if_needed /
4933	if (!strncmp(name, XATTR_NAME_CAPS, name_len))
4934	return `0`;
4935
4936	p = fs_path_alloc();
4937	if (!p)
4938	return -ENOMEM;
4939
4940	/*
4941	* This hack is needed because empty acls are stored as zero byte
4942	* data in xattrs. Problem with that is, that receiving these zero byte
4943	* acls will fail later. To fix this, we send a dummy acl list that
4944	* only contains the version number and no entries.
4945	*/
4946	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) \|\|
4947	!strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
4948	if (data_len == `0`) {
4949	dummy_acl.a_version =
4950	cpu_to_le32(POSIX_ACL_XATTR_VERSION);
4951	data = (char *)&dummy_acl;
4952	data_len = sizeof(dummy_acl);
4953	}
4954	}
4955
4956	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
4957	if (ret < `0`)
4958	goto out;
4959
4960	ret = send_set_xattr(sctx, path: p, name, name_len, data, data_len);
4961
4962	out:
4963	fs_path_free(p);
4964	return ret;
4965	}
4966
4967	static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
4968	const char name, int* name_len,
4969	const char data, int* data_len, void *ctx)
4970	{
4971	int ret;
4972	struct send_ctx *sctx = ctx;
4973	struct fs_path *p;
4974
4975	p = fs_path_alloc();
4976	if (!p)
4977	return -ENOMEM;
4978
4979	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
4980	if (ret < `0`)
4981	goto out;
4982
4983	ret = send_remove_xattr(sctx, path: p, name, name_len);
4984
4985	out:
4986	fs_path_free(p);
4987	return ret;
4988	}
4989
4990	static int process_new_xattr(struct send_ctx *sctx)
4991	{
4992	int ret = `0`;
4993
4994	ret = iterate_dir_item(root: sctx->send_root, path: sctx->left_path,
4995	iterate: __process_new_xattr, ctx: sctx);
4996
4997	return ret;
4998	}
4999
5000	static int process_deleted_xattr(struct send_ctx *sctx)
5001	{
5002	return iterate_dir_item(root: sctx->parent_root, path: sctx->right_path,
5003	iterate: __process_deleted_xattr, ctx: sctx);
5004	}
5005
/*
 * Search state shared between find_xattr() and its iterate_dir_item()
 * callback __find_xattr().
 */
struct find_xattr_ctx {
	/* Name of the xattr being searched for and its length in bytes. */
	const char *name;
	int name_len;
	/* Index of the matching item, or -1 while nothing was found. */
	int found_idx;
	/* kmemdup()'ed copy of the value of the matching xattr. */
	char *found_data;
	/* Length in bytes of found_data. */
	int found_data_len;
};
5013
5014	static int __find_xattr(int num, struct btrfs_key di_key, const* char *name,
5015	int name_len, const char data, int* data_len, void *vctx)
5016	{
5017	struct find_xattr_ctx *ctx = vctx;
5018
5019	if (name_len == ctx->name_len &&
5020	strncmp(name, ctx->name, name_len) == `0`) {
5021	ctx->found_idx = num;
5022	ctx->found_data_len = data_len;
5023	ctx->found_data = kmemdup(p: data, size: data_len, GFP_KERNEL);
5024	if (!ctx->found_data)
5025	return -ENOMEM;
5026	return `1`;
5027	}
5028	return `0`;
5029	}
5030
5031	static int find_xattr(struct btrfs_root *root,
5032	struct btrfs_path *path,
5033	struct btrfs_key *key,
5034	const char name, int* name_len,
5035	char *data, int* *data_len)
5036	{
5037	int ret;
5038	struct find_xattr_ctx ctx;
5039
5040	ctx.name = name;
5041	ctx.name_len = name_len;
5042	ctx.found_idx = -`1`;
5043	ctx.found_data = NULL;
5044	ctx.found_data_len = `0`;
5045
5046	ret = iterate_dir_item(root, path, iterate: __find_xattr, ctx: &ctx);
5047	if (ret < `0`)
5048	return ret;
5049
5050	if (ctx.found_idx == -`1`)
5051	return -ENOENT;
5052	if (data) {
5053	*data = ctx.found_data;
5054	*data_len = ctx.found_data_len;
5055	} else {
5056	kfree(objp: ctx.found_data);
5057	}
5058	return ctx.found_idx;
5059	}
5060
5061
5062	static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
5063	const char name, int* name_len,
5064	const char data, int* data_len,
5065	void *ctx)
5066	{
5067	int ret;
5068	struct send_ctx *sctx = ctx;
5069	char *found_data = NULL;
5070	int found_data_len = `0`;
5071
5072	ret = find_xattr(root: sctx->parent_root, path: sctx->right_path,
5073	key: sctx->cmp_key, name, name_len, data: &found_data,
5074	data_len: &found_data_len);
5075	if (ret == -ENOENT) {
5076	ret = __process_new_xattr(num, di_key, name, name_len, data,
5077	data_len, ctx);
5078	} else if (ret >= `0`) {
5079	if (data_len != found_data_len \|\|
5080	memcmp(p: data, q: found_data, size: data_len)) {
5081	ret = __process_new_xattr(num, di_key, name, name_len,
5082	data, data_len, ctx);
5083	} else {
5084	ret = `0`;
5085	}
5086	}
5087
5088	kfree(objp: found_data);
5089	return ret;
5090	}
5091
5092	static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
5093	const char name, int* name_len,
5094	const char data, int* data_len,
5095	void *ctx)
5096	{
5097	int ret;
5098	struct send_ctx *sctx = ctx;
5099
5100	ret = find_xattr(root: sctx->send_root, path: sctx->left_path, key: sctx->cmp_key,
5101	name, name_len, NULL, NULL);
5102	if (ret == -ENOENT)
5103	ret = __process_deleted_xattr(num, di_key, name, name_len, data,
5104	data_len, ctx);
5105	else if (ret >= `0`)
5106	ret = `0`;
5107
5108	return ret;
5109	}
5110
5111	static int process_changed_xattr(struct send_ctx *sctx)
5112	{
5113	int ret = `0`;
5114
5115	ret = iterate_dir_item(root: sctx->send_root, path: sctx->left_path,
5116	iterate: __process_changed_new_xattr, ctx: sctx);
5117	if (ret < `0`)
5118	goto out;
5119	ret = iterate_dir_item(root: sctx->parent_root, path: sctx->right_path,
5120	iterate: __process_changed_deleted_xattr, ctx: sctx);
5121
5122	out:
5123	return ret;
5124	}
5125
5126	static int process_all_new_xattrs(struct send_ctx *sctx)
5127	{
5128	int ret = `0`;
5129	int iter_ret = `0`;
5130	struct btrfs_root *root;
5131	struct btrfs_path *path;
5132	struct btrfs_key key;
5133	struct btrfs_key found_key;
5134
5135	path = alloc_path_for_send();
5136	if (!path)
5137	return -ENOMEM;
5138
5139	root = sctx->send_root;
5140
5141	key.objectid = sctx->cmp_key->objectid;
5142	key.type = BTRFS_XATTR_ITEM_KEY;
5143	key.offset = `0`;
5144	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
5145	if (found_key.objectid != key.objectid \|\|
5146	found_key.type != key.type) {
5147	ret = `0`;
5148	break;
5149	}
5150
5151	ret = iterate_dir_item(root, path, iterate: __process_new_xattr, ctx: sctx);
5152	if (ret < `0`)
5153	break;
5154	}
5155	/ Catch error found during iteration /
5156	if (iter_ret < `0`)
5157	ret = iter_ret;
5158
5159	btrfs_free_path(p: path);
5160	return ret;
5161	}
5162
5163	static int send_verity(struct send_ctx sctx, struct* fs_path *path,
5164	struct fsverity_descriptor *desc)
5165	{
5166	int ret;
5167
5168	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENABLE_VERITY);
5169	if (ret < `0`)
5170	goto out;
5171
5172	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
5173	TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
5174	le8_to_cpu(desc->hash_algorithm));
5175	TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
5176	`1U` << le8_to_cpu(desc->log_blocksize));
5177	TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
5178	le8_to_cpu(desc->salt_size));
5179	TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
5180	le32_to_cpu(desc->sig_size));
5181
5182	ret = send_cmd(sctx);
5183
5184	tlv_put_failure:
5185	out:
5186	return ret;
5187	}
5188
5189	static int process_verity(struct send_ctx *sctx)
5190	{
5191	int ret = `0`;
5192	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5193	struct inode *inode;
5194	struct fs_path *p;
5195
5196	inode = btrfs_iget(s: fs_info->sb, ino: sctx->cur_ino, root: sctx->send_root);
5197	if (IS_ERR(ptr: inode))
5198	return PTR_ERR(ptr: inode);
5199
5200	ret = btrfs_get_verity_descriptor(inode, NULL, buf_size: `0`);
5201	if (ret < `0`)
5202	goto iput;
5203
5204	if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
5205	ret = -EMSGSIZE;
5206	goto iput;
5207	}
5208	if (!sctx->verity_descriptor) {
5209	sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
5210	GFP_KERNEL);
5211	if (!sctx->verity_descriptor) {
5212	ret = -ENOMEM;
5213	goto iput;
5214	}
5215	}
5216
5217	ret = btrfs_get_verity_descriptor(inode, buf: sctx->verity_descriptor, buf_size: ret);
5218	if (ret < `0`)
5219	goto iput;
5220
5221	p = fs_path_alloc();
5222	if (!p) {
5223	ret = -ENOMEM;
5224	goto iput;
5225	}
5226	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5227	if (ret < `0`)
5228	goto free_path;
5229
5230	ret = send_verity(sctx, path: p, desc: sctx->verity_descriptor);
5231	if (ret < `0`)
5232	goto free_path;
5233
5234	free_path:
5235	fs_path_free(p);
5236	iput:
5237	iput(inode);
5238	return ret;
5239	}
5240
5241	static inline u64 max_send_read_size(const struct send_ctx *sctx)
5242	{
5243	return sctx->send_max_size - SZ_16K;
5244	}
5245
5246	static int put_data_header(struct send_ctx *sctx, u32 len)
5247	{
5248	if (WARN_ON_ONCE(sctx->put_data))
5249	return -EINVAL;
5250	sctx->put_data = true;
5251	if (sctx->proto >= `2`) {
5252	/*
5253	* Since v2, the data attribute header doesn't include a length,
5254	* it is implicitly to the end of the command.
5255	*/
5256	if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
5257	return -EOVERFLOW;
5258	put_unaligned_le16(val: BTRFS_SEND_A_DATA, p: sctx->send_buf + sctx->send_size);
5259	sctx->send_size += sizeof(__le16);
5260	} else {
5261	struct btrfs_tlv_header *hdr;
5262
5263	if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
5264	return -EOVERFLOW;
5265	hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
5266	put_unaligned_le16(val: BTRFS_SEND_A_DATA, p: &hdr->tlv_type);
5267	put_unaligned_le16(val: len, p: &hdr->tlv_len);
5268	sctx->send_size += sizeof(*hdr);
5269	}
5270	return `0`;
5271	}
5272
5273	static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
5274	{
5275	struct btrfs_root *root = sctx->send_root;
5276	struct btrfs_fs_info *fs_info = root->fs_info;
5277	struct page *page;
5278	pgoff_t index = offset >> PAGE_SHIFT;
5279	pgoff_t last_index;
5280	unsigned pg_offset = offset_in_page(offset);
5281	int ret;
5282
5283	ret = put_data_header(sctx, len);
5284	if (ret)
5285	return ret;
5286
5287	last_index = (offset + len - `1`) >> PAGE_SHIFT;
5288
5289	while (index <= last_index) {
5290	unsigned cur_len = min_t(unsigned, len,
5291	PAGE_SIZE - pg_offset);
5292
5293	page = find_lock_page(mapping: sctx->cur_inode->i_mapping, index);
5294	if (!page) {
5295	page_cache_sync_readahead(mapping: sctx->cur_inode->i_mapping,
5296	ra: &sctx->ra, NULL, index,
5297	req_count: last_index + `1` - index);
5298
5299	page = find_or_create_page(mapping: sctx->cur_inode->i_mapping,
5300	index, GFP_KERNEL);
5301	if (!page) {
5302	ret = -ENOMEM;
5303	break;
5304	}
5305	}
5306
5307	if (PageReadahead(page))
5308	page_cache_async_readahead(mapping: sctx->cur_inode->i_mapping,
5309	ra: &sctx->ra, NULL, page_folio(page),
5310	index, req_count: last_index + `1` - index);
5311
5312	if (!PageUptodate(page)) {
5313	btrfs_read_folio(NULL, page_folio(page));
5314	lock_page(page);
5315	if (!PageUptodate(page)) {
5316	unlock_page(page);
5317	btrfs_err(fs_info,
5318	"send: IO error at offset %llu for inode %llu root %llu",
5319	page_offset(page), sctx->cur_ino,
5320	sctx->send_root->root_key.objectid);
5321	put_page(page);
5322	ret = -EIO;
5323	break;
5324	}
5325	}
5326
5327	memcpy_from_page(to: sctx->send_buf + sctx->send_size, page,
5328	offset: pg_offset, len: cur_len);
5329	unlock_page(page);
5330	put_page(page);
5331	index++;
5332	pg_offset = `0`;
5333	len -= cur_len;
5334	sctx->send_size += cur_len;
5335	}
5336
5337	return ret;
5338	}
5339
5340	/*
5341	* Read some bytes from the current inode/file and send a write command to
5342	* user space.
5343	*/
5344	static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
5345	{
5346	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5347	int ret = `0`;
5348	struct fs_path *p;
5349
5350	p = fs_path_alloc();
5351	if (!p)
5352	return -ENOMEM;
5353
5354	btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
5355
5356	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_WRITE);
5357	if (ret < `0`)
5358	goto out;
5359
5360	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5361	if (ret < `0`)
5362	goto out;
5363
5364	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5365	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5366	ret = put_file_data(sctx, offset, len);
5367	if (ret < `0`)
5368	goto out;
5369
5370	ret = send_cmd(sctx);
5371
5372	tlv_put_failure:
5373	out:
5374	fs_path_free(p);
5375	return ret;
5376	}
5377
5378	/*
5379	* Send a clone command to user space.
5380	*/
5381	static int send_clone(struct send_ctx *sctx,
5382	u64 offset, u32 len,
5383	struct clone_root *clone_root)
5384	{
5385	int ret = `0`;
5386	struct fs_path *p;
5387	u64 gen;
5388
5389	btrfs_debug(sctx->send_root->fs_info,
5390	"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
5391	offset, len, clone_root->root->root_key.objectid,
5392	clone_root->ino, clone_root->offset);
5393
5394	p = fs_path_alloc();
5395	if (!p)
5396	return -ENOMEM;
5397
5398	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CLONE);
5399	if (ret < `0`)
5400	goto out;
5401
5402	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5403	if (ret < `0`)
5404	goto out;
5405
5406	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5407	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
5408	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5409
5410	if (clone_root->root == sctx->send_root) {
5411	ret = get_inode_gen(root: sctx->send_root, ino: clone_root->ino, gen: &gen);
5412	if (ret < `0`)
5413	goto out;
5414	ret = get_cur_path(sctx, ino: clone_root->ino, gen, dest: p);
5415	} else {
5416	ret = get_inode_path(root: clone_root->root, ino: clone_root->ino, path: p);
5417	}
5418	if (ret < `0`)
5419	goto out;
5420
5421	/*
5422	* If the parent we're using has a received_uuid set then use that as
5423	* our clone source as that is what we will look for when doing a
5424	* receive.
5425	*
5426	* This covers the case that we create a snapshot off of a received
5427	* subvolume and then use that as the parent and try to receive on a
5428	* different host.
5429	*/
5430	if (!btrfs_is_empty_uuid(uuid: clone_root->root->root_item.received_uuid))
5431	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5432	clone_root->root->root_item.received_uuid);
5433	else
5434	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5435	clone_root->root->root_item.uuid);
5436	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
5437	btrfs_root_ctransid(&clone_root->root->root_item));
5438	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
5439	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
5440	clone_root->offset);
5441
5442	ret = send_cmd(sctx);
5443
5444	tlv_put_failure:
5445	out:
5446	fs_path_free(p);
5447	return ret;
5448	}
5449
5450	/*
5451	* Send an update extent command to user space.
5452	*/
5453	static int send_update_extent(struct send_ctx *sctx,
5454	u64 offset, u32 len)
5455	{
5456	int ret = `0`;
5457	struct fs_path *p;
5458
5459	p = fs_path_alloc();
5460	if (!p)
5461	return -ENOMEM;
5462
5463	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UPDATE_EXTENT);
5464	if (ret < `0`)
5465	goto out;
5466
5467	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5468	if (ret < `0`)
5469	goto out;
5470
5471	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5472	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5473	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
5474
5475	ret = send_cmd(sctx);
5476
5477	tlv_put_failure:
5478	out:
5479	fs_path_free(p);
5480	return ret;
5481	}
5482
5483	static int send_hole(struct send_ctx *sctx, u64 end)
5484	{
5485	struct fs_path *p = NULL;
5486	u64 read_size = max_send_read_size(sctx);
5487	u64 offset = sctx->cur_inode_last_extent;
5488	int ret = `0`;
5489
5490	/*
5491	* A hole that starts at EOF or beyond it. Since we do not yet support
5492	* fallocate (for extent preallocation and hole punching), sending a
5493	* write of zeroes starting at EOF or beyond would later require issuing
5494	* a truncate operation which would undo the write and achieve nothing.
5495	*/
5496	if (offset >= sctx->cur_inode_size)
5497	return `0`;
5498
5499	/*
5500	* Don't go beyond the inode's i_size due to prealloc extents that start
5501	* after the i_size.
5502	*/
5503	end = min_t(u64, end, sctx->cur_inode_size);
5504
5505	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5506	return send_update_extent(sctx, offset, len: end - offset);
5507
5508	p = fs_path_alloc();
5509	if (!p)
5510	return -ENOMEM;
5511	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5512	if (ret < `0`)
5513	goto tlv_put_failure;
5514	while (offset < end) {
5515	u64 len = min(end - offset, read_size);
5516
5517	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_WRITE);
5518	if (ret < `0`)
5519	break;
5520	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5521	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5522	ret = put_data_header(sctx, len);
5523	if (ret < `0`)
5524	break;
5525	memset(sctx->send_buf + sctx->send_size, `0`, len);
5526	sctx->send_size += len;
5527	ret = send_cmd(sctx);
5528	if (ret < `0`)
5529	break;
5530	offset += len;
5531	}
5532	sctx->cur_inode_next_write_offset = offset;
5533	tlv_put_failure:
5534	fs_path_free(p);
5535	return ret;
5536	}
5537
5538	static int send_encoded_inline_extent(struct send_ctx *sctx,
5539	struct btrfs_path *path, u64 offset,
5540	u64 len)
5541	{
5542	struct btrfs_root *root = sctx->send_root;
5543	struct btrfs_fs_info *fs_info = root->fs_info;
5544	struct inode *inode;
5545	struct fs_path *fspath;
5546	struct extent_buffer *leaf = path->nodes[`0`];
5547	struct btrfs_key key;
5548	struct btrfs_file_extent_item *ei;
5549	u64 ram_bytes;
5550	size_t inline_size;
5551	int ret;
5552
5553	inode = btrfs_iget(s: fs_info->sb, ino: sctx->cur_ino, root);
5554	if (IS_ERR(ptr: inode))
5555	return PTR_ERR(ptr: inode);
5556
5557	fspath = fs_path_alloc();
5558	if (!fspath) {
5559	ret = -ENOMEM;
5560	goto out;
5561	}
5562
5563	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENCODED_WRITE);
5564	if (ret < `0`)
5565	goto out;
5566
5567	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: fspath);
5568	if (ret < `0`)
5569	goto out;
5570
5571	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
5572	ei = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_file_extent_item);
5573	ram_bytes = btrfs_file_extent_ram_bytes(eb: leaf, s: ei);
5574	inline_size = btrfs_file_extent_inline_item_len(eb: leaf, nr: path->slots[`0`]);
5575
5576	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5577	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5578	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5579	min(key.offset + ram_bytes - offset, len));
5580	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
5581	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
5582	ret = btrfs_encoded_io_compression_from_extent(fs_info,
5583	compress_type: btrfs_file_extent_compression(eb: leaf, s: ei));
5584	if (ret < `0`)
5585	goto out;
5586	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5587
5588	ret = put_data_header(sctx, len: inline_size);
5589	if (ret < `0`)
5590	goto out;
5591	read_extent_buffer(eb: leaf, dst: sctx->send_buf + sctx->send_size,
5592	start: btrfs_file_extent_inline_start(e: ei), len: inline_size);
5593	sctx->send_size += inline_size;
5594
5595	ret = send_cmd(sctx);
5596
5597	tlv_put_failure:
5598	out:
5599	fs_path_free(p: fspath);
5600	iput(inode);
5601	return ret;
5602	}
5603
5604	static int send_encoded_extent(struct send_ctx sctx, struct* btrfs_path *path,
5605	u64 offset, u64 len)
5606	{
5607	struct btrfs_root *root = sctx->send_root;
5608	struct btrfs_fs_info *fs_info = root->fs_info;
5609	struct inode *inode;
5610	struct fs_path *fspath;
5611	struct extent_buffer *leaf = path->nodes[`0`];
5612	struct btrfs_key key;
5613	struct btrfs_file_extent_item *ei;
5614	u64 disk_bytenr, disk_num_bytes;
5615	u32 data_offset;
5616	struct btrfs_cmd_header *hdr;
5617	u32 crc;
5618	int ret;
5619
5620	inode = btrfs_iget(s: fs_info->sb, ino: sctx->cur_ino, root);
5621	if (IS_ERR(ptr: inode))
5622	return PTR_ERR(ptr: inode);
5623
5624	fspath = fs_path_alloc();
5625	if (!fspath) {
5626	ret = -ENOMEM;
5627	goto out;
5628	}
5629
5630	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENCODED_WRITE);
5631	if (ret < `0`)
5632	goto out;
5633
5634	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: fspath);
5635	if (ret < `0`)
5636	goto out;
5637
5638	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
5639	ei = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_file_extent_item);
5640	disk_bytenr = btrfs_file_extent_disk_bytenr(eb: leaf, s: ei);
5641	disk_num_bytes = btrfs_file_extent_disk_num_bytes(eb: leaf, s: ei);
5642
5643	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5644	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5645	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5646	min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
5647	len));
5648	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
5649	btrfs_file_extent_ram_bytes(leaf, ei));
5650	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
5651	offset - key.offset + btrfs_file_extent_offset(leaf, ei));
5652	ret = btrfs_encoded_io_compression_from_extent(fs_info,
5653	compress_type: btrfs_file_extent_compression(eb: leaf, s: ei));
5654	if (ret < `0`)
5655	goto out;
5656	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5657	TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, `0`);
5658
5659	ret = put_data_header(sctx, len: disk_num_bytes);
5660	if (ret < `0`)
5661	goto out;
5662
5663	/*
5664	* We want to do I/O directly into the send buffer, so get the next page
5665	* boundary in the send buffer. This means that there may be a gap
5666	* between the beginning of the command and the file data.
5667	*/
5668	data_offset = PAGE_ALIGN(sctx->send_size);
5669	if (data_offset > sctx->send_max_size \|\|
5670	sctx->send_max_size - data_offset < disk_num_bytes) {
5671	ret = -EOVERFLOW;
5672	goto out;
5673	}
5674
5675	/*
5676	* Note that send_buf is a mapping of send_buf_pages, so this is really
5677	* reading into send_buf.
5678	*/
5679	ret = btrfs_encoded_read_regular_fill_pages(inode: BTRFS_I(inode), file_offset: offset,
5680	disk_bytenr, disk_io_size: disk_num_bytes,
5681	pages: sctx->send_buf_pages +
5682	(data_offset >> PAGE_SHIFT));
5683	if (ret)
5684	goto out;
5685
5686	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
5687	hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
5688	hdr->crc = `0`;
5689	crc = crc32c(crc: `0`, address: sctx->send_buf, length: sctx->send_size);
5690	crc = crc32c(crc, address: sctx->send_buf + data_offset, length: disk_num_bytes);
5691	hdr->crc = cpu_to_le32(crc);
5692
5693	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf, len: sctx->send_size,
5694	off: &sctx->send_off);
5695	if (!ret) {
5696	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf + data_offset,
5697	len: disk_num_bytes, off: &sctx->send_off);
5698	}
5699	sctx->send_size = `0`;
5700	sctx->put_data = false;
5701
5702	tlv_put_failure:
5703	out:
5704	fs_path_free(p: fspath);
5705	iput(inode);
5706	return ret;
5707	}
5708
5709	static int send_extent_data(struct send_ctx sctx, struct* btrfs_path *path,
5710	const u64 offset, const u64 len)
5711	{
5712	const u64 end = offset + len;
5713	struct extent_buffer *leaf = path->nodes[`0`];
5714	struct btrfs_file_extent_item *ei;
5715	u64 read_size = max_send_read_size(sctx);
5716	u64 sent = `0`;
5717
5718	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5719	return send_update_extent(sctx, offset, len);
5720
5721	ei = btrfs_item_ptr(leaf, path->slots[`0`],
5722	struct btrfs_file_extent_item);
5723	if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
5724	btrfs_file_extent_compression(eb: leaf, s: ei) != BTRFS_COMPRESS_NONE) {
5725	bool is_inline = (btrfs_file_extent_type(eb: leaf, s: ei) ==
5726	BTRFS_FILE_EXTENT_INLINE);
5727
5728	/*
5729	* Send the compressed extent unless the compressed data is
5730	* larger than the decompressed data. This can happen if we're
5731	* not sending the entire extent, either because it has been
5732	* partially overwritten/truncated or because this is a part of
5733	* the extent that we couldn't clone in clone_range().
5734	*/
5735	if (is_inline &&
5736	btrfs_file_extent_inline_item_len(eb: leaf,
5737	nr: path->slots[`0`]) <= len) {
5738	return send_encoded_inline_extent(sctx, path, offset,
5739	len);
5740	} else if (!is_inline &&
5741	btrfs_file_extent_disk_num_bytes(eb: leaf, s: ei) <= len) {
5742	return send_encoded_extent(sctx, path, offset, len);
5743	}
5744	}
5745
5746	if (sctx->cur_inode == NULL) {
5747	struct btrfs_root *root = sctx->send_root;
5748
5749	sctx->cur_inode = btrfs_iget(s: root->fs_info->sb, ino: sctx->cur_ino, root);
5750	if (IS_ERR(ptr: sctx->cur_inode)) {
5751	int err = PTR_ERR(ptr: sctx->cur_inode);
5752
5753	sctx->cur_inode = NULL;
5754	return err;
5755	}
5756	memset(&sctx->ra, `0`, sizeof(struct file_ra_state));
5757	file_ra_state_init(ra: &sctx->ra, mapping: sctx->cur_inode->i_mapping);
5758
5759	/*
5760	* It's very likely there are no pages from this inode in the page
5761	* cache, so after reading extents and sending their data, we clean
5762	* the page cache to avoid trashing the page cache (adding pressure
5763	* to the page cache and forcing eviction of other data more useful
5764	* for applications).
5765	*
5766	* We decide if we should clean the page cache simply by checking
5767	* if the inode's mapping nrpages is 0 when we first open it, and
5768	* not by using something like filemap_range_has_page() before
5769	* reading an extent because when we ask the readahead code to
5770	* read a given file range, it may (and almost always does) read
5771	* pages from beyond that range (see the documentation for
5772	* page_cache_sync_readahead()), so it would not be reliable,
5773	* because after reading the first extent future calls to
5774	* filemap_range_has_page() would return true because the readahead
5775	* on the previous extent resulted in reading pages of the current
5776	* extent as well.
5777	*/
5778	sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == `0`);
5779	sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
5780	}
5781
5782	while (sent < len) {
5783	u64 size = min(len - sent, read_size);
5784	int ret;
5785
5786	ret = send_write(sctx, offset: offset + sent, len: size);
5787	if (ret < `0`)
5788	return ret;
5789	sent += size;
5790	}
5791
5792	if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
5793	/*
5794	* Always operate only on ranges that are a multiple of the page
5795	* size. This is not only to prevent zeroing parts of a page in
5796	* the case of subpage sector size, but also to guarantee we evict
5797	* pages, as passing a range that is smaller than page size does
5798	* not evict the respective page (only zeroes part of its content).
5799	*
5800	* Always start from the end offset of the last range cleared.
5801	* This is because the readahead code may (and very often does)
5802	* reads pages beyond the range we request for readahead. So if
5803	* we have an extent layout like this:
5804	*
5805	* [ extent A ] [ extent B ] [ extent C ]
5806	*
5807	* When we ask page_cache_sync_readahead() to read extent A, it
5808	* may also trigger reads for pages of extent B. If we are doing
5809	* an incremental send and extent B has not changed between the
5810	* parent and send snapshots, some or all of its pages may end
5811	* up being read and placed in the page cache. So when truncating
5812	* the page cache we always start from the end offset of the
5813	* previously processed extent up to the end of the current
5814	* extent.
5815	*/
5816	truncate_inode_pages_range(&sctx->cur_inode->i_data,
5817	lstart: sctx->page_cache_clear_start,
5818	lend: end - `1`);
5819	sctx->page_cache_clear_start = end;
5820	}
5821
5822	return `0`;
5823	}
5824
5825	/*
5826	* Search for a capability xattr related to sctx->cur_ino. If the capability is
5827	* found, call send_set_xattr function to emit it.
5828	*
5829	* Return 0 if there isn't a capability, or when the capability was emitted
5830	* successfully, or < 0 if an error occurred.
5831	*/
5832	static int send_capabilities(struct send_ctx *sctx)
5833	{
5834	struct fs_path *fspath = NULL;
5835	struct btrfs_path *path;
5836	struct btrfs_dir_item *di;
5837	struct extent_buffer *leaf;
5838	unsigned long data_ptr;
5839	char *buf = NULL;
5840	int buf_len;
5841	int ret = `0`;
5842
5843	path = alloc_path_for_send();
5844	if (!path)
5845	return -ENOMEM;
5846
5847	di = btrfs_lookup_xattr(NULL, root: sctx->send_root, path, dir: sctx->cur_ino,
5848	XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), mod: `0`);
5849	if (!di) {
5850	/ There is no xattr for this inode /
5851	goto out;
5852	} else if (IS_ERR(ptr: di)) {
5853	ret = PTR_ERR(ptr: di);
5854	goto out;
5855	}
5856
5857	leaf = path->nodes[`0`];
5858	buf_len = btrfs_dir_data_len(eb: leaf, s: di);
5859
5860	fspath = fs_path_alloc();
5861	buf = kmalloc(size: buf_len, GFP_KERNEL);
5862	if (!fspath \|\| !buf) {
5863	ret = -ENOMEM;
5864	goto out;
5865	}
5866
5867	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: fspath);
5868	if (ret < `0`)
5869	goto out;
5870
5871	data_ptr = (unsigned long)(di + `1`) + btrfs_dir_name_len(eb: leaf, s: di);
5872	read_extent_buffer(eb: leaf, dst: buf, start: data_ptr, len: buf_len);
5873
5874	ret = send_set_xattr(sctx, path: fspath, XATTR_NAME_CAPS,
5875	strlen(XATTR_NAME_CAPS), data: buf, data_len: buf_len);
5876	out:
5877	kfree(objp: buf);
5878	fs_path_free(p: fspath);
5879	btrfs_free_path(p: path);
5880	return ret;
5881	}
5882
5883	static int clone_range(struct send_ctx sctx, struct* btrfs_path *dst_path,
5884	struct clone_root clone_root, const* u64 disk_byte,
5885	u64 data_offset, u64 offset, u64 len)
5886	{
5887	struct btrfs_path *path;
5888	struct btrfs_key key;
5889	int ret;
5890	struct btrfs_inode_info info;
5891	u64 clone_src_i_size = `0`;
5892
5893	/*
5894	* Prevent cloning from a zero offset with a length matching the sector
5895	* size because in some scenarios this will make the receiver fail.
5896	*
5897	* For example, if in the source filesystem the extent at offset 0
5898	* has a length of sectorsize and it was written using direct IO, then
5899	* it can never be an inline extent (even if compression is enabled).
5900	* Then this extent can be cloned in the original filesystem to a non
5901	* zero file offset, but it may not be possible to clone in the
5902	* destination filesystem because it can be inlined due to compression
5903	* on the destination filesystem (as the receiver's write operations are
5904	* always done using buffered IO). The same happens when the original
5905	* filesystem does not have compression enabled but the destination
5906	* filesystem has.
5907	*/
5908	if (clone_root->offset == `0` &&
5909	len == sctx->send_root->fs_info->sectorsize)
5910	return send_extent_data(sctx, path: dst_path, offset, len);
5911
5912	path = alloc_path_for_send();
5913	if (!path)
5914	return -ENOMEM;
5915
5916	/*
5917	* There are inodes that have extents that lie behind its i_size. Don't
5918	* accept clones from these extents.
5919	*/
5920	ret = get_inode_info(root: clone_root->root, ino: clone_root->ino, info: &info);
5921	btrfs_release_path(p: path);
5922	if (ret < `0`)
5923	goto out;
5924	clone_src_i_size = info.size;
5925
5926	/*
5927	* We can't send a clone operation for the entire range if we find
5928	* extent items in the respective range in the source file that
5929	* refer to different extents or if we find holes.
5930	* So check for that and do a mix of clone and regular write/copy
5931	* operations if needed.
5932	*
5933	* Example:
5934	*
5935	* mkfs.btrfs -f /dev/sda
5936	* mount /dev/sda /mnt
5937	* xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
5938	* cp --reflink=always /mnt/foo /mnt/bar
5939	* xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
5940	* btrfs subvolume snapshot -r /mnt /mnt/snap
5941	*
5942	* If when we send the snapshot and we are processing file bar (which
5943	* has a higher inode number than foo) we blindly send a clone operation
5944	* for the [0, 100K[ range from foo to bar, the receiver ends up getting
5945	* a file bar that matches the content of file foo - iow, doesn't match
5946	* the content from bar in the original filesystem.
5947	*/
5948	key.objectid = clone_root->ino;
5949	key.type = BTRFS_EXTENT_DATA_KEY;
5950	key.offset = clone_root->offset;
5951	ret = btrfs_search_slot(NULL, root: clone_root->root, key: &key, p: path, ins_len: `0`, cow: `0`);
5952	if (ret < `0`)
5953	goto out;
5954	if (ret > `0` && path->slots[`0`] > `0`) {
5955	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`] - `1`);
5956	if (key.objectid == clone_root->ino &&
5957	key.type == BTRFS_EXTENT_DATA_KEY)
5958	path->slots[`0`]--;
5959	}
5960
5961	while (true) {
5962	struct extent_buffer *leaf = path->nodes[`0`];
5963	int slot = path->slots[`0`];
5964	struct btrfs_file_extent_item *ei;
5965	u8 type;
5966	u64 ext_len;
5967	u64 clone_len;
5968	u64 clone_data_offset;
5969	bool crossed_src_i_size = false;
5970
5971	if (slot >= btrfs_header_nritems(eb: leaf)) {
5972	ret = btrfs_next_leaf(root: clone_root->root, path);
5973	if (ret < `0`)
5974	goto out;
5975	else if (ret > `0`)
5976	break;
5977	continue;
5978	}
5979
5980	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
5981
5982	/*
5983	* We might have an implicit trailing hole (NO_HOLES feature
5984	* enabled). We deal with it after leaving this loop.
5985	*/
5986	if (key.objectid != clone_root->ino \|\|
5987	key.type != BTRFS_EXTENT_DATA_KEY)
5988	break;
5989
5990	ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5991	type = btrfs_file_extent_type(eb: leaf, s: ei);
5992	if (type == BTRFS_FILE_EXTENT_INLINE) {
5993	ext_len = btrfs_file_extent_ram_bytes(eb: leaf, s: ei);
5994	ext_len = PAGE_ALIGN(ext_len);
5995	} else {
5996	ext_len = btrfs_file_extent_num_bytes(eb: leaf, s: ei);
5997	}
5998
5999	if (key.offset + ext_len <= clone_root->offset)
6000	goto next;
6001
6002	if (key.offset > clone_root->offset) {
6003	/ Implicit hole, NO_HOLES feature enabled. /
6004	u64 hole_len = key.offset - clone_root->offset;
6005
6006	if (hole_len > len)
6007	hole_len = len;
6008	ret = send_extent_data(sctx, path: dst_path, offset,
6009	len: hole_len);
6010	if (ret < `0`)
6011	goto out;
6012
6013	len -= hole_len;
6014	if (len == `0`)
6015	break;
6016	offset += hole_len;
6017	clone_root->offset += hole_len;
6018	data_offset += hole_len;
6019	}
6020
6021	if (key.offset >= clone_root->offset + len)
6022	break;
6023
6024	if (key.offset >= clone_src_i_size)
6025	break;
6026
6027	if (key.offset + ext_len > clone_src_i_size) {
6028	ext_len = clone_src_i_size - key.offset;
6029	crossed_src_i_size = true;
6030	}
6031
6032	clone_data_offset = btrfs_file_extent_offset(eb: leaf, s: ei);
6033	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: ei) == disk_byte) {
6034	clone_root->offset = key.offset;
6035	if (clone_data_offset < data_offset &&
6036	clone_data_offset + ext_len > data_offset) {
6037	u64 extent_offset;
6038
6039	extent_offset = data_offset - clone_data_offset;
6040	ext_len -= extent_offset;
6041	clone_data_offset += extent_offset;
6042	clone_root->offset += extent_offset;
6043	}
6044	}
6045
6046	clone_len = min_t(u64, ext_len, len);
6047
6048	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: ei) == disk_byte &&
6049	clone_data_offset == data_offset) {
6050	const u64 src_end = clone_root->offset + clone_len;
6051	const u64 sectorsize = SZ_64K;
6052
6053	/*
6054	* We can't clone the last block, when its size is not
6055	* sector size aligned, into the middle of a file. If we
6056	* do so, the receiver will get a failure (-EINVAL) when
6057	* trying to clone or will silently corrupt the data in
6058	* the destination file if it's on a kernel without the
6059	* fix introduced by commit ac765f83f1397646
6060	* ("Btrfs: fix data corruption due to cloning of eof
6061	* block).
6062	*
6063	* So issue a clone of the aligned down range plus a
6064	* regular write for the eof block, if we hit that case.
6065	*
6066	* Also, we use the maximum possible sector size, 64K,
6067	* because we don't know what's the sector size of the
6068	* filesystem that receives the stream, so we have to
6069	* assume the largest possible sector size.
6070	*/
6071	if (src_end == clone_src_i_size &&
6072	!IS_ALIGNED(src_end, sectorsize) &&
6073	offset + clone_len < sctx->cur_inode_size) {
6074	u64 slen;
6075
6076	slen = ALIGN_DOWN(src_end - clone_root->offset,
6077	sectorsize);
6078	if (slen > `0`) {
6079	ret = send_clone(sctx, offset, len: slen,
6080	clone_root);
6081	if (ret < `0`)
6082	goto out;
6083	}
6084	ret = send_extent_data(sctx, path: dst_path,
6085	offset: offset + slen,
6086	len: clone_len - slen);
6087	} else {
6088	ret = send_clone(sctx, offset, len: clone_len,
6089	clone_root);
6090	}
6091	} else if (crossed_src_i_size && clone_len < len) {
6092	/*
6093	* If we are at i_size of the clone source inode and we
6094	* can not clone from it, terminate the loop. This is
6095	* to avoid sending two write operations, one with a
6096	* length matching clone_len and the final one after
6097	* this loop with a length of len - clone_len.
6098	*
6099	* When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
6100	* was passed to the send ioctl), this helps avoid
6101	* sending an encoded write for an offset that is not
6102	* sector size aligned, in case the i_size of the source
6103	* inode is not sector size aligned. That will make the
6104	* receiver fallback to decompression of the data and
6105	* writing it using regular buffered IO, therefore while
6106	* not incorrect, it's not optimal due decompression and
6107	* possible re-compression at the receiver.
6108	*/
6109	break;
6110	} else {
6111	ret = send_extent_data(sctx, path: dst_path, offset,
6112	len: clone_len);
6113	}
6114
6115	if (ret < `0`)
6116	goto out;
6117
6118	len -= clone_len;
6119	if (len == `0`)
6120	break;
6121	offset += clone_len;
6122	clone_root->offset += clone_len;
6123
6124	/*
6125	* If we are cloning from the file we are currently processing,
6126	* and using the send root as the clone root, we must stop once
6127	* the current clone offset reaches the current eof of the file
6128	* at the receiver, otherwise we would issue an invalid clone
6129	* operation (source range going beyond eof) and cause the
6130	* receiver to fail. So if we reach the current eof, bail out
6131	* and fallback to a regular write.
6132	*/
6133	if (clone_root->root == sctx->send_root &&
6134	clone_root->ino == sctx->cur_ino &&
6135	clone_root->offset >= sctx->cur_inode_next_write_offset)
6136	break;
6137
6138	data_offset += clone_len;
6139	next:
6140	path->slots[`0`]++;
6141	}
6142
6143	if (len > `0`)
6144	ret = send_extent_data(sctx, path: dst_path, offset, len);
6145	else
6146	ret = `0`;
6147	out:
6148	btrfs_free_path(p: path);
6149	return ret;
6150	}
6151
6152	static int send_write_or_clone(struct send_ctx *sctx,
6153	struct btrfs_path *path,
6154	struct btrfs_key *key,
6155	struct clone_root *clone_root)
6156	{
6157	int ret = `0`;
6158	u64 offset = key->offset;
6159	u64 end;
6160	u64 bs = sctx->send_root->fs_info->sectorsize;
6161
6162	end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
6163	if (offset >= end)
6164	return `0`;
6165
6166	if (clone_root && IS_ALIGNED(end, bs)) {
6167	struct btrfs_file_extent_item *ei;
6168	u64 disk_byte;
6169	u64 data_offset;
6170
6171	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
6172	struct btrfs_file_extent_item);
6173	disk_byte = btrfs_file_extent_disk_bytenr(eb: path->nodes[`0`], s: ei);
6174	data_offset = btrfs_file_extent_offset(eb: path->nodes[`0`], s: ei);
6175	ret = clone_range(sctx, dst_path: path, clone_root, disk_byte,
6176	data_offset, offset, len: end - offset);
6177	} else {
6178	ret = send_extent_data(sctx, path, offset, len: end - offset);
6179	}
6180	sctx->cur_inode_next_write_offset = end;
6181	return ret;
6182	}
6183
6184	static int is_extent_unchanged(struct send_ctx *sctx,
6185	struct btrfs_path *left_path,
6186	struct btrfs_key *ekey)
6187	{
6188	int ret = `0`;
6189	struct btrfs_key key;
6190	struct btrfs_path *path = NULL;
6191	struct extent_buffer *eb;
6192	int slot;
6193	struct btrfs_key found_key;
6194	struct btrfs_file_extent_item *ei;
6195	u64 left_disknr;
6196	u64 right_disknr;
6197	u64 left_offset;
6198	u64 right_offset;
6199	u64 left_offset_fixed;
6200	u64 left_len;
6201	u64 right_len;
6202	u64 left_gen;
6203	u64 right_gen;
6204	u8 left_type;
6205	u8 right_type;
6206
6207	path = alloc_path_for_send();
6208	if (!path)
6209	return -ENOMEM;
6210
6211	eb = left_path->nodes[`0`];
6212	slot = left_path->slots[`0`];
6213	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6214	left_type = btrfs_file_extent_type(eb, s: ei);
6215
6216	if (left_type != BTRFS_FILE_EXTENT_REG) {
6217	ret = `0`;
6218	goto out;
6219	}
6220	left_disknr = btrfs_file_extent_disk_bytenr(eb, s: ei);
6221	left_len = btrfs_file_extent_num_bytes(eb, s: ei);
6222	left_offset = btrfs_file_extent_offset(eb, s: ei);
6223	left_gen = btrfs_file_extent_generation(eb, s: ei);
6224
6225	/*
6226	* Following comments will refer to these graphics. L is the left
6227	* extents which we are checking at the moment. 1-8 are the right
6228	* extents that we iterate.
6229	*
6230	* \|-----L-----\|
6231	* \|-1-\|-2a-\|-3-\|-4-\|-5-\|-6-\|
6232	*
6233	* \|-----L-----\|
6234	* \|--1--\|-2b-\|...(same as above)
6235	*
6236	* Alternative situation. Happens on files where extents got split.
6237	* \|-----L-----\|
6238	* \|-----------7-----------\|-6-\|
6239	*
6240	* Alternative situation. Happens on files which got larger.
6241	* \|-----L-----\|
6242	* \|-8-\|
6243	* Nothing follows after 8.
6244	*/
6245
6246	key.objectid = ekey->objectid;
6247	key.type = BTRFS_EXTENT_DATA_KEY;
6248	key.offset = ekey->offset;
6249	ret = btrfs_search_slot_for_read(root: sctx->parent_root, key: &key, p: path, find_higher: `0`, return_any: `0`);
6250	if (ret < `0`)
6251	goto out;
6252	if (ret) {
6253	ret = `0`;
6254	goto out;
6255	}
6256
6257	/*
6258	* Handle special case where the right side has no extents at all.
6259	*/
6260	eb = path->nodes[`0`];
6261	slot = path->slots[`0`];
6262	btrfs_item_key_to_cpu(eb, cpu_key: &found_key, nr: slot);
6263	if (found_key.objectid != key.objectid \|\|
6264	found_key.type != key.type) {
6265	/ If we're a hole then just pretend nothing changed /
6266	ret = (left_disknr) ? `0` : `1`;
6267	goto out;
6268	}
6269
6270	/*
6271	* We're now on 2a, 2b or 7.
6272	*/
6273	key = found_key;
6274	while (key.offset < ekey->offset + left_len) {
6275	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6276	right_type = btrfs_file_extent_type(eb, s: ei);
6277	if (right_type != BTRFS_FILE_EXTENT_REG &&
6278	right_type != BTRFS_FILE_EXTENT_INLINE) {
6279	ret = `0`;
6280	goto out;
6281	}
6282
6283	if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6284	right_len = btrfs_file_extent_ram_bytes(eb, s: ei);
6285	right_len = PAGE_ALIGN(right_len);
6286	} else {
6287	right_len = btrfs_file_extent_num_bytes(eb, s: ei);
6288	}
6289
6290	/*
6291	* Are we at extent 8? If yes, we know the extent is changed.
6292	* This may only happen on the first iteration.
6293	*/
6294	if (found_key.offset + right_len <= ekey->offset) {
6295	/ If we're a hole just pretend nothing changed /
6296	ret = (left_disknr) ? `0` : `1`;
6297	goto out;
6298	}
6299
6300	/*
6301	* We just wanted to see if when we have an inline extent, what
6302	* follows it is a regular extent (wanted to check the above
6303	* condition for inline extents too). This should normally not
6304	* happen but it's possible for example when we have an inline
6305	* compressed extent representing data with a size matching
6306	* the page size (currently the same as sector size).
6307	*/
6308	if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6309	ret = `0`;
6310	goto out;
6311	}
6312
6313	right_disknr = btrfs_file_extent_disk_bytenr(eb, s: ei);
6314	right_offset = btrfs_file_extent_offset(eb, s: ei);
6315	right_gen = btrfs_file_extent_generation(eb, s: ei);
6316
6317	left_offset_fixed = left_offset;
6318	if (key.offset < ekey->offset) {
6319	/ Fix the right offset for 2a and 7. /
6320	right_offset += ekey->offset - key.offset;
6321	} else {
6322	/ Fix the left offset for all behind 2a and 2b /
6323	left_offset_fixed += key.offset - ekey->offset;
6324	}
6325
6326	/*
6327	* Check if we have the same extent.
6328	*/
6329	if (left_disknr != right_disknr \|\|
6330	left_offset_fixed != right_offset \|\|
6331	left_gen != right_gen) {
6332	ret = `0`;
6333	goto out;
6334	}
6335
6336	/*
6337	* Go to the next extent.
6338	*/
6339	ret = btrfs_next_item(root: sctx->parent_root, p: path);
6340	if (ret < `0`)
6341	goto out;
6342	if (!ret) {
6343	eb = path->nodes[`0`];
6344	slot = path->slots[`0`];
6345	btrfs_item_key_to_cpu(eb, cpu_key: &found_key, nr: slot);
6346	}
6347	if (ret \|\| found_key.objectid != key.objectid \|\|
6348	found_key.type != key.type) {
6349	key.offset += right_len;
6350	break;
6351	}
6352	if (found_key.offset != key.offset + right_len) {
6353	ret = `0`;
6354	goto out;
6355	}
6356	key = found_key;
6357	}
6358
6359	/*
6360	* We're now behind the left extent (treat as unchanged) or at the end
6361	* of the right side (treat as changed).
6362	*/
6363	if (key.offset >= ekey->offset + left_len)
6364	ret = `1`;
6365	else
6366	ret = `0`;
6367
6368
6369	out:
6370	btrfs_free_path(p: path);
6371	return ret;
6372	}
6373
6374	static int get_last_extent(struct send_ctx *sctx, u64 offset)
6375	{
6376	struct btrfs_path *path;
6377	struct btrfs_root *root = sctx->send_root;
6378	struct btrfs_key key;
6379	int ret;
6380
6381	path = alloc_path_for_send();
6382	if (!path)
6383	return -ENOMEM;
6384
6385	sctx->cur_inode_last_extent = `0`;
6386
6387	key.objectid = sctx->cur_ino;
6388	key.type = BTRFS_EXTENT_DATA_KEY;
6389	key.offset = offset;
6390	ret = btrfs_search_slot_for_read(root, key: &key, p: path, find_higher: `0`, return_any: `1`);
6391	if (ret < `0`)
6392	goto out;
6393	ret = `0`;
6394	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
6395	if (key.objectid != sctx->cur_ino \|\| key.type != BTRFS_EXTENT_DATA_KEY)
6396	goto out;
6397
6398	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6399	out:
6400	btrfs_free_path(p: path);
6401	return ret;
6402	}
6403
6404	static int range_is_hole_in_parent(struct send_ctx *sctx,
6405	const u64 start,
6406	const u64 end)
6407	{
6408	struct btrfs_path *path;
6409	struct btrfs_key key;
6410	struct btrfs_root *root = sctx->parent_root;
6411	u64 search_start = start;
6412	int ret;
6413
6414	path = alloc_path_for_send();
6415	if (!path)
6416	return -ENOMEM;
6417
6418	key.objectid = sctx->cur_ino;
6419	key.type = BTRFS_EXTENT_DATA_KEY;
6420	key.offset = search_start;
6421	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
6422	if (ret < `0`)
6423	goto out;
6424	if (ret > `0` && path->slots[`0`] > `0`)
6425	path->slots[`0`]--;
6426
6427	while (search_start < end) {
6428	struct extent_buffer *leaf = path->nodes[`0`];
6429	int slot = path->slots[`0`];
6430	struct btrfs_file_extent_item *fi;
6431	u64 extent_end;
6432
6433	if (slot >= btrfs_header_nritems(eb: leaf)) {
6434	ret = btrfs_next_leaf(root, path);
6435	if (ret < `0`)
6436	goto out;
6437	else if (ret > `0`)
6438	break;
6439	continue;
6440	}
6441
6442	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
6443	if (key.objectid < sctx->cur_ino \|\|
6444	key.type < BTRFS_EXTENT_DATA_KEY)
6445	goto next;
6446	if (key.objectid > sctx->cur_ino \|\|
6447	key.type > BTRFS_EXTENT_DATA_KEY \|\|
6448	key.offset >= end)
6449	break;
6450
6451	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6452	extent_end = btrfs_file_extent_end(path);
6453	if (extent_end <= start)
6454	goto next;
6455	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: fi) == `0`) {
6456	search_start = extent_end;
6457	goto next;
6458	}
6459	ret = `0`;
6460	goto out;
6461	next:
6462	path->slots[`0`]++;
6463	}
6464	ret = `1`;
6465	out:
6466	btrfs_free_path(p: path);
6467	return ret;
6468	}
6469
6470	static int maybe_send_hole(struct send_ctx sctx, struct* btrfs_path *path,
6471	struct btrfs_key *key)
6472	{
6473	int ret = `0`;
6474
6475	if (sctx->cur_ino != key->objectid \|\| !need_send_hole(sctx))
6476	return `0`;
6477
6478	/*
6479	* Get last extent's end offset (exclusive) if we haven't determined it
6480	* yet (we're processing the first file extent item that is new), or if
6481	* we're at the first slot of a leaf and the last extent's end is less
6482	* than the current extent's offset, because we might have skipped
6483	* entire leaves that contained only file extent items for our current
6484	* inode. These leaves have a generation number smaller (older) than the
6485	* one in the current leaf and the leaf our last extent came from, and
6486	* are located between these 2 leaves.
6487	*/
6488	if ((sctx->cur_inode_last_extent == (u64)-`1`) \|\|
6489	(path->slots[`0`] == `0` && sctx->cur_inode_last_extent < key->offset)) {
6490	ret = get_last_extent(sctx, offset: key->offset - `1`);
6491	if (ret)
6492	return ret;
6493	}
6494
6495	if (sctx->cur_inode_last_extent < key->offset) {
6496	ret = range_is_hole_in_parent(sctx,
6497	start: sctx->cur_inode_last_extent,
6498	end: key->offset);
6499	if (ret < `0`)
6500	return ret;
6501	else if (ret == `0`)
6502	ret = send_hole(sctx, end: key->offset);
6503	else
6504	ret = `0`;
6505	}
6506	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6507	return ret;
6508	}
6509
6510	static int process_extent(struct send_ctx *sctx,
6511	struct btrfs_path *path,
6512	struct btrfs_key *key)
6513	{
6514	struct clone_root *found_clone = NULL;
6515	int ret = `0`;
6516
6517	if (S_ISLNK(sctx->cur_inode_mode))
6518	return `0`;
6519
6520	if (sctx->parent_root && !sctx->cur_inode_new) {
6521	ret = is_extent_unchanged(sctx, left_path: path, ekey: key);
6522	if (ret < `0`)
6523	goto out;
6524	if (ret) {
6525	ret = `0`;
6526	goto out_hole;
6527	}
6528	} else {
6529	struct btrfs_file_extent_item *ei;
6530	u8 type;
6531
6532	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
6533	struct btrfs_file_extent_item);
6534	type = btrfs_file_extent_type(eb: path->nodes[`0`], s: ei);
6535	if (type == BTRFS_FILE_EXTENT_PREALLOC \|\|
6536	type == BTRFS_FILE_EXTENT_REG) {
6537	/*
6538	* The send spec does not have a prealloc command yet,
6539	* so just leave a hole for prealloc'ed extents until
6540	* we have enough commands queued up to justify rev'ing
6541	* the send spec.
6542	*/
6543	if (type == BTRFS_FILE_EXTENT_PREALLOC) {
6544	ret = `0`;
6545	goto out;
6546	}
6547
6548	/ Have a hole, just skip it. /
6549	if (btrfs_file_extent_disk_bytenr(eb: path->nodes[`0`], s: ei) == `0`) {
6550	ret = `0`;
6551	goto out;
6552	}
6553	}
6554	}
6555
6556	ret = find_extent_clone(sctx, path, ino: key->objectid, data_offset: key->offset,
6557	ino_size: sctx->cur_inode_size, found: &found_clone);
6558	if (ret != -ENOENT && ret < `0`)
6559	goto out;
6560
6561	ret = send_write_or_clone(sctx, path, key, clone_root: found_clone);
6562	if (ret)
6563	goto out;
6564	out_hole:
6565	ret = maybe_send_hole(sctx, path, key);
6566	out:
6567	return ret;
6568	}
6569
6570	static int process_all_extents(struct send_ctx *sctx)
6571	{
6572	int ret = `0`;
6573	int iter_ret = `0`;
6574	struct btrfs_root *root;
6575	struct btrfs_path *path;
6576	struct btrfs_key key;
6577	struct btrfs_key found_key;
6578
6579	root = sctx->send_root;
6580	path = alloc_path_for_send();
6581	if (!path)
6582	return -ENOMEM;
6583
6584	key.objectid = sctx->cmp_key->objectid;
6585	key.type = BTRFS_EXTENT_DATA_KEY;
6586	key.offset = `0`;
6587	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
6588	if (found_key.objectid != key.objectid \|\|
6589	found_key.type != key.type) {
6590	ret = `0`;
6591	break;
6592	}
6593
6594	ret = process_extent(sctx, path, key: &found_key);
6595	if (ret < `0`)
6596	break;
6597	}
6598	/ Catch error found during iteration /
6599	if (iter_ret < `0`)
6600	ret = iter_ret;
6601
6602	btrfs_free_path(p: path);
6603	return ret;
6604	}
6605
6606	static int process_recorded_refs_if_needed(struct send_ctx sctx, int* at_end,
6607	int *pending_move,
6608	int *refs_processed)
6609	{
6610	int ret = `0`;
6611
6612	if (sctx->cur_ino == `0`)
6613	goto out;
6614	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
6615	sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
6616	goto out;
6617	if (list_empty(head: &sctx->new_refs) && list_empty(head: &sctx->deleted_refs))
6618	goto out;
6619
6620	ret = process_recorded_refs(sctx, pending_move);
6621	if (ret < `0`)
6622	goto out;
6623
6624	*refs_processed = `1`;
6625	out:
6626	return ret;
6627	}
6628
6629	static int finish_inode_if_needed(struct send_ctx sctx, int* at_end)
6630	{
6631	int ret = `0`;
6632	struct btrfs_inode_info info;
6633	u64 left_mode;
6634	u64 left_uid;
6635	u64 left_gid;
6636	u64 left_fileattr;
6637	u64 right_mode;
6638	u64 right_uid;
6639	u64 right_gid;
6640	u64 right_fileattr;
6641	int need_chmod = `0`;
6642	int need_chown = `0`;
6643	bool need_fileattr = false;
6644	int need_truncate = `1`;
6645	int pending_move = `0`;
6646	int refs_processed = `0`;
6647
6648	if (sctx->ignore_cur_inode)
6649	return `0`;
6650
6651	ret = process_recorded_refs_if_needed(sctx, at_end, pending_move: &pending_move,
6652	refs_processed: &refs_processed);
6653	if (ret < `0`)
6654	goto out;
6655
6656	/*
6657	* We have processed the refs and thus need to advance send_progress.
6658	* Now, calls to get_cur_xxx will take the updated refs of the current
6659	* inode into account.
6660	*
6661	* On the other hand, if our current inode is a directory and couldn't
6662	* be moved/renamed because its parent was renamed/moved too and it has
6663	* a higher inode number, we can only move/rename our current inode
6664	* after we moved/renamed its parent. Therefore in this case operate on
6665	* the old path (pre move/rename) of our current inode, and the
6666	* move/rename will be performed later.
6667	*/
6668	if (refs_processed && !pending_move)
6669	sctx->send_progress = sctx->cur_ino + `1`;
6670
6671	if (sctx->cur_ino == `0` \|\| sctx->cur_inode_deleted)
6672	goto out;
6673	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
6674	goto out;
6675	ret = get_inode_info(root: sctx->send_root, ino: sctx->cur_ino, info: &info);
6676	if (ret < `0`)
6677	goto out;
6678	left_mode = info.mode;
6679	left_uid = info.uid;
6680	left_gid = info.gid;
6681	left_fileattr = info.fileattr;
6682
6683	if (!sctx->parent_root \|\| sctx->cur_inode_new) {
6684	need_chown = `1`;
6685	if (!S_ISLNK(sctx->cur_inode_mode))
6686	need_chmod = `1`;
6687	if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
6688	need_truncate = `0`;
6689	} else {
6690	u64 old_size;
6691
6692	ret = get_inode_info(root: sctx->parent_root, ino: sctx->cur_ino, info: &info);
6693	if (ret < `0`)
6694	goto out;
6695	old_size = info.size;
6696	right_mode = info.mode;
6697	right_uid = info.uid;
6698	right_gid = info.gid;
6699	right_fileattr = info.fileattr;
6700
6701	if (left_uid != right_uid \|\| left_gid != right_gid)
6702	need_chown = `1`;
6703	if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
6704	need_chmod = `1`;
6705	if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
6706	need_fileattr = true;
6707	if ((old_size == sctx->cur_inode_size) \|\|
6708	(sctx->cur_inode_size > old_size &&
6709	sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
6710	need_truncate = `0`;
6711	}
6712
6713	if (S_ISREG(sctx->cur_inode_mode)) {
6714	if (need_send_hole(sctx)) {
6715	if (sctx->cur_inode_last_extent == (u64)-`1` \|\|
6716	sctx->cur_inode_last_extent <
6717	sctx->cur_inode_size) {
6718	ret = get_last_extent(sctx, offset: (u64)-`1`);
6719	if (ret)
6720	goto out;
6721	}
6722	if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
6723	ret = range_is_hole_in_parent(sctx,
6724	start: sctx->cur_inode_last_extent,
6725	end: sctx->cur_inode_size);
6726	if (ret < `0`) {
6727	goto out;
6728	} else if (ret == `0`) {
6729	ret = send_hole(sctx, end: sctx->cur_inode_size);
6730	if (ret < `0`)
6731	goto out;
6732	} else {
6733	/ Range is already a hole, skip. /
6734	ret = `0`;
6735	}
6736	}
6737	}
6738	if (need_truncate) {
6739	ret = send_truncate(sctx, ino: sctx->cur_ino,
6740	gen: sctx->cur_inode_gen,
6741	size: sctx->cur_inode_size);
6742	if (ret < `0`)
6743	goto out;
6744	}
6745	}
6746
6747	if (need_chown) {
6748	ret = send_chown(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6749	uid: left_uid, gid: left_gid);
6750	if (ret < `0`)
6751	goto out;
6752	}
6753	if (need_chmod) {
6754	ret = send_chmod(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6755	mode: left_mode);
6756	if (ret < `0`)
6757	goto out;
6758	}
6759	if (need_fileattr) {
6760	ret = send_fileattr(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6761	fileattr: left_fileattr);
6762	if (ret < `0`)
6763	goto out;
6764	}
6765
6766	if (proto_cmd_ok(sctx, cmd: BTRFS_SEND_C_ENABLE_VERITY)
6767	&& sctx->cur_inode_needs_verity) {
6768	ret = process_verity(sctx);
6769	if (ret < `0`)
6770	goto out;
6771	}
6772
6773	ret = send_capabilities(sctx);
6774	if (ret < `0`)
6775	goto out;
6776
6777	/*
6778	* If other directory inodes depended on our current directory
6779	* inode's move/rename, now do their move/rename operations.
6780	*/
6781	if (!is_waiting_for_move(sctx, ino: sctx->cur_ino)) {
6782	ret = apply_children_dir_moves(sctx);
6783	if (ret)
6784	goto out;
6785	/*
6786	* Need to send that every time, no matter if it actually
6787	* changed between the two trees as we have done changes to
6788	* the inode before. If our inode is a directory and it's
6789	* waiting to be moved/renamed, we will send its utimes when
6790	* it's moved/renamed, therefore we don't need to do it here.
6791	*/
6792	sctx->send_progress = sctx->cur_ino + `1`;
6793
6794	/*
6795	* If the current inode is a non-empty directory, delay issuing
6796	* the utimes command for it, as it's very likely we have inodes
6797	* with an higher number inside it. We want to issue the utimes
6798	* command only after adding all dentries to it.
6799	*/
6800	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > `0`)
6801	ret = cache_dir_utimes(sctx, dir: sctx->cur_ino, gen: sctx->cur_inode_gen);
6802	else
6803	ret = send_utimes(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen);
6804
6805	if (ret < `0`)
6806	goto out;
6807	}
6808
6809	out:
6810	if (!ret)
6811	ret = trim_dir_utimes_cache(sctx);
6812
6813	return ret;
6814	}
6815
6816	static void close_current_inode(struct send_ctx *sctx)
6817	{
6818	u64 i_size;
6819
6820	if (sctx->cur_inode == NULL)
6821	return;
6822
6823	i_size = i_size_read(inode: sctx->cur_inode);
6824
6825	/*
6826	* If we are doing an incremental send, we may have extents between the
6827	* last processed extent and the i_size that have not been processed
6828	* because they haven't changed but we may have read some of their pages
6829	* through readahead, see the comments at send_extent_data().
6830	*/
6831	if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
6832	truncate_inode_pages_range(&sctx->cur_inode->i_data,
6833	lstart: sctx->page_cache_clear_start,
6834	round_up(i_size, PAGE_SIZE) - `1`);
6835
6836	iput(sctx->cur_inode);
6837	sctx->cur_inode = NULL;
6838	}
6839
6840	static int changed_inode(struct send_ctx *sctx,
6841	enum btrfs_compare_tree_result result)
6842	{
6843	int ret = `0`;
6844	struct btrfs_key *key = sctx->cmp_key;
6845	struct btrfs_inode_item *left_ii = NULL;
6846	struct btrfs_inode_item *right_ii = NULL;
6847	u64 left_gen = `0`;
6848	u64 right_gen = `0`;
6849
6850	close_current_inode(sctx);
6851
6852	sctx->cur_ino = key->objectid;
6853	sctx->cur_inode_new_gen = false;
6854	sctx->cur_inode_last_extent = (u64)-`1`;
6855	sctx->cur_inode_next_write_offset = `0`;
6856	sctx->ignore_cur_inode = false;
6857
6858	/*
6859	* Set send_progress to current inode. This will tell all get_cur_xxx
6860	* functions that the current inode's refs are not updated yet. Later,
6861	* when process_recorded_refs is finished, it is set to cur_ino + 1.
6862	*/
6863	sctx->send_progress = sctx->cur_ino;
6864
6865	if (result == BTRFS_COMPARE_TREE_NEW \|\|
6866	result == BTRFS_COMPARE_TREE_CHANGED) {
6867	left_ii = btrfs_item_ptr(sctx->left_path->nodes[`0`],
6868	sctx->left_path->slots[`0`],
6869	struct btrfs_inode_item);
6870	left_gen = btrfs_inode_generation(eb: sctx->left_path->nodes[`0`],
6871	s: left_ii);
6872	} else {
6873	right_ii = btrfs_item_ptr(sctx->right_path->nodes[`0`],
6874	sctx->right_path->slots[`0`],
6875	struct btrfs_inode_item);
6876	right_gen = btrfs_inode_generation(eb: sctx->right_path->nodes[`0`],
6877	s: right_ii);
6878	}
6879	if (result == BTRFS_COMPARE_TREE_CHANGED) {
6880	right_ii = btrfs_item_ptr(sctx->right_path->nodes[`0`],
6881	sctx->right_path->slots[`0`],
6882	struct btrfs_inode_item);
6883
6884	right_gen = btrfs_inode_generation(eb: sctx->right_path->nodes[`0`],
6885	s: right_ii);
6886
6887	/*
6888	* The cur_ino = root dir case is special here. We can't treat
6889	* the inode as deleted+reused because it would generate a
6890	* stream that tries to delete/mkdir the root dir.
6891	*/
6892	if (left_gen != right_gen &&
6893	sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6894	sctx->cur_inode_new_gen = true;
6895	}
6896
6897	/*
6898	* Normally we do not find inodes with a link count of zero (orphans)
6899	* because the most common case is to create a snapshot and use it
6900	* for a send operation. However other less common use cases involve
6901	* using a subvolume and send it after turning it to RO mode just
6902	* after deleting all hard links of a file while holding an open
6903	* file descriptor against it or turning a RO snapshot into RW mode,
6904	* keep an open file descriptor against a file, delete it and then
6905	* turn the snapshot back to RO mode before using it for a send
6906	* operation. The former is what the receiver operation does.
6907	* Therefore, if we want to send these snapshots soon after they're
6908	* received, we need to handle orphan inodes as well. Moreover, orphans
6909	* can appear not only in the send snapshot but also in the parent
6910	* snapshot. Here are several cases:
6911	*
6912	* Case 1: BTRFS_COMPARE_TREE_NEW
6913	* \| send snapshot \| action
6914	* --------------------------------
6915	* nlink \| 0 \| ignore
6916	*
6917	* Case 2: BTRFS_COMPARE_TREE_DELETED
6918	* \| parent snapshot \| action
6919	* ----------------------------------
6920	* nlink \| 0 \| as usual
6921	* Note: No unlinks will be sent because there're no paths for it.
6922	*
6923	* Case 3: BTRFS_COMPARE_TREE_CHANGED
6924	* \| \| parent snapshot \| send snapshot \| action
6925	* -----------------------------------------------------------------------
6926	* subcase 1 \| nlink \| 0 \| 0 \| ignore
6927	* subcase 2 \| nlink \| >0 \| 0 \| new_gen(deletion)
6928	* subcase 3 \| nlink \| 0 \| >0 \| new_gen(creation)
6929	*
6930	*/
6931	if (result == BTRFS_COMPARE_TREE_NEW) {
6932	if (btrfs_inode_nlink(eb: sctx->left_path->nodes[`0`], s: left_ii) == `0`) {
6933	sctx->ignore_cur_inode = true;
6934	goto out;
6935	}
6936	sctx->cur_inode_gen = left_gen;
6937	sctx->cur_inode_new = true;
6938	sctx->cur_inode_deleted = false;
6939	sctx->cur_inode_size = btrfs_inode_size(
6940	eb: sctx->left_path->nodes[`0`], s: left_ii);
6941	sctx->cur_inode_mode = btrfs_inode_mode(
6942	eb: sctx->left_path->nodes[`0`], s: left_ii);
6943	sctx->cur_inode_rdev = btrfs_inode_rdev(
6944	eb: sctx->left_path->nodes[`0`], s: left_ii);
6945	if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6946	ret = send_create_inode_if_needed(sctx);
6947	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
6948	sctx->cur_inode_gen = right_gen;
6949	sctx->cur_inode_new = false;
6950	sctx->cur_inode_deleted = true;
6951	sctx->cur_inode_size = btrfs_inode_size(
6952	eb: sctx->right_path->nodes[`0`], s: right_ii);
6953	sctx->cur_inode_mode = btrfs_inode_mode(
6954	eb: sctx->right_path->nodes[`0`], s: right_ii);
6955	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
6956	u32 new_nlinks, old_nlinks;
6957
6958	new_nlinks = btrfs_inode_nlink(eb: sctx->left_path->nodes[`0`], s: left_ii);
6959	old_nlinks = btrfs_inode_nlink(eb: sctx->right_path->nodes[`0`], s: right_ii);
6960	if (new_nlinks == `0` && old_nlinks == `0`) {
6961	sctx->ignore_cur_inode = true;
6962	goto out;
6963	} else if (new_nlinks == `0` \|\| old_nlinks == `0`) {
6964	sctx->cur_inode_new_gen = `1`;
6965	}
6966	/*
6967	* We need to do some special handling in case the inode was
6968	* reported as changed with a changed generation number. This
6969	* means that the original inode was deleted and new inode
6970	* reused the same inum. So we have to treat the old inode as
6971	* deleted and the new one as new.
6972	*/
6973	if (sctx->cur_inode_new_gen) {
6974	/*
6975	* First, process the inode as if it was deleted.
6976	*/
6977	if (old_nlinks > `0`) {
6978	sctx->cur_inode_gen = right_gen;
6979	sctx->cur_inode_new = false;
6980	sctx->cur_inode_deleted = true;
6981	sctx->cur_inode_size = btrfs_inode_size(
6982	eb: sctx->right_path->nodes[`0`], s: right_ii);
6983	sctx->cur_inode_mode = btrfs_inode_mode(
6984	eb: sctx->right_path->nodes[`0`], s: right_ii);
6985	ret = process_all_refs(sctx,
6986	cmd: BTRFS_COMPARE_TREE_DELETED);
6987	if (ret < `0`)
6988	goto out;
6989	}
6990
6991	/*
6992	* Now process the inode as if it was new.
6993	*/
6994	if (new_nlinks > `0`) {
6995	sctx->cur_inode_gen = left_gen;
6996	sctx->cur_inode_new = true;
6997	sctx->cur_inode_deleted = false;
6998	sctx->cur_inode_size = btrfs_inode_size(
6999	eb: sctx->left_path->nodes[`0`],
7000	s: left_ii);
7001	sctx->cur_inode_mode = btrfs_inode_mode(
7002	eb: sctx->left_path->nodes[`0`],
7003	s: left_ii);
7004	sctx->cur_inode_rdev = btrfs_inode_rdev(
7005	eb: sctx->left_path->nodes[`0`],
7006	s: left_ii);
7007	ret = send_create_inode_if_needed(sctx);
7008	if (ret < `0`)
7009	goto out;
7010
7011	ret = process_all_refs(sctx, cmd: BTRFS_COMPARE_TREE_NEW);
7012	if (ret < `0`)
7013	goto out;
7014	/*
7015	* Advance send_progress now as we did not get
7016	* into process_recorded_refs_if_needed in the
7017	* new_gen case.
7018	*/
7019	sctx->send_progress = sctx->cur_ino + `1`;
7020
7021	/*
7022	* Now process all extents and xattrs of the
7023	* inode as if they were all new.
7024	*/
7025	ret = process_all_extents(sctx);
7026	if (ret < `0`)
7027	goto out;
7028	ret = process_all_new_xattrs(sctx);
7029	if (ret < `0`)
7030	goto out;
7031	}
7032	} else {
7033	sctx->cur_inode_gen = left_gen;
7034	sctx->cur_inode_new = false;
7035	sctx->cur_inode_new_gen = false;
7036	sctx->cur_inode_deleted = false;
7037	sctx->cur_inode_size = btrfs_inode_size(
7038	eb: sctx->left_path->nodes[`0`], s: left_ii);
7039	sctx->cur_inode_mode = btrfs_inode_mode(
7040	eb: sctx->left_path->nodes[`0`], s: left_ii);
7041	}
7042	}
7043
7044	out:
7045	return ret;
7046	}
7047
7048	/*
7049	* We have to process new refs before deleted refs, but compare_trees gives us
7050	* the new and deleted refs mixed. To fix this, we record the new/deleted refs
7051	* first and later process them in process_recorded_refs.
7052	* For the cur_inode_new_gen case, we skip recording completely because
7053	* changed_inode did already initiate processing of refs. The reason for this is
7054	* that in this case, compare_tree actually compares the refs of 2 different
7055	* inodes. To fix this, process_all_refs is used in changed_inode to handle all
7056	* refs of the right tree as deleted and all refs of the left tree as new.
7057	*/
7058	static int changed_ref(struct send_ctx *sctx,
7059	enum btrfs_compare_tree_result result)
7060	{
7061	int ret = `0`;
7062
7063	if (sctx->cur_ino != sctx->cmp_key->objectid) {
7064	inconsistent_snapshot_error(sctx, result, what: "reference");
7065	return -EIO;
7066	}
7067
7068	if (!sctx->cur_inode_new_gen &&
7069	sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
7070	if (result == BTRFS_COMPARE_TREE_NEW)
7071	ret = record_new_ref(sctx);
7072	else if (result == BTRFS_COMPARE_TREE_DELETED)
7073	ret = record_deleted_ref(sctx);
7074	else if (result == BTRFS_COMPARE_TREE_CHANGED)
7075	ret = record_changed_ref(sctx);
7076	}
7077
7078	return ret;
7079	}
7080
7081	/*
7082	* Process new/deleted/changed xattrs. We skip processing in the
7083	* cur_inode_new_gen case because changed_inode did already initiate processing
7084	* of xattrs. The reason is the same as in changed_ref
7085	*/
7086	static int changed_xattr(struct send_ctx *sctx,
7087	enum btrfs_compare_tree_result result)
7088	{
7089	int ret = `0`;
7090
7091	if (sctx->cur_ino != sctx->cmp_key->objectid) {
7092	inconsistent_snapshot_error(sctx, result, what: "xattr");
7093	return -EIO;
7094	}
7095
7096	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7097	if (result == BTRFS_COMPARE_TREE_NEW)
7098	ret = process_new_xattr(sctx);
7099	else if (result == BTRFS_COMPARE_TREE_DELETED)
7100	ret = process_deleted_xattr(sctx);
7101	else if (result == BTRFS_COMPARE_TREE_CHANGED)
7102	ret = process_changed_xattr(sctx);
7103	}
7104
7105	return ret;
7106	}
7107
7108	/*
7109	* Process new/deleted/changed extents. We skip processing in the
7110	* cur_inode_new_gen case because changed_inode did already initiate processing
7111	* of extents. The reason is the same as in changed_ref
7112	*/
7113	static int changed_extent(struct send_ctx *sctx,
7114	enum btrfs_compare_tree_result result)
7115	{
7116	int ret = `0`;
7117
7118	/*
7119	* We have found an extent item that changed without the inode item
7120	* having changed. This can happen either after relocation (where the
7121	* disk_bytenr of an extent item is replaced at
7122	* relocation.c:replace_file_extents()) or after deduplication into a
7123	* file in both the parent and send snapshots (where an extent item can
7124	* get modified or replaced with a new one). Note that deduplication
7125	* updates the inode item, but it only changes the iversion (sequence
7126	* field in the inode item) of the inode, so if a file is deduplicated
7127	* the same amount of times in both the parent and send snapshots, its
7128	* iversion becomes the same in both snapshots, whence the inode item is
7129	* the same on both snapshots.
7130	*/
7131	if (sctx->cur_ino != sctx->cmp_key->objectid)
7132	return `0`;
7133
7134	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7135	if (result != BTRFS_COMPARE_TREE_DELETED)
7136	ret = process_extent(sctx, path: sctx->left_path,
7137	key: sctx->cmp_key);
7138	}
7139
7140	return ret;
7141	}
7142
7143	static int changed_verity(struct send_ctx sctx, enum* btrfs_compare_tree_result result)
7144	{
7145	int ret = `0`;
7146
7147	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7148	if (result == BTRFS_COMPARE_TREE_NEW)
7149	sctx->cur_inode_needs_verity = true;
7150	}
7151	return ret;
7152	}
7153
7154	static int dir_changed(struct send_ctx *sctx, u64 dir)
7155	{
7156	u64 orig_gen, new_gen;
7157	int ret;
7158
7159	ret = get_inode_gen(root: sctx->send_root, ino: dir, gen: &new_gen);
7160	if (ret)
7161	return ret;
7162
7163	ret = get_inode_gen(root: sctx->parent_root, ino: dir, gen: &orig_gen);
7164	if (ret)
7165	return ret;
7166
7167	return (orig_gen != new_gen) ? `1` : `0`;
7168	}
7169
7170	static int compare_refs(struct send_ctx sctx, struct* btrfs_path *path,
7171	struct btrfs_key *key)
7172	{
7173	struct btrfs_inode_extref *extref;
7174	struct extent_buffer *leaf;
7175	u64 dirid = `0`, last_dirid = `0`;
7176	unsigned long ptr;
7177	u32 item_size;
7178	u32 cur_offset = `0`;
7179	int ref_name_len;
7180	int ret = `0`;
7181
7182	/ Easy case, just check this one dirid /
7183	if (key->type == BTRFS_INODE_REF_KEY) {
7184	dirid = key->offset;
7185
7186	ret = dir_changed(sctx, dir: dirid);
7187	goto out;
7188	}
7189
7190	leaf = path->nodes[`0`];
7191	item_size = btrfs_item_size(eb: leaf, slot: path->slots[`0`]);
7192	ptr = btrfs_item_ptr_offset(leaf, path->slots[`0`]);
7193	while (cur_offset < item_size) {
7194	extref = (struct btrfs_inode_extref *)(ptr +
7195	cur_offset);
7196	dirid = btrfs_inode_extref_parent(eb: leaf, s: extref);
7197	ref_name_len = btrfs_inode_extref_name_len(eb: leaf, s: extref);
7198	cur_offset += ref_name_len + sizeof(*extref);
7199	if (dirid == last_dirid)
7200	continue;
7201	ret = dir_changed(sctx, dir: dirid);
7202	if (ret)
7203	break;
7204	last_dirid = dirid;
7205	}
7206	out:
7207	return ret;
7208	}
7209
7210	/*
7211	* Updates compare related fields in sctx and simply forwards to the actual
7212	* changed_xxx functions.
7213	*/
7214	static int changed_cb(struct btrfs_path *left_path,
7215	struct btrfs_path *right_path,
7216	struct btrfs_key *key,
7217	enum btrfs_compare_tree_result result,
7218	struct send_ctx *sctx)
7219	{
7220	int ret = `0`;
7221
7222	/*
7223	* We can not hold the commit root semaphore here. This is because in
7224	* the case of sending and receiving to the same filesystem, using a
7225	* pipe, could result in a deadlock:
7226	*
7227	* 1) The task running send blocks on the pipe because it's full;
7228	*
7229	* 2) The task running receive, which is the only consumer of the pipe,
7230	* is waiting for a transaction commit (for example due to a space
7231	* reservation when doing a write or triggering a transaction commit
7232	* when creating a subvolume);
7233	*
7234	* 3) The transaction is waiting to write lock the commit root semaphore,
7235	* but can not acquire it since it's being held at 1).
7236	*
7237	* Down this call chain we write to the pipe through kernel_write().
7238	* The same type of problem can also happen when sending to a file that
7239	* is stored in the same filesystem - when reserving space for a write
7240	* into the file, we can trigger a transaction commit.
7241	*
7242	* Our caller has supplied us with clones of leaves from the send and
7243	* parent roots, so we're safe here from a concurrent relocation and
7244	* further reallocation of metadata extents while we are here. Below we
7245	* also assert that the leaves are clones.
7246	*/
7247	lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
7248
7249	/*
7250	* We always have a send root, so left_path is never NULL. We will not
7251	* have a leaf when we have reached the end of the send root but have
7252	* not yet reached the end of the parent root.
7253	*/
7254	if (left_path->nodes[`0`])
7255	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7256	&left_path->nodes[`0`]->bflags));
7257	/*
7258	* When doing a full send we don't have a parent root, so right_path is
7259	* NULL. When doing an incremental send, we may have reached the end of
7260	* the parent root already, so we don't have a leaf at right_path.
7261	*/
7262	if (right_path && right_path->nodes[`0`])
7263	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7264	&right_path->nodes[`0`]->bflags));
7265
7266	if (result == BTRFS_COMPARE_TREE_SAME) {
7267	if (key->type == BTRFS_INODE_REF_KEY \|\|
7268	key->type == BTRFS_INODE_EXTREF_KEY) {
7269	ret = compare_refs(sctx, path: left_path, key);
7270	if (!ret)
7271	return `0`;
7272	if (ret < `0`)
7273	return ret;
7274	} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
7275	return maybe_send_hole(sctx, path: left_path, key);
7276	} else {
7277	return `0`;
7278	}
7279	result = BTRFS_COMPARE_TREE_CHANGED;
7280	ret = `0`;
7281	}
7282
7283	sctx->left_path = left_path;
7284	sctx->right_path = right_path;
7285	sctx->cmp_key = key;
7286
7287	ret = finish_inode_if_needed(sctx, at_end: `0`);
7288	if (ret < `0`)
7289	goto out;
7290
7291	/ Ignore non-FS objects /
7292	if (key->objectid == BTRFS_FREE_INO_OBJECTID \|\|
7293	key->objectid == BTRFS_FREE_SPACE_OBJECTID)
7294	goto out;
7295
7296	if (key->type == BTRFS_INODE_ITEM_KEY) {
7297	ret = changed_inode(sctx, result);
7298	} else if (!sctx->ignore_cur_inode) {
7299	if (key->type == BTRFS_INODE_REF_KEY \|\|
7300	key->type == BTRFS_INODE_EXTREF_KEY)
7301	ret = changed_ref(sctx, result);
7302	else if (key->type == BTRFS_XATTR_ITEM_KEY)
7303	ret = changed_xattr(sctx, result);
7304	else if (key->type == BTRFS_EXTENT_DATA_KEY)
7305	ret = changed_extent(sctx, result);
7306	else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
7307	key->offset == `0`)
7308	ret = changed_verity(sctx, result);
7309	}
7310
7311	out:
7312	return ret;
7313	}
7314
7315	static int search_key_again(const struct send_ctx *sctx,
7316	struct btrfs_root *root,
7317	struct btrfs_path *path,
7318	const struct btrfs_key *key)
7319	{
7320	int ret;
7321
7322	if (!path->need_commit_sem)
7323	lockdep_assert_held_read(&root->fs_info->commit_root_sem);
7324
7325	/*
7326	* Roots used for send operations are readonly and no one can add,
7327	* update or remove keys from them, so we should be able to find our
7328	* key again. The only exception is deduplication, which can operate on
7329	* readonly roots and add, update or remove keys to/from them - but at
7330	* the moment we don't allow it to run in parallel with send.
7331	*/
7332	ret = btrfs_search_slot(NULL, root, key, p: path, ins_len: `0`, cow: `0`);
7333	ASSERT(ret <= `0`);
7334	if (ret > `0`) {
7335	btrfs_print_tree(c: path->nodes[path->lowest_level], follow: false);
7336	btrfs_err(root->fs_info,
7337	"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
7338	key->objectid, key->type, key->offset,
7339	(root == sctx->parent_root ? "parent" : "send"),
7340	root->root_key.objectid, path->lowest_level,
7341	path->slots[path->lowest_level]);
7342	return -EUCLEAN;
7343	}
7344
7345	return ret;
7346	}
7347
7348	static int full_send_tree(struct send_ctx *sctx)
7349	{
7350	int ret;
7351	struct btrfs_root *send_root = sctx->send_root;
7352	struct btrfs_key key;
7353	struct btrfs_fs_info *fs_info = send_root->fs_info;
7354	struct btrfs_path *path;
7355
7356	path = alloc_path_for_send();
7357	if (!path)
7358	return -ENOMEM;
7359	path->reada = READA_FORWARD_ALWAYS;
7360
7361	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
7362	key.type = BTRFS_INODE_ITEM_KEY;
7363	key.offset = `0`;
7364
7365	down_read(sem: &fs_info->commit_root_sem);
7366	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7367	up_read(sem: &fs_info->commit_root_sem);
7368
7369	ret = btrfs_search_slot_for_read(root: send_root, key: &key, p: path, find_higher: `1`, return_any: `0`);
7370	if (ret < `0`)
7371	goto out;
7372	if (ret)
7373	goto out_finish;
7374
7375	while (`1`) {
7376	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
7377
7378	ret = changed_cb(left_path: path, NULL, key: &key,
7379	result: BTRFS_COMPARE_TREE_NEW, sctx);
7380	if (ret < `0`)
7381	goto out;
7382
7383	down_read(sem: &fs_info->commit_root_sem);
7384	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7385	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7386	up_read(sem: &fs_info->commit_root_sem);
7387	/*
7388	* A transaction used for relocating a block group was
7389	* committed or is about to finish its commit. Release
7390	* our path (leaf) and restart the search, so that we
7391	* avoid operating on any file extent items that are
7392	* stale, with a disk_bytenr that reflects a pre
7393	* relocation value. This way we avoid as much as
7394	* possible to fallback to regular writes when checking
7395	* if we can clone file ranges.
7396	*/
7397	btrfs_release_path(p: path);
7398	ret = search_key_again(sctx, root: send_root, path, key: &key);
7399	if (ret < `0`)
7400	goto out;
7401	} else {
7402	up_read(sem: &fs_info->commit_root_sem);
7403	}
7404
7405	ret = btrfs_next_item(root: send_root, p: path);
7406	if (ret < `0`)
7407	goto out;
7408	if (ret) {
7409	ret = `0`;
7410	break;
7411	}
7412	}
7413
7414	out_finish:
7415	ret = finish_inode_if_needed(sctx, at_end: `1`);
7416
7417	out:
7418	btrfs_free_path(p: path);
7419	return ret;
7420	}
7421
7422	static int replace_node_with_clone(struct btrfs_path path, int* level)
7423	{
7424	struct extent_buffer *clone;
7425
7426	clone = btrfs_clone_extent_buffer(src: path->nodes[level]);
7427	if (!clone)
7428	return -ENOMEM;
7429
7430	free_extent_buffer(eb: path->nodes[level]);
7431	path->nodes[level] = clone;
7432
7433	return `0`;
7434	}
7435
7436	static int tree_move_down(struct btrfs_path path, int* *level, u64 reada_min_gen)
7437	{
7438	struct extent_buffer *eb;
7439	struct extent_buffer parent = path->nodes[level];
7440	int slot = path->slots[*level];
7441	const int nritems = btrfs_header_nritems(eb: parent);
7442	u64 reada_max;
7443	u64 reada_done = `0`;
7444
7445	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
7446	ASSERT(*level != `0`);
7447
7448	eb = btrfs_read_node_slot(parent, slot);
7449	if (IS_ERR(ptr: eb))
7450	return PTR_ERR(ptr: eb);
7451
7452	/*
7453	* Trigger readahead for the next leaves we will process, so that it is
7454	* very likely that when we need them they are already in memory and we
7455	* will not block on disk IO. For nodes we only do readahead for one,
7456	* since the time window between processing nodes is typically larger.
7457	*/
7458	reada_max = (*level == `1` ? SZ_128K : eb->fs_info->nodesize);
7459
7460	for (slot++; slot < nritems && reada_done < reada_max; slot++) {
7461	if (btrfs_node_ptr_generation(eb: parent, nr: slot) > reada_min_gen) {
7462	btrfs_readahead_node_child(node: parent, slot);
7463	reada_done += eb->fs_info->nodesize;
7464	}
7465	}
7466
7467	path->nodes[*level - `1`] = eb;
7468	path->slots[*level - `1`] = `0`;
7469	(*level)--;
7470
7471	if (*level == `0`)
7472	return replace_node_with_clone(path, level: `0`);
7473
7474	return `0`;
7475	}
7476
7477	static int tree_move_next_or_upnext(struct btrfs_path *path,
7478	int level, int* root_level)
7479	{
7480	int ret = `0`;
7481	int nritems;
7482	nritems = btrfs_header_nritems(eb: path->nodes[*level]);
7483
7484	path->slots[*level]++;
7485
7486	while (path->slots[*level] >= nritems) {
7487	if (*level == root_level) {
7488	path->slots[*level] = nritems - `1`;
7489	return -`1`;
7490	}
7491
7492	/ move upnext /
7493	path->slots[*level] = `0`;
7494	free_extent_buffer(eb: path->nodes[*level]);
7495	path->nodes[*level] = NULL;
7496	(*level)++;
7497	path->slots[*level]++;
7498
7499	nritems = btrfs_header_nritems(eb: path->nodes[*level]);
7500	ret = `1`;
7501	}
7502	return ret;
7503	}
7504
7505	/*
7506	* Returns 1 if it had to move up and next. 0 is returned if it moved only next
7507	* or down.
7508	*/
7509	static int tree_advance(struct btrfs_path *path,
7510	int level, int* root_level,
7511	int allow_down,
7512	struct btrfs_key *key,
7513	u64 reada_min_gen)
7514	{
7515	int ret;
7516
7517	if (*level == `0` \|\| !allow_down) {
7518	ret = tree_move_next_or_upnext(path, level, root_level);
7519	} else {
7520	ret = tree_move_down(path, level, reada_min_gen);
7521	}
7522
7523	/*
7524	* Even if we have reached the end of a tree, ret is -1, update the key
7525	* anyway, so that in case we need to restart due to a block group
7526	* relocation, we can assert that the last key of the root node still
7527	* exists in the tree.
7528	*/
7529	if (*level == `0`)
7530	btrfs_item_key_to_cpu(eb: path->nodes[*level], cpu_key: key,
7531	nr: path->slots[*level]);
7532	else
7533	btrfs_node_key_to_cpu(eb: path->nodes[*level], cpu_key: key,
7534	nr: path->slots[*level]);
7535
7536	return ret;
7537	}
7538
7539	static int tree_compare_item(struct btrfs_path *left_path,
7540	struct btrfs_path *right_path,
7541	char *tmp_buf)
7542	{
7543	int cmp;
7544	int len1, len2;
7545	unsigned long off1, off2;
7546
7547	len1 = btrfs_item_size(eb: left_path->nodes[`0`], slot: left_path->slots[`0`]);
7548	len2 = btrfs_item_size(eb: right_path->nodes[`0`], slot: right_path->slots[`0`]);
7549	if (len1 != len2)
7550	return `1`;
7551
7552	off1 = btrfs_item_ptr_offset(left_path->nodes[`0`], left_path->slots[`0`]);
7553	off2 = btrfs_item_ptr_offset(right_path->nodes[`0`],
7554	right_path->slots[`0`]);
7555
7556	read_extent_buffer(eb: left_path->nodes[`0`], dst: tmp_buf, start: off1, len: len1);
7557
7558	cmp = memcmp_extent_buffer(eb: right_path->nodes[`0`], ptrv: tmp_buf, start: off2, len: len1);
7559	if (cmp)
7560	return `1`;
7561	return `0`;
7562	}
7563
7564	/*
7565	* A transaction used for relocating a block group was committed or is about to
7566	* finish its commit. Release our paths and restart the search, so that we are
7567	* not using stale extent buffers:
7568	*
7569	* 1) For levels > 0, we are only holding references of extent buffers, without
7570	* any locks on them, which does not prevent them from having been relocated
7571	* and reallocated after the last time we released the commit root semaphore.
7572	* The exception are the root nodes, for which we always have a clone, see
7573	* the comment at btrfs_compare_trees();
7574	*
7575	* 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
7576	* we are safe from the concurrent relocation and reallocation. However they
7577	* can have file extent items with a pre relocation disk_bytenr value, so we
7578	* restart the start from the current commit roots and clone the new leaves so
7579	* that we get the post relocation disk_bytenr values. Not doing so, could
7580	* make us clone the wrong data in case there are new extents using the old
7581	* disk_bytenr that happen to be shared.
7582	*/
7583	static int restart_after_relocation(struct btrfs_path *left_path,
7584	struct btrfs_path *right_path,
7585	const struct btrfs_key *left_key,
7586	const struct btrfs_key *right_key,
7587	int left_level,
7588	int right_level,
7589	const struct send_ctx *sctx)
7590	{
7591	int root_level;
7592	int ret;
7593
7594	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
7595
7596	btrfs_release_path(p: left_path);
7597	btrfs_release_path(p: right_path);
7598
7599	/*
7600	* Since keys can not be added or removed to/from our roots because they
7601	* are readonly and we do not allow deduplication to run in parallel
7602	* (which can add, remove or change keys), the layout of the trees should
7603	* not change.
7604	*/
7605	left_path->lowest_level = left_level;
7606	ret = search_key_again(sctx, root: sctx->send_root, path: left_path, key: left_key);
7607	if (ret < `0`)
7608	return ret;
7609
7610	right_path->lowest_level = right_level;
7611	ret = search_key_again(sctx, root: sctx->parent_root, path: right_path, key: right_key);
7612	if (ret < `0`)
7613	return ret;
7614
7615	/*
7616	* If the lowest level nodes are leaves, clone them so that they can be
7617	* safely used by changed_cb() while not under the protection of the
7618	* commit root semaphore, even if relocation and reallocation happens in
7619	* parallel.
7620	*/
7621	if (left_level == `0`) {
7622	ret = replace_node_with_clone(path: left_path, level: `0`);
7623	if (ret < `0`)
7624	return ret;
7625	}
7626
7627	if (right_level == `0`) {
7628	ret = replace_node_with_clone(path: right_path, level: `0`);
7629	if (ret < `0`)
7630	return ret;
7631	}
7632
7633	/*
7634	* Now clone the root nodes (unless they happen to be the leaves we have
7635	* already cloned). This is to protect against concurrent snapshotting of
7636	* the send and parent roots (see the comment at btrfs_compare_trees()).
7637	*/
7638	root_level = btrfs_header_level(eb: sctx->send_root->commit_root);
7639	if (root_level > `0`) {
7640	ret = replace_node_with_clone(path: left_path, level: root_level);
7641	if (ret < `0`)
7642	return ret;
7643	}
7644
7645	root_level = btrfs_header_level(eb: sctx->parent_root->commit_root);
7646	if (root_level > `0`) {
7647	ret = replace_node_with_clone(path: right_path, level: root_level);
7648	if (ret < `0`)
7649	return ret;
7650	}
7651
7652	return `0`;
7653	}
7654
7655	/*
7656	* This function compares two trees and calls the provided callback for
7657	* every changed/new/deleted item it finds.
7658	* If shared tree blocks are encountered, whole subtrees are skipped, making
7659	* the compare pretty fast on snapshotted subvolumes.
7660	*
7661	* This currently works on commit roots only. As commit roots are read only,
7662	* we don't do any locking. The commit roots are protected with transactions.
7663	* Transactions are ended and rejoined when a commit is tried in between.
7664	*
7665	* This function checks for modifications done to the trees while comparing.
7666	* If it detects a change, it aborts immediately.
7667	*/
7668	static int btrfs_compare_trees(struct btrfs_root *left_root,
7669	struct btrfs_root right_root, struct* send_ctx *sctx)
7670	{
7671	struct btrfs_fs_info *fs_info = left_root->fs_info;
7672	int ret;
7673	int cmp;
7674	struct btrfs_path *left_path = NULL;
7675	struct btrfs_path *right_path = NULL;
7676	struct btrfs_key left_key;
7677	struct btrfs_key right_key;
7678	char *tmp_buf = NULL;
7679	int left_root_level;
7680	int right_root_level;
7681	int left_level;
7682	int right_level;
7683	int left_end_reached = `0`;
7684	int right_end_reached = `0`;
7685	int advance_left = `0`;
7686	int advance_right = `0`;
7687	u64 left_blockptr;
7688	u64 right_blockptr;
7689	u64 left_gen;
7690	u64 right_gen;
7691	u64 reada_min_gen;
7692
7693	left_path = btrfs_alloc_path();
7694	if (!left_path) {
7695	ret = -ENOMEM;
7696	goto out;
7697	}
7698	right_path = btrfs_alloc_path();
7699	if (!right_path) {
7700	ret = -ENOMEM;
7701	goto out;
7702	}
7703
7704	tmp_buf = kvmalloc(size: fs_info->nodesize, GFP_KERNEL);
7705	if (!tmp_buf) {
7706	ret = -ENOMEM;
7707	goto out;
7708	}
7709
7710	left_path->search_commit_root = `1`;
7711	left_path->skip_locking = `1`;
7712	right_path->search_commit_root = `1`;
7713	right_path->skip_locking = `1`;
7714
7715	/*
7716	* Strategy: Go to the first items of both trees. Then do
7717	*
7718	* If both trees are at level 0
7719	* Compare keys of current items
7720	* If left < right treat left item as new, advance left tree
7721	* and repeat
7722	* If left > right treat right item as deleted, advance right tree
7723	* and repeat
7724	* If left == right do deep compare of items, treat as changed if
7725	* needed, advance both trees and repeat
7726	* If both trees are at the same level but not at level 0
7727	* Compare keys of current nodes/leafs
7728	* If left < right advance left tree and repeat
7729	* If left > right advance right tree and repeat
7730	* If left == right compare blockptrs of the next nodes/leafs
7731	* If they match advance both trees but stay at the same level
7732	* and repeat
7733	* If they don't match advance both trees while allowing to go
7734	* deeper and repeat
7735	* If tree levels are different
7736	* Advance the tree that needs it and repeat
7737	*
7738	* Advancing a tree means:
7739	* If we are at level 0, try to go to the next slot. If that's not
7740	* possible, go one level up and repeat. Stop when we found a level
7741	* where we could go to the next slot. We may at this point be on a
7742	* node or a leaf.
7743	*
7744	* If we are not at level 0 and not on shared tree blocks, go one
7745	* level deeper.
7746	*
7747	* If we are not at level 0 and on shared tree blocks, go one slot to
7748	* the right if possible or go up and right.
7749	*/
7750
7751	down_read(sem: &fs_info->commit_root_sem);
7752	left_level = btrfs_header_level(eb: left_root->commit_root);
7753	left_root_level = left_level;
7754	/*
7755	* We clone the root node of the send and parent roots to prevent races
7756	* with snapshot creation of these roots. Snapshot creation COWs the
7757	* root node of a tree, so after the transaction is committed the old
7758	* extent can be reallocated while this send operation is still ongoing.
7759	* So we clone them, under the commit root semaphore, to be race free.
7760	*/
7761	left_path->nodes[left_level] =
7762	btrfs_clone_extent_buffer(src: left_root->commit_root);
7763	if (!left_path->nodes[left_level]) {
7764	ret = -ENOMEM;
7765	goto out_unlock;
7766	}
7767
7768	right_level = btrfs_header_level(eb: right_root->commit_root);
7769	right_root_level = right_level;
7770	right_path->nodes[right_level] =
7771	btrfs_clone_extent_buffer(src: right_root->commit_root);
7772	if (!right_path->nodes[right_level]) {
7773	ret = -ENOMEM;
7774	goto out_unlock;
7775	}
7776	/*
7777	* Our right root is the parent root, while the left root is the "send"
7778	* root. We know that all new nodes/leaves in the left root must have
7779	* a generation greater than the right root's generation, so we trigger
7780	* readahead for those nodes and leaves of the left root, as we know we
7781	* will need to read them at some point.
7782	*/
7783	reada_min_gen = btrfs_header_generation(eb: right_root->commit_root);
7784
7785	if (left_level == `0`)
7786	btrfs_item_key_to_cpu(eb: left_path->nodes[left_level],
7787	cpu_key: &left_key, nr: left_path->slots[left_level]);
7788	else
7789	btrfs_node_key_to_cpu(eb: left_path->nodes[left_level],
7790	cpu_key: &left_key, nr: left_path->slots[left_level]);
7791	if (right_level == `0`)
7792	btrfs_item_key_to_cpu(eb: right_path->nodes[right_level],
7793	cpu_key: &right_key, nr: right_path->slots[right_level]);
7794	else
7795	btrfs_node_key_to_cpu(eb: right_path->nodes[right_level],
7796	cpu_key: &right_key, nr: right_path->slots[right_level]);
7797
7798	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7799
7800	while (`1`) {
7801	if (need_resched() \|\|
7802	rwsem_is_contended(sem: &fs_info->commit_root_sem)) {
7803	up_read(sem: &fs_info->commit_root_sem);
7804	cond_resched();
7805	down_read(sem: &fs_info->commit_root_sem);
7806	}
7807
7808	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7809	ret = restart_after_relocation(left_path, right_path,
7810	left_key: &left_key, right_key: &right_key,
7811	left_level, right_level,
7812	sctx);
7813	if (ret < `0`)
7814	goto out_unlock;
7815	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7816	}
7817
7818	if (advance_left && !left_end_reached) {
7819	ret = tree_advance(path: left_path, level: &left_level,
7820	root_level: left_root_level,
7821	allow_down: advance_left != ADVANCE_ONLY_NEXT,
7822	key: &left_key, reada_min_gen);
7823	if (ret == -`1`)
7824	left_end_reached = ADVANCE;
7825	else if (ret < `0`)
7826	goto out_unlock;
7827	advance_left = `0`;
7828	}
7829	if (advance_right && !right_end_reached) {
7830	ret = tree_advance(path: right_path, level: &right_level,
7831	root_level: right_root_level,
7832	allow_down: advance_right != ADVANCE_ONLY_NEXT,
7833	key: &right_key, reada_min_gen);
7834	if (ret == -`1`)
7835	right_end_reached = ADVANCE;
7836	else if (ret < `0`)
7837	goto out_unlock;
7838	advance_right = `0`;
7839	}
7840
7841	if (left_end_reached && right_end_reached) {
7842	ret = `0`;
7843	goto out_unlock;
7844	} else if (left_end_reached) {
7845	if (right_level == `0`) {
7846	up_read(sem: &fs_info->commit_root_sem);
7847	ret = changed_cb(left_path, right_path,
7848	key: &right_key,
7849	result: BTRFS_COMPARE_TREE_DELETED,
7850	sctx);
7851	if (ret < `0`)
7852	goto out;
7853	down_read(sem: &fs_info->commit_root_sem);
7854	}
7855	advance_right = ADVANCE;
7856	continue;
7857	} else if (right_end_reached) {
7858	if (left_level == `0`) {
7859	up_read(sem: &fs_info->commit_root_sem);
7860	ret = changed_cb(left_path, right_path,
7861	key: &left_key,
7862	result: BTRFS_COMPARE_TREE_NEW,
7863	sctx);
7864	if (ret < `0`)
7865	goto out;
7866	down_read(sem: &fs_info->commit_root_sem);
7867	}
7868	advance_left = ADVANCE;
7869	continue;
7870	}
7871
7872	if (left_level == `0` && right_level == `0`) {
7873	up_read(sem: &fs_info->commit_root_sem);
7874	cmp = btrfs_comp_cpu_keys(k1: &left_key, k2: &right_key);
7875	if (cmp < `0`) {
7876	ret = changed_cb(left_path, right_path,
7877	key: &left_key,
7878	result: BTRFS_COMPARE_TREE_NEW,
7879	sctx);
7880	advance_left = ADVANCE;
7881	} else if (cmp > `0`) {
7882	ret = changed_cb(left_path, right_path,
7883	key: &right_key,
7884	result: BTRFS_COMPARE_TREE_DELETED,
7885	sctx);
7886	advance_right = ADVANCE;
7887	} else {
7888	enum btrfs_compare_tree_result result;
7889
7890	WARN_ON(!extent_buffer_uptodate(left_path->nodes[`0`]));
7891	ret = tree_compare_item(left_path, right_path,
7892	tmp_buf);
7893	if (ret)
7894	result = BTRFS_COMPARE_TREE_CHANGED;
7895	else
7896	result = BTRFS_COMPARE_TREE_SAME;
7897	ret = changed_cb(left_path, right_path,
7898	key: &left_key, result, sctx);
7899	advance_left = ADVANCE;
7900	advance_right = ADVANCE;
7901	}
7902
7903	if (ret < `0`)
7904	goto out;
7905	down_read(sem: &fs_info->commit_root_sem);
7906	} else if (left_level == right_level) {
7907	cmp = btrfs_comp_cpu_keys(k1: &left_key, k2: &right_key);
7908	if (cmp < `0`) {
7909	advance_left = ADVANCE;
7910	} else if (cmp > `0`) {
7911	advance_right = ADVANCE;
7912	} else {
7913	left_blockptr = btrfs_node_blockptr(
7914	eb: left_path->nodes[left_level],
7915	nr: left_path->slots[left_level]);
7916	right_blockptr = btrfs_node_blockptr(
7917	eb: right_path->nodes[right_level],
7918	nr: right_path->slots[right_level]);
7919	left_gen = btrfs_node_ptr_generation(
7920	eb: left_path->nodes[left_level],
7921	nr: left_path->slots[left_level]);
7922	right_gen = btrfs_node_ptr_generation(
7923	eb: right_path->nodes[right_level],
7924	nr: right_path->slots[right_level]);
7925	if (left_blockptr == right_blockptr &&
7926	left_gen == right_gen) {
7927	/*
7928	* As we're on a shared block, don't
7929	* allow to go deeper.
7930	*/
7931	advance_left = ADVANCE_ONLY_NEXT;
7932	advance_right = ADVANCE_ONLY_NEXT;
7933	} else {
7934	advance_left = ADVANCE;
7935	advance_right = ADVANCE;
7936	}
7937	}
7938	} else if (left_level < right_level) {
7939	advance_right = ADVANCE;
7940	} else {
7941	advance_left = ADVANCE;
7942	}
7943	}
7944
7945	out_unlock:
7946	up_read(sem: &fs_info->commit_root_sem);
7947	out:
7948	btrfs_free_path(p: left_path);
7949	btrfs_free_path(p: right_path);
7950	kvfree(addr: tmp_buf);
7951	return ret;
7952	}
7953
7954	static int send_subvol(struct send_ctx *sctx)
7955	{
7956	int ret;
7957
7958	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
7959	ret = send_header(sctx);
7960	if (ret < `0`)
7961	goto out;
7962	}
7963
7964	ret = send_subvol_begin(sctx);
7965	if (ret < `0`)
7966	goto out;
7967
7968	if (sctx->parent_root) {
7969	ret = btrfs_compare_trees(left_root: sctx->send_root, right_root: sctx->parent_root, sctx);
7970	if (ret < `0`)
7971	goto out;
7972	ret = finish_inode_if_needed(sctx, at_end: `1`);
7973	if (ret < `0`)
7974	goto out;
7975	} else {
7976	ret = full_send_tree(sctx);
7977	if (ret < `0`)
7978	goto out;
7979	}
7980
7981	out:
7982	free_recorded_refs(sctx);
7983	return ret;
7984	}
7985
7986	/*
7987	* If orphan cleanup did remove any orphans from a root, it means the tree
7988	* was modified and therefore the commit root is not the same as the current
7989	* root anymore. This is a problem, because send uses the commit root and
7990	* therefore can see inode items that don't exist in the current root anymore,
7991	* and for example make calls to btrfs_iget, which will do tree lookups based
7992	* on the current root and not on the commit root. Those lookups will fail,
7993	* returning a -ESTALE error, and making send fail with that error. So make
7994	* sure a send does not see any orphans we have just removed, and that it will
7995	* see the same inodes regardless of whether a transaction commit happened
7996	* before it started (meaning that the commit root will be the same as the
7997	* current root) or not.
7998	*/
7999	static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
8000	{
8001	int i;
8002	struct btrfs_trans_handle *trans = NULL;
8003
8004	again:
8005	if (sctx->parent_root &&
8006	sctx->parent_root->node != sctx->parent_root->commit_root)
8007	goto commit_trans;
8008
8009	for (i = `0`; i < sctx->clone_roots_cnt; i++)
8010	if (sctx->clone_roots[i].root->node !=
8011	sctx->clone_roots[i].root->commit_root)
8012	goto commit_trans;
8013
8014	if (trans)
8015	return btrfs_end_transaction(trans);
8016
8017	return `0`;
8018
8019	commit_trans:
8020	/ Use any root, all fs roots will get their commit roots updated. /
8021	if (!trans) {
8022	trans = btrfs_join_transaction(root: sctx->send_root);
8023	if (IS_ERR(ptr: trans))
8024	return PTR_ERR(ptr: trans);
8025	goto again;
8026	}
8027
8028	return btrfs_commit_transaction(trans);
8029	}
8030
8031	/*
8032	* Make sure any existing dellaloc is flushed for any root used by a send
8033	* operation so that we do not miss any data and we do not race with writeback
8034	* finishing and changing a tree while send is using the tree. This could
8035	* happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
8036	* a send operation then uses the subvolume.
8037	* After flushing delalloc ensure_commit_roots_uptodate() must be called.
8038	*/
8039	static int flush_delalloc_roots(struct send_ctx *sctx)
8040	{
8041	struct btrfs_root *root = sctx->parent_root;
8042	int ret;
8043	int i;
8044
8045	if (root) {
8046	ret = btrfs_start_delalloc_snapshot(root, in_reclaim_context: false);
8047	if (ret)
8048	return ret;
8049	btrfs_wait_ordered_extents(root, U64_MAX, range_start: `0`, U64_MAX);
8050	}
8051
8052	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
8053	root = sctx->clone_roots[i].root;
8054	ret = btrfs_start_delalloc_snapshot(root, in_reclaim_context: false);
8055	if (ret)
8056	return ret;
8057	btrfs_wait_ordered_extents(root, U64_MAX, range_start: `0`, U64_MAX);
8058	}
8059
8060	return `0`;
8061	}
8062
8063	static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
8064	{
8065	spin_lock(lock: &root->root_item_lock);
8066	root->send_in_progress--;
8067	/*
8068	* Not much left to do, we don't know why it's unbalanced and
8069	* can't blindly reset it to 0.
8070	*/
8071	if (root->send_in_progress < `0`)
8072	btrfs_err(root->fs_info,
8073	"send_in_progress unbalanced %d root %llu",
8074	root->send_in_progress, root->root_key.objectid);
8075	spin_unlock(lock: &root->root_item_lock);
8076	}
8077
8078	static void dedupe_in_progress_warn(const struct btrfs_root *root)
8079	{
8080	btrfs_warn_rl(root->fs_info,
8081	"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
8082	root->root_key.objectid, root->dedupe_in_progress);
8083	}
8084
8085	long btrfs_ioctl_send(struct inode inode, struct* btrfs_ioctl_send_args *arg)
8086	{
8087	int ret = `0`;
8088	struct btrfs_root *send_root = BTRFS_I(inode)->root;
8089	struct btrfs_fs_info *fs_info = send_root->fs_info;
8090	struct btrfs_root *clone_root;
8091	struct send_ctx *sctx = NULL;
8092	u32 i;
8093	u64 *clone_sources_tmp = NULL;
8094	int clone_sources_to_rollback = `0`;
8095	size_t alloc_size;
8096	int sort_clone_roots = `0`;
8097	struct btrfs_lru_cache_entry *entry;
8098	struct btrfs_lru_cache_entry *tmp;
8099
8100	if (!capable(CAP_SYS_ADMIN))
8101	return -EPERM;
8102
8103	/*
8104	* The subvolume must remain read-only during send, protect against
8105	* making it RW. This also protects against deletion.
8106	*/
8107	spin_lock(lock: &send_root->root_item_lock);
8108	if (btrfs_root_readonly(root: send_root) && send_root->dedupe_in_progress) {
8109	dedupe_in_progress_warn(root: send_root);
8110	spin_unlock(lock: &send_root->root_item_lock);
8111	return -EAGAIN;
8112	}
8113	send_root->send_in_progress++;
8114	spin_unlock(lock: &send_root->root_item_lock);
8115
8116	/*
8117	* Userspace tools do the checks and warn the user if it's
8118	* not RO.
8119	*/
8120	if (!btrfs_root_readonly(root: send_root)) {
8121	ret = -EPERM;
8122	goto out;
8123	}
8124
8125	/*
8126	* Check that we don't overflow at later allocations, we request
8127	* clone_sources_count + 1 items, and compare to unsigned long inside
8128	* access_ok. Also set an upper limit for allocation size so this can't
8129	* easily exhaust memory. Max number of clone sources is about 200K.
8130	*/
8131	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
8132	ret = -EINVAL;
8133	goto out;
8134	}
8135
8136	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
8137	ret = -EOPNOTSUPP;
8138	goto out;
8139	}
8140
8141	sctx = kzalloc(size: sizeof(struct send_ctx), GFP_KERNEL);
8142	if (!sctx) {
8143	ret = -ENOMEM;
8144	goto out;
8145	}
8146
8147	INIT_LIST_HEAD(list: &sctx->new_refs);
8148	INIT_LIST_HEAD(list: &sctx->deleted_refs);
8149
8150	btrfs_lru_cache_init(cache: &sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
8151	btrfs_lru_cache_init(cache: &sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
8152	btrfs_lru_cache_init(cache: &sctx->dir_created_cache,
8153	SEND_MAX_DIR_CREATED_CACHE_SIZE);
8154	/*
8155	* This cache is periodically trimmed to a fixed size elsewhere, see
8156	* cache_dir_utimes() and trim_dir_utimes_cache().
8157	*/
8158	btrfs_lru_cache_init(cache: &sctx->dir_utimes_cache, max_size: `0`);
8159
8160	sctx->pending_dir_moves = RB_ROOT;
8161	sctx->waiting_dir_moves = RB_ROOT;
8162	sctx->orphan_dirs = RB_ROOT;
8163	sctx->rbtree_new_refs = RB_ROOT;
8164	sctx->rbtree_deleted_refs = RB_ROOT;
8165
8166	sctx->flags = arg->flags;
8167
8168	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
8169	if (arg->version > BTRFS_SEND_STREAM_VERSION) {
8170	ret = -EPROTO;
8171	goto out;
8172	}
8173	/ Zero means "use the highest version" /
8174	sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
8175	} else {
8176	sctx->proto = `1`;
8177	}
8178	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < `2`) {
8179	ret = -EINVAL;
8180	goto out;
8181	}
8182
8183	sctx->send_filp = fget(fd: arg->send_fd);
8184	if (!sctx->send_filp \|\| !(sctx->send_filp->f_mode & FMODE_WRITE)) {
8185	ret = -EBADF;
8186	goto out;
8187	}
8188
8189	sctx->send_root = send_root;
8190	/*
8191	* Unlikely but possible, if the subvolume is marked for deletion but
8192	* is slow to remove the directory entry, send can still be started
8193	*/
8194	if (btrfs_root_dead(root: sctx->send_root)) {
8195	ret = -EPERM;
8196	goto out;
8197	}
8198
8199	sctx->clone_roots_cnt = arg->clone_sources_count;
8200
8201	if (sctx->proto >= `2`) {
8202	u32 send_buf_num_pages;
8203
8204	sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
8205	sctx->send_buf = vmalloc(size: sctx->send_max_size);
8206	if (!sctx->send_buf) {
8207	ret = -ENOMEM;
8208	goto out;
8209	}
8210	send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
8211	sctx->send_buf_pages = kcalloc(n: send_buf_num_pages,
8212	size: sizeof(*sctx->send_buf_pages),
8213	GFP_KERNEL);
8214	if (!sctx->send_buf_pages) {
8215	ret = -ENOMEM;
8216	goto out;
8217	}
8218	for (i = `0`; i < send_buf_num_pages; i++) {
8219	sctx->send_buf_pages[i] =
8220	vmalloc_to_page(addr: sctx->send_buf + (i << PAGE_SHIFT));
8221	}
8222	} else {
8223	sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
8224	sctx->send_buf = kvmalloc(size: sctx->send_max_size, GFP_KERNEL);
8225	}
8226	if (!sctx->send_buf) {
8227	ret = -ENOMEM;
8228	goto out;
8229	}
8230
8231	sctx->clone_roots = kvcalloc(n: arg->clone_sources_count + `1`,
8232	size: sizeof(*sctx->clone_roots),
8233	GFP_KERNEL);
8234	if (!sctx->clone_roots) {
8235	ret = -ENOMEM;
8236	goto out;
8237	}
8238
8239	alloc_size = array_size(sizeof(*arg->clone_sources),
8240	arg->clone_sources_count);
8241
8242	if (arg->clone_sources_count) {
8243	clone_sources_tmp = kvmalloc(size: alloc_size, GFP_KERNEL);
8244	if (!clone_sources_tmp) {
8245	ret = -ENOMEM;
8246	goto out;
8247	}
8248
8249	ret = copy_from_user(to: clone_sources_tmp, from: arg->clone_sources,
8250	n: alloc_size);
8251	if (ret) {
8252	ret = -EFAULT;
8253	goto out;
8254	}
8255
8256	for (i = `0`; i < arg->clone_sources_count; i++) {
8257	clone_root = btrfs_get_fs_root(fs_info,
8258	objectid: clone_sources_tmp[i], check_ref: true);
8259	if (IS_ERR(ptr: clone_root)) {
8260	ret = PTR_ERR(ptr: clone_root);
8261	goto out;
8262	}
8263	spin_lock(lock: &clone_root->root_item_lock);
8264	if (!btrfs_root_readonly(root: clone_root) \|\|
8265	btrfs_root_dead(root: clone_root)) {
8266	spin_unlock(lock: &clone_root->root_item_lock);
8267	btrfs_put_root(root: clone_root);
8268	ret = -EPERM;
8269	goto out;
8270	}
8271	if (clone_root->dedupe_in_progress) {
8272	dedupe_in_progress_warn(root: clone_root);
8273	spin_unlock(lock: &clone_root->root_item_lock);
8274	btrfs_put_root(root: clone_root);
8275	ret = -EAGAIN;
8276	goto out;
8277	}
8278	clone_root->send_in_progress++;
8279	spin_unlock(lock: &clone_root->root_item_lock);
8280
8281	sctx->clone_roots[i].root = clone_root;
8282	clone_sources_to_rollback = i + `1`;
8283	}
8284	kvfree(addr: clone_sources_tmp);
8285	clone_sources_tmp = NULL;
8286	}
8287
8288	if (arg->parent_root) {
8289	sctx->parent_root = btrfs_get_fs_root(fs_info, objectid: arg->parent_root,
8290	check_ref: true);
8291	if (IS_ERR(ptr: sctx->parent_root)) {
8292	ret = PTR_ERR(ptr: sctx->parent_root);
8293	goto out;
8294	}
8295
8296	spin_lock(lock: &sctx->parent_root->root_item_lock);
8297	sctx->parent_root->send_in_progress++;
8298	if (!btrfs_root_readonly(root: sctx->parent_root) \|\|
8299	btrfs_root_dead(root: sctx->parent_root)) {
8300	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8301	ret = -EPERM;
8302	goto out;
8303	}
8304	if (sctx->parent_root->dedupe_in_progress) {
8305	dedupe_in_progress_warn(root: sctx->parent_root);
8306	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8307	ret = -EAGAIN;
8308	goto out;
8309	}
8310	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8311	}
8312
8313	/*
8314	* Clones from send_root are allowed, but only if the clone source
8315	* is behind the current send position. This is checked while searching
8316	* for possible clone sources.
8317	*/
8318	sctx->clone_roots[sctx->clone_roots_cnt++].root =
8319	btrfs_grab_root(root: sctx->send_root);
8320
8321	/ We do a bsearch later /
8322	sort(base: sctx->clone_roots, num: sctx->clone_roots_cnt,
8323	size: sizeof(*sctx->clone_roots), cmp_func: __clone_root_cmp_sort,
8324	NULL);
8325	sort_clone_roots = `1`;
8326
8327	ret = flush_delalloc_roots(sctx);
8328	if (ret)
8329	goto out;
8330
8331	ret = ensure_commit_roots_uptodate(sctx);
8332	if (ret)
8333	goto out;
8334
8335	ret = send_subvol(sctx);
8336	if (ret < `0`)
8337	goto out;
8338
8339	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
8340	ret = send_utimes(sctx, ino: entry->key, gen: entry->gen);
8341	if (ret < `0`)
8342	goto out;
8343	btrfs_lru_cache_remove(cache: &sctx->dir_utimes_cache, entry);
8344	}
8345
8346	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
8347	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_END);
8348	if (ret < `0`)
8349	goto out;
8350	ret = send_cmd(sctx);
8351	if (ret < `0`)
8352	goto out;
8353	}
8354
8355	out:
8356	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
8357	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
8358	struct rb_node *n;
8359	struct pending_dir_move *pm;
8360
8361	n = rb_first(&sctx->pending_dir_moves);
8362	pm = rb_entry(n, struct pending_dir_move, node);
8363	while (!list_empty(head: &pm->list)) {
8364	struct pending_dir_move *pm2;
8365
8366	pm2 = list_first_entry(&pm->list,
8367	struct pending_dir_move, list);
8368	free_pending_move(sctx, m: pm2);
8369	}
8370	free_pending_move(sctx, m: pm);
8371	}
8372
8373	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
8374	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
8375	struct rb_node *n;
8376	struct waiting_dir_move *dm;
8377
8378	n = rb_first(&sctx->waiting_dir_moves);
8379	dm = rb_entry(n, struct waiting_dir_move, node);
8380	rb_erase(&dm->node, &sctx->waiting_dir_moves);
8381	kfree(objp: dm);
8382	}
8383
8384	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
8385	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
8386	struct rb_node *n;
8387	struct orphan_dir_info *odi;
8388
8389	n = rb_first(&sctx->orphan_dirs);
8390	odi = rb_entry(n, struct orphan_dir_info, node);
8391	free_orphan_dir_info(sctx, odi);
8392	}
8393
8394	if (sort_clone_roots) {
8395	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
8396	btrfs_root_dec_send_in_progress(
8397	root: sctx->clone_roots[i].root);
8398	btrfs_put_root(root: sctx->clone_roots[i].root);
8399	}
8400	} else {
8401	for (i = `0`; sctx && i < clone_sources_to_rollback; i++) {
8402	btrfs_root_dec_send_in_progress(
8403	root: sctx->clone_roots[i].root);
8404	btrfs_put_root(root: sctx->clone_roots[i].root);
8405	}
8406
8407	btrfs_root_dec_send_in_progress(root: send_root);
8408	}
8409	if (sctx && !IS_ERR_OR_NULL(ptr: sctx->parent_root)) {
8410	btrfs_root_dec_send_in_progress(root: sctx->parent_root);
8411	btrfs_put_root(root: sctx->parent_root);
8412	}
8413
8414	kvfree(addr: clone_sources_tmp);
8415
8416	if (sctx) {
8417	if (sctx->send_filp)
8418	fput(sctx->send_filp);
8419
8420	kvfree(addr: sctx->clone_roots);
8421	kfree(objp: sctx->send_buf_pages);
8422	kvfree(addr: sctx->send_buf);
8423	kvfree(addr: sctx->verity_descriptor);
8424
8425	close_current_inode(sctx);
8426
8427	btrfs_lru_cache_clear(cache: &sctx->name_cache);
8428	btrfs_lru_cache_clear(cache: &sctx->backref_cache);
8429	btrfs_lru_cache_clear(cache: &sctx->dir_created_cache);
8430	btrfs_lru_cache_clear(cache: &sctx->dir_utimes_cache);
8431
8432	kfree(objp: sctx);
8433	}
8434
8435	return ret;
8436	}
8437

source code of linux/fs/btrfs/send.c