file.c source code [linux/fs/btrfs/file.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2007 Oracle. All rights reserved.
4	*/
5
6	#include <linux/fs.h>
7	#include <linux/pagemap.h>
8	#include <linux/time.h>
9	#include <linux/init.h>
10	#include <linux/string.h>
11	#include <linux/backing-dev.h>
12	#include <linux/falloc.h>
13	#include <linux/writeback.h>
14	#include <linux/compat.h>
15	#include <linux/slab.h>
16	#include <linux/btrfs.h>
17	#include <linux/uio.h>
18	#include <linux/iversion.h>
19	#include <linux/fsverity.h>
20	#include <linux/iomap.h>
21	#include "ctree.h"
22	#include "disk-io.h"
23	#include "transaction.h"
24	#include "btrfs_inode.h"
25	#include "tree-log.h"
26	#include "locking.h"
27	#include "qgroup.h"
28	#include "compression.h"
29	#include "delalloc-space.h"
30	#include "reflink.h"
31	#include "subpage.h"
32	#include "fs.h"
33	#include "accessors.h"
34	#include "extent-tree.h"
35	#include "file-item.h"
36	#include "ioctl.h"
37	#include "file.h"
38	#include "super.h"
39
40	/ simple helper to fault in pages and copy. This should go away*
41	* and be replaced with calls into generic code.
42	*/
43	static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
44	struct page **prepared_pages,
45	struct iov_iter *i)
46	{
47	size_t copied = `0`;
48	size_t total_copied = `0`;
49	int pg = `0`;
50	int offset = offset_in_page(pos);
51
52	while (write_bytes > `0`) {
53	size_t count = min_t(size_t,
54	PAGE_SIZE - offset, write_bytes);
55	struct page *page = prepared_pages[pg];
56	/*
57	* Copy data from userspace to the current page
58	*/
59	copied = copy_page_from_iter_atomic(page, offset, bytes: count, i);
60
61	/ Flush processor's dcache for this page /
62	flush_dcache_page(page);
63
64	/*
65	* if we get a partial write, we can end up with
66	* partially up to date pages. These add
67	* a lot of complexity, so make sure they don't
68	* happen by forcing this copy to be retried.
69	*
70	* The rest of the btrfs_file_write code will fall
71	* back to page at a time copies after we return 0.
72	*/
73	if (unlikely(copied < count)) {
74	if (!PageUptodate(page)) {
75	iov_iter_revert(i, bytes: copied);
76	copied = `0`;
77	}
78	if (!copied)
79	break;
80	}
81
82	write_bytes -= copied;
83	total_copied += copied;
84	offset += copied;
85	if (offset == PAGE_SIZE) {
86	pg++;
87	offset = `0`;
88	}
89	}
90	return total_copied;
91	}
92
93	/*
94	* unlocks pages after btrfs_file_write is done with them
95	*/
96	static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
97	struct page **pages, size_t num_pages,
98	u64 pos, u64 copied)
99	{
100	size_t i;
101	u64 block_start = round_down(pos, fs_info->sectorsize);
102	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
103
104	ASSERT(block_len <= U32_MAX);
105	for (i = `0`; i < num_pages; i++) {
106	/ page checked is some magic around finding pages that*
107	* have been modified without going through btrfs_set_page_dirty
108	* clear it here. There should be no need to mark the pages
109	* accessed as prepare_pages should have marked them accessed
110	* in prepare_pages via find_or_create_page()
111	*/
112	btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
113	start: block_start, len: block_len);
114	unlock_page(page: pages[i]);
115	put_page(page: pages[i]);
116	}
117	}
118
119	/*
120	* After btrfs_copy_from_user(), update the following things for delalloc:
121	* - Mark newly dirtied pages as DELALLOC in the io tree.
122	* Used to advise which range is to be written back.
123	* - Mark modified pages as Uptodate/Dirty and not needing COW fixup
124	* - Update inode size for past EOF write
125	*/
126	int btrfs_dirty_pages(struct btrfs_inode inode, struct* page **pages,
127	size_t num_pages, loff_t pos, size_t write_bytes,
128	struct extent_state **cached, bool noreserve)
129	{
130	struct btrfs_fs_info *fs_info = inode->root->fs_info;
131	int err = `0`;
132	int i;
133	u64 num_bytes;
134	u64 start_pos;
135	u64 end_of_last_block;
136	u64 end_pos = pos + write_bytes;
137	loff_t isize = i_size_read(inode: &inode->vfs_inode);
138	unsigned int extra_bits = `0`;
139
140	if (write_bytes == `0`)
141	return `0`;
142
143	if (noreserve)
144	extra_bits \|= EXTENT_NORESERVE;
145
146	start_pos = round_down(pos, fs_info->sectorsize);
147	num_bytes = round_up(write_bytes + pos - start_pos,
148	fs_info->sectorsize);
149	ASSERT(num_bytes <= U32_MAX);
150
151	end_of_last_block = start_pos + num_bytes - `1`;
152
153	/*
154	* The pages may have already been dirty, clear out old accounting so
155	* we can set things up properly
156	*/
157	clear_extent_bit(tree: &inode->io_tree, start: start_pos, end: end_of_last_block,
158	bits: EXTENT_DELALLOC \| EXTENT_DO_ACCOUNTING \| EXTENT_DEFRAG,
159	cached);
160
161	err = btrfs_set_extent_delalloc(inode, start: start_pos, end: end_of_last_block,
162	extra_bits, cached_state: cached);
163	if (err)
164	return err;
165
166	for (i = `0`; i < num_pages; i++) {
167	struct page *p = pages[i];
168
169	btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
170	start: start_pos, len: num_bytes);
171	btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
172	start: start_pos, len: num_bytes);
173	btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
174	start: start_pos, len: num_bytes);
175	}
176
177	/*
178	* we've only changed i_size in ram, and we haven't updated
179	* the disk i_size. There is no need to log the inode
180	* at this time.
181	*/
182	if (end_pos > isize)
183	i_size_write(inode: &inode->vfs_inode, i_size: end_pos);
184	return `0`;
185	}
186
187	/*
188	* this is very complex, but the basic idea is to drop all extents
189	* in the range start - end. hint_block is filled in with a block number
190	* that would be a good hint to the block allocator for this file.
191	*
192	* If an extent intersects the range but is not entirely inside the range
193	* it is either truncated or split. Anything entirely inside the range
194	* is deleted from the tree.
195	*
196	* Note: the VFS' inode number of bytes is not updated, it's up to the caller
197	* to deal with that. We set the field 'bytes_found' of the arguments structure
198	* with the number of allocated bytes found in the target range, so that the
199	* caller can update the inode's number of bytes in an atomic way when
200	* replacing extents in a range to avoid races with stat(2).
201	*/
202	int btrfs_drop_extents(struct btrfs_trans_handle *trans,
203	struct btrfs_root root, struct* btrfs_inode *inode,
204	struct btrfs_drop_extents_args *args)
205	{
206	struct btrfs_fs_info *fs_info = root->fs_info;
207	struct extent_buffer *leaf;
208	struct btrfs_file_extent_item *fi;
209	struct btrfs_ref ref = { `0` };
210	struct btrfs_key key;
211	struct btrfs_key new_key;
212	u64 ino = btrfs_ino(inode);
213	u64 search_start = args->start;
214	u64 disk_bytenr = `0`;
215	u64 num_bytes = `0`;
216	u64 extent_offset = `0`;
217	u64 extent_end = `0`;
218	u64 last_end = args->start;
219	int del_nr = `0`;
220	int del_slot = `0`;
221	int extent_type;
222	int recow;
223	int ret;
224	int modify_tree = -`1`;
225	int update_refs;
226	int found = `0`;
227	struct btrfs_path *path = args->path;
228
229	args->bytes_found = `0`;
230	args->extent_inserted = false;
231
232	/ Must always have a path if ->replace_extent is true /
233	ASSERT(!(args->replace_extent && !args->path));
234
235	if (!path) {
236	path = btrfs_alloc_path();
237	if (!path) {
238	ret = -ENOMEM;
239	goto out;
240	}
241	}
242
243	if (args->drop_cache)
244	btrfs_drop_extent_map_range(inode, start: args->start, end: args->end - `1`, skip_pinned: false);
245
246	if (args->start >= inode->disk_i_size && !args->replace_extent)
247	modify_tree = `0`;
248
249	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
250	while (`1`) {
251	recow = `0`;
252	ret = btrfs_lookup_file_extent(trans, root, path, objectid: ino,
253	bytenr: search_start, mod: modify_tree);
254	if (ret < `0`)
255	break;
256	if (ret > `0` && path->slots[`0`] > `0` && search_start == args->start) {
257	leaf = path->nodes[`0`];
258	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`] - `1`);
259	if (key.objectid == ino &&
260	key.type == BTRFS_EXTENT_DATA_KEY)
261	path->slots[`0`]--;
262	}
263	ret = `0`;
264	next_slot:
265	leaf = path->nodes[`0`];
266	if (path->slots[`0`] >= btrfs_header_nritems(eb: leaf)) {
267	BUG_ON(del_nr > `0`);
268	ret = btrfs_next_leaf(root, path);
269	if (ret < `0`)
270	break;
271	if (ret > `0`) {
272	ret = `0`;
273	break;
274	}
275	leaf = path->nodes[`0`];
276	recow = `1`;
277	}
278
279	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
280
281	if (key.objectid > ino)
282	break;
283	if (WARN_ON_ONCE(key.objectid < ino) \|\|
284	key.type < BTRFS_EXTENT_DATA_KEY) {
285	ASSERT(del_nr == `0`);
286	path->slots[`0`]++;
287	goto next_slot;
288	}
289	if (key.type > BTRFS_EXTENT_DATA_KEY \|\| key.offset >= args->end)
290	break;
291
292	fi = btrfs_item_ptr(leaf, path->slots[`0`],
293	struct btrfs_file_extent_item);
294	extent_type = btrfs_file_extent_type(eb: leaf, s: fi);
295
296	if (extent_type == BTRFS_FILE_EXTENT_REG \|\|
297	extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
298	disk_bytenr = btrfs_file_extent_disk_bytenr(eb: leaf, s: fi);
299	num_bytes = btrfs_file_extent_disk_num_bytes(eb: leaf, s: fi);
300	extent_offset = btrfs_file_extent_offset(eb: leaf, s: fi);
301	extent_end = key.offset +
302	btrfs_file_extent_num_bytes(eb: leaf, s: fi);
303	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
304	extent_end = key.offset +
305	btrfs_file_extent_ram_bytes(eb: leaf, s: fi);
306	} else {
307	/ can't happen /
308	BUG();
309	}
310
311	/*
312	* Don't skip extent items representing 0 byte lengths. They
313	* used to be created (bug) if while punching holes we hit
314	* -ENOSPC condition. So if we find one here, just ensure we
315	* delete it, otherwise we would insert a new file extent item
316	* with the same key (offset) as that 0 bytes length file
317	* extent item in the call to setup_items_for_insert() later
318	* in this function.
319	*/
320	if (extent_end == key.offset && extent_end >= search_start) {
321	last_end = extent_end;
322	goto delete_extent_item;
323	}
324
325	if (extent_end <= search_start) {
326	path->slots[`0`]++;
327	goto next_slot;
328	}
329
330	found = `1`;
331	search_start = max(key.offset, args->start);
332	if (recow \|\| !modify_tree) {
333	modify_tree = -`1`;
334	btrfs_release_path(p: path);
335	continue;
336	}
337
338	/*
339	* \| - range to drop - \|
340	* \| -------- extent -------- \|
341	*/
342	if (args->start > key.offset && args->end < extent_end) {
343	BUG_ON(del_nr > `0`);
344	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
345	ret = -EOPNOTSUPP;
346	break;
347	}
348
349	memcpy(&new_key, &key, sizeof(new_key));
350	new_key.offset = args->start;
351	ret = btrfs_duplicate_item(trans, root, path,
352	new_key: &new_key);
353	if (ret == -EAGAIN) {
354	btrfs_release_path(p: path);
355	continue;
356	}
357	if (ret < `0`)
358	break;
359
360	leaf = path->nodes[`0`];
361	fi = btrfs_item_ptr(leaf, path->slots[`0`] - `1`,
362	struct btrfs_file_extent_item);
363	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
364	val: args->start - key.offset);
365
366	fi = btrfs_item_ptr(leaf, path->slots[`0`],
367	struct btrfs_file_extent_item);
368
369	extent_offset += args->start - key.offset;
370	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: extent_offset);
371	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
372	val: extent_end - args->start);
373	btrfs_mark_buffer_dirty(trans, buf: leaf);
374
375	if (update_refs && disk_bytenr > `0`) {
376	btrfs_init_generic_ref(generic_ref: &ref,
377	action: BTRFS_ADD_DELAYED_REF,
378	bytenr: disk_bytenr, len: num_bytes, parent: `0`,
379	owning_root: root->root_key.objectid);
380	btrfs_init_data_ref(generic_ref: &ref,
381	ref_root: root->root_key.objectid,
382	ino: new_key.objectid,
383	offset: args->start - extent_offset,
384	mod_root: `0`, skip_qgroup: false);
385	ret = btrfs_inc_extent_ref(trans, generic_ref: &ref);
386	if (ret) {
387	btrfs_abort_transaction(trans, ret);
388	break;
389	}
390	}
391	key.offset = args->start;
392	}
393	/*
394	* From here on out we will have actually dropped something, so
395	* last_end can be updated.
396	*/
397	last_end = extent_end;
398
399	/*
400	* \| ---- range to drop ----- \|
401	* \| -------- extent -------- \|
402	*/
403	if (args->start <= key.offset && args->end < extent_end) {
404	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
405	ret = -EOPNOTSUPP;
406	break;
407	}
408
409	memcpy(&new_key, &key, sizeof(new_key));
410	new_key.offset = args->end;
411	btrfs_set_item_key_safe(trans, path, new_key: &new_key);
412
413	extent_offset += args->end - key.offset;
414	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: extent_offset);
415	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
416	val: extent_end - args->end);
417	btrfs_mark_buffer_dirty(trans, buf: leaf);
418	if (update_refs && disk_bytenr > `0`)
419	args->bytes_found += args->end - key.offset;
420	break;
421	}
422
423	search_start = extent_end;
424	/*
425	* \| ---- range to drop ----- \|
426	* \| -------- extent -------- \|
427	*/
428	if (args->start > key.offset && args->end >= extent_end) {
429	BUG_ON(del_nr > `0`);
430	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
431	ret = -EOPNOTSUPP;
432	break;
433	}
434
435	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
436	val: args->start - key.offset);
437	btrfs_mark_buffer_dirty(trans, buf: leaf);
438	if (update_refs && disk_bytenr > `0`)
439	args->bytes_found += extent_end - args->start;
440	if (args->end == extent_end)
441	break;
442
443	path->slots[`0`]++;
444	goto next_slot;
445	}
446
447	/*
448	* \| ---- range to drop ----- \|
449	* \| ------ extent ------ \|
450	*/
451	if (args->start <= key.offset && args->end >= extent_end) {
452	delete_extent_item:
453	if (del_nr == `0`) {
454	del_slot = path->slots[`0`];
455	del_nr = `1`;
456	} else {
457	BUG_ON(del_slot + del_nr != path->slots[`0`]);
458	del_nr++;
459	}
460
461	if (update_refs &&
462	extent_type == BTRFS_FILE_EXTENT_INLINE) {
463	args->bytes_found += extent_end - key.offset;
464	extent_end = ALIGN(extent_end,
465	fs_info->sectorsize);
466	} else if (update_refs && disk_bytenr > `0`) {
467	btrfs_init_generic_ref(generic_ref: &ref,
468	action: BTRFS_DROP_DELAYED_REF,
469	bytenr: disk_bytenr, len: num_bytes, parent: `0`,
470	owning_root: root->root_key.objectid);
471	btrfs_init_data_ref(generic_ref: &ref,
472	ref_root: root->root_key.objectid,
473	ino: key.objectid,
474	offset: key.offset - extent_offset, mod_root: `0`,
475	skip_qgroup: false);
476	ret = btrfs_free_extent(trans, ref: &ref);
477	if (ret) {
478	btrfs_abort_transaction(trans, ret);
479	break;
480	}
481	args->bytes_found += extent_end - key.offset;
482	}
483
484	if (args->end == extent_end)
485	break;
486
487	if (path->slots[`0`] + `1` < btrfs_header_nritems(eb: leaf)) {
488	path->slots[`0`]++;
489	goto next_slot;
490	}
491
492	ret = btrfs_del_items(trans, root, path, slot: del_slot,
493	nr: del_nr);
494	if (ret) {
495	btrfs_abort_transaction(trans, ret);
496	break;
497	}
498
499	del_nr = `0`;
500	del_slot = `0`;
501
502	btrfs_release_path(p: path);
503	continue;
504	}
505
506	BUG();
507	}
508
509	if (!ret && del_nr > `0`) {
510	/*
511	* Set path->slots[0] to first slot, so that after the delete
512	* if items are move off from our leaf to its immediate left or
513	* right neighbor leafs, we end up with a correct and adjusted
514	* path->slots[0] for our insertion (if args->replace_extent).
515	*/
516	path->slots[`0`] = del_slot;
517	ret = btrfs_del_items(trans, root, path, slot: del_slot, nr: del_nr);
518	if (ret)
519	btrfs_abort_transaction(trans, ret);
520	}
521
522	leaf = path->nodes[`0`];
523	/*
524	* If btrfs_del_items() was called, it might have deleted a leaf, in
525	* which case it unlocked our path, so check path->locks[0] matches a
526	* write lock.
527	*/
528	if (!ret && args->replace_extent &&
529	path->locks[`0`] == BTRFS_WRITE_LOCK &&
530	btrfs_leaf_free_space(leaf) >=
531	sizeof(struct btrfs_item) + args->extent_item_size) {
532
533	key.objectid = ino;
534	key.type = BTRFS_EXTENT_DATA_KEY;
535	key.offset = args->start;
536	if (!del_nr && path->slots[`0`] < btrfs_header_nritems(eb: leaf)) {
537	struct btrfs_key slot_key;
538
539	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &slot_key, nr: path->slots[`0`]);
540	if (btrfs_comp_cpu_keys(k1: &key, k2: &slot_key) > `0`)
541	path->slots[`0`]++;
542	}
543	btrfs_setup_item_for_insert(trans, root, path, key: &key,
544	data_size: args->extent_item_size);
545	args->extent_inserted = true;
546	}
547
548	if (!args->path)
549	btrfs_free_path(p: path);
550	else if (!args->extent_inserted)
551	btrfs_release_path(p: path);
552	out:
553	args->drop_end = found ? min(args->end, last_end) : args->end;
554
555	return ret;
556	}
557
558	static int extent_mergeable(struct extent_buffer leaf, int* slot,
559	u64 objectid, u64 bytenr, u64 orig_offset,
560	u64 start, u64 end)
561	{
562	struct btrfs_file_extent_item *fi;
563	struct btrfs_key key;
564	u64 extent_end;
565
566	if (slot < `0` \|\| slot >= btrfs_header_nritems(eb: leaf))
567	return `0`;
568
569	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
570	if (key.objectid != objectid \|\| key.type != BTRFS_EXTENT_DATA_KEY)
571	return `0`;
572
573	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
574	if (btrfs_file_extent_type(eb: leaf, s: fi) != BTRFS_FILE_EXTENT_REG \|\|
575	btrfs_file_extent_disk_bytenr(eb: leaf, s: fi) != bytenr \|\|
576	btrfs_file_extent_offset(eb: leaf, s: fi) != key.offset - orig_offset \|\|
577	btrfs_file_extent_compression(eb: leaf, s: fi) \|\|
578	btrfs_file_extent_encryption(eb: leaf, s: fi) \|\|
579	btrfs_file_extent_other_encoding(eb: leaf, s: fi))
580	return `0`;
581
582	extent_end = key.offset + btrfs_file_extent_num_bytes(eb: leaf, s: fi);
583	if ((start && start != key.offset) \|\| (end && end != extent_end))
584	return `0`;
585
586	*start = key.offset;
587	*end = extent_end;
588	return `1`;
589	}
590
591	/*
592	* Mark extent in the range start - end as written.
593	*
594	* This changes extent type from 'pre-allocated' to 'regular'. If only
595	* part of extent is marked as written, the extent will be split into
596	* two or three.
597	*/
598	int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
599	struct btrfs_inode *inode, u64 start, u64 end)
600	{
601	struct btrfs_root *root = inode->root;
602	struct extent_buffer *leaf;
603	struct btrfs_path *path;
604	struct btrfs_file_extent_item *fi;
605	struct btrfs_ref ref = { `0` };
606	struct btrfs_key key;
607	struct btrfs_key new_key;
608	u64 bytenr;
609	u64 num_bytes;
610	u64 extent_end;
611	u64 orig_offset;
612	u64 other_start;
613	u64 other_end;
614	u64 split;
615	int del_nr = `0`;
616	int del_slot = `0`;
617	int recow;
618	int ret = `0`;
619	u64 ino = btrfs_ino(inode);
620
621	path = btrfs_alloc_path();
622	if (!path)
623	return -ENOMEM;
624	again:
625	recow = `0`;
626	split = start;
627	key.objectid = ino;
628	key.type = BTRFS_EXTENT_DATA_KEY;
629	key.offset = split;
630
631	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: -`1`, cow: `1`);
632	if (ret < `0`)
633	goto out;
634	if (ret > `0` && path->slots[`0`] > `0`)
635	path->slots[`0`]--;
636
637	leaf = path->nodes[`0`];
638	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
639	if (key.objectid != ino \|\|
640	key.type != BTRFS_EXTENT_DATA_KEY) {
641	ret = -EINVAL;
642	btrfs_abort_transaction(trans, ret);
643	goto out;
644	}
645	fi = btrfs_item_ptr(leaf, path->slots[`0`],
646	struct btrfs_file_extent_item);
647	if (btrfs_file_extent_type(eb: leaf, s: fi) != BTRFS_FILE_EXTENT_PREALLOC) {
648	ret = -EINVAL;
649	btrfs_abort_transaction(trans, ret);
650	goto out;
651	}
652	extent_end = key.offset + btrfs_file_extent_num_bytes(eb: leaf, s: fi);
653	if (key.offset > start \|\| extent_end < end) {
654	ret = -EINVAL;
655	btrfs_abort_transaction(trans, ret);
656	goto out;
657	}
658
659	bytenr = btrfs_file_extent_disk_bytenr(eb: leaf, s: fi);
660	num_bytes = btrfs_file_extent_disk_num_bytes(eb: leaf, s: fi);
661	orig_offset = key.offset - btrfs_file_extent_offset(eb: leaf, s: fi);
662	memcpy(&new_key, &key, sizeof(new_key));
663
664	if (start == key.offset && end < extent_end) {
665	other_start = `0`;
666	other_end = start;
667	if (extent_mergeable(leaf, slot: path->slots[`0`] - `1`,
668	objectid: ino, bytenr, orig_offset,
669	start: &other_start, end: &other_end)) {
670	new_key.offset = end;
671	btrfs_set_item_key_safe(trans, path, new_key: &new_key);
672	fi = btrfs_item_ptr(leaf, path->slots[`0`],
673	struct btrfs_file_extent_item);
674	btrfs_set_file_extent_generation(eb: leaf, s: fi,
675	val: trans->transid);
676	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
677	val: extent_end - end);
678	btrfs_set_file_extent_offset(eb: leaf, s: fi,
679	val: end - orig_offset);
680	fi = btrfs_item_ptr(leaf, path->slots[`0`] - `1`,
681	struct btrfs_file_extent_item);
682	btrfs_set_file_extent_generation(eb: leaf, s: fi,
683	val: trans->transid);
684	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
685	val: end - other_start);
686	btrfs_mark_buffer_dirty(trans, buf: leaf);
687	goto out;
688	}
689	}
690
691	if (start > key.offset && end == extent_end) {
692	other_start = end;
693	other_end = `0`;
694	if (extent_mergeable(leaf, slot: path->slots[`0`] + `1`,
695	objectid: ino, bytenr, orig_offset,
696	start: &other_start, end: &other_end)) {
697	fi = btrfs_item_ptr(leaf, path->slots[`0`],
698	struct btrfs_file_extent_item);
699	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
700	val: start - key.offset);
701	btrfs_set_file_extent_generation(eb: leaf, s: fi,
702	val: trans->transid);
703	path->slots[`0`]++;
704	new_key.offset = start;
705	btrfs_set_item_key_safe(trans, path, new_key: &new_key);
706
707	fi = btrfs_item_ptr(leaf, path->slots[`0`],
708	struct btrfs_file_extent_item);
709	btrfs_set_file_extent_generation(eb: leaf, s: fi,
710	val: trans->transid);
711	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
712	val: other_end - start);
713	btrfs_set_file_extent_offset(eb: leaf, s: fi,
714	val: start - orig_offset);
715	btrfs_mark_buffer_dirty(trans, buf: leaf);
716	goto out;
717	}
718	}
719
720	while (start > key.offset \|\| end < extent_end) {
721	if (key.offset == start)
722	split = end;
723
724	new_key.offset = split;
725	ret = btrfs_duplicate_item(trans, root, path, new_key: &new_key);
726	if (ret == -EAGAIN) {
727	btrfs_release_path(p: path);
728	goto again;
729	}
730	if (ret < `0`) {
731	btrfs_abort_transaction(trans, ret);
732	goto out;
733	}
734
735	leaf = path->nodes[`0`];
736	fi = btrfs_item_ptr(leaf, path->slots[`0`] - `1`,
737	struct btrfs_file_extent_item);
738	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
739	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
740	val: split - key.offset);
741
742	fi = btrfs_item_ptr(leaf, path->slots[`0`],
743	struct btrfs_file_extent_item);
744
745	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
746	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: split - orig_offset);
747	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
748	val: extent_end - split);
749	btrfs_mark_buffer_dirty(trans, buf: leaf);
750
751	btrfs_init_generic_ref(generic_ref: &ref, action: BTRFS_ADD_DELAYED_REF, bytenr,
752	len: num_bytes, parent: `0`, owning_root: root->root_key.objectid);
753	btrfs_init_data_ref(generic_ref: &ref, ref_root: root->root_key.objectid, ino,
754	offset: orig_offset, mod_root: `0`, skip_qgroup: false);
755	ret = btrfs_inc_extent_ref(trans, generic_ref: &ref);
756	if (ret) {
757	btrfs_abort_transaction(trans, ret);
758	goto out;
759	}
760
761	if (split == start) {
762	key.offset = start;
763	} else {
764	if (start != key.offset) {
765	ret = -EINVAL;
766	btrfs_abort_transaction(trans, ret);
767	goto out;
768	}
769	path->slots[`0`]--;
770	extent_end = end;
771	}
772	recow = `1`;
773	}
774
775	other_start = end;
776	other_end = `0`;
777	btrfs_init_generic_ref(generic_ref: &ref, action: BTRFS_DROP_DELAYED_REF, bytenr,
778	len: num_bytes, parent: `0`, owning_root: root->root_key.objectid);
779	btrfs_init_data_ref(generic_ref: &ref, ref_root: root->root_key.objectid, ino, offset: orig_offset,
780	mod_root: `0`, skip_qgroup: false);
781	if (extent_mergeable(leaf, slot: path->slots[`0`] + `1`,
782	objectid: ino, bytenr, orig_offset,
783	start: &other_start, end: &other_end)) {
784	if (recow) {
785	btrfs_release_path(p: path);
786	goto again;
787	}
788	extent_end = other_end;
789	del_slot = path->slots[`0`] + `1`;
790	del_nr++;
791	ret = btrfs_free_extent(trans, ref: &ref);
792	if (ret) {
793	btrfs_abort_transaction(trans, ret);
794	goto out;
795	}
796	}
797	other_start = `0`;
798	other_end = start;
799	if (extent_mergeable(leaf, slot: path->slots[`0`] - `1`,
800	objectid: ino, bytenr, orig_offset,
801	start: &other_start, end: &other_end)) {
802	if (recow) {
803	btrfs_release_path(p: path);
804	goto again;
805	}
806	key.offset = other_start;
807	del_slot = path->slots[`0`];
808	del_nr++;
809	ret = btrfs_free_extent(trans, ref: &ref);
810	if (ret) {
811	btrfs_abort_transaction(trans, ret);
812	goto out;
813	}
814	}
815	if (del_nr == `0`) {
816	fi = btrfs_item_ptr(leaf, path->slots[`0`],
817	struct btrfs_file_extent_item);
818	btrfs_set_file_extent_type(eb: leaf, s: fi,
819	val: BTRFS_FILE_EXTENT_REG);
820	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
821	btrfs_mark_buffer_dirty(trans, buf: leaf);
822	} else {
823	fi = btrfs_item_ptr(leaf, del_slot - `1`,
824	struct btrfs_file_extent_item);
825	btrfs_set_file_extent_type(eb: leaf, s: fi,
826	val: BTRFS_FILE_EXTENT_REG);
827	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
828	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
829	val: extent_end - key.offset);
830	btrfs_mark_buffer_dirty(trans, buf: leaf);
831
832	ret = btrfs_del_items(trans, root, path, slot: del_slot, nr: del_nr);
833	if (ret < `0`) {
834	btrfs_abort_transaction(trans, ret);
835	goto out;
836	}
837	}
838	out:
839	btrfs_free_path(p: path);
840	return ret;
841	}
842
843	/*
844	* on error we return an unlocked page and the error value
845	* on success we return a locked page and 0
846	*/
847	static int prepare_uptodate_page(struct inode *inode,
848	struct page *page, u64 pos,
849	bool force_uptodate)
850	{
851	struct folio *folio = page_folio(page);
852	int ret = `0`;
853
854	if (((pos & (PAGE_SIZE - `1`)) \|\| force_uptodate) &&
855	!PageUptodate(page)) {
856	ret = btrfs_read_folio(NULL, folio);
857	if (ret)
858	return ret;
859	lock_page(page);
860	if (!PageUptodate(page)) {
861	unlock_page(page);
862	return -EIO;
863	}
864
865	/*
866	* Since btrfs_read_folio() will unlock the folio before it
867	* returns, there is a window where btrfs_release_folio() can be
868	* called to release the page. Here we check both inode
869	* mapping and PagePrivate() to make sure the page was not
870	* released.
871	*
872	* The private flag check is essential for subpage as we need
873	* to store extra bitmap using folio private.
874	*/
875	if (page->mapping != inode->i_mapping \|\| !folio_test_private(folio)) {
876	unlock_page(page);
877	return -EAGAIN;
878	}
879	}
880	return `0`;
881	}
882
883	static fgf_t get_prepare_fgp_flags(bool nowait)
884	{
885	fgf_t fgp_flags = FGP_LOCK \| FGP_ACCESSED \| FGP_CREAT;
886
887	if (nowait)
888	fgp_flags \|= FGP_NOWAIT;
889
890	return fgp_flags;
891	}
892
893	static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
894	{
895	gfp_t gfp;
896
897	gfp = btrfs_alloc_write_mask(mapping: inode->i_mapping);
898	if (nowait) {
899	gfp &= ~__GFP_DIRECT_RECLAIM;
900	gfp \|= GFP_NOWAIT;
901	}
902
903	return gfp;
904	}
905
906	/*
907	* this just gets pages into the page cache and locks them down.
908	*/
909	static noinline int prepare_pages(struct inode inode, struct* page **pages,
910	size_t num_pages, loff_t pos,
911	size_t write_bytes, bool force_uptodate,
912	bool nowait)
913	{
914	int i;
915	unsigned long index = pos >> PAGE_SHIFT;
916	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
917	fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
918	int err = `0`;
919	int faili;
920
921	for (i = `0`; i < num_pages; i++) {
922	again:
923	pages[i] = pagecache_get_page(mapping: inode->i_mapping, index: index + i,
924	fgp_flags, gfp: mask \| __GFP_WRITE);
925	if (!pages[i]) {
926	faili = i - `1`;
927	if (nowait)
928	err = -EAGAIN;
929	else
930	err = -ENOMEM;
931	goto fail;
932	}
933
934	err = set_page_extent_mapped(pages[i]);
935	if (err < `0`) {
936	faili = i;
937	goto fail;
938	}
939
940	if (i == `0`)
941	err = prepare_uptodate_page(inode, page: pages[i], pos,
942	force_uptodate);
943	if (!err && i == num_pages - `1`)
944	err = prepare_uptodate_page(inode, page: pages[i],
945	pos: pos + write_bytes, force_uptodate: false);
946	if (err) {
947	put_page(page: pages[i]);
948	if (!nowait && err == -EAGAIN) {
949	err = `0`;
950	goto again;
951	}
952	faili = i - `1`;
953	goto fail;
954	}
955	wait_on_page_writeback(page: pages[i]);
956	}
957
958	return `0`;
959	fail:
960	while (faili >= `0`) {
961	unlock_page(page: pages[faili]);
962	put_page(page: pages[faili]);
963	faili--;
964	}
965	return err;
966
967	}
968
969	/*
970	* This function locks the extent and properly waits for data=ordered extents
971	* to finish before allowing the pages to be modified if need.
972	*
973	* The return value:
974	* 1 - the extent is locked
975	* 0 - the extent is not locked, and everything is OK
976	* -EAGAIN - need re-prepare the pages
977	* the other < 0 number - Something wrong happens
978	*/
979	static noinline int
980	lock_and_cleanup_extent_if_need(struct btrfs_inode inode, struct* page **pages,
981	size_t num_pages, loff_t pos,
982	size_t write_bytes,
983	u64 lockstart, u64 lockend, bool nowait,
984	struct extent_state **cached_state)
985	{
986	struct btrfs_fs_info *fs_info = inode->root->fs_info;
987	u64 start_pos;
988	u64 last_pos;
989	int i;
990	int ret = `0`;
991
992	start_pos = round_down(pos, fs_info->sectorsize);
993	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - `1`;
994
995	if (start_pos < inode->vfs_inode.i_size) {
996	struct btrfs_ordered_extent *ordered;
997
998	if (nowait) {
999	if (!try_lock_extent(tree: &inode->io_tree, start: start_pos, end: last_pos,
1000	cached: cached_state)) {
1001	for (i = `0`; i < num_pages; i++) {
1002	unlock_page(page: pages[i]);
1003	put_page(page: pages[i]);
1004	pages[i] = NULL;
1005	}
1006
1007	return -EAGAIN;
1008	}
1009	} else {
1010	lock_extent(tree: &inode->io_tree, start: start_pos, end: last_pos, cached: cached_state);
1011	}
1012
1013	ordered = btrfs_lookup_ordered_range(inode, file_offset: start_pos,
1014	len: last_pos - start_pos + `1`);
1015	if (ordered &&
1016	ordered->file_offset + ordered->num_bytes > start_pos &&
1017	ordered->file_offset <= last_pos) {
1018	unlock_extent(tree: &inode->io_tree, start: start_pos, end: last_pos,
1019	cached: cached_state);
1020	for (i = `0`; i < num_pages; i++) {
1021	unlock_page(page: pages[i]);
1022	put_page(page: pages[i]);
1023	}
1024	btrfs_start_ordered_extent(entry: ordered);
1025	btrfs_put_ordered_extent(entry: ordered);
1026	return -EAGAIN;
1027	}
1028	if (ordered)
1029	btrfs_put_ordered_extent(entry: ordered);
1030
1031	*lockstart = start_pos;
1032	*lockend = last_pos;
1033	ret = `1`;
1034	}
1035
1036	/*
1037	* We should be called after prepare_pages() which should have locked
1038	* all pages in the range.
1039	*/
1040	for (i = `0`; i < num_pages; i++)
1041	WARN_ON(!PageLocked(pages[i]));
1042
1043	return ret;
1044	}
1045
1046	/*
1047	* Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1048	*
1049	* @pos: File offset.
1050	* @write_bytes: The length to write, will be updated to the nocow writeable
1051	* range.
1052	*
1053	* This function will flush ordered extents in the range to ensure proper
1054	* nocow checks.
1055	*
1056	* Return:
1057	* > 0 If we can nocow, and updates @write_bytes.
1058	* 0 If we can't do a nocow write.
1059	* -EAGAIN If we can't do a nocow write because snapshoting of the inode's
1060	* root is in progress.
1061	* < 0 If an error happened.
1062	*
1063	* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
1064	*/
1065	int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1066	size_t *write_bytes, bool nowait)
1067	{
1068	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1069	struct btrfs_root *root = inode->root;
1070	struct extent_state *cached_state = NULL;
1071	u64 lockstart, lockend;
1072	u64 num_bytes;
1073	int ret;
1074
1075	if (!(inode->flags & (BTRFS_INODE_NODATACOW \| BTRFS_INODE_PREALLOC)))
1076	return `0`;
1077
1078	if (!btrfs_drew_try_write_lock(lock: &root->snapshot_lock))
1079	return -EAGAIN;
1080
1081	lockstart = round_down(pos, fs_info->sectorsize);
1082	lockend = round_up(pos + *write_bytes,
1083	fs_info->sectorsize) - `1`;
1084	num_bytes = lockend - lockstart + `1`;
1085
1086	if (nowait) {
1087	if (!btrfs_try_lock_ordered_range(inode, start: lockstart, end: lockend,
1088	cached_state: &cached_state)) {
1089	btrfs_drew_write_unlock(lock: &root->snapshot_lock);
1090	return -EAGAIN;
1091	}
1092	} else {
1093	btrfs_lock_and_flush_ordered_range(inode, start: lockstart, end: lockend,
1094	cached_state: &cached_state);
1095	}
1096	ret = can_nocow_extent(inode: &inode->vfs_inode, offset: lockstart, len: &num_bytes,
1097	NULL, NULL, NULL, nowait, strict: false);
1098	if (ret <= `0`)
1099	btrfs_drew_write_unlock(lock: &root->snapshot_lock);
1100	else
1101	write_bytes = min_t(size_t, write_bytes ,
1102	num_bytes - pos + lockstart);
1103	unlock_extent(tree: &inode->io_tree, start: lockstart, end: lockend, cached: &cached_state);
1104
1105	return ret;
1106	}
1107
1108	void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1109	{
1110	btrfs_drew_write_unlock(lock: &inode->root->snapshot_lock);
1111	}
1112
1113	static void update_time_for_write(struct inode *inode)
1114	{
1115	struct timespec64 now, ts;
1116
1117	if (IS_NOCMTIME(inode))
1118	return;
1119
1120	now = current_time(inode);
1121	ts = inode_get_mtime(inode);
1122	if (!timespec64_equal(a: &ts, b: &now))
1123	inode_set_mtime_to_ts(inode, ts: now);
1124
1125	ts = inode_get_ctime(inode);
1126	if (!timespec64_equal(a: &ts, b: &now))
1127	inode_set_ctime_to_ts(inode, ts: now);
1128
1129	if (IS_I_VERSION(inode))
1130	inode_inc_iversion(inode);
1131	}
1132
1133	static int btrfs_write_check(struct kiocb iocb, struct* iov_iter *from,
1134	size_t count)
1135	{
1136	struct file *file = iocb->ki_filp;
1137	struct inode *inode = file_inode(f: file);
1138	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1139	loff_t pos = iocb->ki_pos;
1140	int ret;
1141	loff_t oldsize;
1142	loff_t start_pos;
1143
1144	/*
1145	* Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1146	* prealloc flags, as without those flags we always have to COW. We will
1147	* later check if we can really COW into the target range (using
1148	* can_nocow_extent() at btrfs_get_blocks_direct_write()).
1149	*/
1150	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1151	!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW \| BTRFS_INODE_PREALLOC)))
1152	return -EAGAIN;
1153
1154	ret = file_remove_privs(file);
1155	if (ret)
1156	return ret;
1157
1158	/*
1159	* We reserve space for updating the inode when we reserve space for the
1160	* extent we are going to write, so we will enospc out there. We don't
1161	* need to start yet another transaction to update the inode as we will
1162	* update the inode when we finish writing whatever data we write.
1163	*/
1164	update_time_for_write(inode);
1165
1166	start_pos = round_down(pos, fs_info->sectorsize);
1167	oldsize = i_size_read(inode);
1168	if (start_pos > oldsize) {
1169	/ Expand hole size to cover write data, preventing empty gap /
1170	loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1171
1172	ret = btrfs_cont_expand(inode: BTRFS_I(inode), oldsize, size: end_pos);
1173	if (ret)
1174	return ret;
1175	}
1176
1177	return `0`;
1178	}
1179
1180	static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1181	struct iov_iter *i)
1182	{
1183	struct file *file = iocb->ki_filp;
1184	loff_t pos;
1185	struct inode *inode = file_inode(f: file);
1186	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1187	struct page **pages = NULL;
1188	struct extent_changeset *data_reserved = NULL;
1189	u64 release_bytes = `0`;
1190	u64 lockstart;
1191	u64 lockend;
1192	size_t num_written = `0`;
1193	int nrptrs;
1194	ssize_t ret;
1195	bool only_release_metadata = false;
1196	bool force_page_uptodate = false;
1197	loff_t old_isize = i_size_read(inode);
1198	unsigned int ilock_flags = `0`;
1199	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1200	unsigned int bdp_flags = (nowait ? BDP_ASYNC : `0`);
1201
1202	if (nowait)
1203	ilock_flags \|= BTRFS_ILOCK_TRY;
1204
1205	ret = btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags);
1206	if (ret < `0`)
1207	return ret;
1208
1209	ret = generic_write_checks(iocb, i);
1210	if (ret <= `0`)
1211	goto out;
1212
1213	ret = btrfs_write_check(iocb, from: i, count: ret);
1214	if (ret < `0`)
1215	goto out;
1216
1217	pos = iocb->ki_pos;
1218	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1219	PAGE_SIZE / (sizeof(struct page *)));
1220	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1221	nrptrs = max(nrptrs, `8`);
1222	pages = kmalloc_array(n: nrptrs, size: sizeof(struct page *), GFP_KERNEL);
1223	if (!pages) {
1224	ret = -ENOMEM;
1225	goto out;
1226	}
1227
1228	while (iov_iter_count(i) > `0`) {
1229	struct extent_state *cached_state = NULL;
1230	size_t offset = offset_in_page(pos);
1231	size_t sector_offset;
1232	size_t write_bytes = min(iov_iter_count(i),
1233	nrptrs * (size_t)PAGE_SIZE -
1234	offset);
1235	size_t num_pages;
1236	size_t reserve_bytes;
1237	size_t dirty_pages;
1238	size_t copied;
1239	size_t dirty_sectors;
1240	size_t num_sectors;
1241	int extents_locked;
1242
1243	/*
1244	* Fault pages before locking them in prepare_pages
1245	* to avoid recursive lock
1246	*/
1247	if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1248	ret = -EFAULT;
1249	break;
1250	}
1251
1252	only_release_metadata = false;
1253	sector_offset = pos & (fs_info->sectorsize - `1`);
1254
1255	extent_changeset_release(changeset: data_reserved);
1256	ret = btrfs_check_data_free_space(inode: BTRFS_I(inode),
1257	reserved: &data_reserved, start: pos,
1258	len: write_bytes, noflush: nowait);
1259	if (ret < `0`) {
1260	int can_nocow;
1261
1262	if (nowait && (ret == -ENOSPC \|\| ret == -EAGAIN)) {
1263	ret = -EAGAIN;
1264	break;
1265	}
1266
1267	/*
1268	* If we don't have to COW at the offset, reserve
1269	* metadata only. write_bytes may get smaller than
1270	* requested here.
1271	*/
1272	can_nocow = btrfs_check_nocow_lock(inode: BTRFS_I(inode), pos,
1273	write_bytes: &write_bytes, nowait);
1274	if (can_nocow < `0`)
1275	ret = can_nocow;
1276	if (can_nocow > `0`)
1277	ret = `0`;
1278	if (ret)
1279	break;
1280	only_release_metadata = true;
1281	}
1282
1283	num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1284	WARN_ON(num_pages > nrptrs);
1285	reserve_bytes = round_up(write_bytes + sector_offset,
1286	fs_info->sectorsize);
1287	WARN_ON(reserve_bytes == `0`);
1288	ret = btrfs_delalloc_reserve_metadata(inode: BTRFS_I(inode),
1289	num_bytes: reserve_bytes,
1290	disk_num_bytes: reserve_bytes, noflush: nowait);
1291	if (ret) {
1292	if (!only_release_metadata)
1293	btrfs_free_reserved_data_space(inode: BTRFS_I(inode),
1294	reserved: data_reserved, start: pos,
1295	len: write_bytes);
1296	else
1297	btrfs_check_nocow_unlock(inode: BTRFS_I(inode));
1298
1299	if (nowait && ret == -ENOSPC)
1300	ret = -EAGAIN;
1301	break;
1302	}
1303
1304	release_bytes = reserve_bytes;
1305	again:
1306	ret = balance_dirty_pages_ratelimited_flags(mapping: inode->i_mapping, flags: bdp_flags);
1307	if (ret) {
1308	btrfs_delalloc_release_extents(inode: BTRFS_I(inode), num_bytes: reserve_bytes);
1309	break;
1310	}
1311
1312	/*
1313	* This is going to setup the pages array with the number of
1314	* pages we want, so we don't really need to worry about the
1315	* contents of pages from loop to loop
1316	*/
1317	ret = prepare_pages(inode, pages, num_pages,
1318	pos, write_bytes, force_uptodate: force_page_uptodate, nowait: false);
1319	if (ret) {
1320	btrfs_delalloc_release_extents(inode: BTRFS_I(inode),
1321	num_bytes: reserve_bytes);
1322	break;
1323	}
1324
1325	extents_locked = lock_and_cleanup_extent_if_need(
1326	inode: BTRFS_I(inode), pages,
1327	num_pages, pos, write_bytes, lockstart: &lockstart,
1328	lockend: &lockend, nowait, cached_state: &cached_state);
1329	if (extents_locked < `0`) {
1330	if (!nowait && extents_locked == -EAGAIN)
1331	goto again;
1332
1333	btrfs_delalloc_release_extents(inode: BTRFS_I(inode),
1334	num_bytes: reserve_bytes);
1335	ret = extents_locked;
1336	break;
1337	}
1338
1339	copied = btrfs_copy_from_user(pos, write_bytes, prepared_pages: pages, i);
1340
1341	num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1342	dirty_sectors = round_up(copied + sector_offset,
1343	fs_info->sectorsize);
1344	dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1345
1346	/*
1347	* if we have trouble faulting in the pages, fall
1348	* back to one page at a time
1349	*/
1350	if (copied < write_bytes)
1351	nrptrs = `1`;
1352
1353	if (copied == `0`) {
1354	force_page_uptodate = true;
1355	dirty_sectors = `0`;
1356	dirty_pages = `0`;
1357	} else {
1358	force_page_uptodate = false;
1359	dirty_pages = DIV_ROUND_UP(copied + offset,
1360	PAGE_SIZE);
1361	}
1362
1363	if (num_sectors > dirty_sectors) {
1364	/ release everything except the sectors we dirtied /
1365	release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1366	if (only_release_metadata) {
1367	btrfs_delalloc_release_metadata(inode: BTRFS_I(inode),
1368	num_bytes: release_bytes, qgroup_free: true);
1369	} else {
1370	u64 __pos;
1371
1372	__pos = round_down(pos,
1373	fs_info->sectorsize) +
1374	(dirty_pages << PAGE_SHIFT);
1375	btrfs_delalloc_release_space(inode: BTRFS_I(inode),
1376	reserved: data_reserved, start: __pos,
1377	len: release_bytes, qgroup_free: true);
1378	}
1379	}
1380
1381	release_bytes = round_up(copied + sector_offset,
1382	fs_info->sectorsize);
1383
1384	ret = btrfs_dirty_pages(inode: BTRFS_I(inode), pages,
1385	num_pages: dirty_pages, pos, write_bytes: copied,
1386	cached: &cached_state, noreserve: only_release_metadata);
1387
1388	/*
1389	* If we have not locked the extent range, because the range's
1390	* start offset is >= i_size, we might still have a non-NULL
1391	* cached extent state, acquired while marking the extent range
1392	* as delalloc through btrfs_dirty_pages(). Therefore free any
1393	* possible cached extent state to avoid a memory leak.
1394	*/
1395	if (extents_locked)
1396	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart,
1397	end: lockend, cached: &cached_state);
1398	else
1399	free_extent_state(state: cached_state);
1400
1401	btrfs_delalloc_release_extents(inode: BTRFS_I(inode), num_bytes: reserve_bytes);
1402	if (ret) {
1403	btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1404	break;
1405	}
1406
1407	release_bytes = `0`;
1408	if (only_release_metadata)
1409	btrfs_check_nocow_unlock(inode: BTRFS_I(inode));
1410
1411	btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1412
1413	cond_resched();
1414
1415	pos += copied;
1416	num_written += copied;
1417	}
1418
1419	kfree(objp: pages);
1420
1421	if (release_bytes) {
1422	if (only_release_metadata) {
1423	btrfs_check_nocow_unlock(inode: BTRFS_I(inode));
1424	btrfs_delalloc_release_metadata(inode: BTRFS_I(inode),
1425	num_bytes: release_bytes, qgroup_free: true);
1426	} else {
1427	btrfs_delalloc_release_space(inode: BTRFS_I(inode),
1428	reserved: data_reserved,
1429	round_down(pos, fs_info->sectorsize),
1430	len: release_bytes, qgroup_free: true);
1431	}
1432	}
1433
1434	extent_changeset_free(changeset: data_reserved);
1435	if (num_written > `0`) {
1436	pagecache_isize_extended(inode, from: old_isize, to: iocb->ki_pos);
1437	iocb->ki_pos += num_written;
1438	}
1439	out:
1440	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1441	return num_written ? num_written : ret;
1442	}
1443
1444	static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1445	const struct iov_iter *iter, loff_t offset)
1446	{
1447	const u32 blocksize_mask = fs_info->sectorsize - `1`;
1448
1449	if (offset & blocksize_mask)
1450	return -EINVAL;
1451
1452	if (iov_iter_alignment(i: iter) & blocksize_mask)
1453	return -EINVAL;
1454
1455	return `0`;
1456	}
1457
1458	static ssize_t btrfs_direct_write(struct kiocb iocb, struct* iov_iter *from)
1459	{
1460	struct file *file = iocb->ki_filp;
1461	struct inode *inode = file_inode(f: file);
1462	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1463	loff_t pos;
1464	ssize_t written = `0`;
1465	ssize_t written_buffered;
1466	size_t prev_left = `0`;
1467	loff_t endbyte;
1468	ssize_t err;
1469	unsigned int ilock_flags = `0`;
1470	struct iomap_dio *dio;
1471
1472	if (iocb->ki_flags & IOCB_NOWAIT)
1473	ilock_flags \|= BTRFS_ILOCK_TRY;
1474
1475	/*
1476	* If the write DIO is within EOF, use a shared lock and also only if
1477	* security bits will likely not be dropped by file_remove_privs() called
1478	* from btrfs_write_check(). Either will need to be rechecked after the
1479	* lock was acquired.
1480	*/
1481	if (iocb->ki_pos + iov_iter_count(i: from) <= i_size_read(inode) && IS_NOSEC(inode))
1482	ilock_flags \|= BTRFS_ILOCK_SHARED;
1483
1484	relock:
1485	err = btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags);
1486	if (err < `0`)
1487	return err;
1488
1489	/ Shared lock cannot be used with security bits set. /
1490	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
1491	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1492	ilock_flags &= ~BTRFS_ILOCK_SHARED;
1493	goto relock;
1494	}
1495
1496	err = generic_write_checks(iocb, from);
1497	if (err <= `0`) {
1498	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1499	return err;
1500	}
1501
1502	err = btrfs_write_check(iocb, from, count: err);
1503	if (err < `0`) {
1504	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1505	goto out;
1506	}
1507
1508	pos = iocb->ki_pos;
1509	/*
1510	* Re-check since file size may have changed just before taking the
1511	* lock or pos may have changed because of O_APPEND in generic_write_check()
1512	*/
1513	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1514	pos + iov_iter_count(i: from) > i_size_read(inode)) {
1515	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1516	ilock_flags &= ~BTRFS_ILOCK_SHARED;
1517	goto relock;
1518	}
1519
1520	if (check_direct_IO(fs_info, iter: from, offset: pos)) {
1521	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1522	goto buffered;
1523	}
1524
1525	/*
1526	* The iov_iter can be mapped to the same file range we are writing to.
1527	* If that's the case, then we will deadlock in the iomap code, because
1528	* it first calls our callback btrfs_dio_iomap_begin(), which will create
1529	* an ordered extent, and after that it will fault in the pages that the
1530	* iov_iter refers to. During the fault in we end up in the readahead
1531	* pages code (starting at btrfs_readahead()), which will lock the range,
1532	* find that ordered extent and then wait for it to complete (at
1533	* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1534	* obviously the ordered extent can never complete as we didn't submit
1535	* yet the respective bio(s). This always happens when the buffer is
1536	* memory mapped to the same file range, since the iomap DIO code always
1537	* invalidates pages in the target file range (after starting and waiting
1538	* for any writeback).
1539	*
1540	* So here we disable page faults in the iov_iter and then retry if we
1541	* got -EFAULT, faulting in the pages before the retry.
1542	*/
1543	from->nofault = true;
1544	dio = btrfs_dio_write(iocb, iter: from, done_before: written);
1545	from->nofault = false;
1546
1547	/*
1548	* iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
1549	* iocb, and that needs to lock the inode. So unlock it before calling
1550	* iomap_dio_complete() to avoid a deadlock.
1551	*/
1552	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1553
1554	if (IS_ERR_OR_NULL(ptr: dio))
1555	err = PTR_ERR_OR_ZERO(ptr: dio);
1556	else
1557	err = iomap_dio_complete(dio);
1558
1559	/ No increment (+=) because iomap returns a cumulative value. /
1560	if (err > `0`)
1561	written = err;
1562
1563	if (iov_iter_count(i: from) > `0` && (err == -EFAULT \|\| err > `0`)) {
1564	const size_t left = iov_iter_count(i: from);
1565	/*
1566	* We have more data left to write. Try to fault in as many as
1567	* possible of the remainder pages and retry. We do this without
1568	* releasing and locking again the inode, to prevent races with
1569	* truncate.
1570	*
1571	* Also, in case the iov refers to pages in the file range of the
1572	* file we want to write to (due to a mmap), we could enter an
1573	* infinite loop if we retry after faulting the pages in, since
1574	* iomap will invalidate any pages in the range early on, before
1575	* it tries to fault in the pages of the iov. So we keep track of
1576	* how much was left of iov in the previous EFAULT and fallback
1577	* to buffered IO in case we haven't made any progress.
1578	*/
1579	if (left == prev_left) {
1580	err = -ENOTBLK;
1581	} else {
1582	fault_in_iov_iter_readable(i: from, bytes: left);
1583	prev_left = left;
1584	goto relock;
1585	}
1586	}
1587
1588	/*
1589	* If 'err' is -ENOTBLK or we have not written all data, then it means
1590	* we must fallback to buffered IO.
1591	*/
1592	if ((err < `0` && err != -ENOTBLK) \|\| !iov_iter_count(i: from))
1593	goto out;
1594
1595	buffered:
1596	/*
1597	* If we are in a NOWAIT context, then return -EAGAIN to signal the caller
1598	* it must retry the operation in a context where blocking is acceptable,
1599	* because even if we end up not blocking during the buffered IO attempt
1600	* below, we will block when flushing and waiting for the IO.
1601	*/
1602	if (iocb->ki_flags & IOCB_NOWAIT) {
1603	err = -EAGAIN;
1604	goto out;
1605	}
1606
1607	pos = iocb->ki_pos;
1608	written_buffered = btrfs_buffered_write(iocb, i: from);
1609	if (written_buffered < `0`) {
1610	err = written_buffered;
1611	goto out;
1612	}
1613	/*
1614	* Ensure all data is persisted. We want the next direct IO read to be
1615	* able to read what was just written.
1616	*/
1617	endbyte = pos + written_buffered - `1`;
1618	err = btrfs_fdatawrite_range(inode, start: pos, end: endbyte);
1619	if (err)
1620	goto out;
1621	err = filemap_fdatawait_range(inode->i_mapping, lstart: pos, lend: endbyte);
1622	if (err)
1623	goto out;
1624	written += written_buffered;
1625	iocb->ki_pos = pos + written_buffered;
1626	invalidate_mapping_pages(mapping: file->f_mapping, start: pos >> PAGE_SHIFT,
1627	end: endbyte >> PAGE_SHIFT);
1628	out:
1629	return err < `0` ? err : written;
1630	}
1631
1632	static ssize_t btrfs_encoded_write(struct kiocb iocb, struct* iov_iter *from,
1633	const struct btrfs_ioctl_encoded_io_args *encoded)
1634	{
1635	struct file *file = iocb->ki_filp;
1636	struct inode *inode = file_inode(f: file);
1637	loff_t count;
1638	ssize_t ret;
1639
1640	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: `0`);
1641	count = encoded->len;
1642	ret = generic_write_checks_count(iocb, count: &count);
1643	if (ret == `0` && count != encoded->len) {
1644	/*
1645	* The write got truncated by generic_write_checks_count(). We
1646	* can't do a partial encoded write.
1647	*/
1648	ret = -EFBIG;
1649	}
1650	if (ret \|\| encoded->len == `0`)
1651	goto out;
1652
1653	ret = btrfs_write_check(iocb, from, count: encoded->len);
1654	if (ret < `0`)
1655	goto out;
1656
1657	ret = btrfs_do_encoded_write(iocb, from, encoded);
1658	out:
1659	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: `0`);
1660	return ret;
1661	}
1662
1663	ssize_t btrfs_do_write_iter(struct kiocb iocb, struct* iov_iter *from,
1664	const struct btrfs_ioctl_encoded_io_args *encoded)
1665	{
1666	struct file *file = iocb->ki_filp;
1667	struct btrfs_inode *inode = BTRFS_I(inode: file_inode(f: file));
1668	ssize_t num_written, num_sync;
1669
1670	/*
1671	* If the fs flips readonly due to some impossible error, although we
1672	* have opened a file as writable, we have to stop this write operation
1673	* to ensure consistency.
1674	*/
1675	if (BTRFS_FS_ERROR(inode->root->fs_info))
1676	return -EROFS;
1677
1678	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1679	return -EOPNOTSUPP;
1680
1681	if (encoded) {
1682	num_written = btrfs_encoded_write(iocb, from, encoded);
1683	num_sync = encoded->len;
1684	} else if (iocb->ki_flags & IOCB_DIRECT) {
1685	num_written = btrfs_direct_write(iocb, from);
1686	num_sync = num_written;
1687	} else {
1688	num_written = btrfs_buffered_write(iocb, i: from);
1689	num_sync = num_written;
1690	}
1691
1692	btrfs_set_inode_last_sub_trans(inode);
1693
1694	if (num_sync > `0`) {
1695	num_sync = generic_write_sync(iocb, count: num_sync);
1696	if (num_sync < `0`)
1697	num_written = num_sync;
1698	}
1699
1700	return num_written;
1701	}
1702
1703	static ssize_t btrfs_file_write_iter(struct kiocb iocb, struct* iov_iter *from)
1704	{
1705	return btrfs_do_write_iter(iocb, from, NULL);
1706	}
1707
1708	int btrfs_release_file(struct inode inode, struct* file *filp)
1709	{
1710	struct btrfs_file_private *private = filp->private_data;
1711
1712	if (private) {
1713	kfree(objp: private->filldir_buf);
1714	free_extent_state(state: private->llseek_cached_state);
1715	kfree(objp: private);
1716	filp->private_data = NULL;
1717	}
1718
1719	/*
1720	* Set by setattr when we are about to truncate a file from a non-zero
1721	* size to a zero size. This tries to flush down new bytes that may
1722	* have been written if the application were using truncate to replace
1723	* a file in place.
1724	*/
1725	if (test_and_clear_bit(nr: BTRFS_INODE_FLUSH_ON_CLOSE,
1726	addr: &BTRFS_I(inode)->runtime_flags))
1727	filemap_flush(inode->i_mapping);
1728	return `0`;
1729	}
1730
1731	static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1732	{
1733	int ret;
1734	struct blk_plug plug;
1735
1736	/*
1737	* This is only called in fsync, which would do synchronous writes, so
1738	* a plug can merge adjacent IOs as much as possible. Esp. in case of
1739	* multiple disks using raid profile, a large IO can be split to
1740	* several segments of stripe length (currently 64K).
1741	*/
1742	blk_start_plug(&plug);
1743	ret = btrfs_fdatawrite_range(inode, start, end);
1744	blk_finish_plug(&plug);
1745
1746	return ret;
1747	}
1748
1749	static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1750	{
1751	struct btrfs_inode *inode = BTRFS_I(inode: ctx->inode);
1752	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1753
1754	if (btrfs_inode_in_log(inode, generation: btrfs_get_fs_generation(fs_info)) &&
1755	list_empty(head: &ctx->ordered_extents))
1756	return true;
1757
1758	/*
1759	* If we are doing a fast fsync we can not bail out if the inode's
1760	* last_trans is <= then the last committed transaction, because we only
1761	* update the last_trans of the inode during ordered extent completion,
1762	* and for a fast fsync we don't wait for that, we only wait for the
1763	* writeback to complete.
1764	*/
1765	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1766	(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) \|\|
1767	list_empty(head: &ctx->ordered_extents)))
1768	return true;
1769
1770	return false;
1771	}
1772
1773	/*
1774	* fsync call for both files and directories. This logs the inode into
1775	* the tree log instead of forcing full commits whenever possible.
1776	*
1777	* It needs to call filemap_fdatawait so that all ordered extent updates are
1778	* in the metadata btree are up to date for copying to the log.
1779	*
1780	* It drops the inode mutex before doing the tree log commit. This is an
1781	* important optimization for directories because holding the mutex prevents
1782	* new operations on the dir while we write to disk.
1783	*/
1784	int btrfs_sync_file(struct file file, loff_t start, loff_t end, int* datasync)
1785	{
1786	struct dentry *dentry = file_dentry(file);
1787	struct inode *inode = d_inode(dentry);
1788	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1789	struct btrfs_root *root = BTRFS_I(inode)->root;
1790	struct btrfs_trans_handle *trans;
1791	struct btrfs_log_ctx ctx;
1792	int ret = `0`, err;
1793	u64 len;
1794	bool full_sync;
1795
1796	trace_btrfs_sync_file(file, datasync);
1797
1798	btrfs_init_log_ctx(ctx: &ctx, inode);
1799
1800	/*
1801	* Always set the range to a full range, otherwise we can get into
1802	* several problems, from missing file extent items to represent holes
1803	* when not using the NO_HOLES feature, to log tree corruption due to
1804	* races between hole detection during logging and completion of ordered
1805	* extents outside the range, to missing checksums due to ordered extents
1806	* for which we flushed only a subset of their pages.
1807	*/
1808	start = `0`;
1809	end = LLONG_MAX;
1810	len = (u64)LLONG_MAX + `1`;
1811
1812	/*
1813	* We write the dirty pages in the range and wait until they complete
1814	* out of the ->i_mutex. If so, we can flush the dirty pages by
1815	* multi-task, and make the performance up. See
1816	* btrfs_wait_ordered_range for an explanation of the ASYNC check.
1817	*/
1818	ret = start_ordered_ops(inode, start, end);
1819	if (ret)
1820	goto out;
1821
1822	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
1823
1824	atomic_inc(v: &root->log_batch);
1825
1826	/*
1827	* Before we acquired the inode's lock and the mmap lock, someone may
1828	* have dirtied more pages in the target range. We need to make sure
1829	* that writeback for any such pages does not start while we are logging
1830	* the inode, because if it does, any of the following might happen when
1831	* we are not doing a full inode sync:
1832	*
1833	* 1) We log an extent after its writeback finishes but before its
1834	* checksums are added to the csum tree, leading to -EIO errors
1835	* when attempting to read the extent after a log replay.
1836	*
1837	* 2) We can end up logging an extent before its writeback finishes.
1838	* Therefore after the log replay we will have a file extent item
1839	* pointing to an unwritten extent (and no data checksums as well).
1840	*
1841	* So trigger writeback for any eventual new dirty pages and then we
1842	* wait for all ordered extents to complete below.
1843	*/
1844	ret = start_ordered_ops(inode, start, end);
1845	if (ret) {
1846	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
1847	goto out;
1848	}
1849
1850	/*
1851	* Always check for the full sync flag while holding the inode's lock,
1852	* to avoid races with other tasks. The flag must be either set all the
1853	* time during logging or always off all the time while logging.
1854	* We check the flag here after starting delalloc above, because when
1855	* running delalloc the full sync flag may be set if we need to drop
1856	* extra extent map ranges due to temporary memory allocation failures.
1857	*/
1858	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1859	&BTRFS_I(inode)->runtime_flags);
1860
1861	/*
1862	* We have to do this here to avoid the priority inversion of waiting on
1863	* IO of a lower priority task while holding a transaction open.
1864	*
1865	* For a full fsync we wait for the ordered extents to complete while
1866	* for a fast fsync we wait just for writeback to complete, and then
1867	* attach the ordered extents to the transaction so that a transaction
1868	* commit waits for their completion, to avoid data loss if we fsync,
1869	* the current transaction commits before the ordered extents complete
1870	* and a power failure happens right after that.
1871	*
1872	* For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1873	* logical address recorded in the ordered extent may change. We need
1874	* to wait for the IO to stabilize the logical address.
1875	*/
1876	if (full_sync \|\| btrfs_is_zoned(fs_info)) {
1877	ret = btrfs_wait_ordered_range(inode, start, len);
1878	} else {
1879	/*
1880	* Get our ordered extents as soon as possible to avoid doing
1881	* checksum lookups in the csum tree, and use instead the
1882	* checksums attached to the ordered extents.
1883	*/
1884	btrfs_get_ordered_extents_for_logging(inode: BTRFS_I(inode),
1885	list: &ctx.ordered_extents);
1886	ret = filemap_fdatawait_range(inode->i_mapping, lstart: start, lend: end);
1887	}
1888
1889	if (ret)
1890	goto out_release_extents;
1891
1892	atomic_inc(v: &root->log_batch);
1893
1894	if (skip_inode_logging(ctx: &ctx)) {
1895	/*
1896	* We've had everything committed since the last time we were
1897	* modified so clear this flag in case it was set for whatever
1898	* reason, it's no longer relevant.
1899	*/
1900	clear_bit(nr: BTRFS_INODE_NEEDS_FULL_SYNC,
1901	addr: &BTRFS_I(inode)->runtime_flags);
1902	/*
1903	* An ordered extent might have started before and completed
1904	* already with io errors, in which case the inode was not
1905	* updated and we end up here. So check the inode's mapping
1906	* for any errors that might have happened since we last
1907	* checked called fsync.
1908	*/
1909	ret = filemap_check_wb_err(mapping: inode->i_mapping, since: file->f_wb_err);
1910	goto out_release_extents;
1911	}
1912
1913	btrfs_init_log_ctx_scratch_eb(ctx: &ctx);
1914
1915	/*
1916	* We use start here because we will need to wait on the IO to complete
1917	* in btrfs_sync_log, which could require joining a transaction (for
1918	* example checking cross references in the nocow path). If we use join
1919	* here we could get into a situation where we're waiting on IO to
1920	* happen that is blocked on a transaction trying to commit. With start
1921	* we inc the extwriter counter, so we wait for all extwriters to exit
1922	* before we start blocking joiners. This comment is to keep somebody
1923	* from thinking they are super smart and changing this to
1924	* btrfs_join_transaction coughJosefcough.
1925	*/
1926	trans = btrfs_start_transaction(root, num_items: `0`);
1927	if (IS_ERR(ptr: trans)) {
1928	ret = PTR_ERR(ptr: trans);
1929	goto out_release_extents;
1930	}
1931	trans->in_fsync = true;
1932
1933	ret = btrfs_log_dentry_safe(trans, dentry, ctx: &ctx);
1934	/*
1935	* Scratch eb no longer needed, release before syncing log or commit
1936	* transaction, to avoid holding unnecessary memory during such long
1937	* operations.
1938	*/
1939	if (ctx.scratch_eb) {
1940	free_extent_buffer(eb: ctx.scratch_eb);
1941	ctx.scratch_eb = NULL;
1942	}
1943	btrfs_release_log_ctx_extents(ctx: &ctx);
1944	if (ret < `0`) {
1945	/ Fallthrough and commit/free transaction. /
1946	ret = BTRFS_LOG_FORCE_COMMIT;
1947	}
1948
1949	/ we've logged all the items and now have a consistent*
1950	* version of the file in the log. It is possible that
1951	* someone will come in and modify the file, but that's
1952	* fine because the log is consistent on disk, and we
1953	* have references to all of the file's extents
1954	*
1955	* It is possible that someone will come in and log the
1956	* file again, but that will end up using the synchronization
1957	* inside btrfs_sync_log to keep things safe.
1958	*/
1959	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
1960
1961	if (ret == BTRFS_NO_LOG_SYNC) {
1962	ret = btrfs_end_transaction(trans);
1963	goto out;
1964	}
1965
1966	/ We successfully logged the inode, attempt to sync the log. /
1967	if (!ret) {
1968	ret = btrfs_sync_log(trans, root, ctx: &ctx);
1969	if (!ret) {
1970	ret = btrfs_end_transaction(trans);
1971	goto out;
1972	}
1973	}
1974
1975	/*
1976	* At this point we need to commit the transaction because we had
1977	* btrfs_need_log_full_commit() or some other error.
1978	*
1979	* If we didn't do a full sync we have to stop the trans handle, wait on
1980	* the ordered extents, start it again and commit the transaction. If
1981	* we attempt to wait on the ordered extents here we could deadlock with
1982	* something like fallocate() that is holding the extent lock trying to
1983	* start a transaction while some other thread is trying to commit the
1984	* transaction while we (fsync) are currently holding the transaction
1985	* open.
1986	*/
1987	if (!full_sync) {
1988	ret = btrfs_end_transaction(trans);
1989	if (ret)
1990	goto out;
1991	ret = btrfs_wait_ordered_range(inode, start, len);
1992	if (ret)
1993	goto out;
1994
1995	/*
1996	* This is safe to use here because we're only interested in
1997	* making sure the transaction that had the ordered extents is
1998	* committed. We aren't waiting on anything past this point,
1999	* we're purely getting the transaction and committing it.
2000	*/
2001	trans = btrfs_attach_transaction_barrier(root);
2002	if (IS_ERR(ptr: trans)) {
2003	ret = PTR_ERR(ptr: trans);
2004
2005	/*
2006	* We committed the transaction and there's no currently
2007	* running transaction, this means everything we care
2008	* about made it to disk and we are done.
2009	*/
2010	if (ret == -ENOENT)
2011	ret = `0`;
2012	goto out;
2013	}
2014	}
2015
2016	ret = btrfs_commit_transaction(trans);
2017	out:
2018	free_extent_buffer(eb: ctx.scratch_eb);
2019	ASSERT(list_empty(&ctx.list));
2020	ASSERT(list_empty(&ctx.conflict_inodes));
2021	err = file_check_and_advance_wb_err(file);
2022	if (!ret)
2023	ret = err;
2024	return ret > `0` ? -EIO : ret;
2025
2026	out_release_extents:
2027	btrfs_release_log_ctx_extents(ctx: &ctx);
2028	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
2029	goto out;
2030	}
2031
2032	static const struct vm_operations_struct btrfs_file_vm_ops = {
2033	.fault = filemap_fault,
2034	.map_pages = filemap_map_pages,
2035	.page_mkwrite = btrfs_page_mkwrite,
2036	};
2037
2038	static int btrfs_file_mmap(struct file filp, struct* vm_area_struct *vma)
2039	{
2040	struct address_space *mapping = filp->f_mapping;
2041
2042	if (!mapping->a_ops->read_folio)
2043	return -ENOEXEC;
2044
2045	file_accessed(file: filp);
2046	vma->vm_ops = &btrfs_file_vm_ops;
2047
2048	return `0`;
2049	}
2050
2051	static int hole_mergeable(struct btrfs_inode inode, struct* extent_buffer *leaf,
2052	int slot, u64 start, u64 end)
2053	{
2054	struct btrfs_file_extent_item *fi;
2055	struct btrfs_key key;
2056
2057	if (slot < `0` \|\| slot >= btrfs_header_nritems(eb: leaf))
2058	return `0`;
2059
2060	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
2061	if (key.objectid != btrfs_ino(inode) \|\|
2062	key.type != BTRFS_EXTENT_DATA_KEY)
2063	return `0`;
2064
2065	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2066
2067	if (btrfs_file_extent_type(eb: leaf, s: fi) != BTRFS_FILE_EXTENT_REG)
2068	return `0`;
2069
2070	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: fi))
2071	return `0`;
2072
2073	if (key.offset == end)
2074	return `1`;
2075	if (key.offset + btrfs_file_extent_num_bytes(eb: leaf, s: fi) == start)
2076	return `1`;
2077	return `0`;
2078	}
2079
2080	static int fill_holes(struct btrfs_trans_handle *trans,
2081	struct btrfs_inode *inode,
2082	struct btrfs_path *path, u64 offset, u64 end)
2083	{
2084	struct btrfs_fs_info *fs_info = trans->fs_info;
2085	struct btrfs_root *root = inode->root;
2086	struct extent_buffer *leaf;
2087	struct btrfs_file_extent_item *fi;
2088	struct extent_map *hole_em;
2089	struct btrfs_key key;
2090	int ret;
2091
2092	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2093	goto out;
2094
2095	key.objectid = btrfs_ino(inode);
2096	key.type = BTRFS_EXTENT_DATA_KEY;
2097	key.offset = offset;
2098
2099	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: `0`, cow: `1`);
2100	if (ret <= `0`) {
2101	/*
2102	* We should have dropped this offset, so if we find it then
2103	* something has gone horribly wrong.
2104	*/
2105	if (ret == `0`)
2106	ret = -EINVAL;
2107	return ret;
2108	}
2109
2110	leaf = path->nodes[`0`];
2111	if (hole_mergeable(inode, leaf, slot: path->slots[`0`] - `1`, start: offset, end)) {
2112	u64 num_bytes;
2113
2114	path->slots[`0`]--;
2115	fi = btrfs_item_ptr(leaf, path->slots[`0`],
2116	struct btrfs_file_extent_item);
2117	num_bytes = btrfs_file_extent_num_bytes(eb: leaf, s: fi) +
2118	end - offset;
2119	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi, val: num_bytes);
2120	btrfs_set_file_extent_ram_bytes(eb: leaf, s: fi, val: num_bytes);
2121	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: `0`);
2122	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
2123	btrfs_mark_buffer_dirty(trans, buf: leaf);
2124	goto out;
2125	}
2126
2127	if (hole_mergeable(inode, leaf, slot: path->slots[`0`], start: offset, end)) {
2128	u64 num_bytes;
2129
2130	key.offset = offset;
2131	btrfs_set_item_key_safe(trans, path, new_key: &key);
2132	fi = btrfs_item_ptr(leaf, path->slots[`0`],
2133	struct btrfs_file_extent_item);
2134	num_bytes = btrfs_file_extent_num_bytes(eb: leaf, s: fi) + end -
2135	offset;
2136	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi, val: num_bytes);
2137	btrfs_set_file_extent_ram_bytes(eb: leaf, s: fi, val: num_bytes);
2138	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: `0`);
2139	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
2140	btrfs_mark_buffer_dirty(trans, buf: leaf);
2141	goto out;
2142	}
2143	btrfs_release_path(p: path);
2144
2145	ret = btrfs_insert_hole_extent(trans, root, objectid: btrfs_ino(inode), pos: offset,
2146	num_bytes: end - offset);
2147	if (ret)
2148	return ret;
2149
2150	out:
2151	btrfs_release_path(p: path);
2152
2153	hole_em = alloc_extent_map();
2154	if (!hole_em) {
2155	btrfs_drop_extent_map_range(inode, start: offset, end: end - `1`, skip_pinned: false);
2156	btrfs_set_inode_full_sync(inode);
2157	} else {
2158	hole_em->start = offset;
2159	hole_em->len = end - offset;
2160	hole_em->ram_bytes = hole_em->len;
2161	hole_em->orig_start = offset;
2162
2163	hole_em->block_start = EXTENT_MAP_HOLE;
2164	hole_em->block_len = `0`;
2165	hole_em->orig_block_len = `0`;
2166	hole_em->generation = trans->transid;
2167
2168	ret = btrfs_replace_extent_map_range(inode, new_em: hole_em, modified: true);
2169	free_extent_map(em: hole_em);
2170	if (ret)
2171	btrfs_set_inode_full_sync(inode);
2172	}
2173
2174	return `0`;
2175	}
2176
2177	/*
2178	* Find a hole extent on given inode and change start/len to the end of hole
2179	* extent.(hole/vacuum extent whose em->start <= start &&
2180	* em->start + em->len > start)
2181	* When a hole extent is found, return 1 and modify start/len.
2182	*/
2183	static int find_first_non_hole(struct btrfs_inode inode, u64 start, u64 *len)
2184	{
2185	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2186	struct extent_map *em;
2187	int ret = `0`;
2188
2189	em = btrfs_get_extent(inode, NULL,
2190	round_down(*start, fs_info->sectorsize),
2191	round_up(*len, fs_info->sectorsize));
2192	if (IS_ERR(ptr: em))
2193	return PTR_ERR(ptr: em);
2194
2195	/ Hole or vacuum extent(only exists in no-hole mode) /
2196	if (em->block_start == EXTENT_MAP_HOLE) {
2197	ret = `1`;
2198	len = em->start + em->len > start + *len ?
2199	`0` : start + len - em->start - em->len;
2200	*start = em->start + em->len;
2201	}
2202	free_extent_map(em);
2203	return ret;
2204	}
2205
2206	static void btrfs_punch_hole_lock_range(struct inode *inode,
2207	const u64 lockstart,
2208	const u64 lockend,
2209	struct extent_state **cached_state)
2210	{
2211	/*
2212	* For subpage case, if the range is not at page boundary, we could
2213	* have pages at the leading/tailing part of the range.
2214	* This could lead to dead loop since filemap_range_has_page()
2215	* will always return true.
2216	* So here we need to do extra page alignment for
2217	* filemap_range_has_page().
2218	*/
2219	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2220	const u64 page_lockend = round_down(lockend + `1`, PAGE_SIZE) - `1`;
2221
2222	while (`1`) {
2223	truncate_pagecache_range(inode, offset: lockstart, end: lockend);
2224
2225	lock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart, end: lockend,
2226	cached: cached_state);
2227	/*
2228	* We can't have ordered extents in the range, nor dirty/writeback
2229	* pages, because we have locked the inode's VFS lock in exclusive
2230	* mode, we have locked the inode's i_mmap_lock in exclusive mode,
2231	* we have flushed all delalloc in the range and we have waited
2232	* for any ordered extents in the range to complete.
2233	* We can race with anyone reading pages from this range, so after
2234	* locking the range check if we have pages in the range, and if
2235	* we do, unlock the range and retry.
2236	*/
2237	if (!filemap_range_has_page(inode->i_mapping, lstart: page_lockstart,
2238	lend: page_lockend))
2239	break;
2240
2241	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart, end: lockend,
2242	cached: cached_state);
2243	}
2244
2245	btrfs_assert_inode_range_clean(inode: BTRFS_I(inode), start: lockstart, end: lockend);
2246	}
2247
2248	static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2249	struct btrfs_inode *inode,
2250	struct btrfs_path *path,
2251	struct btrfs_replace_extent_info *extent_info,
2252	const u64 replace_len,
2253	const u64 bytes_to_drop)
2254	{
2255	struct btrfs_fs_info *fs_info = trans->fs_info;
2256	struct btrfs_root *root = inode->root;
2257	struct btrfs_file_extent_item *extent;
2258	struct extent_buffer *leaf;
2259	struct btrfs_key key;
2260	int slot;
2261	struct btrfs_ref ref = { `0` };
2262	int ret;
2263
2264	if (replace_len == `0`)
2265	return `0`;
2266
2267	if (extent_info->disk_offset == `0` &&
2268	btrfs_fs_incompat(fs_info, NO_HOLES)) {
2269	btrfs_update_inode_bytes(inode, add_bytes: `0`, del_bytes: bytes_to_drop);
2270	return `0`;
2271	}
2272
2273	key.objectid = btrfs_ino(inode);
2274	key.type = BTRFS_EXTENT_DATA_KEY;
2275	key.offset = extent_info->file_offset;
2276	ret = btrfs_insert_empty_item(trans, root, path, key: &key,
2277	data_size: sizeof(struct btrfs_file_extent_item));
2278	if (ret)
2279	return ret;
2280	leaf = path->nodes[`0`];
2281	slot = path->slots[`0`];
2282	write_extent_buffer(eb: leaf, src: extent_info->extent_buf,
2283	btrfs_item_ptr_offset(leaf, slot),
2284	len: sizeof(struct btrfs_file_extent_item));
2285	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2286	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2287	btrfs_set_file_extent_offset(eb: leaf, s: extent, val: extent_info->data_offset);
2288	btrfs_set_file_extent_num_bytes(eb: leaf, s: extent, val: replace_len);
2289	if (extent_info->is_new_extent)
2290	btrfs_set_file_extent_generation(eb: leaf, s: extent, val: trans->transid);
2291	btrfs_mark_buffer_dirty(trans, buf: leaf);
2292	btrfs_release_path(p: path);
2293
2294	ret = btrfs_inode_set_file_extent_range(inode, start: extent_info->file_offset,
2295	len: replace_len);
2296	if (ret)
2297	return ret;
2298
2299	/ If it's a hole, nothing more needs to be done. /
2300	if (extent_info->disk_offset == `0`) {
2301	btrfs_update_inode_bytes(inode, add_bytes: `0`, del_bytes: bytes_to_drop);
2302	return `0`;
2303	}
2304
2305	btrfs_update_inode_bytes(inode, add_bytes: replace_len, del_bytes: bytes_to_drop);
2306
2307	if (extent_info->is_new_extent && extent_info->insertions == `0`) {
2308	key.objectid = extent_info->disk_offset;
2309	key.type = BTRFS_EXTENT_ITEM_KEY;
2310	key.offset = extent_info->disk_len;
2311	ret = btrfs_alloc_reserved_file_extent(trans, root,
2312	owner: btrfs_ino(inode),
2313	offset: extent_info->file_offset,
2314	ram_bytes: extent_info->qgroup_reserved,
2315	ins: &key);
2316	} else {
2317	u64 ref_offset;
2318
2319	btrfs_init_generic_ref(generic_ref: &ref, action: BTRFS_ADD_DELAYED_REF,
2320	bytenr: extent_info->disk_offset,
2321	len: extent_info->disk_len, parent: `0`,
2322	owning_root: root->root_key.objectid);
2323	ref_offset = extent_info->file_offset - extent_info->data_offset;
2324	btrfs_init_data_ref(generic_ref: &ref, ref_root: root->root_key.objectid,
2325	ino: btrfs_ino(inode), offset: ref_offset, mod_root: `0`, skip_qgroup: false);
2326	ret = btrfs_inc_extent_ref(trans, generic_ref: &ref);
2327	}
2328
2329	extent_info->insertions++;
2330
2331	return ret;
2332	}
2333
2334	/*
2335	* The respective range must have been previously locked, as well as the inode.
2336	* The end offset is inclusive (last byte of the range).
2337	* @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2338	* the file range with an extent.
2339	* When not punching a hole, we don't want to end up in a state where we dropped
2340	* extents without inserting a new one, so we must abort the transaction to avoid
2341	* a corruption.
2342	*/
2343	int btrfs_replace_file_extents(struct btrfs_inode *inode,
2344	struct btrfs_path path, const* u64 start,
2345	const u64 end,
2346	struct btrfs_replace_extent_info *extent_info,
2347	struct btrfs_trans_handle **trans_out)
2348	{
2349	struct btrfs_drop_extents_args drop_args = { `0` };
2350	struct btrfs_root *root = inode->root;
2351	struct btrfs_fs_info *fs_info = root->fs_info;
2352	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, num_items: `1`);
2353	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2354	struct btrfs_trans_handle *trans = NULL;
2355	struct btrfs_block_rsv *rsv;
2356	unsigned int rsv_count;
2357	u64 cur_offset;
2358	u64 len = end - start;
2359	int ret = `0`;
2360
2361	if (end <= start)
2362	return -EINVAL;
2363
2364	rsv = btrfs_alloc_block_rsv(fs_info, type: BTRFS_BLOCK_RSV_TEMP);
2365	if (!rsv) {
2366	ret = -ENOMEM;
2367	goto out;
2368	}
2369	rsv->size = btrfs_calc_insert_metadata_size(fs_info, num_items: `1`);
2370	rsv->failfast = true;
2371
2372	/*
2373	* 1 - update the inode
2374	* 1 - removing the extents in the range
2375	* 1 - adding the hole extent if no_holes isn't set or if we are
2376	* replacing the range with a new extent
2377	*/
2378	if (!btrfs_fs_incompat(fs_info, NO_HOLES) \|\| extent_info)
2379	rsv_count = `3`;
2380	else
2381	rsv_count = `2`;
2382
2383	trans = btrfs_start_transaction(root, num_items: rsv_count);
2384	if (IS_ERR(ptr: trans)) {
2385	ret = PTR_ERR(ptr: trans);
2386	trans = NULL;
2387	goto out_free;
2388	}
2389
2390	ret = btrfs_block_rsv_migrate(src_rsv: &fs_info->trans_block_rsv, dst_rsv: rsv,
2391	num_bytes: min_size, update_size: false);
2392	if (WARN_ON(ret))
2393	goto out_trans;
2394	trans->block_rsv = rsv;
2395
2396	cur_offset = start;
2397	drop_args.path = path;
2398	drop_args.end = end + `1`;
2399	drop_args.drop_cache = true;
2400	while (cur_offset < end) {
2401	drop_args.start = cur_offset;
2402	ret = btrfs_drop_extents(trans, root, inode, args: &drop_args);
2403	/ If we are punching a hole decrement the inode's byte count /
2404	if (!extent_info)
2405	btrfs_update_inode_bytes(inode, add_bytes: `0`,
2406	del_bytes: drop_args.bytes_found);
2407	if (ret != -ENOSPC) {
2408	/*
2409	* The only time we don't want to abort is if we are
2410	* attempting to clone a partial inline extent, in which
2411	* case we'll get EOPNOTSUPP. However if we aren't
2412	* clone we need to abort no matter what, because if we
2413	* got EOPNOTSUPP via prealloc then we messed up and
2414	* need to abort.
2415	*/
2416	if (ret &&
2417	(ret != -EOPNOTSUPP \|\|
2418	(extent_info && extent_info->is_new_extent)))
2419	btrfs_abort_transaction(trans, ret);
2420	break;
2421	}
2422
2423	trans->block_rsv = &fs_info->trans_block_rsv;
2424
2425	if (!extent_info && cur_offset < drop_args.drop_end &&
2426	cur_offset < ino_size) {
2427	ret = fill_holes(trans, inode, path, offset: cur_offset,
2428	end: drop_args.drop_end);
2429	if (ret) {
2430	/*
2431	* If we failed then we didn't insert our hole
2432	* entries for the area we dropped, so now the
2433	* fs is corrupted, so we must abort the
2434	* transaction.
2435	*/
2436	btrfs_abort_transaction(trans, ret);
2437	break;
2438	}
2439	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2440	/*
2441	* We are past the i_size here, but since we didn't
2442	* insert holes we need to clear the mapped area so we
2443	* know to not set disk_i_size in this area until a new
2444	* file extent is inserted here.
2445	*/
2446	ret = btrfs_inode_clear_file_extent_range(inode,
2447	start: cur_offset,
2448	len: drop_args.drop_end - cur_offset);
2449	if (ret) {
2450	/*
2451	* We couldn't clear our area, so we could
2452	* presumably adjust up and corrupt the fs, so
2453	* we need to abort.
2454	*/
2455	btrfs_abort_transaction(trans, ret);
2456	break;
2457	}
2458	}
2459
2460	if (extent_info &&
2461	drop_args.drop_end > extent_info->file_offset) {
2462	u64 replace_len = drop_args.drop_end -
2463	extent_info->file_offset;
2464
2465	ret = btrfs_insert_replace_extent(trans, inode, path,
2466	extent_info, replace_len,
2467	bytes_to_drop: drop_args.bytes_found);
2468	if (ret) {
2469	btrfs_abort_transaction(trans, ret);
2470	break;
2471	}
2472	extent_info->data_len -= replace_len;
2473	extent_info->data_offset += replace_len;
2474	extent_info->file_offset += replace_len;
2475	}
2476
2477	/*
2478	* We are releasing our handle on the transaction, balance the
2479	* dirty pages of the btree inode and flush delayed items, and
2480	* then get a new transaction handle, which may now point to a
2481	* new transaction in case someone else may have committed the
2482	* transaction we used to replace/drop file extent items. So
2483	* bump the inode's iversion and update mtime and ctime except
2484	* if we are called from a dedupe context. This is because a
2485	* power failure/crash may happen after the transaction is
2486	* committed and before we finish replacing/dropping all the
2487	* file extent items we need.
2488	*/
2489	inode_inc_iversion(inode: &inode->vfs_inode);
2490
2491	if (!extent_info \|\| extent_info->update_times)
2492	inode_set_mtime_to_ts(inode: &inode->vfs_inode,
2493	ts: inode_set_ctime_current(inode: &inode->vfs_inode));
2494
2495	ret = btrfs_update_inode(trans, inode);
2496	if (ret)
2497	break;
2498
2499	btrfs_end_transaction(trans);
2500	btrfs_btree_balance_dirty(fs_info);
2501
2502	trans = btrfs_start_transaction(root, num_items: rsv_count);
2503	if (IS_ERR(ptr: trans)) {
2504	ret = PTR_ERR(ptr: trans);
2505	trans = NULL;
2506	break;
2507	}
2508
2509	ret = btrfs_block_rsv_migrate(src_rsv: &fs_info->trans_block_rsv,
2510	dst_rsv: rsv, num_bytes: min_size, update_size: false);
2511	if (WARN_ON(ret))
2512	break;
2513	trans->block_rsv = rsv;
2514
2515	cur_offset = drop_args.drop_end;
2516	len = end - cur_offset;
2517	if (!extent_info && len) {
2518	ret = find_first_non_hole(inode, start: &cur_offset, len: &len);
2519	if (unlikely(ret < `0`))
2520	break;
2521	if (ret && !len) {
2522	ret = `0`;
2523	break;
2524	}
2525	}
2526	}
2527
2528	/*
2529	* If we were cloning, force the next fsync to be a full one since we
2530	* we replaced (or just dropped in the case of cloning holes when
2531	* NO_HOLES is enabled) file extent items and did not setup new extent
2532	* maps for the replacement extents (or holes).
2533	*/
2534	if (extent_info && !extent_info->is_new_extent)
2535	btrfs_set_inode_full_sync(inode);
2536
2537	if (ret)
2538	goto out_trans;
2539
2540	trans->block_rsv = &fs_info->trans_block_rsv;
2541	/*
2542	* If we are using the NO_HOLES feature we might have had already an
2543	* hole that overlaps a part of the region [lockstart, lockend] and
2544	* ends at (or beyond) lockend. Since we have no file extent items to
2545	* represent holes, drop_end can be less than lockend and so we must
2546	* make sure we have an extent map representing the existing hole (the
2547	* call to __btrfs_drop_extents() might have dropped the existing extent
2548	* map representing the existing hole), otherwise the fast fsync path
2549	* will not record the existence of the hole region
2550	* [existing_hole_start, lockend].
2551	*/
2552	if (drop_args.drop_end <= end)
2553	drop_args.drop_end = end + `1`;
2554	/*
2555	* Don't insert file hole extent item if it's for a range beyond eof
2556	* (because it's useless) or if it represents a 0 bytes range (when
2557	* cur_offset == drop_end).
2558	*/
2559	if (!extent_info && cur_offset < ino_size &&
2560	cur_offset < drop_args.drop_end) {
2561	ret = fill_holes(trans, inode, path, offset: cur_offset,
2562	end: drop_args.drop_end);
2563	if (ret) {
2564	/ Same comment as above. /
2565	btrfs_abort_transaction(trans, ret);
2566	goto out_trans;
2567	}
2568	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2569	/ See the comment in the loop above for the reasoning here. /
2570	ret = btrfs_inode_clear_file_extent_range(inode, start: cur_offset,
2571	len: drop_args.drop_end - cur_offset);
2572	if (ret) {
2573	btrfs_abort_transaction(trans, ret);
2574	goto out_trans;
2575	}
2576
2577	}
2578	if (extent_info) {
2579	ret = btrfs_insert_replace_extent(trans, inode, path,
2580	extent_info, replace_len: extent_info->data_len,
2581	bytes_to_drop: drop_args.bytes_found);
2582	if (ret) {
2583	btrfs_abort_transaction(trans, ret);
2584	goto out_trans;
2585	}
2586	}
2587
2588	out_trans:
2589	if (!trans)
2590	goto out_free;
2591
2592	trans->block_rsv = &fs_info->trans_block_rsv;
2593	if (ret)
2594	btrfs_end_transaction(trans);
2595	else
2596	*trans_out = trans;
2597	out_free:
2598	btrfs_free_block_rsv(fs_info, rsv);
2599	out:
2600	return ret;
2601	}
2602
2603	static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2604	{
2605	struct inode *inode = file_inode(f: file);
2606	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2607	struct btrfs_root *root = BTRFS_I(inode)->root;
2608	struct extent_state *cached_state = NULL;
2609	struct btrfs_path *path;
2610	struct btrfs_trans_handle *trans = NULL;
2611	u64 lockstart;
2612	u64 lockend;
2613	u64 tail_start;
2614	u64 tail_len;
2615	u64 orig_start = offset;
2616	int ret = `0`;
2617	bool same_block;
2618	u64 ino_size;
2619	bool truncated_block = false;
2620	bool updated_inode = false;
2621
2622	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
2623
2624	ret = btrfs_wait_ordered_range(inode, start: offset, len);
2625	if (ret)
2626	goto out_only_mutex;
2627
2628	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2629	ret = find_first_non_hole(inode: BTRFS_I(inode), start: &offset, len: &len);
2630	if (ret < `0`)
2631	goto out_only_mutex;
2632	if (ret && !len) {
2633	/ Already in a large hole /
2634	ret = `0`;
2635	goto out_only_mutex;
2636	}
2637
2638	ret = file_modified(file);
2639	if (ret)
2640	goto out_only_mutex;
2641
2642	lockstart = round_up(offset, fs_info->sectorsize);
2643	lockend = round_down(offset + len, fs_info->sectorsize) - `1`;
2644	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2645	== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - `1`));
2646	/*
2647	* We needn't truncate any block which is beyond the end of the file
2648	* because we are sure there is no data there.
2649	*/
2650	/*
2651	* Only do this if we are in the same block and we aren't doing the
2652	* entire block.
2653	*/
2654	if (same_block && len < fs_info->sectorsize) {
2655	if (offset < ino_size) {
2656	truncated_block = true;
2657	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset, len,
2658	front: `0`);
2659	} else {
2660	ret = `0`;
2661	}
2662	goto out_only_mutex;
2663	}
2664
2665	/ zero back part of the first block /
2666	if (offset < ino_size) {
2667	truncated_block = true;
2668	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset, len: `0`, front: `0`);
2669	if (ret) {
2670	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
2671	return ret;
2672	}
2673	}
2674
2675	/ Check the aligned pages after the first unaligned page,*
2676	* if offset != orig_start, which means the first unaligned page
2677	* including several following pages are already in holes,
2678	* the extra check can be skipped */
2679	if (offset == orig_start) {
2680	/ after truncate page, check hole again /
2681	len = offset + len - lockstart;
2682	offset = lockstart;
2683	ret = find_first_non_hole(inode: BTRFS_I(inode), start: &offset, len: &len);
2684	if (ret < `0`)
2685	goto out_only_mutex;
2686	if (ret && !len) {
2687	ret = `0`;
2688	goto out_only_mutex;
2689	}
2690	lockstart = offset;
2691	}
2692
2693	/ Check the tail unaligned part is in a hole /
2694	tail_start = lockend + `1`;
2695	tail_len = offset + len - tail_start;
2696	if (tail_len) {
2697	ret = find_first_non_hole(inode: BTRFS_I(inode), start: &tail_start, len: &tail_len);
2698	if (unlikely(ret < `0`))
2699	goto out_only_mutex;
2700	if (!ret) {
2701	/ zero the front end of the last page /
2702	if (tail_start + tail_len < ino_size) {
2703	truncated_block = true;
2704	ret = btrfs_truncate_block(inode: BTRFS_I(inode),
2705	from: tail_start + tail_len,
2706	len: `0`, front: `1`);
2707	if (ret)
2708	goto out_only_mutex;
2709	}
2710	}
2711	}
2712
2713	if (lockend < lockstart) {
2714	ret = `0`;
2715	goto out_only_mutex;
2716	}
2717
2718	btrfs_punch_hole_lock_range(inode, lockstart, lockend, cached_state: &cached_state);
2719
2720	path = btrfs_alloc_path();
2721	if (!path) {
2722	ret = -ENOMEM;
2723	goto out;
2724	}
2725
2726	ret = btrfs_replace_file_extents(inode: BTRFS_I(inode), path, start: lockstart,
2727	end: lockend, NULL, trans_out: &trans);
2728	btrfs_free_path(p: path);
2729	if (ret)
2730	goto out;
2731
2732	ASSERT(trans != NULL);
2733	inode_inc_iversion(inode);
2734	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
2735	ret = btrfs_update_inode(trans, inode: BTRFS_I(inode));
2736	updated_inode = true;
2737	btrfs_end_transaction(trans);
2738	btrfs_btree_balance_dirty(fs_info);
2739	out:
2740	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart, end: lockend,
2741	cached: &cached_state);
2742	out_only_mutex:
2743	if (!updated_inode && truncated_block && !ret) {
2744	/*
2745	* If we only end up zeroing part of a page, we still need to
2746	* update the inode item, so that all the time fields are
2747	* updated as well as the necessary btrfs inode in memory fields
2748	* for detecting, at fsync time, if the inode isn't yet in the
2749	* log tree or it's there but not up to date.
2750	*/
2751	struct timespec64 now = inode_set_ctime_current(inode);
2752
2753	inode_inc_iversion(inode);
2754	inode_set_mtime_to_ts(inode, ts: now);
2755	trans = btrfs_start_transaction(root, num_items: `1`);
2756	if (IS_ERR(ptr: trans)) {
2757	ret = PTR_ERR(ptr: trans);
2758	} else {
2759	int ret2;
2760
2761	ret = btrfs_update_inode(trans, inode: BTRFS_I(inode));
2762	ret2 = btrfs_end_transaction(trans);
2763	if (!ret)
2764	ret = ret2;
2765	}
2766	}
2767	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
2768	return ret;
2769	}
2770
2771	/ Helper structure to record which range is already reserved /
2772	struct falloc_range {
2773	struct list_head list;
2774	u64 start;
2775	u64 len;
2776	};
2777
2778	/*
2779	* Helper function to add falloc range
2780	*
2781	* Caller should have locked the larger range of extent containing
2782	* [start, len)
2783	*/
2784	static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2785	{
2786	struct falloc_range *range = NULL;
2787
2788	if (!list_empty(head)) {
2789	/*
2790	* As fallocate iterates by bytenr order, we only need to check
2791	* the last range.
2792	*/
2793	range = list_last_entry(head, struct falloc_range, list);
2794	if (range->start + range->len == start) {
2795	range->len += len;
2796	return `0`;
2797	}
2798	}
2799
2800	range = kmalloc(size: sizeof(*range), GFP_KERNEL);
2801	if (!range)
2802	return -ENOMEM;
2803	range->start = start;
2804	range->len = len;
2805	list_add_tail(new: &range->list, head);
2806	return `0`;
2807	}
2808
2809	static int btrfs_fallocate_update_isize(struct inode *inode,
2810	const u64 end,
2811	const int mode)
2812	{
2813	struct btrfs_trans_handle *trans;
2814	struct btrfs_root *root = BTRFS_I(inode)->root;
2815	int ret;
2816	int ret2;
2817
2818	if (mode & FALLOC_FL_KEEP_SIZE \|\| end <= i_size_read(inode))
2819	return `0`;
2820
2821	trans = btrfs_start_transaction(root, num_items: `1`);
2822	if (IS_ERR(ptr: trans))
2823	return PTR_ERR(ptr: trans);
2824
2825	inode_set_ctime_current(inode);
2826	i_size_write(inode, i_size: end);
2827	btrfs_inode_safe_disk_i_size_write(inode: BTRFS_I(inode), new_i_size: `0`);
2828	ret = btrfs_update_inode(trans, inode: BTRFS_I(inode));
2829	ret2 = btrfs_end_transaction(trans);
2830
2831	return ret ? ret : ret2;
2832	}
2833
/*
 * Classification of the extent found at a range boundary, as returned by
 * btrfs_zero_range_check_range_boundary().
 */
enum {
	/* The extent is neither a hole nor a prealloc extent. */
	RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
	/* The extent map has EXTENT_FLAG_PREALLOC set. */
	RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
	/* The extent map is a hole (block_start == EXTENT_MAP_HOLE). */
	RANGE_BOUNDARY_HOLE = 2,
};
2839
2840	static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2841	u64 offset)
2842	{
2843	const u64 sectorsize = inode->root->fs_info->sectorsize;
2844	struct extent_map *em;
2845	int ret;
2846
2847	offset = round_down(offset, sectorsize);
2848	em = btrfs_get_extent(inode, NULL, start: offset, len: sectorsize);
2849	if (IS_ERR(ptr: em))
2850	return PTR_ERR(ptr: em);
2851
2852	if (em->block_start == EXTENT_MAP_HOLE)
2853	ret = RANGE_BOUNDARY_HOLE;
2854	else if (em->flags & EXTENT_FLAG_PREALLOC)
2855	ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2856	else
2857	ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2858
2859	free_extent_map(em);
2860	return ret;
2861	}
2862
2863	static int btrfs_zero_range(struct inode *inode,
2864	loff_t offset,
2865	loff_t len,
2866	const int mode)
2867	{
2868	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2869	struct extent_map *em;
2870	struct extent_changeset *data_reserved = NULL;
2871	int ret;
2872	u64 alloc_hint = `0`;
2873	const u64 sectorsize = fs_info->sectorsize;
2874	u64 alloc_start = round_down(offset, sectorsize);
2875	u64 alloc_end = round_up(offset + len, sectorsize);
2876	u64 bytes_to_reserve = `0`;
2877	bool space_reserved = false;
2878
2879	em = btrfs_get_extent(inode: BTRFS_I(inode), NULL, start: alloc_start,
2880	len: alloc_end - alloc_start);
2881	if (IS_ERR(ptr: em)) {
2882	ret = PTR_ERR(ptr: em);
2883	goto out;
2884	}
2885
2886	/*
2887	* Avoid hole punching and extent allocation for some cases. More cases
2888	* could be considered, but these are unlikely common and we keep things
2889	* as simple as possible for now. Also, intentionally, if the target
2890	* range contains one or more prealloc extents together with regular
2891	* extents and holes, we drop all the existing extents and allocate a
2892	* new prealloc extent, so that we get a larger contiguous disk extent.
2893	*/
2894	if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
2895	const u64 em_end = em->start + em->len;
2896
2897	if (em_end >= offset + len) {
2898	/*
2899	* The whole range is already a prealloc extent,
2900	* do nothing except updating the inode's i_size if
2901	* needed.
2902	*/
2903	free_extent_map(em);
2904	ret = btrfs_fallocate_update_isize(inode, end: offset + len,
2905	mode);
2906	goto out;
2907	}
2908	/*
2909	* Part of the range is already a prealloc extent, so operate
2910	* only on the remaining part of the range.
2911	*/
2912	alloc_start = em_end;
2913	ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2914	len = offset + len - alloc_start;
2915	offset = alloc_start;
2916	alloc_hint = em->block_start + em->len;
2917	}
2918	free_extent_map(em);
2919
2920	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2921	BTRFS_BYTES_TO_BLKS(fs_info, offset + len - `1`)) {
2922	em = btrfs_get_extent(inode: BTRFS_I(inode), NULL, start: alloc_start, len: sectorsize);
2923	if (IS_ERR(ptr: em)) {
2924	ret = PTR_ERR(ptr: em);
2925	goto out;
2926	}
2927
2928	if (em->flags & EXTENT_FLAG_PREALLOC) {
2929	free_extent_map(em);
2930	ret = btrfs_fallocate_update_isize(inode, end: offset + len,
2931	mode);
2932	goto out;
2933	}
2934	if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
2935	free_extent_map(em);
2936	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset, len,
2937	front: `0`);
2938	if (!ret)
2939	ret = btrfs_fallocate_update_isize(inode,
2940	end: offset + len,
2941	mode);
2942	return ret;
2943	}
2944	free_extent_map(em);
2945	alloc_start = round_down(offset, sectorsize);
2946	alloc_end = alloc_start + sectorsize;
2947	goto reserve_space;
2948	}
2949
2950	alloc_start = round_up(offset, sectorsize);
2951	alloc_end = round_down(offset + len, sectorsize);
2952
2953	/*
2954	* For unaligned ranges, check the pages at the boundaries, they might
2955	* map to an extent, in which case we need to partially zero them, or
2956	* they might map to a hole, in which case we need our allocation range
2957	* to cover them.
2958	*/
2959	if (!IS_ALIGNED(offset, sectorsize)) {
2960	ret = btrfs_zero_range_check_range_boundary(inode: BTRFS_I(inode),
2961	offset);
2962	if (ret < `0`)
2963	goto out;
2964	if (ret == RANGE_BOUNDARY_HOLE) {
2965	alloc_start = round_down(offset, sectorsize);
2966	ret = `0`;
2967	} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2968	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset, len: `0`, front: `0`);
2969	if (ret)
2970	goto out;
2971	} else {
2972	ret = `0`;
2973	}
2974	}
2975
2976	if (!IS_ALIGNED(offset + len, sectorsize)) {
2977	ret = btrfs_zero_range_check_range_boundary(inode: BTRFS_I(inode),
2978	offset: offset + len);
2979	if (ret < `0`)
2980	goto out;
2981	if (ret == RANGE_BOUNDARY_HOLE) {
2982	alloc_end = round_up(offset + len, sectorsize);
2983	ret = `0`;
2984	} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2985	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset + len,
2986	len: `0`, front: `1`);
2987	if (ret)
2988	goto out;
2989	} else {
2990	ret = `0`;
2991	}
2992	}
2993
2994	reserve_space:
2995	if (alloc_start < alloc_end) {
2996	struct extent_state *cached_state = NULL;
2997	const u64 lockstart = alloc_start;
2998	const u64 lockend = alloc_end - `1`;
2999
3000	bytes_to_reserve = alloc_end - alloc_start;
3001	ret = btrfs_alloc_data_chunk_ondemand(inode: BTRFS_I(inode),
3002	bytes: bytes_to_reserve);
3003	if (ret < `0`)
3004	goto out;
3005	space_reserved = true;
3006	btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3007	cached_state: &cached_state);
3008	ret = btrfs_qgroup_reserve_data(inode: BTRFS_I(inode), reserved: &data_reserved,
3009	start: alloc_start, len: bytes_to_reserve);
3010	if (ret) {
3011	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart,
3012	end: lockend, cached: &cached_state);
3013	goto out;
3014	}
3015	ret = btrfs_prealloc_file_range(inode, mode, start: alloc_start,
3016	num_bytes: alloc_end - alloc_start,
3017	min_size: fs_info->sectorsize,
3018	actual_len: offset + len, alloc_hint: &alloc_hint);
3019	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart, end: lockend,
3020	cached: &cached_state);
3021	/ btrfs_prealloc_file_range releases reserved space on error /
3022	if (ret) {
3023	space_reserved = false;
3024	goto out;
3025	}
3026	}
3027	ret = btrfs_fallocate_update_isize(inode, end: offset + len, mode);
3028	out:
3029	if (ret && space_reserved)
3030	btrfs_free_reserved_data_space(inode: BTRFS_I(inode), reserved: data_reserved,
3031	start: alloc_start, len: bytes_to_reserve);
3032	extent_changeset_free(changeset: data_reserved);
3033
3034	return ret;
3035	}
3036
3037	static long btrfs_fallocate(struct file file, int* mode,
3038	loff_t offset, loff_t len)
3039	{
3040	struct inode *inode = file_inode(f: file);
3041	struct extent_state *cached_state = NULL;
3042	struct extent_changeset *data_reserved = NULL;
3043	struct falloc_range *range;
3044	struct falloc_range *tmp;
3045	LIST_HEAD(reserve_list);
3046	u64 cur_offset;
3047	u64 last_byte;
3048	u64 alloc_start;
3049	u64 alloc_end;
3050	u64 alloc_hint = `0`;
3051	u64 locked_end;
3052	u64 actual_end = `0`;
3053	u64 data_space_needed = `0`;
3054	u64 data_space_reserved = `0`;
3055	u64 qgroup_reserved = `0`;
3056	struct extent_map *em;
3057	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3058	int ret;
3059
3060	/ Do not allow fallocate in ZONED mode /
3061	if (btrfs_is_zoned(inode_to_fs_info(inode)))
3062	return -EOPNOTSUPP;
3063
3064	alloc_start = round_down(offset, blocksize);
3065	alloc_end = round_up(offset + len, blocksize);
3066	cur_offset = alloc_start;
3067
3068	/ Make sure we aren't being give some crap mode /
3069	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE \|
3070	FALLOC_FL_ZERO_RANGE))
3071	return -EOPNOTSUPP;
3072
3073	if (mode & FALLOC_FL_PUNCH_HOLE)
3074	return btrfs_punch_hole(file, offset, len);
3075
3076	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
3077
3078	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3079	ret = inode_newsize_ok(inode, offset: offset + len);
3080	if (ret)
3081	goto out;
3082	}
3083
3084	ret = file_modified(file);
3085	if (ret)
3086	goto out;
3087
3088	/*
3089	* TODO: Move these two operations after we have checked
3090	* accurate reserved space, or fallocate can still fail but
3091	* with page truncated or size expanded.
3092	*
3093	* But that's a minor problem and won't do much harm BTW.
3094	*/
3095	if (alloc_start > inode->i_size) {
3096	ret = btrfs_cont_expand(inode: BTRFS_I(inode), oldsize: i_size_read(inode),
3097	size: alloc_start);
3098	if (ret)
3099	goto out;
3100	} else if (offset + len > inode->i_size) {
3101	/*
3102	* If we are fallocating from the end of the file onward we
3103	* need to zero out the end of the block if i_size lands in the
3104	* middle of a block.
3105	*/
3106	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: inode->i_size, len: `0`, front: `0`);
3107	if (ret)
3108	goto out;
3109	}
3110
3111	/*
3112	* We have locked the inode at the VFS level (in exclusive mode) and we
3113	* have locked the i_mmap_lock lock (in exclusive mode). Now before
3114	* locking the file range, flush all dealloc in the range and wait for
3115	* all ordered extents in the range to complete. After this we can lock
3116	* the file range and, due to the previous locking we did, we know there
3117	* can't be more delalloc or ordered extents in the range.
3118	*/
3119	ret = btrfs_wait_ordered_range(inode, start: alloc_start,
3120	len: alloc_end - alloc_start);
3121	if (ret)
3122	goto out;
3123
3124	if (mode & FALLOC_FL_ZERO_RANGE) {
3125	ret = btrfs_zero_range(inode, offset, len, mode);
3126	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
3127	return ret;
3128	}
3129
3130	locked_end = alloc_end - `1`;
3131	lock_extent(tree: &BTRFS_I(inode)->io_tree, start: alloc_start, end: locked_end,
3132	cached: &cached_state);
3133
3134	btrfs_assert_inode_range_clean(inode: BTRFS_I(inode), start: alloc_start, end: locked_end);
3135
3136	/ First, check if we exceed the qgroup limit /
3137	while (cur_offset < alloc_end) {
3138	em = btrfs_get_extent(inode: BTRFS_I(inode), NULL, start: cur_offset,
3139	len: alloc_end - cur_offset);
3140	if (IS_ERR(ptr: em)) {
3141	ret = PTR_ERR(ptr: em);
3142	break;
3143	}
3144	last_byte = min(extent_map_end(em), alloc_end);
3145	actual_end = min_t(u64, extent_map_end(em), offset + len);
3146	last_byte = ALIGN(last_byte, blocksize);
3147	if (em->block_start == EXTENT_MAP_HOLE \|\|
3148	(cur_offset >= inode->i_size &&
3149	!(em->flags & EXTENT_FLAG_PREALLOC))) {
3150	const u64 range_len = last_byte - cur_offset;
3151
3152	ret = add_falloc_range(head: &reserve_list, start: cur_offset, len: range_len);
3153	if (ret < `0`) {
3154	free_extent_map(em);
3155	break;
3156	}
3157	ret = btrfs_qgroup_reserve_data(inode: BTRFS_I(inode),
3158	reserved: &data_reserved, start: cur_offset, len: range_len);
3159	if (ret < `0`) {
3160	free_extent_map(em);
3161	break;
3162	}
3163	qgroup_reserved += range_len;
3164	data_space_needed += range_len;
3165	}
3166	free_extent_map(em);
3167	cur_offset = last_byte;
3168	}
3169
3170	if (!ret && data_space_needed > `0`) {
3171	/*
3172	* We are safe to reserve space here as we can't have delalloc
3173	* in the range, see above.
3174	*/
3175	ret = btrfs_alloc_data_chunk_ondemand(inode: BTRFS_I(inode),
3176	bytes: data_space_needed);
3177	if (!ret)
3178	data_space_reserved = data_space_needed;
3179	}
3180
3181	/*
3182	* If ret is still 0, means we're OK to fallocate.
3183	* Or just cleanup the list and exit.
3184	*/
3185	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3186	if (!ret) {
3187	ret = btrfs_prealloc_file_range(inode, mode,
3188	start: range->start,
3189	num_bytes: range->len, min_size: blocksize,
3190	actual_len: offset + len, alloc_hint: &alloc_hint);
3191	/*
3192	* btrfs_prealloc_file_range() releases space even
3193	* if it returns an error.
3194	*/
3195	data_space_reserved -= range->len;
3196	qgroup_reserved -= range->len;
3197	} else if (data_space_reserved > `0`) {
3198	btrfs_free_reserved_data_space(inode: BTRFS_I(inode),
3199	reserved: data_reserved, start: range->start,
3200	len: range->len);
3201	data_space_reserved -= range->len;
3202	qgroup_reserved -= range->len;
3203	} else if (qgroup_reserved > `0`) {
3204	btrfs_qgroup_free_data(inode: BTRFS_I(inode), reserved: data_reserved,
3205	start: range->start, len: range->len, NULL);
3206	qgroup_reserved -= range->len;
3207	}
3208	list_del(entry: &range->list);
3209	kfree(objp: range);
3210	}
3211	if (ret < `0`)
3212	goto out_unlock;
3213
3214	/*
3215	* We didn't need to allocate any more space, but we still extended the
3216	* size of the file so we need to update i_size and the inode item.
3217	*/
3218	ret = btrfs_fallocate_update_isize(inode, end: actual_end, mode);
3219	out_unlock:
3220	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: alloc_start, end: locked_end,
3221	cached: &cached_state);
3222	out:
3223	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
3224	extent_changeset_free(changeset: data_reserved);
3225	return ret;
3226	}
3227
3228	/*
3229	* Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3230	* that has unflushed and/or flushing delalloc. There might be other adjacent
3231	* subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3232	* looping while it gets adjacent subranges, and merging them together.
3233	*/
3234	static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3235	struct extent_state **cached_state,
3236	bool *search_io_tree,
3237	u64 delalloc_start_ret, u64 delalloc_end_ret)
3238	{
3239	u64 len = end + `1` - start;
3240	u64 delalloc_len = `0`;
3241	struct btrfs_ordered_extent *oe;
3242	u64 oe_start;
3243	u64 oe_end;
3244
3245	/*
3246	* Search the io tree first for EXTENT_DELALLOC. If we find any, it
3247	* means we have delalloc (dirty pages) for which writeback has not
3248	* started yet.
3249	*/
3250	if (*search_io_tree) {
3251	spin_lock(lock: &inode->lock);
3252	if (inode->delalloc_bytes > `0`) {
3253	spin_unlock(lock: &inode->lock);
3254	*delalloc_start_ret = start;
3255	delalloc_len = count_range_bits(tree: &inode->io_tree,
3256	start: delalloc_start_ret, search_end: end,
3257	max_bytes: len, bits: EXTENT_DELALLOC, contig: `1`,
3258	cached_state);
3259	} else {
3260	spin_unlock(lock: &inode->lock);
3261	}
3262	}
3263
3264	if (delalloc_len > `0`) {
3265	/*
3266	* If delalloc was found then *delalloc_start_ret has a sector size
3267	* aligned value (rounded down).
3268	*/
3269	delalloc_end_ret = delalloc_start_ret + delalloc_len - `1`;
3270
3271	if (*delalloc_start_ret == start) {
3272	/ Delalloc for the whole range, nothing more to do. /
3273	if (*delalloc_end_ret == end)
3274	return true;
3275	/ Else trim our search range for ordered extents. /
3276	start = *delalloc_end_ret + `1`;
3277	len = end + `1` - start;
3278	}
3279	} else {
3280	/ No delalloc, future calls don't need to search again. /
3281	*search_io_tree = false;
3282	}
3283
3284	/*
3285	* Now also check if there's any ordered extent in the range.
3286	* We do this because:
3287	*
3288	* 1) When delalloc is flushed, the file range is locked, we clear the
3289	* EXTENT_DELALLOC bit from the io tree and create an extent map and
3290	* an ordered extent for the write. So we might just have been called
3291	* after delalloc is flushed and before the ordered extent completes
3292	* and inserts the new file extent item in the subvolume's btree;
3293	*
3294	* 2) We may have an ordered extent created by flushing delalloc for a
3295	* subrange that starts before the subrange we found marked with
3296	* EXTENT_DELALLOC in the io tree.
3297	*
3298	* We could also use the extent map tree to find such delalloc that is
3299	* being flushed, but using the ordered extents tree is more efficient
3300	* because it's usually much smaller as ordered extents are removed from
3301	* the tree once they complete. With the extent maps, we mau have them
3302	* in the extent map tree for a very long time, and they were either
3303	* created by previous writes or loaded by read operations.
3304	*/
3305	oe = btrfs_lookup_first_ordered_range(inode, file_offset: start, len);
3306	if (!oe)
3307	return (delalloc_len > `0`);
3308
3309	/ The ordered extent may span beyond our search range. /
3310	oe_start = max(oe->file_offset, start);
3311	oe_end = min(oe->file_offset + oe->num_bytes - `1`, end);
3312
3313	btrfs_put_ordered_extent(entry: oe);
3314
3315	/ Don't have unflushed delalloc, return the ordered extent range. /
3316	if (delalloc_len == `0`) {
3317	*delalloc_start_ret = oe_start;
3318	*delalloc_end_ret = oe_end;
3319	return true;
3320	}
3321
3322	/*
3323	* We have both unflushed delalloc (io_tree) and an ordered extent.
3324	* If the ranges are adjacent returned a combined range, otherwise
3325	* return the leftmost range.
3326	*/
3327	if (oe_start < *delalloc_start_ret) {
3328	if (oe_end < *delalloc_start_ret)
3329	*delalloc_end_ret = oe_end;
3330	*delalloc_start_ret = oe_start;
3331	} else if (*delalloc_end_ret + `1` == oe_start) {
3332	*delalloc_end_ret = oe_end;
3333	}
3334
3335	return true;
3336	}
3337
3338	/*
3339	* Check if there's delalloc in a given range.
3340	*
3341	* @inode: The inode.
3342	* @start: The start offset of the range. It does not need to be
3343	* sector size aligned.
3344	* @end: The end offset (inclusive value) of the search range.
3345	* It does not need to be sector size aligned.
3346	* @cached_state: Extent state record used for speeding up delalloc
3347	* searches in the inode's io_tree. Can be NULL.
3348	* @delalloc_start_ret: Output argument, set to the start offset of the
3349	* subrange found with delalloc (may not be sector size
3350	* aligned).
3351	* @delalloc_end_ret: Output argument, set to he end offset (inclusive value)
3352	* of the subrange found with delalloc.
3353	*
3354	* Returns true if a subrange with delalloc is found within the given range, and
3355	* if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3356	* end offsets of the subrange.
3357	*/
3358	bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3359	struct extent_state **cached_state,
3360	u64 delalloc_start_ret, u64 delalloc_end_ret)
3361	{
3362	u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3363	u64 prev_delalloc_end = `0`;
3364	bool search_io_tree = true;
3365	bool ret = false;
3366
3367	while (cur_offset <= end) {
3368	u64 delalloc_start;
3369	u64 delalloc_end;
3370	bool delalloc;
3371
3372	delalloc = find_delalloc_subrange(inode, start: cur_offset, end,
3373	cached_state, search_io_tree: &search_io_tree,
3374	delalloc_start_ret: &delalloc_start,
3375	delalloc_end_ret: &delalloc_end);
3376	if (!delalloc)
3377	break;
3378
3379	if (prev_delalloc_end == `0`) {
3380	/ First subrange found. /
3381	*delalloc_start_ret = max(delalloc_start, start);
3382	*delalloc_end_ret = delalloc_end;
3383	ret = true;
3384	} else if (delalloc_start == prev_delalloc_end + `1`) {
3385	/ Subrange adjacent to the previous one, merge them. /
3386	*delalloc_end_ret = delalloc_end;
3387	} else {
3388	/ Subrange not adjacent to the previous one, exit. /
3389	break;
3390	}
3391
3392	prev_delalloc_end = delalloc_end;
3393	cur_offset = delalloc_end + `1`;
3394	cond_resched();
3395	}
3396
3397	return ret;
3398	}
3399
3400	/*
3401	* Check if there's a hole or delalloc range in a range representing a hole (or
3402	* prealloc extent) found in the inode's subvolume btree.
3403	*
3404	* @inode: The inode.
3405	* @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
3406	* @start: Start offset of the hole region. It does not need to be sector
3407	* size aligned.
3408	* @end: End offset (inclusive value) of the hole region. It does not
3409	* need to be sector size aligned.
3410	* @start_ret: Return parameter, used to set the start of the subrange in the
3411	* hole that matches the search criteria (seek mode), if such
3412	* subrange is found (return value of the function is true).
3413	* The value returned here may not be sector size aligned.
3414	*
3415	* Returns true if a subrange matching the given seek mode is found, and if one
3416	* is found, it updates @start_ret with the start of the subrange.
3417	*/
3418	static bool find_desired_extent_in_hole(struct btrfs_inode inode, int* whence,
3419	struct extent_state **cached_state,
3420	u64 start, u64 end, u64 *start_ret)
3421	{
3422	u64 delalloc_start;
3423	u64 delalloc_end;
3424	bool delalloc;
3425
3426	delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3427	delalloc_start_ret: &delalloc_start, delalloc_end_ret: &delalloc_end);
3428	if (delalloc && whence == SEEK_DATA) {
3429	*start_ret = delalloc_start;
3430	return true;
3431	}
3432
3433	if (delalloc && whence == SEEK_HOLE) {
3434	/*
3435	* We found delalloc but it starts after out start offset. So we
3436	* have a hole between our start offset and the delalloc start.
3437	*/
3438	if (start < delalloc_start) {
3439	*start_ret = start;
3440	return true;
3441	}
3442	/*
3443	* Delalloc range starts at our start offset.
3444	* If the delalloc range's length is smaller than our range,
3445	* then it means we have a hole that starts where the delalloc
3446	* subrange ends.
3447	*/
3448	if (delalloc_end < end) {
3449	*start_ret = delalloc_end + `1`;
3450	return true;
3451	}
3452
3453	/ There's delalloc for the whole range. /
3454	return false;
3455	}
3456
3457	if (!delalloc && whence == SEEK_HOLE) {
3458	*start_ret = start;
3459	return true;
3460	}
3461
3462	/*
3463	* No delalloc in the range and we are seeking for data. The caller has
3464	* to iterate to the next extent item in the subvolume btree.
3465	*/
3466	return false;
3467	}
3468
3469	static loff_t find_desired_extent(struct file file, loff_t offset, int* whence)
3470	{
3471	struct btrfs_inode *inode = BTRFS_I(inode: file->f_mapping->host);
3472	struct btrfs_file_private *private = file->private_data;
3473	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3474	struct extent_state *cached_state = NULL;
3475	struct extent_state **delalloc_cached_state;
3476	const loff_t i_size = i_size_read(inode: &inode->vfs_inode);
3477	const u64 ino = btrfs_ino(inode);
3478	struct btrfs_root *root = inode->root;
3479	struct btrfs_path *path;
3480	struct btrfs_key key;
3481	u64 last_extent_end;
3482	u64 lockstart;
3483	u64 lockend;
3484	u64 start;
3485	int ret;
3486	bool found = false;
3487
3488	if (i_size == `0` \|\| offset >= i_size)
3489	return -ENXIO;
3490
3491	/*
3492	* Quick path. If the inode has no prealloc extents and its number of
3493	* bytes used matches its i_size, then it can not have holes.
3494	*/
3495	if (whence == SEEK_HOLE &&
3496	!(inode->flags & BTRFS_INODE_PREALLOC) &&
3497	inode_get_bytes(inode: &inode->vfs_inode) == i_size)
3498	return i_size;
3499
3500	if (!private) {
3501	private = kzalloc(size: sizeof(*private), GFP_KERNEL);
3502	/*
3503	* No worries if memory allocation failed.
3504	* The private structure is used only for speeding up multiple
3505	* lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3506	* so everything will still be correct.
3507	*/
3508	file->private_data = private;
3509	}
3510
3511	if (private)
3512	delalloc_cached_state = &private->llseek_cached_state;
3513	else
3514	delalloc_cached_state = NULL;
3515
3516	/*
3517	* offset can be negative, in this case we start finding DATA/HOLE from
3518	* the very start of the file.
3519	*/
3520	start = max_t(loff_t, `0`, offset);
3521
3522	lockstart = round_down(start, fs_info->sectorsize);
3523	lockend = round_up(i_size, fs_info->sectorsize);
3524	if (lockend <= lockstart)
3525	lockend = lockstart + fs_info->sectorsize;
3526	lockend--;
3527
3528	path = btrfs_alloc_path();
3529	if (!path)
3530	return -ENOMEM;
3531	path->reada = READA_FORWARD;
3532
3533	key.objectid = ino;
3534	key.type = BTRFS_EXTENT_DATA_KEY;
3535	key.offset = start;
3536
3537	last_extent_end = lockstart;
3538
3539	lock_extent(tree: &inode->io_tree, start: lockstart, end: lockend, cached: &cached_state);
3540
3541	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
3542	if (ret < `0`) {
3543	goto out;
3544	} else if (ret > `0` && path->slots[`0`] > `0`) {
3545	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`] - `1`);
3546	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3547	path->slots[`0`]--;
3548	}
3549
3550	while (start < i_size) {
3551	struct extent_buffer *leaf = path->nodes[`0`];
3552	struct btrfs_file_extent_item *extent;
3553	u64 extent_end;
3554	u8 type;
3555
3556	if (path->slots[`0`] >= btrfs_header_nritems(eb: leaf)) {
3557	ret = btrfs_next_leaf(root, path);
3558	if (ret < `0`)
3559	goto out;
3560	else if (ret > `0`)
3561	break;
3562
3563	leaf = path->nodes[`0`];
3564	}
3565
3566	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
3567	if (key.objectid != ino \|\| key.type != BTRFS_EXTENT_DATA_KEY)
3568	break;
3569
3570	extent_end = btrfs_file_extent_end(path);
3571
3572	/*
3573	* In the first iteration we may have a slot that points to an
3574	* extent that ends before our start offset, so skip it.
3575	*/
3576	if (extent_end <= start) {
3577	path->slots[`0`]++;
3578	continue;
3579	}
3580
3581	/ We have an implicit hole, NO_HOLES feature is likely set. /
3582	if (last_extent_end < key.offset) {
3583	u64 search_start = last_extent_end;
3584	u64 found_start;
3585
3586	/*
3587	* First iteration, @start matches @offset and it's
3588	* within the hole.
3589	*/
3590	if (start == offset)
3591	search_start = offset;
3592
3593	found = find_desired_extent_in_hole(inode, whence,
3594	cached_state: delalloc_cached_state,
3595	start: search_start,
3596	end: key.offset - `1`,
3597	start_ret: &found_start);
3598	if (found) {
3599	start = found_start;
3600	break;
3601	}
3602	/*
3603	* Didn't find data or a hole (due to delalloc) in the
3604	* implicit hole range, so need to analyze the extent.
3605	*/
3606	}
3607
3608	extent = btrfs_item_ptr(leaf, path->slots[`0`],
3609	struct btrfs_file_extent_item);
3610	type = btrfs_file_extent_type(eb: leaf, s: extent);
3611
3612	/*
3613	* Can't access the extent's disk_bytenr field if this is an
3614	* inline extent, since at that offset, it's where the extent
3615	* data starts.
3616	*/
3617	if (type == BTRFS_FILE_EXTENT_PREALLOC \|\|
3618	(type == BTRFS_FILE_EXTENT_REG &&
3619	btrfs_file_extent_disk_bytenr(eb: leaf, s: extent) == `0`)) {
3620	/*
3621	* Explicit hole or prealloc extent, search for delalloc.
3622	* A prealloc extent is treated like a hole.
3623	*/
3624	u64 search_start = key.offset;
3625	u64 found_start;
3626
3627	/*
3628	* First iteration, @start matches @offset and it's
3629	* within the hole.
3630	*/
3631	if (start == offset)
3632	search_start = offset;
3633
3634	found = find_desired_extent_in_hole(inode, whence,
3635	cached_state: delalloc_cached_state,
3636	start: search_start,
3637	end: extent_end - `1`,
3638	start_ret: &found_start);
3639	if (found) {
3640	start = found_start;
3641	break;
3642	}
3643	/*
3644	* Didn't find data or a hole (due to delalloc) in the
3645	* implicit hole range, so need to analyze the next
3646	* extent item.
3647	*/
3648	} else {
3649	/*
3650	* Found a regular or inline extent.
3651	* If we are seeking for data, adjust the start offset
3652	* and stop, we're done.
3653	*/
3654	if (whence == SEEK_DATA) {
3655	start = max_t(u64, key.offset, offset);
3656	found = true;
3657	break;
3658	}
3659	/*
3660	* Else, we are seeking for a hole, check the next file
3661	* extent item.
3662	*/
3663	}
3664
3665	start = extent_end;
3666	last_extent_end = extent_end;
3667	path->slots[`0`]++;
3668	if (fatal_signal_pending(current)) {
3669	ret = -EINTR;
3670	goto out;
3671	}
3672	cond_resched();
3673	}
3674
3675	/ We have an implicit hole from the last extent found up to i_size. /
3676	if (!found && start < i_size) {
3677	found = find_desired_extent_in_hole(inode, whence,
3678	cached_state: delalloc_cached_state, start,
3679	end: i_size - `1`, start_ret: &start);
3680	if (!found)
3681	start = i_size;
3682	}
3683
3684	out:
3685	unlock_extent(tree: &inode->io_tree, start: lockstart, end: lockend, cached: &cached_state);
3686	btrfs_free_path(p: path);
3687
3688	if (ret < `0`)
3689	return ret;
3690
3691	if (whence == SEEK_DATA && start >= i_size)
3692	return -ENXIO;
3693
3694	return min_t(loff_t, start, i_size);
3695	}
3696
3697	static loff_t btrfs_file_llseek(struct file file, loff_t offset, int* whence)
3698	{
3699	struct inode *inode = file->f_mapping->host;
3700
3701	switch (whence) {
3702	default:
3703	return generic_file_llseek(file, offset, whence);
3704	case SEEK_DATA:
3705	case SEEK_HOLE:
3706	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_SHARED);
3707	offset = find_desired_extent(file, offset, whence);
3708	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_SHARED);
3709	break;
3710	}
3711
3712	if (offset < `0`)
3713	return offset;
3714
3715	return vfs_setpos(file, offset, maxsize: inode->i_sb->s_maxbytes);
3716	}
3717
3718	static int btrfs_file_open(struct inode inode, struct* file *filp)
3719	{
3720	int ret;
3721
3722	filp->f_mode \|= FMODE_NOWAIT \| FMODE_BUF_RASYNC \| FMODE_BUF_WASYNC \|
3723	FMODE_CAN_ODIRECT;
3724
3725	ret = fsverity_file_open(inode, filp);
3726	if (ret)
3727	return ret;
3728	return generic_file_open(inode, filp);
3729	}
3730
3731	static int check_direct_read(struct btrfs_fs_info *fs_info,
3732	const struct iov_iter *iter, loff_t offset)
3733	{
3734	int ret;
3735	int i, seg;
3736
3737	ret = check_direct_IO(fs_info, iter, offset);
3738	if (ret < `0`)
3739	return ret;
3740
3741	if (!iter_is_iovec(i: iter))
3742	return `0`;
3743
3744	for (seg = `0`; seg < iter->nr_segs; seg++) {
3745	for (i = seg + `1`; i < iter->nr_segs; i++) {
3746	const struct iovec *iov1 = iter_iov(iter) + seg;
3747	const struct iovec *iov2 = iter_iov(iter) + i;
3748
3749	if (iov1->iov_base == iov2->iov_base)
3750	return -EINVAL;
3751	}
3752	}
3753	return `0`;
3754	}
3755
3756	static ssize_t btrfs_direct_read(struct kiocb iocb, struct* iov_iter *to)
3757	{
3758	struct inode *inode = file_inode(f: iocb->ki_filp);
3759	size_t prev_left = `0`;
3760	ssize_t read = `0`;
3761	ssize_t ret;
3762
3763	if (fsverity_active(inode))
3764	return `0`;
3765
3766	if (check_direct_read(inode_to_fs_info(inode), iter: to, offset: iocb->ki_pos))
3767	return `0`;
3768
3769	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_SHARED);
3770	again:
3771	/*
3772	* This is similar to what we do for direct IO writes, see the comment
3773	* at btrfs_direct_write(), but we also disable page faults in addition
3774	* to disabling them only at the iov_iter level. This is because when
3775	* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3776	* which can still trigger page fault ins despite having set ->nofault
3777	* to true of our 'to' iov_iter.
3778	*
3779	* The difference to direct IO writes is that we deadlock when trying
3780	* to lock the extent range in the inode's tree during he page reads
3781	* triggered by the fault in (while for writes it is due to waiting for
3782	* our own ordered extent). This is because for direct IO reads,
3783	* btrfs_dio_iomap_begin() returns with the extent range locked, which
3784	* is only unlocked in the endio callback (end_bio_extent_readpage()).
3785	*/
3786	pagefault_disable();
3787	to->nofault = true;
3788	ret = btrfs_dio_read(iocb, iter: to, done_before: read);
3789	to->nofault = false;
3790	pagefault_enable();
3791
3792	/ No increment (+=) because iomap returns a cumulative value. /
3793	if (ret > `0`)
3794	read = ret;
3795
3796	if (iov_iter_count(i: to) > `0` && (ret == -EFAULT \|\| ret > `0`)) {
3797	const size_t left = iov_iter_count(i: to);
3798
3799	if (left == prev_left) {
3800	/*
3801	* We didn't make any progress since the last attempt,
3802	* fallback to a buffered read for the remainder of the
3803	* range. This is just to avoid any possibility of looping
3804	* for too long.
3805	*/
3806	ret = read;
3807	} else {
3808	/*
3809	* We made some progress since the last retry or this is
3810	* the first time we are retrying. Fault in as many pages
3811	* as possible and retry.
3812	*/
3813	fault_in_iov_iter_writeable(i: to, bytes: left);
3814	prev_left = left;
3815	goto again;
3816	}
3817	}
3818	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_SHARED);
3819	return ret < `0` ? ret : read;
3820	}
3821
3822	static ssize_t btrfs_file_read_iter(struct kiocb iocb, struct* iov_iter *to)
3823	{
3824	ssize_t ret = `0`;
3825
3826	if (iocb->ki_flags & IOCB_DIRECT) {
3827	ret = btrfs_direct_read(iocb, to);
3828	if (ret < `0` \|\| !iov_iter_count(i: to) \|\|
3829	iocb->ki_pos >= i_size_read(inode: file_inode(f: iocb->ki_filp)))
3830	return ret;
3831	}
3832
3833	return filemap_read(iocb, to, already_read: ret);
3834	}
3835
3836	const struct file_operations btrfs_file_operations = {
3837	.llseek = btrfs_file_llseek,
3838	.read_iter = btrfs_file_read_iter,
3839	.splice_read = filemap_splice_read,
3840	.write_iter = btrfs_file_write_iter,
3841	.splice_write = iter_file_splice_write,
3842	.mmap = btrfs_file_mmap,
3843	.open = btrfs_file_open,
3844	.release = btrfs_release_file,
3845	.get_unmapped_area = thp_get_unmapped_area,
3846	.fsync = btrfs_sync_file,
3847	.fallocate = btrfs_fallocate,
3848	.unlocked_ioctl = btrfs_ioctl,
3849	#ifdef CONFIG_COMPAT
3850	.compat_ioctl = btrfs_compat_ioctl,
3851	#endif
3852	.remap_file_range = btrfs_remap_file_range,
3853	};
3854
3855	int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3856	{
3857	int ret;
3858
3859	/*
3860	* So with compression we will find and lock a dirty page and clear the
3861	* first one as dirty, setup an async extent, and immediately return
3862	* with the entire range locked but with nobody actually marked with
3863	* writeback. So we can't just filemap_write_and_wait_range() and
3864	* expect it to work since it will just kick off a thread to do the
3865	* actual work. So we need to call filemap_fdatawrite_range _again_
3866	* since it will wait on the page lock, which won't be unlocked until
3867	* after the pages have been marked as writeback and so we're good to go
3868	* from there. We have to do this otherwise we'll miss the ordered
3869	* extents and that results in badness. Please Josef, do not think you
3870	* know better and pull this out at some point in the future, it is
3871	* right and you are wrong.
3872	*/
3873	ret = filemap_fdatawrite_range(mapping: inode->i_mapping, start, end);
3874	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3875	&BTRFS_I(inode)->runtime_flags))
3876	ret = filemap_fdatawrite_range(mapping: inode->i_mapping, start, end);
3877
3878	return ret;
3879	}
3880

/* Source code of linux/fs/btrfs/file.c */