defrag.c source code [linux/fs/btrfs/defrag.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2007 Oracle. All rights reserved.
4	*/
5
6	#include <linux/sched.h>
7	#include "ctree.h"
8	#include "disk-io.h"
9	#include "transaction.h"
10	#include "locking.h"
11	#include "accessors.h"
12	#include "messages.h"
13	#include "delalloc-space.h"
14	#include "subpage.h"
15	#include "defrag.h"
16	#include "file-item.h"
17	#include "super.h"
18
19	static struct kmem_cache *btrfs_inode_defrag_cachep;
20
21	/*
22	* When auto defrag is enabled we queue up these defrag structs to remember
23	* which inodes need defragging passes.
24	*/
25	struct inode_defrag {
26	struct rb_node rb_node;
27	/ Inode number /
28	u64 ino;
29	/*
30	* Transid where the defrag was added, we search for extents newer than
31	* this.
32	*/
33	u64 transid;
34
35	/ Root objectid /
36	u64 root;
37
38	/*
39	* The extent size threshold for autodefrag.
40	*
41	* This value is different for compressed/non-compressed extents, thus
42	* needs to be passed from higher layer.
43	* (aka, inode_should_defrag())
44	*/
45	u32 extent_thresh;
46	};
47
48	static int __compare_inode_defrag(struct inode_defrag *defrag1,
49	struct inode_defrag *defrag2)
50	{
51	if (defrag1->root > defrag2->root)
52	return `1`;
53	else if (defrag1->root < defrag2->root)
54	return -`1`;
55	else if (defrag1->ino > defrag2->ino)
56	return `1`;
57	else if (defrag1->ino < defrag2->ino)
58	return -`1`;
59	else
60	return `0`;
61	}
62
63	/*
64	* Pop a record for an inode into the defrag tree. The lock must be held
65	* already.
66	*
67	* If you're inserting a record for an older transid than an existing record,
68	* the transid already in the tree is lowered.
69	*
70	* If an existing record is found the defrag item you pass in is freed.
71	*/
72	static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
73	struct inode_defrag *defrag)
74	{
75	struct btrfs_fs_info *fs_info = inode->root->fs_info;
76	struct inode_defrag *entry;
77	struct rb_node **p;
78	struct rb_node *parent = NULL;
79	int ret;
80
81	p = &fs_info->defrag_inodes.rb_node;
82	while (*p) {
83	parent = *p;
84	entry = rb_entry(parent, struct inode_defrag, rb_node);
85
86	ret = __compare_inode_defrag(defrag1: defrag, defrag2: entry);
87	if (ret < `0`)
88	p = &parent->rb_left;
89	else if (ret > `0`)
90	p = &parent->rb_right;
91	else {
92	/*
93	* If we're reinserting an entry for an old defrag run,
94	* make sure to lower the transid of our existing
95	* record.
96	*/
97	if (defrag->transid < entry->transid)
98	entry->transid = defrag->transid;
99	entry->extent_thresh = min(defrag->extent_thresh,
100	entry->extent_thresh);
101	return -EEXIST;
102	}
103	}
104	set_bit(nr: BTRFS_INODE_IN_DEFRAG, addr: &inode->runtime_flags);
105	rb_link_node(node: &defrag->rb_node, parent, rb_link: p);
106	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
107	return `0`;
108	}
109
110	static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
111	{
112	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
113	return `0`;
114
115	if (btrfs_fs_closing(fs_info))
116	return `0`;
117
118	return `1`;
119	}
120
121	/*
122	* Insert a defrag record for this inode if auto defrag is enabled.
123	*/
124	int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
125	struct btrfs_inode *inode, u32 extent_thresh)
126	{
127	struct btrfs_root *root = inode->root;
128	struct btrfs_fs_info *fs_info = root->fs_info;
129	struct inode_defrag *defrag;
130	u64 transid;
131	int ret;
132
133	if (!__need_auto_defrag(fs_info))
134	return `0`;
135
136	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
137	return `0`;
138
139	if (trans)
140	transid = trans->transid;
141	else
142	transid = inode->root->last_trans;
143
144	defrag = kmem_cache_zalloc(k: btrfs_inode_defrag_cachep, GFP_NOFS);
145	if (!defrag)
146	return -ENOMEM;
147
148	defrag->ino = btrfs_ino(inode);
149	defrag->transid = transid;
150	defrag->root = root->root_key.objectid;
151	defrag->extent_thresh = extent_thresh;
152
153	spin_lock(lock: &fs_info->defrag_inodes_lock);
154	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
155	/*
156	* If we set IN_DEFRAG flag and evict the inode from memory,
157	* and then re-read this inode, this new inode doesn't have
158	* IN_DEFRAG flag. At the case, we may find the existed defrag.
159	*/
160	ret = __btrfs_add_inode_defrag(inode, defrag);
161	if (ret)
162	kmem_cache_free(s: btrfs_inode_defrag_cachep, objp: defrag);
163	} else {
164	kmem_cache_free(s: btrfs_inode_defrag_cachep, objp: defrag);
165	}
166	spin_unlock(lock: &fs_info->defrag_inodes_lock);
167	return `0`;
168	}
169
170	/*
171	* Pick the defragable inode that we want, if it doesn't exist, we will get the
172	* next one.
173	*/
174	static struct inode_defrag *btrfs_pick_defrag_inode(
175	struct btrfs_fs_info *fs_info, u64 root, u64 ino)
176	{
177	struct inode_defrag *entry = NULL;
178	struct inode_defrag tmp;
179	struct rb_node *p;
180	struct rb_node *parent = NULL;
181	int ret;
182
183	tmp.ino = ino;
184	tmp.root = root;
185
186	spin_lock(lock: &fs_info->defrag_inodes_lock);
187	p = fs_info->defrag_inodes.rb_node;
188	while (p) {
189	parent = p;
190	entry = rb_entry(parent, struct inode_defrag, rb_node);
191
192	ret = __compare_inode_defrag(defrag1: &tmp, defrag2: entry);
193	if (ret < `0`)
194	p = parent->rb_left;
195	else if (ret > `0`)
196	p = parent->rb_right;
197	else
198	goto out;
199	}
200
201	if (parent && __compare_inode_defrag(defrag1: &tmp, defrag2: entry) > `0`) {
202	parent = rb_next(parent);
203	if (parent)
204	entry = rb_entry(parent, struct inode_defrag, rb_node);
205	else
206	entry = NULL;
207	}
208	out:
209	if (entry)
210	rb_erase(parent, &fs_info->defrag_inodes);
211	spin_unlock(lock: &fs_info->defrag_inodes_lock);
212	return entry;
213	}
214
215	void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
216	{
217	struct inode_defrag *defrag;
218	struct rb_node *node;
219
220	spin_lock(lock: &fs_info->defrag_inodes_lock);
221	node = rb_first(&fs_info->defrag_inodes);
222	while (node) {
223	rb_erase(node, &fs_info->defrag_inodes);
224	defrag = rb_entry(node, struct inode_defrag, rb_node);
225	kmem_cache_free(s: btrfs_inode_defrag_cachep, objp: defrag);
226
227	cond_resched_lock(&fs_info->defrag_inodes_lock);
228
229	node = rb_first(&fs_info->defrag_inodes);
230	}
231	spin_unlock(lock: &fs_info->defrag_inodes_lock);
232	}
233
234	#define BTRFS_DEFRAG_BATCH 1024
235
236	static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
237	struct inode_defrag *defrag)
238	{
239	struct btrfs_root *inode_root;
240	struct inode *inode;
241	struct btrfs_ioctl_defrag_range_args range;
242	int ret = `0`;
243	u64 cur = `0`;
244
245	again:
246	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
247	goto cleanup;
248	if (!__need_auto_defrag(fs_info))
249	goto cleanup;
250
251	/ Get the inode /
252	inode_root = btrfs_get_fs_root(fs_info, objectid: defrag->root, check_ref: true);
253	if (IS_ERR(ptr: inode_root)) {
254	ret = PTR_ERR(ptr: inode_root);
255	goto cleanup;
256	}
257
258	inode = btrfs_iget(s: fs_info->sb, ino: defrag->ino, root: inode_root);
259	btrfs_put_root(root: inode_root);
260	if (IS_ERR(ptr: inode)) {
261	ret = PTR_ERR(ptr: inode);
262	goto cleanup;
263	}
264
265	if (cur >= i_size_read(inode)) {
266	iput(inode);
267	goto cleanup;
268	}
269
270	/ Do a chunk of defrag /
271	clear_bit(nr: BTRFS_INODE_IN_DEFRAG, addr: &BTRFS_I(inode)->runtime_flags);
272	memset(&range, `0`, sizeof(range));
273	range.len = (u64)-`1`;
274	range.start = cur;
275	range.extent_thresh = defrag->extent_thresh;
276
277	sb_start_write(sb: fs_info->sb);
278	ret = btrfs_defrag_file(inode, NULL, range: &range, newer_than: defrag->transid,
279	BTRFS_DEFRAG_BATCH);
280	sb_end_write(sb: fs_info->sb);
281	iput(inode);
282
283	if (ret < `0`)
284	goto cleanup;
285
286	cur = max(cur + fs_info->sectorsize, range.start);
287	goto again;
288
289	cleanup:
290	kmem_cache_free(s: btrfs_inode_defrag_cachep, objp: defrag);
291	return ret;
292	}
293
294	/*
295	* Run through the list of inodes in the FS that need defragging.
296	*/
297	int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
298	{
299	struct inode_defrag *defrag;
300	u64 first_ino = `0`;
301	u64 root_objectid = `0`;
302
303	atomic_inc(v: &fs_info->defrag_running);
304	while (`1`) {
305	/ Pause the auto defragger. /
306	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
307	break;
308
309	if (!__need_auto_defrag(fs_info))
310	break;
311
312	/ find an inode to defrag /
313	defrag = btrfs_pick_defrag_inode(fs_info, root: root_objectid, ino: first_ino);
314	if (!defrag) {
315	if (root_objectid \|\| first_ino) {
316	root_objectid = `0`;
317	first_ino = `0`;
318	continue;
319	} else {
320	break;
321	}
322	}
323
324	first_ino = defrag->ino + `1`;
325	root_objectid = defrag->root;
326
327	__btrfs_run_defrag_inode(fs_info, defrag);
328	}
329	atomic_dec(v: &fs_info->defrag_running);
330
331	/*
332	* During unmount, we use the transaction_wait queue to wait for the
333	* defragger to stop.
334	*/
335	wake_up(&fs_info->transaction_wait);
336	return `0`;
337	}
338
339	/*
340	* Check if two blocks addresses are close, used by defrag.
341	*/
342	static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
343	{
344	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
345	return true;
346	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
347	return true;
348	return false;
349	}
350
351	/*
352	* Go through all the leaves pointed to by a node and reallocate them so that
353	* disk order is close to key order.
354	*/
355	static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
356	struct btrfs_root *root,
357	struct extent_buffer *parent,
358	int start_slot, u64 *last_ret,
359	struct btrfs_key *progress)
360	{
361	struct btrfs_fs_info *fs_info = root->fs_info;
362	const u32 blocksize = fs_info->nodesize;
363	const int end_slot = btrfs_header_nritems(eb: parent) - `1`;
364	u64 search_start = *last_ret;
365	u64 last_block = `0`;
366	int ret = `0`;
367	bool progress_passed = false;
368
369	/*
370	* COWing must happen through a running transaction, which always
371	* matches the current fs generation (it's a transaction with a state
372	* less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
373	* into error state to prevent the commit of any transaction.
374	*/
375	if (unlikely(trans->transaction != fs_info->running_transaction \|\|
376	trans->transid != fs_info->generation)) {
377	btrfs_abort_transaction(trans, -EUCLEAN);
378	btrfs_crit(fs_info,
379	"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
380	parent->start, btrfs_root_id(root), trans->transid,
381	fs_info->running_transaction->transid,
382	fs_info->generation);
383	return -EUCLEAN;
384	}
385
386	if (btrfs_header_nritems(eb: parent) <= `1`)
387	return `0`;
388
389	for (int i = start_slot; i <= end_slot; i++) {
390	struct extent_buffer *cur;
391	struct btrfs_disk_key disk_key;
392	u64 blocknr;
393	u64 other;
394	bool close = true;
395
396	btrfs_node_key(eb: parent, disk_key: &disk_key, nr: i);
397	if (!progress_passed && btrfs_comp_keys(disk_key: &disk_key, k2: progress) < `0`)
398	continue;
399
400	progress_passed = true;
401	blocknr = btrfs_node_blockptr(eb: parent, nr: i);
402	if (last_block == `0`)
403	last_block = blocknr;
404
405	if (i > `0`) {
406	other = btrfs_node_blockptr(eb: parent, nr: i - `1`);
407	close = close_blocks(blocknr, other, blocksize);
408	}
409	if (!close && i < end_slot) {
410	other = btrfs_node_blockptr(eb: parent, nr: i + `1`);
411	close = close_blocks(blocknr, other, blocksize);
412	}
413	if (close) {
414	last_block = blocknr;
415	continue;
416	}
417
418	cur = btrfs_read_node_slot(parent, slot: i);
419	if (IS_ERR(ptr: cur))
420	return PTR_ERR(ptr: cur);
421	if (search_start == `0`)
422	search_start = last_block;
423
424	btrfs_tree_lock(eb: cur);
425	ret = btrfs_force_cow_block(trans, root, buf: cur, parent, parent_slot: i,
426	cow_ret: &cur, search_start,
427	min(`16` * blocksize,
428	(end_slot - i) * blocksize),
429	nest: BTRFS_NESTING_COW);
430	if (ret) {
431	btrfs_tree_unlock(eb: cur);
432	free_extent_buffer(eb: cur);
433	break;
434	}
435	search_start = cur->start;
436	last_block = cur->start;
437	*last_ret = search_start;
438	btrfs_tree_unlock(eb: cur);
439	free_extent_buffer(eb: cur);
440	}
441	return ret;
442	}
443
444	/*
445	* Defrag all the leaves in a given btree.
446	* Read all the leaves and try to get key order to
447	* better reflect disk order
448	*/
449
450	static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
451	struct btrfs_root *root)
452	{
453	struct btrfs_path *path = NULL;
454	struct btrfs_key key;
455	int ret = `0`;
456	int wret;
457	int level;
458	int next_key_ret = `0`;
459	u64 last_ret = `0`;
460
461	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
462	goto out;
463
464	path = btrfs_alloc_path();
465	if (!path) {
466	ret = -ENOMEM;
467	goto out;
468	}
469
470	level = btrfs_header_level(eb: root->node);
471
472	if (level == `0`)
473	goto out;
474
475	if (root->defrag_progress.objectid == `0`) {
476	struct extent_buffer *root_node;
477	u32 nritems;
478
479	root_node = btrfs_lock_root_node(root);
480	nritems = btrfs_header_nritems(eb: root_node);
481	root->defrag_max.objectid = `0`;
482	/ from above we know this is not a leaf /
483	btrfs_node_key_to_cpu(eb: root_node, cpu_key: &root->defrag_max,
484	nr: nritems - `1`);
485	btrfs_tree_unlock(eb: root_node);
486	free_extent_buffer(eb: root_node);
487	memset(&key, `0`, sizeof(key));
488	} else {
489	memcpy(&key, &root->defrag_progress, sizeof(key));
490	}
491
492	path->keep_locks = `1`;
493
494	ret = btrfs_search_forward(root, min_key: &key, path, BTRFS_OLDEST_GENERATION);
495	if (ret < `0`)
496	goto out;
497	if (ret > `0`) {
498	ret = `0`;
499	goto out;
500	}
501	btrfs_release_path(p: path);
502	/*
503	* We don't need a lock on a leaf. btrfs_realloc_node() will lock all
504	* leafs from path->nodes[1], so set lowest_level to 1 to avoid later
505	* a deadlock (attempting to write lock an already write locked leaf).
506	*/
507	path->lowest_level = `1`;
508	wret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: `0`, cow: `1`);
509
510	if (wret < `0`) {
511	ret = wret;
512	goto out;
513	}
514	if (!path->nodes[`1`]) {
515	ret = `0`;
516	goto out;
517	}
518	/*
519	* The node at level 1 must always be locked when our path has
520	* keep_locks set and lowest_level is 1, regardless of the value of
521	* path->slots[1].
522	*/
523	ASSERT(path->locks[`1`] != `0`);
524	ret = btrfs_realloc_node(trans, root,
525	parent: path->nodes[`1`], start_slot: `0`,
526	last_ret: &last_ret,
527	progress: &root->defrag_progress);
528	if (ret) {
529	WARN_ON(ret == -EAGAIN);
530	goto out;
531	}
532	/*
533	* Now that we reallocated the node we can find the next key. Note that
534	* btrfs_find_next_key() can release our path and do another search
535	* without COWing, this is because even with path->keep_locks = 1,
536	* btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
537	* node when path->slots[node_level - 1] does not point to the last
538	* item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
539	* we search for the next key after reallocating our node.
540	*/
541	path->slots[`1`] = btrfs_header_nritems(eb: path->nodes[`1`]);
542	next_key_ret = btrfs_find_next_key(root, path, key: &key, lowest_level: `1`,
543	BTRFS_OLDEST_GENERATION);
544	if (next_key_ret == `0`) {
545	memcpy(&root->defrag_progress, &key, sizeof(key));
546	ret = -EAGAIN;
547	}
548	out:
549	btrfs_free_path(p: path);
550	if (ret == -EAGAIN) {
551	if (root->defrag_max.objectid > root->defrag_progress.objectid)
552	goto done;
553	if (root->defrag_max.type > root->defrag_progress.type)
554	goto done;
555	if (root->defrag_max.offset > root->defrag_progress.offset)
556	goto done;
557	ret = `0`;
558	}
559	done:
560	if (ret != -EAGAIN)
561	memset(&root->defrag_progress, `0`,
562	sizeof(root->defrag_progress));
563
564	return ret;
565	}
566
567	/*
568	* Defrag a given btree. Every leaf in the btree is read and defragmented.
569	*/
570	int btrfs_defrag_root(struct btrfs_root *root)
571	{
572	struct btrfs_fs_info *fs_info = root->fs_info;
573	int ret;
574
575	if (test_and_set_bit(nr: BTRFS_ROOT_DEFRAG_RUNNING, addr: &root->state))
576	return `0`;
577
578	while (`1`) {
579	struct btrfs_trans_handle *trans;
580
581	trans = btrfs_start_transaction(root, num_items: `0`);
582	if (IS_ERR(ptr: trans)) {
583	ret = PTR_ERR(ptr: trans);
584	break;
585	}
586
587	ret = btrfs_defrag_leaves(trans, root);
588
589	btrfs_end_transaction(trans);
590	btrfs_btree_balance_dirty(fs_info);
591	cond_resched();
592
593	if (btrfs_fs_closing(fs_info) \|\| ret != -EAGAIN)
594	break;
595
596	if (btrfs_defrag_cancelled(fs_info)) {
597	btrfs_debug(fs_info, "defrag_root cancelled");
598	ret = -EAGAIN;
599	break;
600	}
601	}
602	clear_bit(nr: BTRFS_ROOT_DEFRAG_RUNNING, addr: &root->state);
603	return ret;
604	}
605
606	/*
607	* Defrag specific helper to get an extent map.
608	*
609	* Differences between this and btrfs_get_extent() are:
610	*
611	* - No extent_map will be added to inode->extent_tree
612	* To reduce memory usage in the long run.
613	*
614	* - Extra optimization to skip file extents older than @newer_than
615	* By using btrfs_search_forward() we can skip entire file ranges that
616	* have extents created in past transactions, because btrfs_search_forward()
617	* will not visit leaves and nodes with a generation smaller than given
618	* minimal generation threshold (@newer_than).
619	*
620	* Return valid em if we find a file extent matching the requirement.
621	* Return NULL if we can not find a file extent matching the requirement.
622	*
623	* Return ERR_PTR() for error.
624	*/
625	static struct extent_map defrag_get_extent(struct* btrfs_inode *inode,
626	u64 start, u64 newer_than)
627	{
628	struct btrfs_root *root = inode->root;
629	struct btrfs_file_extent_item *fi;
630	struct btrfs_path path = { `0` };
631	struct extent_map *em;
632	struct btrfs_key key;
633	u64 ino = btrfs_ino(inode);
634	int ret;
635
636	em = alloc_extent_map();
637	if (!em) {
638	ret = -ENOMEM;
639	goto err;
640	}
641
642	key.objectid = ino;
643	key.type = BTRFS_EXTENT_DATA_KEY;
644	key.offset = start;
645
646	if (newer_than) {
647	ret = btrfs_search_forward(root, min_key: &key, path: &path, min_trans: newer_than);
648	if (ret < `0`)
649	goto err;
650	/ Can't find anything newer /
651	if (ret > `0`)
652	goto not_found;
653	} else {
654	ret = btrfs_search_slot(NULL, root, key: &key, p: &path, ins_len: `0`, cow: `0`);
655	if (ret < `0`)
656	goto err;
657	}
658	if (path.slots[`0`] >= btrfs_header_nritems(eb: path.nodes[`0`])) {
659	/*
660	* If btrfs_search_slot() makes path to point beyond nritems,
661	* we should not have an empty leaf, as this inode must at
662	* least have its INODE_ITEM.
663	*/
664	ASSERT(btrfs_header_nritems(path.nodes[`0`]));
665	path.slots[`0`] = btrfs_header_nritems(eb: path.nodes[`0`]) - `1`;
666	}
667	btrfs_item_key_to_cpu(eb: path.nodes[`0`], cpu_key: &key, nr: path.slots[`0`]);
668	/ Perfect match, no need to go one slot back /
669	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
670	key.offset == start)
671	goto iterate;
672
673	/ We didn't find a perfect match, needs to go one slot back /
674	if (path.slots[`0`] > `0`) {
675	btrfs_item_key_to_cpu(eb: path.nodes[`0`], cpu_key: &key, nr: path.slots[`0`]);
676	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
677	path.slots[`0`]--;
678	}
679
680	iterate:
681	/ Iterate through the path to find a file extent covering @start /
682	while (true) {
683	u64 extent_end;
684
685	if (path.slots[`0`] >= btrfs_header_nritems(eb: path.nodes[`0`]))
686	goto next;
687
688	btrfs_item_key_to_cpu(eb: path.nodes[`0`], cpu_key: &key, nr: path.slots[`0`]);
689
690	/*
691	* We may go one slot back to INODE_REF/XATTR item, then
692	* need to go forward until we reach an EXTENT_DATA.
693	* But we should still has the correct ino as key.objectid.
694	*/
695	if (WARN_ON(key.objectid < ino) \|\| key.type < BTRFS_EXTENT_DATA_KEY)
696	goto next;
697
698	/ It's beyond our target range, definitely not extent found /
699	if (key.objectid > ino \|\| key.type > BTRFS_EXTENT_DATA_KEY)
700	goto not_found;
701
702	/*
703	* \| \|<- File extent ->\|
704	* \- start
705	*
706	* This means there is a hole between start and key.offset.
707	*/
708	if (key.offset > start) {
709	em->start = start;
710	em->orig_start = start;
711	em->block_start = EXTENT_MAP_HOLE;
712	em->len = key.offset - start;
713	break;
714	}
715
716	fi = btrfs_item_ptr(path.nodes[`0`], path.slots[`0`],
717	struct btrfs_file_extent_item);
718	extent_end = btrfs_file_extent_end(path: &path);
719
720	/*
721	* \|<- file extent ->\| \|
722	* \- start
723	*
724	* We haven't reached start, search next slot.
725	*/
726	if (extent_end <= start)
727	goto next;
728
729	/ Now this extent covers @start, convert it to em /
730	btrfs_extent_item_to_extent_map(inode, path: &path, fi, em);
731	break;
732	next:
733	ret = btrfs_next_item(root, p: &path);
734	if (ret < `0`)
735	goto err;
736	if (ret > `0`)
737	goto not_found;
738	}
739	btrfs_release_path(p: &path);
740	return em;
741
742	not_found:
743	btrfs_release_path(p: &path);
744	free_extent_map(em);
745	return NULL;
746
747	err:
748	btrfs_release_path(p: &path);
749	free_extent_map(em);
750	return ERR_PTR(error: ret);
751	}
752
753	static struct extent_map defrag_lookup_extent(struct* inode *inode, u64 start,
754	u64 newer_than, bool locked)
755	{
756	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
757	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
758	struct extent_map *em;
759	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
760
761	/*
762	* Hopefully we have this extent in the tree already, try without the
763	* full extent lock.
764	*/
765	read_lock(&em_tree->lock);
766	em = lookup_extent_mapping(tree: em_tree, start, len: sectorsize);
767	read_unlock(&em_tree->lock);
768
769	/*
770	* We can get a merged extent, in that case, we need to re-search
771	* tree to get the original em for defrag.
772	*
773	* If @newer_than is 0 or em::generation < newer_than, we can trust
774	* this em, as either we don't care about the generation, or the
775	* merged extent map will be rejected anyway.
776	*/
777	if (em && (em->flags & EXTENT_FLAG_MERGED) &&
778	newer_than && em->generation >= newer_than) {
779	free_extent_map(em);
780	em = NULL;
781	}
782
783	if (!em) {
784	struct extent_state *cached = NULL;
785	u64 end = start + sectorsize - `1`;
786
787	/ Get the big lock and read metadata off disk. /
788	if (!locked)
789	lock_extent(tree: io_tree, start, end, cached: &cached);
790	em = defrag_get_extent(inode: BTRFS_I(inode), start, newer_than);
791	if (!locked)
792	unlock_extent(tree: io_tree, start, end, cached: &cached);
793
794	if (IS_ERR(ptr: em))
795	return NULL;
796	}
797
798	return em;
799	}
800
801	static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
802	const struct extent_map *em)
803	{
804	if (extent_map_is_compressed(em))
805	return BTRFS_MAX_COMPRESSED;
806	return fs_info->max_extent_size;
807	}
808
809	static bool defrag_check_next_extent(struct inode inode, struct* extent_map *em,
810	u32 extent_thresh, u64 newer_than, bool locked)
811	{
812	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
813	struct extent_map *next;
814	bool ret = false;
815
816	/ This is the last extent /
817	if (em->start + em->len >= i_size_read(inode))
818	return false;
819
820	/*
821	* Here we need to pass @newer_then when checking the next extent, or
822	* we will hit a case we mark current extent for defrag, but the next
823	* one will not be a target.
824	* This will just cause extra IO without really reducing the fragments.
825	*/
826	next = defrag_lookup_extent(inode, start: em->start + em->len, newer_than, locked);
827	/ No more em or hole /
828	if (!next \|\| next->block_start >= EXTENT_MAP_LAST_BYTE)
829	goto out;
830	if (next->flags & EXTENT_FLAG_PREALLOC)
831	goto out;
832	/*
833	* If the next extent is at its max capacity, defragging current extent
834	* makes no sense, as the total number of extents won't change.
835	*/
836	if (next->len >= get_extent_max_capacity(fs_info, em))
837	goto out;
838	/ Skip older extent /
839	if (next->generation < newer_than)
840	goto out;
841	/ Also check extent size /
842	if (next->len >= extent_thresh)
843	goto out;
844
845	ret = true;
846	out:
847	free_extent_map(em: next);
848	return ret;
849	}
850
851	/*
852	* Prepare one page to be defragged.
853	*
854	* This will ensure:
855	*
856	* - Returned page is locked and has been set up properly.
857	* - No ordered extent exists in the page.
858	* - The page is uptodate.
859	*
860	* NOTE: Caller should also wait for page writeback after the cluster is
861	* prepared, here we don't do writeback wait for each page.
862	*/
863	static struct folio defrag_prepare_one_folio(struct* btrfs_inode *inode, pgoff_t index)
864	{
865	struct address_space *mapping = inode->vfs_inode.i_mapping;
866	gfp_t mask = btrfs_alloc_write_mask(mapping);
867	u64 page_start = (u64)index << PAGE_SHIFT;
868	u64 page_end = page_start + PAGE_SIZE - `1`;
869	struct extent_state *cached_state = NULL;
870	struct folio *folio;
871	int ret;
872
873	again:
874	folio = __filemap_get_folio(mapping, index,
875	FGP_LOCK \| FGP_ACCESSED \| FGP_CREAT, gfp: mask);
876	if (IS_ERR(ptr: folio))
877	return folio;
878
879	/*
880	* Since we can defragment files opened read-only, we can encounter
881	* transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
882	* can't do I/O using huge pages yet, so return an error for now.
883	* Filesystem transparent huge pages are typically only used for
884	* executables that explicitly enable them, so this isn't very
885	* restrictive.
886	*/
887	if (folio_test_large(folio)) {
888	folio_unlock(folio);
889	folio_put(folio);
890	return ERR_PTR(error: -ETXTBSY);
891	}
892
893	ret = set_folio_extent_mapped(folio);
894	if (ret < `0`) {
895	folio_unlock(folio);
896	folio_put(folio);
897	return ERR_PTR(error: ret);
898	}
899
900	/ Wait for any existing ordered extent in the range /
901	while (`1`) {
902	struct btrfs_ordered_extent *ordered;
903
904	lock_extent(tree: &inode->io_tree, start: page_start, end: page_end, cached: &cached_state);
905	ordered = btrfs_lookup_ordered_range(inode, file_offset: page_start, PAGE_SIZE);
906	unlock_extent(tree: &inode->io_tree, start: page_start, end: page_end,
907	cached: &cached_state);
908	if (!ordered)
909	break;
910
911	folio_unlock(folio);
912	btrfs_start_ordered_extent(entry: ordered);
913	btrfs_put_ordered_extent(entry: ordered);
914	folio_lock(folio);
915	/*
916	* We unlocked the folio above, so we need check if it was
917	* released or not.
918	*/
919	if (folio->mapping != mapping \|\| !folio->private) {
920	folio_unlock(folio);
921	folio_put(folio);
922	goto again;
923	}
924	}
925
926	/*
927	* Now the page range has no ordered extent any more. Read the page to
928	* make it uptodate.
929	*/
930	if (!folio_test_uptodate(folio)) {
931	btrfs_read_folio(NULL, folio);
932	folio_lock(folio);
933	if (folio->mapping != mapping \|\| !folio->private) {
934	folio_unlock(folio);
935	folio_put(folio);
936	goto again;
937	}
938	if (!folio_test_uptodate(folio)) {
939	folio_unlock(folio);
940	folio_put(folio);
941	return ERR_PTR(error: -EIO);
942	}
943	}
944	return folio;
945	}
946
947	struct defrag_target_range {
948	struct list_head list;
949	u64 start;
950	u64 len;
951	};
952
953	/*
954	* Collect all valid target extents.
955	*
956	* @start: file offset to lookup
957	* @len: length to lookup
958	* @extent_thresh: file extent size threshold, any extent size >= this value
959	* will be ignored
960	* @newer_than: only defrag extents newer than this value
961	* @do_compress: whether the defrag is doing compression
962	* if true, @extent_thresh will be ignored and all regular
963	* file extents meeting @newer_than will be targets.
964	* @locked: if the range has already held extent lock
965	* @target_list: list of targets file extents
966	*/
967	static int defrag_collect_targets(struct btrfs_inode *inode,
968	u64 start, u64 len, u32 extent_thresh,
969	u64 newer_than, bool do_compress,
970	bool locked, struct list_head *target_list,
971	u64 *last_scanned_ret)
972	{
973	struct btrfs_fs_info *fs_info = inode->root->fs_info;
974	bool last_is_target = false;
975	u64 cur = start;
976	int ret = `0`;
977
978	while (cur < start + len) {
979	struct extent_map *em;
980	struct defrag_target_range *new;
981	bool next_mergeable = true;
982	u64 range_len;
983
984	last_is_target = false;
985	em = defrag_lookup_extent(inode: &inode->vfs_inode, start: cur, newer_than, locked);
986	if (!em)
987	break;
988
989	/*
990	* If the file extent is an inlined one, we may still want to
991	* defrag it (fallthrough) if it will cause a regular extent.
992	* This is for users who want to convert inline extents to
993	* regular ones through max_inline= mount option.
994	*/
995	if (em->block_start == EXTENT_MAP_INLINE &&
996	em->len <= inode->root->fs_info->max_inline)
997	goto next;
998
999	/ Skip holes and preallocated extents. /
1000	if (em->block_start == EXTENT_MAP_HOLE \|\|
1001	(em->flags & EXTENT_FLAG_PREALLOC))
1002	goto next;
1003
1004	/ Skip older extent /
1005	if (em->generation < newer_than)
1006	goto next;
1007
1008	/ This em is under writeback, no need to defrag /
1009	if (em->generation == (u64)-`1`)
1010	goto next;
1011
1012	/*
1013	* Our start offset might be in the middle of an existing extent
1014	* map, so take that into account.
1015	*/
1016	range_len = em->len - (cur - em->start);
1017	/*
1018	* If this range of the extent map is already flagged for delalloc,
1019	* skip it, because:
1020	*
1021	* 1) We could deadlock later, when trying to reserve space for
1022	* delalloc, because in case we can't immediately reserve space
1023	* the flusher can start delalloc and wait for the respective
1024	* ordered extents to complete. The deadlock would happen
1025	* because we do the space reservation while holding the range
1026	* locked, and starting writeback, or finishing an ordered
1027	* extent, requires locking the range;
1028	*
1029	* 2) If there's delalloc there, it means there's dirty pages for
1030	* which writeback has not started yet (we clean the delalloc
1031	* flag when starting writeback and after creating an ordered
1032	* extent). If we mark pages in an adjacent range for defrag,
1033	* then we will have a larger contiguous range for delalloc,
1034	* very likely resulting in a larger extent after writeback is
1035	* triggered (except in a case of free space fragmentation).
1036	*/
1037	if (test_range_bit_exists(tree: &inode->io_tree, start: cur, end: cur + range_len - `1`,
1038	bit: EXTENT_DELALLOC))
1039	goto next;
1040
1041	/*
1042	* For do_compress case, we want to compress all valid file
1043	* extents, thus no @extent_thresh or mergeable check.
1044	*/
1045	if (do_compress)
1046	goto add;
1047
1048	/ Skip too large extent /
1049	if (em->len >= extent_thresh)
1050	goto next;
1051
1052	/*
1053	* Skip extents already at its max capacity, this is mostly for
1054	* compressed extents, which max cap is only 128K.
1055	*/
1056	if (em->len >= get_extent_max_capacity(fs_info, em))
1057	goto next;
1058
1059	/*
1060	* Normally there are no more extents after an inline one, thus
1061	* @next_mergeable will normally be false and not defragged.
1062	* So if an inline extent passed all above checks, just add it
1063	* for defrag, and be converted to regular extents.
1064	*/
1065	if (em->block_start == EXTENT_MAP_INLINE)
1066	goto add;
1067
1068	next_mergeable = defrag_check_next_extent(inode: &inode->vfs_inode, em,
1069	extent_thresh, newer_than, locked);
1070	if (!next_mergeable) {
1071	struct defrag_target_range *last;
1072
1073	/ Empty target list, no way to merge with last entry /
1074	if (list_empty(head: target_list))
1075	goto next;
1076	last = list_entry(target_list->prev,
1077	struct defrag_target_range, list);
1078	/ Not mergeable with last entry /
1079	if (last->start + last->len != cur)
1080	goto next;
1081
1082	/ Mergeable, fall through to add it to @target_list. /
1083	}
1084
1085	add:
1086	last_is_target = true;
1087	range_len = min(extent_map_end(em), start + len) - cur;
1088	/*
1089	* This one is a good target, check if it can be merged into
1090	* last range of the target list.
1091	*/
1092	if (!list_empty(head: target_list)) {
1093	struct defrag_target_range *last;
1094
1095	last = list_entry(target_list->prev,
1096	struct defrag_target_range, list);
1097	ASSERT(last->start + last->len <= cur);
1098	if (last->start + last->len == cur) {
1099	/ Mergeable, enlarge the last entry /
1100	last->len += range_len;
1101	goto next;
1102	}
1103	/ Fall through to allocate a new entry /
1104	}
1105
1106	/ Allocate new defrag_target_range /
1107	new = kmalloc(size: sizeof(*new), GFP_NOFS);
1108	if (!new) {
1109	free_extent_map(em);
1110	ret = -ENOMEM;
1111	break;
1112	}
1113	new->start = cur;
1114	new->len = range_len;
1115	list_add_tail(new: &new->list, head: target_list);
1116
1117	next:
1118	cur = extent_map_end(em);
1119	free_extent_map(em);
1120	}
1121	if (ret < `0`) {
1122	struct defrag_target_range *entry;
1123	struct defrag_target_range *tmp;
1124
1125	list_for_each_entry_safe(entry, tmp, target_list, list) {
1126	list_del_init(entry: &entry->list);
1127	kfree(objp: entry);
1128	}
1129	}
1130	if (!ret && last_scanned_ret) {
1131	/*
1132	* If the last extent is not a target, the caller can skip to
1133	* the end of that extent.
1134	* Otherwise, we can only go the end of the specified range.
1135	*/
1136	if (!last_is_target)
1137	last_scanned_ret = max(cur, last_scanned_ret);
1138	else
1139	last_scanned_ret = max(start + len, last_scanned_ret);
1140	}
1141	return ret;
1142	}
1143
1144	#define CLUSTER_SIZE (SZ_256K)
1145	static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
1146
1147	/*
1148	* Defrag one contiguous target range.
1149	*
1150	* @inode: target inode
1151	* @target: target range to defrag
1152	* @pages: locked pages covering the defrag range
1153	* @nr_pages: number of locked pages
1154	*
1155	* Caller should ensure:
1156	*
1157	* - Pages are prepared
1158	* Pages should be locked, no ordered extent in the pages range,
1159	* no writeback.
1160	*
1161	* - Extent bits are locked
1162	*/
1163	static int defrag_one_locked_target(struct btrfs_inode *inode,
1164	struct defrag_target_range *target,
1165	struct folio *folios, int* nr_pages,
1166	struct extent_state **cached_state)
1167	{
1168	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1169	struct extent_changeset *data_reserved = NULL;
1170	const u64 start = target->start;
1171	const u64 len = target->len;
1172	unsigned long last_index = (start + len - `1`) >> PAGE_SHIFT;
1173	unsigned long start_index = start >> PAGE_SHIFT;
1174	unsigned long first_index = folios[`0`]->index;
1175	int ret = `0`;
1176	int i;
1177
1178	ASSERT(last_index - first_index + `1` <= nr_pages);
1179
1180	ret = btrfs_delalloc_reserve_space(inode, reserved: &data_reserved, start, len);
1181	if (ret < `0`)
1182	return ret;
1183	clear_extent_bit(tree: &inode->io_tree, start, end: start + len - `1`,
1184	bits: EXTENT_DELALLOC \| EXTENT_DO_ACCOUNTING \|
1185	EXTENT_DEFRAG, cached: cached_state);
1186	set_extent_bit(tree: &inode->io_tree, start, end: start + len - `1`,
1187	bits: EXTENT_DELALLOC \| EXTENT_DEFRAG, cached_state);
1188
1189	/ Update the page status /
1190	for (i = start_index - first_index; i <= last_index - first_index; i++) {
1191	folio_clear_checked(folio: folios[i]);
1192	btrfs_folio_clamp_set_dirty(fs_info, folio: folios[i], start, len);
1193	}
1194	btrfs_delalloc_release_extents(inode, num_bytes: len);
1195	extent_changeset_free(changeset: data_reserved);
1196
1197	return ret;
1198	}
1199
1200	static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
1201	u32 extent_thresh, u64 newer_than, bool do_compress,
1202	u64 *last_scanned_ret)
1203	{
1204	struct extent_state *cached_state = NULL;
1205	struct defrag_target_range *entry;
1206	struct defrag_target_range *tmp;
1207	LIST_HEAD(target_list);
1208	struct folio **folios;
1209	const u32 sectorsize = inode->root->fs_info->sectorsize;
1210	u64 last_index = (start + len - `1`) >> PAGE_SHIFT;
1211	u64 start_index = start >> PAGE_SHIFT;
1212	unsigned int nr_pages = last_index - start_index + `1`;
1213	int ret = `0`;
1214	int i;
1215
1216	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
1217	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
1218
1219	folios = kcalloc(n: nr_pages, size: sizeof(struct folio *), GFP_NOFS);
1220	if (!folios)
1221	return -ENOMEM;
1222
1223	/ Prepare all pages /
1224	for (i = `0`; i < nr_pages; i++) {
1225	folios[i] = defrag_prepare_one_folio(inode, index: start_index + i);
1226	if (IS_ERR(ptr: folios[i])) {
1227	ret = PTR_ERR(ptr: folios[i]);
1228	nr_pages = i;
1229	goto free_folios;
1230	}
1231	}
1232	for (i = `0`; i < nr_pages; i++)
1233	folio_wait_writeback(folio: folios[i]);
1234
1235	/ Lock the pages range /
1236	lock_extent(tree: &inode->io_tree, start: start_index << PAGE_SHIFT,
1237	end: (last_index << PAGE_SHIFT) + PAGE_SIZE - `1`,
1238	cached: &cached_state);
1239	/*
1240	* Now we have a consistent view about the extent map, re-check
1241	* which range really needs to be defragged.
1242	*
1243	* And this time we have extent locked already, pass @locked = true
1244	* so that we won't relock the extent range and cause deadlock.
1245	*/
1246	ret = defrag_collect_targets(inode, start, len, extent_thresh,
1247	newer_than, do_compress, locked: true,
1248	target_list: &target_list, last_scanned_ret);
1249	if (ret < `0`)
1250	goto unlock_extent;
1251
1252	list_for_each_entry(entry, &target_list, list) {
1253	ret = defrag_one_locked_target(inode, target: entry, folios, nr_pages,
1254	cached_state: &cached_state);
1255	if (ret < `0`)
1256	break;
1257	}
1258
1259	list_for_each_entry_safe(entry, tmp, &target_list, list) {
1260	list_del_init(entry: &entry->list);
1261	kfree(objp: entry);
1262	}
1263	unlock_extent:
1264	unlock_extent(tree: &inode->io_tree, start: start_index << PAGE_SHIFT,
1265	end: (last_index << PAGE_SHIFT) + PAGE_SIZE - `1`,
1266	cached: &cached_state);
1267	free_folios:
1268	for (i = `0`; i < nr_pages; i++) {
1269	folio_unlock(folio: folios[i]);
1270	folio_put(folio: folios[i]);
1271	}
1272	kfree(objp: folios);
1273	return ret;
1274	}
1275
1276	static int defrag_one_cluster(struct btrfs_inode *inode,
1277	struct file_ra_state *ra,
1278	u64 start, u32 len, u32 extent_thresh,
1279	u64 newer_than, bool do_compress,
1280	unsigned long *sectors_defragged,
1281	unsigned long max_sectors,
1282	u64 *last_scanned_ret)
1283	{
1284	const u32 sectorsize = inode->root->fs_info->sectorsize;
1285	struct defrag_target_range *entry;
1286	struct defrag_target_range *tmp;
1287	LIST_HEAD(target_list);
1288	int ret;
1289
1290	ret = defrag_collect_targets(inode, start, len, extent_thresh,
1291	newer_than, do_compress, locked: false,
1292	target_list: &target_list, NULL);
1293	if (ret < `0`)
1294	goto out;
1295
1296	list_for_each_entry(entry, &target_list, list) {
1297	u32 range_len = entry->len;
1298
1299	/ Reached or beyond the limit /
1300	if (max_sectors && *sectors_defragged >= max_sectors) {
1301	ret = `1`;
1302	break;
1303	}
1304
1305	if (max_sectors)
1306	range_len = min_t(u32, range_len,
1307	(max_sectors - sectors_defragged) sectorsize);
1308
1309	/*
1310	* If defrag_one_range() has updated last_scanned_ret,
1311	* our range may already be invalid (e.g. hole punched).
1312	* Skip if our range is before last_scanned_ret, as there is
1313	* no need to defrag the range anymore.
1314	*/
1315	if (entry->start + range_len <= *last_scanned_ret)
1316	continue;
1317
1318	if (ra)
1319	page_cache_sync_readahead(mapping: inode->vfs_inode.i_mapping,
1320	ra, NULL, index: entry->start >> PAGE_SHIFT,
1321	req_count: ((entry->start + range_len - `1`) >> PAGE_SHIFT) -
1322	(entry->start >> PAGE_SHIFT) + `1`);
1323	/*
1324	* Here we may not defrag any range if holes are punched before
1325	* we locked the pages.
1326	* But that's fine, it only affects the @sectors_defragged
1327	* accounting.
1328	*/
1329	ret = defrag_one_range(inode, start: entry->start, len: range_len,
1330	extent_thresh, newer_than, do_compress,
1331	last_scanned_ret);
1332	if (ret < `0`)
1333	break;
1334	*sectors_defragged += range_len >>
1335	inode->root->fs_info->sectorsize_bits;
1336	}
1337	out:
1338	list_for_each_entry_safe(entry, tmp, &target_list, list) {
1339	list_del_init(entry: &entry->list);
1340	kfree(objp: entry);
1341	}
1342	if (ret >= `0`)
1343	last_scanned_ret = max(last_scanned_ret, start + len);
1344	return ret;
1345	}
1346
1347	/*
1348	* Entry point to file defragmentation.
1349	*
1350	* @inode: inode to be defragged
1351	* @ra: readahead state (can be NUL)
1352	* @range: defrag options including range and flags
1353	* @newer_than: minimum transid to defrag
1354	* @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
1355	* will be defragged.
1356	*
1357	* Return <0 for error.
1358	* Return >=0 for the number of sectors defragged, and range->start will be updated
1359	* to indicate the file offset where next defrag should be started at.
1360	* (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
1361	* defragging all the range).
1362	*/
1363	int btrfs_defrag_file(struct inode inode, struct* file_ra_state *ra,
1364	struct btrfs_ioctl_defrag_range_args *range,
1365	u64 newer_than, unsigned long max_to_defrag)
1366	{
1367	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1368	unsigned long sectors_defragged = `0`;
1369	u64 isize = i_size_read(inode);
1370	u64 cur;
1371	u64 last_byte;
1372	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
1373	bool ra_allocated = false;
1374	int compress_type = BTRFS_COMPRESS_ZLIB;
1375	int ret = `0`;
1376	u32 extent_thresh = range->extent_thresh;
1377	pgoff_t start_index;
1378
1379	if (isize == `0`)
1380	return `0`;
1381
1382	if (range->start >= isize)
1383	return -EINVAL;
1384
1385	if (do_compress) {
1386	if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
1387	return -EINVAL;
1388	if (range->compress_type)
1389	compress_type = range->compress_type;
1390	}
1391
1392	if (extent_thresh == `0`)
1393	extent_thresh = SZ_256K;
1394
1395	if (range->start + range->len > range->start) {
1396	/ Got a specific range /
1397	last_byte = min(isize, range->start + range->len);
1398	} else {
1399	/ Defrag until file end /
1400	last_byte = isize;
1401	}
1402
1403	/ Align the range /
1404	cur = round_down(range->start, fs_info->sectorsize);
1405	last_byte = round_up(last_byte, fs_info->sectorsize) - `1`;
1406
1407	/*
1408	* If we were not given a ra, allocate a readahead context. As
1409	* readahead is just an optimization, defrag will work without it so
1410	* we don't error out.
1411	*/
1412	if (!ra) {
1413	ra_allocated = true;
1414	ra = kzalloc(size: sizeof(*ra), GFP_KERNEL);
1415	if (ra)
1416	file_ra_state_init(ra, mapping: inode->i_mapping);
1417	}
1418
1419	/*
1420	* Make writeback start from the beginning of the range, so that the
1421	* defrag range can be written sequentially.
1422	*/
1423	start_index = cur >> PAGE_SHIFT;
1424	if (start_index < inode->i_mapping->writeback_index)
1425	inode->i_mapping->writeback_index = start_index;
1426
1427	while (cur < last_byte) {
1428	const unsigned long prev_sectors_defragged = sectors_defragged;
1429	u64 last_scanned = cur;
1430	u64 cluster_end;
1431
1432	if (btrfs_defrag_cancelled(fs_info)) {
1433	ret = -EAGAIN;
1434	break;
1435	}
1436
1437	/ We want the cluster end at page boundary when possible /
1438	cluster_end = (((cur >> PAGE_SHIFT) +
1439	(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - `1`;
1440	cluster_end = min(cluster_end, last_byte);
1441
1442	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: `0`);
1443	if (IS_SWAPFILE(inode)) {
1444	ret = -ETXTBSY;
1445	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: `0`);
1446	break;
1447	}
1448	if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
1449	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: `0`);
1450	break;
1451	}
1452	if (do_compress)
1453	BTRFS_I(inode)->defrag_compress = compress_type;
1454	ret = defrag_one_cluster(inode: BTRFS_I(inode), ra, start: cur,
1455	len: cluster_end + `1` - cur, extent_thresh,
1456	newer_than, do_compress, sectors_defragged: &sectors_defragged,
1457	max_sectors: max_to_defrag, last_scanned_ret: &last_scanned);
1458
1459	if (sectors_defragged > prev_sectors_defragged)
1460	balance_dirty_pages_ratelimited(mapping: inode->i_mapping);
1461
1462	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: `0`);
1463	if (ret < `0`)
1464	break;
1465	cur = max(cluster_end + `1`, last_scanned);
1466	if (ret > `0`) {
1467	ret = `0`;
1468	break;
1469	}
1470	cond_resched();
1471	}
1472
1473	if (ra_allocated)
1474	kfree(objp: ra);
1475	/*
1476	* Update range.start for autodefrag, this will indicate where to start
1477	* in next run.
1478	*/
1479	range->start = cur;
1480	if (sectors_defragged) {
1481	/*
1482	* We have defragged some sectors, for compression case they
1483	* need to be written back immediately.
1484	*/
1485	if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
1486	filemap_flush(inode->i_mapping);
1487	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1488	&BTRFS_I(inode)->runtime_flags))
1489	filemap_flush(inode->i_mapping);
1490	}
1491	if (range->compress_type == BTRFS_COMPRESS_LZO)
1492	btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
1493	else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
1494	btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
1495	ret = sectors_defragged;
1496	}
1497	if (do_compress) {
1498	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: `0`);
1499	BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
1500	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: `0`);
1501	}
1502	return ret;
1503	}
1504
1505	void __cold btrfs_auto_defrag_exit(void)
1506	{
1507	kmem_cache_destroy(s: btrfs_inode_defrag_cachep);
1508	}
1509
1510	int __init btrfs_auto_defrag_init(void)
1511	{
1512	btrfs_inode_defrag_cachep = kmem_cache_create(name: "btrfs_inode_defrag",
1513	size: sizeof(struct inode_defrag), align: `0`, flags: `0`, NULL);
1514	if (!btrfs_inode_defrag_cachep)
1515	return -ENOMEM;
1516
1517	return `0`;
1518	}
1519

source code of linux/fs/btrfs/defrag.c