transaction.c source code [linux/fs/btrfs/transaction.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2007 Oracle. All rights reserved.
4	*/
5
6	#include <linux/fs.h>
7	#include <linux/slab.h>
8	#include <linux/sched.h>
9	#include <linux/sched/mm.h>
10	#include <linux/writeback.h>
11	#include <linux/pagemap.h>
12	#include <linux/blkdev.h>
13	#include <linux/uuid.h>
14	#include <linux/timekeeping.h>
15	#include "misc.h"
16	#include "ctree.h"
17	#include "disk-io.h"
18	#include "transaction.h"
19	#include "locking.h"
20	#include "tree-log.h"
21	#include "volumes.h"
22	#include "dev-replace.h"
23	#include "qgroup.h"
24	#include "block-group.h"
25	#include "space-info.h"
26	#include "fs.h"
27	#include "accessors.h"
28	#include "extent-tree.h"
29	#include "root-tree.h"
30	#include "dir-item.h"
31	#include "uuid-tree.h"
32	#include "ioctl.h"
33	#include "relocation.h"
34	#include "scrub.h"
35
/* Slab cache from which start_transaction() allocates its transaction handles. */
static struct kmem_cache *btrfs_trans_handle_cachep;
37
/*
 * Transaction states and transitions
 *
 * No running transaction (fs tree blocks are not modified)
 * |
 * | To next stage:
 * |  Call start_transaction() variants. Except btrfs_join_transaction_nostart().
 * V
 * Transaction N [[TRANS_STATE_RUNNING]]
 * |
 * | New trans handles can be attached to transaction N by calling all
 * | start_transaction() variants.
 * |
 * | To next stage:
 * |  Call btrfs_commit_transaction() on any trans handle attached to
 * |  transaction N
 * V
 * Transaction N [[TRANS_STATE_COMMIT_PREP]]
 * |
 * | If there are simultaneous calls to btrfs_commit_transaction() one will win
 * | the race and the rest will wait for the winner to commit the transaction.
 * |
 * | The winner will wait for previous running transaction to completely finish
 * | if there is one.
 * |
 * Transaction N [[TRANS_STATE_COMMIT_START]]
 * |
 * | Then one of the following happens:
 * | - Wait for all other trans handle holders to release.
 * |   The btrfs_commit_transaction() caller will do the commit work.
 * | - Wait for current transaction to be committed by others.
 * |   Other btrfs_commit_transaction() caller will do the commit work.
 * |
 * | At this stage, only btrfs_join_transaction*() variants can attach
 * | to this running transaction.
 * | All other variants will wait for current one to finish and attach to
 * | transaction N+1.
 * |
 * | To next stage:
 * |  Caller is chosen to commit transaction N, and all other trans handle
 * |  have been released.
 * V
 * Transaction N [[TRANS_STATE_COMMIT_DOING]]
 * |
 * | The heavy lifting transaction work is started.
 * | From running delayed refs (modifying extent tree) to creating pending
 * | snapshots, running qgroups.
 * | In short, modify supporting trees to reflect modifications of subvolume
 * | trees.
 * |
 * | At this stage, all start_transaction() calls will wait for this
 * | transaction to finish and attach to transaction N+1.
 * |
 * | To next stage:
 * |  Until all supporting trees are updated.
 * V
 * Transaction N [[TRANS_STATE_UNBLOCKED]]
 * |                                                Transaction N+1
 * | All needed trees are modified, thus we only    [[TRANS_STATE_RUNNING]]
 * | need to write them back to disk and update     |
 * | super blocks.                                  |
 * |                                                |
 * | At this stage, new transaction is allowed to   |
 * | start.                                         |
 * | All new start_transaction() calls will be      |
 * | attached to transid N+1.                       |
 * |                                                |
 * | To next stage:                                 |
 * |  Until all tree blocks and super blocks are    |
 * |  written to block devices                      |
 * V                                                |
 * Transaction N [[TRANS_STATE_COMPLETED]]          V
 *   All tree blocks and super blocks are written.  Transaction N+1
 *   This transaction is finished and all its       [[TRANS_STATE_COMMIT_START]]
 *   data structures will be cleaned up.            | Life goes on
 */
114	static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
115	[TRANS_STATE_RUNNING] = `0U`,
116	[TRANS_STATE_COMMIT_PREP] = `0U`,
117	[TRANS_STATE_COMMIT_START] = (__TRANS_START \| __TRANS_ATTACH),
118	[TRANS_STATE_COMMIT_DOING] = (__TRANS_START \|
119	__TRANS_ATTACH \|
120	__TRANS_JOIN \|
121	__TRANS_JOIN_NOSTART),
122	[TRANS_STATE_UNBLOCKED] = (__TRANS_START \|
123	__TRANS_ATTACH \|
124	__TRANS_JOIN \|
125	__TRANS_JOIN_NOLOCK \|
126	__TRANS_JOIN_NOSTART),
127	[TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START \|
128	__TRANS_ATTACH \|
129	__TRANS_JOIN \|
130	__TRANS_JOIN_NOLOCK \|
131	__TRANS_JOIN_NOSTART),
132	[TRANS_STATE_COMPLETED] = (__TRANS_START \|
133	__TRANS_ATTACH \|
134	__TRANS_JOIN \|
135	__TRANS_JOIN_NOLOCK \|
136	__TRANS_JOIN_NOSTART),
137	};
138
139	void btrfs_put_transaction(struct btrfs_transaction *transaction)
140	{
141	WARN_ON(refcount_read(&transaction->use_count) == `0`);
142	if (refcount_dec_and_test(r: &transaction->use_count)) {
143	BUG_ON(!list_empty(&transaction->list));
144	WARN_ON(!RB_EMPTY_ROOT(
145	&transaction->delayed_refs.href_root.rb_root));
146	WARN_ON(!RB_EMPTY_ROOT(
147	&transaction->delayed_refs.dirty_extent_root));
148	if (transaction->delayed_refs.pending_csums)
149	btrfs_err(transaction->fs_info,
150	"pending csums is %llu",
151	transaction->delayed_refs.pending_csums);
152	/*
153	* If any block groups are found in ->deleted_bgs then it's
154	* because the transaction was aborted and a commit did not
155	* happen (things failed before writing the new superblock
156	* and calling btrfs_finish_extent_commit()), so we can not
157	* discard the physical locations of the block groups.
158	*/
159	while (!list_empty(head: &transaction->deleted_bgs)) {
160	struct btrfs_block_group *cache;
161
162	cache = list_first_entry(&transaction->deleted_bgs,
163	struct btrfs_block_group,
164	bg_list);
165	list_del_init(entry: &cache->bg_list);
166	btrfs_unfreeze_block_group(cache);
167	btrfs_put_block_group(cache);
168	}
169	WARN_ON(!list_empty(&transaction->dev_update_list));
170	kfree(objp: transaction);
171	}
172	}
173
174	static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
175	{
176	struct btrfs_transaction *cur_trans = trans->transaction;
177	struct btrfs_fs_info *fs_info = trans->fs_info;
178	struct btrfs_root root, tmp;
179
180	/*
181	* At this point no one can be using this transaction to modify any tree
182	* and no one can start another transaction to modify any tree either.
183	*/
184	ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
185
186	down_write(sem: &fs_info->commit_root_sem);
187
188	if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
189	fs_info->last_reloc_trans = trans->transid;
190
191	list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
192	dirty_list) {
193	list_del_init(entry: &root->dirty_list);
194	free_extent_buffer(eb: root->commit_root);
195	root->commit_root = btrfs_root_node(root);
196	extent_io_tree_release(tree: &root->dirty_log_pages);
197	btrfs_qgroup_clean_swapped_blocks(root);
198	}
199
200	/ We can free old roots now. /
201	spin_lock(lock: &cur_trans->dropped_roots_lock);
202	while (!list_empty(head: &cur_trans->dropped_roots)) {
203	root = list_first_entry(&cur_trans->dropped_roots,
204	struct btrfs_root, root_list);
205	list_del_init(entry: &root->root_list);
206	spin_unlock(lock: &cur_trans->dropped_roots_lock);
207	btrfs_free_log(trans, root);
208	btrfs_drop_and_free_fs_root(fs_info, root);
209	spin_lock(lock: &cur_trans->dropped_roots_lock);
210	}
211	spin_unlock(lock: &cur_trans->dropped_roots_lock);
212
213	up_write(sem: &fs_info->commit_root_sem);
214	}
215
216	static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
217	unsigned int type)
218	{
219	if (type & TRANS_EXTWRITERS)
220	atomic_inc(v: &trans->num_extwriters);
221	}
222
223	static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
224	unsigned int type)
225	{
226	if (type & TRANS_EXTWRITERS)
227	atomic_dec(v: &trans->num_extwriters);
228	}
229
230	static inline void extwriter_counter_init(struct btrfs_transaction *trans,
231	unsigned int type)
232	{
233	atomic_set(v: &trans->num_extwriters, i: ((type & TRANS_EXTWRITERS) ? `1` : `0`));
234	}
235
236	static inline int extwriter_counter_read(struct btrfs_transaction *trans)
237	{
238	return atomic_read(v: &trans->num_extwriters);
239	}
240
241	/*
242	* To be called after doing the chunk btree updates right after allocating a new
243	* chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
244	* chunk after all chunk btree updates and after finishing the second phase of
245	* chunk allocation (btrfs_create_pending_block_groups()) in case some block
246	* group had its chunk item insertion delayed to the second phase.
247	*/
248	void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
249	{
250	struct btrfs_fs_info *fs_info = trans->fs_info;
251
252	if (!trans->chunk_bytes_reserved)
253	return;
254
255	btrfs_block_rsv_release(fs_info, block_rsv: &fs_info->chunk_block_rsv,
256	num_bytes: trans->chunk_bytes_reserved, NULL);
257	trans->chunk_bytes_reserved = `0`;
258	}
259
260	/*
261	* either allocate a new transaction or hop into the existing one
262	*/
263	static noinline int join_transaction(struct btrfs_fs_info *fs_info,
264	unsigned int type)
265	{
266	struct btrfs_transaction *cur_trans;
267
268	spin_lock(lock: &fs_info->trans_lock);
269	loop:
270	/ The file system has been taken offline. No new transactions. /
271	if (BTRFS_FS_ERROR(fs_info)) {
272	spin_unlock(lock: &fs_info->trans_lock);
273	return -EROFS;
274	}
275
276	cur_trans = fs_info->running_transaction;
277	if (cur_trans) {
278	if (TRANS_ABORTED(cur_trans)) {
279	spin_unlock(lock: &fs_info->trans_lock);
280	return cur_trans->aborted;
281	}
282	if (btrfs_blocked_trans_types[cur_trans->state] & type) {
283	spin_unlock(lock: &fs_info->trans_lock);
284	return -EBUSY;
285	}
286	refcount_inc(r: &cur_trans->use_count);
287	atomic_inc(v: &cur_trans->num_writers);
288	extwriter_counter_inc(trans: cur_trans, type);
289	spin_unlock(lock: &fs_info->trans_lock);
290	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
291	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
292	return `0`;
293	}
294	spin_unlock(lock: &fs_info->trans_lock);
295
296	/*
297	* If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the
298	* current transaction, and commit it. If there is no transaction, just
299	* return ENOENT.
300	*/
301	if (type == TRANS_ATTACH \|\| type == TRANS_JOIN_NOSTART)
302	return -ENOENT;
303
304	/*
305	* JOIN_NOLOCK only happens during the transaction commit, so
306	* it is impossible that ->running_transaction is NULL
307	*/
308	BUG_ON(type == TRANS_JOIN_NOLOCK);
309
310	cur_trans = kmalloc(size: sizeof(*cur_trans), GFP_NOFS);
311	if (!cur_trans)
312	return -ENOMEM;
313
314	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
315	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
316
317	spin_lock(lock: &fs_info->trans_lock);
318	if (fs_info->running_transaction) {
319	/*
320	* someone started a transaction after we unlocked. Make sure
321	* to redo the checks above
322	*/
323	btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
324	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
325	kfree(objp: cur_trans);
326	goto loop;
327	} else if (BTRFS_FS_ERROR(fs_info)) {
328	spin_unlock(lock: &fs_info->trans_lock);
329	btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
330	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
331	kfree(objp: cur_trans);
332	return -EROFS;
333	}
334
335	cur_trans->fs_info = fs_info;
336	atomic_set(v: &cur_trans->pending_ordered, i: `0`);
337	init_waitqueue_head(&cur_trans->pending_wait);
338	atomic_set(v: &cur_trans->num_writers, i: `1`);
339	extwriter_counter_init(trans: cur_trans, type);
340	init_waitqueue_head(&cur_trans->writer_wait);
341	init_waitqueue_head(&cur_trans->commit_wait);
342	cur_trans->state = TRANS_STATE_RUNNING;
343	/*
344	* One for this trans handle, one so it will live on until we
345	* commit the transaction.
346	*/
347	refcount_set(r: &cur_trans->use_count, n: `2`);
348	cur_trans->flags = `0`;
349	cur_trans->start_time = ktime_get_seconds();
350
351	memset(&cur_trans->delayed_refs, `0`, sizeof(cur_trans->delayed_refs));
352
353	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
354	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
355	atomic_set(v: &cur_trans->delayed_refs.num_entries, i: `0`);
356
357	/*
358	* although the tree mod log is per file system and not per transaction,
359	* the log must never go across transaction boundaries.
360	*/
361	smp_mb();
362	if (!list_empty(head: &fs_info->tree_mod_seq_list))
363	WARN(`1`, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
364	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
365	WARN(`1`, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
366	atomic64_set(v: &fs_info->tree_mod_seq, i: `0`);
367
368	spin_lock_init(&cur_trans->delayed_refs.lock);
369
370	INIT_LIST_HEAD(list: &cur_trans->pending_snapshots);
371	INIT_LIST_HEAD(list: &cur_trans->dev_update_list);
372	INIT_LIST_HEAD(list: &cur_trans->switch_commits);
373	INIT_LIST_HEAD(list: &cur_trans->dirty_bgs);
374	INIT_LIST_HEAD(list: &cur_trans->io_bgs);
375	INIT_LIST_HEAD(list: &cur_trans->dropped_roots);
376	mutex_init(&cur_trans->cache_write_mutex);
377	spin_lock_init(&cur_trans->dirty_bgs_lock);
378	INIT_LIST_HEAD(list: &cur_trans->deleted_bgs);
379	spin_lock_init(&cur_trans->dropped_roots_lock);
380	list_add_tail(new: &cur_trans->list, head: &fs_info->trans_list);
381	extent_io_tree_init(fs_info, tree: &cur_trans->dirty_pages,
382	owner: IO_TREE_TRANS_DIRTY_PAGES);
383	extent_io_tree_init(fs_info, tree: &cur_trans->pinned_extents,
384	owner: IO_TREE_FS_PINNED_EXTENTS);
385	btrfs_set_fs_generation(fs_info, gen: fs_info->generation + `1`);
386	cur_trans->transid = fs_info->generation;
387	fs_info->running_transaction = cur_trans;
388	cur_trans->aborted = `0`;
389	spin_unlock(lock: &fs_info->trans_lock);
390
391	return `0`;
392	}
393
394	/*
395	* This does all the record keeping required to make sure that a shareable root
396	* is properly recorded in a given transaction. This is required to make sure
397	* the old root from before we joined the transaction is deleted when the
398	* transaction commits.
399	*/
400	static int record_root_in_trans(struct btrfs_trans_handle *trans,
401	struct btrfs_root *root,
402	int force)
403	{
404	struct btrfs_fs_info *fs_info = root->fs_info;
405	int ret = `0`;
406
407	if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
408	root->last_trans < trans->transid) \|\| force) {
409	WARN_ON(!force && root->commit_root != root->node);
410
411	/*
412	* see below for IN_TRANS_SETUP usage rules
413	* we have the reloc mutex held now, so there
414	* is only one writer in this function
415	*/
416	set_bit(nr: BTRFS_ROOT_IN_TRANS_SETUP, addr: &root->state);
417
418	/ make sure readers find IN_TRANS_SETUP before*
419	* they find our root->last_trans update
420	*/
421	smp_wmb();
422
423	spin_lock(lock: &fs_info->fs_roots_radix_lock);
424	if (root->last_trans == trans->transid && !force) {
425	spin_unlock(lock: &fs_info->fs_roots_radix_lock);
426	return `0`;
427	}
428	radix_tree_tag_set(&fs_info->fs_roots_radix,
429	index: (unsigned long)root->root_key.objectid,
430	BTRFS_ROOT_TRANS_TAG);
431	spin_unlock(lock: &fs_info->fs_roots_radix_lock);
432	root->last_trans = trans->transid;
433
434	/ this is pretty tricky. We don't want to*
435	* take the relocation lock in btrfs_record_root_in_trans
436	* unless we're really doing the first setup for this root in
437	* this transaction.
438	*
439	* Normally we'd use root->last_trans as a flag to decide
440	* if we want to take the expensive mutex.
441	*
442	* But, we have to set root->last_trans before we
443	* init the relocation root, otherwise, we trip over warnings
444	* in ctree.c. The solution used here is to flag ourselves
445	* with root IN_TRANS_SETUP. When this is 1, we're still
446	* fixing up the reloc trees and everyone must wait.
447	*
448	* When this is zero, they can trust root->last_trans and fly
449	* through btrfs_record_root_in_trans without having to take the
450	* lock. smp_wmb() makes sure that all the writes above are
451	* done before we pop in the zero below
452	*/
453	ret = btrfs_init_reloc_root(trans, root);
454	smp_mb__before_atomic();
455	clear_bit(nr: BTRFS_ROOT_IN_TRANS_SETUP, addr: &root->state);
456	}
457	return ret;
458	}
459
460
461	void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
462	struct btrfs_root *root)
463	{
464	struct btrfs_fs_info *fs_info = root->fs_info;
465	struct btrfs_transaction *cur_trans = trans->transaction;
466
467	/ Add ourselves to the transaction dropped list /
468	spin_lock(lock: &cur_trans->dropped_roots_lock);
469	list_add_tail(new: &root->root_list, head: &cur_trans->dropped_roots);
470	spin_unlock(lock: &cur_trans->dropped_roots_lock);
471
472	/ Make sure we don't try to update the root at commit time /
473	spin_lock(lock: &fs_info->fs_roots_radix_lock);
474	radix_tree_tag_clear(&fs_info->fs_roots_radix,
475	index: (unsigned long)root->root_key.objectid,
476	BTRFS_ROOT_TRANS_TAG);
477	spin_unlock(lock: &fs_info->fs_roots_radix_lock);
478	}
479
480	int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
481	struct btrfs_root *root)
482	{
483	struct btrfs_fs_info *fs_info = root->fs_info;
484	int ret;
485
486	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
487	return `0`;
488
489	/*
490	* see record_root_in_trans for comments about IN_TRANS_SETUP usage
491	* and barriers
492	*/
493	smp_rmb();
494	if (root->last_trans == trans->transid &&
495	!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
496	return `0`;
497
498	mutex_lock(&fs_info->reloc_mutex);
499	ret = record_root_in_trans(trans, root, force: `0`);
500	mutex_unlock(lock: &fs_info->reloc_mutex);
501
502	return ret;
503	}
504
505	static inline int is_transaction_blocked(struct btrfs_transaction *trans)
506	{
507	return (trans->state >= TRANS_STATE_COMMIT_START &&
508	trans->state < TRANS_STATE_UNBLOCKED &&
509	!TRANS_ABORTED(trans));
510	}
511
512	/ wait for commit against the current transaction to become unblocked*
513	* when this is done, it is safe to start a new transaction, but the current
514	* transaction might not be fully on disk.
515	*/
516	static void wait_current_trans(struct btrfs_fs_info *fs_info)
517	{
518	struct btrfs_transaction *cur_trans;
519
520	spin_lock(lock: &fs_info->trans_lock);
521	cur_trans = fs_info->running_transaction;
522	if (cur_trans && is_transaction_blocked(trans: cur_trans)) {
523	refcount_inc(r: &cur_trans->use_count);
524	spin_unlock(lock: &fs_info->trans_lock);
525
526	btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
527	wait_event(fs_info->transaction_wait,
528	cur_trans->state >= TRANS_STATE_UNBLOCKED \|\|
529	TRANS_ABORTED(cur_trans));
530	btrfs_put_transaction(transaction: cur_trans);
531	} else {
532	spin_unlock(lock: &fs_info->trans_lock);
533	}
534	}
535
536	static int may_wait_transaction(struct btrfs_fs_info fs_info, int* type)
537	{
538	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
539	return `0`;
540
541	if (type == TRANS_START)
542	return `1`;
543
544	return `0`;
545	}
546
547	static inline bool need_reserve_reloc_root(struct btrfs_root *root)
548	{
549	struct btrfs_fs_info *fs_info = root->fs_info;
550
551	if (!fs_info->reloc_ctl \|\|
552	!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) \|\|
553	root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID \|\|
554	root->reloc_root)
555	return false;
556
557	return true;
558	}
559
560	static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
561	enum btrfs_reserve_flush_enum flush,
562	u64 num_bytes,
563	u64 *delayed_refs_bytes)
564	{
565	struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
566	u64 bytes = num_bytes + *delayed_refs_bytes;
567	int ret;
568
569	/*
570	* We want to reserve all the bytes we may need all at once, so we only
571	* do 1 enospc flushing cycle per transaction start.
572	*/
573	ret = btrfs_reserve_metadata_bytes(fs_info, space_info: si, orig_bytes: bytes, flush);
574
575	/*
576	* If we are an emergency flush, which can steal from the global block
577	* reserve, then attempt to not reserve space for the delayed refs, as
578	* we will consume space for them from the global block reserve.
579	*/
580	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
581	bytes -= *delayed_refs_bytes;
582	*delayed_refs_bytes = `0`;
583	ret = btrfs_reserve_metadata_bytes(fs_info, space_info: si, orig_bytes: bytes, flush);
584	}
585
586	return ret;
587	}
588
589	static struct btrfs_trans_handle *
590	start_transaction(struct btrfs_root root, unsigned* int num_items,
591	unsigned int type, enum btrfs_reserve_flush_enum flush,
592	bool enforce_qgroups)
593	{
594	struct btrfs_fs_info *fs_info = root->fs_info;
595	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
596	struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
597	struct btrfs_trans_handle *h;
598	struct btrfs_transaction *cur_trans;
599	u64 num_bytes = `0`;
600	u64 qgroup_reserved = `0`;
601	u64 delayed_refs_bytes = `0`;
602	bool reloc_reserved = false;
603	bool do_chunk_alloc = false;
604	int ret;
605
606	if (BTRFS_FS_ERROR(fs_info))
607	return ERR_PTR(error: -EROFS);
608
609	if (current->journal_info) {
610	WARN_ON(type & TRANS_EXTWRITERS);
611	h = current->journal_info;
612	refcount_inc(r: &h->use_count);
613	WARN_ON(refcount_read(&h->use_count) > `2`);
614	h->orig_rsv = h->block_rsv;
615	h->block_rsv = NULL;
616	goto got_it;
617	}
618
619	/*
620	* Do the reservation before we join the transaction so we can do all
621	* the appropriate flushing if need be.
622	*/
623	if (num_items && root != fs_info->chunk_root) {
624	qgroup_reserved = num_items * fs_info->nodesize;
625	/*
626	* Use prealloc for now, as there might be a currently running
627	* transaction that could free this reserved space prematurely
628	* by committing.
629	*/
630	ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes: qgroup_reserved,
631	enforce: enforce_qgroups, noflush: false);
632	if (ret)
633	return ERR_PTR(error: ret);
634
635	num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
636	/*
637	* If we plan to insert/update/delete "num_items" from a btree,
638	* we will also generate delayed refs for extent buffers in the
639	* respective btree paths, so reserve space for the delayed refs
640	* that will be generated by the caller as it modifies btrees.
641	* Try to reserve them to avoid excessive use of the global
642	* block reserve.
643	*/
644	delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_delayed_refs: num_items);
645
646	/*
647	* Do the reservation for the relocation root creation
648	*/
649	if (need_reserve_reloc_root(root)) {
650	num_bytes += fs_info->nodesize;
651	reloc_reserved = true;
652	}
653
654	ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
655	delayed_refs_bytes: &delayed_refs_bytes);
656	if (ret)
657	goto reserve_fail;
658
659	btrfs_block_rsv_add_bytes(block_rsv: trans_rsv, num_bytes, update_size: true);
660
661	if (trans_rsv->space_info->force_alloc)
662	do_chunk_alloc = true;
663	} else if (num_items == `0` && flush == BTRFS_RESERVE_FLUSH_ALL &&
664	!btrfs_block_rsv_full(rsv: delayed_refs_rsv)) {
665	/*
666	* Some people call with btrfs_start_transaction(root, 0)
667	* because they can be throttled, but have some other mechanism
668	* for reserving space. We still want these guys to refill the
669	* delayed block_rsv so just add 1 items worth of reservation
670	* here.
671	*/
672	ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
673	if (ret)
674	goto reserve_fail;
675	}
676	again:
677	h = kmem_cache_zalloc(k: btrfs_trans_handle_cachep, GFP_NOFS);
678	if (!h) {
679	ret = -ENOMEM;
680	goto alloc_fail;
681	}
682
683	/*
684	* If we are JOIN_NOLOCK we're already committing a transaction and
685	* waiting on this guy, so we don't need to do the sb_start_intwrite
686	* because we're already holding a ref. We need this because we could
687	* have raced in and did an fsync() on a file which can kick a commit
688	* and then we deadlock with somebody doing a freeze.
689	*
690	* If we are ATTACH, it means we just want to catch the current
691	* transaction and commit it, so we needn't do sb_start_intwrite().
692	*/
693	if (type & __TRANS_FREEZABLE)
694	sb_start_intwrite(sb: fs_info->sb);
695
696	if (may_wait_transaction(fs_info, type))
697	wait_current_trans(fs_info);
698
699	do {
700	ret = join_transaction(fs_info, type);
701	if (ret == -EBUSY) {
702	wait_current_trans(fs_info);
703	if (unlikely(type == TRANS_ATTACH \|\|
704	type == TRANS_JOIN_NOSTART))
705	ret = -ENOENT;
706	}
707	} while (ret == -EBUSY);
708
709	if (ret < `0`)
710	goto join_fail;
711
712	cur_trans = fs_info->running_transaction;
713
714	h->transid = cur_trans->transid;
715	h->transaction = cur_trans;
716	refcount_set(r: &h->use_count, n: `1`);
717	h->fs_info = root->fs_info;
718
719	h->type = type;
720	INIT_LIST_HEAD(list: &h->new_bgs);
721	btrfs_init_metadata_block_rsv(fs_info, rsv: &h->delayed_rsv, type: BTRFS_BLOCK_RSV_DELOPS);
722
723	smp_mb();
724	if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
725	may_wait_transaction(fs_info, type)) {
726	current->journal_info = h;
727	btrfs_commit_transaction(trans: h);
728	goto again;
729	}
730
731	if (num_bytes) {
732	trace_btrfs_space_reservation(fs_info, type: "transaction",
733	val: h->transid, bytes: num_bytes, reserve: `1`);
734	h->block_rsv = trans_rsv;
735	h->bytes_reserved = num_bytes;
736	if (delayed_refs_bytes > `0`) {
737	trace_btrfs_space_reservation(fs_info,
738	type: "local_delayed_refs_rsv",
739	val: h->transid,
740	bytes: delayed_refs_bytes, reserve: `1`);
741	h->delayed_refs_bytes_reserved = delayed_refs_bytes;
742	btrfs_block_rsv_add_bytes(block_rsv: &h->delayed_rsv, num_bytes: delayed_refs_bytes, update_size: true);
743	delayed_refs_bytes = `0`;
744	}
745	h->reloc_reserved = reloc_reserved;
746	}
747
748	got_it:
749	if (!current->journal_info)
750	current->journal_info = h;
751
752	/*
753	* If the space_info is marked ALLOC_FORCE then we'll get upgraded to
754	* ALLOC_FORCE the first run through, and then we won't allocate for
755	* anybody else who races in later. We don't care about the return
756	* value here.
757	*/
758	if (do_chunk_alloc && num_bytes) {
759	u64 flags = h->block_rsv->space_info->flags;
760
761	btrfs_chunk_alloc(trans: h, flags: btrfs_get_alloc_profile(fs_info, orig_flags: flags),
762	force: CHUNK_ALLOC_NO_FORCE);
763	}
764
765	/*
766	* btrfs_record_root_in_trans() needs to alloc new extents, and may
767	* call btrfs_join_transaction() while we're also starting a
768	* transaction.
769	*
770	* Thus it need to be called after current->journal_info initialized,
771	* or we can deadlock.
772	*/
773	ret = btrfs_record_root_in_trans(trans: h, root);
774	if (ret) {
775	/*
776	* The transaction handle is fully initialized and linked with
777	* other structures so it needs to be ended in case of errors,
778	* not just freed.
779	*/
780	btrfs_end_transaction(trans: h);
781	goto reserve_fail;
782	}
783	/*
784	* Now that we have found a transaction to be a part of, convert the
785	* qgroup reservation from prealloc to pertrans. A different transaction
786	* can't race in and free our pertrans out from under us.
787	*/
788	if (qgroup_reserved)
789	btrfs_qgroup_convert_reserved_meta(root, num_bytes: qgroup_reserved);
790
791	return h;
792
793	join_fail:
794	if (type & __TRANS_FREEZABLE)
795	sb_end_intwrite(sb: fs_info->sb);
796	kmem_cache_free(s: btrfs_trans_handle_cachep, objp: h);
797	alloc_fail:
798	if (num_bytes)
799	btrfs_block_rsv_release(fs_info, block_rsv: trans_rsv, num_bytes, NULL);
800	if (delayed_refs_bytes)
801	btrfs_space_info_free_bytes_may_use(fs_info, space_info: trans_rsv->space_info,
802	num_bytes: delayed_refs_bytes);
803	reserve_fail:
804	btrfs_qgroup_free_meta_prealloc(root, num_bytes: qgroup_reserved);
805	return ERR_PTR(error: ret);
806	}
807
808	struct btrfs_trans_handle btrfs_start_transaction(struct* btrfs_root *root,
809	unsigned int num_items)
810	{
811	return start_transaction(root, num_items, TRANS_START,
812	flush: BTRFS_RESERVE_FLUSH_ALL, enforce_qgroups: true);
813	}
814
815	struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
816	struct btrfs_root *root,
817	unsigned int num_items)
818	{
819	return start_transaction(root, num_items, TRANS_START,
820	flush: BTRFS_RESERVE_FLUSH_ALL_STEAL, enforce_qgroups: false);
821	}
822
823	struct btrfs_trans_handle btrfs_join_transaction(struct* btrfs_root *root)
824	{
825	return start_transaction(root, num_items: `0`, TRANS_JOIN, flush: BTRFS_RESERVE_NO_FLUSH,
826	enforce_qgroups: true);
827	}
828
829	struct btrfs_trans_handle btrfs_join_transaction_spacecache(struct* btrfs_root *root)
830	{
831	return start_transaction(root, num_items: `0`, TRANS_JOIN_NOLOCK,
832	flush: BTRFS_RESERVE_NO_FLUSH, enforce_qgroups: true);
833	}
834
835	/*
836	* Similar to regular join but it never starts a transaction when none is
837	* running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED.
838	* This is similar to btrfs_attach_transaction() but it allows the join to
839	* happen if the transaction commit already started but it's not yet in the
840	* "doing" phase (the state is < TRANS_STATE_COMMIT_DOING).
841	*/
842	struct btrfs_trans_handle btrfs_join_transaction_nostart(struct* btrfs_root *root)
843	{
844	return start_transaction(root, num_items: `0`, TRANS_JOIN_NOSTART,
845	flush: BTRFS_RESERVE_NO_FLUSH, enforce_qgroups: true);
846	}
847
848	/*
849	* Catch the running transaction.
850	*
851	* It is used when we want to commit the current the transaction, but
852	* don't want to start a new one.
853	*
854	* Note: If this function return -ENOENT, it just means there is no
855	* running transaction. But it is possible that the inactive transaction
856	* is still in the memory, not fully on disk. If you hope there is no
857	* inactive transaction in the fs when -ENOENT is returned, you should
858	* invoke
859	* btrfs_attach_transaction_barrier()
860	*/
861	struct btrfs_trans_handle btrfs_attach_transaction(struct* btrfs_root *root)
862	{
863	return start_transaction(root, num_items: `0`, TRANS_ATTACH,
864	flush: BTRFS_RESERVE_NO_FLUSH, enforce_qgroups: true);
865	}
866
867	/*
868	* Catch the running transaction.
869	*
870	* It is similar to the above function, the difference is this one
871	* will wait for all the inactive transactions until they fully
872	* complete.
873	*/
874	struct btrfs_trans_handle *
875	btrfs_attach_transaction_barrier(struct btrfs_root *root)
876	{
877	struct btrfs_trans_handle *trans;
878
879	trans = start_transaction(root, num_items: `0`, TRANS_ATTACH,
880	flush: BTRFS_RESERVE_NO_FLUSH, enforce_qgroups: true);
881	if (trans == ERR_PTR(error: -ENOENT)) {
882	int ret;
883
884	ret = btrfs_wait_for_commit(fs_info: root->fs_info, transid: `0`);
885	if (ret)
886	return ERR_PTR(error: ret);
887	}
888
889	return trans;
890	}
891
892	/ Wait for a transaction commit to reach at least the given state. /
893	static noinline void wait_for_commit(struct btrfs_transaction *commit,
894	const enum btrfs_trans_state min_state)
895	{
896	struct btrfs_fs_info *fs_info = commit->fs_info;
897	u64 transid = commit->transid;
898	bool put = false;
899
900	/*
901	* At the moment this function is called with min_state either being
902	* TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
903	*/
904	if (min_state == TRANS_STATE_COMPLETED)
905	btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
906	else
907	btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
908
909	while (`1`) {
910	wait_event(commit->commit_wait, commit->state >= min_state);
911	if (put)
912	btrfs_put_transaction(transaction: commit);
913
914	if (min_state < TRANS_STATE_COMPLETED)
915	break;
916
917	/*
918	* A transaction isn't really completed until all of the
919	* previous transactions are completed, but with fsync we can
920	* end up with SUPER_COMMITTED transactions before a COMPLETED
921	* transaction. Wait for those.
922	*/
923
924	spin_lock(lock: &fs_info->trans_lock);
925	commit = list_first_entry_or_null(&fs_info->trans_list,
926	struct btrfs_transaction,
927	list);
928	if (!commit \|\| commit->transid > transid) {
929	spin_unlock(lock: &fs_info->trans_lock);
930	break;
931	}
932	refcount_inc(r: &commit->use_count);
933	put = true;
934	spin_unlock(lock: &fs_info->trans_lock);
935	}
936	}
937
938	int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
939	{
940	struct btrfs_transaction cur_trans = NULL, t;
941	int ret = `0`;
942
943	if (transid) {
944	if (transid <= btrfs_get_last_trans_committed(fs_info))
945	goto out;
946
947	/ find specified transaction /
948	spin_lock(lock: &fs_info->trans_lock);
949	list_for_each_entry(t, &fs_info->trans_list, list) {
950	if (t->transid == transid) {
951	cur_trans = t;
952	refcount_inc(r: &cur_trans->use_count);
953	ret = `0`;
954	break;
955	}
956	if (t->transid > transid) {
957	ret = `0`;
958	break;
959	}
960	}
961	spin_unlock(lock: &fs_info->trans_lock);
962
963	/*
964	* The specified transaction doesn't exist, or we
965	* raced with btrfs_commit_transaction
966	*/
967	if (!cur_trans) {
968	if (transid > btrfs_get_last_trans_committed(fs_info))
969	ret = -EINVAL;
970	goto out;
971	}
972	} else {
973	/ find newest transaction that is committing \| committed /
974	spin_lock(lock: &fs_info->trans_lock);
975	list_for_each_entry_reverse(t, &fs_info->trans_list,
976	list) {
977	if (t->state >= TRANS_STATE_COMMIT_START) {
978	if (t->state == TRANS_STATE_COMPLETED)
979	break;
980	cur_trans = t;
981	refcount_inc(r: &cur_trans->use_count);
982	break;
983	}
984	}
985	spin_unlock(lock: &fs_info->trans_lock);
986	if (!cur_trans)
987	goto out; / nothing committing\|committed /
988	}
989
990	wait_for_commit(commit: cur_trans, min_state: TRANS_STATE_COMPLETED);
991	ret = cur_trans->aborted;
992	btrfs_put_transaction(transaction: cur_trans);
993	out:
994	return ret;
995	}
996
/* Thin wrapper that forwards to wait_current_trans() for @fs_info. */
void btrfs_throttle(struct btrfs_fs_info *fs_info)
{
	wait_current_trans(fs_info);
}
1001
1002	bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
1003	{
1004	struct btrfs_transaction *cur_trans = trans->transaction;
1005
1006	if (cur_trans->state >= TRANS_STATE_COMMIT_START \|\|
1007	test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
1008	return true;
1009
1010	if (btrfs_check_space_for_delayed_refs(fs_info: trans->fs_info))
1011	return true;
1012
1013	return !!btrfs_block_rsv_check(block_rsv: &trans->fs_info->global_block_rsv, min_percent: `50`);
1014	}
1015
1016	static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
1017
1018	{
1019	struct btrfs_fs_info *fs_info = trans->fs_info;
1020
1021	if (!trans->block_rsv) {
1022	ASSERT(!trans->bytes_reserved);
1023	ASSERT(!trans->delayed_refs_bytes_reserved);
1024	return;
1025	}
1026
1027	if (!trans->bytes_reserved) {
1028	ASSERT(!trans->delayed_refs_bytes_reserved);
1029	return;
1030	}
1031
1032	ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
1033	trace_btrfs_space_reservation(fs_info, type: "transaction",
1034	val: trans->transid, bytes: trans->bytes_reserved, reserve: `0`);
1035	btrfs_block_rsv_release(fs_info, block_rsv: trans->block_rsv,
1036	num_bytes: trans->bytes_reserved, NULL);
1037	trans->bytes_reserved = `0`;
1038
1039	if (!trans->delayed_refs_bytes_reserved)
1040	return;
1041
1042	trace_btrfs_space_reservation(fs_info, type: "local_delayed_refs_rsv",
1043	val: trans->transid,
1044	bytes: trans->delayed_refs_bytes_reserved, reserve: `0`);
1045	btrfs_block_rsv_release(fs_info, block_rsv: &trans->delayed_rsv,
1046	num_bytes: trans->delayed_refs_bytes_reserved, NULL);
1047	trans->delayed_refs_bytes_reserved = `0`;
1048	}
1049
1050	static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
1051	int throttle)
1052	{
1053	struct btrfs_fs_info *info = trans->fs_info;
1054	struct btrfs_transaction *cur_trans = trans->transaction;
1055	int err = `0`;
1056
1057	if (refcount_read(r: &trans->use_count) > `1`) {
1058	refcount_dec(r: &trans->use_count);
1059	trans->block_rsv = trans->orig_rsv;
1060	return `0`;
1061	}
1062
1063	btrfs_trans_release_metadata(trans);
1064	trans->block_rsv = NULL;
1065
1066	btrfs_create_pending_block_groups(trans);
1067
1068	btrfs_trans_release_chunk_metadata(trans);
1069
1070	if (trans->type & __TRANS_FREEZABLE)
1071	sb_end_intwrite(sb: info->sb);
1072
1073	WARN_ON(cur_trans != info->running_transaction);
1074	WARN_ON(atomic_read(&cur_trans->num_writers) < `1`);
1075	atomic_dec(v: &cur_trans->num_writers);
1076	extwriter_counter_dec(trans: cur_trans, type: trans->type);
1077
1078	cond_wake_up(wq: &cur_trans->writer_wait);
1079
1080	btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
1081	btrfs_lockdep_release(info, btrfs_trans_num_writers);
1082
1083	btrfs_put_transaction(transaction: cur_trans);
1084
1085	if (current->journal_info == trans)
1086	current->journal_info = NULL;
1087
1088	if (throttle)
1089	btrfs_run_delayed_iputs(fs_info: info);
1090
1091	if (TRANS_ABORTED(trans) \|\| BTRFS_FS_ERROR(info)) {
1092	wake_up_process(tsk: info->transaction_kthread);
1093	if (TRANS_ABORTED(trans))
1094	err = trans->aborted;
1095	else
1096	err = -EROFS;
1097	}
1098
1099	kmem_cache_free(s: btrfs_trans_handle_cachep, objp: trans);
1100	return err;
1101	}
1102
/* End @trans via __btrfs_end_transaction() with throttle disabled (0). */
int btrfs_end_transaction(struct btrfs_trans_handle *trans)
{
	return __btrfs_end_transaction(trans, 0);
}
1107
/* End @trans via __btrfs_end_transaction() with throttle enabled (1). */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
{
	return __btrfs_end_transaction(trans, 1);
}
1112
1113	/*
1114	* when btree blocks are allocated, they have some corresponding bits set for
1115	* them in one of two extent_io trees. This is used to make sure all of
1116	* those extents are sent to disk but does not wait on them
1117	*/
1118	int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
1119	struct extent_io_tree dirty_pages, int* mark)
1120	{
1121	int err = `0`;
1122	int werr = `0`;
1123	struct address_space *mapping = fs_info->btree_inode->i_mapping;
1124	struct extent_state *cached_state = NULL;
1125	u64 start = `0`;
1126	u64 end;
1127
1128	while (find_first_extent_bit(tree: dirty_pages, start, start_ret: &start, end_ret: &end,
1129	bits: mark, cached_state: &cached_state)) {
1130	bool wait_writeback = false;
1131
1132	err = convert_extent_bit(tree: dirty_pages, start, end,
1133	bits: EXTENT_NEED_WAIT,
1134	clear_bits: mark, cached_state: &cached_state);
1135	/*
1136	* convert_extent_bit can return -ENOMEM, which is most of the
1137	* time a temporary error. So when it happens, ignore the error
1138	* and wait for writeback of this range to finish - because we
1139	* failed to set the bit EXTENT_NEED_WAIT for the range, a call
1140	* to __btrfs_wait_marked_extents() would not know that
1141	* writeback for this range started and therefore wouldn't
1142	* wait for it to finish - we don't want to commit a
1143	* superblock that points to btree nodes/leafs for which
1144	* writeback hasn't finished yet (and without errors).
1145	* We cleanup any entries left in the io tree when committing
1146	* the transaction (through extent_io_tree_release()).
1147	*/
1148	if (err == -ENOMEM) {
1149	err = `0`;
1150	wait_writeback = true;
1151	}
1152	if (!err)
1153	err = filemap_fdatawrite_range(mapping, start, end);
1154	if (err)
1155	werr = err;
1156	else if (wait_writeback)
1157	werr = filemap_fdatawait_range(mapping, lstart: start, lend: end);
1158	free_extent_state(state: cached_state);
1159	cached_state = NULL;
1160	cond_resched();
1161	start = end + `1`;
1162	}
1163	return werr;
1164	}
1165
1166	/*
1167	* when btree blocks are allocated, they have some corresponding bits set for
1168	* them in one of two extent_io trees. This is used to make sure all of
1169	* those extents are on disk for transaction or log commit. We wait
1170	* on all the pages and clear them from the dirty pages state tree
1171	*/
1172	static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
1173	struct extent_io_tree *dirty_pages)
1174	{
1175	int err = `0`;
1176	int werr = `0`;
1177	struct address_space *mapping = fs_info->btree_inode->i_mapping;
1178	struct extent_state *cached_state = NULL;
1179	u64 start = `0`;
1180	u64 end;
1181
1182	while (find_first_extent_bit(tree: dirty_pages, start, start_ret: &start, end_ret: &end,
1183	bits: EXTENT_NEED_WAIT, cached_state: &cached_state)) {
1184	/*
1185	* Ignore -ENOMEM errors returned by clear_extent_bit().
1186	* When committing the transaction, we'll remove any entries
1187	* left in the io tree. For a log commit, we don't remove them
1188	* after committing the log because the tree can be accessed
1189	* concurrently - we do it only at transaction commit time when
1190	* it's safe to do it (through extent_io_tree_release()).
1191	*/
1192	err = clear_extent_bit(tree: dirty_pages, start, end,
1193	bits: EXTENT_NEED_WAIT, cached: &cached_state);
1194	if (err == -ENOMEM)
1195	err = `0`;
1196	if (!err)
1197	err = filemap_fdatawait_range(mapping, lstart: start, lend: end);
1198	if (err)
1199	werr = err;
1200	free_extent_state(state: cached_state);
1201	cached_state = NULL;
1202	cond_resched();
1203	start = end + `1`;
1204	}
1205	if (err)
1206	werr = err;
1207	return werr;
1208	}
1209
1210	static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
1211	struct extent_io_tree *dirty_pages)
1212	{
1213	bool errors = false;
1214	int err;
1215
1216	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
1217	if (test_and_clear_bit(nr: BTRFS_FS_BTREE_ERR, addr: &fs_info->flags))
1218	errors = true;
1219
1220	if (errors && !err)
1221	err = -EIO;
1222	return err;
1223	}
1224
1225	int btrfs_wait_tree_log_extents(struct btrfs_root log_root, int* mark)
1226	{
1227	struct btrfs_fs_info *fs_info = log_root->fs_info;
1228	struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
1229	bool errors = false;
1230	int err;
1231
1232	ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
1233
1234	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
1235	if ((mark & EXTENT_DIRTY) &&
1236	test_and_clear_bit(nr: BTRFS_FS_LOG1_ERR, addr: &fs_info->flags))
1237	errors = true;
1238
1239	if ((mark & EXTENT_NEW) &&
1240	test_and_clear_bit(nr: BTRFS_FS_LOG2_ERR, addr: &fs_info->flags))
1241	errors = true;
1242
1243	if (errors && !err)
1244	err = -EIO;
1245	return err;
1246	}
1247
1248	/*
1249	* When btree blocks are allocated the corresponding extents are marked dirty.
1250	* This function ensures such extents are persisted on disk for transaction or
1251	* log commit.
1252	*
1253	* @trans: transaction whose dirty pages we'd like to write
1254	*/
1255	static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
1256	{
1257	int ret;
1258	int ret2;
1259	struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
1260	struct btrfs_fs_info *fs_info = trans->fs_info;
1261	struct blk_plug plug;
1262
1263	blk_start_plug(&plug);
1264	ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark: EXTENT_DIRTY);
1265	blk_finish_plug(&plug);
1266	ret2 = btrfs_wait_extents(fs_info, dirty_pages);
1267
1268	extent_io_tree_release(tree: &trans->transaction->dirty_pages);
1269
1270	if (ret)
1271	return ret;
1272	else if (ret2)
1273	return ret2;
1274	else
1275	return `0`;
1276	}
1277
1278	/*
1279	* this is used to update the root pointer in the tree of tree roots.
1280	*
1281	* But, in the case of the extent allocation tree, updating the root
1282	* pointer may allocate blocks which may change the root of the extent
1283	* allocation tree.
1284	*
1285	* So, this loops and repeats and makes sure the cowonly root didn't
1286	* change while the root pointer was being updated in the metadata.
1287	*/
1288	static int update_cowonly_root(struct btrfs_trans_handle *trans,
1289	struct btrfs_root *root)
1290	{
1291	int ret;
1292	u64 old_root_bytenr;
1293	u64 old_root_used;
1294	struct btrfs_fs_info *fs_info = root->fs_info;
1295	struct btrfs_root *tree_root = fs_info->tree_root;
1296
1297	old_root_used = btrfs_root_used(s: &root->root_item);
1298
1299	while (`1`) {
1300	old_root_bytenr = btrfs_root_bytenr(s: &root->root_item);
1301	if (old_root_bytenr == root->node->start &&
1302	old_root_used == btrfs_root_used(s: &root->root_item))
1303	break;
1304
1305	btrfs_set_root_node(item: &root->root_item, node: root->node);
1306	ret = btrfs_update_root(trans, root: tree_root,
1307	key: &root->root_key,
1308	item: &root->root_item);
1309	if (ret)
1310	return ret;
1311
1312	old_root_used = btrfs_root_used(s: &root->root_item);
1313	}
1314
1315	return `0`;
1316	}
1317
1318	/*
1319	* update all the cowonly tree roots on disk
1320	*
1321	* The error handling in this function may not be obvious. Any of the
1322	* failures will cause the file system to go offline. We still need
1323	* to clean up the delayed refs.
1324	*/
1325	static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
1326	{
1327	struct btrfs_fs_info *fs_info = trans->fs_info;
1328	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
1329	struct list_head *io_bgs = &trans->transaction->io_bgs;
1330	struct list_head *next;
1331	struct extent_buffer *eb;
1332	int ret;
1333
1334	/*
1335	* At this point no one can be using this transaction to modify any tree
1336	* and no one can start another transaction to modify any tree either.
1337	*/
1338	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
1339
1340	eb = btrfs_lock_root_node(root: fs_info->tree_root);
1341	ret = btrfs_cow_block(trans, root: fs_info->tree_root, buf: eb, NULL,
1342	parent_slot: `0`, cow_ret: &eb, nest: BTRFS_NESTING_COW);
1343	btrfs_tree_unlock(eb);
1344	free_extent_buffer(eb);
1345
1346	if (ret)
1347	return ret;
1348
1349	ret = btrfs_run_dev_stats(trans);
1350	if (ret)
1351	return ret;
1352	ret = btrfs_run_dev_replace(trans);
1353	if (ret)
1354	return ret;
1355	ret = btrfs_run_qgroups(trans);
1356	if (ret)
1357	return ret;
1358
1359	ret = btrfs_setup_space_cache(trans);
1360	if (ret)
1361	return ret;
1362
1363	again:
1364	while (!list_empty(head: &fs_info->dirty_cowonly_roots)) {
1365	struct btrfs_root *root;
1366	next = fs_info->dirty_cowonly_roots.next;
1367	list_del_init(entry: next);
1368	root = list_entry(next, struct btrfs_root, dirty_list);
1369	clear_bit(nr: BTRFS_ROOT_DIRTY, addr: &root->state);
1370
1371	list_add_tail(new: &root->dirty_list,
1372	head: &trans->transaction->switch_commits);
1373	ret = update_cowonly_root(trans, root);
1374	if (ret)
1375	return ret;
1376	}
1377
1378	/ Now flush any delayed refs generated by updating all of the roots /
1379	ret = btrfs_run_delayed_refs(trans, U64_MAX);
1380	if (ret)
1381	return ret;
1382
1383	while (!list_empty(head: dirty_bgs) \|\| !list_empty(head: io_bgs)) {
1384	ret = btrfs_write_dirty_block_groups(trans);
1385	if (ret)
1386	return ret;
1387
1388	/*
1389	* We're writing the dirty block groups, which could generate
1390	* delayed refs, which could generate more dirty block groups,
1391	* so we want to keep this flushing in this loop to make sure
1392	* everything gets run.
1393	*/
1394	ret = btrfs_run_delayed_refs(trans, U64_MAX);
1395	if (ret)
1396	return ret;
1397	}
1398
1399	if (!list_empty(head: &fs_info->dirty_cowonly_roots))
1400	goto again;
1401
1402	/ Update dev-replace pointer once everything is committed /
1403	fs_info->dev_replace.committed_cursor_left =
1404	fs_info->dev_replace.cursor_left_last_write_of_item;
1405
1406	return `0`;
1407	}
1408
1409	/*
1410	* If we had a pending drop we need to see if there are any others left in our
1411	* dead roots list, and if not clear our bit and wake any waiters.
1412	*/
1413	void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
1414	{
1415	/*
1416	* We put the drop in progress roots at the front of the list, so if the
1417	* first entry doesn't have UNFINISHED_DROP set we can wake everybody
1418	* up.
1419	*/
1420	spin_lock(lock: &fs_info->trans_lock);
1421	if (!list_empty(head: &fs_info->dead_roots)) {
1422	struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
1423	struct btrfs_root,
1424	root_list);
1425	if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
1426	spin_unlock(lock: &fs_info->trans_lock);
1427	return;
1428	}
1429	}
1430	spin_unlock(lock: &fs_info->trans_lock);
1431
1432	btrfs_wake_unfinished_drop(fs_info);
1433	}
1434
1435	/*
1436	* dead roots are old snapshots that need to be deleted. This allocates
1437	* a dirty root struct and adds it into the list of dead roots that need to
1438	* be deleted
1439	*/
1440	void btrfs_add_dead_root(struct btrfs_root *root)
1441	{
1442	struct btrfs_fs_info *fs_info = root->fs_info;
1443
1444	spin_lock(lock: &fs_info->trans_lock);
1445	if (list_empty(head: &root->root_list)) {
1446	btrfs_grab_root(root);
1447
1448	/ We want to process the partially complete drops first. /
1449	if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
1450	list_add(new: &root->root_list, head: &fs_info->dead_roots);
1451	else
1452	list_add_tail(new: &root->root_list, head: &fs_info->dead_roots);
1453	}
1454	spin_unlock(lock: &fs_info->trans_lock);
1455	}
1456
1457	/*
1458	* Update each subvolume root and its relocation root, if it exists, in the tree
1459	* of tree roots. Also free log roots if they exist.
1460	*/
1461	static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
1462	{
1463	struct btrfs_fs_info *fs_info = trans->fs_info;
1464	struct btrfs_root *gang[`8`];
1465	int i;
1466	int ret;
1467
1468	/*
1469	* At this point no one can be using this transaction to modify any tree
1470	* and no one can start another transaction to modify any tree either.
1471	*/
1472	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
1473
1474	spin_lock(lock: &fs_info->fs_roots_radix_lock);
1475	while (`1`) {
1476	ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
1477	results: (void **)gang, first_index: `0`,
1478	ARRAY_SIZE(gang),
1479	BTRFS_ROOT_TRANS_TAG);
1480	if (ret == `0`)
1481	break;
1482	for (i = `0`; i < ret; i++) {
1483	struct btrfs_root *root = gang[i];
1484	int ret2;
1485
1486	/*
1487	* At this point we can neither have tasks logging inodes
1488	* from a root nor trying to commit a log tree.
1489	*/
1490	ASSERT(atomic_read(&root->log_writers) == `0`);
1491	ASSERT(atomic_read(&root->log_commit[`0`]) == `0`);
1492	ASSERT(atomic_read(&root->log_commit[`1`]) == `0`);
1493
1494	radix_tree_tag_clear(&fs_info->fs_roots_radix,
1495	index: (unsigned long)root->root_key.objectid,
1496	BTRFS_ROOT_TRANS_TAG);
1497	btrfs_qgroup_free_meta_all_pertrans(root);
1498	spin_unlock(lock: &fs_info->fs_roots_radix_lock);
1499
1500	btrfs_free_log(trans, root);
1501	ret2 = btrfs_update_reloc_root(trans, root);
1502	if (ret2)
1503	return ret2;
1504
1505	/ see comments in should_cow_block() /
1506	clear_bit(nr: BTRFS_ROOT_FORCE_COW, addr: &root->state);
1507	smp_mb__after_atomic();
1508
1509	if (root->commit_root != root->node) {
1510	list_add_tail(new: &root->dirty_list,
1511	head: &trans->transaction->switch_commits);
1512	btrfs_set_root_node(item: &root->root_item,
1513	node: root->node);
1514	}
1515
1516	ret2 = btrfs_update_root(trans, root: fs_info->tree_root,
1517	key: &root->root_key,
1518	item: &root->root_item);
1519	if (ret2)
1520	return ret2;
1521	spin_lock(lock: &fs_info->fs_roots_radix_lock);
1522	}
1523	}
1524	spin_unlock(lock: &fs_info->fs_roots_radix_lock);
1525	return `0`;
1526	}
1527
1528	/*
1529	* Do all special snapshot related qgroup dirty hack.
1530	*
1531	* Will do all needed qgroup inherit and dirty hack like switch commit
1532	* roots inside one transaction and write all btree into disk, to make
1533	* qgroup works.
1534	*/
1535	static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
1536	struct btrfs_root *src,
1537	struct btrfs_root *parent,
1538	struct btrfs_qgroup_inherit *inherit,
1539	u64 dst_objectid)
1540	{
1541	struct btrfs_fs_info *fs_info = src->fs_info;
1542	int ret;
1543
1544	/*
1545	* Save some performance in the case that qgroups are not enabled. If
1546	* this check races with the ioctl, rescan will kick in anyway.
1547	*/
1548	if (!btrfs_qgroup_full_accounting(fs_info))
1549	return `0`;
1550
1551	/*
1552	* Ensure dirty @src will be committed. Or, after coming
1553	* commit_fs_roots() and switch_commit_roots(), any dirty but not
1554	* recorded root will never be updated again, causing an outdated root
1555	* item.
1556	*/
1557	ret = record_root_in_trans(trans, root: src, force: `1`);
1558	if (ret)
1559	return ret;
1560
1561	/*
1562	* btrfs_qgroup_inherit relies on a consistent view of the usage for the
1563	* src root, so we must run the delayed refs here.
1564	*
1565	* However this isn't particularly fool proof, because there's no
1566	* synchronization keeping us from changing the tree after this point
1567	* before we do the qgroup_inherit, or even from making changes while
1568	* we're doing the qgroup_inherit. But that's a problem for the future,
1569	* for now flush the delayed refs to narrow the race window where the
1570	* qgroup counters could end up wrong.
1571	*/
1572	ret = btrfs_run_delayed_refs(trans, U64_MAX);
1573	if (ret) {
1574	btrfs_abort_transaction(trans, ret);
1575	return ret;
1576	}
1577
1578	ret = commit_fs_roots(trans);
1579	if (ret)
1580	goto out;
1581	ret = btrfs_qgroup_account_extents(trans);
1582	if (ret < `0`)
1583	goto out;
1584
1585	/ Now qgroup are all updated, we can inherit it to new qgroups /
1586	ret = btrfs_qgroup_inherit(trans, srcid: src->root_key.objectid, objectid: dst_objectid,
1587	inode_rootid: parent->root_key.objectid, inherit);
1588	if (ret < `0`)
1589	goto out;
1590
1591	/*
1592	* Now we do a simplified commit transaction, which will:
1593	* 1) commit all subvolume and extent tree
1594	* To ensure all subvolume and extent tree have a valid
1595	* commit_root to accounting later insert_dir_item()
1596	* 2) write all btree blocks onto disk
1597	* This is to make sure later btree modification will be cowed
1598	* Or commit_root can be populated and cause wrong qgroup numbers
1599	* In this simplified commit, we don't really care about other trees
1600	* like chunk and root tree, as they won't affect qgroup.
1601	* And we don't write super to avoid half committed status.
1602	*/
1603	ret = commit_cowonly_roots(trans);
1604	if (ret)
1605	goto out;
1606	switch_commit_roots(trans);
1607	ret = btrfs_write_and_wait_transaction(trans);
1608	if (ret)
1609	btrfs_handle_fs_error(fs_info, ret,
1610	"Error while writing out transaction for qgroup");
1611
1612	out:
1613	/*
1614	* Force parent root to be updated, as we recorded it before so its
1615	* last_trans == cur_transid.
1616	* Or it won't be committed again onto disk after later
1617	* insert_dir_item()
1618	*/
1619	if (!ret)
1620	ret = record_root_in_trans(trans, root: parent, force: `1`);
1621	return ret;
1622	}
1623
1624	/*
1625	* new snapshots need to be created at a very specific time in the
1626	* transaction commit. This does the actual creation.
1627	*
1628	* Note:
1629	* If the error which may affect the commitment of the current transaction
1630	* happens, we should return the error number. If the error which just affect
1631	* the creation of the pending snapshots, just return 0.
1632	*/
1633	static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1634	struct btrfs_pending_snapshot *pending)
1635	{
1636
1637	struct btrfs_fs_info *fs_info = trans->fs_info;
1638	struct btrfs_key key;
1639	struct btrfs_root_item *new_root_item;
1640	struct btrfs_root *tree_root = fs_info->tree_root;
1641	struct btrfs_root *root = pending->root;
1642	struct btrfs_root *parent_root;
1643	struct btrfs_block_rsv *rsv;
1644	struct inode *parent_inode = pending->dir;
1645	struct btrfs_path *path;
1646	struct btrfs_dir_item *dir_item;
1647	struct extent_buffer *tmp;
1648	struct extent_buffer *old;
1649	struct timespec64 cur_time;
1650	int ret = `0`;
1651	u64 to_reserve = `0`;
1652	u64 index = `0`;
1653	u64 objectid;
1654	u64 root_flags;
1655	unsigned int nofs_flags;
1656	struct fscrypt_name fname;
1657
1658	ASSERT(pending->path);
1659	path = pending->path;
1660
1661	ASSERT(pending->root_item);
1662	new_root_item = pending->root_item;
1663
1664	/*
1665	* We're inside a transaction and must make sure that any potential
1666	* allocations with GFP_KERNEL in fscrypt won't recurse back to
1667	* filesystem.
1668	*/
1669	nofs_flags = memalloc_nofs_save();
1670	pending->error = fscrypt_setup_filename(inode: parent_inode,
1671	iname: &pending->dentry->d_name, lookup: `0`,
1672	fname: &fname);
1673	memalloc_nofs_restore(flags: nofs_flags);
1674	if (pending->error)
1675	goto free_pending;
1676
1677	pending->error = btrfs_get_free_objectid(root: tree_root, objectid: &objectid);
1678	if (pending->error)
1679	goto free_fname;
1680
1681	/*
1682	* Make qgroup to skip current new snapshot's qgroupid, as it is
1683	* accounted by later btrfs_qgroup_inherit().
1684	*/
1685	btrfs_set_skip_qgroup(trans, qgroupid: objectid);
1686
1687	btrfs_reloc_pre_snapshot(pending, bytes_to_reserve: &to_reserve);
1688
1689	if (to_reserve > `0`) {
1690	pending->error = btrfs_block_rsv_add(fs_info,
1691	block_rsv: &pending->block_rsv,
1692	num_bytes: to_reserve,
1693	flush: BTRFS_RESERVE_NO_FLUSH);
1694	if (pending->error)
1695	goto clear_skip_qgroup;
1696	}
1697
1698	key.objectid = objectid;
1699	key.offset = (u64)-`1`;
1700	key.type = BTRFS_ROOT_ITEM_KEY;
1701
1702	rsv = trans->block_rsv;
1703	trans->block_rsv = &pending->block_rsv;
1704	trans->bytes_reserved = trans->block_rsv->reserved;
1705	trace_btrfs_space_reservation(fs_info, type: "transaction",
1706	val: trans->transid,
1707	bytes: trans->bytes_reserved, reserve: `1`);
1708	parent_root = BTRFS_I(inode: parent_inode)->root;
1709	ret = record_root_in_trans(trans, root: parent_root, force: `0`);
1710	if (ret)
1711	goto fail;
1712	cur_time = current_time(inode: parent_inode);
1713
1714	/*
1715	* insert the directory item
1716	*/
1717	ret = btrfs_set_inode_index(dir: BTRFS_I(inode: parent_inode), index: &index);
1718	if (ret) {
1719	btrfs_abort_transaction(trans, ret);
1720	goto fail;
1721	}
1722
1723	/ check if there is a file/dir which has the same name. /
1724	dir_item = btrfs_lookup_dir_item(NULL, root: parent_root, path,
1725	dir: btrfs_ino(inode: BTRFS_I(inode: parent_inode)),
1726	name: &fname.disk_name, mod: `0`);
1727	if (dir_item != NULL && !IS_ERR(ptr: dir_item)) {
1728	pending->error = -EEXIST;
1729	goto dir_item_existed;
1730	} else if (IS_ERR(ptr: dir_item)) {
1731	ret = PTR_ERR(ptr: dir_item);
1732	btrfs_abort_transaction(trans, ret);
1733	goto fail;
1734	}
1735	btrfs_release_path(p: path);
1736
1737	ret = btrfs_create_qgroup(trans, qgroupid: objectid);
1738	if (ret && ret != -EEXIST) {
1739	btrfs_abort_transaction(trans, ret);
1740	goto fail;
1741	}
1742
1743	/*
1744	* pull in the delayed directory update
1745	* and the delayed inode item
1746	* otherwise we corrupt the FS during
1747	* snapshot
1748	*/
1749	ret = btrfs_run_delayed_items(trans);
1750	if (ret) { / Transaction aborted /
1751	btrfs_abort_transaction(trans, ret);
1752	goto fail;
1753	}
1754
1755	ret = record_root_in_trans(trans, root, force: `0`);
1756	if (ret) {
1757	btrfs_abort_transaction(trans, ret);
1758	goto fail;
1759	}
1760	btrfs_set_root_last_snapshot(s: &root->root_item, val: trans->transid);
1761	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
1762	btrfs_check_and_init_root_item(item: new_root_item);
1763
1764	root_flags = btrfs_root_flags(s: new_root_item);
1765	if (pending->readonly)
1766	root_flags \|= BTRFS_ROOT_SUBVOL_RDONLY;
1767	else
1768	root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
1769	btrfs_set_root_flags(s: new_root_item, val: root_flags);
1770
1771	btrfs_set_root_generation_v2(s: new_root_item,
1772	val: trans->transid);
1773	generate_random_guid(guid: new_root_item->uuid);
1774	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
1775	BTRFS_UUID_SIZE);
1776	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
1777	memset(new_root_item->received_uuid, `0`,
1778	sizeof(new_root_item->received_uuid));
1779	memset(&new_root_item->stime, `0`, sizeof(new_root_item->stime));
1780	memset(&new_root_item->rtime, `0`, sizeof(new_root_item->rtime));
1781	btrfs_set_root_stransid(s: new_root_item, val: `0`);
1782	btrfs_set_root_rtransid(s: new_root_item, val: `0`);
1783	}
1784	btrfs_set_stack_timespec_sec(s: &new_root_item->otime, val: cur_time.tv_sec);
1785	btrfs_set_stack_timespec_nsec(s: &new_root_item->otime, val: cur_time.tv_nsec);
1786	btrfs_set_root_otransid(s: new_root_item, val: trans->transid);
1787
1788	old = btrfs_lock_root_node(root);
1789	ret = btrfs_cow_block(trans, root, buf: old, NULL, parent_slot: `0`, cow_ret: &old,
1790	nest: BTRFS_NESTING_COW);
1791	if (ret) {
1792	btrfs_tree_unlock(eb: old);
1793	free_extent_buffer(eb: old);
1794	btrfs_abort_transaction(trans, ret);
1795	goto fail;
1796	}
1797
1798	ret = btrfs_copy_root(trans, root, buf: old, cow_ret: &tmp, new_root_objectid: objectid);
1799	/ clean up in any case /
1800	btrfs_tree_unlock(eb: old);
1801	free_extent_buffer(eb: old);
1802	if (ret) {
1803	btrfs_abort_transaction(trans, ret);
1804	goto fail;
1805	}
1806	/ see comments in should_cow_block() /
1807	set_bit(nr: BTRFS_ROOT_FORCE_COW, addr: &root->state);
1808	smp_wmb();
1809
1810	btrfs_set_root_node(item: new_root_item, node: tmp);
1811	/ record when the snapshot was created in key.offset /
1812	key.offset = trans->transid;
1813	ret = btrfs_insert_root(trans, root: tree_root, key: &key, item: new_root_item);
1814	btrfs_tree_unlock(eb: tmp);
1815	free_extent_buffer(eb: tmp);
1816	if (ret) {
1817	btrfs_abort_transaction(trans, ret);
1818	goto fail;
1819	}
1820
1821	/*
1822	* insert root back/forward references
1823	*/
1824	ret = btrfs_add_root_ref(trans, root_id: objectid,
1825	ref_id: parent_root->root_key.objectid,
1826	dirid: btrfs_ino(inode: BTRFS_I(inode: parent_inode)), sequence: index,
1827	name: &fname.disk_name);
1828	if (ret) {
1829	btrfs_abort_transaction(trans, ret);
1830	goto fail;
1831	}
1832
1833	key.offset = (u64)-`1`;
1834	pending->snap = btrfs_get_new_fs_root(fs_info, objectid, anon_dev: &pending->anon_dev);
1835	if (IS_ERR(ptr: pending->snap)) {
1836	ret = PTR_ERR(ptr: pending->snap);
1837	pending->snap = NULL;
1838	btrfs_abort_transaction(trans, ret);
1839	goto fail;
1840	}
1841
1842	ret = btrfs_reloc_post_snapshot(trans, pending);
1843	if (ret) {
1844	btrfs_abort_transaction(trans, ret);
1845	goto fail;
1846	}
1847
1848	/*
1849	* Do special qgroup accounting for snapshot, as we do some qgroup
1850	* snapshot hack to do fast snapshot.
1851	* To co-operate with that hack, we do hack again.
1852	* Or snapshot will be greatly slowed down by a subtree qgroup rescan
1853	*/
1854	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
1855	ret = qgroup_account_snapshot(trans, src: root, parent: parent_root,
1856	inherit: pending->inherit, dst_objectid: objectid);
1857	else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
1858	ret = btrfs_qgroup_inherit(trans, srcid: root->root_key.objectid, objectid,
1859	inode_rootid: parent_root->root_key.objectid, inherit: pending->inherit);
1860	if (ret < `0`)
1861	goto fail;
1862
1863	ret = btrfs_insert_dir_item(trans, name: &fname.disk_name,
1864	dir: BTRFS_I(inode: parent_inode), location: &key, BTRFS_FT_DIR,
1865	index);
1866	/ We have check then name at the beginning, so it is impossible. /
1867	BUG_ON(ret == -EEXIST \|\| ret == -EOVERFLOW);
1868	if (ret) {
1869	btrfs_abort_transaction(trans, ret);
1870	goto fail;
1871	}
1872
1873	btrfs_i_size_write(inode: BTRFS_I(inode: parent_inode), size: parent_inode->i_size +
1874	fname.disk_name.len * `2`);
1875	inode_set_mtime_to_ts(inode: parent_inode,
1876	ts: inode_set_ctime_current(inode: parent_inode));
1877	ret = btrfs_update_inode_fallback(trans, inode: BTRFS_I(inode: parent_inode));
1878	if (ret) {
1879	btrfs_abort_transaction(trans, ret);
1880	goto fail;
1881	}
1882	ret = btrfs_uuid_tree_add(trans, uuid: new_root_item->uuid,
1883	BTRFS_UUID_KEY_SUBVOL,
1884	subid: objectid);
1885	if (ret) {
1886	btrfs_abort_transaction(trans, ret);
1887	goto fail;
1888	}
1889	if (!btrfs_is_empty_uuid(uuid: new_root_item->received_uuid)) {
1890	ret = btrfs_uuid_tree_add(trans, uuid: new_root_item->received_uuid,
1891	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
1892	subid: objectid);
1893	if (ret && ret != -EEXIST) {
1894	btrfs_abort_transaction(trans, ret);
1895	goto fail;
1896	}
1897	}
1898
1899	fail:
1900	pending->error = ret;
1901	dir_item_existed:
1902	trans->block_rsv = rsv;
1903	trans->bytes_reserved = `0`;
1904	clear_skip_qgroup:
1905	btrfs_clear_skip_qgroup(trans);
1906	free_fname:
1907	fscrypt_free_filename(fname: &fname);
1908	free_pending:
1909	kfree(objp: new_root_item);
1910	pending->root_item = NULL;
1911	btrfs_free_path(p: path);
1912	pending->path = NULL;
1913
1914	return ret;
1915	}
1916
1917	/*
1918	* create all the snapshots we've scheduled for creation
1919	*/
1920	static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
1921	{
1922	struct btrfs_pending_snapshot pending, next;
1923	struct list_head *head = &trans->transaction->pending_snapshots;
1924	int ret = `0`;
1925
1926	list_for_each_entry_safe(pending, next, head, list) {
1927	list_del(entry: &pending->list);
1928	ret = create_pending_snapshot(trans, pending);
1929	if (ret)
1930	break;
1931	}
1932	return ret;
1933	}
1934
1935	static void update_super_roots(struct btrfs_fs_info *fs_info)
1936	{
1937	struct btrfs_root_item *root_item;
1938	struct btrfs_super_block *super;
1939
1940	super = fs_info->super_copy;
1941
1942	root_item = &fs_info->chunk_root->root_item;
1943	super->chunk_root = root_item->bytenr;
1944	super->chunk_root_generation = root_item->generation;
1945	super->chunk_root_level = root_item->level;
1946
1947	root_item = &fs_info->tree_root->root_item;
1948	super->root = root_item->bytenr;
1949	super->generation = root_item->generation;
1950	super->root_level = root_item->level;
1951	if (btrfs_test_opt(fs_info, SPACE_CACHE))
1952	super->cache_generation = root_item->generation;
1953	else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
1954	super->cache_generation = `0`;
1955	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
1956	super->uuid_tree_generation = root_item->generation;
1957	}
1958
1959	int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1960	{
1961	struct btrfs_transaction *trans;
1962	int ret = `0`;
1963
1964	spin_lock(lock: &info->trans_lock);
1965	trans = info->running_transaction;
1966	if (trans)
1967	ret = is_transaction_blocked(trans);
1968	spin_unlock(lock: &info->trans_lock);
1969	return ret;
1970	}
1971
1972	void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
1973	{
1974	struct btrfs_fs_info *fs_info = trans->fs_info;
1975	struct btrfs_transaction *cur_trans;
1976
1977	/ Kick the transaction kthread. /
1978	set_bit(nr: BTRFS_FS_COMMIT_TRANS, addr: &fs_info->flags);
1979	wake_up_process(tsk: fs_info->transaction_kthread);
1980
1981	/ take transaction reference /
1982	cur_trans = trans->transaction;
1983	refcount_inc(r: &cur_trans->use_count);
1984
1985	btrfs_end_transaction(trans);
1986
1987	/*
1988	* Wait for the current transaction commit to start and block
1989	* subsequent transaction joins
1990	*/
1991	btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
1992	wait_event(fs_info->transaction_blocked_wait,
1993	cur_trans->state >= TRANS_STATE_COMMIT_START \|\|
1994	TRANS_ABORTED(cur_trans));
1995	btrfs_put_transaction(transaction: cur_trans);
1996	}
1997
1998	static void cleanup_transaction(struct btrfs_trans_handle trans, int* err)
1999	{
2000	struct btrfs_fs_info *fs_info = trans->fs_info;
2001	struct btrfs_transaction *cur_trans = trans->transaction;
2002
2003	WARN_ON(refcount_read(&trans->use_count) > `1`);
2004
2005	btrfs_abort_transaction(trans, err);
2006
2007	spin_lock(lock: &fs_info->trans_lock);
2008
2009	/*
2010	* If the transaction is removed from the list, it means this
2011	* transaction has been committed successfully, so it is impossible
2012	* to call the cleanup function.
2013	*/
2014	BUG_ON(list_empty(&cur_trans->list));
2015
2016	if (cur_trans == fs_info->running_transaction) {
2017	cur_trans->state = TRANS_STATE_COMMIT_DOING;
2018	spin_unlock(lock: &fs_info->trans_lock);
2019
2020	/*
2021	* The thread has already released the lockdep map as reader
2022	* already in btrfs_commit_transaction().
2023	*/
2024	btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
2025	wait_event(cur_trans->writer_wait,
2026	atomic_read(&cur_trans->num_writers) == `1`);
2027
2028	spin_lock(lock: &fs_info->trans_lock);
2029	}
2030
2031	/*
2032	* Now that we know no one else is still using the transaction we can
2033	* remove the transaction from the list of transactions. This avoids
2034	* the transaction kthread from cleaning up the transaction while some
2035	* other task is still using it, which could result in a use-after-free
2036	* on things like log trees, as it forces the transaction kthread to
2037	* wait for this transaction to be cleaned up by us.
2038	*/
2039	list_del_init(entry: &cur_trans->list);
2040
2041	spin_unlock(lock: &fs_info->trans_lock);
2042
2043	btrfs_cleanup_one_transaction(trans: trans->transaction, fs_info);
2044
2045	spin_lock(lock: &fs_info->trans_lock);
2046	if (cur_trans == fs_info->running_transaction)
2047	fs_info->running_transaction = NULL;
2048	spin_unlock(lock: &fs_info->trans_lock);
2049
2050	if (trans->type & __TRANS_FREEZABLE)
2051	sb_end_intwrite(sb: fs_info->sb);
2052	btrfs_put_transaction(transaction: cur_trans);
2053	btrfs_put_transaction(transaction: cur_trans);
2054
2055	trace_btrfs_transaction_commit(fs_info);
2056
2057	if (current->journal_info == trans)
2058	current->journal_info = NULL;
2059
2060	/*
2061	* If relocation is running, we can't cancel scrub because that will
2062	* result in a deadlock. Before relocating a block group, relocation
2063	* pauses scrub, then starts and commits a transaction before unpausing
2064	* scrub. If the transaction commit is being done by the relocation
2065	* task or triggered by another task and the relocation task is waiting
2066	* for the commit, and we end up here due to an error in the commit
2067	* path, then calling btrfs_scrub_cancel() will deadlock, as we are
2068	* asking for scrub to stop while having it asked to be paused higher
2069	* above in relocation code.
2070	*/
2071	if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
2072	btrfs_scrub_cancel(info: fs_info);
2073
2074	kmem_cache_free(s: btrfs_trans_handle_cachep, objp: trans);
2075	}
2076
2077	/*
2078	* Release reserved delayed ref space of all pending block groups of the
2079	* transaction and remove them from the list
2080	*/
2081	static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
2082	{
2083	struct btrfs_fs_info *fs_info = trans->fs_info;
2084	struct btrfs_block_group block_group, tmp;
2085
2086	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
2087	btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
2088	list_del_init(entry: &block_group->bg_list);
2089	}
2090	}
2091
2092	static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
2093	{
2094	/*
2095	* We use try_to_writeback_inodes_sb() here because if we used
2096	* btrfs_start_delalloc_roots we would deadlock with fs freeze.
2097	* Currently are holding the fs freeze lock, if we do an async flush
2098	* we'll do btrfs_join_transaction() and deadlock because we need to
2099	* wait for the fs freeze lock. Using the direct flushing we benefit
2100	* from already being in a transaction and our join_transaction doesn't
2101	* have to re-take the fs freeze lock.
2102	*
2103	* Note that try_to_writeback_inodes_sb() will only trigger writeback
2104	* if it can read lock sb->s_umount. It will always be able to lock it,
2105	* except when the filesystem is being unmounted or being frozen, but in
2106	* those cases sync_filesystem() is called, which results in calling
2107	* writeback_inodes_sb() while holding a write lock on sb->s_umount.
2108	* Note that we don't call writeback_inodes_sb() directly, because it
2109	* will emit a warning if sb->s_umount is not locked.
2110	*/
2111	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
2112	try_to_writeback_inodes_sb(sb: fs_info->sb, reason: WB_REASON_SYNC);
2113	return `0`;
2114	}
2115
2116	static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
2117	{
2118	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
2119	btrfs_wait_ordered_roots(fs_info, U64_MAX, range_start: `0`, range_len: (u64)-`1`);
2120	}
2121
2122	/*
2123	* Add a pending snapshot associated with the given transaction handle to the
2124	* respective handle. This must be called after the transaction commit started
2125	* and while holding fs_info->trans_lock.
2126	* This serves to guarantee a caller of btrfs_commit_transaction() that it can
2127	* safely free the pending snapshot pointer in case btrfs_commit_transaction()
2128	* returns an error.
2129	*/
2130	static void add_pending_snapshot(struct btrfs_trans_handle *trans)
2131	{
2132	struct btrfs_transaction *cur_trans = trans->transaction;
2133
2134	if (!trans->pending_snapshot)
2135	return;
2136
2137	lockdep_assert_held(&trans->fs_info->trans_lock);
2138	ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP);
2139
2140	list_add(new: &trans->pending_snapshot->list, head: &cur_trans->pending_snapshots);
2141	}
2142
2143	static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
2144	{
2145	fs_info->commit_stats.commit_count++;
2146	fs_info->commit_stats.last_commit_dur = interval;
2147	fs_info->commit_stats.max_commit_dur =
2148	max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
2149	fs_info->commit_stats.total_commit_dur += interval;
2150	}
2151
2152	int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
2153	{
2154	struct btrfs_fs_info *fs_info = trans->fs_info;
2155	struct btrfs_transaction *cur_trans = trans->transaction;
2156	struct btrfs_transaction *prev_trans = NULL;
2157	int ret;
2158	ktime_t start_time;
2159	ktime_t interval;
2160
2161	ASSERT(refcount_read(&trans->use_count) == `1`);
2162	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2163
2164	clear_bit(nr: BTRFS_FS_NEED_TRANS_COMMIT, addr: &fs_info->flags);
2165
2166	/ Stop the commit early if ->aborted is set /
2167	if (TRANS_ABORTED(cur_trans)) {
2168	ret = cur_trans->aborted;
2169	goto lockdep_trans_commit_start_release;
2170	}
2171
2172	btrfs_trans_release_metadata(trans);
2173	trans->block_rsv = NULL;
2174
2175	/*
2176	* We only want one transaction commit doing the flushing so we do not
2177	* waste a bunch of time on lock contention on the extent root node.
2178	*/
2179	if (!test_and_set_bit(nr: BTRFS_DELAYED_REFS_FLUSHING,
2180	addr: &cur_trans->delayed_refs.flags)) {
2181	/*
2182	* Make a pass through all the delayed refs we have so far.
2183	* Any running threads may add more while we are here.
2184	*/
2185	ret = btrfs_run_delayed_refs(trans, min_bytes: `0`);
2186	if (ret)
2187	goto lockdep_trans_commit_start_release;
2188	}
2189
2190	btrfs_create_pending_block_groups(trans);
2191
2192	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
2193	int run_it = `0`;
2194
2195	/ this mutex is also taken before trying to set*
2196	* block groups readonly. We need to make sure
2197	* that nobody has set a block group readonly
2198	* after a extents from that block group have been
2199	* allocated for cache files. btrfs_set_block_group_ro
2200	* will wait for the transaction to commit if it
2201	* finds BTRFS_TRANS_DIRTY_BG_RUN set.
2202	*
2203	* The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
2204	* only one process starts all the block group IO. It wouldn't
2205	* hurt to have more than one go through, but there's no
2206	* real advantage to it either.
2207	*/
2208	mutex_lock(&fs_info->ro_block_group_mutex);
2209	if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
2210	addr: &cur_trans->flags))
2211	run_it = `1`;
2212	mutex_unlock(lock: &fs_info->ro_block_group_mutex);
2213
2214	if (run_it) {
2215	ret = btrfs_start_dirty_block_groups(trans);
2216	if (ret)
2217	goto lockdep_trans_commit_start_release;
2218	}
2219	}
2220
2221	spin_lock(lock: &fs_info->trans_lock);
2222	if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) {
2223	enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
2224
2225	add_pending_snapshot(trans);
2226
2227	spin_unlock(lock: &fs_info->trans_lock);
2228	refcount_inc(r: &cur_trans->use_count);
2229
2230	if (trans->in_fsync)
2231	want_state = TRANS_STATE_SUPER_COMMITTED;
2232
2233	btrfs_trans_state_lockdep_release(fs_info,
2234	BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2235	ret = btrfs_end_transaction(trans);
2236	wait_for_commit(commit: cur_trans, min_state: want_state);
2237
2238	if (TRANS_ABORTED(cur_trans))
2239	ret = cur_trans->aborted;
2240
2241	btrfs_put_transaction(transaction: cur_trans);
2242
2243	return ret;
2244	}
2245
2246	cur_trans->state = TRANS_STATE_COMMIT_PREP;
2247	wake_up(&fs_info->transaction_blocked_wait);
2248	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2249
2250	if (cur_trans->list.prev != &fs_info->trans_list) {
2251	enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
2252
2253	if (trans->in_fsync)
2254	want_state = TRANS_STATE_SUPER_COMMITTED;
2255
2256	prev_trans = list_entry(cur_trans->list.prev,
2257	struct btrfs_transaction, list);
2258	if (prev_trans->state < want_state) {
2259	refcount_inc(r: &prev_trans->use_count);
2260	spin_unlock(lock: &fs_info->trans_lock);
2261
2262	wait_for_commit(commit: prev_trans, min_state: want_state);
2263
2264	ret = READ_ONCE(prev_trans->aborted);
2265
2266	btrfs_put_transaction(transaction: prev_trans);
2267	if (ret)
2268	goto lockdep_release;
2269	spin_lock(lock: &fs_info->trans_lock);
2270	}
2271	} else {
2272	/*
2273	* The previous transaction was aborted and was already removed
2274	* from the list of transactions at fs_info->trans_list. So we
2275	* abort to prevent writing a new superblock that reflects a
2276	* corrupt state (pointing to trees with unwritten nodes/leafs).
2277	*/
2278	if (BTRFS_FS_ERROR(fs_info)) {
2279	spin_unlock(lock: &fs_info->trans_lock);
2280	ret = -EROFS;
2281	goto lockdep_release;
2282	}
2283	}
2284
2285	cur_trans->state = TRANS_STATE_COMMIT_START;
2286	wake_up(&fs_info->transaction_blocked_wait);
2287	spin_unlock(lock: &fs_info->trans_lock);
2288
2289	/*
2290	* Get the time spent on the work done by the commit thread and not
2291	* the time spent waiting on a previous commit
2292	*/
2293	start_time = ktime_get_ns();
2294
2295	extwriter_counter_dec(trans: cur_trans, type: trans->type);
2296
2297	ret = btrfs_start_delalloc_flush(fs_info);
2298	if (ret)
2299	goto lockdep_release;
2300
2301	ret = btrfs_run_delayed_items(trans);
2302	if (ret)
2303	goto lockdep_release;
2304
2305	/*
2306	* The thread has started/joined the transaction thus it holds the
2307	* lockdep map as a reader. It has to release it before acquiring the
2308	* lockdep map as a writer.
2309	*/
2310	btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
2311	btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
2312	wait_event(cur_trans->writer_wait,
2313	extwriter_counter_read(cur_trans) == `0`);
2314
2315	/ some pending stuffs might be added after the previous flush. /
2316	ret = btrfs_run_delayed_items(trans);
2317	if (ret) {
2318	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
2319	goto cleanup_transaction;
2320	}
2321
2322	btrfs_wait_delalloc_flush(fs_info);
2323
2324	/*
2325	* Wait for all ordered extents started by a fast fsync that joined this
2326	* transaction. Otherwise if this transaction commits before the ordered
2327	* extents complete we lose logged data after a power failure.
2328	*/
2329	btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
2330	wait_event(cur_trans->pending_wait,
2331	atomic_read(&cur_trans->pending_ordered) == `0`);
2332
2333	btrfs_scrub_pause(fs_info);
2334	/*
2335	* Ok now we need to make sure to block out any other joins while we
2336	* commit the transaction. We could have started a join before setting
2337	* COMMIT_DOING so make sure to wait for num_writers to == 1 again.
2338	*/
2339	spin_lock(lock: &fs_info->trans_lock);
2340	add_pending_snapshot(trans);
2341	cur_trans->state = TRANS_STATE_COMMIT_DOING;
2342	spin_unlock(lock: &fs_info->trans_lock);
2343
2344	/*
2345	* The thread has started/joined the transaction thus it holds the
2346	* lockdep map as a reader. It has to release it before acquiring the
2347	* lockdep map as a writer.
2348	*/
2349	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
2350	btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
2351	wait_event(cur_trans->writer_wait,
2352	atomic_read(&cur_trans->num_writers) == `1`);
2353
2354	/*
2355	* Make lockdep happy by acquiring the state locks after
2356	* btrfs_trans_num_writers is released. If we acquired the state locks
2357	* before releasing the btrfs_trans_num_writers lock then lockdep would
2358	* complain because we did not follow the reverse order unlocking rule.
2359	*/
2360	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
2361	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2362	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2363
2364	/*
2365	* We've started the commit, clear the flag in case we were triggered to
2366	* do an async commit but somebody else started before the transaction
2367	* kthread could do the work.
2368	*/
2369	clear_bit(nr: BTRFS_FS_COMMIT_TRANS, addr: &fs_info->flags);
2370
2371	if (TRANS_ABORTED(cur_trans)) {
2372	ret = cur_trans->aborted;
2373	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2374	goto scrub_continue;
2375	}
2376	/*
2377	* the reloc mutex makes sure that we stop
2378	* the balancing code from coming in and moving
2379	* extents around in the middle of the commit
2380	*/
2381	mutex_lock(&fs_info->reloc_mutex);
2382
2383	/*
2384	* We needn't worry about the delayed items because we will
2385	* deal with them in create_pending_snapshot(), which is the
2386	* core function of the snapshot creation.
2387	*/
2388	ret = create_pending_snapshots(trans);
2389	if (ret)
2390	goto unlock_reloc;
2391
2392	/*
2393	* We insert the dir indexes of the snapshots and update the inode
2394	* of the snapshots' parents after the snapshot creation, so there
2395	* are some delayed items which are not dealt with. Now deal with
2396	* them.
2397	*
2398	* We needn't worry that this operation will corrupt the snapshots,
2399	* because all the tree which are snapshoted will be forced to COW
2400	* the nodes and leaves.
2401	*/
2402	ret = btrfs_run_delayed_items(trans);
2403	if (ret)
2404	goto unlock_reloc;
2405
2406	ret = btrfs_run_delayed_refs(trans, U64_MAX);
2407	if (ret)
2408	goto unlock_reloc;
2409
2410	/*
2411	* make sure none of the code above managed to slip in a
2412	* delayed item
2413	*/
2414	btrfs_assert_delayed_root_empty(fs_info);
2415
2416	WARN_ON(cur_trans != trans->transaction);
2417
2418	ret = commit_fs_roots(trans);
2419	if (ret)
2420	goto unlock_reloc;
2421
2422	/ commit_fs_roots gets rid of all the tree log roots, it is now*
2423	* safe to free the root of tree log roots
2424	*/
2425	btrfs_free_log_root_tree(trans, fs_info);
2426
2427	/*
2428	* Since fs roots are all committed, we can get a quite accurate
2429	* new_roots. So let's do quota accounting.
2430	*/
2431	ret = btrfs_qgroup_account_extents(trans);
2432	if (ret < `0`)
2433	goto unlock_reloc;
2434
2435	ret = commit_cowonly_roots(trans);
2436	if (ret)
2437	goto unlock_reloc;
2438
2439	/*
2440	* The tasks which save the space cache and inode cache may also
2441	* update ->aborted, check it.
2442	*/
2443	if (TRANS_ABORTED(cur_trans)) {
2444	ret = cur_trans->aborted;
2445	goto unlock_reloc;
2446	}
2447
2448	cur_trans = fs_info->running_transaction;
2449
2450	btrfs_set_root_node(item: &fs_info->tree_root->root_item,
2451	node: fs_info->tree_root->node);
2452	list_add_tail(new: &fs_info->tree_root->dirty_list,
2453	head: &cur_trans->switch_commits);
2454
2455	btrfs_set_root_node(item: &fs_info->chunk_root->root_item,
2456	node: fs_info->chunk_root->node);
2457	list_add_tail(new: &fs_info->chunk_root->dirty_list,
2458	head: &cur_trans->switch_commits);
2459
2460	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2461	btrfs_set_root_node(item: &fs_info->block_group_root->root_item,
2462	node: fs_info->block_group_root->node);
2463	list_add_tail(new: &fs_info->block_group_root->dirty_list,
2464	head: &cur_trans->switch_commits);
2465	}
2466
2467	switch_commit_roots(trans);
2468
2469	ASSERT(list_empty(&cur_trans->dirty_bgs));
2470	ASSERT(list_empty(&cur_trans->io_bgs));
2471	update_super_roots(fs_info);
2472
2473	btrfs_set_super_log_root(s: fs_info->super_copy, val: `0`);
2474	btrfs_set_super_log_root_level(s: fs_info->super_copy, val: `0`);
2475	memcpy(fs_info->super_for_commit, fs_info->super_copy,
2476	sizeof(*fs_info->super_copy));
2477
2478	btrfs_commit_device_sizes(trans: cur_trans);
2479
2480	clear_bit(nr: BTRFS_FS_LOG1_ERR, addr: &fs_info->flags);
2481	clear_bit(nr: BTRFS_FS_LOG2_ERR, addr: &fs_info->flags);
2482
2483	btrfs_trans_release_chunk_metadata(trans);
2484
2485	/*
2486	* Before changing the transaction state to TRANS_STATE_UNBLOCKED and
2487	* setting fs_info->running_transaction to NULL, lock tree_log_mutex to
2488	* make sure that before we commit our superblock, no other task can
2489	* start a new transaction and commit a log tree before we commit our
2490	* superblock. Anyone trying to commit a log tree locks this mutex before
2491	* writing its superblock.
2492	*/
2493	mutex_lock(&fs_info->tree_log_mutex);
2494
2495	spin_lock(lock: &fs_info->trans_lock);
2496	cur_trans->state = TRANS_STATE_UNBLOCKED;
2497	fs_info->running_transaction = NULL;
2498	spin_unlock(lock: &fs_info->trans_lock);
2499	mutex_unlock(lock: &fs_info->reloc_mutex);
2500
2501	wake_up(&fs_info->transaction_wait);
2502	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2503
2504	/ If we have features changed, wake up the cleaner to update sysfs. /
2505	if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
2506	fs_info->cleaner_kthread)
2507	wake_up_process(tsk: fs_info->cleaner_kthread);
2508
2509	ret = btrfs_write_and_wait_transaction(trans);
2510	if (ret) {
2511	btrfs_handle_fs_error(fs_info, ret,
2512	"Error while writing out transaction");
2513	mutex_unlock(lock: &fs_info->tree_log_mutex);
2514	goto scrub_continue;
2515	}
2516
2517	ret = write_all_supers(fs_info, max_mirrors: `0`);
2518	/*
2519	* the super is written, we can safely allow the tree-loggers
2520	* to go about their business
2521	*/
2522	mutex_unlock(lock: &fs_info->tree_log_mutex);
2523	if (ret)
2524	goto scrub_continue;
2525
2526	/*
2527	* We needn't acquire the lock here because there is no other task
2528	* which can change it.
2529	*/
2530	cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
2531	wake_up(&cur_trans->commit_wait);
2532	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2533
2534	btrfs_finish_extent_commit(trans);
2535
2536	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
2537	btrfs_clear_space_info_full(info: fs_info);
2538
2539	btrfs_set_last_trans_committed(fs_info, gen: cur_trans->transid);
2540	/*
2541	* We needn't acquire the lock here because there is no other task
2542	* which can change it.
2543	*/
2544	cur_trans->state = TRANS_STATE_COMPLETED;
2545	wake_up(&cur_trans->commit_wait);
2546	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
2547
2548	spin_lock(lock: &fs_info->trans_lock);
2549	list_del_init(entry: &cur_trans->list);
2550	spin_unlock(lock: &fs_info->trans_lock);
2551
2552	btrfs_put_transaction(transaction: cur_trans);
2553	btrfs_put_transaction(transaction: cur_trans);
2554
2555	if (trans->type & __TRANS_FREEZABLE)
2556	sb_end_intwrite(sb: fs_info->sb);
2557
2558	trace_btrfs_transaction_commit(fs_info);
2559
2560	interval = ktime_get_ns() - start_time;
2561
2562	btrfs_scrub_continue(fs_info);
2563
2564	if (current->journal_info == trans)
2565	current->journal_info = NULL;
2566
2567	kmem_cache_free(s: btrfs_trans_handle_cachep, objp: trans);
2568
2569	update_commit_stats(fs_info, interval);
2570
2571	return ret;
2572
2573	unlock_reloc:
2574	mutex_unlock(lock: &fs_info->reloc_mutex);
2575	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
2576	scrub_continue:
2577	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
2578	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
2579	btrfs_scrub_continue(fs_info);
2580	cleanup_transaction:
2581	btrfs_trans_release_metadata(trans);
2582	btrfs_cleanup_pending_block_groups(trans);
2583	btrfs_trans_release_chunk_metadata(trans);
2584	trans->block_rsv = NULL;
2585	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
2586	if (current->journal_info == trans)
2587	current->journal_info = NULL;
2588	cleanup_transaction(trans, err: ret);
2589
2590	return ret;
2591
2592	lockdep_release:
2593	btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
2594	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
2595	goto cleanup_transaction;
2596
2597	lockdep_trans_commit_start_release:
2598	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
2599	btrfs_end_transaction(trans);
2600	return ret;
2601	}
2602
2603	/*
2604	* return < 0 if error
2605	* 0 if there are no more dead_roots at the time of call
2606	* 1 there are more to be processed, call me again
2607	*
2608	* The return value indicates there are certainly more snapshots to delete, but
2609	* if there comes a new one during processing, it may return 0. We don't mind,
2610	* because btrfs_commit_super will poke cleaner thread and it will process it a
2611	* few seconds later.
2612	*/
2613	int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
2614	{
2615	struct btrfs_root *root;
2616	int ret;
2617
2618	spin_lock(lock: &fs_info->trans_lock);
2619	if (list_empty(head: &fs_info->dead_roots)) {
2620	spin_unlock(lock: &fs_info->trans_lock);
2621	return `0`;
2622	}
2623	root = list_first_entry(&fs_info->dead_roots,
2624	struct btrfs_root, root_list);
2625	list_del_init(entry: &root->root_list);
2626	spin_unlock(lock: &fs_info->trans_lock);
2627
2628	btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
2629
2630	btrfs_kill_all_delayed_nodes(root);
2631
2632	if (btrfs_header_backref_rev(eb: root->node) <
2633	BTRFS_MIXED_BACKREF_REV)
2634	ret = btrfs_drop_snapshot(root, update_ref: `0`, for_reloc: `0`);
2635	else
2636	ret = btrfs_drop_snapshot(root, update_ref: `1`, for_reloc: `0`);
2637
2638	btrfs_put_root(root);
2639	return (ret < `0`) ? `0` : `1`;
2640	}
2641
2642	/*
2643	* We only mark the transaction aborted and then set the file system read-only.
2644	* This will prevent new transactions from starting or trying to join this
2645	* one.
2646	*
2647	* This means that error recovery at the call site is limited to freeing
2648	* any local memory allocations and passing the error code up without
2649	* further cleanup. The transaction should complete as it normally would
2650	* in the call path but will return -EIO.
2651	*
2652	* We'll complete the cleanup in btrfs_end_transaction and
2653	* btrfs_commit_transaction.
2654	*/
2655	void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
2656	const char *function,
2657	unsigned int line, int error, bool first_hit)
2658	{
2659	struct btrfs_fs_info *fs_info = trans->fs_info;
2660
2661	WRITE_ONCE(trans->aborted, error);
2662	WRITE_ONCE(trans->transaction->aborted, error);
2663	if (first_hit && error == -ENOSPC)
2664	btrfs_dump_space_info_for_trans_abort(fs_info);
2665	/ Wake up anybody who may be waiting on this transaction /
2666	wake_up(&fs_info->transaction_wait);
2667	wake_up(&fs_info->transaction_blocked_wait);
2668	__btrfs_handle_fs_error(fs_info, function, line, error, NULL);
2669	}
2670
2671	int __init btrfs_transaction_init(void)
2672	{
2673	btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY);
2674	if (!btrfs_trans_handle_cachep)
2675	return -ENOMEM;
2676	return `0`;
2677	}
2678
2679	void __cold btrfs_transaction_exit(void)
2680	{
2681	kmem_cache_destroy(s: btrfs_trans_handle_cachep);
2682	}
2683

source code of linux/fs/btrfs/transaction.c