qgroup.c source code [linux/fs/btrfs/qgroup.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2011 STRATO. All rights reserved.
4	*/
5
6	#include <linux/sched.h>
7	#include <linux/pagemap.h>
8	#include <linux/writeback.h>
9	#include <linux/blkdev.h>
10	#include <linux/rbtree.h>
11	#include <linux/slab.h>
12	#include <linux/workqueue.h>
13	#include <linux/btrfs.h>
14	#include <linux/sched/mm.h>
15
16	#include "ctree.h"
17	#include "transaction.h"
18	#include "disk-io.h"
19	#include "locking.h"
20	#include "ulist.h"
21	#include "backref.h"
22	#include "extent_io.h"
23	#include "qgroup.h"
24	#include "block-group.h"
25	#include "sysfs.h"
26	#include "tree-mod-log.h"
27	#include "fs.h"
28	#include "accessors.h"
29	#include "extent-tree.h"
30	#include "root-tree.h"
31	#include "tree-checker.h"
32
33	enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
34	{
35	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
36	return BTRFS_QGROUP_MODE_DISABLED;
37	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
38	return BTRFS_QGROUP_MODE_SIMPLE;
39	return BTRFS_QGROUP_MODE_FULL;
40	}
41
42	bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info)
43	{
44	return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
45	}
46
47	bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info)
48	{
49	return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
50	}
51
52	/*
53	* Helpers to access qgroup reservation
54	*
55	* Callers should ensure the lock context and type are valid
56	*/
57
58	static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
59	{
60	u64 ret = `0`;
61	int i;
62
63	for (i = `0`; i < BTRFS_QGROUP_RSV_LAST; i++)
64	ret += qgroup->rsv.values[i];
65
66	return ret;
67	}
68
#ifdef CONFIG_BTRFS_DEBUG
/*
 * Map a reservation type to a short name for debug messages.
 *
 * Return: a string literal, or NULL for a type that has no name here.
 */
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
	switch (type) {
	case BTRFS_QGROUP_RSV_DATA:
		return "data";
	case BTRFS_QGROUP_RSV_META_PERTRANS:
		return "meta_pertrans";
	case BTRFS_QGROUP_RSV_META_PREALLOC:
		return "meta_prealloc";
	default:
		return NULL;
	}
}
#endif
81
82	static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
83	struct btrfs_qgroup *qgroup, u64 num_bytes,
84	enum btrfs_qgroup_rsv_type type)
85	{
86	trace_qgroup_update_reserve(fs_info, qgroup, diff: num_bytes, type);
87	qgroup->rsv.values[type] += num_bytes;
88	}
89
90	static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
91	struct btrfs_qgroup *qgroup, u64 num_bytes,
92	enum btrfs_qgroup_rsv_type type)
93	{
94	trace_qgroup_update_reserve(fs_info, qgroup, diff: -(s64)num_bytes, type);
95	if (qgroup->rsv.values[type] >= num_bytes) {
96	qgroup->rsv.values[type] -= num_bytes;
97	return;
98	}
99	#ifdef CONFIG_BTRFS_DEBUG
100	WARN_RATELIMIT(`1`,
101	"qgroup %llu %s reserved space underflow, have %llu to free %llu",
102	qgroup->qgroupid, qgroup_rsv_type_str(type),
103	qgroup->rsv.values[type], num_bytes);
104	#endif
105	qgroup->rsv.values[type] = `0`;
106	}
107
108	static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
109	struct btrfs_qgroup *dest,
110	struct btrfs_qgroup *src)
111	{
112	int i;
113
114	for (i = `0`; i < BTRFS_QGROUP_RSV_LAST; i++)
115	qgroup_rsv_add(fs_info, qgroup: dest, num_bytes: src->rsv.values[i], type: i);
116	}
117
118	static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
119	struct btrfs_qgroup *dest,
120	struct btrfs_qgroup *src)
121	{
122	int i;
123
124	for (i = `0`; i < BTRFS_QGROUP_RSV_LAST; i++)
125	qgroup_rsv_release(fs_info, qgroup: dest, num_bytes: src->rsv.values[i], type: i);
126	}
127
128	static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
129	int mod)
130	{
131	if (qg->old_refcnt < seq)
132	qg->old_refcnt = seq;
133	qg->old_refcnt += mod;
134	}
135
136	static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
137	int mod)
138	{
139	if (qg->new_refcnt < seq)
140	qg->new_refcnt = seq;
141	qg->new_refcnt += mod;
142	}
143
144	static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
145	{
146	if (qg->old_refcnt < seq)
147	return `0`;
148	return qg->old_refcnt - seq;
149	}
150
151	static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
152	{
153	if (qg->new_refcnt < seq)
154	return `0`;
155	return qg->new_refcnt - seq;
156	}
157
158	/*
159	* glue structure to represent the relations between qgroups.
160	*/
161	struct btrfs_qgroup_list {
162	struct list_head next_group;
163	struct list_head next_member;
164	struct btrfs_qgroup *group;
165	struct btrfs_qgroup *member;
166	};
167
168	static int
169	qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
170	int init_flags);
171	static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
172
173	/ must be called with qgroup_ioctl_lock held /
174	static struct btrfs_qgroup find_qgroup_rb(struct* btrfs_fs_info *fs_info,
175	u64 qgroupid)
176	{
177	struct rb_node *n = fs_info->qgroup_tree.rb_node;
178	struct btrfs_qgroup *qgroup;
179
180	while (n) {
181	qgroup = rb_entry(n, struct btrfs_qgroup, node);
182	if (qgroup->qgroupid < qgroupid)
183	n = n->rb_left;
184	else if (qgroup->qgroupid > qgroupid)
185	n = n->rb_right;
186	else
187	return qgroup;
188	}
189	return NULL;
190	}
191
192	/*
193	* Add qgroup to the filesystem's qgroup tree.
194	*
195	* Must be called with qgroup_lock held and @prealloc preallocated.
196	*
197	* The control on the lifespan of @prealloc would be transferred to this
198	* function, thus caller should no longer touch @prealloc.
199	*/
200	static struct btrfs_qgroup add_qgroup_rb(struct* btrfs_fs_info *fs_info,
201	struct btrfs_qgroup *prealloc,
202	u64 qgroupid)
203	{
204	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
205	struct rb_node *parent = NULL;
206	struct btrfs_qgroup *qgroup;
207
208	/ Caller must have pre-allocated @prealloc. /
209	ASSERT(prealloc);
210
211	while (*p) {
212	parent = *p;
213	qgroup = rb_entry(parent, struct btrfs_qgroup, node);
214
215	if (qgroup->qgroupid < qgroupid) {
216	p = &(*p)->rb_left;
217	} else if (qgroup->qgroupid > qgroupid) {
218	p = &(*p)->rb_right;
219	} else {
220	kfree(objp: prealloc);
221	return qgroup;
222	}
223	}
224
225	qgroup = prealloc;
226	qgroup->qgroupid = qgroupid;
227	INIT_LIST_HEAD(list: &qgroup->groups);
228	INIT_LIST_HEAD(list: &qgroup->members);
229	INIT_LIST_HEAD(list: &qgroup->dirty);
230	INIT_LIST_HEAD(list: &qgroup->iterator);
231	INIT_LIST_HEAD(list: &qgroup->nested_iterator);
232
233	rb_link_node(node: &qgroup->node, parent, rb_link: p);
234	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
235
236	return qgroup;
237	}
238
239	static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
240	struct btrfs_qgroup *qgroup)
241	{
242	struct btrfs_qgroup_list *list;
243
244	list_del(entry: &qgroup->dirty);
245	while (!list_empty(head: &qgroup->groups)) {
246	list = list_first_entry(&qgroup->groups,
247	struct btrfs_qgroup_list, next_group);
248	list_del(entry: &list->next_group);
249	list_del(entry: &list->next_member);
250	kfree(objp: list);
251	}
252
253	while (!list_empty(head: &qgroup->members)) {
254	list = list_first_entry(&qgroup->members,
255	struct btrfs_qgroup_list, next_member);
256	list_del(entry: &list->next_group);
257	list_del(entry: &list->next_member);
258	kfree(objp: list);
259	}
260	}
261
262	/ must be called with qgroup_lock held /
263	static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
264	{
265	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
266
267	if (!qgroup)
268	return -ENOENT;
269
270	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
271	__del_qgroup_rb(fs_info, qgroup);
272	return `0`;
273	}
274
275	/*
276	* Add relation specified by two qgroups.
277	*
278	* Must be called with qgroup_lock held, the ownership of @prealloc is
279	* transferred to this function and caller should not touch it anymore.
280	*
281	* Return: 0 on success
282	* -ENOENT if one of the qgroups is NULL
283	* <0 other errors
284	*/
285	static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
286	struct btrfs_qgroup *member,
287	struct btrfs_qgroup *parent)
288	{
289	if (!member \|\| !parent) {
290	kfree(objp: prealloc);
291	return -ENOENT;
292	}
293
294	prealloc->group = parent;
295	prealloc->member = member;
296	list_add_tail(new: &prealloc->next_group, head: &member->groups);
297	list_add_tail(new: &prealloc->next_member, head: &parent->members);
298
299	return `0`;
300	}
301
302	/*
303	* Add relation specified by two qgroup ids.
304	*
305	* Must be called with qgroup_lock held.
306	*
307	* Return: 0 on success
308	* -ENOENT if one of the ids does not exist
309	* <0 other errors
310	*/
311	static int add_relation_rb(struct btrfs_fs_info *fs_info,
312	struct btrfs_qgroup_list *prealloc,
313	u64 memberid, u64 parentid)
314	{
315	struct btrfs_qgroup *member;
316	struct btrfs_qgroup *parent;
317
318	member = find_qgroup_rb(fs_info, qgroupid: memberid);
319	parent = find_qgroup_rb(fs_info, qgroupid: parentid);
320
321	return __add_relation_rb(prealloc, member, parent);
322	}
323
324	/ Must be called with qgroup_lock held /
325	static int del_relation_rb(struct btrfs_fs_info *fs_info,
326	u64 memberid, u64 parentid)
327	{
328	struct btrfs_qgroup *member;
329	struct btrfs_qgroup *parent;
330	struct btrfs_qgroup_list *list;
331
332	member = find_qgroup_rb(fs_info, qgroupid: memberid);
333	parent = find_qgroup_rb(fs_info, qgroupid: parentid);
334	if (!member \|\| !parent)
335	return -ENOENT;
336
337	list_for_each_entry(list, &member->groups, next_group) {
338	if (list->group == parent) {
339	list_del(entry: &list->next_group);
340	list_del(entry: &list->next_member);
341	kfree(objp: list);
342	return `0`;
343	}
344	}
345	return -ENOENT;
346	}
347
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test helper: compare the in-memory rfer/excl counters of the qgroup
 * @qgroupid against the expected values @rfer and @excl.
 *
 * Return: 0 when both counters match, -EINVAL when the qgroup is not in the
 * tree or either counter differs.
 */
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (qgroup && qgroup->rfer == rfer && qgroup->excl == excl)
		return 0;

	return -EINVAL;
}
#endif
362
363	static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
364	{
365	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
366	return;
367	fs_info->qgroup_flags \|= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT \|
368	BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN \|
369	BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
370	}
371
372	static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
373	struct extent_buffer leaf, int* slot,
374	struct btrfs_qgroup_status_item *ptr)
375	{
376	ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
377	ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
378	fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(eb: leaf, s: ptr);
379	}
380
381	/*
382	* The full config is read in one go, only called from open_ctree()
383	* It doesn't use any locking, as at this point we're still single-threaded
384	*/
385	int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
386	{
387	struct btrfs_key key;
388	struct btrfs_key found_key;
389	struct btrfs_root *quota_root = fs_info->quota_root;
390	struct btrfs_path *path = NULL;
391	struct extent_buffer *l;
392	int slot;
393	int ret = `0`;
394	u64 flags = `0`;
395	u64 rescan_progress = `0`;
396
397	if (!fs_info->quota_root)
398	return `0`;
399
400	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
401	if (!fs_info->qgroup_ulist) {
402	ret = -ENOMEM;
403	goto out;
404	}
405
406	path = btrfs_alloc_path();
407	if (!path) {
408	ret = -ENOMEM;
409	goto out;
410	}
411
412	ret = btrfs_sysfs_add_qgroups(fs_info);
413	if (ret < `0`)
414	goto out;
415	/ default this to quota off, in case no status key is found /
416	fs_info->qgroup_flags = `0`;
417
418	/*
419	* pass 1: read status, all qgroup infos and limits
420	*/
421	key.objectid = `0`;
422	key.type = `0`;
423	key.offset = `0`;
424	ret = btrfs_search_slot_for_read(root: quota_root, key: &key, p: path, find_higher: `1`, return_any: `1`);
425	if (ret)
426	goto out;
427
428	while (`1`) {
429	struct btrfs_qgroup *qgroup;
430
431	slot = path->slots[`0`];
432	l = path->nodes[`0`];
433	btrfs_item_key_to_cpu(eb: l, cpu_key: &found_key, nr: slot);
434
435	if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
436	struct btrfs_qgroup_status_item *ptr;
437
438	ptr = btrfs_item_ptr(l, slot,
439	struct btrfs_qgroup_status_item);
440
441	if (btrfs_qgroup_status_version(eb: l, s: ptr) !=
442	BTRFS_QGROUP_STATUS_VERSION) {
443	btrfs_err(fs_info,
444	"old qgroup version, quota disabled");
445	goto out;
446	}
447	fs_info->qgroup_flags = btrfs_qgroup_status_flags(eb: l, s: ptr);
448	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
449	qgroup_read_enable_gen(fs_info, leaf: l, slot, ptr);
450	} else if (btrfs_qgroup_status_generation(eb: l, s: ptr) != fs_info->generation) {
451	qgroup_mark_inconsistent(fs_info);
452	btrfs_err(fs_info,
453	"qgroup generation mismatch, marked as inconsistent");
454	}
455	rescan_progress = btrfs_qgroup_status_rescan(eb: l, s: ptr);
456	goto next1;
457	}
458
459	if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
460	found_key.type != BTRFS_QGROUP_LIMIT_KEY)
461	goto next1;
462
463	qgroup = find_qgroup_rb(fs_info, qgroupid: found_key.offset);
464	if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) \|\|
465	(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
466	btrfs_err(fs_info, "inconsistent qgroup config");
467	qgroup_mark_inconsistent(fs_info);
468	}
469	if (!qgroup) {
470	struct btrfs_qgroup *prealloc;
471
472	prealloc = kzalloc(size: sizeof(*prealloc), GFP_KERNEL);
473	if (!prealloc) {
474	ret = -ENOMEM;
475	goto out;
476	}
477	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid: found_key.offset);
478	}
479	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
480	if (ret < `0`)
481	goto out;
482
483	switch (found_key.type) {
484	case BTRFS_QGROUP_INFO_KEY: {
485	struct btrfs_qgroup_info_item *ptr;
486
487	ptr = btrfs_item_ptr(l, slot,
488	struct btrfs_qgroup_info_item);
489	qgroup->rfer = btrfs_qgroup_info_rfer(eb: l, s: ptr);
490	qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(eb: l, s: ptr);
491	qgroup->excl = btrfs_qgroup_info_excl(eb: l, s: ptr);
492	qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(eb: l, s: ptr);
493	/ generation currently unused /
494	break;
495	}
496	case BTRFS_QGROUP_LIMIT_KEY: {
497	struct btrfs_qgroup_limit_item *ptr;
498
499	ptr = btrfs_item_ptr(l, slot,
500	struct btrfs_qgroup_limit_item);
501	qgroup->lim_flags = btrfs_qgroup_limit_flags(eb: l, s: ptr);
502	qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(eb: l, s: ptr);
503	qgroup->max_excl = btrfs_qgroup_limit_max_excl(eb: l, s: ptr);
504	qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(eb: l, s: ptr);
505	qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(eb: l, s: ptr);
506	break;
507	}
508	}
509	next1:
510	ret = btrfs_next_item(root: quota_root, p: path);
511	if (ret < `0`)
512	goto out;
513	if (ret)
514	break;
515	}
516	btrfs_release_path(p: path);
517
518	/*
519	* pass 2: read all qgroup relations
520	*/
521	key.objectid = `0`;
522	key.type = BTRFS_QGROUP_RELATION_KEY;
523	key.offset = `0`;
524	ret = btrfs_search_slot_for_read(root: quota_root, key: &key, p: path, find_higher: `1`, return_any: `0`);
525	if (ret)
526	goto out;
527	while (`1`) {
528	struct btrfs_qgroup_list *list = NULL;
529
530	slot = path->slots[`0`];
531	l = path->nodes[`0`];
532	btrfs_item_key_to_cpu(eb: l, cpu_key: &found_key, nr: slot);
533
534	if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
535	goto next2;
536
537	if (found_key.objectid > found_key.offset) {
538	/ parent <- member, not needed to build config /
539	/ FIXME should we omit the key completely? /
540	goto next2;
541	}
542
543	list = kzalloc(size: sizeof(*list), GFP_KERNEL);
544	if (!list) {
545	ret = -ENOMEM;
546	goto out;
547	}
548	ret = add_relation_rb(fs_info, prealloc: list, memberid: found_key.objectid,
549	parentid: found_key.offset);
550	list = NULL;
551	if (ret == -ENOENT) {
552	btrfs_warn(fs_info,
553	"orphan qgroup relation 0x%llx->0x%llx",
554	found_key.objectid, found_key.offset);
555	ret = `0`; / ignore the error /
556	}
557	if (ret)
558	goto out;
559	next2:
560	ret = btrfs_next_item(root: quota_root, p: path);
561	if (ret < `0`)
562	goto out;
563	if (ret)
564	break;
565	}
566	out:
567	btrfs_free_path(p: path);
568	fs_info->qgroup_flags \|= flags;
569	if (ret >= `0`) {
570	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
571	set_bit(nr: BTRFS_FS_QUOTA_ENABLED, addr: &fs_info->flags);
572	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
573	ret = qgroup_rescan_init(fs_info, progress_objectid: rescan_progress, init_flags: `0`);
574	} else {
575	ulist_free(ulist: fs_info->qgroup_ulist);
576	fs_info->qgroup_ulist = NULL;
577	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
578	btrfs_sysfs_del_qgroups(fs_info);
579	}
580
581	return ret < `0` ? ret : `0`;
582	}
583
584	/*
585	* Called in close_ctree() when quota is still enabled. This verifies we don't
586	* leak some reserved space.
587	*
588	* Return false if no reserved space is left.
589	* Return true if some reserved space is leaked.
590	*/
591	bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
592	{
593	struct rb_node *node;
594	bool ret = false;
595
596	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
597	return ret;
598	/*
599	* Since we're unmounting, there is no race and no need to grab qgroup
600	* lock. And here we don't go post-order to provide a more user
601	* friendly sorted result.
602	*/
603	for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
604	struct btrfs_qgroup *qgroup;
605	int i;
606
607	qgroup = rb_entry(node, struct btrfs_qgroup, node);
608	for (i = `0`; i < BTRFS_QGROUP_RSV_LAST; i++) {
609	if (qgroup->rsv.values[i]) {
610	ret = true;
611	btrfs_warn(fs_info,
612	"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
613	btrfs_qgroup_level(qgroup->qgroupid),
614	btrfs_qgroup_subvolid(qgroup->qgroupid),
615	i, qgroup->rsv.values[i]);
616	}
617	}
618	}
619	return ret;
620	}
621
622	/*
623	* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
624	* first two are in single-threaded paths.And for the third one, we have set
625	* quota_root to be null with qgroup_lock held before, so it is safe to clean
626	* up the in-memory structures without qgroup_lock held.
627	*/
628	void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
629	{
630	struct rb_node *n;
631	struct btrfs_qgroup *qgroup;
632
633	while ((n = rb_first(&fs_info->qgroup_tree))) {
634	qgroup = rb_entry(n, struct btrfs_qgroup, node);
635	rb_erase(n, &fs_info->qgroup_tree);
636	__del_qgroup_rb(fs_info, qgroup);
637	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
638	kfree(objp: qgroup);
639	}
640	/*
641	* We call btrfs_free_qgroup_config() when unmounting
642	* filesystem and disabling quota, so we set qgroup_ulist
643	* to be null here to avoid double free.
644	*/
645	ulist_free(ulist: fs_info->qgroup_ulist);
646	fs_info->qgroup_ulist = NULL;
647	btrfs_sysfs_del_qgroups(fs_info);
648	}
649
650	static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
651	u64 dst)
652	{
653	int ret;
654	struct btrfs_root *quota_root = trans->fs_info->quota_root;
655	struct btrfs_path *path;
656	struct btrfs_key key;
657
658	path = btrfs_alloc_path();
659	if (!path)
660	return -ENOMEM;
661
662	key.objectid = src;
663	key.type = BTRFS_QGROUP_RELATION_KEY;
664	key.offset = dst;
665
666	ret = btrfs_insert_empty_item(trans, root: quota_root, path, key: &key, data_size: `0`);
667
668	btrfs_mark_buffer_dirty(trans, buf: path->nodes[`0`]);
669
670	btrfs_free_path(p: path);
671	return ret;
672	}
673
674	static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
675	u64 dst)
676	{
677	int ret;
678	struct btrfs_root *quota_root = trans->fs_info->quota_root;
679	struct btrfs_path *path;
680	struct btrfs_key key;
681
682	path = btrfs_alloc_path();
683	if (!path)
684	return -ENOMEM;
685
686	key.objectid = src;
687	key.type = BTRFS_QGROUP_RELATION_KEY;
688	key.offset = dst;
689
690	ret = btrfs_search_slot(trans, root: quota_root, key: &key, p: path, ins_len: -`1`, cow: `1`);
691	if (ret < `0`)
692	goto out;
693
694	if (ret > `0`) {
695	ret = -ENOENT;
696	goto out;
697	}
698
699	ret = btrfs_del_item(trans, root: quota_root, path);
700	out:
701	btrfs_free_path(p: path);
702	return ret;
703	}
704
705	static int add_qgroup_item(struct btrfs_trans_handle *trans,
706	struct btrfs_root *quota_root, u64 qgroupid)
707	{
708	int ret;
709	struct btrfs_path *path;
710	struct btrfs_qgroup_info_item *qgroup_info;
711	struct btrfs_qgroup_limit_item *qgroup_limit;
712	struct extent_buffer *leaf;
713	struct btrfs_key key;
714
715	if (btrfs_is_testing(fs_info: quota_root->fs_info))
716	return `0`;
717
718	path = btrfs_alloc_path();
719	if (!path)
720	return -ENOMEM;
721
722	key.objectid = `0`;
723	key.type = BTRFS_QGROUP_INFO_KEY;
724	key.offset = qgroupid;
725
726	/*
727	* Avoid a transaction abort by catching -EEXIST here. In that
728	* case, we proceed by re-initializing the existing structure
729	* on disk.
730	*/
731
732	ret = btrfs_insert_empty_item(trans, root: quota_root, path, key: &key,
733	data_size: sizeof(*qgroup_info));
734	if (ret && ret != -EEXIST)
735	goto out;
736
737	leaf = path->nodes[`0`];
738	qgroup_info = btrfs_item_ptr(leaf, path->slots[`0`],
739	struct btrfs_qgroup_info_item);
740	btrfs_set_qgroup_info_generation(eb: leaf, s: qgroup_info, val: trans->transid);
741	btrfs_set_qgroup_info_rfer(eb: leaf, s: qgroup_info, val: `0`);
742	btrfs_set_qgroup_info_rfer_cmpr(eb: leaf, s: qgroup_info, val: `0`);
743	btrfs_set_qgroup_info_excl(eb: leaf, s: qgroup_info, val: `0`);
744	btrfs_set_qgroup_info_excl_cmpr(eb: leaf, s: qgroup_info, val: `0`);
745
746	btrfs_mark_buffer_dirty(trans, buf: leaf);
747
748	btrfs_release_path(p: path);
749
750	key.type = BTRFS_QGROUP_LIMIT_KEY;
751	ret = btrfs_insert_empty_item(trans, root: quota_root, path, key: &key,
752	data_size: sizeof(*qgroup_limit));
753	if (ret && ret != -EEXIST)
754	goto out;
755
756	leaf = path->nodes[`0`];
757	qgroup_limit = btrfs_item_ptr(leaf, path->slots[`0`],
758	struct btrfs_qgroup_limit_item);
759	btrfs_set_qgroup_limit_flags(eb: leaf, s: qgroup_limit, val: `0`);
760	btrfs_set_qgroup_limit_max_rfer(eb: leaf, s: qgroup_limit, val: `0`);
761	btrfs_set_qgroup_limit_max_excl(eb: leaf, s: qgroup_limit, val: `0`);
762	btrfs_set_qgroup_limit_rsv_rfer(eb: leaf, s: qgroup_limit, val: `0`);
763	btrfs_set_qgroup_limit_rsv_excl(eb: leaf, s: qgroup_limit, val: `0`);
764
765	btrfs_mark_buffer_dirty(trans, buf: leaf);
766
767	ret = `0`;
768	out:
769	btrfs_free_path(p: path);
770	return ret;
771	}
772
773	static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
774	{
775	int ret;
776	struct btrfs_root *quota_root = trans->fs_info->quota_root;
777	struct btrfs_path *path;
778	struct btrfs_key key;
779
780	path = btrfs_alloc_path();
781	if (!path)
782	return -ENOMEM;
783
784	key.objectid = `0`;
785	key.type = BTRFS_QGROUP_INFO_KEY;
786	key.offset = qgroupid;
787	ret = btrfs_search_slot(trans, root: quota_root, key: &key, p: path, ins_len: -`1`, cow: `1`);
788	if (ret < `0`)
789	goto out;
790
791	if (ret > `0`) {
792	ret = -ENOENT;
793	goto out;
794	}
795
796	ret = btrfs_del_item(trans, root: quota_root, path);
797	if (ret)
798	goto out;
799
800	btrfs_release_path(p: path);
801
802	key.type = BTRFS_QGROUP_LIMIT_KEY;
803	ret = btrfs_search_slot(trans, root: quota_root, key: &key, p: path, ins_len: -`1`, cow: `1`);
804	if (ret < `0`)
805	goto out;
806
807	if (ret > `0`) {
808	ret = -ENOENT;
809	goto out;
810	}
811
812	ret = btrfs_del_item(trans, root: quota_root, path);
813
814	out:
815	btrfs_free_path(p: path);
816	return ret;
817	}
818
819	static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
820	struct btrfs_qgroup *qgroup)
821	{
822	struct btrfs_root *quota_root = trans->fs_info->quota_root;
823	struct btrfs_path *path;
824	struct btrfs_key key;
825	struct extent_buffer *l;
826	struct btrfs_qgroup_limit_item *qgroup_limit;
827	int ret;
828	int slot;
829
830	key.objectid = `0`;
831	key.type = BTRFS_QGROUP_LIMIT_KEY;
832	key.offset = qgroup->qgroupid;
833
834	path = btrfs_alloc_path();
835	if (!path)
836	return -ENOMEM;
837
838	ret = btrfs_search_slot(trans, root: quota_root, key: &key, p: path, ins_len: `0`, cow: `1`);
839	if (ret > `0`)
840	ret = -ENOENT;
841
842	if (ret)
843	goto out;
844
845	l = path->nodes[`0`];
846	slot = path->slots[`0`];
847	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
848	btrfs_set_qgroup_limit_flags(eb: l, s: qgroup_limit, val: qgroup->lim_flags);
849	btrfs_set_qgroup_limit_max_rfer(eb: l, s: qgroup_limit, val: qgroup->max_rfer);
850	btrfs_set_qgroup_limit_max_excl(eb: l, s: qgroup_limit, val: qgroup->max_excl);
851	btrfs_set_qgroup_limit_rsv_rfer(eb: l, s: qgroup_limit, val: qgroup->rsv_rfer);
852	btrfs_set_qgroup_limit_rsv_excl(eb: l, s: qgroup_limit, val: qgroup->rsv_excl);
853
854	btrfs_mark_buffer_dirty(trans, buf: l);
855
856	out:
857	btrfs_free_path(p: path);
858	return ret;
859	}
860
861	static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
862	struct btrfs_qgroup *qgroup)
863	{
864	struct btrfs_fs_info *fs_info = trans->fs_info;
865	struct btrfs_root *quota_root = fs_info->quota_root;
866	struct btrfs_path *path;
867	struct btrfs_key key;
868	struct extent_buffer *l;
869	struct btrfs_qgroup_info_item *qgroup_info;
870	int ret;
871	int slot;
872
873	if (btrfs_is_testing(fs_info))
874	return `0`;
875
876	key.objectid = `0`;
877	key.type = BTRFS_QGROUP_INFO_KEY;
878	key.offset = qgroup->qgroupid;
879
880	path = btrfs_alloc_path();
881	if (!path)
882	return -ENOMEM;
883
884	ret = btrfs_search_slot(trans, root: quota_root, key: &key, p: path, ins_len: `0`, cow: `1`);
885	if (ret > `0`)
886	ret = -ENOENT;
887
888	if (ret)
889	goto out;
890
891	l = path->nodes[`0`];
892	slot = path->slots[`0`];
893	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
894	btrfs_set_qgroup_info_generation(eb: l, s: qgroup_info, val: trans->transid);
895	btrfs_set_qgroup_info_rfer(eb: l, s: qgroup_info, val: qgroup->rfer);
896	btrfs_set_qgroup_info_rfer_cmpr(eb: l, s: qgroup_info, val: qgroup->rfer_cmpr);
897	btrfs_set_qgroup_info_excl(eb: l, s: qgroup_info, val: qgroup->excl);
898	btrfs_set_qgroup_info_excl_cmpr(eb: l, s: qgroup_info, val: qgroup->excl_cmpr);
899
900	btrfs_mark_buffer_dirty(trans, buf: l);
901
902	out:
903	btrfs_free_path(p: path);
904	return ret;
905	}
906
907	static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
908	{
909	struct btrfs_fs_info *fs_info = trans->fs_info;
910	struct btrfs_root *quota_root = fs_info->quota_root;
911	struct btrfs_path *path;
912	struct btrfs_key key;
913	struct extent_buffer *l;
914	struct btrfs_qgroup_status_item *ptr;
915	int ret;
916	int slot;
917
918	key.objectid = `0`;
919	key.type = BTRFS_QGROUP_STATUS_KEY;
920	key.offset = `0`;
921
922	path = btrfs_alloc_path();
923	if (!path)
924	return -ENOMEM;
925
926	ret = btrfs_search_slot(trans, root: quota_root, key: &key, p: path, ins_len: `0`, cow: `1`);
927	if (ret > `0`)
928	ret = -ENOENT;
929
930	if (ret)
931	goto out;
932
933	l = path->nodes[`0`];
934	slot = path->slots[`0`];
935	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
936	btrfs_set_qgroup_status_flags(eb: l, s: ptr, val: fs_info->qgroup_flags &
937	BTRFS_QGROUP_STATUS_FLAGS_MASK);
938	btrfs_set_qgroup_status_generation(eb: l, s: ptr, val: trans->transid);
939	btrfs_set_qgroup_status_rescan(eb: l, s: ptr,
940	val: fs_info->qgroup_rescan_progress.objectid);
941
942	btrfs_mark_buffer_dirty(trans, buf: l);
943
944	out:
945	btrfs_free_path(p: path);
946	return ret;
947	}
948
949	/*
950	* called with qgroup_lock held
951	*/
952	static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
953	struct btrfs_root *root)
954	{
955	struct btrfs_path *path;
956	struct btrfs_key key;
957	struct extent_buffer *leaf = NULL;
958	int ret;
959	int nr = `0`;
960
961	path = btrfs_alloc_path();
962	if (!path)
963	return -ENOMEM;
964
965	key.objectid = `0`;
966	key.offset = `0`;
967	key.type = `0`;
968
969	while (`1`) {
970	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: -`1`, cow: `1`);
971	if (ret < `0`)
972	goto out;
973	leaf = path->nodes[`0`];
974	nr = btrfs_header_nritems(eb: leaf);
975	if (!nr)
976	break;
977	/*
978	* delete the leaf one by one
979	* since the whole tree is going
980	* to be deleted.
981	*/
982	path->slots[`0`] = `0`;
983	ret = btrfs_del_items(trans, root, path, slot: `0`, nr);
984	if (ret)
985	goto out;
986
987	btrfs_release_path(p: path);
988	}
989	ret = `0`;
990	out:
991	btrfs_free_path(p: path);
992	return ret;
993	}
994
995	int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
996	struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
997	{
998	struct btrfs_root *quota_root;
999	struct btrfs_root *tree_root = fs_info->tree_root;
1000	struct btrfs_path *path = NULL;
1001	struct btrfs_qgroup_status_item *ptr;
1002	struct extent_buffer *leaf;
1003	struct btrfs_key key;
1004	struct btrfs_key found_key;
1005	struct btrfs_qgroup *qgroup = NULL;
1006	struct btrfs_qgroup *prealloc = NULL;
1007	struct btrfs_trans_handle *trans = NULL;
1008	struct ulist *ulist = NULL;
1009	const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
1010	int ret = `0`;
1011	int slot;
1012
1013	/*
1014	* We need to have subvol_sem write locked, to prevent races between
1015	* concurrent tasks trying to enable quotas, because we will unlock
1016	* and relock qgroup_ioctl_lock before setting fs_info->quota_root
1017	* and before setting BTRFS_FS_QUOTA_ENABLED.
1018	*/
1019	lockdep_assert_held_write(&fs_info->subvol_sem);
1020
1021	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
1022	btrfs_err(fs_info,
1023	"qgroups are currently unsupported in extent tree v2");
1024	return -EINVAL;
1025	}
1026
1027	mutex_lock(&fs_info->qgroup_ioctl_lock);
1028	if (fs_info->quota_root)
1029	goto out;
1030
1031	ulist = ulist_alloc(GFP_KERNEL);
1032	if (!ulist) {
1033	ret = -ENOMEM;
1034	goto out;
1035	}
1036
1037	ret = btrfs_sysfs_add_qgroups(fs_info);
1038	if (ret < `0`)
1039	goto out;
1040
1041	/*
1042	* Unlock qgroup_ioctl_lock before starting the transaction. This is to
1043	* avoid lock acquisition inversion problems (reported by lockdep) between
1044	* qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
1045	* start a transaction.
1046	* After we started the transaction lock qgroup_ioctl_lock again and
1047	* check if someone else created the quota root in the meanwhile. If so,
1048	* just return success and release the transaction handle.
1049	*
1050	* Also we don't need to worry about someone else calling
1051	* btrfs_sysfs_add_qgroups() after we unlock and getting an error because
1052	* that function returns 0 (success) when the sysfs entries already exist.
1053	*/
1054	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1055
1056	/*
1057	* 1 for quota root item
1058	* 1 for BTRFS_QGROUP_STATUS item
1059	*
1060	* Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
1061	* per subvolume. However those are not currently reserved since it
1062	* would be a lot of overkill.
1063	*/
1064	trans = btrfs_start_transaction(root: tree_root, num_items: `2`);
1065
1066	mutex_lock(&fs_info->qgroup_ioctl_lock);
1067	if (IS_ERR(ptr: trans)) {
1068	ret = PTR_ERR(ptr: trans);
1069	trans = NULL;
1070	goto out;
1071	}
1072
1073	if (fs_info->quota_root)
1074	goto out;
1075
1076	fs_info->qgroup_ulist = ulist;
1077	ulist = NULL;
1078
1079	/*
1080	* initially create the quota tree
1081	*/
1082	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
1083	if (IS_ERR(ptr: quota_root)) {
1084	ret = PTR_ERR(ptr: quota_root);
1085	btrfs_abort_transaction(trans, ret);
1086	goto out;
1087	}
1088
1089	path = btrfs_alloc_path();
1090	if (!path) {
1091	ret = -ENOMEM;
1092	btrfs_abort_transaction(trans, ret);
1093	goto out_free_root;
1094	}
1095
1096	key.objectid = `0`;
1097	key.type = BTRFS_QGROUP_STATUS_KEY;
1098	key.offset = `0`;
1099
1100	ret = btrfs_insert_empty_item(trans, root: quota_root, path, key: &key,
1101	data_size: sizeof(*ptr));
1102	if (ret) {
1103	btrfs_abort_transaction(trans, ret);
1104	goto out_free_path;
1105	}
1106
1107	leaf = path->nodes[`0`];
1108	ptr = btrfs_item_ptr(leaf, path->slots[`0`],
1109	struct btrfs_qgroup_status_item);
1110	btrfs_set_qgroup_status_generation(eb: leaf, s: ptr, val: trans->transid);
1111	btrfs_set_qgroup_status_version(eb: leaf, s: ptr, BTRFS_QGROUP_STATUS_VERSION);
1112	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
1113	if (simple) {
1114	fs_info->qgroup_flags \|= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
1115	btrfs_set_qgroup_status_enable_gen(eb: leaf, s: ptr, val: trans->transid);
1116	} else {
1117	fs_info->qgroup_flags \|= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1118	}
1119	btrfs_set_qgroup_status_flags(eb: leaf, s: ptr, val: fs_info->qgroup_flags &
1120	BTRFS_QGROUP_STATUS_FLAGS_MASK);
1121	btrfs_set_qgroup_status_rescan(eb: leaf, s: ptr, val: `0`);
1122
1123	btrfs_mark_buffer_dirty(trans, buf: leaf);
1124
1125	key.objectid = `0`;
1126	key.type = BTRFS_ROOT_REF_KEY;
1127	key.offset = `0`;
1128
1129	btrfs_release_path(p: path);
1130	ret = btrfs_search_slot_for_read(root: tree_root, key: &key, p: path, find_higher: `1`, return_any: `0`);
1131	if (ret > `0`)
1132	goto out_add_root;
1133	if (ret < `0`) {
1134	btrfs_abort_transaction(trans, ret);
1135	goto out_free_path;
1136	}
1137
1138	while (`1`) {
1139	slot = path->slots[`0`];
1140	leaf = path->nodes[`0`];
1141	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: slot);
1142
1143	if (found_key.type == BTRFS_ROOT_REF_KEY) {
1144
1145	/ Release locks on tree_root before we access quota_root /
1146	btrfs_release_path(p: path);
1147
1148	/ We should not have a stray @prealloc pointer. /
1149	ASSERT(prealloc == NULL);
1150	prealloc = kzalloc(size: sizeof(*prealloc), GFP_NOFS);
1151	if (!prealloc) {
1152	ret = -ENOMEM;
1153	btrfs_abort_transaction(trans, ret);
1154	goto out_free_path;
1155	}
1156
1157	ret = add_qgroup_item(trans, quota_root,
1158	qgroupid: found_key.offset);
1159	if (ret) {
1160	btrfs_abort_transaction(trans, ret);
1161	goto out_free_path;
1162	}
1163
1164	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid: found_key.offset);
1165	prealloc = NULL;
1166	if (IS_ERR(ptr: qgroup)) {
1167	ret = PTR_ERR(ptr: qgroup);
1168	btrfs_abort_transaction(trans, ret);
1169	goto out_free_path;
1170	}
1171	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1172	if (ret < `0`) {
1173	btrfs_abort_transaction(trans, ret);
1174	goto out_free_path;
1175	}
1176	ret = btrfs_search_slot_for_read(root: tree_root, key: &found_key,
1177	p: path, find_higher: `1`, return_any: `0`);
1178	if (ret < `0`) {
1179	btrfs_abort_transaction(trans, ret);
1180	goto out_free_path;
1181	}
1182	if (ret > `0`) {
1183	/*
1184	* Shouldn't happen, but in case it does we
1185	* don't need to do the btrfs_next_item, just
1186	* continue.
1187	*/
1188	continue;
1189	}
1190	}
1191	ret = btrfs_next_item(root: tree_root, p: path);
1192	if (ret < `0`) {
1193	btrfs_abort_transaction(trans, ret);
1194	goto out_free_path;
1195	}
1196	if (ret)
1197	break;
1198	}
1199
1200	out_add_root:
1201	btrfs_release_path(p: path);
1202	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
1203	if (ret) {
1204	btrfs_abort_transaction(trans, ret);
1205	goto out_free_path;
1206	}
1207
1208	ASSERT(prealloc == NULL);
1209	prealloc = kzalloc(size: sizeof(*prealloc), GFP_NOFS);
1210	if (!prealloc) {
1211	ret = -ENOMEM;
1212	goto out_free_path;
1213	}
1214	qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
1215	prealloc = NULL;
1216	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1217	if (ret < `0`) {
1218	btrfs_abort_transaction(trans, ret);
1219	goto out_free_path;
1220	}
1221
1222	fs_info->qgroup_enable_gen = trans->transid;
1223
1224	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1225	/*
1226	* Commit the transaction while not holding qgroup_ioctl_lock, to avoid
1227	* a deadlock with tasks concurrently doing other qgroup operations, such
1228	* adding/removing qgroups or adding/deleting qgroup relations for example,
1229	* because all qgroup operations first start or join a transaction and then
1230	* lock the qgroup_ioctl_lock mutex.
1231	* We are safe from a concurrent task trying to enable quotas, by calling
1232	* this function, since we are serialized by fs_info->subvol_sem.
1233	*/
1234	ret = btrfs_commit_transaction(trans);
1235	trans = NULL;
1236	mutex_lock(&fs_info->qgroup_ioctl_lock);
1237	if (ret)
1238	goto out_free_path;
1239
1240	/*
1241	* Set quota enabled flag after committing the transaction, to avoid
1242	* deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
1243	* creation.
1244	*/
1245	spin_lock(lock: &fs_info->qgroup_lock);
1246	fs_info->quota_root = quota_root;
1247	set_bit(nr: BTRFS_FS_QUOTA_ENABLED, addr: &fs_info->flags);
1248	if (simple)
1249	btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
1250	spin_unlock(lock: &fs_info->qgroup_lock);
1251
1252	/ Skip rescan for simple qgroups. /
1253	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
1254	goto out_free_path;
1255
1256	ret = qgroup_rescan_init(fs_info, progress_objectid: `0`, init_flags: `1`);
1257	if (!ret) {
1258	qgroup_rescan_zero_tracking(fs_info);
1259	fs_info->qgroup_rescan_running = true;
1260	btrfs_queue_work(wq: fs_info->qgroup_rescan_workers,
1261	work: &fs_info->qgroup_rescan_work);
1262	} else {
1263	/*
1264	* We have set both BTRFS_FS_QUOTA_ENABLED and
1265	* BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
1266	* -EINPROGRESS. That can happen because someone started the
1267	* rescan worker by calling quota rescan ioctl before we
1268	* attempted to initialize the rescan worker. Failure due to
1269	* quotas disabled in the meanwhile is not possible, because
1270	* we are holding a write lock on fs_info->subvol_sem, which
1271	* is also acquired when disabling quotas.
1272	* Ignore such error, and any other error would need to undo
1273	* everything we did in the transaction we just committed.
1274	*/
1275	ASSERT(ret == -EINPROGRESS);
1276	ret = `0`;
1277	}
1278
1279	out_free_path:
1280	btrfs_free_path(p: path);
1281	out_free_root:
1282	if (ret)
1283	btrfs_put_root(root: quota_root);
1284	out:
1285	if (ret) {
1286	ulist_free(ulist: fs_info->qgroup_ulist);
1287	fs_info->qgroup_ulist = NULL;
1288	btrfs_sysfs_del_qgroups(fs_info);
1289	}
1290	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1291	if (ret && trans)
1292	btrfs_end_transaction(trans);
1293	else if (trans)
1294	ret = btrfs_end_transaction(trans);
1295	ulist_free(ulist);
1296	kfree(objp: prealloc);
1297	return ret;
1298	}
1299
1300	/*
1301	* It is possible to have outstanding ordered extents which reserved bytes
1302	* before we disabled. We need to fully flush delalloc, ordered extents, and a
1303	* commit to ensure that we don't leak such reservations, only to have them
1304	* come back if we re-enable.
1305	*
1306	* - enable simple quotas
1307	* - reserve space
1308	* - release it, store rsv_bytes in OE
1309	* - disable quotas
1310	* - enable simple quotas (qgroup rsv are all 0)
1311	* - OE finishes
1312	* - run delayed refs
1313	* - free rsv_bytes, resulting in miscounting or even underflow
1314	*/
1315	static int flush_reservations(struct btrfs_fs_info *fs_info)
1316	{
1317	struct btrfs_trans_handle *trans;
1318	int ret;
1319
1320	ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, in_reclaim_context: false);
1321	if (ret)
1322	return ret;
1323	btrfs_wait_ordered_roots(fs_info, U64_MAX, range_start: `0`, range_len: (u64)-`1`);
1324	trans = btrfs_join_transaction(root: fs_info->tree_root);
1325	if (IS_ERR(ptr: trans))
1326	return PTR_ERR(ptr: trans);
1327	ret = btrfs_commit_transaction(trans);
1328
1329	return ret;
1330	}
1331
1332	int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
1333	{
1334	struct btrfs_root *quota_root;
1335	struct btrfs_trans_handle *trans = NULL;
1336	int ret = `0`;
1337
1338	/*
1339	* We need to have subvol_sem write locked to prevent races with
1340	* snapshot creation.
1341	*/
1342	lockdep_assert_held_write(&fs_info->subvol_sem);
1343
1344	/*
1345	* Lock the cleaner mutex to prevent races with concurrent relocation,
1346	* because relocation may be building backrefs for blocks of the quota
1347	* root while we are deleting the root. This is like dropping fs roots
1348	* of deleted snapshots/subvolumes, we need the same protection.
1349	*
1350	* This also prevents races between concurrent tasks trying to disable
1351	* quotas, because we will unlock and relock qgroup_ioctl_lock across
1352	* BTRFS_FS_QUOTA_ENABLED changes.
1353	*/
1354	mutex_lock(&fs_info->cleaner_mutex);
1355
1356	mutex_lock(&fs_info->qgroup_ioctl_lock);
1357	if (!fs_info->quota_root)
1358	goto out;
1359
1360	/*
1361	* Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
1362	* complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
1363	* to lock that mutex while holding a transaction handle and the rescan
1364	* worker needs to commit a transaction.
1365	*/
1366	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1367
1368	/*
1369	* Request qgroup rescan worker to complete and wait for it. This wait
1370	* must be done before transaction start for quota disable since it may
1371	* deadlock with transaction by the qgroup rescan worker.
1372	*/
1373	clear_bit(nr: BTRFS_FS_QUOTA_ENABLED, addr: &fs_info->flags);
1374	btrfs_qgroup_wait_for_completion(fs_info, interruptible: false);
1375
1376	ret = flush_reservations(fs_info);
1377	if (ret)
1378	goto out_unlock_cleaner;
1379
1380	/*
1381	* 1 For the root item
1382	*
1383	* We should also reserve enough items for the quota tree deletion in
1384	* btrfs_clean_quota_tree but this is not done.
1385	*
1386	* Also, we must always start a transaction without holding the mutex
1387	* qgroup_ioctl_lock, see btrfs_quota_enable().
1388	*/
1389	trans = btrfs_start_transaction(root: fs_info->tree_root, num_items: `1`);
1390
1391	mutex_lock(&fs_info->qgroup_ioctl_lock);
1392	if (IS_ERR(ptr: trans)) {
1393	ret = PTR_ERR(ptr: trans);
1394	trans = NULL;
1395	set_bit(nr: BTRFS_FS_QUOTA_ENABLED, addr: &fs_info->flags);
1396	goto out;
1397	}
1398
1399	if (!fs_info->quota_root)
1400	goto out;
1401
1402	spin_lock(lock: &fs_info->qgroup_lock);
1403	quota_root = fs_info->quota_root;
1404	fs_info->quota_root = NULL;
1405	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
1406	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
1407	fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
1408	spin_unlock(lock: &fs_info->qgroup_lock);
1409
1410	btrfs_free_qgroup_config(fs_info);
1411
1412	ret = btrfs_clean_quota_tree(trans, root: quota_root);
1413	if (ret) {
1414	btrfs_abort_transaction(trans, ret);
1415	goto out;
1416	}
1417
1418	ret = btrfs_del_root(trans, key: &quota_root->root_key);
1419	if (ret) {
1420	btrfs_abort_transaction(trans, ret);
1421	goto out;
1422	}
1423
1424	spin_lock(lock: &fs_info->trans_lock);
1425	list_del(entry: &quota_root->dirty_list);
1426	spin_unlock(lock: &fs_info->trans_lock);
1427
1428	btrfs_tree_lock(eb: quota_root->node);
1429	btrfs_clear_buffer_dirty(trans, buf: quota_root->node);
1430	btrfs_tree_unlock(eb: quota_root->node);
1431	btrfs_free_tree_block(trans, root_id: btrfs_root_id(root: quota_root),
1432	buf: quota_root->node, parent: `0`, last_ref: `1`);
1433
1434	btrfs_put_root(root: quota_root);
1435
1436	out:
1437	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1438	if (ret && trans)
1439	btrfs_end_transaction(trans);
1440	else if (trans)
1441	ret = btrfs_commit_transaction(trans);
1442	out_unlock_cleaner:
1443	mutex_unlock(lock: &fs_info->cleaner_mutex);
1444
1445	return ret;
1446	}
1447
1448	static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1449	struct btrfs_qgroup *qgroup)
1450	{
1451	if (list_empty(head: &qgroup->dirty))
1452	list_add(new: &qgroup->dirty, head: &fs_info->dirty_qgroups);
1453	}
1454
1455	static void qgroup_iterator_add(struct list_head head, struct* btrfs_qgroup *qgroup)
1456	{
1457	if (!list_empty(head: &qgroup->iterator))
1458	return;
1459
1460	list_add_tail(new: &qgroup->iterator, head);
1461	}
1462
1463	static void qgroup_iterator_clean(struct list_head *head)
1464	{
1465	while (!list_empty(head)) {
1466	struct btrfs_qgroup *qgroup;
1467
1468	qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
1469	list_del_init(entry: &qgroup->iterator);
1470	}
1471	}
1472
1473	/*
1474	* The easy accounting, we're updating qgroup relationship whose child qgroup
1475	* only has exclusive extents.
1476	*
1477	* In this case, all exclusive extents will also be exclusive for parent, so
1478	* excl/rfer just get added/removed.
1479	*
1480	* So is qgroup reservation space, which should also be added/removed to
1481	* parent.
1482	* Or when child tries to release reservation space, parent will underflow its
1483	* reservation (for relationship adding case).
1484	*
1485	* Caller should hold fs_info->qgroup_lock.
1486	*/
1487	static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
1488	struct btrfs_qgroup src, int* sign)
1489	{
1490	struct btrfs_qgroup *qgroup;
1491	struct btrfs_qgroup *cur;
1492	LIST_HEAD(qgroup_list);
1493	u64 num_bytes = src->excl;
1494	int ret = `0`;
1495
1496	qgroup = find_qgroup_rb(fs_info, qgroupid: ref_root);
1497	if (!qgroup)
1498	goto out;
1499
1500	qgroup_iterator_add(head: &qgroup_list, qgroup);
1501	list_for_each_entry(cur, &qgroup_list, iterator) {
1502	struct btrfs_qgroup_list *glist;
1503
1504	qgroup->rfer += sign * num_bytes;
1505	qgroup->rfer_cmpr += sign * num_bytes;
1506
1507	WARN_ON(sign < `0` && qgroup->excl < num_bytes);
1508	qgroup->excl += sign * num_bytes;
1509	qgroup->excl_cmpr += sign * num_bytes;
1510
1511	if (sign > `0`)
1512	qgroup_rsv_add_by_qgroup(fs_info, dest: qgroup, src);
1513	else
1514	qgroup_rsv_release_by_qgroup(fs_info, dest: qgroup, src);
1515	qgroup_dirty(fs_info, qgroup);
1516
1517	/ Append parent qgroups to @qgroup_list. /
1518	list_for_each_entry(glist, &qgroup->groups, next_group)
1519	qgroup_iterator_add(head: &qgroup_list, qgroup: glist->group);
1520	}
1521	ret = `0`;
1522	out:
1523	qgroup_iterator_clean(head: &qgroup_list);
1524	return ret;
1525	}
1526
1527
1528	/*
1529	* Quick path for updating qgroup with only excl refs.
1530	*
1531	* In that case, just update all parent will be enough.
1532	* Or we needs to do a full rescan.
1533	* Caller should also hold fs_info->qgroup_lock.
1534	*
1535	* Return 0 for quick update, return >0 for need to full rescan
1536	* and mark INCONSISTENT flag.
1537	* Return < 0 for other error.
1538	*/
1539	static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1540	u64 src, u64 dst, int sign)
1541	{
1542	struct btrfs_qgroup *qgroup;
1543	int ret = `1`;
1544	int err = `0`;
1545
1546	qgroup = find_qgroup_rb(fs_info, qgroupid: src);
1547	if (!qgroup)
1548	goto out;
1549	if (qgroup->excl == qgroup->rfer) {
1550	ret = `0`;
1551	err = __qgroup_excl_accounting(fs_info, ref_root: dst, src: qgroup, sign);
1552	if (err < `0`) {
1553	ret = err;
1554	goto out;
1555	}
1556	}
1557	out:
1558	if (ret)
1559	fs_info->qgroup_flags \|= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1560	return ret;
1561	}
1562
1563	int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
1564	{
1565	struct btrfs_fs_info *fs_info = trans->fs_info;
1566	struct btrfs_qgroup *parent;
1567	struct btrfs_qgroup *member;
1568	struct btrfs_qgroup_list *list;
1569	struct btrfs_qgroup_list *prealloc = NULL;
1570	int ret = `0`;
1571
1572	/ Check the level of src and dst first /
1573	if (btrfs_qgroup_level(qgroupid: src) >= btrfs_qgroup_level(qgroupid: dst))
1574	return -EINVAL;
1575
1576	mutex_lock(&fs_info->qgroup_ioctl_lock);
1577	if (!fs_info->quota_root) {
1578	ret = -ENOTCONN;
1579	goto out;
1580	}
1581	member = find_qgroup_rb(fs_info, qgroupid: src);
1582	parent = find_qgroup_rb(fs_info, qgroupid: dst);
1583	if (!member \|\| !parent) {
1584	ret = -EINVAL;
1585	goto out;
1586	}
1587
1588	/ check if such qgroup relation exist firstly /
1589	list_for_each_entry(list, &member->groups, next_group) {
1590	if (list->group == parent) {
1591	ret = -EEXIST;
1592	goto out;
1593	}
1594	}
1595
1596	prealloc = kzalloc(size: sizeof(*list), GFP_NOFS);
1597	if (!prealloc) {
1598	ret = -ENOMEM;
1599	goto out;
1600	}
1601	ret = add_qgroup_relation_item(trans, src, dst);
1602	if (ret)
1603	goto out;
1604
1605	ret = add_qgroup_relation_item(trans, src: dst, dst: src);
1606	if (ret) {
1607	del_qgroup_relation_item(trans, src, dst);
1608	goto out;
1609	}
1610
1611	spin_lock(lock: &fs_info->qgroup_lock);
1612	ret = __add_relation_rb(prealloc, member, parent);
1613	prealloc = NULL;
1614	if (ret < `0`) {
1615	spin_unlock(lock: &fs_info->qgroup_lock);
1616	goto out;
1617	}
1618	ret = quick_update_accounting(fs_info, src, dst, sign: `1`);
1619	spin_unlock(lock: &fs_info->qgroup_lock);
1620	out:
1621	kfree(objp: prealloc);
1622	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1623	return ret;
1624	}
1625
1626	static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1627	u64 dst)
1628	{
1629	struct btrfs_fs_info *fs_info = trans->fs_info;
1630	struct btrfs_qgroup *parent;
1631	struct btrfs_qgroup *member;
1632	struct btrfs_qgroup_list *list;
1633	bool found = false;
1634	int ret = `0`;
1635	int ret2;
1636
1637	if (!fs_info->quota_root) {
1638	ret = -ENOTCONN;
1639	goto out;
1640	}
1641
1642	member = find_qgroup_rb(fs_info, qgroupid: src);
1643	parent = find_qgroup_rb(fs_info, qgroupid: dst);
1644	/*
1645	* The parent/member pair doesn't exist, then try to delete the dead
1646	* relation items only.
1647	*/
1648	if (!member \|\| !parent)
1649	goto delete_item;
1650
1651	/ check if such qgroup relation exist firstly /
1652	list_for_each_entry(list, &member->groups, next_group) {
1653	if (list->group == parent) {
1654	found = true;
1655	break;
1656	}
1657	}
1658
1659	delete_item:
1660	ret = del_qgroup_relation_item(trans, src, dst);
1661	if (ret < `0` && ret != -ENOENT)
1662	goto out;
1663	ret2 = del_qgroup_relation_item(trans, src: dst, dst: src);
1664	if (ret2 < `0` && ret2 != -ENOENT)
1665	goto out;
1666
1667	/ At least one deletion succeeded, return 0 /
1668	if (!ret \|\| !ret2)
1669	ret = `0`;
1670
1671	if (found) {
1672	spin_lock(lock: &fs_info->qgroup_lock);
1673	del_relation_rb(fs_info, memberid: src, parentid: dst);
1674	ret = quick_update_accounting(fs_info, src, dst, sign: -`1`);
1675	spin_unlock(lock: &fs_info->qgroup_lock);
1676	}
1677	out:
1678	return ret;
1679	}
1680
1681	int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1682	u64 dst)
1683	{
1684	struct btrfs_fs_info *fs_info = trans->fs_info;
1685	int ret = `0`;
1686
1687	mutex_lock(&fs_info->qgroup_ioctl_lock);
1688	ret = __del_qgroup_relation(trans, src, dst);
1689	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1690
1691	return ret;
1692	}
1693
1694	int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1695	{
1696	struct btrfs_fs_info *fs_info = trans->fs_info;
1697	struct btrfs_root *quota_root;
1698	struct btrfs_qgroup *qgroup;
1699	struct btrfs_qgroup *prealloc = NULL;
1700	int ret = `0`;
1701
1702	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
1703	return `0`;
1704
1705	mutex_lock(&fs_info->qgroup_ioctl_lock);
1706	if (!fs_info->quota_root) {
1707	ret = -ENOTCONN;
1708	goto out;
1709	}
1710	quota_root = fs_info->quota_root;
1711	qgroup = find_qgroup_rb(fs_info, qgroupid);
1712	if (qgroup) {
1713	ret = -EEXIST;
1714	goto out;
1715	}
1716
1717	prealloc = kzalloc(size: sizeof(*prealloc), GFP_NOFS);
1718	if (!prealloc) {
1719	ret = -ENOMEM;
1720	goto out;
1721	}
1722
1723	ret = add_qgroup_item(trans, quota_root, qgroupid);
1724	if (ret)
1725	goto out;
1726
1727	spin_lock(lock: &fs_info->qgroup_lock);
1728	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
1729	spin_unlock(lock: &fs_info->qgroup_lock);
1730	prealloc = NULL;
1731
1732	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1733	out:
1734	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1735	kfree(objp: prealloc);
1736	return ret;
1737	}
1738
1739	static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
1740	{
1741	return (qgroup->rfer > `0` \|\| qgroup->rfer_cmpr > `0` \|\|
1742	qgroup->excl > `0` \|\| qgroup->excl_cmpr > `0` \|\|
1743	qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > `0` \|\|
1744	qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > `0` \|\|
1745	qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > `0`);
1746	}
1747
1748	int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1749	{
1750	struct btrfs_fs_info *fs_info = trans->fs_info;
1751	struct btrfs_qgroup *qgroup;
1752	struct btrfs_qgroup_list *list;
1753	int ret = `0`;
1754
1755	mutex_lock(&fs_info->qgroup_ioctl_lock);
1756	if (!fs_info->quota_root) {
1757	ret = -ENOTCONN;
1758	goto out;
1759	}
1760
1761	qgroup = find_qgroup_rb(fs_info, qgroupid);
1762	if (!qgroup) {
1763	ret = -ENOENT;
1764	goto out;
1765	}
1766
1767	if (is_fstree(rootid: qgroupid) && qgroup_has_usage(qgroup)) {
1768	ret = -EBUSY;
1769	goto out;
1770	}
1771
1772	/ Check if there are no children of this qgroup /
1773	if (!list_empty(head: &qgroup->members)) {
1774	ret = -EBUSY;
1775	goto out;
1776	}
1777
1778	ret = del_qgroup_item(trans, qgroupid);
1779	if (ret && ret != -ENOENT)
1780	goto out;
1781
1782	while (!list_empty(head: &qgroup->groups)) {
1783	list = list_first_entry(&qgroup->groups,
1784	struct btrfs_qgroup_list, next_group);
1785	ret = __del_qgroup_relation(trans, src: qgroupid,
1786	dst: list->group->qgroupid);
1787	if (ret)
1788	goto out;
1789	}
1790
1791	spin_lock(lock: &fs_info->qgroup_lock);
1792	del_qgroup_rb(fs_info, qgroupid);
1793	spin_unlock(lock: &fs_info->qgroup_lock);
1794
1795	/*
1796	* Remove the qgroup from sysfs now without holding the qgroup_lock
1797	* spinlock, since the sysfs_remove_group() function needs to take
1798	* the mutex kernfs_mutex through kernfs_remove_by_name_ns().
1799	*/
1800	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
1801	kfree(objp: qgroup);
1802	out:
1803	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1804	return ret;
1805	}
1806
1807	int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
1808	struct btrfs_qgroup_limit *limit)
1809	{
1810	struct btrfs_fs_info *fs_info = trans->fs_info;
1811	struct btrfs_qgroup *qgroup;
1812	int ret = `0`;
1813	/ Sometimes we would want to clear the limit on this qgroup.*
1814	* To meet this requirement, we treat the -1 as a special value
1815	* which tell kernel to clear the limit on this qgroup.
1816	*/
1817	const u64 CLEAR_VALUE = -`1`;
1818
1819	mutex_lock(&fs_info->qgroup_ioctl_lock);
1820	if (!fs_info->quota_root) {
1821	ret = -ENOTCONN;
1822	goto out;
1823	}
1824
1825	qgroup = find_qgroup_rb(fs_info, qgroupid);
1826	if (!qgroup) {
1827	ret = -ENOENT;
1828	goto out;
1829	}
1830
1831	spin_lock(lock: &fs_info->qgroup_lock);
1832	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
1833	if (limit->max_rfer == CLEAR_VALUE) {
1834	qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1835	limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1836	qgroup->max_rfer = `0`;
1837	} else {
1838	qgroup->max_rfer = limit->max_rfer;
1839	}
1840	}
1841	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
1842	if (limit->max_excl == CLEAR_VALUE) {
1843	qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1844	limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1845	qgroup->max_excl = `0`;
1846	} else {
1847	qgroup->max_excl = limit->max_excl;
1848	}
1849	}
1850	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
1851	if (limit->rsv_rfer == CLEAR_VALUE) {
1852	qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1853	limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1854	qgroup->rsv_rfer = `0`;
1855	} else {
1856	qgroup->rsv_rfer = limit->rsv_rfer;
1857	}
1858	}
1859	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
1860	if (limit->rsv_excl == CLEAR_VALUE) {
1861	qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1862	limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1863	qgroup->rsv_excl = `0`;
1864	} else {
1865	qgroup->rsv_excl = limit->rsv_excl;
1866	}
1867	}
1868	qgroup->lim_flags \|= limit->flags;
1869
1870	spin_unlock(lock: &fs_info->qgroup_lock);
1871
1872	ret = update_qgroup_limit_item(trans, qgroup);
1873	if (ret) {
1874	qgroup_mark_inconsistent(fs_info);
1875	btrfs_info(fs_info, "unable to update quota limit for %llu",
1876	qgroupid);
1877	}
1878
1879	out:
1880	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
1881	return ret;
1882	}
1883
1884	/*
1885	* Inform qgroup to trace one dirty extent, its info is recorded in @record.
1886	* So qgroup can account it at transaction committing time.
1887	*
1888	* No lock version, caller must acquire delayed ref lock and allocated memory,
1889	* then call btrfs_qgroup_trace_extent_post() after exiting lock context.
1890	*
1891	* Return 0 for success insert
1892	* Return >0 for existing record, caller can free @record safely.
1893	* Error is not possible
1894	*/
1895	int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1896	struct btrfs_delayed_ref_root *delayed_refs,
1897	struct btrfs_qgroup_extent_record *record)
1898	{
1899	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1900	struct rb_node *parent_node = NULL;
1901	struct btrfs_qgroup_extent_record *entry;
1902	u64 bytenr = record->bytenr;
1903
1904	if (!btrfs_qgroup_full_accounting(fs_info))
1905	return `1`;
1906
1907	lockdep_assert_held(&delayed_refs->lock);
1908	trace_btrfs_qgroup_trace_extent(fs_info, rec: record);
1909
1910	while (*p) {
1911	parent_node = *p;
1912	entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1913	node);
1914	if (bytenr < entry->bytenr) {
1915	p = &(*p)->rb_left;
1916	} else if (bytenr > entry->bytenr) {
1917	p = &(*p)->rb_right;
1918	} else {
1919	if (record->data_rsv && !entry->data_rsv) {
1920	entry->data_rsv = record->data_rsv;
1921	entry->data_rsv_refroot =
1922	record->data_rsv_refroot;
1923	}
1924	return `1`;
1925	}
1926	}
1927
1928	rb_link_node(node: &record->node, parent: parent_node, rb_link: p);
1929	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1930	return `0`;
1931	}
1932
1933	/*
1934	* Post handler after qgroup_trace_extent_nolock().
1935	*
1936	* NOTE: Current qgroup does the expensive backref walk at transaction
1937	* committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
1938	* new transaction.
1939	* This is designed to allow btrfs_find_all_roots() to get correct new_roots
1940	* result.
1941	*
1942	* However for old_roots there is no need to do backref walk at that time,
1943	* since we search commit roots to walk backref and result will always be
1944	* correct.
1945	*
1946	* Due to the nature of no lock version, we can't do backref there.
1947	* So we must call btrfs_qgroup_trace_extent_post() after exiting
1948	* spinlock context.
1949	*
1950	* TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
1951	* using current root, then we can move all expensive backref walk out of
1952	* transaction committing, but not now as qgroup accounting will be wrong again.
1953	*/
1954	int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
1955	struct btrfs_qgroup_extent_record *qrecord)
1956	{
1957	struct btrfs_backref_walk_ctx ctx = { `0` };
1958	int ret;
1959
1960	if (!btrfs_qgroup_full_accounting(fs_info: trans->fs_info))
1961	return `0`;
1962	/*
1963	* We are always called in a context where we are already holding a
1964	* transaction handle. Often we are called when adding a data delayed
1965	* reference from btrfs_truncate_inode_items() (truncating or unlinking),
1966	* in which case we will be holding a write lock on extent buffer from a
1967	* subvolume tree. In this case we can't allow btrfs_find_all_roots() to
1968	* acquire fs_info->commit_root_sem, because that is a higher level lock
1969	* that must be acquired before locking any extent buffers.
1970	*
1971	* So we want btrfs_find_all_roots() to not acquire the commit_root_sem
1972	* but we can't pass it a non-NULL transaction handle, because otherwise
1973	* it would not use commit roots and would lock extent buffers, causing
1974	* a deadlock if it ends up trying to read lock the same extent buffer
1975	* that was previously write locked at btrfs_truncate_inode_items().
1976	*
1977	* So pass a NULL transaction handle to btrfs_find_all_roots() and
1978	* explicitly tell it to not acquire the commit_root_sem - if we are
1979	* holding a transaction handle we don't need its protection.
1980	*/
1981	ASSERT(trans != NULL);
1982
1983	if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
1984	return `0`;
1985
1986	ctx.bytenr = qrecord->bytenr;
1987	ctx.fs_info = trans->fs_info;
1988
1989	ret = btrfs_find_all_roots(ctx: &ctx, skip_commit_root_sem: true);
1990	if (ret < `0`) {
1991	qgroup_mark_inconsistent(fs_info: trans->fs_info);
1992	btrfs_warn(trans->fs_info,
1993	"error accounting new delayed refs extent (err code: %d), quota inconsistent",
1994	ret);
1995	return `0`;
1996	}
1997
1998	/*
1999	* Here we don't need to get the lock of
2000	* trans->transaction->delayed_refs, since inserted qrecord won't
2001	* be deleted, only qrecord->node may be modified (new qrecord insert)
2002	*
2003	* So modifying qrecord->old_roots is safe here
2004	*/
2005	qrecord->old_roots = ctx.roots;
2006	return `0`;
2007	}
2008
2009	/*
2010	* Inform qgroup to trace one dirty extent, specified by @bytenr and
2011	* @num_bytes.
2012	* So qgroup can account it at commit trans time.
2013	*
2014	* Better encapsulated version, with memory allocation and backref walk for
2015	* commit roots.
2016	* So this can sleep.
2017	*
2018	* Return 0 if the operation is done.
2019	* Return <0 for error, like memory allocation failure or invalid parameter
2020	* (NULL trans)
2021	*/
2022	int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2023	u64 num_bytes)
2024	{
2025	struct btrfs_fs_info *fs_info = trans->fs_info;
2026	struct btrfs_qgroup_extent_record *record;
2027	struct btrfs_delayed_ref_root *delayed_refs;
2028	int ret;
2029
2030	if (!btrfs_qgroup_full_accounting(fs_info) \|\| bytenr == `0` \|\| num_bytes == `0`)
2031	return `0`;
2032	record = kzalloc(size: sizeof(*record), GFP_NOFS);
2033	if (!record)
2034	return -ENOMEM;
2035
2036	delayed_refs = &trans->transaction->delayed_refs;
2037	record->bytenr = bytenr;
2038	record->num_bytes = num_bytes;
2039	record->old_roots = NULL;
2040
2041	spin_lock(lock: &delayed_refs->lock);
2042	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
2043	spin_unlock(lock: &delayed_refs->lock);
2044	if (ret > `0`) {
2045	kfree(objp: record);
2046	return `0`;
2047	}
2048	return btrfs_qgroup_trace_extent_post(trans, qrecord: record);
2049	}
2050
2051	/*
2052	* Inform qgroup to trace all leaf items of data
2053	*
2054	* Return 0 for success
2055	* Return <0 for error(ENOMEM)
2056	*/
2057	int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
2058	struct extent_buffer *eb)
2059	{
2060	struct btrfs_fs_info *fs_info = trans->fs_info;
2061	int nr = btrfs_header_nritems(eb);
2062	int i, extent_type, ret;
2063	struct btrfs_key key;
2064	struct btrfs_file_extent_item *fi;
2065	u64 bytenr, num_bytes;
2066
2067	/ We can be called directly from walk_up_proc() /
2068	if (!btrfs_qgroup_full_accounting(fs_info))
2069	return `0`;
2070
2071	for (i = `0`; i < nr; i++) {
2072	btrfs_item_key_to_cpu(eb, cpu_key: &key, nr: i);
2073
2074	if (key.type != BTRFS_EXTENT_DATA_KEY)
2075	continue;
2076
2077	fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2078	/ filter out non qgroup-accountable extents /
2079	extent_type = btrfs_file_extent_type(eb, s: fi);
2080
2081	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
2082	continue;
2083
2084	bytenr = btrfs_file_extent_disk_bytenr(eb, s: fi);
2085	if (!bytenr)
2086	continue;
2087
2088	num_bytes = btrfs_file_extent_disk_num_bytes(eb, s: fi);
2089
2090	ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes);
2091	if (ret)
2092	return ret;
2093	}
2094	cond_resched();
2095	return `0`;
2096	}
2097
2098	/*
2099	* Walk up the tree from the bottom, freeing leaves and any interior
2100	* nodes which have had all slots visited. If a node (leaf or
2101	* interior) is freed, the node above it will have it's slot
2102	* incremented. The root node will never be freed.
2103	*
2104	* At the end of this function, we should have a path which has all
2105	* slots incremented to the next position for a search. If we need to
2106	* read a new node it will be NULL and the node above it will have the
2107	* correct slot selected for a later read.
2108	*
2109	* If we increment the root nodes slot counter past the number of
2110	* elements, 1 is returned to signal completion of the search.
2111	*/
2112	static int adjust_slots_upwards(struct btrfs_path path, int* root_level)
2113	{
2114	int level = `0`;
2115	int nr, slot;
2116	struct extent_buffer *eb;
2117
2118	if (root_level == `0`)
2119	return `1`;
2120
2121	while (level <= root_level) {
2122	eb = path->nodes[level];
2123	nr = btrfs_header_nritems(eb);
2124	path->slots[level]++;
2125	slot = path->slots[level];
2126	if (slot >= nr \|\| level == `0`) {
2127	/*
2128	* Don't free the root - we will detect this
2129	* condition after our loop and return a
2130	* positive value for caller to stop walking the tree.
2131	*/
2132	if (level != root_level) {
2133	btrfs_tree_unlock_rw(eb, rw: path->locks[level]);
2134	path->locks[level] = `0`;
2135
2136	free_extent_buffer(eb);
2137	path->nodes[level] = NULL;
2138	path->slots[level] = `0`;
2139	}
2140	} else {
2141	/*
2142	* We have a valid slot to walk back down
2143	* from. Stop here so caller can process these
2144	* new nodes.
2145	*/
2146	break;
2147	}
2148
2149	level++;
2150	}
2151
2152	eb = path->nodes[root_level];
2153	if (path->slots[root_level] >= btrfs_header_nritems(eb))
2154	return `1`;
2155
2156	return `0`;
2157	}
2158
2159	/*
2160	* Helper function to trace a subtree tree block swap.
2161	*
2162	* The swap will happen in highest tree block, but there may be a lot of
2163	* tree blocks involved.
2164	*
2165	* For example:
2166	* OO = Old tree blocks
2167	* NN = New tree blocks allocated during balance
2168	*
2169	* File tree (257) Reloc tree for 257
2170	* L2 OO NN
2171	* / \ / \
2172	* L1 OO OO (a) OO NN (a)
2173	* / \ / \ / \ / \
2174	* L0 OO OO OO OO OO OO NN NN
2175	* (b) (c) (b) (c)
2176	*
2177	* When calling qgroup_trace_extent_swap(), we will pass:
2178	* @src_eb = OO(a)
2179	* @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
2180	* @dst_level = 0
2181	* @root_level = 1
2182	*
2183	* In that case, qgroup_trace_extent_swap() will search from OO(a) to
2184	* reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
2185	*
2186	* The main work of qgroup_trace_extent_swap() can be split into 3 parts:
2187	*
2188	* 1) Tree search from @src_eb
2189	* It should acts as a simplified btrfs_search_slot().
2190	* The key for search can be extracted from @dst_path->nodes[dst_level]
2191	* (first key).
2192	*
2193	* 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
2194	* NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
2195	* They should be marked during previous (@dst_level = 1) iteration.
2196	*
2197	* 3) Mark file extents in leaves dirty
2198	* We don't have good way to pick out new file extents only.
2199	* So we still follow the old method by scanning all file extents in
2200	* the leave.
2201	*
2202	* This function can free us from keeping two paths, thus later we only need
2203	* to care about how to iterate all new tree blocks in reloc tree.
2204	*/
2205	static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
2206	struct extent_buffer *src_eb,
2207	struct btrfs_path *dst_path,
2208	int dst_level, int root_level,
2209	bool trace_leaf)
2210	{
2211	struct btrfs_key key;
2212	struct btrfs_path *src_path;
2213	struct btrfs_fs_info *fs_info = trans->fs_info;
2214	u32 nodesize = fs_info->nodesize;
2215	int cur_level = root_level;
2216	int ret;
2217
2218	BUG_ON(dst_level > root_level);
2219	/ Level mismatch /
2220	if (btrfs_header_level(eb: src_eb) != root_level)
2221	return -EINVAL;
2222
2223	src_path = btrfs_alloc_path();
2224	if (!src_path) {
2225	ret = -ENOMEM;
2226	goto out;
2227	}
2228
2229	if (dst_level)
2230	btrfs_node_key_to_cpu(eb: dst_path->nodes[dst_level], cpu_key: &key, nr: `0`);
2231	else
2232	btrfs_item_key_to_cpu(eb: dst_path->nodes[dst_level], cpu_key: &key, nr: `0`);
2233
2234	/ For src_path /
2235	atomic_inc(v: &src_eb->refs);
2236	src_path->nodes[root_level] = src_eb;
2237	src_path->slots[root_level] = dst_path->slots[root_level];
2238	src_path->locks[root_level] = `0`;
2239
2240	/ A simplified version of btrfs_search_slot() /
2241	while (cur_level >= dst_level) {
2242	struct btrfs_key src_key;
2243	struct btrfs_key dst_key;
2244
2245	if (src_path->nodes[cur_level] == NULL) {
2246	struct extent_buffer *eb;
2247	int parent_slot;
2248
2249	eb = src_path->nodes[cur_level + `1`];
2250	parent_slot = src_path->slots[cur_level + `1`];
2251
2252	eb = btrfs_read_node_slot(parent: eb, slot: parent_slot);
2253	if (IS_ERR(ptr: eb)) {
2254	ret = PTR_ERR(ptr: eb);
2255	goto out;
2256	}
2257
2258	src_path->nodes[cur_level] = eb;
2259
2260	btrfs_tree_read_lock(eb);
2261	src_path->locks[cur_level] = BTRFS_READ_LOCK;
2262	}
2263
2264	src_path->slots[cur_level] = dst_path->slots[cur_level];
2265	if (cur_level) {
2266	btrfs_node_key_to_cpu(eb: dst_path->nodes[cur_level],
2267	cpu_key: &dst_key, nr: dst_path->slots[cur_level]);
2268	btrfs_node_key_to_cpu(eb: src_path->nodes[cur_level],
2269	cpu_key: &src_key, nr: src_path->slots[cur_level]);
2270	} else {
2271	btrfs_item_key_to_cpu(eb: dst_path->nodes[cur_level],
2272	cpu_key: &dst_key, nr: dst_path->slots[cur_level]);
2273	btrfs_item_key_to_cpu(eb: src_path->nodes[cur_level],
2274	cpu_key: &src_key, nr: src_path->slots[cur_level]);
2275	}
2276	/ Content mismatch, something went wrong /
2277	if (btrfs_comp_cpu_keys(k1: &dst_key, k2: &src_key)) {
2278	ret = -ENOENT;
2279	goto out;
2280	}
2281	cur_level--;
2282	}
2283
2284	/*
2285	* Now both @dst_path and @src_path have been populated, record the tree
2286	* blocks for qgroup accounting.
2287	*/
2288	ret = btrfs_qgroup_trace_extent(trans, bytenr: src_path->nodes[dst_level]->start,
2289	num_bytes: nodesize);
2290	if (ret < `0`)
2291	goto out;
2292	ret = btrfs_qgroup_trace_extent(trans, bytenr: dst_path->nodes[dst_level]->start,
2293	num_bytes: nodesize);
2294	if (ret < `0`)
2295	goto out;
2296
2297	/ Record leaf file extents /
2298	if (dst_level == `0` && trace_leaf) {
2299	ret = btrfs_qgroup_trace_leaf_items(trans, eb: src_path->nodes[`0`]);
2300	if (ret < `0`)
2301	goto out;
2302	ret = btrfs_qgroup_trace_leaf_items(trans, eb: dst_path->nodes[`0`]);
2303	}
2304	out:
2305	btrfs_free_path(p: src_path);
2306	return ret;
2307	}
2308
2309	/*
2310	* Helper function to do recursive generation-aware depth-first search, to
2311	* locate all new tree blocks in a subtree of reloc tree.
2312	*
2313	* E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2314	* reloc tree
2315	* L2 NN (a)
2316	* / \
2317	* L1 OO NN (b)
2318	* / \ / \
2319	* L0 OO OO OO NN
2320	* (c) (d)
2321	* If we pass:
2322	* @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2323	* @cur_level = 1
2324	* @root_level = 1
2325	*
2326	* We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
2327	* above tree blocks along with their counter parts in file tree.
2328	* While during search, old tree blocks OO(c) will be skipped as tree block swap
2329	* won't affect OO(c).
2330	*/
2331	static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
2332	struct extent_buffer *src_eb,
2333	struct btrfs_path *dst_path,
2334	int cur_level, int root_level,
2335	u64 last_snapshot, bool trace_leaf)
2336	{
2337	struct btrfs_fs_info *fs_info = trans->fs_info;
2338	struct extent_buffer *eb;
2339	bool need_cleanup = false;
2340	int ret = `0`;
2341	int i;
2342
2343	/ Level sanity check /
2344	if (cur_level < `0` \|\| cur_level >= BTRFS_MAX_LEVEL - `1` \|\|
2345	root_level < `0` \|\| root_level >= BTRFS_MAX_LEVEL - `1` \|\|
2346	root_level < cur_level) {
2347	btrfs_err_rl(fs_info,
2348	"%s: bad levels, cur_level=%d root_level=%d",
2349	__func__, cur_level, root_level);
2350	return -EUCLEAN;
2351	}
2352
2353	/ Read the tree block if needed /
2354	if (dst_path->nodes[cur_level] == NULL) {
2355	int parent_slot;
2356	u64 child_gen;
2357
2358	/*
2359	* dst_path->nodes[root_level] must be initialized before
2360	* calling this function.
2361	*/
2362	if (cur_level == root_level) {
2363	btrfs_err_rl(fs_info,
2364	"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2365	__func__, root_level, root_level, cur_level);
2366	return -EUCLEAN;
2367	}
2368
2369	/*
2370	* We need to get child blockptr/gen from parent before we can
2371	* read it.
2372	*/
2373	eb = dst_path->nodes[cur_level + `1`];
2374	parent_slot = dst_path->slots[cur_level + `1`];
2375	child_gen = btrfs_node_ptr_generation(eb, nr: parent_slot);
2376
2377	/ This node is old, no need to trace /
2378	if (child_gen < last_snapshot)
2379	goto out;
2380
2381	eb = btrfs_read_node_slot(parent: eb, slot: parent_slot);
2382	if (IS_ERR(ptr: eb)) {
2383	ret = PTR_ERR(ptr: eb);
2384	goto out;
2385	}
2386
2387	dst_path->nodes[cur_level] = eb;
2388	dst_path->slots[cur_level] = `0`;
2389
2390	btrfs_tree_read_lock(eb);
2391	dst_path->locks[cur_level] = BTRFS_READ_LOCK;
2392	need_cleanup = true;
2393	}
2394
2395	/ Now record this tree block and its counter part for qgroups /
2396	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, dst_level: cur_level,
2397	root_level, trace_leaf);
2398	if (ret < `0`)
2399	goto cleanup;
2400
2401	eb = dst_path->nodes[cur_level];
2402
2403	if (cur_level > `0`) {
2404	/ Iterate all child tree blocks /
2405	for (i = `0`; i < btrfs_header_nritems(eb); i++) {
2406	/ Skip old tree blocks as they won't be swapped /
2407	if (btrfs_node_ptr_generation(eb, nr: i) < last_snapshot)
2408	continue;
2409	dst_path->slots[cur_level] = i;
2410
2411	/ Recursive call (at most 7 times) /
2412	ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2413	dst_path, cur_level: cur_level - `1`, root_level,
2414	last_snapshot, trace_leaf);
2415	if (ret < `0`)
2416	goto cleanup;
2417	}
2418	}
2419
2420	cleanup:
2421	if (need_cleanup) {
2422	/ Clean up /
2423	btrfs_tree_unlock_rw(eb: dst_path->nodes[cur_level],
2424	rw: dst_path->locks[cur_level]);
2425	free_extent_buffer(eb: dst_path->nodes[cur_level]);
2426	dst_path->nodes[cur_level] = NULL;
2427	dst_path->slots[cur_level] = `0`;
2428	dst_path->locks[cur_level] = `0`;
2429	}
2430	out:
2431	return ret;
2432	}
2433
2434	static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2435	struct extent_buffer *src_eb,
2436	struct extent_buffer *dst_eb,
2437	u64 last_snapshot, bool trace_leaf)
2438	{
2439	struct btrfs_fs_info *fs_info = trans->fs_info;
2440	struct btrfs_path *dst_path = NULL;
2441	int level;
2442	int ret;
2443
2444	if (!btrfs_qgroup_full_accounting(fs_info))
2445	return `0`;
2446
2447	/ Wrong parameter order /
2448	if (btrfs_header_generation(eb: src_eb) > btrfs_header_generation(eb: dst_eb)) {
2449	btrfs_err_rl(fs_info,
2450	"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2451	btrfs_header_generation(src_eb),
2452	btrfs_header_generation(dst_eb));
2453	return -EUCLEAN;
2454	}
2455
2456	if (!extent_buffer_uptodate(eb: src_eb) \|\| !extent_buffer_uptodate(eb: dst_eb)) {
2457	ret = -EIO;
2458	goto out;
2459	}
2460
2461	level = btrfs_header_level(eb: dst_eb);
2462	dst_path = btrfs_alloc_path();
2463	if (!dst_path) {
2464	ret = -ENOMEM;
2465	goto out;
2466	}
2467	/ For dst_path /
2468	atomic_inc(v: &dst_eb->refs);
2469	dst_path->nodes[level] = dst_eb;
2470	dst_path->slots[level] = `0`;
2471	dst_path->locks[level] = `0`;
2472
2473	/ Do the generation aware breadth-first search /
2474	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, cur_level: level,
2475	root_level: level, last_snapshot, trace_leaf);
2476	if (ret < `0`)
2477	goto out;
2478	ret = `0`;
2479
2480	out:
2481	btrfs_free_path(p: dst_path);
2482	if (ret < `0`)
2483	qgroup_mark_inconsistent(fs_info);
2484	return ret;
2485	}
2486
2487	/*
2488	* Inform qgroup to trace a whole subtree, including all its child tree
2489	* blocks and data.
2490	* The root tree block is specified by @root_eb.
2491	*
2492	* Normally used by relocation(tree block swap) and subvolume deletion.
2493	*
2494	* Return 0 for success
2495	* Return <0 for error(ENOMEM or tree search error)
2496	*/
2497	int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2498	struct extent_buffer *root_eb,
2499	u64 root_gen, int root_level)
2500	{
2501	struct btrfs_fs_info *fs_info = trans->fs_info;
2502	int ret = `0`;
2503	int level;
2504	u8 drop_subptree_thres;
2505	struct extent_buffer *eb = root_eb;
2506	struct btrfs_path *path = NULL;
2507
2508	ASSERT(`0` <= root_level && root_level < BTRFS_MAX_LEVEL);
2509	ASSERT(root_eb != NULL);
2510
2511	if (!btrfs_qgroup_full_accounting(fs_info))
2512	return `0`;
2513
2514	spin_lock(lock: &fs_info->qgroup_lock);
2515	drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
2516	spin_unlock(lock: &fs_info->qgroup_lock);
2517
2518	/*
2519	* This function only gets called for snapshot drop, if we hit a high
2520	* node here, it means we are going to change ownership for quite a lot
2521	* of extents, which will greatly slow down btrfs_commit_transaction().
2522	*
2523	* So here if we find a high tree here, we just skip the accounting and
2524	* mark qgroup inconsistent.
2525	*/
2526	if (root_level >= drop_subptree_thres) {
2527	qgroup_mark_inconsistent(fs_info);
2528	return `0`;
2529	}
2530
2531	if (!extent_buffer_uptodate(eb: root_eb)) {
2532	struct btrfs_tree_parent_check check = {
2533	.has_first_key = false,
2534	.transid = root_gen,
2535	.level = root_level
2536	};
2537
2538	ret = btrfs_read_extent_buffer(buf: root_eb, check: &check);
2539	if (ret)
2540	goto out;
2541	}
2542
2543	if (root_level == `0`) {
2544	ret = btrfs_qgroup_trace_leaf_items(trans, eb: root_eb);
2545	goto out;
2546	}
2547
2548	path = btrfs_alloc_path();
2549	if (!path)
2550	return -ENOMEM;
2551
2552	/*
2553	* Walk down the tree. Missing extent blocks are filled in as
2554	* we go. Metadata is accounted every time we read a new
2555	* extent block.
2556	*
2557	* When we reach a leaf, we account for file extent items in it,
2558	* walk back up the tree (adjusting slot pointers as we go)
2559	* and restart the search process.
2560	*/
2561	atomic_inc(v: &root_eb->refs); / For path /
2562	path->nodes[root_level] = root_eb;
2563	path->slots[root_level] = `0`;
2564	path->locks[root_level] = `0`; / so release_path doesn't try to unlock /
2565	walk_down:
2566	level = root_level;
2567	while (level >= `0`) {
2568	if (path->nodes[level] == NULL) {
2569	int parent_slot;
2570	u64 child_bytenr;
2571
2572	/*
2573	* We need to get child blockptr from parent before we
2574	* can read it.
2575	*/
2576	eb = path->nodes[level + `1`];
2577	parent_slot = path->slots[level + `1`];
2578	child_bytenr = btrfs_node_blockptr(eb, nr: parent_slot);
2579
2580	eb = btrfs_read_node_slot(parent: eb, slot: parent_slot);
2581	if (IS_ERR(ptr: eb)) {
2582	ret = PTR_ERR(ptr: eb);
2583	goto out;
2584	}
2585
2586	path->nodes[level] = eb;
2587	path->slots[level] = `0`;
2588
2589	btrfs_tree_read_lock(eb);
2590	path->locks[level] = BTRFS_READ_LOCK;
2591
2592	ret = btrfs_qgroup_trace_extent(trans, bytenr: child_bytenr,
2593	num_bytes: fs_info->nodesize);
2594	if (ret)
2595	goto out;
2596	}
2597
2598	if (level == `0`) {
2599	ret = btrfs_qgroup_trace_leaf_items(trans,
2600	eb: path->nodes[level]);
2601	if (ret)
2602	goto out;
2603
2604	/ Nonzero return here means we completed our search /
2605	ret = adjust_slots_upwards(path, root_level);
2606	if (ret)
2607	break;
2608
2609	/ Restart search with new slots /
2610	goto walk_down;
2611	}
2612
2613	level--;
2614	}
2615
2616	ret = `0`;
2617	out:
2618	btrfs_free_path(p: path);
2619
2620	return ret;
2621	}
2622
2623	static void qgroup_iterator_nested_add(struct list_head head, struct* btrfs_qgroup *qgroup)
2624	{
2625	if (!list_empty(head: &qgroup->nested_iterator))
2626	return;
2627
2628	list_add_tail(new: &qgroup->nested_iterator, head);
2629	}
2630
2631	static void qgroup_iterator_nested_clean(struct list_head *head)
2632	{
2633	while (!list_empty(head)) {
2634	struct btrfs_qgroup *qgroup;
2635
2636	qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
2637	list_del_init(entry: &qgroup->nested_iterator);
2638	}
2639	}
2640
2641	#define UPDATE_NEW 0
2642	#define UPDATE_OLD 1
2643	/*
2644	* Walk all of the roots that points to the bytenr and adjust their refcnts.
2645	*/
2646	static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2647	struct ulist roots, struct* list_head *qgroups,
2648	u64 seq, int update_old)
2649	{
2650	struct ulist_node *unode;
2651	struct ulist_iterator uiter;
2652	struct btrfs_qgroup *qg;
2653
2654	if (!roots)
2655	return;
2656	ULIST_ITER_INIT(&uiter);
2657	while ((unode = ulist_next(ulist: roots, uiter: &uiter))) {
2658	LIST_HEAD(tmp);
2659
2660	qg = find_qgroup_rb(fs_info, qgroupid: unode->val);
2661	if (!qg)
2662	continue;
2663
2664	qgroup_iterator_nested_add(head: qgroups, qgroup: qg);
2665	qgroup_iterator_add(head: &tmp, qgroup: qg);
2666	list_for_each_entry(qg, &tmp, iterator) {
2667	struct btrfs_qgroup_list *glist;
2668
2669	if (update_old)
2670	btrfs_qgroup_update_old_refcnt(qg, seq, mod: `1`);
2671	else
2672	btrfs_qgroup_update_new_refcnt(qg, seq, mod: `1`);
2673
2674	list_for_each_entry(glist, &qg->groups, next_group) {
2675	qgroup_iterator_nested_add(head: qgroups, qgroup: glist->group);
2676	qgroup_iterator_add(head: &tmp, qgroup: glist->group);
2677	}
2678	}
2679	qgroup_iterator_clean(head: &tmp);
2680	}
2681	}
2682
2683	/*
2684	* Update qgroup rfer/excl counters.
2685	* Rfer update is easy, codes can explain themselves.
2686	*
2687	* Excl update is tricky, the update is split into 2 parts.
2688	* Part 1: Possible exclusive <-> sharing detect:
2689	* \| A \| !A \|
2690	* -------------------------------------
2691	* B \| * \| - \|
2692	* -------------------------------------
2693	* !B \| + \| ** \|
2694	* -------------------------------------
2695	*
2696	* Conditions:
2697	* A: cur_old_roots < nr_old_roots (not exclusive before)
2698	* !A: cur_old_roots == nr_old_roots (possible exclusive before)
2699	* B: cur_new_roots < nr_new_roots (not exclusive now)
2700	* !B: cur_new_roots == nr_new_roots (possible exclusive now)
2701	*
2702	* Results:
2703	* +: Possible sharing -> exclusive -: Possible exclusive -> sharing
2704	* : Definitely not changed. *: Possible unchanged.
2705	*
2706	* For !A and !B condition, the exception is cur_old/new_roots == 0 case.
2707	*
2708	* To make the logic clear, we first use condition A and B to split
2709	* combination into 4 results.
2710	*
2711	* Then, for result "+" and "-", check old/new_roots == 0 case, as in them
2712	* only on variant maybe 0.
2713	*
2714	* Lastly, check result **, since there are 2 variants maybe 0, split them
2715	* again(2x2).
2716	* But this time we don't need to consider other things, the codes and logic
2717	* is easy to understand now.
2718	*/
2719	static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
2720	struct list_head *qgroups, u64 nr_old_roots,
2721	u64 nr_new_roots, u64 num_bytes, u64 seq)
2722	{
2723	struct btrfs_qgroup *qg;
2724
2725	list_for_each_entry(qg, qgroups, nested_iterator) {
2726	u64 cur_new_count, cur_old_count;
2727	bool dirty = false;
2728
2729	cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2730	cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2731
2732	trace_qgroup_update_counters(fs_info, qgroup: qg, cur_old_count,
2733	cur_new_count);
2734
2735	/ Rfer update part /
2736	if (cur_old_count == `0` && cur_new_count > `0`) {
2737	qg->rfer += num_bytes;
2738	qg->rfer_cmpr += num_bytes;
2739	dirty = true;
2740	}
2741	if (cur_old_count > `0` && cur_new_count == `0`) {
2742	qg->rfer -= num_bytes;
2743	qg->rfer_cmpr -= num_bytes;
2744	dirty = true;
2745	}
2746
2747	/ Excl update part /
2748	/ Exclusive/none -> shared case /
2749	if (cur_old_count == nr_old_roots &&
2750	cur_new_count < nr_new_roots) {
2751	/ Exclusive -> shared /
2752	if (cur_old_count != `0`) {
2753	qg->excl -= num_bytes;
2754	qg->excl_cmpr -= num_bytes;
2755	dirty = true;
2756	}
2757	}
2758
2759	/ Shared -> exclusive/none case /
2760	if (cur_old_count < nr_old_roots &&
2761	cur_new_count == nr_new_roots) {
2762	/ Shared->exclusive /
2763	if (cur_new_count != `0`) {
2764	qg->excl += num_bytes;
2765	qg->excl_cmpr += num_bytes;
2766	dirty = true;
2767	}
2768	}
2769
2770	/ Exclusive/none -> exclusive/none case /
2771	if (cur_old_count == nr_old_roots &&
2772	cur_new_count == nr_new_roots) {
2773	if (cur_old_count == `0`) {
2774	/ None -> exclusive/none /
2775
2776	if (cur_new_count != `0`) {
2777	/ None -> exclusive /
2778	qg->excl += num_bytes;
2779	qg->excl_cmpr += num_bytes;
2780	dirty = true;
2781	}
2782	/ None -> none, nothing changed /
2783	} else {
2784	/ Exclusive -> exclusive/none /
2785
2786	if (cur_new_count == `0`) {
2787	/ Exclusive -> none /
2788	qg->excl -= num_bytes;
2789	qg->excl_cmpr -= num_bytes;
2790	dirty = true;
2791	}
2792	/ Exclusive -> exclusive, nothing changed /
2793	}
2794	}
2795
2796	if (dirty)
2797	qgroup_dirty(fs_info, qgroup: qg);
2798	}
2799	}
2800
2801	/*
2802	* Check if the @roots potentially is a list of fs tree roots
2803	*
2804	* Return 0 for definitely not a fs/subvol tree roots ulist
2805	* Return 1 for possible fs/subvol tree roots in the list (considering an empty
2806	* one as well)
2807	*/
2808	static int maybe_fs_roots(struct ulist *roots)
2809	{
2810	struct ulist_node *unode;
2811	struct ulist_iterator uiter;
2812
2813	/ Empty one, still possible for fs roots /
2814	if (!roots \|\| roots->nnodes == `0`)
2815	return `1`;
2816
2817	ULIST_ITER_INIT(&uiter);
2818	unode = ulist_next(ulist: roots, uiter: &uiter);
2819	if (!unode)
2820	return `1`;
2821
2822	/*
2823	* If it contains fs tree roots, then it must belong to fs/subvol
2824	* trees.
2825	* If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2826	*/
2827	return is_fstree(rootid: unode->val);
2828	}
2829
2830	int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2831	u64 num_bytes, struct ulist *old_roots,
2832	struct ulist *new_roots)
2833	{
2834	struct btrfs_fs_info *fs_info = trans->fs_info;
2835	LIST_HEAD(qgroups);
2836	u64 seq;
2837	u64 nr_new_roots = `0`;
2838	u64 nr_old_roots = `0`;
2839	int ret = `0`;
2840
2841	/*
2842	* If quotas get disabled meanwhile, the resources need to be freed and
2843	* we can't just exit here.
2844	*/
2845	if (!btrfs_qgroup_full_accounting(fs_info) \|\|
2846	fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
2847	goto out_free;
2848
2849	if (new_roots) {
2850	if (!maybe_fs_roots(roots: new_roots))
2851	goto out_free;
2852	nr_new_roots = new_roots->nnodes;
2853	}
2854	if (old_roots) {
2855	if (!maybe_fs_roots(roots: old_roots))
2856	goto out_free;
2857	nr_old_roots = old_roots->nnodes;
2858	}
2859
2860	/ Quick exit, either not fs tree roots, or won't affect any qgroup /
2861	if (nr_old_roots == `0` && nr_new_roots == `0`)
2862	goto out_free;
2863
2864	trace_btrfs_qgroup_account_extent(fs_info, transid: trans->transid, bytenr,
2865	num_bytes, nr_old_roots, nr_new_roots);
2866
2867	mutex_lock(&fs_info->qgroup_rescan_lock);
2868	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2869	if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2870	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
2871	ret = `0`;
2872	goto out_free;
2873	}
2874	}
2875	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
2876
2877	spin_lock(lock: &fs_info->qgroup_lock);
2878	seq = fs_info->qgroup_seq;
2879
2880	/ Update old refcnts using old_roots /
2881	qgroup_update_refcnt(fs_info, roots: old_roots, qgroups: &qgroups, seq, UPDATE_OLD);
2882
2883	/ Update new refcnts using new_roots /
2884	qgroup_update_refcnt(fs_info, roots: new_roots, qgroups: &qgroups, seq, UPDATE_NEW);
2885
2886	qgroup_update_counters(fs_info, qgroups: &qgroups, nr_old_roots, nr_new_roots,
2887	num_bytes, seq);
2888
2889	/*
2890	* We're done using the iterator, release all its qgroups while holding
2891	* fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
2892	* and trigger use-after-free accesses to qgroups.
2893	*/
2894	qgroup_iterator_nested_clean(head: &qgroups);
2895
2896	/*
2897	* Bump qgroup_seq to avoid seq overlap
2898	*/
2899	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + `1`;
2900	spin_unlock(lock: &fs_info->qgroup_lock);
2901	out_free:
2902	ulist_free(ulist: old_roots);
2903	ulist_free(ulist: new_roots);
2904	return ret;
2905	}
2906
2907	int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2908	{
2909	struct btrfs_fs_info *fs_info = trans->fs_info;
2910	struct btrfs_qgroup_extent_record *record;
2911	struct btrfs_delayed_ref_root *delayed_refs;
2912	struct ulist *new_roots = NULL;
2913	struct rb_node *node;
2914	u64 num_dirty_extents = `0`;
2915	u64 qgroup_to_skip;
2916	int ret = `0`;
2917
2918	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
2919	return `0`;
2920
2921	delayed_refs = &trans->transaction->delayed_refs;
2922	qgroup_to_skip = delayed_refs->qgroup_to_skip;
2923	while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2924	record = rb_entry(node, struct btrfs_qgroup_extent_record,
2925	node);
2926
2927	num_dirty_extents++;
2928	trace_btrfs_qgroup_account_extents(fs_info, rec: record);
2929
2930	if (!ret && !(fs_info->qgroup_flags &
2931	BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
2932	struct btrfs_backref_walk_ctx ctx = { `0` };
2933
2934	ctx.bytenr = record->bytenr;
2935	ctx.fs_info = fs_info;
2936
2937	/*
2938	* Old roots should be searched when inserting qgroup
2939	* extent record.
2940	*
2941	* But for INCONSISTENT (NO_ACCOUNTING) -> rescan case,
2942	* we may have some record inserted during
2943	* NO_ACCOUNTING (thus no old_roots populated), but
2944	* later we start rescan, which clears NO_ACCOUNTING,
2945	* leaving some inserted records without old_roots
2946	* populated.
2947	*
2948	* Those cases are rare and should not cause too much
2949	* time spent during commit_transaction().
2950	*/
2951	if (!record->old_roots) {
2952	/ Search commit root to find old_roots /
2953	ret = btrfs_find_all_roots(ctx: &ctx, skip_commit_root_sem: false);
2954	if (ret < `0`)
2955	goto cleanup;
2956	record->old_roots = ctx.roots;
2957	ctx.roots = NULL;
2958	}
2959
2960	/*
2961	* Use BTRFS_SEQ_LAST as time_seq to do special search,
2962	* which doesn't lock tree or delayed_refs and search
2963	* current root. It's safe inside commit_transaction().
2964	*/
2965	ctx.trans = trans;
2966	ctx.time_seq = BTRFS_SEQ_LAST;
2967	ret = btrfs_find_all_roots(ctx: &ctx, skip_commit_root_sem: false);
2968	if (ret < `0`)
2969	goto cleanup;
2970	new_roots = ctx.roots;
2971	if (qgroup_to_skip) {
2972	ulist_del(ulist: new_roots, val: qgroup_to_skip, aux: `0`);
2973	ulist_del(ulist: record->old_roots, val: qgroup_to_skip,
2974	aux: `0`);
2975	}
2976	ret = btrfs_qgroup_account_extent(trans, bytenr: record->bytenr,
2977	num_bytes: record->num_bytes,
2978	old_roots: record->old_roots,
2979	new_roots);
2980	record->old_roots = NULL;
2981	new_roots = NULL;
2982	}
2983	/ Free the reserved data space /
2984	btrfs_qgroup_free_refroot(fs_info,
2985	ref_root: record->data_rsv_refroot,
2986	num_bytes: record->data_rsv,
2987	type: BTRFS_QGROUP_RSV_DATA);
2988	cleanup:
2989	ulist_free(ulist: record->old_roots);
2990	ulist_free(ulist: new_roots);
2991	new_roots = NULL;
2992	rb_erase(node, &delayed_refs->dirty_extent_root);
2993	kfree(objp: record);
2994
2995	}
2996	trace_qgroup_num_dirty_extents(fs_info, transid: trans->transid,
2997	num_dirty_extents);
2998	return ret;
2999	}
3000
3001	/*
3002	* Writes all changed qgroups to disk.
3003	* Called by the transaction commit path and the qgroup assign ioctl.
3004	*/
3005	int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
3006	{
3007	struct btrfs_fs_info *fs_info = trans->fs_info;
3008	int ret = `0`;
3009
3010	/*
3011	* In case we are called from the qgroup assign ioctl, assert that we
3012	* are holding the qgroup_ioctl_lock, otherwise we can race with a quota
3013	* disable operation (ioctl) and access a freed quota root.
3014	*/
3015	if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
3016	lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
3017
3018	if (!fs_info->quota_root)
3019	return ret;
3020
3021	spin_lock(lock: &fs_info->qgroup_lock);
3022	while (!list_empty(head: &fs_info->dirty_qgroups)) {
3023	struct btrfs_qgroup *qgroup;
3024	qgroup = list_first_entry(&fs_info->dirty_qgroups,
3025	struct btrfs_qgroup, dirty);
3026	list_del_init(entry: &qgroup->dirty);
3027	spin_unlock(lock: &fs_info->qgroup_lock);
3028	ret = update_qgroup_info_item(trans, qgroup);
3029	if (ret)
3030	qgroup_mark_inconsistent(fs_info);
3031	ret = update_qgroup_limit_item(trans, qgroup);
3032	if (ret)
3033	qgroup_mark_inconsistent(fs_info);
3034	spin_lock(lock: &fs_info->qgroup_lock);
3035	}
3036	if (btrfs_qgroup_enabled(fs_info))
3037	fs_info->qgroup_flags \|= BTRFS_QGROUP_STATUS_FLAG_ON;
3038	else
3039	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
3040	spin_unlock(lock: &fs_info->qgroup_lock);
3041
3042	ret = update_qgroup_status_item(trans);
3043	if (ret)
3044	qgroup_mark_inconsistent(fs_info);
3045
3046	return ret;
3047	}
3048
3049	int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
3050	struct btrfs_qgroup_inherit *inherit,
3051	size_t size)
3052	{
3053	if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
3054	return -EOPNOTSUPP;
3055	if (size < sizeof(*inherit) \|\| size > PAGE_SIZE)
3056	return -EINVAL;
3057
3058	/*
3059	* In the past we allowed btrfs_qgroup_inherit to specify to copy
3060	* rfer/excl numbers directly from other qgroups. This behavior has
3061	* been disabled in userspace for a very long time, but here we should
3062	* also disable it in kernel, as this behavior is known to mark qgroup
3063	* inconsistent, and a rescan would wipe out the changes anyway.
3064	*
3065	* Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies.
3066	*/
3067	if (inherit->num_ref_copies > `0` \|\| inherit->num_excl_copies > `0`)
3068	return -EINVAL;
3069
3070	if (inherit->num_qgroups > PAGE_SIZE)
3071	return -EINVAL;
3072
3073	if (size != struct_size(inherit, qgroups, inherit->num_qgroups))
3074	return -EINVAL;
3075
3076	/*
3077	* Now check all the remaining qgroups, they should all:
3078	*
3079	* - Exist
3080	* - Be higher level qgroups.
3081	*/
3082	for (int i = `0`; i < inherit->num_qgroups; i++) {
3083	struct btrfs_qgroup *qgroup;
3084	u64 qgroupid = inherit->qgroups[i];
3085
3086	if (btrfs_qgroup_level(qgroupid) == `0`)
3087	return -EINVAL;
3088
3089	spin_lock(lock: &fs_info->qgroup_lock);
3090	qgroup = find_qgroup_rb(fs_info, qgroupid);
3091	if (!qgroup) {
3092	spin_unlock(lock: &fs_info->qgroup_lock);
3093	return -ENOENT;
3094	}
3095	spin_unlock(lock: &fs_info->qgroup_lock);
3096	}
3097	return `0`;
3098	}
3099
3100	static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
3101	u64 inode_rootid,
3102	struct btrfs_qgroup_inherit **inherit)
3103	{
3104	int i = `0`;
3105	u64 num_qgroups = `0`;
3106	struct btrfs_qgroup *inode_qg;
3107	struct btrfs_qgroup_list *qg_list;
3108	struct btrfs_qgroup_inherit *res;
3109	size_t struct_sz;
3110	u64 *qgids;
3111
3112	if (*inherit)
3113	return -EEXIST;
3114
3115	inode_qg = find_qgroup_rb(fs_info, qgroupid: inode_rootid);
3116	if (!inode_qg)
3117	return -ENOENT;
3118
3119	num_qgroups = list_count_nodes(head: &inode_qg->groups);
3120
3121	if (!num_qgroups)
3122	return `0`;
3123
3124	struct_sz = struct_size(res, qgroups, num_qgroups);
3125	if (struct_sz == SIZE_MAX)
3126	return -ERANGE;
3127
3128	res = kzalloc(size: struct_sz, GFP_NOFS);
3129	if (!res)
3130	return -ENOMEM;
3131	res->num_qgroups = num_qgroups;
3132	qgids = res->qgroups;
3133
3134	list_for_each_entry(qg_list, &inode_qg->groups, next_group)
3135	qgids[i] = qg_list->group->qgroupid;
3136
3137	*inherit = res;
3138	return `0`;
3139	}
3140
3141	/*
3142	* Check if we can skip rescan when inheriting qgroups. If @src has a single
3143	* @parent, and that @parent is owning all its bytes exclusively, we can skip
3144	* the full rescan, by just adding nodesize to the @parent's excl/rfer.
3145	*
3146	* Return <0 for fatal errors (like srcid/parentid has no qgroup).
3147	* Return 0 if a quick inherit is done.
3148	* Return >0 if a quick inherit is not possible, and a full rescan is needed.
3149	*/
3150	static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
3151	u64 srcid, u64 parentid)
3152	{
3153	struct btrfs_qgroup *src;
3154	struct btrfs_qgroup *parent;
3155	struct btrfs_qgroup_list *list;
3156	int nr_parents = `0`;
3157
3158	src = find_qgroup_rb(fs_info, qgroupid: srcid);
3159	if (!src)
3160	return -ENOENT;
3161	parent = find_qgroup_rb(fs_info, qgroupid: parentid);
3162	if (!parent)
3163	return -ENOENT;
3164
3165	/*
3166	* Source has no parent qgroup, but our new qgroup would have one.
3167	* Qgroup numbers would become inconsistent.
3168	*/
3169	if (list_empty(head: &src->groups))
3170	return `1`;
3171
3172	list_for_each_entry(list, &src->groups, next_group) {
3173	/ The parent is not the same, quick update is not possible. /
3174	if (list->group->qgroupid != parentid)
3175	return `1`;
3176	nr_parents++;
3177	/*
3178	* More than one parent qgroup, we can't be sure about accounting
3179	* consistency.
3180	*/
3181	if (nr_parents > `1`)
3182	return `1`;
3183	}
3184
3185	/*
3186	* The parent is not exclusively owning all its bytes. We're not sure
3187	* if the source has any bytes not fully owned by the parent.
3188	*/
3189	if (parent->excl != parent->rfer)
3190	return `1`;
3191
3192	parent->excl += fs_info->nodesize;
3193	parent->rfer += fs_info->nodesize;
3194	return `0`;
3195	}
3196
3197	/*
3198	* Copy the accounting information between qgroups. This is necessary
3199	* when a snapshot or a subvolume is created. Throwing an error will
3200	* cause a transaction abort so we take extra care here to only error
3201	* when a readonly fs is a reasonable outcome.
3202	*/
3203	int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
3204	u64 objectid, u64 inode_rootid,
3205	struct btrfs_qgroup_inherit *inherit)
3206	{
3207	int ret = `0`;
3208	int i;
3209	u64 *i_qgroups;
3210	bool committing = false;
3211	struct btrfs_fs_info *fs_info = trans->fs_info;
3212	struct btrfs_root *quota_root;
3213	struct btrfs_qgroup *srcgroup;
3214	struct btrfs_qgroup *dstgroup;
3215	struct btrfs_qgroup *prealloc;
3216	struct btrfs_qgroup_list **qlist_prealloc = NULL;
3217	bool free_inherit = false;
3218	bool need_rescan = false;
3219	u32 level_size = `0`;
3220	u64 nums;
3221
3222	prealloc = kzalloc(size: sizeof(*prealloc), GFP_NOFS);
3223	if (!prealloc)
3224	return -ENOMEM;
3225
3226	/*
3227	* There are only two callers of this function.
3228	*
3229	* One in create_subvol() in the ioctl context, which needs to hold
3230	* the qgroup_ioctl_lock.
3231	*
3232	* The other one in create_pending_snapshot() where no other qgroup
3233	* code can modify the fs as they all need to either start a new trans
3234	* or hold a trans handler, thus we don't need to hold
3235	* qgroup_ioctl_lock.
3236	* This would avoid long and complex lock chain and make lockdep happy.
3237	*/
3238	spin_lock(lock: &fs_info->trans_lock);
3239	if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
3240	committing = true;
3241	spin_unlock(lock: &fs_info->trans_lock);
3242
3243	if (!committing)
3244	mutex_lock(&fs_info->qgroup_ioctl_lock);
3245	if (!btrfs_qgroup_enabled(fs_info))
3246	goto out;
3247
3248	quota_root = fs_info->quota_root;
3249	if (!quota_root) {
3250	ret = -EINVAL;
3251	goto out;
3252	}
3253
3254	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
3255	ret = qgroup_auto_inherit(fs_info, inode_rootid, inherit: &inherit);
3256	if (ret)
3257	goto out;
3258	free_inherit = true;
3259	}
3260
3261	if (inherit) {
3262	i_qgroups = (u64 *)(inherit + `1`);
3263	nums = inherit->num_qgroups + `2` * inherit->num_ref_copies +
3264	`2` * inherit->num_excl_copies;
3265	for (i = `0`; i < nums; ++i) {
3266	srcgroup = find_qgroup_rb(fs_info, qgroupid: *i_qgroups);
3267
3268	/*
3269	* Zero out invalid groups so we can ignore
3270	* them later.
3271	*/
3272	if (!srcgroup \|\|
3273	((srcgroup->qgroupid >> `48`) <= (objectid >> `48`)))
3274	*i_qgroups = `0ULL`;
3275
3276	++i_qgroups;
3277	}
3278	}
3279
3280	/*
3281	* create a tracking group for the subvol itself
3282	*/
3283	ret = add_qgroup_item(trans, quota_root, qgroupid: objectid);
3284	if (ret)
3285	goto out;
3286
3287	/*
3288	* add qgroup to all inherited groups
3289	*/
3290	if (inherit) {
3291	i_qgroups = (u64 *)(inherit + `1`);
3292	for (i = `0`; i < inherit->num_qgroups; ++i, ++i_qgroups) {
3293	if (*i_qgroups == `0`)
3294	continue;
3295	ret = add_qgroup_relation_item(trans, src: objectid,
3296	dst: *i_qgroups);
3297	if (ret && ret != -EEXIST)
3298	goto out;
3299	ret = add_qgroup_relation_item(trans, src: *i_qgroups,
3300	dst: objectid);
3301	if (ret && ret != -EEXIST)
3302	goto out;
3303	}
3304	ret = `0`;
3305
3306	qlist_prealloc = kcalloc(n: inherit->num_qgroups,
3307	size: sizeof(struct btrfs_qgroup_list *),
3308	GFP_NOFS);
3309	if (!qlist_prealloc) {
3310	ret = -ENOMEM;
3311	goto out;
3312	}
3313	for (int i = `0`; i < inherit->num_qgroups; i++) {
3314	qlist_prealloc[i] = kzalloc(size: sizeof(struct btrfs_qgroup_list),
3315	GFP_NOFS);
3316	if (!qlist_prealloc[i]) {
3317	ret = -ENOMEM;
3318	goto out;
3319	}
3320	}
3321	}
3322
3323	spin_lock(lock: &fs_info->qgroup_lock);
3324
3325	dstgroup = add_qgroup_rb(fs_info, prealloc, qgroupid: objectid);
3326	prealloc = NULL;
3327
3328	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
3329	dstgroup->lim_flags = inherit->lim.flags;
3330	dstgroup->max_rfer = inherit->lim.max_rfer;
3331	dstgroup->max_excl = inherit->lim.max_excl;
3332	dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
3333	dstgroup->rsv_excl = inherit->lim.rsv_excl;
3334
3335	qgroup_dirty(fs_info, qgroup: dstgroup);
3336	}
3337
3338	if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
3339	srcgroup = find_qgroup_rb(fs_info, qgroupid: srcid);
3340	if (!srcgroup)
3341	goto unlock;
3342
3343	/*
3344	* We call inherit after we clone the root in order to make sure
3345	* our counts don't go crazy, so at this point the only
3346	* difference between the two roots should be the root node.
3347	*/
3348	level_size = fs_info->nodesize;
3349	dstgroup->rfer = srcgroup->rfer;
3350	dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
3351	dstgroup->excl = level_size;
3352	dstgroup->excl_cmpr = level_size;
3353	srcgroup->excl = level_size;
3354	srcgroup->excl_cmpr = level_size;
3355
3356	/ inherit the limit info /
3357	dstgroup->lim_flags = srcgroup->lim_flags;
3358	dstgroup->max_rfer = srcgroup->max_rfer;
3359	dstgroup->max_excl = srcgroup->max_excl;
3360	dstgroup->rsv_rfer = srcgroup->rsv_rfer;
3361	dstgroup->rsv_excl = srcgroup->rsv_excl;
3362
3363	qgroup_dirty(fs_info, qgroup: dstgroup);
3364	qgroup_dirty(fs_info, qgroup: srcgroup);
3365
3366	/*
3367	* If the source qgroup has parent but the new one doesn't,
3368	* we need a full rescan.
3369	*/
3370	if (!inherit && !list_empty(head: &srcgroup->groups))
3371	need_rescan = true;
3372	}
3373
3374	if (!inherit)
3375	goto unlock;
3376
3377	i_qgroups = (u64 *)(inherit + `1`);
3378	for (i = `0`; i < inherit->num_qgroups; ++i) {
3379	if (*i_qgroups) {
3380	ret = add_relation_rb(fs_info, prealloc: qlist_prealloc[i], memberid: objectid,
3381	parentid: *i_qgroups);
3382	qlist_prealloc[i] = NULL;
3383	if (ret)
3384	goto unlock;
3385	}
3386	if (srcid) {
3387	/ Check if we can do a quick inherit. /
3388	ret = qgroup_snapshot_quick_inherit(fs_info, srcid, parentid: *i_qgroups);
3389	if (ret < `0`)
3390	goto unlock;
3391	if (ret > `0`)
3392	need_rescan = true;
3393	ret = `0`;
3394	}
3395	++i_qgroups;
3396	}
3397
3398	for (i = `0`; i < inherit->num_ref_copies; ++i, i_qgroups += `2`) {
3399	struct btrfs_qgroup *src;
3400	struct btrfs_qgroup *dst;
3401
3402	if (!i_qgroups[`0`] \|\| !i_qgroups[`1`])
3403	continue;
3404
3405	src = find_qgroup_rb(fs_info, qgroupid: i_qgroups[`0`]);
3406	dst = find_qgroup_rb(fs_info, qgroupid: i_qgroups[`1`]);
3407
3408	if (!src \|\| !dst) {
3409	ret = -EINVAL;
3410	goto unlock;
3411	}
3412
3413	dst->rfer = src->rfer - level_size;
3414	dst->rfer_cmpr = src->rfer_cmpr - level_size;
3415
3416	/ Manually tweaking numbers certainly needs a rescan /
3417	need_rescan = true;
3418	}
3419	for (i = `0`; i < inherit->num_excl_copies; ++i, i_qgroups += `2`) {
3420	struct btrfs_qgroup *src;
3421	struct btrfs_qgroup *dst;
3422
3423	if (!i_qgroups[`0`] \|\| !i_qgroups[`1`])
3424	continue;
3425
3426	src = find_qgroup_rb(fs_info, qgroupid: i_qgroups[`0`]);
3427	dst = find_qgroup_rb(fs_info, qgroupid: i_qgroups[`1`]);
3428
3429	if (!src \|\| !dst) {
3430	ret = -EINVAL;
3431	goto unlock;
3432	}
3433
3434	dst->excl = src->excl + level_size;
3435	dst->excl_cmpr = src->excl_cmpr + level_size;
3436	need_rescan = true;
3437	}
3438
3439	unlock:
3440	spin_unlock(lock: &fs_info->qgroup_lock);
3441	if (!ret)
3442	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup: dstgroup);
3443	out:
3444	if (!committing)
3445	mutex_unlock(lock: &fs_info->qgroup_ioctl_lock);
3446	if (need_rescan)
3447	qgroup_mark_inconsistent(fs_info);
3448	if (qlist_prealloc) {
3449	for (int i = `0`; i < inherit->num_qgroups; i++)
3450	kfree(objp: qlist_prealloc[i]);
3451	kfree(objp: qlist_prealloc);
3452	}
3453	if (free_inherit)
3454	kfree(objp: inherit);
3455	kfree(objp: prealloc);
3456	return ret;
3457	}
3458
3459	static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
3460	{
3461	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
3462	qgroup_rsv_total(qgroup: qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
3463	return false;
3464
3465	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
3466	qgroup_rsv_total(qgroup: qg) + (s64)qg->excl + num_bytes > qg->max_excl)
3467	return false;
3468
3469	return true;
3470	}
3471
3472	static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
3473	enum btrfs_qgroup_rsv_type type)
3474	{
3475	struct btrfs_qgroup *qgroup;
3476	struct btrfs_fs_info *fs_info = root->fs_info;
3477	u64 ref_root = root->root_key.objectid;
3478	int ret = `0`;
3479	LIST_HEAD(qgroup_list);
3480
3481	if (!is_fstree(rootid: ref_root))
3482	return `0`;
3483
3484	if (num_bytes == `0`)
3485	return `0`;
3486
3487	if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
3488	capable(CAP_SYS_RESOURCE))
3489	enforce = false;
3490
3491	spin_lock(lock: &fs_info->qgroup_lock);
3492	if (!fs_info->quota_root)
3493	goto out;
3494
3495	qgroup = find_qgroup_rb(fs_info, qgroupid: ref_root);
3496	if (!qgroup)
3497	goto out;
3498
3499	qgroup_iterator_add(head: &qgroup_list, qgroup);
3500	list_for_each_entry(qgroup, &qgroup_list, iterator) {
3501	struct btrfs_qgroup_list *glist;
3502
3503	if (enforce && !qgroup_check_limits(qg: qgroup, num_bytes)) {
3504	ret = -EDQUOT;
3505	goto out;
3506	}
3507
3508	list_for_each_entry(glist, &qgroup->groups, next_group)
3509	qgroup_iterator_add(head: &qgroup_list, qgroup: glist->group);
3510	}
3511
3512	ret = `0`;
3513	/*
3514	* no limits exceeded, now record the reservation into all qgroups
3515	*/
3516	list_for_each_entry(qgroup, &qgroup_list, iterator)
3517	qgroup_rsv_add(fs_info, qgroup, num_bytes, type);
3518
3519	out:
3520	qgroup_iterator_clean(head: &qgroup_list);
3521	spin_unlock(lock: &fs_info->qgroup_lock);
3522	return ret;
3523	}
3524
3525	/*
3526	* Free @num_bytes of reserved space with @type for qgroup. (Normally level 0
3527	* qgroup).
3528	*
3529	* Will handle all higher level qgroup too.
3530	*
3531	* NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3532	* This special case is only used for META_PERTRANS type.
3533	*/
3534	void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3535	u64 ref_root, u64 num_bytes,
3536	enum btrfs_qgroup_rsv_type type)
3537	{
3538	struct btrfs_qgroup *qgroup;
3539	LIST_HEAD(qgroup_list);
3540
3541	if (!is_fstree(rootid: ref_root))
3542	return;
3543
3544	if (num_bytes == `0`)
3545	return;
3546
3547	if (num_bytes == (u64)-`1` && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3548	WARN(`1`, "%s: Invalid type to free", __func__);
3549	return;
3550	}
3551	spin_lock(lock: &fs_info->qgroup_lock);
3552
3553	if (!fs_info->quota_root)
3554	goto out;
3555
3556	qgroup = find_qgroup_rb(fs_info, qgroupid: ref_root);
3557	if (!qgroup)
3558	goto out;
3559
3560	if (num_bytes == (u64)-`1`)
3561	/*
3562	* We're freeing all pertrans rsv, get reserved value from
3563	* level 0 qgroup as real num_bytes to free.
3564	*/
3565	num_bytes = qgroup->rsv.values[type];
3566
3567	qgroup_iterator_add(head: &qgroup_list, qgroup);
3568	list_for_each_entry(qgroup, &qgroup_list, iterator) {
3569	struct btrfs_qgroup_list *glist;
3570
3571	qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
3572	list_for_each_entry(glist, &qgroup->groups, next_group) {
3573	qgroup_iterator_add(head: &qgroup_list, qgroup: glist->group);
3574	}
3575	}
3576	out:
3577	qgroup_iterator_clean(head: &qgroup_list);
3578	spin_unlock(lock: &fs_info->qgroup_lock);
3579	}
3580
3581	/*
3582	* Check if the leaf is the last leaf. Which means all node pointers
3583	* are at their last position.
3584	*/
3585	static bool is_last_leaf(struct btrfs_path *path)
3586	{
3587	int i;
3588
3589	for (i = `1`; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3590	if (path->slots[i] != btrfs_header_nritems(eb: path->nodes[i]) - `1`)
3591	return false;
3592	}
3593	return true;
3594	}
3595
3596	/*
3597	* returns < 0 on error, 0 when more leafs are to be scanned.
3598	* returns 1 when done.
3599	*/
3600	static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3601	struct btrfs_path *path)
3602	{
3603	struct btrfs_fs_info *fs_info = trans->fs_info;
3604	struct btrfs_root *extent_root;
3605	struct btrfs_key found;
3606	struct extent_buffer *scratch_leaf = NULL;
3607	u64 num_bytes;
3608	bool done;
3609	int slot;
3610	int ret;
3611
3612	if (!btrfs_qgroup_full_accounting(fs_info))
3613	return `1`;
3614
3615	mutex_lock(&fs_info->qgroup_rescan_lock);
3616	extent_root = btrfs_extent_root(fs_info,
3617	bytenr: fs_info->qgroup_rescan_progress.objectid);
3618	ret = btrfs_search_slot_for_read(root: extent_root,
3619	key: &fs_info->qgroup_rescan_progress,
3620	p: path, find_higher: `1`, return_any: `0`);
3621
3622	btrfs_debug(fs_info,
3623	"current progress key (%llu %u %llu), search_slot ret %d",
3624	fs_info->qgroup_rescan_progress.objectid,
3625	fs_info->qgroup_rescan_progress.type,
3626	fs_info->qgroup_rescan_progress.offset, ret);
3627
3628	if (ret) {
3629	/*
3630	* The rescan is about to end, we will not be scanning any
3631	* further blocks. We cannot unset the RESCAN flag here, because
3632	* we want to commit the transaction if everything went well.
3633	* To make the live accounting work in this phase, we set our
3634	* scan progress pointer such that every real extent objectid
3635	* will be smaller.
3636	*/
3637	fs_info->qgroup_rescan_progress.objectid = (u64)-`1`;
3638	btrfs_release_path(p: path);
3639	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3640	return ret;
3641	}
3642	done = is_last_leaf(path);
3643
3644	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &found,
3645	nr: btrfs_header_nritems(eb: path->nodes[`0`]) - `1`);
3646	fs_info->qgroup_rescan_progress.objectid = found.objectid + `1`;
3647
3648	scratch_leaf = btrfs_clone_extent_buffer(src: path->nodes[`0`]);
3649	if (!scratch_leaf) {
3650	ret = -ENOMEM;
3651	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3652	goto out;
3653	}
3654	slot = path->slots[`0`];
3655	btrfs_release_path(p: path);
3656	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3657
3658	for (; slot < btrfs_header_nritems(eb: scratch_leaf); ++slot) {
3659	struct btrfs_backref_walk_ctx ctx = { `0` };
3660
3661	btrfs_item_key_to_cpu(eb: scratch_leaf, cpu_key: &found, nr: slot);
3662	if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3663	found.type != BTRFS_METADATA_ITEM_KEY)
3664	continue;
3665	if (found.type == BTRFS_METADATA_ITEM_KEY)
3666	num_bytes = fs_info->nodesize;
3667	else
3668	num_bytes = found.offset;
3669
3670	ctx.bytenr = found.objectid;
3671	ctx.fs_info = fs_info;
3672
3673	ret = btrfs_find_all_roots(ctx: &ctx, skip_commit_root_sem: false);
3674	if (ret < `0`)
3675	goto out;
3676	/ For rescan, just pass old_roots as NULL /
3677	ret = btrfs_qgroup_account_extent(trans, bytenr: found.objectid,
3678	num_bytes, NULL, new_roots: ctx.roots);
3679	if (ret < `0`)
3680	goto out;
3681	}
3682	out:
3683	if (scratch_leaf)
3684	free_extent_buffer(eb: scratch_leaf);
3685
3686	if (done && !ret) {
3687	ret = `1`;
3688	fs_info->qgroup_rescan_progress.objectid = (u64)-`1`;
3689	}
3690	return ret;
3691	}
3692
3693	static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
3694	{
3695	if (btrfs_fs_closing(fs_info))
3696	return true;
3697	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
3698	return true;
3699	if (!btrfs_qgroup_enabled(fs_info))
3700	return true;
3701	if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
3702	return true;
3703	return false;
3704	}
3705
3706	static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3707	{
3708	struct btrfs_fs_info fs_info = container_of(work, struct* btrfs_fs_info,
3709	qgroup_rescan_work);
3710	struct btrfs_path *path;
3711	struct btrfs_trans_handle *trans = NULL;
3712	int err = -ENOMEM;
3713	int ret = `0`;
3714	bool stopped = false;
3715	bool did_leaf_rescans = false;
3716
3717	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
3718	return;
3719
3720	path = btrfs_alloc_path();
3721	if (!path)
3722	goto out;
3723	/*
3724	* Rescan should only search for commit root, and any later difference
3725	* should be recorded by qgroup
3726	*/
3727	path->search_commit_root = `1`;
3728	path->skip_locking = `1`;
3729
3730	err = `0`;
3731	while (!err && !(stopped = rescan_should_stop(fs_info))) {
3732	trans = btrfs_start_transaction(root: fs_info->fs_root, num_items: `0`);
3733	if (IS_ERR(ptr: trans)) {
3734	err = PTR_ERR(ptr: trans);
3735	break;
3736	}
3737
3738	err = qgroup_rescan_leaf(trans, path);
3739	did_leaf_rescans = true;
3740
3741	if (err > `0`)
3742	btrfs_commit_transaction(trans);
3743	else
3744	btrfs_end_transaction(trans);
3745	}
3746
3747	out:
3748	btrfs_free_path(p: path);
3749
3750	mutex_lock(&fs_info->qgroup_rescan_lock);
3751	if (err > `0` &&
3752	fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3753	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3754	} else if (err < `0` \|\| stopped) {
3755	fs_info->qgroup_flags \|= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3756	}
3757	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3758
3759	/*
3760	* Only update status, since the previous part has already updated the
3761	* qgroup info, and only if we did any actual work. This also prevents
3762	* race with a concurrent quota disable, which has already set
3763	* fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
3764	* btrfs_quota_disable().
3765	*/
3766	if (did_leaf_rescans) {
3767	trans = btrfs_start_transaction(root: fs_info->quota_root, num_items: `1`);
3768	if (IS_ERR(ptr: trans)) {
3769	err = PTR_ERR(ptr: trans);
3770	trans = NULL;
3771	btrfs_err(fs_info,
3772	"fail to start transaction for status update: %d",
3773	err);
3774	}
3775	} else {
3776	trans = NULL;
3777	}
3778
3779	mutex_lock(&fs_info->qgroup_rescan_lock);
3780	if (!stopped \|\|
3781	fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
3782	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3783	if (trans) {
3784	ret = update_qgroup_status_item(trans);
3785	if (ret < `0`) {
3786	err = ret;
3787	btrfs_err(fs_info, "fail to update qgroup status: %d",
3788	err);
3789	}
3790	}
3791	fs_info->qgroup_rescan_running = false;
3792	fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
3793	complete_all(&fs_info->qgroup_rescan_completion);
3794	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3795
3796	if (!trans)
3797	return;
3798
3799	btrfs_end_transaction(trans);
3800
3801	if (stopped) {
3802	btrfs_info(fs_info, "qgroup scan paused");
3803	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
3804	btrfs_info(fs_info, "qgroup scan cancelled");
3805	} else if (err >= `0`) {
3806	btrfs_info(fs_info, "qgroup scan completed%s",
3807	err > `0` ? " (inconsistency flag cleared)" : "");
3808	} else {
3809	btrfs_err(fs_info, "qgroup scan failed with %d", err);
3810	}
3811	}
3812
3813	/*
3814	* Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3815	* memory required for the rescan context.
3816	*/
3817	static int
3818	qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3819	int init_flags)
3820	{
3821	int ret = `0`;
3822
3823	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
3824	btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
3825	return -EINVAL;
3826	}
3827
3828	if (!init_flags) {
3829	/ we're resuming qgroup rescan at mount time /
3830	if (!(fs_info->qgroup_flags &
3831	BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3832	btrfs_warn(fs_info,
3833	"qgroup rescan init failed, qgroup rescan is not queued");
3834	ret = -EINVAL;
3835	} else if (!(fs_info->qgroup_flags &
3836	BTRFS_QGROUP_STATUS_FLAG_ON)) {
3837	btrfs_warn(fs_info,
3838	"qgroup rescan init failed, qgroup is not enabled");
3839	ret = -EINVAL;
3840	}
3841
3842	if (ret)
3843	return ret;
3844	}
3845
3846	mutex_lock(&fs_info->qgroup_rescan_lock);
3847
3848	if (init_flags) {
3849	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3850	btrfs_warn(fs_info,
3851	"qgroup rescan is already in progress");
3852	ret = -EINPROGRESS;
3853	} else if (!(fs_info->qgroup_flags &
3854	BTRFS_QGROUP_STATUS_FLAG_ON)) {
3855	btrfs_warn(fs_info,
3856	"qgroup rescan init failed, qgroup is not enabled");
3857	ret = -EINVAL;
3858	} else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
3859	/ Quota disable is in progress /
3860	ret = -EBUSY;
3861	}
3862
3863	if (ret) {
3864	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3865	return ret;
3866	}
3867	fs_info->qgroup_flags \|= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3868	}
3869
3870	memset(&fs_info->qgroup_rescan_progress, `0`,
3871	sizeof(fs_info->qgroup_rescan_progress));
3872	fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN \|
3873	BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
3874	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3875	init_completion(x: &fs_info->qgroup_rescan_completion);
3876	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3877
3878	btrfs_init_work(work: &fs_info->qgroup_rescan_work,
3879	func: btrfs_qgroup_rescan_worker, NULL);
3880	return `0`;
3881	}
3882
3883	static void
3884	qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3885	{
3886	struct rb_node *n;
3887	struct btrfs_qgroup *qgroup;
3888
3889	spin_lock(lock: &fs_info->qgroup_lock);
3890	/ clear all current qgroup tracking information /
3891	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3892	qgroup = rb_entry(n, struct btrfs_qgroup, node);
3893	qgroup->rfer = `0`;
3894	qgroup->rfer_cmpr = `0`;
3895	qgroup->excl = `0`;
3896	qgroup->excl_cmpr = `0`;
3897	qgroup_dirty(fs_info, qgroup);
3898	}
3899	spin_unlock(lock: &fs_info->qgroup_lock);
3900	}
3901
3902	int
3903	btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3904	{
3905	int ret = `0`;
3906	struct btrfs_trans_handle *trans;
3907
3908	ret = qgroup_rescan_init(fs_info, progress_objectid: `0`, init_flags: `1`);
3909	if (ret)
3910	return ret;
3911
3912	/*
3913	* We have set the rescan_progress to 0, which means no more
3914	* delayed refs will be accounted by btrfs_qgroup_account_ref.
3915	* However, btrfs_qgroup_account_ref may be right after its call
3916	* to btrfs_find_all_roots, in which case it would still do the
3917	* accounting.
3918	* To solve this, we're committing the transaction, which will
3919	* ensure we run all delayed refs and only after that, we are
3920	* going to clear all tracking information for a clean start.
3921	*/
3922
3923	trans = btrfs_attach_transaction_barrier(root: fs_info->fs_root);
3924	if (IS_ERR(ptr: trans) && trans != ERR_PTR(error: -ENOENT)) {
3925	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3926	return PTR_ERR(ptr: trans);
3927	} else if (trans != ERR_PTR(error: -ENOENT)) {
3928	ret = btrfs_commit_transaction(trans);
3929	if (ret) {
3930	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3931	return ret;
3932	}
3933	}
3934
3935	qgroup_rescan_zero_tracking(fs_info);
3936
3937	mutex_lock(&fs_info->qgroup_rescan_lock);
3938	fs_info->qgroup_rescan_running = true;
3939	btrfs_queue_work(wq: fs_info->qgroup_rescan_workers,
3940	work: &fs_info->qgroup_rescan_work);
3941	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3942
3943	return `0`;
3944	}
3945
3946	int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3947	bool interruptible)
3948	{
3949	int running;
3950	int ret = `0`;
3951
3952	mutex_lock(&fs_info->qgroup_rescan_lock);
3953	running = fs_info->qgroup_rescan_running;
3954	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3955
3956	if (!running)
3957	return `0`;
3958
3959	if (interruptible)
3960	ret = wait_for_completion_interruptible(
3961	x: &fs_info->qgroup_rescan_completion);
3962	else
3963	wait_for_completion(&fs_info->qgroup_rescan_completion);
3964
3965	return ret;
3966	}
3967
3968	/*
3969	* this is only called from open_ctree where we're still single threaded, thus
3970	* locking is omitted here.
3971	*/
3972	void
3973	btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3974	{
3975	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3976	mutex_lock(&fs_info->qgroup_rescan_lock);
3977	fs_info->qgroup_rescan_running = true;
3978	btrfs_queue_work(wq: fs_info->qgroup_rescan_workers,
3979	work: &fs_info->qgroup_rescan_work);
3980	mutex_unlock(lock: &fs_info->qgroup_rescan_lock);
3981	}
3982	}
3983
/*
 * Iterate over rb-tree nodes starting at @start.  @next is fetched before the
 * loop body runs, so the body may remove @node from the tree.
 */
#define rbtree_iterate_from_safe(node, next, start)				\
	for ((node) = (start);							\
	     (node) && ({ (next) = rb_next(node); 1; });			\
	     (node) = (next))
3986
3987	static int qgroup_unreserve_range(struct btrfs_inode *inode,
3988	struct extent_changeset *reserved, u64 start,
3989	u64 len)
3990	{
3991	struct rb_node *node;
3992	struct rb_node *next;
3993	struct ulist_node *entry;
3994	int ret = `0`;
3995
3996	node = reserved->range_changed.root.rb_node;
3997	if (!node)
3998	return `0`;
3999	while (node) {
4000	entry = rb_entry(node, struct ulist_node, rb_node);
4001	if (entry->val < start)
4002	node = node->rb_right;
4003	else
4004	node = node->rb_left;
4005	}
4006
4007	if (entry->val > start && rb_prev(&entry->rb_node))
4008	entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
4009	rb_node);
4010
4011	rbtree_iterate_from_safe(node, next, &entry->rb_node) {
4012	u64 entry_start;
4013	u64 entry_end;
4014	u64 entry_len;
4015	int clear_ret;
4016
4017	entry = rb_entry(node, struct ulist_node, rb_node);
4018	entry_start = entry->val;
4019	entry_end = entry->aux;
4020	entry_len = entry_end - entry_start + `1`;
4021
4022	if (entry_start >= start + len)
4023	break;
4024	if (entry_start + entry_len <= start)
4025	continue;
4026	/*
4027	* Now the entry is in [start, start + len), revert the
4028	* EXTENT_QGROUP_RESERVED bit.
4029	*/
4030	clear_ret = clear_extent_bits(tree: &inode->io_tree, start: entry_start,
4031	end: entry_end, bits: EXTENT_QGROUP_RESERVED);
4032	if (!ret && clear_ret < `0`)
4033	ret = clear_ret;
4034
4035	ulist_del(ulist: &reserved->range_changed, val: entry->val, aux: entry->aux);
4036	if (likely(reserved->bytes_changed >= entry_len)) {
4037	reserved->bytes_changed -= entry_len;
4038	} else {
4039	WARN_ON(`1`);
4040	reserved->bytes_changed = `0`;
4041	}
4042	}
4043
4044	return ret;
4045	}
4046
4047	/*
4048	* Try to free some space for qgroup.
4049	*
4050	* For qgroup, there are only 3 ways to free qgroup space:
4051	* - Flush nodatacow write
4052	* Any nodatacow write will free its reserved data space at run_delalloc_range().
4053	* In theory, we should only flush nodatacow inodes, but it's not yet
4054	* possible, so we need to flush the whole root.
4055	*
4056	* - Wait for ordered extents
4057	* When ordered extents are finished, their reserved metadata is finally
4058	* converted to per_trans status, which can be freed by later commit
4059	* transaction.
4060	*
4061	* - Commit transaction
4062	* This would free the meta_per_trans space.
4063	* In theory this shouldn't provide much space, but any more qgroup space
4064	* is needed.
4065	*/
4066	static int try_flush_qgroup(struct btrfs_root *root)
4067	{
4068	struct btrfs_trans_handle *trans;
4069	int ret;
4070
4071	/ Can't hold an open transaction or we run the risk of deadlocking. /
4072	ASSERT(current->journal_info == NULL);
4073	if (WARN_ON(current->journal_info))
4074	return `0`;
4075
4076	/*
4077	* We don't want to run flush again and again, so if there is a running
4078	* one, we won't try to start a new flush, but exit directly.
4079	*/
4080	if (test_and_set_bit(nr: BTRFS_ROOT_QGROUP_FLUSHING, addr: &root->state)) {
4081	wait_event(root->qgroup_flush_wait,
4082	!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
4083	return `0`;
4084	}
4085
4086	ret = btrfs_start_delalloc_snapshot(root, in_reclaim_context: true);
4087	if (ret < `0`)
4088	goto out;
4089	btrfs_wait_ordered_extents(root, U64_MAX, range_start: `0`, range_len: (u64)-`1`);
4090
4091	trans = btrfs_attach_transaction_barrier(root);
4092	if (IS_ERR(ptr: trans)) {
4093	ret = PTR_ERR(ptr: trans);
4094	if (ret == -ENOENT)
4095	ret = `0`;
4096	goto out;
4097	}
4098
4099	ret = btrfs_commit_transaction(trans);
4100	out:
4101	clear_bit(nr: BTRFS_ROOT_QGROUP_FLUSHING, addr: &root->state);
4102	wake_up(&root->qgroup_flush_wait);
4103	return ret;
4104	}
4105
4106	static int qgroup_reserve_data(struct btrfs_inode *inode,
4107	struct extent_changeset **reserved_ret, u64 start,
4108	u64 len)
4109	{
4110	struct btrfs_root *root = inode->root;
4111	struct extent_changeset *reserved;
4112	bool new_reserved = false;
4113	u64 orig_reserved;
4114	u64 to_reserve;
4115	int ret;
4116
4117	if (btrfs_qgroup_mode(fs_info: root->fs_info) == BTRFS_QGROUP_MODE_DISABLED \|\|
4118	!is_fstree(rootid: root->root_key.objectid) \|\| len == `0`)
4119	return `0`;
4120
4121	/ @reserved parameter is mandatory for qgroup /
4122	if (WARN_ON(!reserved_ret))
4123	return -EINVAL;
4124	if (!*reserved_ret) {
4125	new_reserved = true;
4126	*reserved_ret = extent_changeset_alloc();
4127	if (!*reserved_ret)
4128	return -ENOMEM;
4129	}
4130	reserved = *reserved_ret;
4131	/ Record already reserved space /
4132	orig_reserved = reserved->bytes_changed;
4133	ret = set_record_extent_bits(tree: &inode->io_tree, start,
4134	end: start + len -`1`, bits: EXTENT_QGROUP_RESERVED, changeset: reserved);
4135
4136	/ Newly reserved space /
4137	to_reserve = reserved->bytes_changed - orig_reserved;
4138	trace_btrfs_qgroup_reserve_data(inode: &inode->vfs_inode, start, len,
4139	reserved: to_reserve, op: QGROUP_RESERVE);
4140	if (ret < `0`)
4141	goto out;
4142	ret = qgroup_reserve(root, num_bytes: to_reserve, enforce: true, type: BTRFS_QGROUP_RSV_DATA);
4143	if (ret < `0`)
4144	goto cleanup;
4145
4146	return ret;
4147
4148	cleanup:
4149	qgroup_unreserve_range(inode, reserved, start, len);
4150	out:
4151	if (new_reserved) {
4152	extent_changeset_free(changeset: reserved);
4153	*reserved_ret = NULL;
4154	}
4155	return ret;
4156	}
4157
4158	/*
4159	* Reserve qgroup space for range [start, start + len).
4160	*
4161	* This function will either reserve space from related qgroups or do nothing
4162	* if the range is already reserved.
4163	*
4164	* Return 0 for successful reservation
4165	* Return <0 for error (including -EQUOT)
4166	*
4167	* NOTE: This function may sleep for memory allocation, dirty page flushing and
4168	* commit transaction. So caller should not hold any dirty page locked.
4169	*/
4170	int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
4171	struct extent_changeset **reserved_ret, u64 start,
4172	u64 len)
4173	{
4174	int ret;
4175
4176	ret = qgroup_reserve_data(inode, reserved_ret, start, len);
4177	if (ret <= `0` && ret != -EDQUOT)
4178	return ret;
4179
4180	ret = try_flush_qgroup(root: inode->root);
4181	if (ret < `0`)
4182	return ret;
4183	return qgroup_reserve_data(inode, reserved_ret, start, len);
4184	}
4185
4186	/ Free ranges specified by @reserved, normally in error path /
4187	static int qgroup_free_reserved_data(struct btrfs_inode *inode,
4188	struct extent_changeset *reserved,
4189	u64 start, u64 len, u64 *freed_ret)
4190	{
4191	struct btrfs_root *root = inode->root;
4192	struct ulist_node *unode;
4193	struct ulist_iterator uiter;
4194	struct extent_changeset changeset;
4195	u64 freed = `0`;
4196	int ret;
4197
4198	extent_changeset_init(changeset: &changeset);
4199	len = round_up(start + len, root->fs_info->sectorsize);
4200	start = round_down(start, root->fs_info->sectorsize);
4201
4202	ULIST_ITER_INIT(&uiter);
4203	while ((unode = ulist_next(ulist: &reserved->range_changed, uiter: &uiter))) {
4204	u64 range_start = unode->val;
4205	/ unode->aux is the inclusive end /
4206	u64 range_len = unode->aux - range_start + `1`;
4207	u64 free_start;
4208	u64 free_len;
4209
4210	extent_changeset_release(changeset: &changeset);
4211
4212	/ Only free range in range [start, start + len) /
4213	if (range_start >= start + len \|\|
4214	range_start + range_len <= start)
4215	continue;
4216	free_start = max(range_start, start);
4217	free_len = min(start + len, range_start + range_len) -
4218	free_start;
4219	/*
4220	* TODO: To also modify reserved->ranges_reserved to reflect
4221	* the modification.
4222	*
4223	* However as long as we free qgroup reserved according to
4224	* EXTENT_QGROUP_RESERVED, we won't double free.
4225	* So not need to rush.
4226	*/
4227	ret = clear_record_extent_bits(tree: &inode->io_tree, start: free_start,
4228	end: free_start + free_len - `1`,
4229	bits: EXTENT_QGROUP_RESERVED, changeset: &changeset);
4230	if (ret < `0`)
4231	goto out;
4232	freed += changeset.bytes_changed;
4233	}
4234	btrfs_qgroup_free_refroot(fs_info: root->fs_info, ref_root: root->root_key.objectid, num_bytes: freed,
4235	type: BTRFS_QGROUP_RSV_DATA);
4236	if (freed_ret)
4237	*freed_ret = freed;
4238	ret = `0`;
4239	out:
4240	extent_changeset_release(changeset: &changeset);
4241	return ret;
4242	}
4243
4244	static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
4245	struct extent_changeset *reserved, u64 start, u64 len,
4246	u64 released, int* free)
4247	{
4248	struct extent_changeset changeset;
4249	int trace_op = QGROUP_RELEASE;
4250	int ret;
4251
4252	if (btrfs_qgroup_mode(fs_info: inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
4253	extent_changeset_init(changeset: &changeset);
4254	return clear_record_extent_bits(tree: &inode->io_tree, start,
4255	end: start + len - `1`,
4256	bits: EXTENT_QGROUP_RESERVED, changeset: &changeset);
4257	}
4258
4259	/ In release case, we shouldn't have @reserved /
4260	WARN_ON(!free && reserved);
4261	if (free && reserved)
4262	return qgroup_free_reserved_data(inode, reserved, start, len, freed_ret: released);
4263	extent_changeset_init(changeset: &changeset);
4264	ret = clear_record_extent_bits(tree: &inode->io_tree, start, end: start + len -`1`,
4265	bits: EXTENT_QGROUP_RESERVED, changeset: &changeset);
4266	if (ret < `0`)
4267	goto out;
4268
4269	if (free)
4270	trace_op = QGROUP_FREE;
4271	trace_btrfs_qgroup_release_data(inode: &inode->vfs_inode, start, len,
4272	reserved: changeset.bytes_changed, op: trace_op);
4273	if (free)
4274	btrfs_qgroup_free_refroot(fs_info: inode->root->fs_info,
4275	ref_root: inode->root->root_key.objectid,
4276	num_bytes: changeset.bytes_changed, type: BTRFS_QGROUP_RSV_DATA);
4277	if (released)
4278	*released = changeset.bytes_changed;
4279	out:
4280	extent_changeset_release(changeset: &changeset);
4281	return ret;
4282	}
4283
4284	/*
4285	* Free a reserved space range from io_tree and related qgroups
4286	*
4287	* Should be called when a range of pages get invalidated before reaching disk.
4288	* Or for error cleanup case.
4289	* if @reserved is given, only reserved range in [@start, @start + @len) will
4290	* be freed.
4291	*
4292	* For data written to disk, use btrfs_qgroup_release_data().
4293	*
4294	* NOTE: This function may sleep for memory allocation.
4295	*/
4296	int btrfs_qgroup_free_data(struct btrfs_inode *inode,
4297	struct extent_changeset *reserved,
4298	u64 start, u64 len, u64 *freed)
4299	{
4300	return __btrfs_qgroup_release_data(inode, reserved, start, len, released: freed, free: `1`);
4301	}
4302
4303	/*
4304	* Release a reserved space range from io_tree only.
4305	*
4306	* Should be called when a range of pages get written to disk and corresponding
4307	* FILE_EXTENT is inserted into corresponding root.
4308	*
4309	* Since new qgroup accounting framework will only update qgroup numbers at
4310	* commit_transaction() time, its reserved space shouldn't be freed from
4311	* related qgroups.
4312	*
4313	* But we should release the range from io_tree, to allow further write to be
4314	* COWed.
4315	*
4316	* NOTE: This function may sleep for memory allocation.
4317	*/
4318	int btrfs_qgroup_release_data(struct btrfs_inode inode, u64 start, u64 len, u64 released)
4319	{
4320	return __btrfs_qgroup_release_data(inode, NULL, start, len, released, free: `0`);
4321	}
4322
4323	static void add_root_meta_rsv(struct btrfs_root root, int* num_bytes,
4324	enum btrfs_qgroup_rsv_type type)
4325	{
4326	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
4327	type != BTRFS_QGROUP_RSV_META_PERTRANS)
4328	return;
4329	if (num_bytes == `0`)
4330	return;
4331
4332	spin_lock(lock: &root->qgroup_meta_rsv_lock);
4333	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
4334	root->qgroup_meta_rsv_prealloc += num_bytes;
4335	else
4336	root->qgroup_meta_rsv_pertrans += num_bytes;
4337	spin_unlock(lock: &root->qgroup_meta_rsv_lock);
4338	}
4339
4340	static int sub_root_meta_rsv(struct btrfs_root root, int* num_bytes,
4341	enum btrfs_qgroup_rsv_type type)
4342	{
4343	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
4344	type != BTRFS_QGROUP_RSV_META_PERTRANS)
4345	return `0`;
4346	if (num_bytes == `0`)
4347	return `0`;
4348
4349	spin_lock(lock: &root->qgroup_meta_rsv_lock);
4350	if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
4351	num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
4352	num_bytes);
4353	root->qgroup_meta_rsv_prealloc -= num_bytes;
4354	} else {
4355	num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
4356	num_bytes);
4357	root->qgroup_meta_rsv_pertrans -= num_bytes;
4358	}
4359	spin_unlock(lock: &root->qgroup_meta_rsv_lock);
4360	return num_bytes;
4361	}
4362
4363	int btrfs_qgroup_reserve_meta(struct btrfs_root root, int* num_bytes,
4364	enum btrfs_qgroup_rsv_type type, bool enforce)
4365	{
4366	struct btrfs_fs_info *fs_info = root->fs_info;
4367	int ret;
4368
4369	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED \|\|
4370	!is_fstree(rootid: root->root_key.objectid) \|\| num_bytes == `0`)
4371	return `0`;
4372
4373	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
4374	trace_qgroup_meta_reserve(root, diff: (s64)num_bytes, type);
4375	ret = qgroup_reserve(root, num_bytes, enforce, type);
4376	if (ret < `0`)
4377	return ret;
4378	/*
4379	* Record what we have reserved into root.
4380	*
4381	* To avoid quota disabled->enabled underflow.
4382	* In that case, we may try to free space we haven't reserved
4383	* (since quota was disabled), so record what we reserved into root.
4384	* And ensure later release won't underflow this number.
4385	*/
4386	add_root_meta_rsv(root, num_bytes, type);
4387	return ret;
4388	}
4389
4390	int __btrfs_qgroup_reserve_meta(struct btrfs_root root, int* num_bytes,
4391	enum btrfs_qgroup_rsv_type type, bool enforce,
4392	bool noflush)
4393	{
4394	int ret;
4395
4396	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
4397	if ((ret <= `0` && ret != -EDQUOT) \|\| noflush)
4398	return ret;
4399
4400	ret = try_flush_qgroup(root);
4401	if (ret < `0`)
4402	return ret;
4403	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
4404	}
4405
4406	/*
4407	* Per-transaction meta reservation should be all freed at transaction commit
4408	* time
4409	*/
4410	void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
4411	{
4412	struct btrfs_fs_info *fs_info = root->fs_info;
4413
4414	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED \|\|
4415	!is_fstree(rootid: root->root_key.objectid))
4416	return;
4417
4418	/ TODO: Update trace point to handle such free /
4419	trace_qgroup_meta_free_all_pertrans(root);
4420	/ Special value -1 means to free all reserved space /
4421	btrfs_qgroup_free_refroot(fs_info, ref_root: root->root_key.objectid, num_bytes: (u64)-`1`,
4422	type: BTRFS_QGROUP_RSV_META_PERTRANS);
4423	}
4424
4425	void __btrfs_qgroup_free_meta(struct btrfs_root root, int* num_bytes,
4426	enum btrfs_qgroup_rsv_type type)
4427	{
4428	struct btrfs_fs_info *fs_info = root->fs_info;
4429
4430	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED \|\|
4431	!is_fstree(rootid: root->root_key.objectid))
4432	return;
4433
4434	/*
4435	* reservation for META_PREALLOC can happen before quota is enabled,
4436	* which can lead to underflow.
4437	* Here ensure we will only free what we really have reserved.
4438	*/
4439	num_bytes = sub_root_meta_rsv(root, num_bytes, type);
4440	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
4441	trace_qgroup_meta_reserve(root, diff: -(s64)num_bytes, type);
4442	btrfs_qgroup_free_refroot(fs_info, ref_root: root->root_key.objectid,
4443	num_bytes, type);
4444	}
4445
4446	static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
4447	int num_bytes)
4448	{
4449	struct btrfs_qgroup *qgroup;
4450	LIST_HEAD(qgroup_list);
4451
4452	if (num_bytes == `0`)
4453	return;
4454	if (!fs_info->quota_root)
4455	return;
4456
4457	spin_lock(lock: &fs_info->qgroup_lock);
4458	qgroup = find_qgroup_rb(fs_info, qgroupid: ref_root);
4459	if (!qgroup)
4460	goto out;
4461
4462	qgroup_iterator_add(head: &qgroup_list, qgroup);
4463	list_for_each_entry(qgroup, &qgroup_list, iterator) {
4464	struct btrfs_qgroup_list *glist;
4465
4466	qgroup_rsv_release(fs_info, qgroup, num_bytes,
4467	type: BTRFS_QGROUP_RSV_META_PREALLOC);
4468	if (!sb_rdonly(sb: fs_info->sb))
4469	qgroup_rsv_add(fs_info, qgroup, num_bytes,
4470	type: BTRFS_QGROUP_RSV_META_PERTRANS);
4471
4472	list_for_each_entry(glist, &qgroup->groups, next_group)
4473	qgroup_iterator_add(head: &qgroup_list, qgroup: glist->group);
4474	}
4475	out:
4476	qgroup_iterator_clean(head: &qgroup_list);
4477	spin_unlock(lock: &fs_info->qgroup_lock);
4478	}
4479
4480	/*
4481	* Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
4482	*
4483	* This is called when preallocated meta reservation needs to be used.
4484	* Normally after btrfs_join_transaction() call.
4485	*/
4486	void btrfs_qgroup_convert_reserved_meta(struct btrfs_root root, int* num_bytes)
4487	{
4488	struct btrfs_fs_info *fs_info = root->fs_info;
4489
4490	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED \|\|
4491	!is_fstree(rootid: root->root_key.objectid))
4492	return;
4493	/ Same as btrfs_qgroup_free_meta_prealloc() /
4494	num_bytes = sub_root_meta_rsv(root, num_bytes,
4495	type: BTRFS_QGROUP_RSV_META_PREALLOC);
4496	trace_qgroup_meta_convert(root, diff: num_bytes);
4497	qgroup_convert_meta(fs_info, ref_root: root->root_key.objectid, num_bytes);
4498	if (!sb_rdonly(sb: fs_info->sb))
4499	add_root_meta_rsv(root, num_bytes, type: BTRFS_QGROUP_RSV_META_PERTRANS);
4500	}
4501
4502	/*
4503	* Check qgroup reserved space leaking, normally at destroy inode
4504	* time
4505	*/
4506	void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
4507	{
4508	struct extent_changeset changeset;
4509	struct ulist_node *unode;
4510	struct ulist_iterator iter;
4511	int ret;
4512
4513	extent_changeset_init(changeset: &changeset);
4514	ret = clear_record_extent_bits(tree: &inode->io_tree, start: `0`, end: (u64)-`1`,
4515	bits: EXTENT_QGROUP_RESERVED, changeset: &changeset);
4516
4517	WARN_ON(ret < `0`);
4518	if (WARN_ON(changeset.bytes_changed)) {
4519	ULIST_ITER_INIT(&iter);
4520	while ((unode = ulist_next(ulist: &changeset.range_changed, uiter: &iter))) {
4521	btrfs_warn(inode->root->fs_info,
4522	"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4523	btrfs_ino(inode), unode->val, unode->aux);
4524	}
4525	btrfs_qgroup_free_refroot(fs_info: inode->root->fs_info,
4526	ref_root: inode->root->root_key.objectid,
4527	num_bytes: changeset.bytes_changed, type: BTRFS_QGROUP_RSV_DATA);
4528
4529	}
4530	extent_changeset_release(changeset: &changeset);
4531	}
4532
4533	void btrfs_qgroup_init_swapped_blocks(
4534	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4535	{
4536	int i;
4537
4538	spin_lock_init(&swapped_blocks->lock);
4539	for (i = `0`; i < BTRFS_MAX_LEVEL; i++)
4540	swapped_blocks->blocks[i] = RB_ROOT;
4541	swapped_blocks->swapped = false;
4542	}
4543
4544	/*
4545	* Delete all swapped blocks record of @root.
4546	* Every record here means we skipped a full subtree scan for qgroup.
4547	*
4548	* Gets called when committing one transaction.
4549	*/
4550	void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4551	{
4552	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4553	int i;
4554
4555	swapped_blocks = &root->swapped_blocks;
4556
4557	spin_lock(lock: &swapped_blocks->lock);
4558	if (!swapped_blocks->swapped)
4559	goto out;
4560	for (i = `0`; i < BTRFS_MAX_LEVEL; i++) {
4561	struct rb_root *cur_root = &swapped_blocks->blocks[i];
4562	struct btrfs_qgroup_swapped_block *entry;
4563	struct btrfs_qgroup_swapped_block *next;
4564
4565	rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4566	node)
4567	kfree(objp: entry);
4568	swapped_blocks->blocks[i] = RB_ROOT;
4569	}
4570	swapped_blocks->swapped = false;
4571	out:
4572	spin_unlock(lock: &swapped_blocks->lock);
4573	}
4574
4575	/*
4576	* Add subtree roots record into @subvol_root.
4577	*
4578	* @subvol_root: tree root of the subvolume tree get swapped
4579	* @bg: block group under balance
4580	* @subvol_parent/slot: pointer to the subtree root in subvolume tree
4581	* @reloc_parent/slot: pointer to the subtree root in reloc tree
4582	* BOTH POINTERS ARE BEFORE TREE SWAP
4583	* @last_snapshot: last snapshot generation of the subvolume tree
4584	*/
4585	int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4586	struct btrfs_root *subvol_root,
4587	struct btrfs_block_group *bg,
4588	struct extent_buffer subvol_parent, int* subvol_slot,
4589	struct extent_buffer reloc_parent, int* reloc_slot,
4590	u64 last_snapshot)
4591	{
4592	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4593	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4594	struct btrfs_qgroup_swapped_block *block;
4595	struct rb_node **cur;
4596	struct rb_node *parent = NULL;
4597	int level = btrfs_header_level(eb: subvol_parent) - `1`;
4598	int ret = `0`;
4599
4600	if (!btrfs_qgroup_full_accounting(fs_info))
4601	return `0`;
4602
4603	if (btrfs_node_ptr_generation(eb: subvol_parent, nr: subvol_slot) >
4604	btrfs_node_ptr_generation(eb: reloc_parent, nr: reloc_slot)) {
4605	btrfs_err_rl(fs_info,
4606	"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4607	__func__,
4608	btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4609	btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4610	return -EUCLEAN;
4611	}
4612
4613	block = kmalloc(size: sizeof(*block), GFP_NOFS);
4614	if (!block) {
4615	ret = -ENOMEM;
4616	goto out;
4617	}
4618
4619	/*
4620	* @reloc_parent/slot is still before swap, while @block is going to
4621	* record the bytenr after swap, so we do the swap here.
4622	*/
4623	block->subvol_bytenr = btrfs_node_blockptr(eb: reloc_parent, nr: reloc_slot);
4624	block->subvol_generation = btrfs_node_ptr_generation(eb: reloc_parent,
4625	nr: reloc_slot);
4626	block->reloc_bytenr = btrfs_node_blockptr(eb: subvol_parent, nr: subvol_slot);
4627	block->reloc_generation = btrfs_node_ptr_generation(eb: subvol_parent,
4628	nr: subvol_slot);
4629	block->last_snapshot = last_snapshot;
4630	block->level = level;
4631
4632	/*
4633	* If we have bg == NULL, we're called from btrfs_recover_relocation(),
4634	* no one else can modify tree blocks thus we qgroup will not change
4635	* no matter the value of trace_leaf.
4636	*/
4637	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4638	block->trace_leaf = true;
4639	else
4640	block->trace_leaf = false;
4641	btrfs_node_key_to_cpu(eb: reloc_parent, cpu_key: &block->first_key, nr: reloc_slot);
4642
4643	/ Insert @block into @blocks /
4644	spin_lock(lock: &blocks->lock);
4645	cur = &blocks->blocks[level].rb_node;
4646	while (*cur) {
4647	struct btrfs_qgroup_swapped_block *entry;
4648
4649	parent = *cur;
4650	entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4651	node);
4652
4653	if (entry->subvol_bytenr < block->subvol_bytenr) {
4654	cur = &(*cur)->rb_left;
4655	} else if (entry->subvol_bytenr > block->subvol_bytenr) {
4656	cur = &(*cur)->rb_right;
4657	} else {
4658	if (entry->subvol_generation !=
4659	block->subvol_generation \|\|
4660	entry->reloc_bytenr != block->reloc_bytenr \|\|
4661	entry->reloc_generation !=
4662	block->reloc_generation) {
4663	/*
4664	* Duplicated but mismatch entry found.
4665	* Shouldn't happen.
4666	*
4667	* Marking qgroup inconsistent should be enough
4668	* for end users.
4669	*/
4670	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4671	ret = -EEXIST;
4672	}
4673	kfree(objp: block);
4674	goto out_unlock;
4675	}
4676	}
4677	rb_link_node(node: &block->node, parent, rb_link: cur);
4678	rb_insert_color(&block->node, &blocks->blocks[level]);
4679	blocks->swapped = true;
4680	out_unlock:
4681	spin_unlock(lock: &blocks->lock);
4682	out:
4683	if (ret < `0`)
4684	qgroup_mark_inconsistent(fs_info);
4685	return ret;
4686	}
4687
4688	/*
4689	* Check if the tree block is a subtree root, and if so do the needed
4690	* delayed subtree trace for qgroup.
4691	*
4692	* This is called during btrfs_cow_block().
4693	*/
4694	int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4695	struct btrfs_root *root,
4696	struct extent_buffer *subvol_eb)
4697	{
4698	struct btrfs_fs_info *fs_info = root->fs_info;
4699	struct btrfs_tree_parent_check check = { `0` };
4700	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4701	struct btrfs_qgroup_swapped_block *block;
4702	struct extent_buffer *reloc_eb = NULL;
4703	struct rb_node *node;
4704	bool found = false;
4705	bool swapped = false;
4706	int level = btrfs_header_level(eb: subvol_eb);
4707	int ret = `0`;
4708	int i;
4709
4710	if (!btrfs_qgroup_full_accounting(fs_info))
4711	return `0`;
4712	if (!is_fstree(rootid: root->root_key.objectid) \|\| !root->reloc_root)
4713	return `0`;
4714
4715	spin_lock(lock: &blocks->lock);
4716	if (!blocks->swapped) {
4717	spin_unlock(lock: &blocks->lock);
4718	return `0`;
4719	}
4720	node = blocks->blocks[level].rb_node;
4721
4722	while (node) {
4723	block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4724	if (block->subvol_bytenr < subvol_eb->start) {
4725	node = node->rb_left;
4726	} else if (block->subvol_bytenr > subvol_eb->start) {
4727	node = node->rb_right;
4728	} else {
4729	found = true;
4730	break;
4731	}
4732	}
4733	if (!found) {
4734	spin_unlock(lock: &blocks->lock);
4735	goto out;
4736	}
4737	/ Found one, remove it from @blocks first and update blocks->swapped /
4738	rb_erase(&block->node, &blocks->blocks[level]);
4739	for (i = `0`; i < BTRFS_MAX_LEVEL; i++) {
4740	if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4741	swapped = true;
4742	break;
4743	}
4744	}
4745	blocks->swapped = swapped;
4746	spin_unlock(lock: &blocks->lock);
4747
4748	check.level = block->level;
4749	check.transid = block->reloc_generation;
4750	check.has_first_key = true;
4751	memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));
4752
4753	/ Read out reloc subtree root /
4754	reloc_eb = read_tree_block(fs_info, bytenr: block->reloc_bytenr, check: &check);
4755	if (IS_ERR(ptr: reloc_eb)) {
4756	ret = PTR_ERR(ptr: reloc_eb);
4757	reloc_eb = NULL;
4758	goto free_out;
4759	}
4760	if (!extent_buffer_uptodate(eb: reloc_eb)) {
4761	ret = -EIO;
4762	goto free_out;
4763	}
4764
4765	ret = qgroup_trace_subtree_swap(trans, src_eb: reloc_eb, dst_eb: subvol_eb,
4766	last_snapshot: block->last_snapshot, trace_leaf: block->trace_leaf);
4767	free_out:
4768	kfree(objp: block);
4769	free_extent_buffer(eb: reloc_eb);
4770	out:
4771	if (ret < `0`) {
4772	btrfs_err_rl(fs_info,
4773	"failed to account subtree at bytenr %llu: %d",
4774	subvol_eb->start, ret);
4775	qgroup_mark_inconsistent(fs_info);
4776	}
4777	return ret;
4778	}
4779
4780	void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4781	{
4782	struct btrfs_qgroup_extent_record *entry;
4783	struct btrfs_qgroup_extent_record *next;
4784	struct rb_root *root;
4785
4786	root = &trans->delayed_refs.dirty_extent_root;
4787	rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4788	ulist_free(ulist: entry->old_roots);
4789	kfree(objp: entry);
4790	}
4791	*root = RB_ROOT;
4792	}
4793
4794	void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
4795	{
4796	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
4797	return;
4798
4799	if (!is_fstree(rootid: root))
4800	return;
4801
4802	btrfs_qgroup_free_refroot(fs_info, ref_root: root, num_bytes: rsv_bytes, type: BTRFS_QGROUP_RSV_DATA);
4803	}
4804
4805	int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
4806	struct btrfs_squota_delta *delta)
4807	{
4808	int ret;
4809	struct btrfs_qgroup *qgroup;
4810	struct btrfs_qgroup *qg;
4811	LIST_HEAD(qgroup_list);
4812	u64 root = delta->root;
4813	u64 num_bytes = delta->num_bytes;
4814	const int sign = (delta->is_inc ? `1` : -`1`);
4815
4816	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
4817	return `0`;
4818
4819	if (!is_fstree(rootid: root))
4820	return `0`;
4821
4822	/ If the extent predates enabling quotas, don't count it. /
4823	if (delta->generation < fs_info->qgroup_enable_gen)
4824	return `0`;
4825
4826	spin_lock(lock: &fs_info->qgroup_lock);
4827	qgroup = find_qgroup_rb(fs_info, qgroupid: root);
4828	if (!qgroup) {
4829	ret = -ENOENT;
4830	goto out;
4831	}
4832
4833	ret = `0`;
4834	qgroup_iterator_add(head: &qgroup_list, qgroup);
4835	list_for_each_entry(qg, &qgroup_list, iterator) {
4836	struct btrfs_qgroup_list *glist;
4837
4838	qg->excl += num_bytes * sign;
4839	qg->rfer += num_bytes * sign;
4840	qgroup_dirty(fs_info, qgroup: qg);
4841
4842	list_for_each_entry(glist, &qg->groups, next_group)
4843	qgroup_iterator_add(head: &qgroup_list, qgroup: glist->group);
4844	}
4845	qgroup_iterator_clean(head: &qgroup_list);
4846
4847	out:
4848	spin_unlock(lock: &fs_info->qgroup_lock);
4849	return ret;
4850	}
4851

source code of linux/fs/btrfs/qgroup.c