xfs_extent_busy.c source code [linux/fs/xfs/xfs_extent_busy.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4	* Copyright (c) 2010 David Chinner.
5	* Copyright (c) 2011 Christoph Hellwig.
6	* All Rights Reserved.
7	*/
8	#include "xfs.h"
9	#include "xfs_fs.h"
10	#include "xfs_format.h"
11	#include "xfs_log_format.h"
12	#include "xfs_shared.h"
13	#include "xfs_trans_resv.h"
14	#include "xfs_mount.h"
15	#include "xfs_alloc.h"
16	#include "xfs_extent_busy.h"
17	#include "xfs_trace.h"
18	#include "xfs_trans.h"
19	#include "xfs_log.h"
20	#include "xfs_ag.h"
21
22	static void
23	xfs_extent_busy_insert_list(
24	struct xfs_perag *pag,
25	xfs_agblock_t bno,
26	xfs_extlen_t len,
27	unsigned int flags,
28	struct list_head *busy_list)
29	{
30	struct xfs_extent_busy *new;
31	struct xfs_extent_busy *busyp;
32	struct rb_node **rbp;
33	struct rb_node *parent = NULL;
34
35	new = kzalloc(size: sizeof(struct xfs_extent_busy),
36	GFP_KERNEL \| __GFP_NOFAIL);
37	new->agno = pag->pag_agno;
38	new->bno = bno;
39	new->length = len;
40	INIT_LIST_HEAD(list: &new->list);
41	new->flags = flags;
42
43	/ trace before insert to be able to see failed inserts /
44	trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);
45
46	spin_lock(lock: &pag->pagb_lock);
47	rbp = &pag->pagb_tree.rb_node;
48	while (*rbp) {
49	parent = *rbp;
50	busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
51
52	if (new->bno < busyp->bno) {
53	rbp = &(*rbp)->rb_left;
54	ASSERT(new->bno + new->length <= busyp->bno);
55	} else if (new->bno > busyp->bno) {
56	rbp = &(*rbp)->rb_right;
57	ASSERT(bno >= busyp->bno + busyp->length);
58	} else {
59	ASSERT(`0`);
60	}
61	}
62
63	rb_link_node(node: &new->rb_node, parent, rb_link: rbp);
64	rb_insert_color(&new->rb_node, &pag->pagb_tree);
65
66	/ always process discard lists in fifo order /
67	list_add_tail(new: &new->list, head: busy_list);
68	spin_unlock(lock: &pag->pagb_lock);
69	}
70
71	void
72	xfs_extent_busy_insert(
73	struct xfs_trans *tp,
74	struct xfs_perag *pag,
75	xfs_agblock_t bno,
76	xfs_extlen_t len,
77	unsigned int flags)
78	{
79	xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
80	}
81
82	void
83	xfs_extent_busy_insert_discard(
84	struct xfs_perag *pag,
85	xfs_agblock_t bno,
86	xfs_extlen_t len,
87	struct list_head *busy_list)
88	{
89	xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
90	busy_list);
91	}
92
93	/*
94	* Search for a busy extent within the range of the extent we are about to
95	* allocate. You need to be holding the busy extent tree lock when calling
96	* xfs_extent_busy_search(). This function returns 0 for no overlapping busy
97	* extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
98	* match. This is done so that a non-zero return indicates an overlap that
99	* will require a synchronous transaction, but it can still be
100	* used to distinguish between a partial or exact match.
101	*/
102	int
103	xfs_extent_busy_search(
104	struct xfs_mount *mp,
105	struct xfs_perag *pag,
106	xfs_agblock_t bno,
107	xfs_extlen_t len)
108	{
109	struct rb_node *rbp;
110	struct xfs_extent_busy *busyp;
111	int match = `0`;
112
113	/ find closest start bno overlap /
114	spin_lock(lock: &pag->pagb_lock);
115	rbp = pag->pagb_tree.rb_node;
116	while (rbp) {
117	busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
118	if (bno < busyp->bno) {
119	/ may overlap, but exact start block is lower /
120	if (bno + len > busyp->bno)
121	match = -`1`;
122	rbp = rbp->rb_left;
123	} else if (bno > busyp->bno) {
124	/ may overlap, but exact start block is higher /
125	if (bno < busyp->bno + busyp->length)
126	match = -`1`;
127	rbp = rbp->rb_right;
128	} else {
129	/ bno matches busyp, length determines exact match /
130	match = (busyp->length == len) ? `1` : -`1`;
131	break;
132	}
133	}
134	spin_unlock(lock: &pag->pagb_lock);
135	return match;
136	}
137
138	/*
139	* The found free extent [fbno, fend] overlaps part or all of the given busy
140	* extent. If the overlap covers the beginning, the end, or all of the busy
141	* extent, the overlapping portion can be made unbusy and used for the
142	* allocation. We can't split a busy extent because we can't modify a
143	* transaction/CIL context busy list, but we can update an entry's block
144	* number or length.
145	*
146	* Returns true if the extent can safely be reused, or false if the search
147	* needs to be restarted.
148	*/
149	STATIC bool
150	xfs_extent_busy_update_extent(
151	struct xfs_mount *mp,
152	struct xfs_perag *pag,
153	struct xfs_extent_busy *busyp,
154	xfs_agblock_t fbno,
155	xfs_extlen_t flen,
156	bool userdata) __releases(&pag->pagb_lock)
157	__acquires(&pag->pagb_lock)
158	{
159	xfs_agblock_t fend = fbno + flen;
160	xfs_agblock_t bbno = busyp->bno;
161	xfs_agblock_t bend = bbno + busyp->length;
162
163	/*
164	* This extent is currently being discarded. Give the thread
165	* performing the discard a chance to mark the extent unbusy
166	* and retry.
167	*/
168	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
169	spin_unlock(lock: &pag->pagb_lock);
170	delay(ticks: `1`);
171	spin_lock(lock: &pag->pagb_lock);
172	return false;
173	}
174
175	/*
176	* If there is a busy extent overlapping a user allocation, we have
177	* no choice but to force the log and retry the search.
178	*
179	* Fortunately this does not happen during normal operation, but
180	* only if the filesystem is very low on space and has to dip into
181	* the AGFL for normal allocations.
182	*/
183	if (userdata)
184	goto out_force_log;
185
186	if (bbno < fbno && bend > fend) {
187	/*
188	* Case 1:
189	* bbno bend
190	* +BBBBBBBBBBBBBBBBB+
191	* +---------+
192	* fbno fend
193	*/
194
195	/*
196	* We would have to split the busy extent to be able to track
197	* it correct, which we cannot do because we would have to
198	* modify the list of busy extents attached to the transaction
199	* or CIL context, which is immutable.
200	*
201	* Force out the log to clear the busy extent and retry the
202	* search.
203	*/
204	goto out_force_log;
205	} else if (bbno >= fbno && bend <= fend) {
206	/*
207	* Case 2:
208	* bbno bend
209	* +BBBBBBBBBBBBBBBBB+
210	* +-----------------+
211	* fbno fend
212	*
213	* Case 3:
214	* bbno bend
215	* +BBBBBBBBBBBBBBBBB+
216	* +--------------------------+
217	* fbno fend
218	*
219	* Case 4:
220	* bbno bend
221	* +BBBBBBBBBBBBBBBBB+
222	* +--------------------------+
223	* fbno fend
224	*
225	* Case 5:
226	* bbno bend
227	* +BBBBBBBBBBBBBBBBB+
228	* +-----------------------------------+
229	* fbno fend
230	*
231	*/
232
233	/*
234	* The busy extent is fully covered by the extent we are
235	* allocating, and can simply be removed from the rbtree.
236	* However we cannot remove it from the immutable list
237	* tracking busy extents in the transaction or CIL context,
238	* so set the length to zero to mark it invalid.
239	*
240	* We also need to restart the busy extent search from the
241	* tree root, because erasing the node can rearrange the
242	* tree topology.
243	*/
244	rb_erase(&busyp->rb_node, &pag->pagb_tree);
245	busyp->length = `0`;
246	return false;
247	} else if (fend < bend) {
248	/*
249	* Case 6:
250	* bbno bend
251	* +BBBBBBBBBBBBBBBBB+
252	* +---------+
253	* fbno fend
254	*
255	* Case 7:
256	* bbno bend
257	* +BBBBBBBBBBBBBBBBB+
258	* +------------------+
259	* fbno fend
260	*
261	*/
262	busyp->bno = fend;
263	busyp->length = bend - fend;
264	} else if (bbno < fbno) {
265	/*
266	* Case 8:
267	* bbno bend
268	* +BBBBBBBBBBBBBBBBB+
269	* +-------------+
270	* fbno fend
271	*
272	* Case 9:
273	* bbno bend
274	* +BBBBBBBBBBBBBBBBB+
275	* +----------------------+
276	* fbno fend
277	*/
278	busyp->length = fbno - busyp->bno;
279	} else {
280	ASSERT(`0`);
281	}
282
283	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
284	return true;
285
286	out_force_log:
287	spin_unlock(lock: &pag->pagb_lock);
288	xfs_log_force(mp, XFS_LOG_SYNC);
289	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
290	spin_lock(lock: &pag->pagb_lock);
291	return false;
292	}
293
294
295	/*
296	* For a given extent [fbno, flen], make sure we can reuse it safely.
297	*/
298	void
299	xfs_extent_busy_reuse(
300	struct xfs_mount *mp,
301	struct xfs_perag *pag,
302	xfs_agblock_t fbno,
303	xfs_extlen_t flen,
304	bool userdata)
305	{
306	struct rb_node *rbp;
307
308	ASSERT(flen > `0`);
309	spin_lock(lock: &pag->pagb_lock);
310	restart:
311	rbp = pag->pagb_tree.rb_node;
312	while (rbp) {
313	struct xfs_extent_busy *busyp =
314	rb_entry(rbp, struct xfs_extent_busy, rb_node);
315	xfs_agblock_t bbno = busyp->bno;
316	xfs_agblock_t bend = bbno + busyp->length;
317
318	if (fbno + flen <= bbno) {
319	rbp = rbp->rb_left;
320	continue;
321	} else if (fbno >= bend) {
322	rbp = rbp->rb_right;
323	continue;
324	}
325
326	if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
327	userdata))
328	goto restart;
329	}
330	spin_unlock(lock: &pag->pagb_lock);
331	}
332
333	/*
334	* For a given extent [fbno, flen], search the busy extent list to find a
335	* subset of the extent that is not busy. If *rlen is smaller than
336	* args->minlen no suitable extent could be found, and the higher level
337	* code needs to force out the log and retry the allocation.
338	*
339	* Return the current busy generation for the AG if the extent is busy. This
340	* value can be used to wait for at least one of the currently busy extents
341	* to be cleared. Note that the busy list is not guaranteed to be empty after
342	* the gen is woken. The state of a specific extent must always be confirmed
343	* with another call to xfs_extent_busy_trim() before it can be used.
344	*/
345	bool
346	xfs_extent_busy_trim(
347	struct xfs_alloc_arg *args,
348	xfs_agblock_t *bno,
349	xfs_extlen_t *len,
350	unsigned *busy_gen)
351	{
352	xfs_agblock_t fbno;
353	xfs_extlen_t flen;
354	struct rb_node *rbp;
355	bool ret = false;
356
357	ASSERT(*len > `0`);
358
359	spin_lock(lock: &args->pag->pagb_lock);
360	fbno = *bno;
361	flen = *len;
362	rbp = args->pag->pagb_tree.rb_node;
363	while (rbp && flen >= args->minlen) {
364	struct xfs_extent_busy *busyp =
365	rb_entry(rbp, struct xfs_extent_busy, rb_node);
366	xfs_agblock_t fend = fbno + flen;
367	xfs_agblock_t bbno = busyp->bno;
368	xfs_agblock_t bend = bbno + busyp->length;
369
370	if (fend <= bbno) {
371	rbp = rbp->rb_left;
372	continue;
373	} else if (fbno >= bend) {
374	rbp = rbp->rb_right;
375	continue;
376	}
377
378	if (bbno <= fbno) {
379	/ start overlap /
380
381	/*
382	* Case 1:
383	* bbno bend
384	* +BBBBBBBBBBBBBBBBB+
385	* +---------+
386	* fbno fend
387	*
388	* Case 2:
389	* bbno bend
390	* +BBBBBBBBBBBBBBBBB+
391	* +-------------+
392	* fbno fend
393	*
394	* Case 3:
395	* bbno bend
396	* +BBBBBBBBBBBBBBBBB+
397	* +-------------+
398	* fbno fend
399	*
400	* Case 4:
401	* bbno bend
402	* +BBBBBBBBBBBBBBBBB+
403	* +-----------------+
404	* fbno fend
405	*
406	* No unbusy region in extent, return failure.
407	*/
408	if (fend <= bend)
409	goto fail;
410
411	/*
412	* Case 5:
413	* bbno bend
414	* +BBBBBBBBBBBBBBBBB+
415	* +----------------------+
416	* fbno fend
417	*
418	* Case 6:
419	* bbno bend
420	* +BBBBBBBBBBBBBBBBB+
421	* +--------------------------+
422	* fbno fend
423	*
424	* Needs to be trimmed to:
425	* +-------+
426	* fbno fend
427	*/
428	fbno = bend;
429	} else if (bend >= fend) {
430	/ end overlap /
431
432	/*
433	* Case 7:
434	* bbno bend
435	* +BBBBBBBBBBBBBBBBB+
436	* +------------------+
437	* fbno fend
438	*
439	* Case 8:
440	* bbno bend
441	* +BBBBBBBBBBBBBBBBB+
442	* +--------------------------+
443	* fbno fend
444	*
445	* Needs to be trimmed to:
446	* +-------+
447	* fbno fend
448	*/
449	fend = bbno;
450	} else {
451	/ middle overlap /
452
453	/*
454	* Case 9:
455	* bbno bend
456	* +BBBBBBBBBBBBBBBBB+
457	* +-----------------------------------+
458	* fbno fend
459	*
460	* Can be trimmed to:
461	* +-------+ OR +-------+
462	* fbno fend fbno fend
463	*
464	* Backward allocation leads to significant
465	* fragmentation of directories, which degrades
466	* directory performance, therefore we always want to
467	* choose the option that produces forward allocation
468	* patterns.
469	* Preferring the lower bno extent will make the next
470	* request use "fend" as the start of the next
471	* allocation; if the segment is no longer busy at
472	* that point, we'll get a contiguous allocation, but
473	* even if it is still busy, we will get a forward
474	* allocation.
475	* We try to avoid choosing the segment at "bend",
476	* because that can lead to the next allocation
477	* taking the segment at "fbno", which would be a
478	* backward allocation. We only use the segment at
479	* "fbno" if it is much larger than the current
480	* requested size, because in that case there's a
481	* good chance subsequent allocations will be
482	* contiguous.
483	*/
484	if (bbno - fbno >= args->maxlen) {
485	/ left candidate fits perfect /
486	fend = bbno;
487	} else if (fend - bend >= args->maxlen * `4`) {
488	/ right candidate has enough free space /
489	fbno = bend;
490	} else if (bbno - fbno >= args->minlen) {
491	/ left candidate fits minimum requirement /
492	fend = bbno;
493	} else {
494	goto fail;
495	}
496	}
497
498	flen = fend - fbno;
499	}
500	out:
501
502	if (fbno != bno \|\| flen != len) {
503	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
504	fbno, flen);
505	*bno = fbno;
506	*len = flen;
507	*busy_gen = args->pag->pagb_gen;
508	ret = true;
509	}
510	spin_unlock(lock: &args->pag->pagb_lock);
511	return ret;
512	fail:
513	/*
514	* Return a zero extent length as failure indications. All callers
515	* re-check if the trimmed extent satisfies the minlen requirement.
516	*/
517	flen = `0`;
518	goto out;
519	}
520
521	STATIC void
522	xfs_extent_busy_clear_one(
523	struct xfs_mount *mp,
524	struct xfs_perag *pag,
525	struct xfs_extent_busy *busyp)
526	{
527	if (busyp->length) {
528	trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
529	busyp->length);
530	rb_erase(&busyp->rb_node, &pag->pagb_tree);
531	}
532
533	list_del_init(entry: &busyp->list);
534	kfree(objp: busyp);
535	}
536
537	static void
538	xfs_extent_busy_put_pag(
539	struct xfs_perag *pag,
540	bool wakeup)
541	__releases(pag->pagb_lock)
542	{
543	if (wakeup) {
544	pag->pagb_gen++;
545	wake_up_all(&pag->pagb_wait);
546	}
547
548	spin_unlock(lock: &pag->pagb_lock);
549	xfs_perag_put(pag);
550	}
551
552	/*
553	* Remove all extents on the passed in list from the busy extents tree.
554	* If do_discard is set skip extents that need to be discarded, and mark
555	* these as undergoing a discard operation instead.
556	*/
557	void
558	xfs_extent_busy_clear(
559	struct xfs_mount *mp,
560	struct list_head *list,
561	bool do_discard)
562	{
563	struct xfs_extent_busy busyp, n;
564	struct xfs_perag *pag = NULL;
565	xfs_agnumber_t agno = NULLAGNUMBER;
566	bool wakeup = false;
567
568	list_for_each_entry_safe(busyp, n, list, list) {
569	if (busyp->agno != agno) {
570	if (pag)
571	xfs_extent_busy_put_pag(pag, wakeup);
572	agno = busyp->agno;
573	pag = xfs_perag_get(mp, agno);
574	spin_lock(lock: &pag->pagb_lock);
575	wakeup = false;
576	}
577
578	if (do_discard && busyp->length &&
579	!(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
580	busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
581	} else {
582	xfs_extent_busy_clear_one(mp, pag, busyp);
583	wakeup = true;
584	}
585	}
586
587	if (pag)
588	xfs_extent_busy_put_pag(pag, wakeup);
589	}
590
591	/*
592	* Flush out all busy extents for this AG.
593	*
594	* If the current transaction is holding busy extents, the caller may not want
595	* to wait for committed busy extents to resolve. If we are being told just to
596	* try a flush or progress has been made since we last skipped a busy extent,
597	* return immediately to allow the caller to try again.
598	*
599	* If we are freeing extents, we might actually be holding the only free extents
600	* in the transaction busy list and the log force won't resolve that situation.
601	* In this case, we must return -EAGAIN to avoid a deadlock by informing the
602	* caller it needs to commit the busy extents it holds before retrying the
603	* extent free operation.
604	*/
605	int
606	xfs_extent_busy_flush(
607	struct xfs_trans *tp,
608	struct xfs_perag *pag,
609	unsigned busy_gen,
610	uint32_t alloc_flags)
611	{
612	DEFINE_WAIT (wait);
613	int error;
614
615	error = xfs_log_force(mp: tp->t_mountp, XFS_LOG_SYNC);
616	if (error)
617	return error;
618
619	/ Avoid deadlocks on uncommitted busy extents. /
620	if (!list_empty(head: &tp->t_busy)) {
621	if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
622	return `0`;
623
624	if (busy_gen != READ_ONCE(pag->pagb_gen))
625	return `0`;
626
627	if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
628	return -EAGAIN;
629	}
630
631	/ Wait for committed busy extents to resolve. /
632	do {
633	prepare_to_wait(wq_head: &pag->pagb_wait, wq_entry: &wait, TASK_KILLABLE);
634	if (busy_gen != READ_ONCE(pag->pagb_gen))
635	break;
636	schedule();
637	} while (`1`);
638
639	finish_wait(wq_head: &pag->pagb_wait, wq_entry: &wait);
640	return `0`;
641	}
642
643	void
644	xfs_extent_busy_wait_all(
645	struct xfs_mount *mp)
646	{
647	struct xfs_perag *pag;
648	DEFINE_WAIT (wait);
649	xfs_agnumber_t agno;
650
651	for_each_perag(mp, agno, pag) {
652	do {
653	prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
654	if (RB_EMPTY_ROOT(&pag->pagb_tree))
655	break;
656	schedule();
657	} while (`1`);
658	finish_wait(&pag->pagb_wait, &wait);
659	}
660	}
661
662	/*
663	* Callback for list_sort to sort busy extents by the AG they reside in.
664	*/
665	int
666	xfs_extent_busy_ag_cmp(
667	void *priv,
668	const struct list_head *l1,
669	const struct list_head *l2)
670	{
671	struct xfs_extent_busy *b1 =
672	container_of(l1, struct xfs_extent_busy, list);
673	struct xfs_extent_busy *b2 =
674	container_of(l2, struct xfs_extent_busy, list);
675	s32 diff;
676
677	diff = b1->agno - b2->agno;
678	if (!diff)
679	diff = b1->bno - b2->bno;
680	return diff;
681	}
682
683	/ Are there any busy extents in this AG? /
684	bool
685	xfs_extent_busy_list_empty(
686	struct xfs_perag *pag)
687	{
688	bool res;
689
690	spin_lock(lock: &pag->pagb_lock);
691	res = RB_EMPTY_ROOT(&pag->pagb_tree);
692	spin_unlock(lock: &pag->pagb_lock);
693	return res;
694	}
695

source code of linux/fs/xfs/xfs_extent_busy.c