// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"

/*
 * Note: the zoned allocator does not support an rtextsize > 1, so this code
 * and the allocator itself use file system blocks interchangeably with
 * realtime extents without doing the otherwise required conversions.
 */

/*
 * Per-task space reservation.
 *
 * Tasks that need to wait for GC to free up space allocate one of these
 * on-stack and add it to the per-mount zi_reclaim_reservations list.
 * The GC thread will then wake the tasks in order when space becomes
 * available.
 */
struct xfs_zone_reservation {
	struct list_head	entry;
	struct task_struct	*task;
	xfs_filblks_t		count_fsb;
};

/*
 * Calculate the number of reserved blocks.
 *
 * XC_FREE_RTEXTENTS counts the user-available capacity, up to which the
 * file system can be filled, while XC_FREE_RTAVAILABLE counts the blocks
 * instantly available for writes without waiting for GC.
 *
 * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
 * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
 * is further restricted by at least one zone as well as the optional
 * persistently reserved blocks.  This allows the allocator to run more
 * smoothly by not always triggering GC.
 */
uint64_t
xfs_zoned_default_resblks(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	switch (ctr) {
	case XC_FREE_RTEXTENTS:
		return (uint64_t)XFS_RESERVED_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks +
			mp->m_sb.sb_rtreserved;
	case XC_FREE_RTAVAILABLE:
		return (uint64_t)XFS_GC_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks;
	default:
		ASSERT(0);
		return 0;
	}
}
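
/*
 * Worked example (illustrative values only; the real constants live in
 * xfs_zones.h and may differ): with zones of 65536 blocks, say
 * XFS_RESERVED_ZONES == 5, XFS_GC_ZONES == 4 and sb_rtreserved == 0,
 * XC_FREE_RTEXTENTS would be reduced by 5 * 65536 = 327680 blocks while
 * XC_FREE_RTAVAILABLE would be reduced by 4 * 65536 = 262144 blocks, so
 * user-visible free space runs out before the instantly writable pool
 * does, giving GC room to catch up.
 */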

void
xfs_zoned_resv_wake_all(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	spin_lock(&zi->zi_reservation_lock);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
		wake_up_process(reservation->task);
	spin_unlock(&zi->zi_reservation_lock);
}

void
xfs_zoned_add_available(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
		return;
	}

	spin_lock(&zi->zi_reservation_lock);
	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
		if (reservation->count_fsb > count_fsb)
			break;
		wake_up_process(reservation->task);
		count_fsb -= reservation->count_fsb;
	}
	spin_unlock(&zi->zi_reservation_lock);
}
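
/*
 * Example of the FIFO wakeup above (illustrative numbers): if the summed
 * XC_FREE_RTAVAILABLE count is 100 blocks and the queued waiters need 40,
 * 50 and 30 blocks in that order, the first two are woken (40, then 50
 * out of the remaining 60 fit the budget), while the third keeps sleeping
 * because only 10 blocks of budget remain, preserving first-come,
 * first-served ordering.
 */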

static int
xfs_zoned_space_wait_error(
	struct xfs_mount	*mp)
{
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (fatal_signal_pending(current))
		return -EINTR;
	return 0;
}

static int
xfs_zoned_reserve_available(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags)
{
	struct xfs_mount		*mp = ip->i_mount;
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	reservation = {
		.task		= current,
		.count_fsb	= count_fsb,
	};
	int				error;

	/*
	 * If there are no waiters, try to directly grab the available blocks
	 * from the percpu counter.
	 *
	 * If the caller wants to dip into the reserved pool, also bypass the
	 * wait list.  This relies on the fact that we have a generously
	 * sized reserved pool that always has enough space.  If the reserved
	 * allocations fail we're in trouble.
	 */
	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
	    (flags & XFS_ZR_RESERVED))) {
		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			return error;
	}

	if (flags & XFS_ZR_NOWAIT)
		return -EAGAIN;

	spin_lock(&zi->zi_reservation_lock);
	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
		set_current_state(TASK_KILLABLE);

		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			break;

		/*
		 * Make sure to start GC if it is not running already.  As we
		 * check the rtavailable count when filling up zones, GC is
		 * normally already running at this point, but in some setups
		 * with very few zones we may completely run out of non-
		 * reserved blocks between filling zones.
		 */
		if (!xfs_is_zonegc_running(mp))
			wake_up_process(zi->zi_gc_thread);

		/*
		 * If there is no reclaimable group left and we aren't still
		 * processing a pending GC request, give up as we're fully out
		 * of space.
		 */
		if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
		    !xfs_is_zonegc_running(mp))
			break;

		spin_unlock(&zi->zi_reservation_lock);
		schedule();
		spin_lock(&zi->zi_reservation_lock);
	}
	list_del(&reservation.entry);
	spin_unlock(&zi->zi_reservation_lock);

	__set_current_state(TASK_RUNNING);
	return error;
}

/*
 * Implement greedy space allocation for short writes by trying to grab all
 * that is left after locking out other threads from trying to do the same.
 *
 * This isn't exactly optimal and can hopefully be replaced by a proper
 * percpu_counter primitive one day.
 */
static int
xfs_zoned_reserve_extents_greedy(
	struct xfs_inode	*ip,
	xfs_filblks_t		*count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	s64			len = *count_fsb;
	int			error = -ENOSPC;

	spin_lock(&zi->zi_reservation_lock);
	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	if (len > 0) {
		*count_fsb = len;
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	spin_unlock(&zi->zi_reservation_lock);
	return error;
}
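
/*
 * For example (illustrative numbers): a greedy reservation for 256 blocks
 * when only 64 are left in XC_FREE_RTEXTENTS shrinks *count_fsb to 64 and
 * succeeds, allowing the caller to complete a short write instead of
 * failing the whole request with -ENOSPC.
 */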

int
xfs_zoned_space_reserve(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	ASSERT(ac->reserved_blocks == 0);
	ASSERT(ac->open_zone == NULL);

	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
			flags & XFS_ZR_RESERVED);
	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
		error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
	if (error)
		return error;

	error = xfs_zoned_reserve_available(ip, count_fsb, flags);
	if (error) {
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
		return error;
	}
	ac->reserved_blocks = count_fsb;
	return 0;
}
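
/*
 * Typical calling pattern (illustrative sketch only; real callers have
 * more involved locking and error handling, and the surrounding context
 * here is hypothetical):
 *
 *	struct xfs_zone_alloc_ctx	ac = { };
 *	int				error;
 *
 *	error = xfs_zoned_space_reserve(ip, count_fsb, XFS_ZR_GREEDY, &ac);
 *	if (error)
 *		return error;
 *	... write the data, consuming blocks accounted in ac ...
 *	xfs_zoned_space_unreserve(ip, &ac);
 */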

void
xfs_zoned_space_unreserve(
	struct xfs_inode	*ip,
	struct xfs_zone_alloc_ctx *ac)
{
	if (ac->reserved_blocks > 0) {
		struct xfs_mount	*mp = ip->i_mount;

		xfs_zoned_add_available(mp, ac->reserved_blocks);
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
	}
	if (ac->open_zone)
		xfs_open_zone_put(ac->open_zone);
}