xfs_icache.c source code [linux/fs/xfs/xfs_icache.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
4	* All Rights Reserved.
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_log_format.h"
11	#include "xfs_trans_resv.h"
12	#include "xfs_mount.h"
13	#include "xfs_inode.h"
14	#include "xfs_trans.h"
15	#include "xfs_trans_priv.h"
16	#include "xfs_inode_item.h"
17	#include "xfs_quota.h"
18	#include "xfs_trace.h"
19	#include "xfs_icache.h"
20	#include "xfs_bmap_util.h"
21	#include "xfs_dquot_item.h"
22	#include "xfs_dquot.h"
23	#include "xfs_reflink.h"
24	#include "xfs_ialloc.h"
25	#include "xfs_ag.h"
26	#include "xfs_log_priv.h"
27	#include "xfs_health.h"
28
29	#include <linux/iversion.h>
30
/*
 * Radix tree tags for incore inode tree.
 */

/* inode is to be reclaimed */
#define XFS_ICI_RECLAIM_TAG	0
/* Inode has speculative preallocations (posteof or cow) to clean. */
#define XFS_ICI_BLOCKGC_TAG	1
37
38	/*
39	* The goal for walking incore inodes. These can correspond with incore inode
40	* radix tree tags when convenient. Avoid existing XFS_IWALK namespace.
41	*/
42	enum xfs_icwalk_goal {
43	/ Goals directly associated with tagged inodes. /
44	XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG,
45	XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG,
46	};
47
/* Forward declarations of the inode cache walkers used by this file. */
static int xfs_icwalk(struct xfs_mount *mp,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
52
/*
 * Private inode cache walk flags for struct xfs_icwalk.  Must not
 * coincide with XFS_ICWALK_FLAGS_VALID.
 */

/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)

/* Allow reclaim to grab inodes that have i_sick set. */
#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
#define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */

#define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_SCAN_LIMIT | \
					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
					 XFS_ICWALK_FLAG_UNION)
67
68	/*
69	* Allocate and initialise an xfs_inode.
70	*/
71	struct xfs_inode *
72	xfs_inode_alloc(
73	struct xfs_mount *mp,
74	xfs_ino_t ino)
75	{
76	struct xfs_inode *ip;
77
78	/*
79	* XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
80	* and return NULL here on ENOMEM.
81	*/
82	ip = alloc_inode_sb(sb: mp->m_super, cache: xfs_inode_cache, GFP_KERNEL \| __GFP_NOFAIL);
83
84	if (inode_init_always(mp->m_super, VFS_I(ip))) {
85	kmem_cache_free(s: xfs_inode_cache, objp: ip);
86	return NULL;
87	}
88
89	/ VFS doesn't initialise i_mode or i_state! /
90	VFS_I(ip)->i_mode = `0`;
91	VFS_I(ip)->i_state = `0`;
92	mapping_set_large_folios(mapping: VFS_I(ip)->i_mapping);
93
94	XFS_STATS_INC(mp, vn_active);
95	ASSERT(atomic_read(&ip->i_pincount) == `0`);
96	ASSERT(ip->i_ino == `0`);
97
98	/ initialise the xfs inode /
99	ip->i_ino = ino;
100	ip->i_mount = mp;
101	memset(&ip->i_imap, `0`, sizeof(struct xfs_imap));
102	ip->i_cowfp = NULL;
103	memset(&ip->i_af, `0`, sizeof(ip->i_af));
104	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
105	memset(&ip->i_df, `0`, sizeof(ip->i_df));
106	ip->i_flags = `0`;
107	ip->i_delayed_blks = `0`;
108	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
109	ip->i_nblocks = `0`;
110	ip->i_forkoff = `0`;
111	ip->i_sick = `0`;
112	ip->i_checked = `0`;
113	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
114	INIT_LIST_HEAD(list: &ip->i_ioend_list);
115	spin_lock_init(&ip->i_ioend_lock);
116	ip->i_next_unlinked = NULLAGINO;
117	ip->i_prev_unlinked = `0`;
118
119	return ip;
120	}
121
122	STATIC void
123	xfs_inode_free_callback(
124	struct rcu_head *head)
125	{
126	struct inode inode = container_of(head, struct* inode, i_rcu);
127	struct xfs_inode *ip = XFS_I(inode);
128
129	switch (VFS_I(ip)->i_mode & S_IFMT) {
130	case S_IFREG:
131	case S_IFDIR:
132	case S_IFLNK:
133	xfs_idestroy_fork(&ip->i_df);
134	break;
135	}
136
137	xfs_ifork_zap_attr(ip);
138
139	if (ip->i_cowfp) {
140	xfs_idestroy_fork(ip->i_cowfp);
141	kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
142	}
143	if (ip->i_itemp) {
144	ASSERT(!test_bit(XFS_LI_IN_AIL,
145	&ip->i_itemp->ili_item.li_flags));
146	xfs_inode_item_destroy(ip);
147	ip->i_itemp = NULL;
148	}
149
150	kmem_cache_free(s: xfs_inode_cache, objp: ip);
151	}
152
153	static void
154	__xfs_inode_free(
155	struct xfs_inode *ip)
156	{
157	/ asserts to verify all state is correct here /
158	ASSERT(atomic_read(&ip->i_pincount) == `0`);
159	ASSERT(!ip->i_itemp \|\| list_empty(&ip->i_itemp->ili_item.li_bio_list));
160	XFS_STATS_DEC(ip->i_mount, vn_active);
161
162	call_rcu(head: &VFS_I(ip)->i_rcu, func: xfs_inode_free_callback);
163	}
164
165	void
166	xfs_inode_free(
167	struct xfs_inode *ip)
168	{
169	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
170
171	/*
172	* Because we use RCU freeing we need to ensure the inode always
173	* appears to be reclaimed with an invalid inode number when in the
174	* free state. The ip->i_flags_lock provides the barrier against lookup
175	* races.
176	*/
177	spin_lock(lock: &ip->i_flags_lock);
178	ip->i_flags = XFS_IRECLAIM;
179	ip->i_ino = `0`;
180	spin_unlock(lock: &ip->i_flags_lock);
181
182	__xfs_inode_free(ip);
183	}
184
185	/*
186	* Queue background inode reclaim work if there are reclaimable inodes and there
187	* isn't reclaim work already scheduled or in progress.
188	*/
189	static void
190	xfs_reclaim_work_queue(
191	struct xfs_mount *mp)
192	{
193
194	rcu_read_lock();
195	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
196	queue_delayed_work(wq: mp->m_reclaim_workqueue, dwork: &mp->m_reclaim_work,
197	delay: msecs_to_jiffies(xfs_syncd_centisecs / `6` * `10`));
198	}
199	rcu_read_unlock();
200	}
201
202	/*
203	* Background scanning to trim preallocated space. This is queued based on the
204	* 'speculative_prealloc_lifetime' tunable (5m by default).
205	*/
206	static inline void
207	xfs_blockgc_queue(
208	struct xfs_perag *pag)
209	{
210	struct xfs_mount *mp = pag->pag_mount;
211
212	if (!xfs_is_blockgc_enabled(mp))
213	return;
214
215	rcu_read_lock();
216	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
217	queue_delayed_work(wq: pag->pag_mount->m_blockgc_wq,
218	dwork: &pag->pag_blockgc_work,
219	delay: msecs_to_jiffies(xfs_blockgc_secs * `1000`));
220	rcu_read_unlock();
221	}
222
223	/ Set a tag on both the AG incore inode tree and the AG radix tree. /
224	static void
225	xfs_perag_set_inode_tag(
226	struct xfs_perag *pag,
227	xfs_agino_t agino,
228	unsigned int tag)
229	{
230	struct xfs_mount *mp = pag->pag_mount;
231	bool was_tagged;
232
233	lockdep_assert_held(&pag->pag_ici_lock);
234
235	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
236	radix_tree_tag_set(&pag->pag_ici_root, index: agino, tag);
237
238	if (tag == XFS_ICI_RECLAIM_TAG)
239	pag->pag_ici_reclaimable++;
240
241	if (was_tagged)
242	return;
243
244	/ propagate the tag up into the perag radix tree /
245	spin_lock(lock: &mp->m_perag_lock);
246	radix_tree_tag_set(&mp->m_perag_tree, index: pag->pag_agno, tag);
247	spin_unlock(lock: &mp->m_perag_lock);
248
249	/ start background work /
250	switch (tag) {
251	case XFS_ICI_RECLAIM_TAG:
252	xfs_reclaim_work_queue(mp);
253	break;
254	case XFS_ICI_BLOCKGC_TAG:
255	xfs_blockgc_queue(pag);
256	break;
257	}
258
259	trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
260	}
261
262	/ Clear a tag on both the AG incore inode tree and the AG radix tree. /
263	static void
264	xfs_perag_clear_inode_tag(
265	struct xfs_perag *pag,
266	xfs_agino_t agino,
267	unsigned int tag)
268	{
269	struct xfs_mount *mp = pag->pag_mount;
270
271	lockdep_assert_held(&pag->pag_ici_lock);
272
273	/*
274	* Reclaim can signal (with a null agino) that it cleared its own tag
275	* by removing the inode from the radix tree.
276	*/
277	if (agino != NULLAGINO)
278	radix_tree_tag_clear(&pag->pag_ici_root, index: agino, tag);
279	else
280	ASSERT(tag == XFS_ICI_RECLAIM_TAG);
281
282	if (tag == XFS_ICI_RECLAIM_TAG)
283	pag->pag_ici_reclaimable--;
284
285	if (radix_tree_tagged(&pag->pag_ici_root, tag))
286	return;
287
288	/ clear the tag from the perag radix tree /
289	spin_lock(lock: &mp->m_perag_lock);
290	radix_tree_tag_clear(&mp->m_perag_tree, index: pag->pag_agno, tag);
291	spin_unlock(lock: &mp->m_perag_lock);
292
293	trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
294	}
295
296	/*
297	* When we recycle a reclaimable inode, we need to re-initialise the VFS inode
298	* part of the structure. This is made more complex by the fact we store
299	* information about the on-disk values in the VFS inode and so we can't just
300	* overwrite the values unconditionally. Hence we save the parameters we
301	* need to retain across reinitialisation, and rewrite them into the VFS inode
302	* after reinitialisation even if it fails.
303	*/
304	static int
305	xfs_reinit_inode(
306	struct xfs_mount *mp,
307	struct inode *inode)
308	{
309	int error;
310	uint32_t nlink = inode->i_nlink;
311	uint32_t generation = inode->i_generation;
312	uint64_t version = inode_peek_iversion(inode);
313	umode_t mode = inode->i_mode;
314	dev_t dev = inode->i_rdev;
315	kuid_t uid = inode->i_uid;
316	kgid_t gid = inode->i_gid;
317
318	error = inode_init_always(mp->m_super, inode);
319
320	set_nlink(inode, nlink);
321	inode->i_generation = generation;
322	inode_set_iversion_queried(inode, val: version);
323	inode->i_mode = mode;
324	inode->i_rdev = dev;
325	inode->i_uid = uid;
326	inode->i_gid = gid;
327	mapping_set_large_folios(mapping: inode->i_mapping);
328	return error;
329	}
330
331	/*
332	* Carefully nudge an inode whose VFS state has been torn down back into a
333	* usable state. Drops the i_flags_lock and the rcu read lock.
334	*/
335	static int
336	xfs_iget_recycle(
337	struct xfs_perag *pag,
338	struct xfs_inode *ip) __releases(&ip->i_flags_lock)
339	{
340	struct xfs_mount *mp = ip->i_mount;
341	struct inode *inode = VFS_I(ip);
342	int error;
343
344	trace_xfs_iget_recycle(ip);
345
346	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
347	return -EAGAIN;
348
349	/*
350	* We need to make it look like the inode is being reclaimed to prevent
351	* the actual reclaim workers from stomping over us while we recycle
352	* the inode. We can't clear the radix tree tag yet as it requires
353	* pag_ici_lock to be held exclusive.
354	*/
355	ip->i_flags \|= XFS_IRECLAIM;
356
357	spin_unlock(lock: &ip->i_flags_lock);
358	rcu_read_unlock();
359
360	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
361	error = xfs_reinit_inode(mp, inode);
362	xfs_iunlock(ip, XFS_ILOCK_EXCL);
363	if (error) {
364	/*
365	* Re-initializing the inode failed, and we are in deep
366	* trouble. Try to re-add it to the reclaim list.
367	*/
368	rcu_read_lock();
369	spin_lock(lock: &ip->i_flags_lock);
370	ip->i_flags &= ~(XFS_INEW \| XFS_IRECLAIM);
371	ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
372	spin_unlock(lock: &ip->i_flags_lock);
373	rcu_read_unlock();
374
375	trace_xfs_iget_recycle_fail(ip);
376	return error;
377	}
378
379	spin_lock(lock: &pag->pag_ici_lock);
380	spin_lock(lock: &ip->i_flags_lock);
381
382	/*
383	* Clear the per-lifetime state in the inode as we are now effectively
384	* a new inode and need to return to the initial state before reuse
385	* occurs.
386	*/
387	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
388	ip->i_flags \|= XFS_INEW;
389	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
390	XFS_ICI_RECLAIM_TAG);
391	inode->i_state = I_NEW;
392	spin_unlock(lock: &ip->i_flags_lock);
393	spin_unlock(lock: &pag->pag_ici_lock);
394
395	return `0`;
396	}
397
398	/*
399	* If we are allocating a new inode, then check what was returned is
400	* actually a free, empty inode. If we are not allocating an inode,
401	* then check we didn't find a free inode.
402	*
403	* Returns:
404	* 0 if the inode free state matches the lookup context
405	* -ENOENT if the inode is free and we are not allocating
406	* -EFSCORRUPTED if there is any state mismatch at all
407	*/
408	static int
409	xfs_iget_check_free_state(
410	struct xfs_inode *ip,
411	int flags)
412	{
413	if (flags & XFS_IGET_CREATE) {
414	/ should be a free inode /
415	if (VFS_I(ip)->i_mode != `0`) {
416	xfs_warn(ip->i_mount,
417	"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
418	ip->i_ino, VFS_I(ip)->i_mode);
419	xfs_agno_mark_sick(ip->i_mount,
420	XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
421	XFS_SICK_AG_INOBT);
422	return -EFSCORRUPTED;
423	}
424
425	if (ip->i_nblocks != `0`) {
426	xfs_warn(ip->i_mount,
427	"Corruption detected! Free inode 0x%llx has blocks allocated!",
428	ip->i_ino);
429	xfs_agno_mark_sick(ip->i_mount,
430	XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
431	XFS_SICK_AG_INOBT);
432	return -EFSCORRUPTED;
433	}
434	return `0`;
435	}
436
437	/ should be an allocated inode /
438	if (VFS_I(ip)->i_mode == `0`)
439	return -ENOENT;
440
441	return `0`;
442	}
443
444	/ Make all pending inactivation work start immediately. /
445	static bool
446	xfs_inodegc_queue_all(
447	struct xfs_mount *mp)
448	{
449	struct xfs_inodegc *gc;
450	int cpu;
451	bool ret = false;
452
453	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
454	gc = per_cpu_ptr(mp->m_inodegc, cpu);
455	if (!llist_empty(head: &gc->list)) {
456	mod_delayed_work_on(cpu, wq: mp->m_inodegc_wq, dwork: &gc->work, delay: `0`);
457	ret = true;
458	}
459	}
460
461	return ret;
462	}
463
464	/ Wait for all queued work and collect errors /
465	static int
466	xfs_inodegc_wait_all(
467	struct xfs_mount *mp)
468	{
469	int cpu;
470	int error = `0`;
471
472	flush_workqueue(mp->m_inodegc_wq);
473	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
474	struct xfs_inodegc *gc;
475
476	gc = per_cpu_ptr(mp->m_inodegc, cpu);
477	if (gc->error && !error)
478	error = gc->error;
479	gc->error = `0`;
480	}
481
482	return error;
483	}
484
485	/*
486	* Check the validity of the inode we just found it the cache
487	*/
488	static int
489	xfs_iget_cache_hit(
490	struct xfs_perag *pag,
491	struct xfs_inode *ip,
492	xfs_ino_t ino,
493	int flags,
494	int lock_flags) __releases(RCU)
495	{
496	struct inode *inode = VFS_I(ip);
497	struct xfs_mount *mp = ip->i_mount;
498	int error;
499
500	/*
501	* check for re-use of an inode within an RCU grace period due to the
502	* radix tree nodes not being updated yet. We monitor for this by
503	* setting the inode number to zero before freeing the inode structure.
504	* If the inode has been reallocated and set up, then the inode number
505	* will not match, so check for that, too.
506	*/
507	spin_lock(lock: &ip->i_flags_lock);
508	if (ip->i_ino != ino)
509	goto out_skip;
510
511	/*
512	* If we are racing with another cache hit that is currently
513	* instantiating this inode or currently recycling it out of
514	* reclaimable state, wait for the initialisation to complete
515	* before continuing.
516	*
517	* If we're racing with the inactivation worker we also want to wait.
518	* If we're creating a new file, it's possible that the worker
519	* previously marked the inode as free on disk but hasn't finished
520	* updating the incore state yet. The AGI buffer will be dirty and
521	* locked to the icreate transaction, so a synchronous push of the
522	* inodegc workers would result in deadlock. For a regular iget, the
523	* worker is running already, so we might as well wait.
524	*
525	* XXX(hch): eventually we should do something equivalent to
526	* wait_on_inode to wait for these flags to be cleared
527	* instead of polling for it.
528	*/
529	if (ip->i_flags & (XFS_INEW \| XFS_IRECLAIM \| XFS_INACTIVATING))
530	goto out_skip;
531
532	if (ip->i_flags & XFS_NEED_INACTIVE) {
533	/ Unlinked inodes cannot be re-grabbed. /
534	if (VFS_I(ip)->i_nlink == `0`) {
535	error = -ENOENT;
536	goto out_error;
537	}
538	goto out_inodegc_flush;
539	}
540
541	/*
542	* Check the inode free state is valid. This also detects lookup
543	* racing with unlinks.
544	*/
545	error = xfs_iget_check_free_state(ip, flags);
546	if (error)
547	goto out_error;
548
549	/ Skip inodes that have no vfs state. /
550	if ((flags & XFS_IGET_INCORE) &&
551	(ip->i_flags & XFS_IRECLAIMABLE))
552	goto out_skip;
553
554	/ The inode fits the selection criteria; process it. /
555	if (ip->i_flags & XFS_IRECLAIMABLE) {
556	/ Drops i_flags_lock and RCU read lock. /
557	error = xfs_iget_recycle(pag, ip);
558	if (error == -EAGAIN)
559	goto out_skip;
560	if (error)
561	return error;
562	} else {
563	/ If the VFS inode is being torn down, pause and try again. /
564	if (!igrab(inode))
565	goto out_skip;
566
567	/ We've got a live one. /
568	spin_unlock(lock: &ip->i_flags_lock);
569	rcu_read_unlock();
570	trace_xfs_iget_hit(ip);
571	}
572
573	if (lock_flags != `0`)
574	xfs_ilock(ip, lock_flags);
575
576	if (!(flags & XFS_IGET_INCORE))
577	xfs_iflags_clear(ip, XFS_ISTALE);
578	XFS_STATS_INC(mp, xs_ig_found);
579
580	return `0`;
581
582	out_skip:
583	trace_xfs_iget_skip(ip);
584	XFS_STATS_INC(mp, xs_ig_frecycle);
585	error = -EAGAIN;
586	out_error:
587	spin_unlock(lock: &ip->i_flags_lock);
588	rcu_read_unlock();
589	return error;
590
591	out_inodegc_flush:
592	spin_unlock(lock: &ip->i_flags_lock);
593	rcu_read_unlock();
594	/*
595	* Do not wait for the workers, because the caller could hold an AGI
596	* buffer lock. We're just going to sleep in a loop anyway.
597	*/
598	if (xfs_is_inodegc_enabled(mp))
599	xfs_inodegc_queue_all(mp);
600	return -EAGAIN;
601	}
602
603	static int
604	xfs_iget_cache_miss(
605	struct xfs_mount *mp,
606	struct xfs_perag *pag,
607	xfs_trans_t *tp,
608	xfs_ino_t ino,
609	struct xfs_inode **ipp,
610	int flags,
611	int lock_flags)
612	{
613	struct xfs_inode *ip;
614	int error;
615	xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
616	int iflags;
617
618	ip = xfs_inode_alloc(mp, ino);
619	if (!ip)
620	return -ENOMEM;
621
622	error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
623	if (error)
624	goto out_destroy;
625
626	/*
627	* For version 5 superblocks, if we are initialising a new inode and we
628	* are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
629	* simply build the new inode core with a random generation number.
630	*
631	* For version 4 (and older) superblocks, log recovery is dependent on
632	* the i_flushiter field being initialised from the current on-disk
633	* value and hence we must also read the inode off disk even when
634	* initializing new inodes.
635	*/
636	if (xfs_has_v3inodes(mp) &&
637	(flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
638	VFS_I(ip)->i_generation = get_random_u32();
639	} else {
640	struct xfs_buf *bp;
641
642	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
643	if (error)
644	goto out_destroy;
645
646	error = xfs_inode_from_disk(ip,
647	xfs_buf_offset(bp, ip->i_imap.im_boffset));
648	if (!error)
649	xfs_buf_set_ref(bp, XFS_INO_REF);
650	else
651	xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
652	xfs_trans_brelse(tp, bp);
653
654	if (error)
655	goto out_destroy;
656	}
657
658	trace_xfs_iget_miss(ip);
659
660	/*
661	* Check the inode free state is valid. This also detects lookup
662	* racing with unlinks.
663	*/
664	error = xfs_iget_check_free_state(ip, flags);
665	if (error)
666	goto out_destroy;
667
668	/*
669	* Preload the radix tree so we can insert safely under the
670	* write spinlock. Note that we cannot sleep inside the preload
671	* region.
672	*/
673	if (radix_tree_preload(GFP_KERNEL \| __GFP_NOLOCKDEP)) {
674	error = -EAGAIN;
675	goto out_destroy;
676	}
677
678	/*
679	* Because the inode hasn't been added to the radix-tree yet it can't
680	* be found by another thread, so we can do the non-sleeping lock here.
681	*/
682	if (lock_flags) {
683	if (!xfs_ilock_nowait(ip, lock_flags))
684	BUG();
685	}
686
687	/*
688	* These values must be set before inserting the inode into the radix
689	* tree as the moment it is inserted a concurrent lookup (allowed by the
690	* RCU locking mechanism) can find it and that lookup must see that this
691	* is an inode currently under construction (i.e. that XFS_INEW is set).
692	* The ip->i_flags_lock that protects the XFS_INEW flag forms the
693	* memory barrier that ensures this detection works correctly at lookup
694	* time.
695	*/
696	iflags = XFS_INEW;
697	if (flags & XFS_IGET_DONTCACHE)
698	d_mark_dontcache(inode: VFS_I(ip));
699	ip->i_udquot = NULL;
700	ip->i_gdquot = NULL;
701	ip->i_pdquot = NULL;
702	xfs_iflags_set(ip, flags: iflags);
703
704	/ insert the new inode /
705	spin_lock(lock: &pag->pag_ici_lock);
706	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
707	if (unlikely(error)) {
708	WARN_ON(error != -EEXIST);
709	XFS_STATS_INC(mp, xs_ig_dup);
710	error = -EAGAIN;
711	goto out_preload_end;
712	}
713	spin_unlock(lock: &pag->pag_ici_lock);
714	radix_tree_preload_end();
715
716	*ipp = ip;
717	return `0`;
718
719	out_preload_end:
720	spin_unlock(lock: &pag->pag_ici_lock);
721	radix_tree_preload_end();
722	if (lock_flags)
723	xfs_iunlock(ip, lock_flags);
724	out_destroy:
725	__destroy_inode(VFS_I(ip));
726	xfs_inode_free(ip);
727	return error;
728	}
729
730	/*
731	* Look up an inode by number in the given file system. The inode is looked up
732	* in the cache held in each AG. If the inode is found in the cache, initialise
733	* the vfs inode if necessary.
734	*
735	* If it is not in core, read it in from the file system's device, add it to the
736	* cache and initialise the vfs inode.
737	*
738	* The inode is locked according to the value of the lock_flags parameter.
739	* Inode lookup is only done during metadata operations and not as part of the
740	* data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
741	*/
742	int
743	xfs_iget(
744	struct xfs_mount *mp,
745	struct xfs_trans *tp,
746	xfs_ino_t ino,
747	uint flags,
748	uint lock_flags,
749	struct xfs_inode **ipp)
750	{
751	struct xfs_inode *ip;
752	struct xfs_perag *pag;
753	xfs_agino_t agino;
754	int error;
755
756	ASSERT((lock_flags & (XFS_IOLOCK_EXCL \| XFS_IOLOCK_SHARED)) == `0`);
757
758	/ reject inode numbers outside existing AGs /
759	if (!ino \|\| XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
760	return -EINVAL;
761
762	XFS_STATS_INC(mp, xs_ig_attempts);
763
764	/ get the perag structure and ensure that it's inode capable /
765	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
766	agino = XFS_INO_TO_AGINO(mp, ino);
767
768	again:
769	error = `0`;
770	rcu_read_lock();
771	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
772
773	if (ip) {
774	error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
775	if (error)
776	goto out_error_or_again;
777	} else {
778	rcu_read_unlock();
779	if (flags & XFS_IGET_INCORE) {
780	error = -ENODATA;
781	goto out_error_or_again;
782	}
783	XFS_STATS_INC(mp, xs_ig_missed);
784
785	error = xfs_iget_cache_miss(mp, pag, tp, ino, ipp: &ip,
786	flags, lock_flags);
787	if (error)
788	goto out_error_or_again;
789	}
790	xfs_perag_put(pag);
791
792	*ipp = ip;
793
794	/*
795	* If we have a real type for an on-disk inode, we can setup the inode
796	* now. If it's a new inode being created, xfs_init_new_inode will
797	* handle it.
798	*/
799	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != `0`)
800	xfs_setup_existing_inode(ip);
801	return `0`;
802
803	out_error_or_again:
804	if (!(flags & (XFS_IGET_INCORE \| XFS_IGET_NORETRY)) &&
805	error == -EAGAIN) {
806	delay(ticks: `1`);
807	goto again;
808	}
809	xfs_perag_put(pag);
810	return error;
811	}
812
813	/*
814	* Grab the inode for reclaim exclusively.
815	*
816	* We have found this inode via a lookup under RCU, so the inode may have
817	* already been freed, or it may be in the process of being recycled by
818	* xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
819	* has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
820	* will not be set. Hence we need to check for both these flag conditions to
821	* avoid inodes that are no longer reclaim candidates.
822	*
823	* Note: checking for other state flags here, under the i_flags_lock or not, is
824	* racy and should be avoided. Those races should be resolved only after we have
825	* ensured that we are able to reclaim this inode and the world can see that we
826	* are going to reclaim it.
827	*
828	* Return true if we grabbed it, false otherwise.
829	*/
830	static bool
831	xfs_reclaim_igrab(
832	struct xfs_inode *ip,
833	struct xfs_icwalk *icw)
834	{
835	ASSERT(rcu_read_lock_held());
836
837	spin_lock(lock: &ip->i_flags_lock);
838	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) \|\|
839	__xfs_iflags_test(ip, XFS_IRECLAIM)) {
840	/ not a reclaim candidate. /
841	spin_unlock(lock: &ip->i_flags_lock);
842	return false;
843	}
844
845	/ Don't reclaim a sick inode unless the caller asked for it. /
846	if (ip->i_sick &&
847	(!icw \|\| !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
848	spin_unlock(lock: &ip->i_flags_lock);
849	return false;
850	}
851
852	__xfs_iflags_set(ip, XFS_IRECLAIM);
853	spin_unlock(lock: &ip->i_flags_lock);
854	return true;
855	}
856
857	/*
858	* Inode reclaim is non-blocking, so the default action if progress cannot be
859	* made is to "requeue" the inode for reclaim by unlocking it and clearing the
860	* XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
861	* blocking anymore and hence we can wait for the inode to be able to reclaim
862	* it.
863	*
864	* We do no IO here - if callers require inodes to be cleaned they must push the
865	* AIL first to trigger writeback of dirty inodes. This enables writeback to be
866	* done in the background in a non-blocking manner, and enables memory reclaim
867	* to make progress without blocking.
868	*/
869	static void
870	xfs_reclaim_inode(
871	struct xfs_inode *ip,
872	struct xfs_perag *pag)
873	{
874	xfs_ino_t ino = ip->i_ino; / for radix_tree_delete /
875
876	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
877	goto out;
878	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
879	goto out_iunlock;
880
881	/*
882	* Check for log shutdown because aborting the inode can move the log
883	* tail and corrupt in memory state. This is fine if the log is shut
884	* down, but if the log is still active and only the mount is shut down
885	* then the in-memory log tail movement caused by the abort can be
886	* incorrectly propagated to disk.
887	*/
888	if (xlog_is_shutdown(log: ip->i_mount->m_log)) {
889	xfs_iunpin_wait(ip);
890	xfs_iflush_shutdown_abort(ip);
891	goto reclaim;
892	}
893	if (xfs_ipincount(ip))
894	goto out_clear_flush;
895	if (!xfs_inode_clean(ip))
896	goto out_clear_flush;
897
898	xfs_iflags_clear(ip, XFS_IFLUSHING);
899	reclaim:
900	trace_xfs_inode_reclaiming(ip);
901
902	/*
903	* Because we use RCU freeing we need to ensure the inode always appears
904	* to be reclaimed with an invalid inode number when in the free state.
905	* We do this as early as possible under the ILOCK so that
906	* xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
907	* detect races with us here. By doing this, we guarantee that once
908	* xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
909	* it will see either a valid inode that will serialise correctly, or it
910	* will see an invalid inode that it can skip.
911	*/
912	spin_lock(lock: &ip->i_flags_lock);
913	ip->i_flags = XFS_IRECLAIM;
914	ip->i_ino = `0`;
915	ip->i_sick = `0`;
916	ip->i_checked = `0`;
917	spin_unlock(lock: &ip->i_flags_lock);
918
919	ASSERT(!ip->i_itemp \|\| ip->i_itemp->ili_item.li_buf == NULL);
920	xfs_iunlock(ip, XFS_ILOCK_EXCL);
921
922	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
923	/*
924	* Remove the inode from the per-AG radix tree.
925	*
926	* Because radix_tree_delete won't complain even if the item was never
927	* added to the tree assert that it's been there before to catch
928	* problems with the inode life time early on.
929	*/
930	spin_lock(lock: &pag->pag_ici_lock);
931	if (!radix_tree_delete(&pag->pag_ici_root,
932	XFS_INO_TO_AGINO(ip->i_mount, ino)))
933	ASSERT(`0`);
934	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
935	spin_unlock(lock: &pag->pag_ici_lock);
936
937	/*
938	* Here we do an (almost) spurious inode lock in order to coordinate
939	* with inode cache radix tree lookups. This is because the lookup
940	* can reference the inodes in the cache without taking references.
941	*
942	* We make that OK here by ensuring that we wait until the inode is
943	* unlocked after the lookup before we go ahead and free it.
944	*/
945	xfs_ilock(ip, XFS_ILOCK_EXCL);
946	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
947	xfs_iunlock(ip, XFS_ILOCK_EXCL);
948	ASSERT(xfs_inode_clean(ip));
949
950	__xfs_inode_free(ip);
951	return;
952
953	out_clear_flush:
954	xfs_iflags_clear(ip, XFS_IFLUSHING);
955	out_iunlock:
956	xfs_iunlock(ip, XFS_ILOCK_EXCL);
957	out:
958	xfs_iflags_clear(ip, XFS_IRECLAIM);
959	}
960
961	/ Reclaim sick inodes if we're unmounting or the fs went down. /
962	static inline bool
963	xfs_want_reclaim_sick(
964	struct xfs_mount *mp)
965	{
966	return xfs_is_unmounting(mp) \|\| xfs_has_norecovery(mp) \|\|
967	xfs_is_shutdown(mp);
968	}
969
970	void
971	xfs_reclaim_inodes(
972	struct xfs_mount *mp)
973	{
974	struct xfs_icwalk icw = {
975	.icw_flags = `0`,
976	};
977
978	if (xfs_want_reclaim_sick(mp))
979	icw.icw_flags \|= XFS_ICWALK_FLAG_RECLAIM_SICK;
980
981	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
982	xfs_ail_push_all_sync(mp->m_ail);
983	xfs_icwalk(mp, goal: XFS_ICWALK_RECLAIM, icw: &icw);
984	}
985	}
986
987	/*
988	* The shrinker infrastructure determines how many inodes we should scan for
989	* reclaim. We want as many clean inodes ready to reclaim as possible, so we
990	* push the AIL here. We also want to proactively free up memory if we can to
991	* minimise the amount of work memory reclaim has to do so we kick the
992	* background reclaim if it isn't already scheduled.
993	*/
994	long
995	xfs_reclaim_inodes_nr(
996	struct xfs_mount *mp,
997	unsigned long nr_to_scan)
998	{
999	struct xfs_icwalk icw = {
1000	.icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT,
1001	.icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
1002	};
1003
1004	if (xfs_want_reclaim_sick(mp))
1005	icw.icw_flags \|= XFS_ICWALK_FLAG_RECLAIM_SICK;
1006
1007	/ kick background reclaimer and push the AIL /
1008	xfs_reclaim_work_queue(mp);
1009	xfs_ail_push_all(mp->m_ail);
1010
1011	xfs_icwalk(mp, goal: XFS_ICWALK_RECLAIM, icw: &icw);
1012	return `0`;
1013	}
1014
1015	/*
1016	* Return the number of reclaimable inodes in the filesystem for
1017	* the shrinker to determine how much to reclaim.
1018	*/
1019	long
1020	xfs_reclaim_inodes_count(
1021	struct xfs_mount *mp)
1022	{
1023	struct xfs_perag *pag;
1024	xfs_agnumber_t ag = `0`;
1025	long reclaimable = `0`;
1026
1027	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1028	ag = pag->pag_agno + `1`;
1029	reclaimable += pag->pag_ici_reclaimable;
1030	xfs_perag_put(pag);
1031	}
1032	return reclaimable;
1033	}
1034
1035	STATIC bool
1036	xfs_icwalk_match_id(
1037	struct xfs_inode *ip,
1038	struct xfs_icwalk *icw)
1039	{
1040	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1041	!uid_eq(left: VFS_I(ip)->i_uid, right: icw->icw_uid))
1042	return false;
1043
1044	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1045	!gid_eq(left: VFS_I(ip)->i_gid, right: icw->icw_gid))
1046	return false;
1047
1048	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1049	ip->i_projid != icw->icw_prid)
1050	return false;
1051
1052	return true;
1053	}
1054
1055	/*
1056	* A union-based inode filtering algorithm. Process the inode if any of the
1057	* criteria match. This is for global/internal scans only.
1058	*/
1059	STATIC bool
1060	xfs_icwalk_match_id_union(
1061	struct xfs_inode *ip,
1062	struct xfs_icwalk *icw)
1063	{
1064	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1065	uid_eq(left: VFS_I(ip)->i_uid, right: icw->icw_uid))
1066	return true;
1067
1068	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1069	gid_eq(left: VFS_I(ip)->i_gid, right: icw->icw_gid))
1070	return true;
1071
1072	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1073	ip->i_projid == icw->icw_prid)
1074	return true;
1075
1076	return false;
1077	}
1078
1079	/*
1080	* Is this inode @ip eligible for eof/cow block reclamation, given some
1081	* filtering parameters @icw? The inode is eligible if @icw is null or
1082	* if the predicate functions match.
1083	*/
1084	static bool
1085	xfs_icwalk_match(
1086	struct xfs_inode *ip,
1087	struct xfs_icwalk *icw)
1088	{
1089	bool match;
1090
1091	if (!icw)
1092	return true;
1093
1094	if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
1095	match = xfs_icwalk_match_id_union(ip, icw);
1096	else
1097	match = xfs_icwalk_match_id(ip, icw);
1098	if (!match)
1099	return false;
1100
1101	/ skip the inode if the file size is too small /
1102	if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
1103	XFS_ISIZE(ip) < icw->icw_min_file_size)
1104	return false;
1105
1106	return true;
1107	}
1108
1109	/*
1110	* This is a fast pass over the inode cache to try to get reclaim moving on as
1111	* many inodes as possible in a short period of time. It kicks itself every few
1112	* seconds, as well as being kicked by the inode cache shrinker when memory
1113	* goes low.
1114	*/
1115	void
1116	xfs_reclaim_worker(
1117	struct work_struct *work)
1118	{
1119	struct xfs_mount *mp = container_of(to_delayed_work(work),
1120	struct xfs_mount, m_reclaim_work);
1121
1122	xfs_icwalk(mp, goal: XFS_ICWALK_RECLAIM, NULL);
1123	xfs_reclaim_work_queue(mp);
1124	}
1125
1126	STATIC int
1127	xfs_inode_free_eofblocks(
1128	struct xfs_inode *ip,
1129	struct xfs_icwalk *icw,
1130	unsigned int *lockflags)
1131	{
1132	bool wait;
1133
1134	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1135
1136	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
1137	return `0`;
1138
1139	/*
1140	* If the mapping is dirty the operation can block and wait for some
1141	* time. Unless we are waiting, skip it.
1142	*/
1143	if (!wait && mapping_tagged(mapping: VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1144	return `0`;
1145
1146	if (!xfs_icwalk_match(ip, icw))
1147	return `0`;
1148
1149	/*
1150	* If the caller is waiting, return -EAGAIN to keep the background
1151	* scanner moving and revisit the inode in a subsequent pass.
1152	*/
1153	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1154	if (wait)
1155	return -EAGAIN;
1156	return `0`;
1157	}
1158	*lockflags \|= XFS_IOLOCK_EXCL;
1159
1160	if (xfs_can_free_eofblocks(ip, force: false))
1161	return xfs_free_eofblocks(ip);
1162
1163	/ inode could be preallocated or append-only /
1164	trace_xfs_inode_free_eofblocks_invalid(ip);
1165	xfs_inode_clear_eofblocks_tag(ip);
1166	return `0`;
1167	}
1168
1169	static void
1170	xfs_blockgc_set_iflag(
1171	struct xfs_inode *ip,
1172	unsigned long iflag)
1173	{
1174	struct xfs_mount *mp = ip->i_mount;
1175	struct xfs_perag *pag;
1176
1177	ASSERT((iflag & ~(XFS_IEOFBLOCKS \| XFS_ICOWBLOCKS)) == `0`);
1178
1179	/*
1180	* Don't bother locking the AG and looking up in the radix trees
1181	* if we already know that we have the tag set.
1182	*/
1183	if (ip->i_flags & iflag)
1184	return;
1185	spin_lock(lock: &ip->i_flags_lock);
1186	ip->i_flags \|= iflag;
1187	spin_unlock(lock: &ip->i_flags_lock);
1188
1189	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1190	spin_lock(lock: &pag->pag_ici_lock);
1191
1192	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1193	XFS_ICI_BLOCKGC_TAG);
1194
1195	spin_unlock(lock: &pag->pag_ici_lock);
1196	xfs_perag_put(pag);
1197	}
1198
1199	void
1200	xfs_inode_set_eofblocks_tag(
1201	xfs_inode_t *ip)
1202	{
1203	trace_xfs_inode_set_eofblocks_tag(ip);
1204	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
1205	}
1206
1207	static void
1208	xfs_blockgc_clear_iflag(
1209	struct xfs_inode *ip,
1210	unsigned long iflag)
1211	{
1212	struct xfs_mount *mp = ip->i_mount;
1213	struct xfs_perag *pag;
1214	bool clear_tag;
1215
1216	ASSERT((iflag & ~(XFS_IEOFBLOCKS \| XFS_ICOWBLOCKS)) == `0`);
1217
1218	spin_lock(lock: &ip->i_flags_lock);
1219	ip->i_flags &= ~iflag;
1220	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS \| XFS_ICOWBLOCKS)) == `0`;
1221	spin_unlock(lock: &ip->i_flags_lock);
1222
1223	if (!clear_tag)
1224	return;
1225
1226	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1227	spin_lock(lock: &pag->pag_ici_lock);
1228
1229	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1230	XFS_ICI_BLOCKGC_TAG);
1231
1232	spin_unlock(lock: &pag->pag_ici_lock);
1233	xfs_perag_put(pag);
1234	}
1235
1236	void
1237	xfs_inode_clear_eofblocks_tag(
1238	xfs_inode_t *ip)
1239	{
1240	trace_xfs_inode_clear_eofblocks_tag(ip);
1241	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
1242	}
1243
1244	/*
1245	* Set ourselves up to free CoW blocks from this file. If it's already clean
1246	* then we can bail out quickly, but otherwise we must back off if the file
1247	* is undergoing some kind of write.
1248	*/
1249	static bool
1250	xfs_prep_free_cowblocks(
1251	struct xfs_inode *ip)
1252	{
1253	/*
1254	* Just clear the tag if we have an empty cow fork or none at all. It's
1255	* possible the inode was fully unshared since it was originally tagged.
1256	*/
1257	if (!xfs_inode_has_cow_data(ip)) {
1258	trace_xfs_inode_free_cowblocks_invalid(ip);
1259	xfs_inode_clear_cowblocks_tag(ip);
1260	return false;
1261	}
1262
1263	/*
1264	* If the mapping is dirty or under writeback we cannot touch the
1265	* CoW fork. Leave it alone if we're in the midst of a directio.
1266	*/
1267	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) \|\|
1268	mapping_tagged(mapping: VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) \|\|
1269	mapping_tagged(mapping: VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) \|\|
1270	atomic_read(v: &VFS_I(ip)->i_dio_count))
1271	return false;
1272
1273	return true;
1274	}
1275
1276	/*
1277	* Automatic CoW Reservation Freeing
1278	*
1279	* These functions automatically garbage collect leftover CoW reservations
1280	* that were made on behalf of a cowextsize hint when we start to run out
1281	* of quota or when the reservations sit around for too long. If the file
1282	* has dirty pages or is undergoing writeback, its CoW reservations will
1283	* be retained.
1284	*
1285	* The actual garbage collection piggybacks off the same code that runs
1286	* the speculative EOF preallocation garbage collector.
1287	*/
1288	STATIC int
1289	xfs_inode_free_cowblocks(
1290	struct xfs_inode *ip,
1291	struct xfs_icwalk *icw,
1292	unsigned int *lockflags)
1293	{
1294	bool wait;
1295	int ret = `0`;
1296
1297	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1298
1299	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
1300	return `0`;
1301
1302	if (!xfs_prep_free_cowblocks(ip))
1303	return `0`;
1304
1305	if (!xfs_icwalk_match(ip, icw))
1306	return `0`;
1307
1308	/*
1309	* If the caller is waiting, return -EAGAIN to keep the background
1310	* scanner moving and revisit the inode in a subsequent pass.
1311	*/
1312	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
1313	!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1314	if (wait)
1315	return -EAGAIN;
1316	return `0`;
1317	}
1318	*lockflags \|= XFS_IOLOCK_EXCL;
1319
1320	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
1321	if (wait)
1322	return -EAGAIN;
1323	return `0`;
1324	}
1325	*lockflags \|= XFS_MMAPLOCK_EXCL;
1326
1327	/*
1328	* Check again, nobody else should be able to dirty blocks or change
1329	* the reflink iflag now that we have the first two locks held.
1330	*/
1331	if (xfs_prep_free_cowblocks(ip))
1332	ret = xfs_reflink_cancel_cow_range(ip, `0`, NULLFILEOFF, false);
1333	return ret;
1334	}
1335
1336	void
1337	xfs_inode_set_cowblocks_tag(
1338	xfs_inode_t *ip)
1339	{
1340	trace_xfs_inode_set_cowblocks_tag(ip);
1341	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
1342	}
1343
1344	void
1345	xfs_inode_clear_cowblocks_tag(
1346	xfs_inode_t *ip)
1347	{
1348	trace_xfs_inode_clear_cowblocks_tag(ip);
1349	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
1350	}
1351
1352	/ Disable post-EOF and CoW block auto-reclamation. /
1353	void
1354	xfs_blockgc_stop(
1355	struct xfs_mount *mp)
1356	{
1357	struct xfs_perag *pag;
1358	xfs_agnumber_t agno;
1359
1360	if (!xfs_clear_blockgc_enabled(mp))
1361	return;
1362
1363	for_each_perag(mp, agno, pag)
1364	cancel_delayed_work_sync(&pag->pag_blockgc_work);
1365	trace_xfs_blockgc_stop(mp, __return_address);
1366	}
1367
1368	/ Enable post-EOF and CoW block auto-reclamation. /
1369	void
1370	xfs_blockgc_start(
1371	struct xfs_mount *mp)
1372	{
1373	struct xfs_perag *pag;
1374	xfs_agnumber_t agno;
1375
1376	if (xfs_set_blockgc_enabled(mp))
1377	return;
1378
1379	trace_xfs_blockgc_start(mp, __return_address);
1380	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1381	xfs_blockgc_queue(pag);
1382	}
1383
1384	/ Don't try to run block gc on an inode that's in any of these states. /
1385	#define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW \| \
1386	XFS_NEED_INACTIVE \| \
1387	XFS_INACTIVATING \| \
1388	XFS_IRECLAIMABLE \| \
1389	XFS_IRECLAIM)
1390	/*
1391	* Decide if the given @ip is eligible for garbage collection of speculative
1392	* preallocations, and grab it if so. Returns true if it's ready to go or
1393	* false if we should just ignore it.
1394	*/
1395	static bool
1396	xfs_blockgc_igrab(
1397	struct xfs_inode *ip)
1398	{
1399	struct inode *inode = VFS_I(ip);
1400
1401	ASSERT(rcu_read_lock_held());
1402
1403	/ Check for stale RCU freed inode /
1404	spin_lock(lock: &ip->i_flags_lock);
1405	if (!ip->i_ino)
1406	goto out_unlock_noent;
1407
1408	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
1409	goto out_unlock_noent;
1410	spin_unlock(lock: &ip->i_flags_lock);
1411
1412	/ nothing to sync during shutdown /
1413	if (xfs_is_shutdown(mp: ip->i_mount))
1414	return false;
1415
1416	/ If we can't grab the inode, it must on it's way to reclaim. /
1417	if (!igrab(inode))
1418	return false;
1419
1420	/ inode is valid /
1421	return true;
1422
1423	out_unlock_noent:
1424	spin_unlock(lock: &ip->i_flags_lock);
1425	return false;
1426	}
1427
/*
 * Scan one incore inode for block preallocations that we can remove.
 *
 * Post-EOF blocks are handled first; CoW blocks are only attempted when that
 * step did not fail.  Whatever inode locks the two steps took are dropped
 * here, and the inode reference is released in every case.
 */
static int
xfs_blockgc_scan_inode(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	unsigned int		held = 0;
	int			error;

	error = xfs_inode_free_eofblocks(ip, icw, &held);
	if (!error)
		error = xfs_inode_free_cowblocks(ip, icw, &held);

	if (held)
		xfs_iunlock(ip, held);
	xfs_irele(ip);
	return error;
}
1448
1449	/ Background worker that trims preallocated space. /
1450	void
1451	xfs_blockgc_worker(
1452	struct work_struct *work)
1453	{
1454	struct xfs_perag *pag = container_of(to_delayed_work(work),
1455	struct xfs_perag, pag_blockgc_work);
1456	struct xfs_mount *mp = pag->pag_mount;
1457	int error;
1458
1459	trace_xfs_blockgc_worker(mp, __return_address);
1460
1461	error = xfs_icwalk_ag(pag, goal: XFS_ICWALK_BLOCKGC, NULL);
1462	if (error)
1463	xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
1464	pag->pag_agno, error);
1465	xfs_blockgc_queue(pag);
1466	}
1467
1468	/*
1469	* Try to free space in the filesystem by purging inactive inodes, eofblocks
1470	* and cowblocks.
1471	*/
1472	int
1473	xfs_blockgc_free_space(
1474	struct xfs_mount *mp,
1475	struct xfs_icwalk *icw)
1476	{
1477	int error;
1478
1479	trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
1480
1481	error = xfs_icwalk(mp, goal: XFS_ICWALK_BLOCKGC, icw);
1482	if (error)
1483	return error;
1484
1485	return xfs_inodegc_flush(mp);
1486	}
1487
1488	/*
1489	* Reclaim all the free space that we can by scheduling the background blockgc
1490	* and inodegc workers immediately and waiting for them all to clear.
1491	*/
1492	int
1493	xfs_blockgc_flush_all(
1494	struct xfs_mount *mp)
1495	{
1496	struct xfs_perag *pag;
1497	xfs_agnumber_t agno;
1498
1499	trace_xfs_blockgc_flush_all(mp, __return_address);
1500
1501	/*
1502	* For each blockgc worker, move its queue time up to now. If it
1503	* wasn't queued, it will not be requeued. Then flush whatever's
1504	* left.
1505	*/
1506	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1507	mod_delayed_work(pag->pag_mount->m_blockgc_wq,
1508	&pag->pag_blockgc_work, `0`);
1509
1510	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1511	flush_delayed_work(&pag->pag_blockgc_work);
1512
1513	return xfs_inodegc_flush(mp);
1514	}
1515
1516	/*
1517	* Run cow/eofblocks scans on the supplied dquots. We don't know exactly which
1518	* quota caused an allocation failure, so we make a best effort by including
1519	* each quota under low free space conditions (less than 1% free space) in the
1520	* scan.
1521	*
1522	* Callers must not hold any inode's ILOCK. If requesting a synchronous scan
1523	* (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
1524	* MMAPLOCK.
1525	*/
1526	int
1527	xfs_blockgc_free_dquots(
1528	struct xfs_mount *mp,
1529	struct xfs_dquot *udqp,
1530	struct xfs_dquot *gdqp,
1531	struct xfs_dquot *pdqp,
1532	unsigned int iwalk_flags)
1533	{
1534	struct xfs_icwalk icw = {`0`};
1535	bool do_work = false;
1536
1537	if (!udqp && !gdqp && !pdqp)
1538	return `0`;
1539
1540	/*
1541	* Run a scan to free blocks using the union filter to cover all
1542	* applicable quotas in a single scan.
1543	*/
1544	icw.icw_flags = XFS_ICWALK_FLAG_UNION \| iwalk_flags;
1545
1546	if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(dqp: udqp)) {
1547	icw.icw_uid = make_kuid(from: mp->m_super->s_user_ns, uid: udqp->q_id);
1548	icw.icw_flags \|= XFS_ICWALK_FLAG_UID;
1549	do_work = true;
1550	}
1551
1552	if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(dqp: gdqp)) {
1553	icw.icw_gid = make_kgid(from: mp->m_super->s_user_ns, gid: gdqp->q_id);
1554	icw.icw_flags \|= XFS_ICWALK_FLAG_GID;
1555	do_work = true;
1556	}
1557
1558	if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(dqp: pdqp)) {
1559	icw.icw_prid = pdqp->q_id;
1560	icw.icw_flags \|= XFS_ICWALK_FLAG_PRID;
1561	do_work = true;
1562	}
1563
1564	if (!do_work)
1565	return `0`;
1566
1567	return xfs_blockgc_free_space(mp, icw: &icw);
1568	}
1569
1570	/ Run cow/eofblocks scans on the quotas attached to the inode. /
1571	int
1572	xfs_blockgc_free_quota(
1573	struct xfs_inode *ip,
1574	unsigned int iwalk_flags)
1575	{
1576	return xfs_blockgc_free_dquots(ip->i_mount,
1577	xfs_inode_dquot(ip, XFS_DQTYPE_USER),
1578	xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
1579	xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
1580	}
1581
1582	/ XFS Inode Cache Walking Code /
1583
1584	/*
1585	* The inode lookup is done in batches to keep the amount of lock traffic and
1586	* radix tree lookups to a minimum. The batch size is a trade off between
1587	* lookup reduction and stack usage. This is in the reclaim path, so we can't
1588	* be too greedy.
1589	*/
1590	#define XFS_LOOKUP_BATCH 32
1591
1592
1593	/*
1594	* Decide if we want to grab this inode in anticipation of doing work towards
1595	* the goal.
1596	*/
1597	static inline bool
1598	xfs_icwalk_igrab(
1599	enum xfs_icwalk_goal goal,
1600	struct xfs_inode *ip,
1601	struct xfs_icwalk *icw)
1602	{
1603	switch (goal) {
1604	case XFS_ICWALK_BLOCKGC:
1605	return xfs_blockgc_igrab(ip);
1606	case XFS_ICWALK_RECLAIM:
1607	return xfs_reclaim_igrab(ip, icw);
1608	default:
1609	return false;
1610	}
1611	}
1612
1613	/*
1614	* Process an inode. Each processing function must handle any state changes
1615	* made by the icwalk igrab function. Return -EAGAIN to skip an inode.
1616	*/
1617	static inline int
1618	xfs_icwalk_process_inode(
1619	enum xfs_icwalk_goal goal,
1620	struct xfs_inode *ip,
1621	struct xfs_perag *pag,
1622	struct xfs_icwalk *icw)
1623	{
1624	int error = `0`;
1625
1626	switch (goal) {
1627	case XFS_ICWALK_BLOCKGC:
1628	error = xfs_blockgc_scan_inode(ip, icw);
1629	break;
1630	case XFS_ICWALK_RECLAIM:
1631	xfs_reclaim_inode(ip, pag);
1632	break;
1633	}
1634	return error;
1635	}
1636
1637	/*
1638	* For a given per-AG structure @pag and a goal, grab qualifying inodes and
1639	* process them in some manner.
1640	*/
1641	static int
1642	xfs_icwalk_ag(
1643	struct xfs_perag *pag,
1644	enum xfs_icwalk_goal goal,
1645	struct xfs_icwalk *icw)
1646	{
1647	struct xfs_mount *mp = pag->pag_mount;
1648	uint32_t first_index;
1649	int last_error = `0`;
1650	int skipped;
1651	bool done;
1652	int nr_found;
1653
1654	restart:
1655	done = false;
1656	skipped = `0`;
1657	if (goal == XFS_ICWALK_RECLAIM)
1658	first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
1659	else
1660	first_index = `0`;
1661	nr_found = `0`;
1662	do {
1663	struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1664	int error = `0`;
1665	int i;
1666
1667	rcu_read_lock();
1668
1669	nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
1670	results: (void **) batch, first_index,
1671	XFS_LOOKUP_BATCH, tag: goal);
1672	if (!nr_found) {
1673	done = true;
1674	rcu_read_unlock();
1675	break;
1676	}
1677
1678	/*
1679	* Grab the inodes before we drop the lock. if we found
1680	* nothing, nr == 0 and the loop will be skipped.
1681	*/
1682	for (i = `0`; i < nr_found; i++) {
1683	struct xfs_inode *ip = batch[i];
1684
1685	if (done \|\| !xfs_icwalk_igrab(goal, ip, icw))
1686	batch[i] = NULL;
1687
1688	/*
1689	* Update the index for the next lookup. Catch
1690	* overflows into the next AG range which can occur if
1691	* we have inodes in the last block of the AG and we
1692	* are currently pointing to the last inode.
1693	*
1694	* Because we may see inodes that are from the wrong AG
1695	* due to RCU freeing and reallocation, only update the
1696	* index if it lies in this AG. It was a race that lead
1697	* us to see this inode, so another lookup from the
1698	* same index will not find it again.
1699	*/
1700	if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1701	continue;
1702	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + `1`);
1703	if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1704	done = true;
1705	}
1706
1707	/ unlock now we've grabbed the inodes. /
1708	rcu_read_unlock();
1709
1710	for (i = `0`; i < nr_found; i++) {
1711	if (!batch[i])
1712	continue;
1713	error = xfs_icwalk_process_inode(goal, ip: batch[i], pag,
1714	icw);
1715	if (error == -EAGAIN) {
1716	skipped++;
1717	continue;
1718	}
1719	if (error && last_error != -EFSCORRUPTED)
1720	last_error = error;
1721	}
1722
1723	/ bail out if the filesystem is corrupted. /
1724	if (error == -EFSCORRUPTED)
1725	break;
1726
1727	cond_resched();
1728
1729	if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
1730	icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
1731	if (icw->icw_scan_limit <= `0`)
1732	break;
1733	}
1734	} while (nr_found && !done);
1735
1736	if (goal == XFS_ICWALK_RECLAIM) {
1737	if (done)
1738	first_index = `0`;
1739	WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
1740	}
1741
1742	if (skipped) {
1743	delay(ticks: `1`);
1744	goto restart;
1745	}
1746	return last_error;
1747	}
1748
1749	/ Walk all incore inodes to achieve a given goal. /
1750	static int
1751	xfs_icwalk(
1752	struct xfs_mount *mp,
1753	enum xfs_icwalk_goal goal,
1754	struct xfs_icwalk *icw)
1755	{
1756	struct xfs_perag *pag;
1757	int error = `0`;
1758	int last_error = `0`;
1759	xfs_agnumber_t agno;
1760
1761	for_each_perag_tag(mp, agno, pag, goal) {
1762	error = xfs_icwalk_ag(pag, goal, icw);
1763	if (error) {
1764	last_error = error;
1765	if (error == -EFSCORRUPTED) {
1766	xfs_perag_rele(pag);
1767	break;
1768	}
1769	}
1770	}
1771	return last_error;
1772	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
1773	}
1774
#ifdef DEBUG
/*
 * Debug helper: warn about every delalloc extent (an extent whose start block
 * satisfies isnullstartblock()) found in the given fork of @ip.
 */
static void
xfs_check_delalloc(
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_bmbt_irec	got;
	struct xfs_iext_cursor	icur;
	bool			more;

	if (!ifp)
		return;

	for (more = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
	     more;
	     more = xfs_iext_next_extent(ifp, &icur, &got)) {
		if (!isnullstartblock(got.br_startblock))
			continue;

		xfs_warn(ip->i_mount,
	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
			ip->i_ino,
			whichfork == XFS_DATA_FORK ? "data" : "cow",
			got.br_startoff, got.br_blockcount);
	}
}
#else
#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
#endif
1800
1801	/ Schedule the inode for reclaim. /
1802	static void
1803	xfs_inodegc_set_reclaimable(
1804	struct xfs_inode *ip)
1805	{
1806	struct xfs_mount *mp = ip->i_mount;
1807	struct xfs_perag *pag;
1808
1809	if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
1810	xfs_check_delalloc(ip, XFS_DATA_FORK);
1811	xfs_check_delalloc(ip, XFS_COW_FORK);
1812	ASSERT(`0`);
1813	}
1814
1815	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1816	spin_lock(lock: &pag->pag_ici_lock);
1817	spin_lock(lock: &ip->i_flags_lock);
1818
1819	trace_xfs_inode_set_reclaimable(ip);
1820	ip->i_flags &= ~(XFS_NEED_INACTIVE \| XFS_INACTIVATING);
1821	ip->i_flags \|= XFS_IRECLAIMABLE;
1822	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1823	XFS_ICI_RECLAIM_TAG);
1824
1825	spin_unlock(lock: &ip->i_flags_lock);
1826	spin_unlock(lock: &pag->pag_ici_lock);
1827	xfs_perag_put(pag);
1828	}
1829
/*
 * Free all speculative preallocations and possibly even the inode itself.
 * This is the last chance to make changes to an otherwise unreferenced file
 * before incore reclamation happens.
 *
 * The inode is marked reclaimable whether or not xfs_inactive() succeeded;
 * the xfs_inactive() result is returned to the caller.
 */
static int
xfs_inodegc_inactivate(
	struct xfs_inode	*ip)
{
	int			ret;

	trace_xfs_inode_inactivating(ip);

	ret = xfs_inactive(ip);
	xfs_inodegc_set_reclaimable(ip);

	return ret;
}
1847
1848	void
1849	xfs_inodegc_worker(
1850	struct work_struct *work)
1851	{
1852	struct xfs_inodegc *gc = container_of(to_delayed_work(work),
1853	struct xfs_inodegc, work);
1854	struct llist_node *node = llist_del_all(head: &gc->list);
1855	struct xfs_inode ip, n;
1856	struct xfs_mount *mp = gc->mp;
1857	unsigned int nofs_flag;
1858
1859	/*
1860	* Clear the cpu mask bit and ensure that we have seen the latest
1861	* update of the gc structure associated with this CPU. This matches
1862	* with the release semantics used when setting the cpumask bit in
1863	* xfs_inodegc_queue.
1864	*/
1865	cpumask_clear_cpu(cpu: gc->cpu, dstp: &mp->m_inodegc_cpumask);
1866	smp_mb__after_atomic();
1867
1868	WRITE_ONCE(gc->items, `0`);
1869
1870	if (!node)
1871	return;
1872
1873	/*
1874	* We can allocate memory here while doing writeback on behalf of
1875	* memory reclaim. To avoid memory allocation deadlocks set the
1876	* task-wide nofs context for the following operations.
1877	*/
1878	nofs_flag = memalloc_nofs_save();
1879
1880	ip = llist_entry(node, struct xfs_inode, i_gclist);
1881	trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
1882
1883	WRITE_ONCE(gc->shrinker_hits, `0`);
1884	llist_for_each_entry_safe(ip, n, node, i_gclist) {
1885	int error;
1886
1887	xfs_iflags_set(ip, XFS_INACTIVATING);
1888	error = xfs_inodegc_inactivate(ip);
1889	if (error && !gc->error)
1890	gc->error = error;
1891	}
1892
1893	memalloc_nofs_restore(flags: nofs_flag);
1894	}
1895
1896	/*
1897	* Expedite all pending inodegc work to run immediately. This does not wait for
1898	* completion of the work.
1899	*/
1900	void
1901	xfs_inodegc_push(
1902	struct xfs_mount *mp)
1903	{
1904	if (!xfs_is_inodegc_enabled(mp))
1905	return;
1906	trace_xfs_inodegc_push(mp, __return_address);
1907	xfs_inodegc_queue_all(mp);
1908	}
1909
1910	/*
1911	* Force all currently queued inode inactivation work to run immediately and
1912	* wait for the work to finish.
1913	*/
1914	int
1915	xfs_inodegc_flush(
1916	struct xfs_mount *mp)
1917	{
1918	xfs_inodegc_push(mp);
1919	trace_xfs_inodegc_flush(mp, __return_address);
1920	return xfs_inodegc_wait_all(mp);
1921	}
1922
1923	/*
1924	* Flush all the pending work and then disable the inode inactivation background
1925	* workers and wait for them to stop. Caller must hold sb->s_umount to
1926	* coordinate changes in the inodegc_enabled state.
1927	*/
1928	void
1929	xfs_inodegc_stop(
1930	struct xfs_mount *mp)
1931	{
1932	bool rerun;
1933
1934	if (!xfs_clear_inodegc_enabled(mp))
1935	return;
1936
1937	/*
1938	* Drain all pending inodegc work, including inodes that could be
1939	* queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
1940	* threads that sample the inodegc state just prior to us clearing it.
1941	* The inodegc flag state prevents new threads from queuing more
1942	* inodes, so we queue pending work items and flush the workqueue until
1943	* all inodegc lists are empty. IOWs, we cannot use drain_workqueue
1944	* here because it does not allow other unserialized mechanisms to
1945	* reschedule inodegc work while this draining is in progress.
1946	*/
1947	xfs_inodegc_queue_all(mp);
1948	do {
1949	flush_workqueue(mp->m_inodegc_wq);
1950	rerun = xfs_inodegc_queue_all(mp);
1951	} while (rerun);
1952
1953	trace_xfs_inodegc_stop(mp, __return_address);
1954	}
1955
1956	/*
1957	* Enable the inode inactivation background workers and schedule deferred inode
1958	* inactivation work if there is any. Caller must hold sb->s_umount to
1959	* coordinate changes in the inodegc_enabled state.
1960	*/
1961	void
1962	xfs_inodegc_start(
1963	struct xfs_mount *mp)
1964	{
1965	if (xfs_set_inodegc_enabled(mp))
1966	return;
1967
1968	trace_xfs_inodegc_start(mp, __return_address);
1969	xfs_inodegc_queue_all(mp);
1970	}
1971
#ifdef CONFIG_XFS_RT
/*
 * Realtime files only: report whether the free realtime extent counter has
 * dropped below the XFS_LOWSP_5_PCNT low-space threshold.
 */
static inline bool
xfs_inodegc_want_queue_rt_file(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (!XFS_IS_REALTIME_INODE(ip))
		return false;

	return __percpu_counter_compare(&mp->m_frextents,
				mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip)	(false)
#endif /* CONFIG_XFS_RT */
1992
1993	/*
1994	* Schedule the inactivation worker when:
1995	*
1996	* - We've accumulated more than one inode cluster buffer's worth of inodes.
1997	* - There is less than 5% free space left.
1998	* - Any of the quotas for this inode are near an enforcement limit.
1999	*/
2000	static inline bool
2001	xfs_inodegc_want_queue_work(
2002	struct xfs_inode *ip,
2003	unsigned int items)
2004	{
2005	struct xfs_mount *mp = ip->i_mount;
2006
2007	if (items > mp->m_ino_geo.inodes_per_cluster)
2008	return true;
2009
2010	if (__percpu_counter_compare(fbc: &mp->m_fdblocks,
2011	rhs: mp->m_low_space[XFS_LOWSP_5_PCNT],
2012	XFS_FDBLOCKS_BATCH) < `0`)
2013	return true;
2014
2015	if (xfs_inodegc_want_queue_rt_file(ip))
2016	return true;
2017
2018	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
2019	return true;
2020
2021	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
2022	return true;
2023
2024	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
2025	return true;
2026
2027	return false;
2028	}
2029
2030	/*
2031	* Upper bound on the number of inodes in each AG that can be queued for
2032	* inactivation at any given time, to avoid monopolizing the workqueue.
2033	*/
2034	#define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK)
2035
2036	/*
2037	* Make the frontend wait for inactivations when:
2038	*
2039	* - Memory shrinkers queued the inactivation worker and it hasn't finished.
2040	* - The queue depth exceeds the maximum allowable percpu backlog.
2041	*
2042	* Note: If we are in a NOFS context here (e.g. current thread is running a
2043	* transaction) the we don't want to block here as inodegc progress may require
2044	* filesystem resources we hold to make progress and that could result in a
2045	* deadlock. Hence we skip out of here if we are in a scoped NOFS context.
2046	*/
2047	static inline bool
2048	xfs_inodegc_want_flush_work(
2049	struct xfs_inode *ip,
2050	unsigned int items,
2051	unsigned int shrinker_hits)
2052	{
2053	if (current->flags & PF_MEMALLOC_NOFS)
2054	return false;
2055
2056	if (shrinker_hits > `0`)
2057	return true;
2058
2059	if (items > XFS_INODEGC_MAX_BACKLOG)
2060	return true;
2061
2062	return false;
2063	}
2064
2065	/*
2066	* Queue a background inactivation worker if there are inodes that need to be
2067	* inactivated and higher level xfs code hasn't disabled the background
2068	* workers.
2069	*/
2070	static void
2071	xfs_inodegc_queue(
2072	struct xfs_inode *ip)
2073	{
2074	struct xfs_mount *mp = ip->i_mount;
2075	struct xfs_inodegc *gc;
2076	int items;
2077	unsigned int shrinker_hits;
2078	unsigned int cpu_nr;
2079	unsigned long queue_delay = `1`;
2080
2081	trace_xfs_inode_set_need_inactive(ip);
2082	spin_lock(lock: &ip->i_flags_lock);
2083	ip->i_flags \|= XFS_NEED_INACTIVE;
2084	spin_unlock(lock: &ip->i_flags_lock);
2085
2086	cpu_nr = get_cpu();
2087	gc = this_cpu_ptr(mp->m_inodegc);
2088	llist_add(new: &ip->i_gclist, head: &gc->list);
2089	items = READ_ONCE(gc->items);
2090	WRITE_ONCE(gc->items, items + `1`);
2091	shrinker_hits = READ_ONCE(gc->shrinker_hits);
2092
2093	/*
2094	* Ensure the list add is always seen by anyone who finds the cpumask
2095	* bit set. This effectively gives the cpumask bit set operation
2096	* release ordering semantics.
2097	*/
2098	smp_mb__before_atomic();
2099	if (!cpumask_test_cpu(cpu: cpu_nr, cpumask: &mp->m_inodegc_cpumask))
2100	cpumask_test_and_set_cpu(cpu: cpu_nr, cpumask: &mp->m_inodegc_cpumask);
2101
2102	/*
2103	* We queue the work while holding the current CPU so that the work
2104	* is scheduled to run on this CPU.
2105	*/
2106	if (!xfs_is_inodegc_enabled(mp)) {
2107	put_cpu();
2108	return;
2109	}
2110
2111	if (xfs_inodegc_want_queue_work(ip, items))
2112	queue_delay = `0`;
2113
2114	trace_xfs_inodegc_queue(mp, __return_address);
2115	mod_delayed_work_on(current_cpu(), wq: mp->m_inodegc_wq, dwork: &gc->work,
2116	delay: queue_delay);
2117	put_cpu();
2118
2119	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
2120	trace_xfs_inodegc_throttle(mp, __return_address);
2121	flush_delayed_work(dwork: &gc->work);
2122	}
2123	}
2124
2125	/*
2126	* We set the inode flag atomically with the radix tree tag. Once we get tag
2127	* lookups on the radix tree, this inode flag can go away.
2128	*
2129	* We always use background reclaim here because even if the inode is clean, it
2130	* still may be under IO and hence we have wait for IO completion to occur
2131	* before we can reclaim the inode. The background reclaim path handles this
2132	* more efficiently than we can here, so simply let background reclaim tear down
2133	* all inodes.
2134	*/
2135	void
2136	xfs_inode_mark_reclaimable(
2137	struct xfs_inode *ip)
2138	{
2139	struct xfs_mount *mp = ip->i_mount;
2140	bool need_inactive;
2141
2142	XFS_STATS_INC(mp, vn_reclaim);
2143
2144	/*
2145	* We should never get here with any of the reclaim flags already set.
2146	*/
2147	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
2148
2149	need_inactive = xfs_inode_needs_inactive(ip);
2150	if (need_inactive) {
2151	xfs_inodegc_queue(ip);
2152	return;
2153	}
2154
2155	/ Going straight to reclaim, so drop the dquots. /
2156	xfs_qm_dqdetach(ip);
2157	xfs_inodegc_set_reclaimable(ip);
2158	}
2159
2160	/*
2161	* Register a phony shrinker so that we can run background inodegc sooner when
2162	* there's memory pressure. Inactivation does not itself free any memory but
2163	* it does make inodes reclaimable, which eventually frees memory.
2164	*
2165	* The count function, seek value, and batch value are crafted to trigger the
2166	* scan function during the second round of scanning. Hopefully this means
2167	* that we reclaimed enough memory that initiating metadata transactions won't
2168	* make things worse.
2169	*/
2170	#define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY)
2171	#define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
2172
2173	static unsigned long
2174	xfs_inodegc_shrinker_count(
2175	struct shrinker *shrink,
2176	struct shrink_control *sc)
2177	{
2178	struct xfs_mount *mp = shrink->private_data;
2179	struct xfs_inodegc *gc;
2180	int cpu;
2181
2182	if (!xfs_is_inodegc_enabled(mp))
2183	return `0`;
2184
2185	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
2186	gc = per_cpu_ptr(mp->m_inodegc, cpu);
2187	if (!llist_empty(head: &gc->list))
2188	return XFS_INODEGC_SHRINKER_COUNT;
2189	}
2190
2191	return `0`;
2192	}
2193
2194	static unsigned long
2195	xfs_inodegc_shrinker_scan(
2196	struct shrinker *shrink,
2197	struct shrink_control *sc)
2198	{
2199	struct xfs_mount *mp = shrink->private_data;
2200	struct xfs_inodegc *gc;
2201	int cpu;
2202	bool no_items = true;
2203
2204	if (!xfs_is_inodegc_enabled(mp))
2205	return SHRINK_STOP;
2206
2207	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
2208
2209	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
2210	gc = per_cpu_ptr(mp->m_inodegc, cpu);
2211	if (!llist_empty(head: &gc->list)) {
2212	unsigned int h = READ_ONCE(gc->shrinker_hits);
2213
2214	WRITE_ONCE(gc->shrinker_hits, h + `1`);
2215	mod_delayed_work_on(cpu, wq: mp->m_inodegc_wq, dwork: &gc->work, delay: `0`);
2216	no_items = false;
2217	}
2218	}
2219
2220	/*
2221	* If there are no inodes to inactivate, we don't want the shrinker
2222	* to think there's deferred work to call us back about.
2223	*/
2224	if (no_items)
2225	return LONG_MAX;
2226
2227	return SHRINK_STOP;
2228	}
2229
2230	/ Register a shrinker so we can accelerate inodegc and throttle queuing. /
2231	int
2232	xfs_inodegc_register_shrinker(
2233	struct xfs_mount *mp)
2234	{
2235	mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
2236	fmt: "xfs-inodegc:%s",
2237	mp->m_super->s_id);
2238	if (!mp->m_inodegc_shrinker)
2239	return -ENOMEM;
2240
2241	mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
2242	mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
2243	mp->m_inodegc_shrinker->seeks = `0`;
2244	mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
2245	mp->m_inodegc_shrinker->private_data = mp;
2246
2247	shrinker_register(shrinker: mp->m_inodegc_shrinker);
2248
2249	return `0`;
2250	}
2251

source code of linux/fs/xfs/xfs_icache.c