shmem.c source code [linux/mm/shmem.c]

1	/*
2	* Resizable virtual memory filesystem for Linux.
3	*
4	* Copyright (C) 2000 Linus Torvalds.
5	* 2000 Transmeta Corp.
6	* 2000-2001 Christoph Rohland
7	* 2000-2001 SAP AG
8	* 2002 Red Hat Inc.
9	* Copyright (C) 2002-2011 Hugh Dickins.
10	* Copyright (C) 2011 Google Inc.
11	* Copyright (C) 2002-2005 VERITAS Software Corporation.
12	* Copyright (C) 2004 Andi Kleen, SuSE Labs
13	*
14	* Extended attribute support for tmpfs:
15	* Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16	* Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17	*
18	* tiny-shmem:
19	* Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20	*
21	* This file is released under the GPL.
22	*/
23
24	#include <linux/fs.h>
25	#include <linux/init.h>
26	#include <linux/vfs.h>
27	#include <linux/mount.h>
28	#include <linux/ramfs.h>
29	#include <linux/pagemap.h>
30	#include <linux/file.h>
31	#include <linux/fileattr.h>
32	#include <linux/mm.h>
33	#include <linux/random.h>
34	#include <linux/sched/signal.h>
35	#include <linux/export.h>
36	#include <linux/shmem_fs.h>
37	#include <linux/swap.h>
38	#include <linux/uio.h>
39	#include <linux/hugetlb.h>
40	#include <linux/fs_parser.h>
41	#include <linux/swapfile.h>
42	#include <linux/iversion.h>
43	#include <linux/unicode.h>
44	#include "swap.h"
45
46	static struct vfsmount *shm_mnt __ro_after_init;
47
48	#ifdef CONFIG_SHMEM
49	/*
50	* This virtual memory filesystem is heavily based on the ramfs. It
51	* extends ramfs by the ability to use swap and honor resource limits
52	* which makes it a completely usable filesystem.
53	*/
54
55	#include <linux/xattr.h>
56	#include <linux/exportfs.h>
57	#include <linux/posix_acl.h>
58	#include <linux/posix_acl_xattr.h>
59	#include <linux/mman.h>
60	#include <linux/string.h>
61	#include <linux/slab.h>
62	#include <linux/backing-dev.h>
63	#include <linux/writeback.h>
64	#include <linux/pagevec.h>
65	#include <linux/percpu_counter.h>
66	#include <linux/falloc.h>
67	#include <linux/splice.h>
68	#include <linux/security.h>
69	#include <linux/leafops.h>
70	#include <linux/mempolicy.h>
71	#include <linux/namei.h>
72	#include <linux/ctype.h>
73	#include <linux/migrate.h>
74	#include <linux/highmem.h>
75	#include <linux/seq_file.h>
76	#include <linux/magic.h>
77	#include <linux/syscalls.h>
78	#include <linux/fcntl.h>
79	#include <uapi/linux/memfd.h>
80	#include <linux/rmap.h>
81	#include <linux/uuid.h>
82	#include <linux/quotaops.h>
83	#include <linux/rcupdate_wait.h>
84
85	#include <linux/uaccess.h>
86
87	#include "internal.h"
88
/* Number of whole pages needed to hold @size bytes (rounded up). */
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Pretend that one inode + its dentry occupy this much memory */
#define BOGO_INODE_SIZE 1024

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
99
100	/*
101	* shmem_fallocate communicates with shmem_fault or shmem_writeout via
102	* inode->i_private (with i_rwsem making sure that it has only one user at
103	* a time): we would prefer not to enlarge the shmem inode just for that.
104	*/
105	struct shmem_falloc {
106	wait_queue_head_t waitq; /* faults into hole wait for punch to end /
107	pgoff_t start; / start of range currently being fallocated /
108	pgoff_t next; / the next page offset to be fallocated /
109	pgoff_t nr_falloced; / how many new pages have been fallocated /
110	pgoff_t nr_unswapped; / how often writeout refused to swap out /
111	};
112
113	struct shmem_options {
114	unsigned long long blocks;
115	unsigned long long inodes;
116	struct mempolicy *mpol;
117	kuid_t uid;
118	kgid_t gid;
119	umode_t mode;
120	bool full_inums;
121	int huge;
122	int seen;
123	bool noswap;
124	unsigned short quota_types;
125	struct shmem_quota_limits qlimits;
126	#if IS_ENABLED(CONFIG_UNICODE)
127	struct unicode_map *encoding;
128	bool strict_encoding;
129	#endif
130	#define SHMEM_SEEN_BLOCKS 1
131	#define SHMEM_SEEN_INODES 2
132	#define SHMEM_SEEN_HUGE 4
133	#define SHMEM_SEEN_INUMS 8
134	#define SHMEM_SEEN_QUOTA 16
135	};
136
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Per-policy sets of huge folio orders, one bit per order (compared against
 * BIT(HPAGE_PMD_ORDER) elsewhere in this file).
 */
static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
/* Init-time only flag (__initdata). */
static bool shmem_orders_configured __initdata;
#endif
144
#ifdef CONFIG_TMPFS
/* Default block limit: half of the total RAM pages. */
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

/*
 * Default inode limit: the smallest of the non-highmem page count, half of
 * the total RAM pages, and ULONG_MAX / BOGO_INODE_SIZE (so that the inode
 * count multiplied by BOGO_INODE_SIZE still fits in an unsigned long).
 */
static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
			ULONG_MAX / BOGO_INODE_SIZE);
}
#endif
159
160	static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
161	struct folio foliop, enum** sgp_type sgp, gfp_t gfp,
162	struct vm_area_struct vma, vm_fault_t fault_type);
163
164	static inline struct shmem_sb_info SHMEM_SB(struct* super_block *sb)
165	{
166	return sb->s_fs_info;
167	}
168
169	/*
170	* shmem_file_setup pre-accounts the whole fixed size of a VM object,
171	* for shared memory and for shared anonymous (/dev/zero) mappings
172	* (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
173	* consistent with the pre-accounting of private mappings ...
174	*/
175	static inline int shmem_acct_size(unsigned long flags, loff_t size)
176	{
177	return (flags & SHMEM_F_NORESERVE) ?
178	`0` : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
179	}
180
181	static inline void shmem_unacct_size(unsigned long flags, loff_t size)
182	{
183	if (!(flags & SHMEM_F_NORESERVE))
184	vm_unacct_memory(VM_ACCT(size));
185	}
186
187	static inline int shmem_reacct_size(unsigned long flags,
188	loff_t oldsize, loff_t newsize)
189	{
190	if (!(flags & SHMEM_F_NORESERVE)) {
191	if (VM_ACCT(newsize) > VM_ACCT(oldsize))
192	return security_vm_enough_memory_mm(current->mm,
193	VM_ACCT(newsize) - VM_ACCT(oldsize));
194	else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
195	vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
196	}
197	return `0`;
198	}
199
200	/*
201	* ... whereas tmpfs objects are accounted incrementally as
202	* pages are allocated, in order to allow large sparse files.
203	* shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
204	* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
205	*/
206	static inline int shmem_acct_blocks(unsigned long flags, long pages)
207	{
208	if (!(flags & SHMEM_F_NORESERVE))
209	return `0`;
210
211	return security_vm_enough_memory_mm(current->mm,
212	pages: pages * VM_ACCT(PAGE_SIZE));
213	}
214
215	static inline void shmem_unacct_blocks(unsigned long flags, long pages)
216	{
217	if (flags & SHMEM_F_NORESERVE)
218	vm_unacct_memory(pages: pages * VM_ACCT(PAGE_SIZE));
219	}
220
221	int shmem_inode_acct_blocks(struct inode inode, long* pages)
222	{
223	struct shmem_inode_info *info = SHMEM_I(inode);
224	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
225	int err = -ENOSPC;
226
227	if (shmem_acct_blocks(flags: info->flags, pages))
228	return err;
229
230	might_sleep(); / when quotas /
231	if (sbinfo->max_blocks) {
232	if (!percpu_counter_limited_add(fbc: &sbinfo->used_blocks,
233	limit: sbinfo->max_blocks, amount: pages))
234	goto unacct;
235
236	err = dquot_alloc_block_nodirty(inode, nr: pages);
237	if (err) {
238	percpu_counter_sub(fbc: &sbinfo->used_blocks, amount: pages);
239	goto unacct;
240	}
241	} else {
242	err = dquot_alloc_block_nodirty(inode, nr: pages);
243	if (err)
244	goto unacct;
245	}
246
247	return `0`;
248
249	unacct:
250	shmem_unacct_blocks(flags: info->flags, pages);
251	return err;
252	}
253
254	static void shmem_inode_unacct_blocks(struct inode inode, long* pages)
255	{
256	struct shmem_inode_info *info = SHMEM_I(inode);
257	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
258
259	might_sleep(); / when quotas /
260	dquot_free_block_nodirty(inode, nr: pages);
261
262	if (sbinfo->max_blocks)
263	percpu_counter_sub(fbc: &sbinfo->used_blocks, amount: pages);
264	shmem_unacct_blocks(flags: info->flags, pages);
265	}
266
267	static const struct super_operations shmem_ops;
268	static const struct address_space_operations shmem_aops;
269	static const struct file_operations shmem_file_operations;
270	static const struct inode_operations shmem_inode_operations;
271	static const struct inode_operations shmem_dir_inode_operations;
272	static const struct inode_operations shmem_special_inode_operations;
273	static const struct vm_operations_struct shmem_vm_ops;
274	static const struct vm_operations_struct shmem_anon_vm_ops;
275	static struct file_system_type shmem_fs_type;
276
277	bool shmem_mapping(const struct address_space *mapping)
278	{
279	return mapping->a_ops == &shmem_aops;
280	}
281	EXPORT_SYMBOL_GPL(shmem_mapping);
282
283	bool vma_is_anon_shmem(const struct vm_area_struct *vma)
284	{
285	return vma->vm_ops == &shmem_anon_vm_ops;
286	}
287
288	bool vma_is_shmem(const struct vm_area_struct *vma)
289	{
290	return vma_is_anon_shmem(vma) \|\| vma->vm_ops == &shmem_vm_ops;
291	}
292
/*
 * File-wide list and the spinlock declared alongside it.
 * NOTE(review): presumably the list of inodes with swapped-out pages,
 * protected by shmem_swaplist_lock - confirm against the users below.
 */
static LIST_HEAD(shmem_swaplist);
static DEFINE_SPINLOCK(shmem_swaplist_lock);
295
296	#ifdef CONFIG_TMPFS_QUOTA
297
298	static int shmem_enable_quotas(struct super_block *sb,
299	unsigned short quota_types)
300	{
301	int type, err = `0`;
302
303	sb_dqopt(sb)->flags \|= DQUOT_QUOTA_SYS_FILE \| DQUOT_NOLIST_DIRTY;
304	for (type = `0`; type < SHMEM_MAXQUOTAS; type++) {
305	if (!(quota_types & (`1` << type)))
306	continue;
307	err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
308	DQUOT_USAGE_ENABLED \|
309	DQUOT_LIMITS_ENABLED);
310	if (err)
311	goto out_err;
312	}
313	return `0`;
314
315	out_err:
316	pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
317	type, err);
318	for (type--; type >= `0`; type--)
319	dquot_quota_off(sb, type);
320	return err;
321	}
322
323	static void shmem_disable_quotas(struct super_block *sb)
324	{
325	int type;
326
327	for (type = `0`; type < SHMEM_MAXQUOTAS; type++)
328	dquot_quota_off(sb, type);
329	}
330
331	static struct dquot __rcu shmem_get_dquots(struct** inode *inode)
332	{
333	return SHMEM_I(inode)->i_dquot;
334	}
335	#endif /* CONFIG_TMPFS_QUOTA */
336
337	/*
338	* shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
339	* produces a novel ino for the newly allocated inode.
340	*
341	* It may also be called when making a hard link to permit the space needed by
342	* each dentry. However, in that case, no new inode number is needed since that
343	* internally draws from another pool of inode numbers (currently global
344	* get_next_ino()). This case is indicated by passing NULL as inop.
345	*/
346	#define SHMEM_INO_BATCH 1024
347	static int shmem_reserve_inode(struct super_block sb, ino_t inop)
348	{
349	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
350	ino_t ino;
351
352	if (!(sb->s_flags & SB_KERNMOUNT)) {
353	raw_spin_lock(&sbinfo->stat_lock);
354	if (sbinfo->max_inodes) {
355	if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
356	raw_spin_unlock(&sbinfo->stat_lock);
357	return -ENOSPC;
358	}
359	sbinfo->free_ispace -= BOGO_INODE_SIZE;
360	}
361	if (inop) {
362	ino = sbinfo->next_ino++;
363	if (unlikely(is_zero_ino(ino)))
364	ino = sbinfo->next_ino++;
365	if (unlikely(!sbinfo->full_inums &&
366	ino > UINT_MAX)) {
367	/*
368	* Emulate get_next_ino uint wraparound for
369	* compatibility
370	*/
371	if (IS_ENABLED(CONFIG_64BIT))
372	pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
373	__func__, MINOR(sb->s_dev));
374	sbinfo->next_ino = `1`;
375	ino = sbinfo->next_ino++;
376	}
377	*inop = ino;
378	}
379	raw_spin_unlock(&sbinfo->stat_lock);
380	} else if (inop) {
381	/*
382	* __shmem_file_setup, one of our callers, is lock-free: it
383	* doesn't hold stat_lock in shmem_reserve_inode since
384	* max_inodes is always 0, and is called from potentially
385	* unknown contexts. As such, use a per-cpu batched allocator
386	* which doesn't require the per-sb stat_lock unless we are at
387	* the batch boundary.
388	*
389	* We don't need to worry about inode{32,64} since SB_KERNMOUNT
390	* shmem mounts are not exposed to userspace, so we don't need
391	* to worry about things like glibc compatibility.
392	*/
393	ino_t *next_ino;
394
395	next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
396	ino = *next_ino;
397	if (unlikely(ino % SHMEM_INO_BATCH == `0`)) {
398	raw_spin_lock(&sbinfo->stat_lock);
399	ino = sbinfo->next_ino;
400	sbinfo->next_ino += SHMEM_INO_BATCH;
401	raw_spin_unlock(&sbinfo->stat_lock);
402	if (unlikely(is_zero_ino(ino)))
403	ino++;
404	}
405	*inop = ino;
406	*next_ino = ++ino;
407	put_cpu();
408	}
409
410	return `0`;
411	}
412
413	static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
414	{
415	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
416	if (sbinfo->max_inodes) {
417	raw_spin_lock(&sbinfo->stat_lock);
418	sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
419	raw_spin_unlock(&sbinfo->stat_lock);
420	}
421	}
422
423	/**
424	* shmem_recalc_inode - recalculate the block usage of an inode
425	* @inode: inode to recalc
426	* @alloced: the change in number of pages allocated to inode
427	* @swapped: the change in number of pages swapped from inode
428	*
429	* We have to calculate the free blocks since the mm can drop
430	* undirtied hole pages behind our back.
431	*
432	* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
433	* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
434	*
435	* Return: true if swapped was incremented from 0, for shmem_writeout().
436	*/
437	bool shmem_recalc_inode(struct inode inode, long* alloced, long swapped)
438	{
439	struct shmem_inode_info *info = SHMEM_I(inode);
440	bool first_swapped = false;
441	long freed;
442
443	spin_lock(lock: &info->lock);
444	info->alloced += alloced;
445	info->swapped += swapped;
446	freed = info->alloced - info->swapped -
447	READ_ONCE(inode->i_mapping->nrpages);
448	/*
449	* Special case: whereas normally shmem_recalc_inode() is called
450	* after i_mapping->nrpages has already been adjusted (up or down),
451	* shmem_writeout() has to raise swapped before nrpages is lowered -
452	* to stop a racing shmem_recalc_inode() from thinking that a page has
453	* been freed. Compensate here, to avoid the need for a followup call.
454	*/
455	if (swapped > `0`) {
456	if (info->swapped == swapped)
457	first_swapped = true;
458	freed += swapped;
459	}
460	if (freed > `0`)
461	info->alloced -= freed;
462	spin_unlock(lock: &info->lock);
463
464	/ The quota case may block /
465	if (freed > `0`)
466	shmem_inode_unacct_blocks(inode, pages: freed);
467	return first_swapped;
468	}
469
470	bool shmem_charge(struct inode inode, long* pages)
471	{
472	struct address_space *mapping = inode->i_mapping;
473
474	if (shmem_inode_acct_blocks(inode, pages))
475	return false;
476
477	/ nrpages adjustment first, then shmem_recalc_inode() when balanced /
478	xa_lock_irq(&mapping->i_pages);
479	mapping->nrpages += pages;
480	xa_unlock_irq(&mapping->i_pages);
481
482	shmem_recalc_inode(inode, alloced: pages, swapped: `0`);
483	return true;
484	}
485
/*
 * Counterpart of shmem_charge(): recalculates the inode's block usage so
 * that pages no longer in the mapping are unaccounted.
 */
void shmem_uncharge(struct inode *inode, long pages)
{
	/* pages argument is currently unused: keep it to help debugging */
	/* nrpages adjustment done by __filemap_remove_folio() or caller */

	shmem_recalc_inode(inode, 0, 0);
}
493
494	/*
495	* Replace item expected in xarray by a new item, while holding xa_lock.
496	*/
497	static int shmem_replace_entry(struct address_space *mapping,
498	pgoff_t index, void expected, void* *replacement)
499	{
500	XA_STATE(xas, &mapping->i_pages, index);
501	void *item;
502
503	VM_BUG_ON(!expected);
504	VM_BUG_ON(!replacement);
505	item = xas_load(&xas);
506	if (item != expected)
507	return -ENOENT;
508	xas_store(&xas, entry: replacement);
509	return `0`;
510	}
511
512	/*
513	* Sometimes, before we decide whether to proceed or to fail, we must check
514	* that an entry was not already brought back or split by a racing thread.
515	*
516	* Checking folio is not enough: by the time a swapcache folio is locked, it
517	* might be reused, and again be swapcache, using the same swap as before.
518	* Returns the swap entry's order if it still presents, else returns -1.
519	*/
520	static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
521	swp_entry_t swap)
522	{
523	XA_STATE(xas, &mapping->i_pages, index);
524	int ret = -`1`;
525	void *entry;
526
527	rcu_read_lock();
528	do {
529	entry = xas_load(&xas);
530	if (entry == swp_to_radix_entry(entry: swap))
531	ret = xas_get_order(xas: &xas);
532	} while (xas_retry(xas: &xas, entry));
533	rcu_read_unlock();
534	return ret;
535	}
536
/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3
555
/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)
568
569	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
570	/ ifdef here to avoid bloating shmem.o when not necessary /
571
572	#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER)
573	#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
574	#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS)
575	#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
576	#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE)
577	#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
578	#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE)
579	#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE
580	#else
581	#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
582	#endif
583
584	static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT;
585
586	#undef SHMEM_HUGE_DEFAULT
587
588	#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER)
589	#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
590	#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS)
591	#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
592	#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE)
593	#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
594	#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE)
595	#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE
596	#else
597	#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
598	#endif
599
600	static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT;
601
602	#undef TMPFS_HUGE_DEFAULT
603
604	static unsigned int shmem_get_orders_within_size(struct inode *inode,
605	unsigned long within_size_orders, pgoff_t index,
606	loff_t write_end)
607	{
608	pgoff_t aligned_index;
609	unsigned long order;
610	loff_t i_size;
611
612	order = highest_order(orders: within_size_orders);
613	while (within_size_orders) {
614	aligned_index = round_up(index + `1`, `1` << order);
615	i_size = max(write_end, i_size_read(inode));
616	i_size = round_up(i_size, PAGE_SIZE);
617	if (i_size >> PAGE_SHIFT >= aligned_index)
618	return within_size_orders;
619
620	order = next_order(orders: &within_size_orders, prev: order);
621	}
622
623	return `0`;
624	}
625
626	static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
627	loff_t write_end, bool shmem_huge_force,
628	struct vm_area_struct *vma,
629	vm_flags_t vm_flags)
630	{
631	unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
632	`0` : BIT(HPAGE_PMD_ORDER);
633	unsigned long within_size_orders;
634
635	if (!S_ISREG(inode->i_mode))
636	return `0`;
637	if (shmem_huge == SHMEM_HUGE_DENY)
638	return `0`;
639	if (shmem_huge_force \|\| shmem_huge == SHMEM_HUGE_FORCE)
640	return maybe_pmd_order;
641
642	/*
643	* The huge order allocation for anon shmem is controlled through
644	* the mTHP interface, so we still use PMD-sized huge order to
645	* check whether global control is enabled.
646	*
647	* For tmpfs with 'huge=always' or 'huge=within_size' mount option,
648	* we will always try PMD-sized order first. If that failed, it will
649	* fall back to small large folios.
650	*/
651	switch (SHMEM_SB(sb: inode->i_sb)->huge) {
652	case SHMEM_HUGE_ALWAYS:
653	return THP_ORDERS_ALL_FILE_DEFAULT;
654	case SHMEM_HUGE_WITHIN_SIZE:
655	within_size_orders = shmem_get_orders_within_size(inode,
656	THP_ORDERS_ALL_FILE_DEFAULT, index, write_end);
657	if (within_size_orders > `0`)
658	return within_size_orders;
659
660	fallthrough;
661	case SHMEM_HUGE_ADVISE:
662	if (vm_flags & VM_HUGEPAGE)
663	return THP_ORDERS_ALL_FILE_DEFAULT;
664	fallthrough;
665	default:
666	return `0`;
667	}
668	}
669
670	static int shmem_parse_huge(const char *str)
671	{
672	int huge;
673
674	if (!str)
675	return -EINVAL;
676
677	if (!strcmp(str, "never"))
678	huge = SHMEM_HUGE_NEVER;
679	else if (!strcmp(str, "always"))
680	huge = SHMEM_HUGE_ALWAYS;
681	else if (!strcmp(str, "within_size"))
682	huge = SHMEM_HUGE_WITHIN_SIZE;
683	else if (!strcmp(str, "advise"))
684	huge = SHMEM_HUGE_ADVISE;
685	else if (!strcmp(str, "deny"))
686	huge = SHMEM_HUGE_DENY;
687	else if (!strcmp(str, "force"))
688	huge = SHMEM_HUGE_FORCE;
689	else
690	return -EINVAL;
691
692	if (!has_transparent_hugepage() &&
693	huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
694	return -EINVAL;
695
696	/ Do not override huge allocation policy with non-PMD sized mTHP /
697	if (huge == SHMEM_HUGE_FORCE &&
698	huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
699	return -EINVAL;
700
701	return huge;
702	}
703
#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
/*
 * Inverse of shmem_parse_huge(): return the name of a SHMEM_HUGE_* value.
 * An unknown value triggers VM_BUG_ON() and yields "bad_val".
 */
static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif
726
727	static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
728	struct shrink_control sc, unsigned* long nr_to_free)
729	{
730	LIST_HEAD(list), pos, next;
731	struct inode *inode;
732	struct shmem_inode_info *info;
733	struct folio *folio;
734	unsigned long batch = sc ? sc->nr_to_scan : `128`;
735	unsigned long split = `0`, freed = `0`;
736
737	if (list_empty(head: &sbinfo->shrinklist))
738	return SHRINK_STOP;
739
740	spin_lock(lock: &sbinfo->shrinklist_lock);
741	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
742	info = list_entry(pos, struct shmem_inode_info, shrinklist);
743
744	/ pin the inode /
745	inode = igrab(&info->vfs_inode);
746
747	/ inode is about to be evicted /
748	if (!inode) {
749	list_del_init(entry: &info->shrinklist);
750	goto next;
751	}
752
753	list_move(list: &info->shrinklist, head: &list);
754	next:
755	sbinfo->shrinklist_len--;
756	if (!--batch)
757	break;
758	}
759	spin_unlock(lock: &sbinfo->shrinklist_lock);
760
761	list_for_each_safe(pos, next, &list) {
762	pgoff_t next, end;
763	loff_t i_size;
764	int ret;
765
766	info = list_entry(pos, struct shmem_inode_info, shrinklist);
767	inode = &info->vfs_inode;
768
769	if (nr_to_free && freed >= nr_to_free)
770	goto move_back;
771
772	i_size = i_size_read(inode);
773	folio = filemap_get_entry(mapping: inode->i_mapping, index: i_size / PAGE_SIZE);
774	if (!folio \|\| xa_is_value(entry: folio))
775	goto drop;
776
777	/ No large folio at the end of the file: nothing to split /
778	if (!folio_test_large(folio)) {
779	folio_put(folio);
780	goto drop;
781	}
782
783	/ Check if there is anything to gain from splitting /
784	next = folio_next_index(folio);
785	end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
786	if (end <= folio->index \|\| end >= next) {
787	folio_put(folio);
788	goto drop;
789	}
790
791	/*
792	* Move the inode on the list back to shrinklist if we failed
793	* to lock the page at this time.
794	*
795	* Waiting for the lock may lead to deadlock in the
796	* reclaim path.
797	*/
798	if (!folio_trylock(folio)) {
799	folio_put(folio);
800	goto move_back;
801	}
802
803	ret = split_folio(folio);
804	folio_unlock(folio);
805	folio_put(folio);
806
807	/ If split failed move the inode on the list back to shrinklist /
808	if (ret)
809	goto move_back;
810
811	freed += next - end;
812	split++;
813	drop:
814	list_del_init(entry: &info->shrinklist);
815	goto put;
816	move_back:
817	/*
818	* Make sure the inode is either on the global list or deleted
819	* from any local list before iput() since it could be deleted
820	* in another thread once we put the inode (then the local list
821	* is corrupted).
822	*/
823	spin_lock(lock: &sbinfo->shrinklist_lock);
824	list_move(list: &info->shrinklist, head: &sbinfo->shrinklist);
825	sbinfo->shrinklist_len++;
826	spin_unlock(lock: &sbinfo->shrinklist_lock);
827	put:
828	iput(inode);
829	}
830
831	return split;
832	}
833
834	static long shmem_unused_huge_scan(struct super_block *sb,
835	struct shrink_control *sc)
836	{
837	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
838
839	if (!READ_ONCE(sbinfo->shrinklist_len))
840	return SHRINK_STOP;
841
842	return shmem_unused_huge_shrink(sbinfo, sc, nr_to_free: `0`);
843	}
844
845	static long shmem_unused_huge_count(struct super_block *sb,
846	struct shrink_control *sc)
847	{
848	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
849	return READ_ONCE(sbinfo->shrinklist_len);
850	}
851	#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
852
853	#define shmem_huge SHMEM_HUGE_DENY
854
855	static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
856	struct shrink_control sc, unsigned* long nr_to_free)
857	{
858	return `0`;
859	}
860
861	static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
862	loff_t write_end, bool shmem_huge_force,
863	struct vm_area_struct *vma,
864	vm_flags_t vm_flags)
865	{
866	return `0`;
867	}
868	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
869
870	static void shmem_update_stats(struct folio folio, int* nr_pages)
871	{
872	if (folio_test_pmd_mappable(folio))
873	lruvec_stat_mod_folio(folio, idx: NR_SHMEM_THPS, val: nr_pages);
874	lruvec_stat_mod_folio(folio, idx: NR_FILE_PAGES, val: nr_pages);
875	lruvec_stat_mod_folio(folio, idx: NR_SHMEM, val: nr_pages);
876	}
877
878	/*
879	* Somewhat like filemap_add_folio, but error if expected item has gone.
880	*/
881	int shmem_add_to_page_cache(struct folio *folio,
882	struct address_space *mapping,
883	pgoff_t index, void *expected, gfp_t gfp)
884	{
885	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
886	unsigned long nr = folio_nr_pages(folio);
887	swp_entry_t iter, swap;
888	void *entry;
889
890	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
891	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
892	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
893
894	folio_ref_add(folio, nr);
895	folio->mapping = mapping;
896	folio->index = index;
897
898	gfp &= GFP_RECLAIM_MASK;
899	folio_throttle_swaprate(folio, gfp);
900	swap = radix_to_swp_entry(arg: expected);
901
902	do {
903	iter = swap;
904	xas_lock_irq(&xas);
905	xas_for_each_conflict(&xas, entry) {
906	/*
907	* The range must either be empty, or filled with
908	* expected swap entries. Shmem swap entries are never
909	* partially freed without split of both entry and
910	* folio, so there shouldn't be any holes.
911	*/
912	if (!expected \|\| entry != swp_to_radix_entry(entry: iter)) {
913	xas_set_err(xas: &xas, err: -EEXIST);
914	goto unlock;
915	}
916	iter.val += `1` << xas_get_order(xas: &xas);
917	}
918	if (expected && iter.val - nr != swap.val) {
919	xas_set_err(xas: &xas, err: -EEXIST);
920	goto unlock;
921	}
922	xas_store(&xas, entry: folio);
923	if (xas_error(xas: &xas))
924	goto unlock;
925	shmem_update_stats(folio, nr_pages: nr);
926	mapping->nrpages += nr;
927	unlock:
928	xas_unlock_irq(&xas);
929	} while (xas_nomem(&xas, gfp));
930
931	if (xas_error(xas: &xas)) {
932	folio->mapping = NULL;
933	folio_ref_sub(folio, nr);
934	return xas_error(xas: &xas);
935	}
936
937	return `0`;
938	}
939
940	/*
941	* Somewhat like filemap_remove_folio, but substitutes swap for @folio.
942	*/
943	static void shmem_delete_from_page_cache(struct folio folio, void* *radswap)
944	{
945	struct address_space *mapping = folio->mapping;
946	long nr = folio_nr_pages(folio);
947	int error;
948
949	xa_lock_irq(&mapping->i_pages);
950	error = shmem_replace_entry(mapping, index: folio->index, expected: folio, replacement: radswap);
951	folio->mapping = NULL;
952	mapping->nrpages -= nr;
953	shmem_update_stats(folio, nr_pages: -nr);
954	xa_unlock_irq(&mapping->i_pages);
955	folio_put_refs(folio, refs: nr);
956	BUG_ON(error);
957	}
958
959	/*
960	* Remove swap entry from page cache, free the swap and its page cache. Returns
961	* the number of pages being freed. 0 means entry not found in XArray (0 pages
962	* being freed).
963	*/
964	static long shmem_free_swap(struct address_space *mapping,
965	pgoff_t index, pgoff_t end, void *radswap)
966	{
967	XA_STATE(xas, &mapping->i_pages, index);
968	unsigned int nr_pages = `0`;
969	pgoff_t base;
970	void *entry;
971
972	xas_lock_irq(&xas);
973	entry = xas_load(&xas);
974	if (entry == radswap) {
975	nr_pages = `1` << xas_get_order(xas: &xas);
976	base = round_down(xas.xa_index, nr_pages);
977	if (base < index \|\| base + nr_pages - `1` > end)
978	nr_pages = `0`;
979	else
980	xas_store(&xas, NULL);
981	}
982	xas_unlock_irq(&xas);
983
984	if (nr_pages)
985	free_swap_and_cache_nr(entry: radix_to_swp_entry(arg: radswap), nr: nr_pages);
986
987	return nr_pages;
988	}
989
990	/*
991	* Determine (in bytes) how many of the shmem object's pages mapped by the
992	* given offsets are swapped out.
993	*
994	* This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
995	* as long as the inode doesn't go away and racy results are not a problem.
996	*/
997	unsigned long shmem_partial_swap_usage(struct address_space *mapping,
998	pgoff_t start, pgoff_t end)
999	{
1000	XA_STATE(xas, &mapping->i_pages, start);
1001	struct folio *folio;
1002	unsigned long swapped = `0`;
1003	unsigned long max = end - `1`;
1004
1005	rcu_read_lock();
1006	xas_for_each(&xas, folio, max) {
1007	if (xas_retry(xas: &xas, entry: folio))
1008	continue;
1009	if (xa_is_value(entry: folio))
1010	swapped += `1` << xas_get_order(xas: &xas);
1011	if (xas.xa_index == max)
1012	break;
1013	if (need_resched()) {
1014	xas_pause(&xas);
1015	cond_resched_rcu();
1016	}
1017	}
1018	rcu_read_unlock();
1019
1020	return swapped << PAGE_SHIFT;
1021	}
1022
1023	/*
1024	* Determine (in bytes) how many of the shmem object's pages mapped by the
1025	* given vma is swapped out.
1026	*
1027	* This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
1028	* as long as the inode doesn't go away and racy results are not a problem.
1029	*/
1030	unsigned long shmem_swap_usage(struct vm_area_struct *vma)
1031	{
1032	struct inode *inode = file_inode(f: vma->vm_file);
1033	struct shmem_inode_info *info = SHMEM_I(inode);
1034	struct address_space *mapping = inode->i_mapping;
1035	unsigned long swapped;
1036
1037	/ Be careful as we don't hold info->lock /
1038	swapped = READ_ONCE(info->swapped);
1039
1040	/*
1041	* The easier cases are when the shmem object has nothing in swap, or
1042	* the vma maps it whole. Then we can simply use the stats that we
1043	* already track.
1044	*/
1045	if (!swapped)
1046	return `0`;
1047
1048	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
1049	return swapped << PAGE_SHIFT;
1050
1051	/ Here comes the more involved part /
1052	return shmem_partial_swap_usage(mapping, start: vma->vm_pgoff,
1053	end: vma->vm_pgoff + vma_pages(vma));
1054	}
1055
1056	/*
1057	* SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
1058	*/
1059	void shmem_unlock_mapping(struct address_space *mapping)
1060	{
1061	struct folio_batch fbatch;
1062	pgoff_t index = `0`;
1063
1064	folio_batch_init(fbatch: &fbatch);
1065	/*
1066	* Minor point, but we might as well stop if someone else SHM_LOCKs it.
1067	*/
1068	while (!mapping_unevictable(mapping) &&
1069	filemap_get_folios(mapping, start: &index, end: ~`0UL`, fbatch: &fbatch)) {
1070	check_move_unevictable_folios(fbatch: &fbatch);
1071	folio_batch_release(fbatch: &fbatch);
1072	cond_resched();
1073	}
1074	}
1075
1076	static struct folio shmem_get_partial_folio(struct* inode *inode, pgoff_t index)
1077	{
1078	struct folio *folio;
1079
1080	/*
1081	* At first avoid shmem_get_folio(,,,SGP_READ): that fails
1082	* beyond i_size, and reports fallocated folios as holes.
1083	*/
1084	folio = filemap_get_entry(mapping: inode->i_mapping, index);
1085	if (!folio)
1086	return folio;
1087	if (!xa_is_value(entry: folio)) {
1088	folio_lock(folio);
1089	if (folio->mapping == inode->i_mapping)
1090	return folio;
1091	/ The folio has been swapped out /
1092	folio_unlock(folio);
1093	folio_put(folio);
1094	}
1095	/*
1096	* But read a folio back from swap if any of it is within i_size
1097	* (although in some cases this is just a waste of time).
1098	*/
1099	folio = NULL;
1100	shmem_get_folio(inode, index, write_end: `0`, foliop: &folio, sgp: SGP_READ);
1101	return folio;
1102	}
1103
1104	/*
1105	* Remove range of pages and swap entries from page cache, and free them.
1106	* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
1107	*/
1108	static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend,
1109	bool unfalloc)
1110	{
1111	struct address_space *mapping = inode->i_mapping;
1112	struct shmem_inode_info *info = SHMEM_I(inode);
1113	pgoff_t start = (lstart + PAGE_SIZE - `1`) >> PAGE_SHIFT;
1114	pgoff_t end = (lend + `1`) >> PAGE_SHIFT;
1115	struct folio_batch fbatch;
1116	pgoff_t indices[PAGEVEC_SIZE];
1117	struct folio *folio;
1118	bool same_folio;
1119	long nr_swaps_freed = `0`;
1120	pgoff_t index;
1121	int i;
1122
1123	if (lend == -`1`)
1124	end = -`1`; / unsigned, so actually very big /
1125
1126	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
1127	info->fallocend = start;
1128
1129	folio_batch_init(fbatch: &fbatch);
1130	index = start;
1131	while (index < end && find_lock_entries(mapping, start: &index, end: end - `1`,
1132	fbatch: &fbatch, indices)) {
1133	for (i = `0`; i < folio_batch_count(fbatch: &fbatch); i++) {
1134	folio = fbatch.folios[i];
1135
1136	if (xa_is_value(entry: folio)) {
1137	if (unfalloc)
1138	continue;
1139	nr_swaps_freed += shmem_free_swap(mapping, index: indices[i],
1140	end: end - `1`, radswap: folio);
1141	continue;
1142	}
1143
1144	if (!unfalloc \|\| !folio_test_uptodate(folio))
1145	truncate_inode_folio(mapping, folio);
1146	folio_unlock(folio);
1147	}
1148	folio_batch_remove_exceptionals(fbatch: &fbatch);
1149	folio_batch_release(fbatch: &fbatch);
1150	cond_resched();
1151	}
1152
1153	/*
1154	* When undoing a failed fallocate, we want none of the partial folio
1155	* zeroing and splitting below, but shall want to truncate the whole
1156	* folio when !uptodate indicates that it was added by this fallocate,
1157	* even when [lstart, lend] covers only a part of the folio.
1158	*/
1159	if (unfalloc)
1160	goto whole_folios;
1161
1162	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
1163	folio = shmem_get_partial_folio(inode, index: lstart >> PAGE_SHIFT);
1164	if (folio) {
1165	same_folio = lend < folio_next_pos(folio);
1166	folio_mark_dirty(folio);
1167	if (!truncate_inode_partial_folio(folio, start: lstart, end: lend)) {
1168	start = folio_next_index(folio);
1169	if (same_folio)
1170	end = folio->index;
1171	}
1172	folio_unlock(folio);
1173	folio_put(folio);
1174	folio = NULL;
1175	}
1176
1177	if (!same_folio)
1178	folio = shmem_get_partial_folio(inode, index: lend >> PAGE_SHIFT);
1179	if (folio) {
1180	folio_mark_dirty(folio);
1181	if (!truncate_inode_partial_folio(folio, start: lstart, end: lend))
1182	end = folio->index;
1183	folio_unlock(folio);
1184	folio_put(folio);
1185	}
1186
1187	whole_folios:
1188
1189	index = start;
1190	while (index < end) {
1191	cond_resched();
1192
1193	if (!find_get_entries(mapping, start: &index, end: end - `1`, fbatch: &fbatch,
1194	indices)) {
1195	/ If all gone or hole-punch or unfalloc, we're done /
1196	if (index == start \|\| end != -`1`)
1197	break;
1198	/ But if truncating, restart to make sure all gone /
1199	index = start;
1200	continue;
1201	}
1202	for (i = `0`; i < folio_batch_count(fbatch: &fbatch); i++) {
1203	folio = fbatch.folios[i];
1204
1205	if (xa_is_value(entry: folio)) {
1206	int order;
1207	long swaps_freed;
1208
1209	if (unfalloc)
1210	continue;
1211	swaps_freed = shmem_free_swap(mapping, index: indices[i],
1212	end: end - `1`, radswap: folio);
1213	if (!swaps_freed) {
1214	pgoff_t base = indices[i];
1215
1216	order = shmem_confirm_swap(mapping, index: indices[i],
1217	swap: radix_to_swp_entry(arg: folio));
1218	/*
1219	* If found a large swap entry cross the end or start
1220	* border, skip it as the truncate_inode_partial_folio
1221	* above should have at least zerod its content once.
1222	*/
1223	if (order > `0`) {
1224	base = round_down(base, `1` << order);
1225	if (base < start \|\| base + (`1` << order) > end)
1226	continue;
1227	}
1228	/ Swap was replaced by page or extended, retry /
1229	index = base;
1230	break;
1231	}
1232	nr_swaps_freed += swaps_freed;
1233	continue;
1234	}
1235
1236	folio_lock(folio);
1237
1238	if (!unfalloc \|\| !folio_test_uptodate(folio)) {
1239	if (folio_mapping(folio) != mapping) {
1240	/ Page was replaced by swap: retry /
1241	folio_unlock(folio);
1242	index = indices[i];
1243	break;
1244	}
1245	VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1246	folio);
1247
1248	if (!folio_test_large(folio)) {
1249	truncate_inode_folio(mapping, folio);
1250	} else if (truncate_inode_partial_folio(folio, start: lstart, end: lend)) {
1251	/*
1252	* If we split a page, reset the loop so
1253	* that we pick up the new sub pages.
1254	* Otherwise the THP was entirely
1255	* dropped or the target range was
1256	* zeroed, so just continue the loop as
1257	* is.
1258	*/
1259	if (!folio_test_large(folio)) {
1260	folio_unlock(folio);
1261	index = start;
1262	break;
1263	}
1264	}
1265	}
1266	folio_unlock(folio);
1267	}
1268	folio_batch_remove_exceptionals(fbatch: &fbatch);
1269	folio_batch_release(fbatch: &fbatch);
1270	}
1271
1272	shmem_recalc_inode(inode, alloced: `0`, swapped: -nr_swaps_freed);
1273	}
1274
1275	void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
1276	{
1277	shmem_undo_range(inode, lstart, lend, unfalloc: false);
1278	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
1279	inode_inc_iversion(inode);
1280	}
1281	EXPORT_SYMBOL_GPL(shmem_truncate_range);
1282
1283	static int shmem_getattr(struct mnt_idmap *idmap,
1284	const struct path path, struct* kstat *stat,
1285	u32 request_mask, unsigned int query_flags)
1286	{
1287	struct inode *inode = path->dentry->d_inode;
1288	struct shmem_inode_info *info = SHMEM_I(inode);
1289
1290	if (info->alloced - info->swapped != inode->i_mapping->nrpages)
1291	shmem_recalc_inode(inode, alloced: `0`, swapped: `0`);
1292
1293	if (info->fsflags & FS_APPEND_FL)
1294	stat->attributes \|= STATX_ATTR_APPEND;
1295	if (info->fsflags & FS_IMMUTABLE_FL)
1296	stat->attributes \|= STATX_ATTR_IMMUTABLE;
1297	if (info->fsflags & FS_NODUMP_FL)
1298	stat->attributes \|= STATX_ATTR_NODUMP;
1299	stat->attributes_mask \|= (STATX_ATTR_APPEND \|
1300	STATX_ATTR_IMMUTABLE \|
1301	STATX_ATTR_NODUMP);
1302	generic_fillattr(idmap, request_mask, inode, stat);
1303
1304	if (shmem_huge_global_enabled(inode, index: `0`, write_end: `0`, shmem_huge_force: false, NULL, vm_flags: `0`))
1305	stat->blksize = HPAGE_PMD_SIZE;
1306
1307	if (request_mask & STATX_BTIME) {
1308	stat->result_mask \|= STATX_BTIME;
1309	stat->btime.tv_sec = info->i_crtime.tv_sec;
1310	stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1311	}
1312
1313	return `0`;
1314	}
1315
1316	static int shmem_setattr(struct mnt_idmap *idmap,
1317	struct dentry dentry, struct* iattr *attr)
1318	{
1319	struct inode *inode = d_inode(dentry);
1320	struct shmem_inode_info *info = SHMEM_I(inode);
1321	int error;
1322	bool update_mtime = false;
1323	bool update_ctime = true;
1324
1325	error = setattr_prepare(idmap, dentry, attr);
1326	if (error)
1327	return error;
1328
1329	if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1330	if ((inode->i_mode ^ attr->ia_mode) & `0111`) {
1331	return -EPERM;
1332	}
1333	}
1334
1335	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1336	loff_t oldsize = inode->i_size;
1337	loff_t newsize = attr->ia_size;
1338
1339	/ protected by i_rwsem /
1340	if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) \|\|
1341	(newsize > oldsize && (info->seals & F_SEAL_GROW)))
1342	return -EPERM;
1343
1344	if (newsize != oldsize) {
1345	if (info->flags & SHMEM_F_MAPPING_FROZEN)
1346	return -EPERM;
1347	error = shmem_reacct_size(flags: SHMEM_I(inode)->flags,
1348	oldsize, newsize);
1349	if (error)
1350	return error;
1351	i_size_write(inode, i_size: newsize);
1352	update_mtime = true;
1353	} else {
1354	update_ctime = false;
1355	}
1356	if (newsize <= oldsize) {
1357	loff_t holebegin = round_up(newsize, PAGE_SIZE);
1358	if (oldsize > holebegin)
1359	unmap_mapping_range(mapping: inode->i_mapping,
1360	holebegin, holelen: `0`, even_cows: `1`);
1361	if (info->alloced)
1362	shmem_truncate_range(inode,
1363	newsize, (loff_t)-`1`);
1364	/ unmap again to remove racily COWed private pages /
1365	if (oldsize > holebegin)
1366	unmap_mapping_range(mapping: inode->i_mapping,
1367	holebegin, holelen: `0`, even_cows: `1`);
1368	}
1369	}
1370
1371	if (is_quota_modification(idmap, inode, ia: attr)) {
1372	error = dquot_initialize(inode);
1373	if (error)
1374	return error;
1375	}
1376
1377	/ Transfer quota accounting /
1378	if (i_uid_needs_update(idmap, attr, inode) \|\|
1379	i_gid_needs_update(idmap, attr, inode)) {
1380	error = dquot_transfer(idmap, inode, iattr: attr);
1381	if (error)
1382	return error;
1383	}
1384
1385	setattr_copy(idmap, inode, attr);
1386	if (attr->ia_valid & ATTR_MODE)
1387	error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1388	if (!error && update_ctime) {
1389	inode_set_ctime_current(inode);
1390	if (update_mtime)
1391	inode_set_mtime_to_ts(inode, ts: inode_get_ctime(inode));
1392	inode_inc_iversion(inode);
1393	}
1394	return error;
1395	}
1396
1397	static void shmem_evict_inode(struct inode *inode)
1398	{
1399	struct shmem_inode_info *info = SHMEM_I(inode);
1400	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
1401	size_t freed = `0`;
1402
1403	if (shmem_mapping(inode->i_mapping)) {
1404	shmem_unacct_size(flags: info->flags, size: inode->i_size);
1405	inode->i_size = `0`;
1406	mapping_set_exiting(mapping: inode->i_mapping);
1407	shmem_truncate_range(inode, `0`, (loff_t)-`1`);
1408	if (!list_empty(head: &info->shrinklist)) {
1409	spin_lock(lock: &sbinfo->shrinklist_lock);
1410	if (!list_empty(head: &info->shrinklist)) {
1411	list_del_init(entry: &info->shrinklist);
1412	sbinfo->shrinklist_len--;
1413	}
1414	spin_unlock(lock: &sbinfo->shrinklist_lock);
1415	}
1416	while (!list_empty(head: &info->swaplist)) {
1417	/ Wait while shmem_unuse() is scanning this inode... /
1418	wait_var_event(&info->stop_eviction,
1419	!atomic_read(&info->stop_eviction));
1420	spin_lock(lock: &shmem_swaplist_lock);
1421	/ ...but beware of the race if we peeked too early /
1422	if (!atomic_read(v: &info->stop_eviction))
1423	list_del_init(entry: &info->swaplist);
1424	spin_unlock(lock: &shmem_swaplist_lock);
1425	}
1426	}
1427
1428	simple_xattrs_free(xattrs: &info->xattrs, freed_space: sbinfo->max_inodes ? &freed : NULL);
1429	shmem_free_inode(sb: inode->i_sb, freed_ispace: freed);
1430	WARN_ON(inode->i_blocks);
1431	clear_inode(inode);
1432	#ifdef CONFIG_TMPFS_QUOTA
1433	dquot_free_inode(inode);
1434	dquot_drop(inode);
1435	#endif
1436	}
1437
1438	static unsigned int shmem_find_swap_entries(struct address_space *mapping,
1439	pgoff_t start, struct folio_batch *fbatch,
1440	pgoff_t indices, unsigned* int type)
1441	{
1442	XA_STATE(xas, &mapping->i_pages, start);
1443	struct folio *folio;
1444	swp_entry_t entry;
1445
1446	rcu_read_lock();
1447	xas_for_each(&xas, folio, ULONG_MAX) {
1448	if (xas_retry(xas: &xas, entry: folio))
1449	continue;
1450
1451	if (!xa_is_value(entry: folio))
1452	continue;
1453
1454	entry = radix_to_swp_entry(arg: folio);
1455	/*
1456	* swapin error entries can be found in the mapping. But they're
1457	* deliberately ignored here as we've done everything we can do.
1458	*/
1459	if (swp_type(entry) != type)
1460	continue;
1461
1462	indices[folio_batch_count(fbatch)] = xas.xa_index;
1463	if (!folio_batch_add(fbatch, folio))
1464	break;
1465
1466	if (need_resched()) {
1467	xas_pause(&xas);
1468	cond_resched_rcu();
1469	}
1470	}
1471	rcu_read_unlock();
1472
1473	return folio_batch_count(fbatch);
1474	}
1475
1476	/*
1477	* Move the swapped pages for an inode to page cache. Returns the count
1478	* of pages swapped in, or the error in case of failure.
1479	*/
1480	static int shmem_unuse_swap_entries(struct inode *inode,
1481	struct folio_batch fbatch, pgoff_t indices)
1482	{
1483	int i = `0`;
1484	int ret = `0`;
1485	int error = `0`;
1486	struct address_space *mapping = inode->i_mapping;
1487
1488	for (i = `0`; i < folio_batch_count(fbatch); i++) {
1489	struct folio *folio = fbatch->folios[i];
1490
1491	error = shmem_swapin_folio(inode, index: indices[i], foliop: &folio, sgp: SGP_CACHE,
1492	gfp: mapping_gfp_mask(mapping), NULL, NULL);
1493	if (error == `0`) {
1494	folio_unlock(folio);
1495	folio_put(folio);
1496	ret++;
1497	}
1498	if (error == -ENOMEM)
1499	break;
1500	error = `0`;
1501	}
1502	return error ? error : ret;
1503	}
1504
1505	/*
1506	* If swap found in inode, free it and move page from swapcache to filecache.
1507	*/
1508	static int shmem_unuse_inode(struct inode inode, unsigned* int type)
1509	{
1510	struct address_space *mapping = inode->i_mapping;
1511	pgoff_t start = `0`;
1512	struct folio_batch fbatch;
1513	pgoff_t indices[PAGEVEC_SIZE];
1514	int ret = `0`;
1515
1516	do {
1517	folio_batch_init(fbatch: &fbatch);
1518	if (!shmem_find_swap_entries(mapping, start, fbatch: &fbatch,
1519	indices, type)) {
1520	ret = `0`;
1521	break;
1522	}
1523
1524	ret = shmem_unuse_swap_entries(inode, fbatch: &fbatch, indices);
1525	if (ret < `0`)
1526	break;
1527
1528	start = indices[folio_batch_count(fbatch: &fbatch) - `1`];
1529	} while (true);
1530
1531	return ret;
1532	}
1533
1534	/*
1535	* Read all the shared memory data that resides in the swap
1536	* device 'type' back into memory, so the swap device can be
1537	* unused.
1538	*/
1539	int shmem_unuse(unsigned int type)
1540	{
1541	struct shmem_inode_info info, next;
1542	int error = `0`;
1543
1544	if (list_empty(head: &shmem_swaplist))
1545	return `0`;
1546
1547	spin_lock(lock: &shmem_swaplist_lock);
1548	start_over:
1549	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1550	if (!info->swapped) {
1551	list_del_init(entry: &info->swaplist);
1552	continue;
1553	}
1554	/*
1555	* Drop the swaplist mutex while searching the inode for swap;
1556	* but before doing so, make sure shmem_evict_inode() will not
1557	* remove placeholder inode from swaplist, nor let it be freed
1558	* (igrab() would protect from unlink, but not from unmount).
1559	*/
1560	atomic_inc(v: &info->stop_eviction);
1561	spin_unlock(lock: &shmem_swaplist_lock);
1562
1563	error = shmem_unuse_inode(inode: &info->vfs_inode, type);
1564	cond_resched();
1565
1566	spin_lock(lock: &shmem_swaplist_lock);
1567	if (atomic_dec_and_test(v: &info->stop_eviction))
1568	wake_up_var(var: &info->stop_eviction);
1569	if (error)
1570	break;
1571	if (list_empty(head: &info->swaplist))
1572	goto start_over;
1573	next = list_next_entry(info, swaplist);
1574	if (!info->swapped)
1575	list_del_init(entry: &info->swaplist);
1576	}
1577	spin_unlock(lock: &shmem_swaplist_lock);
1578
1579	return error;
1580	}
1581
1582	/**
1583	* shmem_writeout - Write the folio to swap
1584	* @folio: The folio to write
1585	* @plug: swap plug
1586	* @folio_list: list to put back folios on split
1587	*
1588	* Move the folio from the page cache to the swap cache.
1589	*/
1590	int shmem_writeout(struct folio folio, struct* swap_iocb **plug,
1591	struct list_head *folio_list)
1592	{
1593	struct address_space *mapping = folio->mapping;
1594	struct inode *inode = mapping->host;
1595	struct shmem_inode_info *info = SHMEM_I(inode);
1596	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
1597	pgoff_t index;
1598	int nr_pages;
1599	bool split = false;
1600
1601	if ((info->flags & SHMEM_F_LOCKED) \|\| sbinfo->noswap)
1602	goto redirty;
1603
1604	if (!total_swap_pages)
1605	goto redirty;
1606
1607	/*
1608	* If CONFIG_THP_SWAP is not enabled, the large folio should be
1609	* split when swapping.
1610	*
1611	* And shrinkage of pages beyond i_size does not split swap, so
1612	* swapout of a large folio crossing i_size needs to split too
1613	* (unless fallocate has been used to preallocate beyond EOF).
1614	*/
1615	if (folio_test_large(folio)) {
1616	index = shmem_fallocend(inode,
1617	DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
1618	if ((index > folio->index && index < folio_next_index(folio)) \|\|
1619	!IS_ENABLED(CONFIG_THP_SWAP))
1620	split = true;
1621	}
1622
1623	if (split) {
1624	try_split:
1625	/ Ensure the subpages are still dirty /
1626	folio_test_set_dirty(folio);
1627	if (split_folio_to_list(folio, list: folio_list))
1628	goto redirty;
1629	folio_clear_dirty(folio);
1630	}
1631
1632	index = folio->index;
1633	nr_pages = folio_nr_pages(folio);
1634
1635	/*
1636	* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1637	* value into swapfile.c, the only way we can correctly account for a
1638	* fallocated folio arriving here is now to initialize it and write it.
1639	*
1640	* That's okay for a folio already fallocated earlier, but if we have
1641	* not yet completed the fallocation, then (a) we want to keep track
1642	* of this folio in case we have to undo it, and (b) it may not be a
1643	* good idea to continue anyway, once we're pushing into swap. So
1644	* reactivate the folio, and let shmem_fallocate() quit when too many.
1645	*/
1646	if (!folio_test_uptodate(folio)) {
1647	if (inode->i_private) {
1648	struct shmem_falloc *shmem_falloc;
1649	spin_lock(lock: &inode->i_lock);
1650	shmem_falloc = inode->i_private;
1651	if (shmem_falloc &&
1652	!shmem_falloc->waitq &&
1653	index >= shmem_falloc->start &&
1654	index < shmem_falloc->next)
1655	shmem_falloc->nr_unswapped += nr_pages;
1656	else
1657	shmem_falloc = NULL;
1658	spin_unlock(lock: &inode->i_lock);
1659	if (shmem_falloc)
1660	goto redirty;
1661	}
1662	folio_zero_range(folio, start: `0`, length: folio_size(folio));
1663	flush_dcache_folio(folio);
1664	folio_mark_uptodate(folio);
1665	}
1666
1667	if (!folio_alloc_swap(folio)) {
1668	bool first_swapped = shmem_recalc_inode(inode, alloced: `0`, swapped: nr_pages);
1669	int error;
1670
1671	/*
1672	* Add inode to shmem_unuse()'s list of swapped-out inodes,
1673	* if it's not already there. Do it now before the folio is
1674	* removed from page cache, when its pagelock no longer
1675	* protects the inode from eviction. And do it now, after
1676	* we've incremented swapped, because shmem_unuse() will
1677	* prune a !swapped inode from the swaplist.
1678	*/
1679	if (first_swapped) {
1680	spin_lock(lock: &shmem_swaplist_lock);
1681	if (list_empty(head: &info->swaplist))
1682	list_add(new: &info->swaplist, head: &shmem_swaplist);
1683	spin_unlock(lock: &shmem_swaplist_lock);
1684	}
1685
1686	swap_shmem_alloc(folio->swap, nr_pages);
1687	shmem_delete_from_page_cache(folio, radswap: swp_to_radix_entry(entry: folio->swap));
1688
1689	BUG_ON(folio_mapped(folio));
1690	error = swap_writeout(folio, swap_plug: plug);
1691	if (error != AOP_WRITEPAGE_ACTIVATE) {
1692	/ folio has been unlocked /
1693	return error;
1694	}
1695
1696	/*
1697	* The intention here is to avoid holding on to the swap when
1698	* zswap was unable to compress and unable to writeback; but
1699	* it will be appropriate if other reactivate cases are added.
1700	*/
1701	error = shmem_add_to_page_cache(folio, mapping, index,
1702	expected: swp_to_radix_entry(entry: folio->swap),
1703	__GFP_HIGH \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
1704	/ Swap entry might be erased by racing shmem_free_swap() /
1705	if (!error) {
1706	shmem_recalc_inode(inode, alloced: `0`, swapped: -nr_pages);
1707	swap_free_nr(entry: folio->swap, nr_pages);
1708	}
1709
1710	/*
1711	* The swap_cache_del_folio() below could be left for
1712	* shrink_folio_list()'s folio_free_swap() to dispose of;
1713	* but I'm a little nervous about letting this folio out of
1714	* shmem_writeout() in a hybrid half-tmpfs-half-swap state
1715	* e.g. folio_mapping(folio) might give an unexpected answer.
1716	*/
1717	swap_cache_del_folio(folio);
1718	goto redirty;
1719	}
1720	if (nr_pages > `1`)
1721	goto try_split;
1722	redirty:
1723	folio_mark_dirty(folio);
1724	return AOP_WRITEPAGE_ACTIVATE; / Return with folio locked /
1725	}
1726	EXPORT_SYMBOL_GPL(shmem_writeout);
1727
1728	#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1729	static void shmem_show_mpol(struct seq_file seq, struct* mempolicy *mpol)
1730	{
1731	char buffer[`64`];
1732
1733	if (!mpol \|\| mpol->mode == MPOL_DEFAULT)
1734	return; / show nothing /
1735
1736	mpol_to_str(buffer, maxlen: sizeof(buffer), pol: mpol);
1737
1738	seq_printf(m: seq, fmt: ",mpol=%s", buffer);
1739	}
1740
1741	static struct mempolicy shmem_get_sbmpol(struct* shmem_sb_info *sbinfo)
1742	{
1743	struct mempolicy *mpol = NULL;
1744	if (sbinfo->mpol) {
1745	raw_spin_lock(&sbinfo->stat_lock); / prevent replace/use races /
1746	mpol = sbinfo->mpol;
1747	mpol_get(pol: mpol);
1748	raw_spin_unlock(&sbinfo->stat_lock);
1749	}
1750	return mpol;
1751	}
1752	#else /* !CONFIG_NUMA \|\| !CONFIG_TMPFS */
1753	static inline void shmem_show_mpol(struct seq_file seq, struct* mempolicy *mpol)
1754	{
1755	}
1756	static inline struct mempolicy shmem_get_sbmpol(struct* shmem_sb_info *sbinfo)
1757	{
1758	return NULL;
1759	}
1760	#endif /* CONFIG_NUMA && CONFIG_TMPFS */
1761
1762	static struct mempolicy shmem_get_pgoff_policy(struct* shmem_inode_info *info,
1763	pgoff_t index, unsigned int order, pgoff_t *ilx);
1764
1765	static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
1766	struct shmem_inode_info *info, pgoff_t index)
1767	{
1768	struct mempolicy *mpol;
1769	pgoff_t ilx;
1770	struct folio *folio;
1771
1772	mpol = shmem_get_pgoff_policy(info, index, order: `0`, ilx: &ilx);
1773	folio = swap_cluster_readahead(entry: swap, flag: gfp, mpol, ilx);
1774	mpol_cond_put(pol: mpol);
1775
1776	return folio;
1777	}
1778
1779	/*
1780	* Make sure huge_gfp is always more limited than limit_gfp.
1781	* Some of the flags set permissions, while others set limitations.
1782	*/
1783	static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1784	{
1785	gfp_t allowflags = __GFP_IO \| __GFP_FS \| __GFP_RECLAIM;
1786	gfp_t denyflags = __GFP_NOWARN \| __GFP_NORETRY;
1787	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1788	gfp_t result = huge_gfp & ~(allowflags \| GFP_ZONEMASK);
1789
1790	/ Allow allocations only from the originally specified zones. /
1791	result \|= zoneflags;
1792
1793	/*
1794	* Minimize the result gfp by taking the union with the deny flags,
1795	* and the intersection of the allow flags.
1796	*/
1797	result \|= (limit_gfp & denyflags);
1798	result \|= (huge_gfp & limit_gfp) & allowflags;
1799
1800	return result;
1801	}
1802
1803	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1804	bool shmem_hpage_pmd_enabled(void)
1805	{
1806	if (shmem_huge == SHMEM_HUGE_DENY)
1807	return false;
1808	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
1809	return true;
1810	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
1811	return true;
1812	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
1813	return true;
1814	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
1815	shmem_huge != SHMEM_HUGE_NEVER)
1816	return true;
1817
1818	return false;
1819	}
1820
1821	unsigned long shmem_allowable_huge_orders(struct inode *inode,
1822	struct vm_area_struct *vma, pgoff_t index,
1823	loff_t write_end, bool shmem_huge_force)
1824	{
1825	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
1826	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
1827	vm_flags_t vm_flags = vma ? vma->vm_flags : `0`;
1828	unsigned int global_orders;
1829
1830	if (thp_disabled_by_hw() \|\| (vma && vma_thp_disabled(vma, vm_flags, forced_collapse: shmem_huge_force)))
1831	return `0`;
1832
1833	global_orders = shmem_huge_global_enabled(inode, index, write_end,
1834	shmem_huge_force, vma, vm_flags);
1835	/ Tmpfs huge pages allocation /
1836	if (!vma \|\| !vma_is_anon_shmem(vma))
1837	return global_orders;
1838
1839	/*
1840	* Following the 'deny' semantics of the top level, force the huge
1841	* option off from all mounts.
1842	*/
1843	if (shmem_huge == SHMEM_HUGE_DENY)
1844	return `0`;
1845
1846	/*
1847	* Only allow inherit orders if the top-level value is 'force', which
1848	* means non-PMD sized THP can not override 'huge' mount option now.
1849	*/
1850	if (shmem_huge == SHMEM_HUGE_FORCE)
1851	return READ_ONCE(huge_shmem_orders_inherit);
1852
1853	/ Allow mTHP that will be fully within i_size. /
1854	mask \|= shmem_get_orders_within_size(inode, within_size_orders, index, write_end: `0`);
1855
1856	if (vm_flags & VM_HUGEPAGE)
1857	mask \|= READ_ONCE(huge_shmem_orders_madvise);
1858
1859	if (global_orders > `0`)
1860	mask \|= READ_ONCE(huge_shmem_orders_inherit);
1861
1862	return THP_ORDERS_ALL_FILE_DEFAULT & mask;
1863	}
1864
1865	static unsigned long shmem_suitable_orders(struct inode inode, struct* vm_fault *vmf,
1866	struct address_space *mapping, pgoff_t index,
1867	unsigned long orders)
1868	{
1869	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
1870	pgoff_t aligned_index;
1871	unsigned long pages;
1872	int order;
1873
1874	if (vma) {
1875	orders = thp_vma_suitable_orders(vma, addr: vmf->address, orders);
1876	if (!orders)
1877	return `0`;
1878	}
1879
1880	/ Find the highest order that can add into the page cache /
1881	order = highest_order(orders);
1882	while (orders) {
1883	pages = `1UL` << order;
1884	aligned_index = round_down(index, pages);
1885	/*
1886	* Check for conflict before waiting on a huge allocation.
1887	* Conflict might be that a huge page has just been allocated
1888	* and added to page cache by a racing thread, or that there
1889	* is already at least one small page in the huge extent.
1890	* Be careful to retry when appropriate, but not forever!
1891	* Elsewhere -EEXIST would be the right code, but not here.
1892	*/
1893	if (!xa_find(xa: &mapping->i_pages, index: &aligned_index,
1894	max: aligned_index + pages - `1`, XA_PRESENT))
1895	break;
1896	order = next_order(orders: &orders, prev: order);
1897	}
1898
1899	return orders;
1900	}
1901	#else
1902	static unsigned long shmem_suitable_orders(struct inode inode, struct* vm_fault *vmf,
1903	struct address_space *mapping, pgoff_t index,
1904	unsigned long orders)
1905	{
1906	return `0`;
1907	}
1908	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1909
1910	static struct folio shmem_alloc_folio(gfp_t gfp, int* order,
1911	struct shmem_inode_info *info, pgoff_t index)
1912	{
1913	struct mempolicy *mpol;
1914	pgoff_t ilx;
1915	struct folio *folio;
1916
1917	mpol = shmem_get_pgoff_policy(info, index, order, ilx: &ilx);
1918	folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
1919	mpol_cond_put(pol: mpol);
1920
1921	return folio;
1922	}
1923
1924	static struct folio shmem_alloc_and_add_folio(struct* vm_fault *vmf,
1925	gfp_t gfp, struct inode *inode, pgoff_t index,
1926	struct mm_struct fault_mm, unsigned* long orders)
1927	{
1928	struct address_space *mapping = inode->i_mapping;
1929	struct shmem_inode_info *info = SHMEM_I(inode);
1930	unsigned long suitable_orders = `0`;
1931	struct folio *folio = NULL;
1932	pgoff_t aligned_index;
1933	long pages;
1934	int error, order;
1935
1936	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1937	orders = `0`;
1938
1939	if (orders > `0`) {
1940	suitable_orders = shmem_suitable_orders(inode, vmf,
1941	mapping, index, orders);
1942
1943	order = highest_order(orders: suitable_orders);
1944	while (suitable_orders) {
1945	pages = `1UL` << order;
1946	aligned_index = round_down(index, pages);
1947	folio = shmem_alloc_folio(gfp, order, info, index: aligned_index);
1948	if (folio) {
1949	index = aligned_index;
1950	goto allocated;
1951	}
1952
1953	if (pages == HPAGE_PMD_NR)
1954	count_vm_event(item: THP_FILE_FALLBACK);
1955	count_mthp_stat(order, item: MTHP_STAT_SHMEM_FALLBACK);
1956	order = next_order(orders: &suitable_orders, prev: order);
1957	}
1958	} else {
1959	pages = `1`;
1960	folio = shmem_alloc_folio(gfp, order: `0`, info, index);
1961	}
1962	if (!folio)
1963	return ERR_PTR(error: -ENOMEM);
1964
1965	allocated:
1966	__folio_set_locked(folio);
1967	__folio_set_swapbacked(folio);
1968
1969	gfp &= GFP_RECLAIM_MASK;
1970	error = mem_cgroup_charge(folio, mm: fault_mm, gfp);
1971	if (error) {
1972	if (xa_find(xa: &mapping->i_pages, index: &index,
1973	max: index + pages - `1`, XA_PRESENT)) {
1974	error = -EEXIST;
1975	} else if (pages > `1`) {
1976	if (pages == HPAGE_PMD_NR) {
1977	count_vm_event(item: THP_FILE_FALLBACK);
1978	count_vm_event(item: THP_FILE_FALLBACK_CHARGE);
1979	}
1980	count_mthp_stat(order: folio_order(folio), item: MTHP_STAT_SHMEM_FALLBACK);
1981	count_mthp_stat(order: folio_order(folio), item: MTHP_STAT_SHMEM_FALLBACK_CHARGE);
1982	}
1983	goto unlock;
1984	}
1985
1986	error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
1987	if (error)
1988	goto unlock;
1989
1990	error = shmem_inode_acct_blocks(inode, pages);
1991	if (error) {
1992	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
1993	long freed;
1994	/*
1995	* Try to reclaim some space by splitting a few
1996	* large folios beyond i_size on the filesystem.
1997	*/
1998	shmem_unused_huge_shrink(sbinfo, NULL, nr_to_free: pages);
1999	/*
2000	* And do a shmem_recalc_inode() to account for freed pages:
2001	* except our folio is there in cache, so not quite balanced.
2002	*/
2003	spin_lock(lock: &info->lock);
2004	freed = pages + info->alloced - info->swapped -
2005	READ_ONCE(mapping->nrpages);
2006	if (freed > `0`)
2007	info->alloced -= freed;
2008	spin_unlock(lock: &info->lock);
2009	if (freed > `0`)
2010	shmem_inode_unacct_blocks(inode, pages: freed);
2011	error = shmem_inode_acct_blocks(inode, pages);
2012	if (error) {
2013	filemap_remove_folio(folio);
2014	goto unlock;
2015	}
2016	}
2017
2018	shmem_recalc_inode(inode, alloced: pages, swapped: `0`);
2019	folio_add_lru(folio);
2020	return folio;
2021
2022	unlock:
2023	folio_unlock(folio);
2024	folio_put(folio);
2025	return ERR_PTR(error);
2026	}
2027
2028	static struct folio shmem_swap_alloc_folio(struct* inode *inode,
2029	struct vm_area_struct *vma, pgoff_t index,
2030	swp_entry_t entry, int order, gfp_t gfp)
2031	{
2032	struct shmem_inode_info *info = SHMEM_I(inode);
2033	int nr_pages = `1` << order;
2034	struct folio *new;
2035	gfp_t alloc_gfp;
2036	void *shadow;
2037
2038	/*
2039	* We have arrived here because our zones are constrained, so don't
2040	* limit chance of success with further cpuset and node constraints.
2041	*/
2042	gfp &= ~GFP_CONSTRAINT_MASK;
2043	alloc_gfp = gfp;
2044	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
2045	if (WARN_ON_ONCE(order))
2046	return ERR_PTR(error: -EINVAL);
2047	} else if (order) {
2048	/*
2049	* If uffd is active for the vma, we need per-page fault
2050	* fidelity to maintain the uffd semantics, then fallback
2051	* to swapin order-0 folio, as well as for zswap case.
2052	* Any existing sub folio in the swap cache also blocks
2053	* mTHP swapin.
2054	*/
2055	if ((vma && unlikely(userfaultfd_armed(vma))) \|\|
2056	!zswap_never_enabled() \|\|
2057	non_swapcache_batch(entry, max_nr: nr_pages) != nr_pages)
2058	goto fallback;
2059
2060	alloc_gfp = limit_gfp_mask(huge_gfp: vma_thp_gfp_mask(vma), limit_gfp: gfp);
2061	}
2062	retry:
2063	new = shmem_alloc_folio(gfp: alloc_gfp, order, info, index);
2064	if (!new) {
2065	new = ERR_PTR(error: -ENOMEM);
2066	goto fallback;
2067	}
2068
2069	if (mem_cgroup_swapin_charge_folio(folio: new, mm: vma ? vma->vm_mm : NULL,
2070	gfp: alloc_gfp, entry)) {
2071	folio_put(folio: new);
2072	new = ERR_PTR(error: -ENOMEM);
2073	goto fallback;
2074	}
2075
2076	/*
2077	* Prevent parallel swapin from proceeding with the swap cache flag.
2078	*
2079	* Of course there is another possible concurrent scenario as well,
2080	* that is to say, the swap cache flag of a large folio has already
2081	* been set by swapcache_prepare(), while another thread may have
2082	* already split the large swap entry stored in the shmem mapping.
2083	* In this case, shmem_add_to_page_cache() will help identify the
2084	* concurrent swapin and return -EEXIST.
2085	*/
2086	if (swapcache_prepare(entry, nr: nr_pages)) {
2087	folio_put(folio: new);
2088	new = ERR_PTR(error: -EEXIST);
2089	/ Try smaller folio to avoid cache conflict /
2090	goto fallback;
2091	}
2092
2093	__folio_set_locked(folio: new);
2094	__folio_set_swapbacked(folio: new);
2095	new->swap = entry;
2096
2097	memcg1_swapin(entry, nr_pages);
2098	shadow = swap_cache_get_shadow(entry);
2099	if (shadow)
2100	workingset_refault(folio: new, shadow);
2101	folio_add_lru(new);
2102	swap_read_folio(folio: new, NULL);
2103	return new;
2104	fallback:
2105	/ Order 0 swapin failed, nothing to fallback to, abort /
2106	if (!order)
2107	return new;
2108	entry.val += index - round_down(index, nr_pages);
2109	alloc_gfp = gfp;
2110	nr_pages = `1`;
2111	order = `0`;
2112	goto retry;
2113	}
2114
2115	/*
2116	* When a page is moved from swapcache to shmem filecache (either by the
2117	* usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
2118	* shmem_unuse_inode()), it may have been read in earlier from swap, in
2119	* ignorance of the mapping it belongs to. If that mapping has special
2120	* constraints (like the gma500 GEM driver, which requires RAM below 4GB),
2121	* we may need to copy to a suitable page before moving to filecache.
2122	*
2123	* In a future release, this may well be extended to respect cpuset and
2124	* NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
2125	* but for now it is a simple matter of zone.
2126	*/
2127	static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
2128	{
2129	return folio_zonenum(folio) > gfp_zone(flags: gfp);
2130	}
2131
2132	static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
2133	struct shmem_inode_info *info, pgoff_t index,
2134	struct vm_area_struct *vma)
2135	{
2136	struct swap_cluster_info *ci;
2137	struct folio new, old = *foliop;
2138	swp_entry_t entry = old->swap;
2139	int nr_pages = folio_nr_pages(folio: old);
2140	int error = `0`;
2141
2142	/*
2143	* We have arrived here because our zones are constrained, so don't
2144	* limit chance of success by further cpuset and node constraints.
2145	*/
2146	gfp &= ~GFP_CONSTRAINT_MASK;
2147	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2148	if (nr_pages > `1`) {
2149	gfp_t huge_gfp = vma_thp_gfp_mask(vma);
2150
2151	gfp = limit_gfp_mask(huge_gfp, limit_gfp: gfp);
2152	}
2153	#endif
2154
2155	new = shmem_alloc_folio(gfp, order: folio_order(folio: old), info, index);
2156	if (!new)
2157	return -ENOMEM;
2158
2159	folio_ref_add(folio: new, nr: nr_pages);
2160	folio_copy(dst: new, src: old);
2161	flush_dcache_folio(folio: new);
2162
2163	__folio_set_locked(folio: new);
2164	__folio_set_swapbacked(folio: new);
2165	folio_mark_uptodate(folio: new);
2166	new->swap = entry;
2167	folio_set_swapcache(folio: new);
2168
2169	ci = swap_cluster_get_and_lock_irq(folio: old);
2170	__swap_cache_replace_folio(ci, old, new);
2171	mem_cgroup_replace_folio(old, new);
2172	shmem_update_stats(folio: new, nr_pages);
2173	shmem_update_stats(folio: old, nr_pages: -nr_pages);
2174	swap_cluster_unlock_irq(ci);
2175
2176	folio_add_lru(new);
2177	*foliop = new;
2178
2179	folio_clear_swapcache(folio: old);
2180	old->private = NULL;
2181
2182	folio_unlock(folio: old);
2183	/*
2184	* The old folio are removed from swap cache, drop the 'nr_pages'
2185	* reference, as well as one temporary reference getting from swap
2186	* cache.
2187	*/
2188	folio_put_refs(folio: old, refs: nr_pages + `1`);
2189	return error;
2190	}
2191
2192	static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
2193	struct folio *folio, swp_entry_t swap,
2194	bool skip_swapcache)
2195	{
2196	struct address_space *mapping = inode->i_mapping;
2197	swp_entry_t swapin_error;
2198	void *old;
2199	int nr_pages;
2200
2201	swapin_error = make_poisoned_swp_entry();
2202	old = xa_cmpxchg_irq(xa: &mapping->i_pages, index,
2203	old: swp_to_radix_entry(entry: swap),
2204	entry: swp_to_radix_entry(entry: swapin_error), gfp: `0`);
2205	if (old != swp_to_radix_entry(entry: swap))
2206	return;
2207
2208	nr_pages = folio_nr_pages(folio);
2209	folio_wait_writeback(folio);
2210	if (!skip_swapcache)
2211	swap_cache_del_folio(folio);
2212	/*
2213	* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
2214	* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
2215	* in shmem_evict_inode().
2216	*/
2217	shmem_recalc_inode(inode, alloced: -nr_pages, swapped: -nr_pages);
2218	swap_free_nr(entry: swap, nr_pages);
2219	}
2220
2221	static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
2222	swp_entry_t swap, gfp_t gfp)
2223	{
2224	struct address_space *mapping = inode->i_mapping;
2225	XA_STATE_ORDER(xas, &mapping->i_pages, index, `0`);
2226	int split_order = `0`;
2227	int i;
2228
2229	/ Convert user data gfp flags to xarray node gfp flags /
2230	gfp &= GFP_RECLAIM_MASK;
2231
2232	for (;;) {
2233	void *old = NULL;
2234	int cur_order;
2235	pgoff_t swap_index;
2236
2237	xas_lock_irq(&xas);
2238	old = xas_load(&xas);
2239	if (!xa_is_value(entry: old) \|\| swp_to_radix_entry(entry: swap) != old) {
2240	xas_set_err(xas: &xas, err: -EEXIST);
2241	goto unlock;
2242	}
2243
2244	cur_order = xas_get_order(xas: &xas);
2245	if (!cur_order)
2246	goto unlock;
2247
2248	/ Try to split large swap entry in pagecache /
2249	swap_index = round_down(index, `1` << cur_order);
2250	split_order = xas_try_split_min_order(order: cur_order);
2251
2252	while (cur_order > `0`) {
2253	pgoff_t aligned_index =
2254	round_down(index, `1` << cur_order);
2255	pgoff_t swap_offset = aligned_index - swap_index;
2256
2257	xas_set_order(xas: &xas, index, order: split_order);
2258	xas_try_split(xas: &xas, entry: old, order: cur_order);
2259	if (xas_error(xas: &xas))
2260	goto unlock;
2261
2262	/*
2263	* Re-set the swap entry after splitting, and the swap
2264	* offset of the original large entry must be continuous.
2265	*/
2266	for (i = `0`; i < `1` << cur_order;
2267	i += (`1` << split_order)) {
2268	swp_entry_t tmp;
2269
2270	tmp = swp_entry(type: swp_type(entry: swap),
2271	offset: swp_offset(entry: swap) + swap_offset +
2272	i);
2273	__xa_store(&mapping->i_pages, index: aligned_index + i,
2274	entry: swp_to_radix_entry(entry: tmp), `0`);
2275	}
2276	cur_order = split_order;
2277	split_order = xas_try_split_min_order(order: split_order);
2278	}
2279
2280	unlock:
2281	xas_unlock_irq(&xas);
2282
2283	if (!xas_nomem(&xas, gfp))
2284	break;
2285	}
2286
2287	if (xas_error(xas: &xas))
2288	return xas_error(xas: &xas);
2289
2290	return `0`;
2291	}
2292
2293	/*
2294	* Swap in the folio pointed to by *foliop.
2295	* Caller has to make sure that *foliop contains a valid swapped folio.
2296	* Returns 0 and the folio in foliop if success. On failure, returns the
2297	* error code and NULL in *foliop.
2298	*/
2299	static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
2300	struct folio foliop, enum** sgp_type sgp,
2301	gfp_t gfp, struct vm_area_struct *vma,
2302	vm_fault_t *fault_type)
2303	{
2304	struct address_space *mapping = inode->i_mapping;
2305	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
2306	struct shmem_inode_info *info = SHMEM_I(inode);
2307	swp_entry_t swap;
2308	softleaf_t index_entry;
2309	struct swap_info_struct *si;
2310	struct folio *folio = NULL;
2311	bool skip_swapcache = false;
2312	int error, nr_pages, order;
2313	pgoff_t offset;
2314
2315	VM_BUG_ON(!foliop \|\| !xa_is_value(foliop));
2316	index_entry = radix_to_swp_entry(arg: *foliop);
2317	swap = index_entry;
2318	*foliop = NULL;
2319
2320	if (softleaf_is_poison_marker(entry: index_entry))
2321	return -EIO;
2322
2323	si = get_swap_device(entry: index_entry);
2324	order = shmem_confirm_swap(mapping, index, swap: index_entry);
2325	if (unlikely(!si)) {
2326	if (order < `0`)
2327	return -EEXIST;
2328	else
2329	return -EINVAL;
2330	}
2331	if (unlikely(order < `0`)) {
2332	put_swap_device(si);
2333	return -EEXIST;
2334	}
2335
2336	/ index may point to the middle of a large entry, get the sub entry /
2337	if (order) {
2338	offset = index - round_down(index, `1` << order);
2339	swap = swp_entry(type: swp_type(entry: swap), offset: swp_offset(entry: swap) + offset);
2340	}
2341
2342	/ Look it up and read it in.. /
2343	folio = swap_cache_get_folio(entry: swap);
2344	if (!folio) {
2345	if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
2346	/ Direct swapin skipping swap cache & readahead /
2347	folio = shmem_swap_alloc_folio(inode, vma, index,
2348	entry: index_entry, order, gfp);
2349	if (IS_ERR(ptr: folio)) {
2350	error = PTR_ERR(ptr: folio);
2351	folio = NULL;
2352	goto failed;
2353	}
2354	skip_swapcache = true;
2355	} else {
2356	/ Cached swapin only supports order 0 folio /
2357	folio = shmem_swapin_cluster(swap, gfp, info, index);
2358	if (!folio) {
2359	error = -ENOMEM;
2360	goto failed;
2361	}
2362	}
2363	if (fault_type) {
2364	*fault_type \|= VM_FAULT_MAJOR;
2365	count_vm_event(item: PGMAJFAULT);
2366	count_memcg_event_mm(mm: fault_mm, idx: PGMAJFAULT);
2367	}
2368	} else {
2369	swap_update_readahead(folio, NULL, addr: `0`);
2370	}
2371
2372	if (order > folio_order(folio)) {
2373	/*
2374	* Swapin may get smaller folios due to various reasons:
2375	* It may fallback to order 0 due to memory pressure or race,
2376	* swap readahead may swap in order 0 folios into swapcache
2377	* asynchronously, while the shmem mapping can still stores
2378	* large swap entries. In such cases, we should split the
2379	* large swap entry to prevent possible data corruption.
2380	*/
2381	error = shmem_split_large_entry(inode, index, swap: index_entry, gfp);
2382	if (error)
2383	goto failed_nolock;
2384	}
2385
2386	/*
2387	* If the folio is large, round down swap and index by folio size.
2388	* No matter what race occurs, the swap layer ensures we either get
2389	* a valid folio that has its swap entry aligned by size, or a
2390	* temporarily invalid one which we'll abort very soon and retry.
2391	*
2392	* shmem_add_to_page_cache ensures the whole range contains expected
2393	* entries and prevents any corruption, so any race split is fine
2394	* too, it will succeed as long as the entries are still there.
2395	*/
2396	nr_pages = folio_nr_pages(folio);
2397	if (nr_pages > `1`) {
2398	swap.val = round_down(swap.val, nr_pages);
2399	index = round_down(index, nr_pages);
2400	}
2401
2402	/*
2403	* We have to do this with the folio locked to prevent races.
2404	* The shmem_confirm_swap below only checks if the first swap
2405	* entry matches the folio, that's enough to ensure the folio
2406	* is not used outside of shmem, as shmem swap entries
2407	* and swap cache folios are never partially freed.
2408	*/
2409	folio_lock(folio);
2410	if ((!skip_swapcache && !folio_test_swapcache(folio)) \|\|
2411	shmem_confirm_swap(mapping, index, swap) < `0` \|\|
2412	folio->swap.val != swap.val) {
2413	error = -EEXIST;
2414	goto unlock;
2415	}
2416	if (!folio_test_uptodate(folio)) {
2417	error = -EIO;
2418	goto failed;
2419	}
2420	folio_wait_writeback(folio);
2421
2422	/*
2423	* Some architectures may have to restore extra metadata to the
2424	* folio after reading from swap.
2425	*/
2426	arch_swap_restore(entry: folio_swap(entry: swap, folio), folio);
2427
2428	if (shmem_should_replace_folio(folio, gfp)) {
2429	error = shmem_replace_folio(foliop: &folio, gfp, info, index, vma);
2430	if (error)
2431	goto failed;
2432	}
2433
2434	error = shmem_add_to_page_cache(folio, mapping, index,
2435	expected: swp_to_radix_entry(entry: swap), gfp);
2436	if (error)
2437	goto failed;
2438
2439	shmem_recalc_inode(inode, alloced: `0`, swapped: -nr_pages);
2440
2441	if (sgp == SGP_WRITE)
2442	folio_mark_accessed(folio);
2443
2444	if (skip_swapcache) {
2445	folio->swap.val = `0`;
2446	swapcache_clear(si, entry: swap, nr: nr_pages);
2447	} else {
2448	swap_cache_del_folio(folio);
2449	}
2450	folio_mark_dirty(folio);
2451	swap_free_nr(entry: swap, nr_pages);
2452	put_swap_device(si);
2453
2454	*foliop = folio;
2455	return `0`;
2456	failed:
2457	if (shmem_confirm_swap(mapping, index, swap) < `0`)
2458	error = -EEXIST;
2459	if (error == -EIO)
2460	shmem_set_folio_swapin_error(inode, index, folio, swap,
2461	skip_swapcache);
2462	unlock:
2463	if (folio)
2464	folio_unlock(folio);
2465	failed_nolock:
2466	if (skip_swapcache)
2467	swapcache_clear(si, entry: folio->swap, nr: folio_nr_pages(folio));
2468	if (folio)
2469	folio_put(folio);
2470	put_swap_device(si);
2471
2472	return error;
2473	}
2474
2475	/*
2476	* shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
2477	*
2478	* If we allocate a new one we do not mark it dirty. That's up to the
2479	* vm. If we swap it in we mark it dirty since we also free the swap
2480	* entry since a page cannot live in both the swap and page cache.
2481	*
2482	* vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
2483	*/
2484	static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
2485	loff_t write_end, struct folio foliop, enum** sgp_type sgp,
2486	gfp_t gfp, struct vm_fault vmf, vm_fault_t fault_type)
2487	{
2488	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
2489	struct mm_struct *fault_mm;
2490	struct folio *folio;
2491	int error;
2492	bool alloced;
2493	unsigned long orders = `0`;
2494
2495	if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
2496	return -EINVAL;
2497
2498	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
2499	return -EFBIG;
2500	repeat:
2501	if (sgp <= SGP_CACHE &&
2502	((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
2503	return -EINVAL;
2504
2505	alloced = false;
2506	fault_mm = vma ? vma->vm_mm : NULL;
2507
2508	folio = filemap_get_entry(mapping: inode->i_mapping, index);
2509	if (folio && vma && userfaultfd_minor(vma)) {
2510	if (!xa_is_value(entry: folio))
2511	folio_put(folio);
2512	*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
2513	return `0`;
2514	}
2515
2516	if (xa_is_value(entry: folio)) {
2517	error = shmem_swapin_folio(inode, index, foliop: &folio,
2518	sgp, gfp, vma, fault_type);
2519	if (error == -EEXIST)
2520	goto repeat;
2521
2522	*foliop = folio;
2523	return error;
2524	}
2525
2526	if (folio) {
2527	folio_lock(folio);
2528
2529	/ Has the folio been truncated or swapped out? /
2530	if (unlikely(folio->mapping != inode->i_mapping)) {
2531	folio_unlock(folio);
2532	folio_put(folio);
2533	goto repeat;
2534	}
2535	if (sgp == SGP_WRITE)
2536	folio_mark_accessed(folio);
2537	if (folio_test_uptodate(folio))
2538	goto out;
2539	/ fallocated folio /
2540	if (sgp != SGP_READ)
2541	goto clear;
2542	folio_unlock(folio);
2543	folio_put(folio);
2544	}
2545
2546	/*
2547	* SGP_READ: succeed on hole, with NULL folio, letting caller zero.
2548	* SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
2549	*/
2550	*foliop = NULL;
2551	if (sgp == SGP_READ)
2552	return `0`;
2553	if (sgp == SGP_NOALLOC)
2554	return -ENOENT;
2555
2556	/*
2557	* Fast cache lookup and swap lookup did not find it: allocate.
2558	*/
2559
2560	if (vma && userfaultfd_missing(vma)) {
2561	*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
2562	return `0`;
2563	}
2564
2565	/ Find hugepage orders that are allowed for anonymous shmem and tmpfs. /
2566	orders = shmem_allowable_huge_orders(inode, vma, index, write_end, shmem_huge_force: false);
2567	if (orders > `0`) {
2568	gfp_t huge_gfp;
2569
2570	huge_gfp = vma_thp_gfp_mask(vma);
2571	huge_gfp = limit_gfp_mask(huge_gfp, limit_gfp: gfp);
2572	folio = shmem_alloc_and_add_folio(vmf, gfp: huge_gfp,
2573	inode, index, fault_mm, orders);
2574	if (!IS_ERR(ptr: folio)) {
2575	if (folio_test_pmd_mappable(folio))
2576	count_vm_event(item: THP_FILE_ALLOC);
2577	count_mthp_stat(order: folio_order(folio), item: MTHP_STAT_SHMEM_ALLOC);
2578	goto alloced;
2579	}
2580	if (PTR_ERR(ptr: folio) == -EEXIST)
2581	goto repeat;
2582	}
2583
2584	folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, orders: `0`);
2585	if (IS_ERR(ptr: folio)) {
2586	error = PTR_ERR(ptr: folio);
2587	if (error == -EEXIST)
2588	goto repeat;
2589	folio = NULL;
2590	goto unlock;
2591	}
2592
2593	alloced:
2594	alloced = true;
2595	if (folio_test_large(folio) &&
2596	DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
2597	folio_next_index(folio)) {
2598	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
2599	struct shmem_inode_info *info = SHMEM_I(inode);
2600	/*
2601	* Part of the large folio is beyond i_size: subject
2602	* to shrink under memory pressure.
2603	*/
2604	spin_lock(lock: &sbinfo->shrinklist_lock);
2605	/*
2606	* _careful to defend against unlocked access to
2607	* ->shrink_list in shmem_unused_huge_shrink()
2608	*/
2609	if (list_empty_careful(head: &info->shrinklist)) {
2610	list_add_tail(new: &info->shrinklist,
2611	head: &sbinfo->shrinklist);
2612	sbinfo->shrinklist_len++;
2613	}
2614	spin_unlock(lock: &sbinfo->shrinklist_lock);
2615	}
2616
2617	if (sgp == SGP_WRITE)
2618	folio_set_referenced(folio);
2619	/*
2620	* Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
2621	*/
2622	if (sgp == SGP_FALLOC)
2623	sgp = SGP_WRITE;
2624	clear:
2625	/*
2626	* Let SGP_WRITE caller clear ends if write does not fill folio;
2627	* but SGP_FALLOC on a folio fallocated earlier must initialize
2628	* it now, lest undo on failure cancel our earlier guarantee.
2629	*/
2630	if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
2631	long i, n = folio_nr_pages(folio);
2632
2633	for (i = `0`; i < n; i++)
2634	clear_highpage(folio_page(folio, i));
2635	flush_dcache_folio(folio);
2636	folio_mark_uptodate(folio);
2637	}
2638
2639	/ Perhaps the file has been truncated since we checked /
2640	if (sgp <= SGP_CACHE &&
2641	((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
2642	error = -EINVAL;
2643	goto unlock;
2644	}
2645	out:
2646	*foliop = folio;
2647	return `0`;
2648
2649	/*
2650	* Error recovery.
2651	*/
2652	unlock:
2653	if (alloced)
2654	filemap_remove_folio(folio);
2655	shmem_recalc_inode(inode, alloced: `0`, swapped: `0`);
2656	if (folio) {
2657	folio_unlock(folio);
2658	folio_put(folio);
2659	}
2660	return error;
2661	}
2662
2663	/**
2664	* shmem_get_folio - find, and lock a shmem folio.
2665	* @inode: inode to search
2666	* @index: the page index.
2667	* @write_end: end of a write, could extend inode size
2668	* @foliop: pointer to the folio if found
2669	* @sgp: SGP_* flags to control behavior
2670	*
2671	* Looks up the page cache entry at @inode & @index. If a folio is
2672	* present, it is returned locked with an increased refcount.
2673	*
2674	* If the caller modifies data in the folio, it must call folio_mark_dirty()
2675	* before unlocking the folio to ensure that the folio is not reclaimed.
2676	* There is no need to reserve space before calling folio_mark_dirty().
2677	*
2678	* When no folio is found, the behavior depends on @sgp:
2679	* - for SGP_READ, *@foliop is %NULL and 0 is returned
2680	* - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
2681	* - for all other flags a new folio is allocated, inserted into the
2682	* page cache and returned locked in @foliop.
2683	*
2684	* Context: May sleep.
2685	* Return: 0 if successful, else a negative error code.
2686	*/
2687	int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
2688	struct folio foliop, enum** sgp_type sgp)
2689	{
2690	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
2691	gfp: mapping_gfp_mask(mapping: inode->i_mapping), NULL, NULL);
2692	}
2693	EXPORT_SYMBOL_GPL(shmem_get_folio);
2694
2695	/*
2696	* This is like autoremove_wake_function, but it removes the wait queue
2697	* entry unconditionally - even if something else had already woken the
2698	* target.
2699	*/
2700	static int synchronous_wake_function(wait_queue_entry_t *wait,
2701	unsigned int mode, int sync, void *key)
2702	{
2703	int ret = default_wake_function(wq_entry: wait, mode, flags: sync, key);
2704	list_del_init(entry: &wait->entry);
2705	return ret;
2706	}
2707
2708	/*
2709	* Trinity finds that probing a hole which tmpfs is punching can
2710	* prevent the hole-punch from ever completing: which in turn
2711	* locks writers out with its hold on i_rwsem. So refrain from
2712	* faulting pages into the hole while it's being punched. Although
2713	* shmem_undo_range() does remove the additions, it may be unable to
2714	* keep up, as each new page needs its own unmap_mapping_range() call,
2715	* and the i_mmap tree grows ever slower to scan if new vmas are added.
2716	*
2717	* It does not matter if we sometimes reach this check just before the
2718	* hole-punch begins, so that one fault then races with the punch:
2719	* we just need to make racing faults a rare case.
2720	*
2721	* The implementation below would be much simpler if we just used a
2722	* standard mutex or completion: but we cannot take i_rwsem in fault,
2723	* and bloating every shmem inode for this unlikely case would be sad.
2724	*/
2725	static vm_fault_t shmem_falloc_wait(struct vm_fault vmf, struct* inode *inode)
2726	{
2727	struct shmem_falloc *shmem_falloc;
2728	struct file *fpin = NULL;
2729	vm_fault_t ret = `0`;
2730
2731	spin_lock(lock: &inode->i_lock);
2732	shmem_falloc = inode->i_private;
2733	if (shmem_falloc &&
2734	shmem_falloc->waitq &&
2735	vmf->pgoff >= shmem_falloc->start &&
2736	vmf->pgoff < shmem_falloc->next) {
2737	wait_queue_head_t *shmem_falloc_waitq;
2738	DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2739
2740	ret = VM_FAULT_NOPAGE;
2741	fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2742	shmem_falloc_waitq = shmem_falloc->waitq;
2743	prepare_to_wait(wq_head: shmem_falloc_waitq, wq_entry: &shmem_fault_wait,
2744	TASK_UNINTERRUPTIBLE);
2745	spin_unlock(lock: &inode->i_lock);
2746	schedule();
2747
2748	/*
2749	* shmem_falloc_waitq points into the shmem_fallocate()
2750	* stack of the hole-punching task: shmem_falloc_waitq
2751	* is usually invalid by the time we reach here, but
2752	* finish_wait() does not dereference it in that case;
2753	* though i_lock needed lest racing with wake_up_all().
2754	*/
2755	spin_lock(lock: &inode->i_lock);
2756	finish_wait(wq_head: shmem_falloc_waitq, wq_entry: &shmem_fault_wait);
2757	}
2758	spin_unlock(lock: &inode->i_lock);
2759	if (fpin) {
2760	fput(fpin);
2761	ret = VM_FAULT_RETRY;
2762	}
2763	return ret;
2764	}
2765
2766	static vm_fault_t shmem_fault(struct vm_fault *vmf)
2767	{
2768	struct inode *inode = file_inode(f: vmf->vma->vm_file);
2769	gfp_t gfp = mapping_gfp_mask(mapping: inode->i_mapping);
2770	struct folio *folio = NULL;
2771	vm_fault_t ret = `0`;
2772	int err;
2773
2774	/*
2775	* Trinity finds that probing a hole which tmpfs is punching can
2776	* prevent the hole-punch from ever completing: noted in i_private.
2777	*/
2778	if (unlikely(inode->i_private)) {
2779	ret = shmem_falloc_wait(vmf, inode);
2780	if (ret)
2781	return ret;
2782	}
2783
2784	WARN_ON_ONCE(vmf->page != NULL);
2785	err = shmem_get_folio_gfp(inode, index: vmf->pgoff, write_end: `0`, foliop: &folio, sgp: SGP_CACHE,
2786	gfp, vmf, fault_type: &ret);
2787	if (err)
2788	return vmf_error(err);
2789	if (folio) {
2790	vmf->page = folio_file_page(folio, index: vmf->pgoff);
2791	ret \|= VM_FAULT_LOCKED;
2792	}
2793	return ret;
2794	}
2795
2796	unsigned long shmem_get_unmapped_area(struct file *file,
2797	unsigned long uaddr, unsigned long len,
2798	unsigned long pgoff, unsigned long flags)
2799	{
2800	unsigned long addr;
2801	unsigned long offset;
2802	unsigned long inflated_len;
2803	unsigned long inflated_addr;
2804	unsigned long inflated_offset;
2805	unsigned long hpage_size;
2806
2807	if (len > TASK_SIZE)
2808	return -ENOMEM;
2809
2810	addr = mm_get_unmapped_area(filp: file, addr: uaddr, len, pgoff, flags);
2811
2812	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2813	return addr;
2814	if (IS_ERR_VALUE(addr))
2815	return addr;
2816	if (addr & ~PAGE_MASK)
2817	return addr;
2818	if (addr > TASK_SIZE - len)
2819	return addr;
2820
2821	if (shmem_huge == SHMEM_HUGE_DENY)
2822	return addr;
2823	if (flags & MAP_FIXED)
2824	return addr;
2825	/*
2826	* Our priority is to support MAP_SHARED mapped hugely;
2827	* and support MAP_PRIVATE mapped hugely too, until it is COWed.
2828	* But if caller specified an address hint and we allocated area there
2829	* successfully, respect that as before.
2830	*/
2831	if (uaddr == addr)
2832	return addr;
2833
2834	hpage_size = HPAGE_PMD_SIZE;
2835	if (shmem_huge != SHMEM_HUGE_FORCE) {
2836	struct super_block *sb;
2837	unsigned long __maybe_unused hpage_orders;
2838	int order = `0`;
2839
2840	if (file) {
2841	VM_BUG_ON(file->f_op != &shmem_file_operations);
2842	sb = file_inode(f: file)->i_sb;
2843	} else {
2844	/*
2845	* Called directly from mm/mmap.c, or drivers/char/mem.c
2846	* for "/dev/zero", to create a shared anonymous object.
2847	*/
2848	if (IS_ERR(ptr: shm_mnt))
2849	return addr;
2850	sb = shm_mnt->mnt_sb;
2851
2852	/*
2853	* Find the highest mTHP order used for anonymous shmem to
2854	* provide a suitable alignment address.
2855	*/
2856	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2857	hpage_orders = READ_ONCE(huge_shmem_orders_always);
2858	hpage_orders \|= READ_ONCE(huge_shmem_orders_within_size);
2859	hpage_orders \|= READ_ONCE(huge_shmem_orders_madvise);
2860	if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
2861	hpage_orders \|= READ_ONCE(huge_shmem_orders_inherit);
2862
2863	if (hpage_orders > `0`) {
2864	order = highest_order(orders: hpage_orders);
2865	hpage_size = PAGE_SIZE << order;
2866	}
2867	#endif
2868	}
2869	if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
2870	return addr;
2871	}
2872
2873	if (len < hpage_size)
2874	return addr;
2875
2876	offset = (pgoff << PAGE_SHIFT) & (hpage_size - `1`);
2877	if (offset && offset + len < `2` * hpage_size)
2878	return addr;
2879	if ((addr & (hpage_size - `1`)) == offset)
2880	return addr;
2881
2882	inflated_len = len + hpage_size - PAGE_SIZE;
2883	if (inflated_len > TASK_SIZE)
2884	return addr;
2885	if (inflated_len < len)
2886	return addr;
2887
2888	inflated_addr = mm_get_unmapped_area(NULL, addr: uaddr, len: inflated_len, pgoff: `0`, flags);
2889	if (IS_ERR_VALUE(inflated_addr))
2890	return addr;
2891	if (inflated_addr & ~PAGE_MASK)
2892	return addr;
2893
2894	inflated_offset = inflated_addr & (hpage_size - `1`);
2895	inflated_addr += offset - inflated_offset;
2896	if (inflated_offset > offset)
2897	inflated_addr += hpage_size;
2898
2899	if (inflated_addr > TASK_SIZE - len)
2900	return addr;
2901	return inflated_addr;
2902	}
2903
2904	#ifdef CONFIG_NUMA
2905	static int shmem_set_policy(struct vm_area_struct vma, struct* mempolicy *mpol)
2906	{
2907	struct inode *inode = file_inode(f: vma->vm_file);
2908	return mpol_set_shared_policy(sp: &SHMEM_I(inode)->policy, vma, mpol);
2909	}
2910
2911	static struct mempolicy shmem_get_policy(struct* vm_area_struct *vma,
2912	unsigned long addr, pgoff_t *ilx)
2913	{
2914	struct inode *inode = file_inode(f: vma->vm_file);
2915	pgoff_t index;
2916
2917	/*
2918	* Bias interleave by inode number to distribute better across nodes;
2919	* but this interface is independent of which page order is used, so
2920	* supplies only that bias, letting caller apply the offset (adjusted
2921	* by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
2922	*/
2923	*ilx = inode->i_ino;
2924	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2925	return mpol_shared_policy_lookup(sp: &SHMEM_I(inode)->policy, idx: index);
2926	}
2927
2928	static struct mempolicy shmem_get_pgoff_policy(struct* shmem_inode_info *info,
2929	pgoff_t index, unsigned int order, pgoff_t *ilx)
2930	{
2931	struct mempolicy *mpol;
2932
2933	/ Bias interleave by inode number to distribute better across nodes /
2934	*ilx = info->vfs_inode.i_ino + (index >> order);
2935
2936	mpol = mpol_shared_policy_lookup(sp: &info->policy, idx: index);
2937	return mpol ? mpol : get_task_policy(current);
2938	}
2939	#else
2940	static struct mempolicy shmem_get_pgoff_policy(struct* shmem_inode_info *info,
2941	pgoff_t index, unsigned int order, pgoff_t *ilx)
2942	{
2943	*ilx = `0`;
2944	return NULL;
2945	}
2946	#endif /* CONFIG_NUMA */
2947
2948	int shmem_lock(struct file file, int* lock, struct ucounts *ucounts)
2949	{
2950	struct inode *inode = file_inode(f: file);
2951	struct shmem_inode_info *info = SHMEM_I(inode);
2952	int retval = -ENOMEM;
2953
2954	/*
2955	* What serializes the accesses to info->flags?
2956	* ipc_lock_object() when called from shmctl_do_lock(),
2957	* no serialization needed when called from shm_destroy().
2958	*/
2959	if (lock && !(info->flags & SHMEM_F_LOCKED)) {
2960	if (!user_shm_lock(inode->i_size, ucounts))
2961	goto out_nomem;
2962	info->flags \|= SHMEM_F_LOCKED;
2963	mapping_set_unevictable(mapping: file->f_mapping);
2964	}
2965	if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) {
2966	user_shm_unlock(inode->i_size, ucounts);
2967	info->flags &= ~SHMEM_F_LOCKED;
2968	mapping_clear_unevictable(mapping: file->f_mapping);
2969	}
2970	retval = `0`;
2971
2972	out_nomem:
2973	return retval;
2974	}
2975
2976	static int shmem_mmap_prepare(struct vm_area_desc *desc)
2977	{
2978	struct file *file = desc->file;
2979	struct inode *inode = file_inode(f: file);
2980
2981	file_accessed(file);
2982	/ This is anonymous shared memory if it is unlinked at the time of mmap /
2983	if (inode->i_nlink)
2984	desc->vm_ops = &shmem_vm_ops;
2985	else
2986	desc->vm_ops = &shmem_anon_vm_ops;
2987	return `0`;
2988	}
2989
2990	static int shmem_file_open(struct inode inode, struct* file *file)
2991	{
2992	file->f_mode \|= FMODE_CAN_ODIRECT;
2993	return generic_file_open(inode, filp: file);
2994	}
2995
2996	#ifdef CONFIG_TMPFS_XATTR
/* Forward declaration; the definition is outside this part of the file. */
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2998
2999	#if IS_ENABLED(CONFIG_UNICODE)
3000	/*
3001	* shmem_inode_casefold_flags - Deal with casefold file attribute flag
3002	*
3003	* The casefold file attribute needs some special checks. I can just be added to
3004	* an empty dir, and can't be removed from a non-empty dir.
3005	*/
3006	static int shmem_inode_casefold_flags(struct inode inode, unsigned* int fsflags,
3007	struct dentry dentry, unsigned* int *i_flags)
3008	{
3009	unsigned int old = inode->i_flags;
3010	struct super_block *sb = inode->i_sb;
3011
3012	if (fsflags & FS_CASEFOLD_FL) {
3013	if (!(old & S_CASEFOLD)) {
3014	if (!sb->s_encoding)
3015	return -EOPNOTSUPP;
3016
3017	if (!S_ISDIR(inode->i_mode))
3018	return -ENOTDIR;
3019
3020	if (dentry && !simple_empty(dentry))
3021	return -ENOTEMPTY;
3022	}
3023
3024	i_flags = i_flags \| S_CASEFOLD;
3025	} else if (old & S_CASEFOLD) {
3026	if (dentry && !simple_empty(dentry))
3027	return -ENOTEMPTY;
3028	}
3029
3030	return `0`;
3031	}
3032	#else
3033	static int shmem_inode_casefold_flags(struct inode inode, unsigned* int fsflags,
3034	struct dentry dentry, unsigned* int *i_flags)
3035	{
3036	if (fsflags & FS_CASEFOLD_FL)
3037	return -EOPNOTSUPP;
3038
3039	return `0`;
3040	}
3041	#endif
3042
3043	/*
3044	* chattr's fsflags are unrelated to extended attributes,
3045	* but tmpfs has chosen to enable them under the same config option.
3046	*/
3047	static int shmem_set_inode_flags(struct inode inode, unsigned* int fsflags, struct dentry *dentry)
3048	{
3049	unsigned int i_flags = `0`;
3050	int ret;
3051
3052	ret = shmem_inode_casefold_flags(inode, fsflags, dentry, i_flags: &i_flags);
3053	if (ret)
3054	return ret;
3055
3056	if (fsflags & FS_NOATIME_FL)
3057	i_flags \|= S_NOATIME;
3058	if (fsflags & FS_APPEND_FL)
3059	i_flags \|= S_APPEND;
3060	if (fsflags & FS_IMMUTABLE_FL)
3061	i_flags \|= S_IMMUTABLE;
3062	/*
3063	* But FS_NODUMP_FL does not require any action in i_flags.
3064	*/
3065	inode_set_flags(inode, flags: i_flags, S_NOATIME \| S_APPEND \| S_IMMUTABLE \| S_CASEFOLD);
3066
3067	return `0`;
3068	}
3069	#else
/* Variant built without CONFIG_TMPFS_XATTR: deliberately does nothing. */
static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
}
3073	#define shmem_initxattrs NULL
3074	#endif
3075
3076	static struct offset_ctx shmem_get_offset_ctx(struct* inode *inode)
3077	{
3078	return &SHMEM_I(inode)->dir_offsets;
3079	}
3080
3081	static struct inode __shmem_get_inode(struct* mnt_idmap *idmap,
3082	struct super_block *sb,
3083	struct inode *dir, umode_t mode,
3084	dev_t dev, unsigned long flags)
3085	{
3086	struct inode *inode;
3087	struct shmem_inode_info *info;
3088	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3089	ino_t ino;
3090	int err;
3091
3092	err = shmem_reserve_inode(sb, inop: &ino);
3093	if (err)
3094	return ERR_PTR(error: err);
3095
3096	inode = new_inode(sb);
3097	if (!inode) {
3098	shmem_free_inode(sb, freed_ispace: `0`);
3099	return ERR_PTR(error: -ENOSPC);
3100	}
3101
3102	inode->i_ino = ino;
3103	inode_init_owner(idmap, inode, dir, mode);
3104	inode->i_blocks = `0`;
3105	simple_inode_init_ts(inode);
3106	inode->i_generation = get_random_u32();
3107	info = SHMEM_I(inode);
3108	memset(info, `0`, (char )inode - (char* *)info);
3109	spin_lock_init(&info->lock);
3110	atomic_set(v: &info->stop_eviction, i: `0`);
3111	info->seals = F_SEAL_SEAL;
3112	info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : `0`;
3113	info->i_crtime = inode_get_mtime(inode);
3114	info->fsflags = (dir == NULL) ? `0` :
3115	SHMEM_I(inode: dir)->fsflags & SHMEM_FL_INHERITED;
3116	if (info->fsflags)
3117	shmem_set_inode_flags(inode, fsflags: info->fsflags, NULL);
3118	INIT_LIST_HEAD(list: &info->shrinklist);
3119	INIT_LIST_HEAD(list: &info->swaplist);
3120	simple_xattrs_init(xattrs: &info->xattrs);
3121	cache_no_acl(inode);
3122	if (sbinfo->noswap)
3123	mapping_set_unevictable(mapping: inode->i_mapping);
3124
3125	/ Don't consider 'deny' for emergencies and 'force' for testing /
3126	if (sbinfo->huge)
3127	mapping_set_large_folios(mapping: inode->i_mapping);
3128
3129	switch (mode & S_IFMT) {
3130	default:
3131	inode->i_op = &shmem_special_inode_operations;
3132	init_special_inode(inode, mode, dev);
3133	break;
3134	case S_IFREG:
3135	inode->i_mapping->a_ops = &shmem_aops;
3136	inode->i_op = &shmem_inode_operations;
3137	inode->i_fop = &shmem_file_operations;
3138	mpol_shared_policy_init(sp: &info->policy,
3139	mpol: shmem_get_sbmpol(sbinfo));
3140	break;
3141	case S_IFDIR:
3142	inc_nlink(inode);
3143	/ Some things misbehave if size == 0 on a directory /
3144	inode->i_size = `2` * BOGO_DIRENT_SIZE;
3145	inode->i_op = &shmem_dir_inode_operations;
3146	inode->i_fop = &simple_offset_dir_operations;
3147	simple_offset_init(octx: shmem_get_offset_ctx(inode));
3148	break;
3149	case S_IFLNK:
3150	/*
3151	* Must not load anything in the rbtree,
3152	* mpol_free_shared_policy will not be called.
3153	*/
3154	mpol_shared_policy_init(sp: &info->policy, NULL);
3155	break;
3156	}
3157
3158	lockdep_annotate_inode_mutex_key(inode);
3159	return inode;
3160	}
3161
3162	#ifdef CONFIG_TMPFS_QUOTA
3163	static struct inode shmem_get_inode(struct* mnt_idmap *idmap,
3164	struct super_block sb, struct* inode *dir,
3165	umode_t mode, dev_t dev, unsigned long flags)
3166	{
3167	int err;
3168	struct inode *inode;
3169
3170	inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3171	if (IS_ERR(ptr: inode))
3172	return inode;
3173
3174	err = dquot_initialize(inode);
3175	if (err)
3176	goto errout;
3177
3178	err = dquot_alloc_inode(inode);
3179	if (err) {
3180	dquot_drop(inode);
3181	goto errout;
3182	}
3183	return inode;
3184
3185	errout:
3186	inode->i_flags \|= S_NOQUOTA;
3187	iput(inode);
3188	return ERR_PTR(error: err);
3189	}
3190	#else
3191	static inline struct inode shmem_get_inode(struct* mnt_idmap *idmap,
3192	struct super_block sb, struct* inode *dir,
3193	umode_t mode, dev_t dev, unsigned long flags)
3194	{
3195	return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3196	}
3197	#endif /* CONFIG_TMPFS_QUOTA */
3198
3199	#ifdef CONFIG_USERFAULTFD
3200	int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
3201	struct vm_area_struct *dst_vma,
3202	unsigned long dst_addr,
3203	unsigned long src_addr,
3204	uffd_flags_t flags,
3205	struct folio **foliop)
3206	{
3207	struct inode *inode = file_inode(f: dst_vma->vm_file);
3208	struct shmem_inode_info *info = SHMEM_I(inode);
3209	struct address_space *mapping = inode->i_mapping;
3210	gfp_t gfp = mapping_gfp_mask(mapping);
3211	pgoff_t pgoff = linear_page_index(vma: dst_vma, address: dst_addr);
3212	void *page_kaddr;
3213	struct folio *folio;
3214	int ret;
3215	pgoff_t max_off;
3216
3217	if (shmem_inode_acct_blocks(inode, pages: `1`)) {
3218	/*
3219	* We may have got a page, returned -ENOENT triggering a retry,
3220	* and now we find ourselves with -ENOMEM. Release the page, to
3221	* avoid a BUG_ON in our caller.
3222	*/
3223	if (unlikely(*foliop)) {
3224	folio_put(folio: *foliop);
3225	*foliop = NULL;
3226	}
3227	return -ENOMEM;
3228	}
3229
3230	if (!*foliop) {
3231	ret = -ENOMEM;
3232	folio = shmem_alloc_folio(gfp, order: `0`, info, index: pgoff);
3233	if (!folio)
3234	goto out_unacct_blocks;
3235
3236	if (uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_COPY)) {
3237	page_kaddr = kmap_local_folio(folio, offset: `0`);
3238	/*
3239	* The read mmap_lock is held here. Despite the
3240	* mmap_lock being read recursive a deadlock is still
3241	* possible if a writer has taken a lock. For example:
3242	*
3243	* process A thread 1 takes read lock on own mmap_lock
3244	* process A thread 2 calls mmap, blocks taking write lock
3245	* process B thread 1 takes page fault, read lock on own mmap lock
3246	* process B thread 2 calls mmap, blocks taking write lock
3247	* process A thread 1 blocks taking read lock on process B
3248	* process B thread 1 blocks taking read lock on process A
3249	*
3250	* Disable page faults to prevent potential deadlock
3251	* and retry the copy outside the mmap_lock.
3252	*/
3253	pagefault_disable();
3254	ret = copy_from_user(to: page_kaddr,
3255	from: (const void __user *)src_addr,
3256	PAGE_SIZE);
3257	pagefault_enable();
3258	kunmap_local(page_kaddr);
3259
3260	/ fallback to copy_from_user outside mmap_lock /
3261	if (unlikely(ret)) {
3262	*foliop = folio;
3263	ret = -ENOENT;
3264	/ don't free the page /
3265	goto out_unacct_blocks;
3266	}
3267
3268	flush_dcache_folio(folio);
3269	} else { / ZEROPAGE /
3270	clear_user_highpage(page: &folio->page, vaddr: dst_addr);
3271	}
3272	} else {
3273	folio = *foliop;
3274	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
3275	*foliop = NULL;
3276	}
3277
3278	VM_BUG_ON(folio_test_locked(folio));
3279	VM_BUG_ON(folio_test_swapbacked(folio));
3280	__folio_set_locked(folio);
3281	__folio_set_swapbacked(folio);
3282	__folio_mark_uptodate(folio);
3283
3284	ret = -EFAULT;
3285	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3286	if (unlikely(pgoff >= max_off))
3287	goto out_release;
3288
3289	ret = mem_cgroup_charge(folio, mm: dst_vma->vm_mm, gfp);
3290	if (ret)
3291	goto out_release;
3292	ret = shmem_add_to_page_cache(folio, mapping, index: pgoff, NULL, gfp);
3293	if (ret)
3294	goto out_release;
3295
3296	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
3297	page: &folio->page, newly_allocated: true, flags);
3298	if (ret)
3299	goto out_delete_from_cache;
3300
3301	shmem_recalc_inode(inode, alloced: `1`, swapped: `0`);
3302	folio_unlock(folio);
3303	return `0`;
3304	out_delete_from_cache:
3305	filemap_remove_folio(folio);
3306	out_release:
3307	folio_unlock(folio);
3308	folio_put(folio);
3309	out_unacct_blocks:
3310	shmem_inode_unacct_blocks(inode, pages: `1`);
3311	return ret;
3312	}
3313	#endif /* CONFIG_USERFAULTFD */
3314
3315	#ifdef CONFIG_TMPFS
3316	static const struct inode_operations shmem_symlink_inode_operations;
3317	static const struct inode_operations shmem_short_symlink_operations;
3318
3319	static int
3320	shmem_write_begin(const struct kiocb iocb, struct* address_space *mapping,
3321	loff_t pos, unsigned len,
3322	struct folio *foliop, void* **fsdata)
3323	{
3324	struct inode *inode = mapping->host;
3325	struct shmem_inode_info *info = SHMEM_I(inode);
3326	pgoff_t index = pos >> PAGE_SHIFT;
3327	struct folio *folio;
3328	int ret = `0`;
3329
3330	/ i_rwsem is held by caller /
3331	if (unlikely(info->seals & (F_SEAL_GROW \|
3332	F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE))) {
3333	if (info->seals & (F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE))
3334	return -EPERM;
3335	if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
3336	return -EPERM;
3337	}
3338
3339	if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) &&
3340	pos + len > inode->i_size))
3341	return -EPERM;
3342
3343	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
3344	if (ret)
3345	return ret;
3346
3347	if (folio_contain_hwpoisoned_page(folio)) {
3348	folio_unlock(folio);
3349	folio_put(folio);
3350	return -EIO;
3351	}
3352
3353	*foliop = folio;
3354	return `0`;
3355	}
3356
3357	static int
3358	shmem_write_end(const struct kiocb iocb, struct* address_space *mapping,
3359	loff_t pos, unsigned len, unsigned copied,
3360	struct folio folio, void* *fsdata)
3361	{
3362	struct inode *inode = mapping->host;
3363
3364	if (pos + copied > inode->i_size)
3365	i_size_write(inode, i_size: pos + copied);
3366
3367	if (!folio_test_uptodate(folio)) {
3368	if (copied < folio_size(folio)) {
3369	size_t from = offset_in_folio(folio, pos);
3370	folio_zero_segments(folio, start1: `0`, xend1: from,
3371	start2: from + copied, xend2: folio_size(folio));
3372	}
3373	folio_mark_uptodate(folio);
3374	}
3375	folio_mark_dirty(folio);
3376	folio_unlock(folio);
3377	folio_put(folio);
3378
3379	return copied;
3380	}
3381
3382	static ssize_t shmem_file_read_iter(struct kiocb iocb, struct* iov_iter *to)
3383	{
3384	struct file *file = iocb->ki_filp;
3385	struct inode *inode = file_inode(f: file);
3386	struct address_space *mapping = inode->i_mapping;
3387	pgoff_t index;
3388	unsigned long offset;
3389	int error = `0`;
3390	ssize_t retval = `0`;
3391
3392	for (;;) {
3393	struct folio *folio = NULL;
3394	struct page *page = NULL;
3395	unsigned long nr, ret;
3396	loff_t end_offset, i_size = i_size_read(inode);
3397	bool fallback_page_copy = false;
3398	size_t fsize;
3399
3400	if (unlikely(iocb->ki_pos >= i_size))
3401	break;
3402
3403	index = iocb->ki_pos >> PAGE_SHIFT;
3404	error = shmem_get_folio(inode, index, `0`, &folio, SGP_READ);
3405	if (error) {
3406	if (error == -EINVAL)
3407	error = `0`;
3408	break;
3409	}
3410	if (folio) {
3411	folio_unlock(folio);
3412
3413	page = folio_file_page(folio, index);
3414	if (PageHWPoison(page)) {
3415	folio_put(folio);
3416	error = -EIO;
3417	break;
3418	}
3419
3420	if (folio_test_large(folio) &&
3421	folio_test_has_hwpoisoned(folio))
3422	fallback_page_copy = true;
3423	}
3424
3425	/*
3426	* We must evaluate after, since reads (unlike writes)
3427	* are called without i_rwsem protection against truncate
3428	*/
3429	i_size = i_size_read(inode);
3430	if (unlikely(iocb->ki_pos >= i_size)) {
3431	if (folio)
3432	folio_put(folio);
3433	break;
3434	}
3435	end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
3436	if (folio && likely(!fallback_page_copy))
3437	fsize = folio_size(folio);
3438	else
3439	fsize = PAGE_SIZE;
3440	offset = iocb->ki_pos & (fsize - `1`);
3441	nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);
3442
3443	if (folio) {
3444	/*
3445	* If users can be writing to this page using arbitrary
3446	* virtual addresses, take care about potential aliasing
3447	* before reading the page on the kernel side.
3448	*/
3449	if (mapping_writably_mapped(mapping)) {
3450	if (likely(!fallback_page_copy))
3451	flush_dcache_folio(folio);
3452	else
3453	flush_dcache_page(page);
3454	}
3455
3456	/*
3457	* Mark the folio accessed if we read the beginning.
3458	*/
3459	if (!offset)
3460	folio_mark_accessed(folio);
3461	/*
3462	* Ok, we have the page, and it's up-to-date, so
3463	* now we can copy it to user space...
3464	*/
3465	if (likely(!fallback_page_copy))
3466	ret = copy_folio_to_iter(folio, offset, bytes: nr, i: to);
3467	else
3468	ret = copy_page_to_iter(page, offset, bytes: nr, i: to);
3469	folio_put(folio);
3470	} else if (user_backed_iter(i: to)) {
3471	/*
3472	* Copy to user tends to be so well optimized, but
3473	* clear_user() not so much, that it is noticeably
3474	* faster to copy the zero page instead of clearing.
3475	*/
3476	ret = copy_page_to_iter(ZERO_PAGE(`0`), offset, bytes: nr, i: to);
3477	} else {
3478	/*
3479	* But submitting the same page twice in a row to
3480	* splice() - or others? - can result in confusion:
3481	* so don't attempt that optimization on pipes etc.
3482	*/
3483	ret = iov_iter_zero(bytes: nr, to);
3484	}
3485
3486	retval += ret;
3487	iocb->ki_pos += ret;
3488
3489	if (!iov_iter_count(i: to))
3490	break;
3491	if (ret < nr) {
3492	error = -EFAULT;
3493	break;
3494	}
3495	cond_resched();
3496	}
3497
3498	file_accessed(file);
3499	return retval ? retval : error;
3500	}
3501
3502	static ssize_t shmem_file_write_iter(struct kiocb iocb, struct* iov_iter *from)
3503	{
3504	struct file *file = iocb->ki_filp;
3505	struct inode *inode = file->f_mapping->host;
3506	ssize_t ret;
3507
3508	inode_lock(inode);
3509	ret = generic_write_checks(iocb, from);
3510	if (ret <= `0`)
3511	goto unlock;
3512	ret = file_remove_privs(file);
3513	if (ret)
3514	goto unlock;
3515	ret = file_update_time(file);
3516	if (ret)
3517	goto unlock;
3518	ret = generic_perform_write(iocb, from);
3519	unlock:
3520	inode_unlock(inode);
3521	return ret;
3522	}
3523
/*
 * ->get for pipe buffers that point at the zero page: always reports
 * success and takes no reference.
 */
static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	return true;
}
3529
/* ->release for zero-page pipe buffers: nothing to drop, so a no-op. */
static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
}
3534
/* ->try_steal for zero-page pipe buffers: stealing is always refused. */
static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return false;
}
3540
3541	static const struct pipe_buf_operations zero_pipe_buf_ops = {
3542	.release = zero_pipe_buf_release,
3543	.try_steal = zero_pipe_buf_try_steal,
3544	.get = zero_pipe_buf_get,
3545	};
3546
3547	static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
3548	loff_t fpos, size_t size)
3549	{
3550	size_t offset = fpos & ~PAGE_MASK;
3551
3552	size = min_t(size_t, size, PAGE_SIZE - offset);
3553
3554	if (!pipe_is_full(pipe)) {
3555	struct pipe_buffer *buf = pipe_head_buf(pipe);
3556
3557	buf = (struct* pipe_buffer) {
3558	.ops = &zero_pipe_buf_ops,
3559	.page = ZERO_PAGE(`0`),
3560	.offset = offset,
3561	.len = size,
3562	};
3563	pipe->head++;
3564	}
3565
3566	return size;
3567	}
3568
3569	static ssize_t shmem_file_splice_read(struct file in, loff_t ppos,
3570	struct pipe_inode_info *pipe,
3571	size_t len, unsigned int flags)
3572	{
3573	struct inode *inode = file_inode(f: in);
3574	struct address_space *mapping = inode->i_mapping;
3575	struct folio *folio = NULL;
3576	size_t total_spliced = `0`, used, npages, n, part;
3577	loff_t isize;
3578	int error = `0`;
3579
3580	/ Work out how much data we can actually add into the pipe /
3581	used = pipe_buf_usage(pipe);
3582	npages = max_t(ssize_t, pipe->max_usage - used, `0`);
3583	len = min_t(size_t, len, npages * PAGE_SIZE);
3584
3585	do {
3586	bool fallback_page_splice = false;
3587	struct page *page = NULL;
3588	pgoff_t index;
3589	size_t size;
3590
3591	if (*ppos >= i_size_read(inode))
3592	break;
3593
3594	index = *ppos >> PAGE_SHIFT;
3595	error = shmem_get_folio(inode, index, `0`, &folio, SGP_READ);
3596	if (error) {
3597	if (error == -EINVAL)
3598	error = `0`;
3599	break;
3600	}
3601	if (folio) {
3602	folio_unlock(folio);
3603
3604	page = folio_file_page(folio, index);
3605	if (PageHWPoison(page)) {
3606	error = -EIO;
3607	break;
3608	}
3609
3610	if (folio_test_large(folio) &&
3611	folio_test_has_hwpoisoned(folio))
3612	fallback_page_splice = true;
3613	}
3614
3615	/*
3616	* i_size must be checked after we know the pages are Uptodate.
3617	*
3618	* Checking i_size after the check allows us to calculate
3619	* the correct value for "nr", which means the zero-filled
3620	* part of the page is not copied back to userspace (unless
3621	* another truncate extends the file - this is desired though).
3622	*/
3623	isize = i_size_read(inode);
3624	if (unlikely(*ppos >= isize))
3625	break;
3626	/*
3627	* Fallback to PAGE_SIZE splice if the large folio has hwpoisoned
3628	* pages.
3629	*/
3630	size = len;
3631	if (unlikely(fallback_page_splice)) {
3632	size_t offset = *ppos & ~PAGE_MASK;
3633
3634	size = umin(size, PAGE_SIZE - offset);
3635	}
3636	part = min_t(loff_t, isize - *ppos, size);
3637
3638	if (folio) {
3639	/*
3640	* If users can be writing to this page using arbitrary
3641	* virtual addresses, take care about potential aliasing
3642	* before reading the page on the kernel side.
3643	*/
3644	if (mapping_writably_mapped(mapping)) {
3645	if (likely(!fallback_page_splice))
3646	flush_dcache_folio(folio);
3647	else
3648	flush_dcache_page(page);
3649	}
3650	folio_mark_accessed(folio);
3651	/*
3652	* Ok, we have the page, and it's up-to-date, so we can
3653	* now splice it into the pipe.
3654	*/
3655	n = splice_folio_into_pipe(pipe, folio, fpos: *ppos, size: part);
3656	folio_put(folio);
3657	folio = NULL;
3658	} else {
3659	n = splice_zeropage_into_pipe(pipe, fpos: *ppos, size: part);
3660	}
3661
3662	if (!n)
3663	break;
3664	len -= n;
3665	total_spliced += n;
3666	*ppos += n;
3667	in->f_ra.prev_pos = *ppos;
3668	if (pipe_is_full(pipe))
3669	break;
3670
3671	cond_resched();
3672	} while (len);
3673
3674	if (folio)
3675	folio_put(folio);
3676
3677	file_accessed(file: in);
3678	return total_spliced ? total_spliced : error;
3679	}
3680
3681	static loff_t shmem_file_llseek(struct file file, loff_t offset, int* whence)
3682	{
3683	struct address_space *mapping = file->f_mapping;
3684	struct inode *inode = mapping->host;
3685
3686	if (whence != SEEK_DATA && whence != SEEK_HOLE)
3687	return generic_file_llseek_size(file, offset, whence,
3688	MAX_LFS_FILESIZE, eof: i_size_read(inode));
3689	if (offset < `0`)
3690	return -ENXIO;
3691
3692	inode_lock(inode);
3693	/ We're holding i_rwsem so we can access i_size directly /
3694	offset = mapping_seek_hole_data(mapping, start: offset, end: inode->i_size, whence);
3695	if (offset >= `0`)
3696	offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
3697	inode_unlock(inode);
3698	return offset;
3699	}
3700
3701	static long shmem_fallocate(struct file file, int* mode, loff_t offset,
3702	loff_t len)
3703	{
3704	struct inode *inode = file_inode(f: file);
3705	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
3706	struct shmem_inode_info *info = SHMEM_I(inode);
3707	struct shmem_falloc shmem_falloc;
3708	pgoff_t start, index, end, undo_fallocend;
3709	int error;
3710
3711	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
3712	return -EOPNOTSUPP;
3713
3714	inode_lock(inode);
3715
3716	if (info->flags & SHMEM_F_MAPPING_FROZEN) {
3717	error = -EPERM;
3718	goto out;
3719	}
3720
3721	if (mode & FALLOC_FL_PUNCH_HOLE) {
3722	struct address_space *mapping = file->f_mapping;
3723	loff_t unmap_start = round_up(offset, PAGE_SIZE);
3724	loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - `1`;
3725	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
3726
3727	/ protected by i_rwsem /
3728	if (info->seals & (F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE)) {
3729	error = -EPERM;
3730	goto out;
3731	}
3732
3733	shmem_falloc.waitq = &shmem_falloc_waitq;
3734	shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
3735	shmem_falloc.next = (unmap_end + `1`) >> PAGE_SHIFT;
3736	spin_lock(lock: &inode->i_lock);
3737	inode->i_private = &shmem_falloc;
3738	spin_unlock(lock: &inode->i_lock);
3739
3740	if ((u64)unmap_end > (u64)unmap_start)
3741	unmap_mapping_range(mapping, holebegin: unmap_start,
3742	holelen: `1` + unmap_end - unmap_start, even_cows: `0`);
3743	shmem_truncate_range(inode, offset, offset + len - `1`);
3744	/ No need to unmap again: hole-punching leaves COWed pages /
3745
3746	spin_lock(lock: &inode->i_lock);
3747	inode->i_private = NULL;
3748	wake_up_all(&shmem_falloc_waitq);
3749	WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
3750	spin_unlock(lock: &inode->i_lock);
3751	error = `0`;
3752	goto out;
3753	}
3754
3755	/ We need to check rlimit even when FALLOC_FL_KEEP_SIZE /
3756	error = inode_newsize_ok(inode, offset: offset + len);
3757	if (error)
3758	goto out;
3759
3760	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
3761	error = -EPERM;
3762	goto out;
3763	}
3764
3765	start = offset >> PAGE_SHIFT;
3766	end = (offset + len + PAGE_SIZE - `1`) >> PAGE_SHIFT;
3767	/ Try to avoid a swapstorm if len is impossible to satisfy /
3768	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
3769	error = -ENOSPC;
3770	goto out;
3771	}
3772
3773	shmem_falloc.waitq = NULL;
3774	shmem_falloc.start = start;
3775	shmem_falloc.next = start;
3776	shmem_falloc.nr_falloced = `0`;
3777	shmem_falloc.nr_unswapped = `0`;
3778	spin_lock(lock: &inode->i_lock);
3779	inode->i_private = &shmem_falloc;
3780	spin_unlock(lock: &inode->i_lock);
3781
3782	/*
3783	* info->fallocend is only relevant when huge pages might be
3784	* involved: to prevent split_huge_page() freeing fallocated
3785	* pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
3786	*/
3787	undo_fallocend = info->fallocend;
3788	if (info->fallocend < end)
3789	info->fallocend = end;
3790
3791	for (index = start; index < end; ) {
3792	struct folio *folio;
3793
3794	/*
3795	* Check for fatal signal so that we abort early in OOM
3796	* situations. We don't want to abort in case of non-fatal
3797	* signals as large fallocate can take noticeable time and
3798	* e.g. periodic timers may result in fallocate constantly
3799	* restarting.
3800	*/
3801	if (fatal_signal_pending(current))
3802	error = -EINTR;
3803	else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
3804	error = -ENOMEM;
3805	else
3806	error = shmem_get_folio(inode, index, offset + len,
3807	&folio, SGP_FALLOC);
3808	if (error) {
3809	info->fallocend = undo_fallocend;
3810	/ Remove the !uptodate folios we added /
3811	if (index > start) {
3812	shmem_undo_range(inode,
3813	lstart: (loff_t)start << PAGE_SHIFT,
3814	lend: ((loff_t)index << PAGE_SHIFT) - `1`, unfalloc: true);
3815	}
3816	goto undone;
3817	}
3818
3819	/*
3820	* Here is a more important optimization than it appears:
3821	* a second SGP_FALLOC on the same large folio will clear it,
3822	* making it uptodate and un-undoable if we fail later.
3823	*/
3824	index = folio_next_index(folio);
3825	/ Beware 32-bit wraparound /
3826	if (!index)
3827	index--;
3828
3829	/*
3830	* Inform shmem_writeout() how far we have reached.
3831	* No need for lock or barrier: we have the page lock.
3832	*/
3833	if (!folio_test_uptodate(folio))
3834	shmem_falloc.nr_falloced += index - shmem_falloc.next;
3835	shmem_falloc.next = index;
3836
3837	/*
3838	* If !uptodate, leave it that way so that freeable folios
3839	* can be recognized if we need to rollback on error later.
3840	* But mark it dirty so that memory pressure will swap rather
3841	* than free the folios we are allocating (and SGP_CACHE folios
3842	* might still be clean: we now need to mark those dirty too).
3843	*/
3844	folio_mark_dirty(folio);
3845	folio_unlock(folio);
3846	folio_put(folio);
3847	cond_resched();
3848	}
3849
3850	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
3851	i_size_write(inode, i_size: offset + len);
3852	undone:
3853	spin_lock(lock: &inode->i_lock);
3854	inode->i_private = NULL;
3855	spin_unlock(lock: &inode->i_lock);
3856	out:
3857	if (!error)
3858	file_modified(file);
3859	inode_unlock(inode);
3860	return error;
3861	}
3862
3863	static int shmem_statfs(struct dentry dentry, struct* kstatfs *buf)
3864	{
3865	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: dentry->d_sb);
3866
3867	buf->f_type = TMPFS_MAGIC;
3868	buf->f_bsize = PAGE_SIZE;
3869	buf->f_namelen = NAME_MAX;
3870	if (sbinfo->max_blocks) {
3871	buf->f_blocks = sbinfo->max_blocks;
3872	buf->f_bavail =
3873	buf->f_bfree = sbinfo->max_blocks -
3874	percpu_counter_sum(fbc: &sbinfo->used_blocks);
3875	}
3876	if (sbinfo->max_inodes) {
3877	buf->f_files = sbinfo->max_inodes;
3878	buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
3879	}
3880	/ else leave those fields 0 like simple_statfs /
3881
3882	buf->f_fsid = uuid_to_fsid(uuid: dentry->d_sb->s_uuid.b);
3883
3884	return `0`;
3885	}
3886
3887	/*
3888	* File creation. Allocate an inode, and we're done..
3889	*/
3890	static int
3891	shmem_mknod(struct mnt_idmap idmap, struct* inode *dir,
3892	struct dentry *dentry, umode_t mode, dev_t dev)
3893	{
3894	struct inode *inode;
3895	int error;
3896
3897	if (!generic_ci_validate_strict_name(dir, name: &dentry->d_name))
3898	return -EINVAL;
3899
3900	inode = shmem_get_inode(idmap, sb: dir->i_sb, dir, mode, dev, VM_NORESERVE);
3901	if (IS_ERR(ptr: inode))
3902	return PTR_ERR(ptr: inode);
3903
3904	error = simple_acl_create(dir, inode);
3905	if (error)
3906	goto out_iput;
3907	error = security_inode_init_security(inode, dir, qstr: &dentry->d_name,
3908	initxattrs: shmem_initxattrs, NULL);
3909	if (error && error != -EOPNOTSUPP)
3910	goto out_iput;
3911
3912	error = simple_offset_add(octx: shmem_get_offset_ctx(inode: dir), dentry);
3913	if (error)
3914	goto out_iput;
3915
3916	dir->i_size += BOGO_DIRENT_SIZE;
3917	inode_set_mtime_to_ts(inode: dir, ts: inode_set_ctime_current(inode: dir));
3918	inode_inc_iversion(inode: dir);
3919
3920	d_make_persistent(dentry, inode);
3921	return error;
3922
3923	out_iput:
3924	iput(inode);
3925	return error;
3926	}
3927
3928	static int
3929	shmem_tmpfile(struct mnt_idmap idmap, struct* inode *dir,
3930	struct file *file, umode_t mode)
3931	{
3932	struct inode *inode;
3933	int error;
3934
3935	inode = shmem_get_inode(idmap, sb: dir->i_sb, dir, mode, dev: `0`, VM_NORESERVE);
3936	if (IS_ERR(ptr: inode)) {
3937	error = PTR_ERR(ptr: inode);
3938	goto err_out;
3939	}
3940	error = security_inode_init_security(inode, dir, NULL,
3941	initxattrs: shmem_initxattrs, NULL);
3942	if (error && error != -EOPNOTSUPP)
3943	goto out_iput;
3944	error = simple_acl_create(dir, inode);
3945	if (error)
3946	goto out_iput;
3947	d_tmpfile(file, inode);
3948
3949	err_out:
3950	return finish_open_simple(file, error);
3951	out_iput:
3952	iput(inode);
3953	return error;
3954	}
3955
3956	static struct dentry shmem_mkdir(struct* mnt_idmap idmap, struct* inode *dir,
3957	struct dentry *dentry, umode_t mode)
3958	{
3959	int error;
3960
3961	error = shmem_mknod(idmap, dir, dentry, mode: mode \| S_IFDIR, dev: `0`);
3962	if (error)
3963	return ERR_PTR(error);
3964	inc_nlink(inode: dir);
3965	return NULL;
3966	}
3967
3968	static int shmem_create(struct mnt_idmap idmap, struct* inode *dir,
3969	struct dentry *dentry, umode_t mode, bool excl)
3970	{
3971	return shmem_mknod(idmap, dir, dentry, mode: mode \| S_IFREG, dev: `0`);
3972	}
3973
3974	/*
3975	* Link a file..
3976	*/
3977	static int shmem_link(struct dentry old_dentry, struct* inode *dir,
3978	struct dentry *dentry)
3979	{
3980	struct inode *inode = d_inode(dentry: old_dentry);
3981	int ret;
3982
3983	/*
3984	* No ordinary (disk based) filesystem counts links as inodes;
3985	* but each new link needs a new dentry, pinning lowmem, and
3986	* tmpfs dentries cannot be pruned until they are unlinked.
3987	* But if an O_TMPFILE file is linked into the tmpfs, the
3988	* first link must skip that, to get the accounting right.
3989	*/
3990	if (inode->i_nlink) {
3991	ret = shmem_reserve_inode(sb: inode->i_sb, NULL);
3992	if (ret)
3993	return ret;
3994	}
3995
3996	ret = simple_offset_add(octx: shmem_get_offset_ctx(inode: dir), dentry);
3997	if (ret) {
3998	if (inode->i_nlink)
3999	shmem_free_inode(sb: inode->i_sb, freed_ispace: `0`);
4000	return ret;
4001	}
4002
4003	dir->i_size += BOGO_DIRENT_SIZE;
4004	inode_inc_iversion(inode: dir);
4005	return simple_link(old_dentry, dir, dentry);
4006	}
4007
4008	static int shmem_unlink(struct inode dir, struct* dentry *dentry)
4009	{
4010	struct inode *inode = d_inode(dentry);
4011
4012	if (inode->i_nlink > `1` && !S_ISDIR(inode->i_mode))
4013	shmem_free_inode(sb: inode->i_sb, freed_ispace: `0`);
4014
4015	simple_offset_remove(octx: shmem_get_offset_ctx(inode: dir), dentry);
4016
4017	dir->i_size -= BOGO_DIRENT_SIZE;
4018	inode_inc_iversion(inode: dir);
4019	simple_unlink(dir, dentry);
4020
4021	/*
4022	* For now, VFS can't deal with case-insensitive negative dentries, so
4023	* we invalidate them
4024	*/
4025	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
4026	d_invalidate(dentry);
4027
4028	return `0`;
4029	}
4030
4031	static int shmem_rmdir(struct inode dir, struct* dentry *dentry)
4032	{
4033	if (!simple_empty(dentry))
4034	return -ENOTEMPTY;
4035
4036	drop_nlink(inode: d_inode(dentry));
4037	drop_nlink(inode: dir);
4038	return shmem_unlink(dir, dentry);
4039	}
4040
4041	static int shmem_whiteout(struct mnt_idmap *idmap,
4042	struct inode old_dir, struct* dentry *old_dentry)
4043	{
4044	struct dentry *whiteout;
4045	int error;
4046
4047	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
4048	if (!whiteout)
4049	return -ENOMEM;
4050	error = shmem_mknod(idmap, dir: old_dir, dentry: whiteout,
4051	S_IFCHR \| WHITEOUT_MODE, WHITEOUT_DEV);
4052	dput(whiteout);
4053	return error;
4054	}
4055
4056	/*
4057	* The VFS layer already does all the dentry stuff for rename,
4058	* we just have to decrement the usage count for the target if
4059	* it exists so that the VFS layer correctly free's it when it
4060	* gets overwritten.
4061	*/
4062	static int shmem_rename2(struct mnt_idmap *idmap,
4063	struct inode old_dir, struct* dentry *old_dentry,
4064	struct inode new_dir, struct* dentry *new_dentry,
4065	unsigned int flags)
4066	{
4067	struct inode *inode = d_inode(dentry: old_dentry);
4068	int they_are_dirs = S_ISDIR(inode->i_mode);
4069	bool had_offset = false;
4070	int error;
4071
4072	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
4073	return -EINVAL;
4074
4075	if (flags & RENAME_EXCHANGE)
4076	return simple_offset_rename_exchange(old_dir, old_dentry,
4077	new_dir, new_dentry);
4078
4079	if (!simple_empty(new_dentry))
4080	return -ENOTEMPTY;
4081
4082	error = simple_offset_add(octx: shmem_get_offset_ctx(inode: new_dir), dentry: new_dentry);
4083	if (error == -EBUSY)
4084	had_offset = true;
4085	else if (unlikely(error))
4086	return error;
4087
4088	if (flags & RENAME_WHITEOUT) {
4089	error = shmem_whiteout(idmap, old_dir, old_dentry);
4090	if (error) {
4091	if (!had_offset)
4092	simple_offset_remove(octx: shmem_get_offset_ctx(inode: new_dir),
4093	dentry: new_dentry);
4094	return error;
4095	}
4096	}
4097
4098	simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
4099	if (d_really_is_positive(dentry: new_dentry)) {
4100	(void) shmem_unlink(dir: new_dir, dentry: new_dentry);
4101	if (they_are_dirs) {
4102	drop_nlink(inode: d_inode(dentry: new_dentry));
4103	drop_nlink(inode: old_dir);
4104	}
4105	} else if (they_are_dirs) {
4106	drop_nlink(inode: old_dir);
4107	inc_nlink(inode: new_dir);
4108	}
4109
4110	old_dir->i_size -= BOGO_DIRENT_SIZE;
4111	new_dir->i_size += BOGO_DIRENT_SIZE;
4112	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
4113	inode_inc_iversion(inode: old_dir);
4114	inode_inc_iversion(inode: new_dir);
4115	return `0`;
4116	}
4117
4118	static int shmem_symlink(struct mnt_idmap idmap, struct* inode *dir,
4119	struct dentry dentry, const* char *symname)
4120	{
4121	int error;
4122	int len;
4123	struct inode *inode;
4124	struct folio *folio;
4125	char *link;
4126
4127	len = strlen(symname) + `1`;
4128	if (len > PAGE_SIZE)
4129	return -ENAMETOOLONG;
4130
4131	inode = shmem_get_inode(idmap, sb: dir->i_sb, dir, S_IFLNK \| `0777`, dev: `0`,
4132	VM_NORESERVE);
4133	if (IS_ERR(ptr: inode))
4134	return PTR_ERR(ptr: inode);
4135
4136	error = security_inode_init_security(inode, dir, qstr: &dentry->d_name,
4137	initxattrs: shmem_initxattrs, NULL);
4138	if (error && error != -EOPNOTSUPP)
4139	goto out_iput;
4140
4141	error = simple_offset_add(octx: shmem_get_offset_ctx(inode: dir), dentry);
4142	if (error)
4143	goto out_iput;
4144
4145	inode->i_size = len-`1`;
4146	if (len <= SHORT_SYMLINK_LEN) {
4147	link = kmemdup(symname, len, GFP_KERNEL);
4148	if (!link) {
4149	error = -ENOMEM;
4150	goto out_remove_offset;
4151	}
4152	inode->i_op = &shmem_short_symlink_operations;
4153	inode_set_cached_link(inode, link, linklen: len - `1`);
4154	} else {
4155	inode_nohighmem(inode);
4156	inode->i_mapping->a_ops = &shmem_aops;
4157	error = shmem_get_folio(inode, `0`, `0`, &folio, SGP_WRITE);
4158	if (error)
4159	goto out_remove_offset;
4160	inode->i_op = &shmem_symlink_inode_operations;
4161	memcpy(folio_address(folio), symname, len);
4162	folio_mark_uptodate(folio);
4163	folio_mark_dirty(folio);
4164	folio_unlock(folio);
4165	folio_put(folio);
4166	}
4167	dir->i_size += BOGO_DIRENT_SIZE;
4168	inode_set_mtime_to_ts(inode: dir, ts: inode_set_ctime_current(inode: dir));
4169	inode_inc_iversion(inode: dir);
4170	d_make_persistent(dentry, inode);
4171	return `0`;
4172
4173	out_remove_offset:
4174	simple_offset_remove(octx: shmem_get_offset_ctx(inode: dir), dentry);
4175	out_iput:
4176	iput(inode);
4177	return error;
4178	}
4179
/*
 * Delayed-call callback installed by shmem_get_link(): @arg is the folio
 * that backs the link text. Mark it accessed and drop the reference taken
 * when the link was looked up.
 */
static void shmem_put_link(void *arg)
{
	struct folio *folio = arg;

	folio_mark_accessed(folio);
	folio_put(folio);
}
4185
4186	static const char shmem_get_link(struct* dentry dentry, struct* inode *inode,
4187	struct delayed_call *done)
4188	{
4189	struct folio *folio = NULL;
4190	int error;
4191
4192	if (!dentry) {
4193	folio = filemap_get_folio(mapping: inode->i_mapping, index: `0`);
4194	if (IS_ERR(ptr: folio))
4195	return ERR_PTR(error: -ECHILD);
4196	if (PageHWPoison(folio_page(folio, `0`)) \|\|
4197	!folio_test_uptodate(folio)) {
4198	folio_put(folio);
4199	return ERR_PTR(error: -ECHILD);
4200	}
4201	} else {
4202	error = shmem_get_folio(inode, `0`, `0`, &folio, SGP_READ);
4203	if (error)
4204	return ERR_PTR(error);
4205	if (!folio)
4206	return ERR_PTR(error: -ECHILD);
4207	if (PageHWPoison(folio_page(folio, `0`))) {
4208	folio_unlock(folio);
4209	folio_put(folio);
4210	return ERR_PTR(error: -ECHILD);
4211	}
4212	folio_unlock(folio);
4213	}
4214	set_delayed_call(call: done, fn: shmem_put_link, arg: folio);
4215	return folio_address(folio);
4216	}
4217
4218	#ifdef CONFIG_TMPFS_XATTR
4219
4220	static int shmem_fileattr_get(struct dentry dentry, struct* file_kattr *fa)
4221	{
4222	struct shmem_inode_info *info = SHMEM_I(inode: d_inode(dentry));
4223
4224	fileattr_fill_flags(fa, flags: info->fsflags & SHMEM_FL_USER_VISIBLE);
4225
4226	return `0`;
4227	}
4228
4229	static int shmem_fileattr_set(struct mnt_idmap *idmap,
4230	struct dentry dentry, struct* file_kattr *fa)
4231	{
4232	struct inode *inode = d_inode(dentry);
4233	struct shmem_inode_info *info = SHMEM_I(inode);
4234	int ret, flags;
4235
4236	if (fileattr_has_fsx(fa))
4237	return -EOPNOTSUPP;
4238	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
4239	return -EOPNOTSUPP;
4240
4241	flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) \|
4242	(fa->flags & SHMEM_FL_USER_MODIFIABLE);
4243
4244	ret = shmem_set_inode_flags(inode, fsflags: flags, dentry);
4245
4246	if (ret)
4247	return ret;
4248
4249	info->fsflags = flags;
4250
4251	inode_set_ctime_current(inode);
4252	inode_inc_iversion(inode);
4253	return `0`;
4254	}
4255
4256	/*
4257	* Superblocks without xattr inode operations may get some security.* xattr
4258	* support from the LSM "for free". As soon as we have any other xattrs
4259	* like ACLs, we also need to implement the security.* handlers at
4260	* filesystem level, though.
4261	*/
4262
4263	/*
4264	* Callback for security_inode_init_security() for acquiring xattrs.
4265	*/
4266	static int shmem_initxattrs(struct inode *inode,
4267	const struct xattr xattr_array, void* *fs_info)
4268	{
4269	struct shmem_inode_info *info = SHMEM_I(inode);
4270	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
4271	const struct xattr *xattr;
4272	struct simple_xattr *new_xattr;
4273	size_t ispace = `0`;
4274	size_t len;
4275
4276	if (sbinfo->max_inodes) {
4277	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4278	ispace += simple_xattr_space(name: xattr->name,
4279	size: xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
4280	}
4281	if (ispace) {
4282	raw_spin_lock(&sbinfo->stat_lock);
4283	if (sbinfo->free_ispace < ispace)
4284	ispace = `0`;
4285	else
4286	sbinfo->free_ispace -= ispace;
4287	raw_spin_unlock(&sbinfo->stat_lock);
4288	if (!ispace)
4289	return -ENOSPC;
4290	}
4291	}
4292
4293	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4294	new_xattr = simple_xattr_alloc(value: xattr->value, size: xattr->value_len);
4295	if (!new_xattr)
4296	break;
4297
4298	len = strlen(xattr->name) + `1`;
4299	new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
4300	GFP_KERNEL_ACCOUNT);
4301	if (!new_xattr->name) {
4302	kvfree(addr: new_xattr);
4303	break;
4304	}
4305
4306	memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
4307	XATTR_SECURITY_PREFIX_LEN);
4308	memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
4309	xattr->name, len);
4310
4311	simple_xattr_add(xattrs: &info->xattrs, new_xattr);
4312	}
4313
4314	if (xattr->name != NULL) {
4315	if (ispace) {
4316	raw_spin_lock(&sbinfo->stat_lock);
4317	sbinfo->free_ispace += ispace;
4318	raw_spin_unlock(&sbinfo->stat_lock);
4319	}
4320	simple_xattrs_free(xattrs: &info->xattrs, NULL);
4321	return -ENOMEM;
4322	}
4323
4324	return `0`;
4325	}
4326
4327	static int shmem_xattr_handler_get(const struct xattr_handler *handler,
4328	struct dentry unused, struct* inode *inode,
4329	const char name, void* *buffer, size_t size)
4330	{
4331	struct shmem_inode_info *info = SHMEM_I(inode);
4332
4333	name = xattr_full_name(handler, name);
4334	return simple_xattr_get(xattrs: &info->xattrs, name, buffer, size);
4335	}
4336
4337	static int shmem_xattr_handler_set(const struct xattr_handler *handler,
4338	struct mnt_idmap *idmap,
4339	struct dentry unused, struct* inode *inode,
4340	const char name, const* void *value,
4341	size_t size, int flags)
4342	{
4343	struct shmem_inode_info *info = SHMEM_I(inode);
4344	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: inode->i_sb);
4345	struct simple_xattr *old_xattr;
4346	size_t ispace = `0`;
4347
4348	name = xattr_full_name(handler, name);
4349	if (value && sbinfo->max_inodes) {
4350	ispace = simple_xattr_space(name, size);
4351	raw_spin_lock(&sbinfo->stat_lock);
4352	if (sbinfo->free_ispace < ispace)
4353	ispace = `0`;
4354	else
4355	sbinfo->free_ispace -= ispace;
4356	raw_spin_unlock(&sbinfo->stat_lock);
4357	if (!ispace)
4358	return -ENOSPC;
4359	}
4360
4361	old_xattr = simple_xattr_set(xattrs: &info->xattrs, name, value, size, flags);
4362	if (!IS_ERR(ptr: old_xattr)) {
4363	ispace = `0`;
4364	if (old_xattr && sbinfo->max_inodes)
4365	ispace = simple_xattr_space(name: old_xattr->name,
4366	size: old_xattr->size);
4367	simple_xattr_free(xattr: old_xattr);
4368	old_xattr = NULL;
4369	inode_set_ctime_current(inode);
4370	inode_inc_iversion(inode);
4371	}
4372	if (ispace) {
4373	raw_spin_lock(&sbinfo->stat_lock);
4374	sbinfo->free_ispace += ispace;
4375	raw_spin_unlock(&sbinfo->stat_lock);
4376	}
4377	return PTR_ERR(ptr: old_xattr);
4378	}
4379
4380	static const struct xattr_handler shmem_security_xattr_handler = {
4381	.prefix = XATTR_SECURITY_PREFIX,
4382	.get = shmem_xattr_handler_get,
4383	.set = shmem_xattr_handler_set,
4384	};
4385
4386	static const struct xattr_handler shmem_trusted_xattr_handler = {
4387	.prefix = XATTR_TRUSTED_PREFIX,
4388	.get = shmem_xattr_handler_get,
4389	.set = shmem_xattr_handler_set,
4390	};
4391
4392	static const struct xattr_handler shmem_user_xattr_handler = {
4393	.prefix = XATTR_USER_PREFIX,
4394	.get = shmem_xattr_handler_get,
4395	.set = shmem_xattr_handler_set,
4396	};
4397
4398	static const struct xattr_handler * const shmem_xattr_handlers[] = {
4399	&shmem_security_xattr_handler,
4400	&shmem_trusted_xattr_handler,
4401	&shmem_user_xattr_handler,
4402	NULL
4403	};
4404
4405	static ssize_t shmem_listxattr(struct dentry dentry, char* *buffer, size_t size)
4406	{
4407	struct shmem_inode_info *info = SHMEM_I(inode: d_inode(dentry));
4408	return simple_xattr_list(inode: d_inode(dentry), xattrs: &info->xattrs, buffer, size);
4409	}
4410	#endif /* CONFIG_TMPFS_XATTR */
4411
4412	static const struct inode_operations shmem_short_symlink_operations = {
4413	.getattr = shmem_getattr,
4414	.setattr = shmem_setattr,
4415	.get_link = simple_get_link,
4416	#ifdef CONFIG_TMPFS_XATTR
4417	.listxattr = shmem_listxattr,
4418	#endif
4419	};
4420
4421	static const struct inode_operations shmem_symlink_inode_operations = {
4422	.getattr = shmem_getattr,
4423	.setattr = shmem_setattr,
4424	.get_link = shmem_get_link,
4425	#ifdef CONFIG_TMPFS_XATTR
4426	.listxattr = shmem_listxattr,
4427	#endif
4428	};
4429
4430	static struct dentry shmem_get_parent(struct* dentry *child)
4431	{
4432	return ERR_PTR(error: -ESTALE);
4433	}
4434
4435	static int shmem_match(struct inode ino, void* *vfh)
4436	{
4437	__u32 *fh = vfh;
4438	__u64 inum = fh[`2`];
4439	inum = (inum << `32`) \| fh[`1`];
4440	return ino->i_ino == inum && fh[`0`] == ino->i_generation;
4441	}
4442
/* Find any alias of inode, but prefer a hashed alias */
static struct dentry *shmem_find_alias(struct inode *inode)
{
	struct dentry *alias = d_find_alias(inode);

	if (!alias)
		alias = d_find_any_alias(inode);

	return alias;
}
4450
4451	static struct dentry shmem_fh_to_dentry(struct* super_block *sb,
4452	struct fid fid, int* fh_len, int fh_type)
4453	{
4454	struct inode *inode;
4455	struct dentry *dentry = NULL;
4456	u64 inum;
4457
4458	if (fh_len < `3`)
4459	return NULL;
4460
4461	inum = fid->raw[`2`];
4462	inum = (inum << `32`) \| fid->raw[`1`];
4463
4464	inode = ilookup5(sb, hashval: (unsigned long)(inum + fid->raw[`0`]),
4465	test: shmem_match, data: fid->raw);
4466	if (inode) {
4467	dentry = shmem_find_alias(inode);
4468	iput(inode);
4469	}
4470
4471	return dentry;
4472	}
4473
4474	static int shmem_encode_fh(struct inode inode, __u32 fh, int *len,
4475	struct inode *parent)
4476	{
4477	if (*len < `3`) {
4478	*len = `3`;
4479	return FILEID_INVALID;
4480	}
4481
4482	if (inode_unhashed(inode)) {
4483	/ Unfortunately insert_inode_hash is not idempotent,*
4484	* so as we hash inodes here rather than at creation
4485	* time, we need a lock to ensure we only try
4486	* to do it once
4487	*/
4488	static DEFINE_SPINLOCK(lock);
4489	spin_lock(lock: &lock);
4490	if (inode_unhashed(inode))
4491	__insert_inode_hash(inode,
4492	hashval: inode->i_ino + inode->i_generation);
4493	spin_unlock(lock: &lock);
4494	}
4495
4496	fh[`0`] = inode->i_generation;
4497	fh[`1`] = inode->i_ino;
4498	fh[`2`] = ((__u64)inode->i_ino) >> `32`;
4499
4500	*len = `3`;
4501	return `1`;
4502	}
4503
4504	static const struct export_operations shmem_export_ops = {
4505	.get_parent = shmem_get_parent,
4506	.encode_fh = shmem_encode_fh,
4507	.fh_to_dentry = shmem_fh_to_dentry,
4508	};
4509
/*
 * Identifiers for the mount parameters listed in shmem_fs_parameters[];
 * shmem_parse_one() switches on these.
 */
enum shmem_param {
	/* ownership, mode, sizing and placement */
	Opt_gid,
	Opt_huge,
	Opt_mode,
	Opt_mpol,
	Opt_nr_blocks,
	Opt_nr_inodes,
	Opt_size,
	Opt_uid,
	/* inode number width and swap */
	Opt_inode32,
	Opt_inode64,
	Opt_noswap,
	/* quota */
	Opt_quota,
	Opt_usrquota,
	Opt_grpquota,
	Opt_usrquota_block_hardlimit,
	Opt_usrquota_inode_hardlimit,
	Opt_grpquota_block_hardlimit,
	Opt_grpquota_inode_hardlimit,
	/* case folding */
	Opt_casefold_version,
	Opt_casefold,
	Opt_strict_encoding,
};
4533
4534	static const struct constant_table shmem_param_enums_huge[] = {
4535	{"never", SHMEM_HUGE_NEVER },
4536	{"always", SHMEM_HUGE_ALWAYS },
4537	{"within_size", SHMEM_HUGE_WITHIN_SIZE },
4538	{"advise", SHMEM_HUGE_ADVISE },
4539	{}
4540	};
4541
4542	const struct fs_parameter_spec shmem_fs_parameters[] = {
4543	fsparam_gid ("gid", Opt_gid),
4544	fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
4545	fsparam_u32oct("mode", Opt_mode),
4546	fsparam_string("mpol", Opt_mpol),
4547	fsparam_string("nr_blocks", Opt_nr_blocks),
4548	fsparam_string("nr_inodes", Opt_nr_inodes),
4549	fsparam_string("size", Opt_size),
4550	fsparam_uid ("uid", Opt_uid),
4551	fsparam_flag ("inode32", Opt_inode32),
4552	fsparam_flag ("inode64", Opt_inode64),
4553	fsparam_flag ("noswap", Opt_noswap),
4554	#ifdef CONFIG_TMPFS_QUOTA
4555	fsparam_flag ("quota", Opt_quota),
4556	fsparam_flag ("usrquota", Opt_usrquota),
4557	fsparam_flag ("grpquota", Opt_grpquota),
4558	fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
4559	fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
4560	fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
4561	fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
4562	#endif
4563	fsparam_string("casefold", Opt_casefold_version),
4564	fsparam_flag ("casefold", Opt_casefold),
4565	fsparam_flag ("strict_encoding", Opt_strict_encoding),
4566	{}
4567	};
4568
4569	#if IS_ENABLED(CONFIG_UNICODE)
4570	static int shmem_parse_opt_casefold(struct fs_context fc, struct* fs_parameter *param,
4571	bool latest_version)
4572	{
4573	struct shmem_options *ctx = fc->fs_private;
4574	int version = UTF8_LATEST;
4575	struct unicode_map *encoding;
4576	char *version_str = param->string + `5`;
4577
4578	if (!latest_version) {
4579	if (strncmp(param->string, "utf8-", `5`))
4580	return invalfc(fc, "Only UTF-8 encodings are supported "
4581	"in the format: utf8-<version number>");
4582
4583	version = utf8_parse_version(version: version_str);
4584	if (version < `0`)
4585	return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
4586	}
4587
4588	encoding = utf8_load(version);
4589
4590	if (IS_ERR(ptr: encoding)) {
4591	return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n",
4592	unicode_major(version), unicode_minor(version),
4593	unicode_rev(version));
4594	}
4595
4596	pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n",
4597	unicode_major(version), unicode_minor(version), unicode_rev(version));
4598
4599	ctx->encoding = encoding;
4600
4601	return `0`;
4602	}
4603	#else
4604	static int shmem_parse_opt_casefold(struct fs_context fc, struct* fs_parameter *param,
4605	bool latest_version)
4606	{
4607	return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
4608	}
4609	#endif
4610
4611	static int shmem_parse_one(struct fs_context fc, struct* fs_parameter *param)
4612	{
4613	struct shmem_options *ctx = fc->fs_private;
4614	struct fs_parse_result result;
4615	unsigned long long size;
4616	char *rest;
4617	int opt;
4618	kuid_t kuid;
4619	kgid_t kgid;
4620
4621	opt = fs_parse(fc, desc: shmem_fs_parameters, param, result: &result);
4622	if (opt < `0`)
4623	return opt;
4624
4625	switch (opt) {
4626	case Opt_size:
4627	size = memparse(ptr: param->string, retptr: &rest);
4628	if (*rest == `'%'`) {
4629	size <<= PAGE_SHIFT;
4630	size *= totalram_pages();
4631	do_div(size, `100`);
4632	rest++;
4633	}
4634	if (*rest)
4635	goto bad_value;
4636	ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
4637	ctx->seen \|= SHMEM_SEEN_BLOCKS;
4638	break;
4639	case Opt_nr_blocks:
4640	ctx->blocks = memparse(ptr: param->string, retptr: &rest);
4641	if (*rest \|\| ctx->blocks > LONG_MAX)
4642	goto bad_value;
4643	ctx->seen \|= SHMEM_SEEN_BLOCKS;
4644	break;
4645	case Opt_nr_inodes:
4646	ctx->inodes = memparse(ptr: param->string, retptr: &rest);
4647	if (*rest \|\| ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
4648	goto bad_value;
4649	ctx->seen \|= SHMEM_SEEN_INODES;
4650	break;
4651	case Opt_mode:
4652	ctx->mode = result.uint_32 & `07777`;
4653	break;
4654	case Opt_uid:
4655	kuid = result.uid;
4656
4657	/*
4658	* The requested uid must be representable in the
4659	* filesystem's idmapping.
4660	*/
4661	if (!kuid_has_mapping(ns: fc->user_ns, uid: kuid))
4662	goto bad_value;
4663
4664	ctx->uid = kuid;
4665	break;
4666	case Opt_gid:
4667	kgid = result.gid;
4668
4669	/*
4670	* The requested gid must be representable in the
4671	* filesystem's idmapping.
4672	*/
4673	if (!kgid_has_mapping(ns: fc->user_ns, gid: kgid))
4674	goto bad_value;
4675
4676	ctx->gid = kgid;
4677	break;
4678	case Opt_huge:
4679	ctx->huge = result.uint_32;
4680	if (ctx->huge != SHMEM_HUGE_NEVER &&
4681	!(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
4682	has_transparent_hugepage()))
4683	goto unsupported_parameter;
4684	ctx->seen \|= SHMEM_SEEN_HUGE;
4685	break;
4686	case Opt_mpol:
4687	if (IS_ENABLED(CONFIG_NUMA)) {
4688	mpol_put(pol: ctx->mpol);
4689	ctx->mpol = NULL;
4690	if (mpol_parse_str(str: param->string, mpol: &ctx->mpol))
4691	goto bad_value;
4692	break;
4693	}
4694	goto unsupported_parameter;
4695	case Opt_inode32:
4696	ctx->full_inums = false;
4697	ctx->seen \|= SHMEM_SEEN_INUMS;
4698	break;
4699	case Opt_inode64:
4700	if (sizeof(ino_t) < `8`) {
4701	return invalfc(fc,
4702	"Cannot use inode64 with <64bit inums in kernel\n");
4703	}
4704	ctx->full_inums = true;
4705	ctx->seen \|= SHMEM_SEEN_INUMS;
4706	break;
4707	case Opt_noswap:
4708	if ((fc->user_ns != &init_user_ns) \|\| !capable(CAP_SYS_ADMIN)) {
4709	return invalfc(fc,
4710	"Turning off swap in unprivileged tmpfs mounts unsupported");
4711	}
4712	ctx->noswap = true;
4713	break;
4714	case Opt_quota:
4715	if (fc->user_ns != &init_user_ns)
4716	return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4717	ctx->seen \|= SHMEM_SEEN_QUOTA;
4718	ctx->quota_types \|= (QTYPE_MASK_USR \| QTYPE_MASK_GRP);
4719	break;
4720	case Opt_usrquota:
4721	if (fc->user_ns != &init_user_ns)
4722	return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4723	ctx->seen \|= SHMEM_SEEN_QUOTA;
4724	ctx->quota_types \|= QTYPE_MASK_USR;
4725	break;
4726	case Opt_grpquota:
4727	if (fc->user_ns != &init_user_ns)
4728	return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4729	ctx->seen \|= SHMEM_SEEN_QUOTA;
4730	ctx->quota_types \|= QTYPE_MASK_GRP;
4731	break;
4732	case Opt_usrquota_block_hardlimit:
4733	size = memparse(ptr: param->string, retptr: &rest);
4734	if (*rest \|\| !size)
4735	goto bad_value;
4736	if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
4737	return invalfc(fc,
4738	"User quota block hardlimit too large.");
4739	ctx->qlimits.usrquota_bhardlimit = size;
4740	break;
4741	case Opt_grpquota_block_hardlimit:
4742	size = memparse(ptr: param->string, retptr: &rest);
4743	if (*rest \|\| !size)
4744	goto bad_value;
4745	if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
4746	return invalfc(fc,
4747	"Group quota block hardlimit too large.");
4748	ctx->qlimits.grpquota_bhardlimit = size;
4749	break;
4750	case Opt_usrquota_inode_hardlimit:
4751	size = memparse(ptr: param->string, retptr: &rest);
4752	if (*rest \|\| !size)
4753	goto bad_value;
4754	if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
4755	return invalfc(fc,
4756	"User quota inode hardlimit too large.");
4757	ctx->qlimits.usrquota_ihardlimit = size;
4758	break;
4759	case Opt_grpquota_inode_hardlimit:
4760	size = memparse(ptr: param->string, retptr: &rest);
4761	if (*rest \|\| !size)
4762	goto bad_value;
4763	if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
4764	return invalfc(fc,
4765	"Group quota inode hardlimit too large.");
4766	ctx->qlimits.grpquota_ihardlimit = size;
4767	break;
4768	case Opt_casefold_version:
4769	return shmem_parse_opt_casefold(fc, param, latest_version: false);
4770	case Opt_casefold:
4771	return shmem_parse_opt_casefold(fc, param, latest_version: true);
4772	case Opt_strict_encoding:
4773	#if IS_ENABLED(CONFIG_UNICODE)
4774	ctx->strict_encoding = true;
4775	break;
4776	#else
4777	return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
4778	#endif
4779	}
4780	return `0`;
4781
4782	unsupported_parameter:
4783	return invalfc(fc, "Unsupported parameter '%s'", param->key);
4784	bad_value:
4785	return invalfc(fc, "Bad value for '%s'", param->key);
4786	}
4787
/*
 * Option splitter for the monolithic mount-data string.
 *
 * Returns the option starting at *s, NUL-terminated in place, and advances
 * *s to the start of the next option, or sets it to NULL when this was the
 * last one. Returns NULL if *s is already NULL.
 */
static char *shmem_next_opt(char **s)
{
	char *start = *s;
	char *comma;

	if (start == NULL)
		return NULL;

	/*
	 * NUL-terminate this option: unfortunately,
	 * mount options form a comma-separated list,
	 * but mpol's nodelist may also contain commas.
	 * A comma followed by a digit is therefore kept
	 * as part of the current option.
	 */
	while ((comma = strchr(*s, ',')) != NULL) {
		*s = comma + 1;
		if (!isdigit(comma[1])) {
			*comma = '\0';
			return start;
		}
	}

	*s = NULL;
	return start;
}
4815
4816	static int shmem_parse_monolithic(struct fs_context fc, void* *data)
4817	{
4818	return vfs_parse_monolithic_sep(fc, data, sep: shmem_next_opt);
4819	}
4820
4821	/*
4822	* Reconfigure a shmem filesystem.
4823	*/
4824	static int shmem_reconfigure(struct fs_context *fc)
4825	{
4826	struct shmem_options *ctx = fc->fs_private;
4827	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: fc->root->d_sb);
4828	unsigned long used_isp;
4829	struct mempolicy *mpol = NULL;
4830	const char *err;
4831
4832	raw_spin_lock(&sbinfo->stat_lock);
4833	used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
4834
4835	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
4836	if (!sbinfo->max_blocks) {
4837	err = "Cannot retroactively limit size";
4838	goto out;
4839	}
4840	if (percpu_counter_compare(fbc: &sbinfo->used_blocks,
4841	rhs: ctx->blocks) > `0`) {
4842	err = "Too small a size for current use";
4843	goto out;
4844	}
4845	}
4846	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
4847	if (!sbinfo->max_inodes) {
4848	err = "Cannot retroactively limit inodes";
4849	goto out;
4850	}
4851	if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
4852	err = "Too few inodes for current use";
4853	goto out;
4854	}
4855	}
4856
4857	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
4858	sbinfo->next_ino > UINT_MAX) {
4859	err = "Current inum too high to switch to 32-bit inums";
4860	goto out;
4861	}
4862
4863	/*
4864	* "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap"
4865	* counterpart for (re-)enabling swap.
4866	*/
4867	if (ctx->noswap && !sbinfo->noswap) {
4868	err = "Cannot disable swap on remount";
4869	goto out;
4870	}
4871
4872	if (ctx->seen & SHMEM_SEEN_QUOTA &&
4873	!sb_any_quota_loaded(sb: fc->root->d_sb)) {
4874	err = "Cannot enable quota on remount";
4875	goto out;
4876	}
4877
4878	#ifdef CONFIG_TMPFS_QUOTA
4879	#define CHANGED_LIMIT(name) \
4880	(ctx->qlimits.name## hardlimit && \
4881	(ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
4882
4883	if (CHANGED_LIMIT(usrquota_b) \|\| CHANGED_LIMIT(usrquota_i) \|\|
4884	CHANGED_LIMIT(grpquota_b) \|\| CHANGED_LIMIT(grpquota_i)) {
4885	err = "Cannot change global quota limit on remount";
4886	goto out;
4887	}
4888	#endif /* CONFIG_TMPFS_QUOTA */
4889
4890	if (ctx->seen & SHMEM_SEEN_HUGE)
4891	sbinfo->huge = ctx->huge;
4892	if (ctx->seen & SHMEM_SEEN_INUMS)
4893	sbinfo->full_inums = ctx->full_inums;
4894	if (ctx->seen & SHMEM_SEEN_BLOCKS)
4895	sbinfo->max_blocks = ctx->blocks;
4896	if (ctx->seen & SHMEM_SEEN_INODES) {
4897	sbinfo->max_inodes = ctx->inodes;
4898	sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
4899	}
4900
4901	/*
4902	* Preserve previous mempolicy unless mpol remount option was specified.
4903	*/
4904	if (ctx->mpol) {
4905	mpol = sbinfo->mpol;
4906	sbinfo->mpol = ctx->mpol; / transfers initial ref /
4907	ctx->mpol = NULL;
4908	}
4909
4910	if (ctx->noswap)
4911	sbinfo->noswap = true;
4912
4913	raw_spin_unlock(&sbinfo->stat_lock);
4914	mpol_put(pol: mpol);
4915	return `0`;
4916	out:
4917	raw_spin_unlock(&sbinfo->stat_lock);
4918	return invalfc(fc, "%s", err);
4919	}
4920
4921	static int shmem_show_options(struct seq_file seq, struct* dentry *root)
4922	{
4923	struct shmem_sb_info *sbinfo = SHMEM_SB(sb: root->d_sb);
4924	struct mempolicy *mpol;
4925
4926	if (sbinfo->max_blocks != shmem_default_max_blocks())
4927	seq_printf(m: seq, fmt: ",size=%luk", K(sbinfo->max_blocks));
4928	if (sbinfo->max_inodes != shmem_default_max_inodes())
4929	seq_printf(m: seq, fmt: ",nr_inodes=%lu", sbinfo->max_inodes);
4930	if (sbinfo->mode != (`0777` \| S_ISVTX))
4931	seq_printf(m: seq, fmt: ",mode=%03ho", sbinfo->mode);
4932	if (!uid_eq(left: sbinfo->uid, GLOBAL_ROOT_UID))
4933	seq_printf(seq, ",uid=%u",
4934	from_kuid_munged(&init_user_ns, sbinfo->uid));
4935	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
4936	seq_printf(seq, ",gid=%u",
4937	from_kgid_munged(&init_user_ns, sbinfo->gid));
4938
4939	/*
4940	* Showing inode{64,32} might be useful even if it's the system default,
4941	* since then people don't have to resort to checking both here and
4942	* /proc/config.gz to confirm 64-bit inums were successfully applied
4943	* (which may not even exist if IKCONFIG_PROC isn't enabled).
4944	*
4945	* We hide it when inode64 isn't the default and we are using 32-bit
4946	* inodes, since that probably just means the feature isn't even under
4947	* consideration.
4948	*
4949	* As such:
4950	*
4951	* +-----------------+-----------------+
4952	* \| TMPFS_INODE64=y \| TMPFS_INODE64=n \|
4953	* +------------------+-----------------+-----------------+
4954	* \| full_inums=true \| show \| show \|
4955	* \| full_inums=false \| show \| hide \|
4956	* +------------------+-----------------+-----------------+
4957	*
4958	*/
4959	if (IS_ENABLED(CONFIG_TMPFS_INODE64) \|\| sbinfo->full_inums)
4960	seq_printf(seq, ",inode%d", (sbinfo->full_inums ? `64` : `32`));
4961	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4962	/ Rightly or wrongly, show huge mount option unmasked by shmem_huge /
4963	if (sbinfo->huge)
4964	seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
4965	#endif
4966	mpol = shmem_get_sbmpol(sbinfo);
4967	shmem_show_mpol(seq, mpol);
4968	mpol_put(mpol);
4969	if (sbinfo->noswap)
4970	seq_printf(seq, ",noswap");
4971	#ifdef CONFIG_TMPFS_QUOTA
4972	if (sb_has_quota_active(root->d_sb, USRQUOTA))
4973	seq_printf(seq, ",usrquota");
4974	if (sb_has_quota_active(root->d_sb, GRPQUOTA))
4975	seq_printf(seq, ",grpquota");
4976	if (sbinfo->qlimits.usrquota_bhardlimit)
4977	seq_printf(seq, ",usrquota_block_hardlimit=%lld",
4978	sbinfo->qlimits.usrquota_bhardlimit);
4979	if (sbinfo->qlimits.grpquota_bhardlimit)
4980	seq_printf(seq, ",grpquota_block_hardlimit=%lld",
4981	sbinfo->qlimits.grpquota_bhardlimit);
4982	if (sbinfo->qlimits.usrquota_ihardlimit)
4983	seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
4984	sbinfo->qlimits.usrquota_ihardlimit);
4985	if (sbinfo->qlimits.grpquota_ihardlimit)
4986	seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
4987	sbinfo->qlimits.grpquota_ihardlimit);
4988	#endif
4989	return `0`;
4990	}
4991
4992	#endif /* CONFIG_TMPFS */
4993
4994	static void shmem_put_super(struct super_block *sb)
4995	{
4996	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
4997
4998	#if IS_ENABLED(CONFIG_UNICODE)
4999	if (sb->s_encoding)
5000	utf8_unload(um: sb->s_encoding);
5001	#endif
5002
5003	#ifdef CONFIG_TMPFS_QUOTA
5004	shmem_disable_quotas(sb);
5005	#endif
5006	free_percpu(pdata: sbinfo->ino_batch);
5007	percpu_counter_destroy(fbc: &sbinfo->used_blocks);
5008	mpol_put(pol: sbinfo->mpol);
5009	kfree(objp: sbinfo);
5010	sb->s_fs_info = NULL;
5011	}
5012
5013	#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS)
5014	static const struct dentry_operations shmem_ci_dentry_ops = {
5015	.d_hash = generic_ci_d_hash,
5016	.d_compare = generic_ci_d_compare,
5017	};
5018	#endif
5019
5020	static int shmem_fill_super(struct super_block sb, struct* fs_context *fc)
5021	{
5022	struct shmem_options *ctx = fc->fs_private;
5023	struct inode *inode;
5024	struct shmem_sb_info *sbinfo;
5025	int error = -ENOMEM;
5026
5027	/ Round up to L1_CACHE_BYTES to resist false sharing /
5028	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
5029	L1_CACHE_BYTES), GFP_KERNEL);
5030	if (!sbinfo)
5031	return error;
5032
5033	sb->s_fs_info = sbinfo;
5034
5035	#ifdef CONFIG_TMPFS
5036	/*
5037	* Per default we only allow half of the physical ram per
5038	* tmpfs instance, limiting inodes to one per page of lowmem;
5039	* but the internal instance is left unlimited.
5040	*/
5041	if (!(sb->s_flags & SB_KERNMOUNT)) {
5042	if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
5043	ctx->blocks = shmem_default_max_blocks();
5044	if (!(ctx->seen & SHMEM_SEEN_INODES))
5045	ctx->inodes = shmem_default_max_inodes();
5046	if (!(ctx->seen & SHMEM_SEEN_INUMS))
5047	ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
5048	sbinfo->noswap = ctx->noswap;
5049	} else {
5050	sb->s_flags \|= SB_NOUSER;
5051	}
5052	sb->s_export_op = &shmem_export_ops;
5053	sb->s_flags \|= SB_NOSEC;
5054
5055	#if IS_ENABLED(CONFIG_UNICODE)
5056	if (!ctx->encoding && ctx->strict_encoding) {
5057	pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
5058	error = -EINVAL;
5059	goto failed;
5060	}
5061
5062	if (ctx->encoding) {
5063	sb->s_encoding = ctx->encoding;
5064	set_default_d_op(sb, &shmem_ci_dentry_ops);
5065	if (ctx->strict_encoding)
5066	sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
5067	}
5068	#endif
5069
5070	#else
5071	sb->s_flags \|= SB_NOUSER;
5072	#endif /* CONFIG_TMPFS */
5073	sb->s_d_flags \|= DCACHE_DONTCACHE;
5074	sbinfo->max_blocks = ctx->blocks;
5075	sbinfo->max_inodes = ctx->inodes;
5076	sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
5077	if (sb->s_flags & SB_KERNMOUNT) {
5078	sbinfo->ino_batch = alloc_percpu(ino_t);
5079	if (!sbinfo->ino_batch)
5080	goto failed;
5081	}
5082	sbinfo->uid = ctx->uid;
5083	sbinfo->gid = ctx->gid;
5084	sbinfo->full_inums = ctx->full_inums;
5085	sbinfo->mode = ctx->mode;
5086	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5087	if (ctx->seen & SHMEM_SEEN_HUGE)
5088	sbinfo->huge = ctx->huge;
5089	else
5090	sbinfo->huge = tmpfs_huge;
5091	#endif
5092	sbinfo->mpol = ctx->mpol;
5093	ctx->mpol = NULL;
5094
5095	raw_spin_lock_init(&sbinfo->stat_lock);
5096	if (percpu_counter_init(&sbinfo->used_blocks, `0`, GFP_KERNEL))
5097	goto failed;
5098	spin_lock_init(&sbinfo->shrinklist_lock);
5099	INIT_LIST_HEAD(list: &sbinfo->shrinklist);
5100
5101	sb->s_maxbytes = MAX_LFS_FILESIZE;
5102	sb->s_blocksize = PAGE_SIZE;
5103	sb->s_blocksize_bits = PAGE_SHIFT;
5104	sb->s_magic = TMPFS_MAGIC;
5105	sb->s_op = &shmem_ops;
5106	sb->s_time_gran = `1`;
5107	#ifdef CONFIG_TMPFS_XATTR
5108	sb->s_xattr = shmem_xattr_handlers;
5109	#endif
5110	#ifdef CONFIG_TMPFS_POSIX_ACL
5111	sb->s_flags \|= SB_POSIXACL;
5112	#endif
5113	uuid_t uuid;
5114	uuid_gen(u: &uuid);
5115	super_set_uuid(sb, uuid: uuid.b, len: sizeof(uuid));
5116
5117	#ifdef CONFIG_TMPFS_QUOTA
5118	if (ctx->seen & SHMEM_SEEN_QUOTA) {
5119	sb->dq_op = &shmem_quota_operations;
5120	sb->s_qcop = &dquot_quotactl_sysfile_ops;
5121	sb->s_quota_types = QTYPE_MASK_USR \| QTYPE_MASK_GRP;
5122
5123	/ Copy the default limits from ctx into sbinfo /
5124	memcpy(&sbinfo->qlimits, &ctx->qlimits,
5125	sizeof(struct shmem_quota_limits));
5126
5127	if (shmem_enable_quotas(sb, quota_types: ctx->quota_types))
5128	goto failed;
5129	}
5130	#endif /* CONFIG_TMPFS_QUOTA */
5131
5132	inode = shmem_get_inode(idmap: &nop_mnt_idmap, sb, NULL,
5133	S_IFDIR \| sbinfo->mode, dev: `0`, VM_NORESERVE);
5134	if (IS_ERR(ptr: inode)) {
5135	error = PTR_ERR(ptr: inode);
5136	goto failed;
5137	}
5138	inode->i_uid = sbinfo->uid;
5139	inode->i_gid = sbinfo->gid;
5140	sb->s_root = d_make_root(inode);
5141	if (!sb->s_root)
5142	goto failed;
5143	return `0`;
5144
5145	failed:
5146	shmem_put_super(sb);
5147	return error;
5148	}
5149
5150	static int shmem_get_tree(struct fs_context *fc)
5151	{
5152	return get_tree_nodev(fc, fill_super: shmem_fill_super);
5153	}
5154
5155	static void shmem_free_fc(struct fs_context *fc)
5156	{
5157	struct shmem_options *ctx = fc->fs_private;
5158
5159	if (ctx) {
5160	mpol_put(pol: ctx->mpol);
5161	kfree(objp: ctx);
5162	}
5163	}
5164
5165	static const struct fs_context_operations shmem_fs_context_ops = {
5166	.free = shmem_free_fc,
5167	.get_tree = shmem_get_tree,
5168	#ifdef CONFIG_TMPFS
5169	.parse_monolithic = shmem_parse_monolithic,
5170	.parse_param = shmem_parse_one,
5171	.reconfigure = shmem_reconfigure,
5172	#endif
5173	};
5174
5175	static struct kmem_cache *shmem_inode_cachep __ro_after_init;
5176
5177	static struct inode shmem_alloc_inode(struct* super_block *sb)
5178	{
5179	struct shmem_inode_info *info;
5180	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
5181	if (!info)
5182	return NULL;
5183	return &info->vfs_inode;
5184	}
5185
5186	static void shmem_free_in_core_inode(struct inode *inode)
5187	{
5188	if (S_ISLNK(inode->i_mode))
5189	kfree(objp: inode->i_link);
5190	kmem_cache_free(s: shmem_inode_cachep, objp: SHMEM_I(inode));
5191	}
5192
5193	static void shmem_destroy_inode(struct inode *inode)
5194	{
5195	if (S_ISREG(inode->i_mode))
5196	mpol_free_shared_policy(sp: &SHMEM_I(inode)->policy);
5197	if (S_ISDIR(inode->i_mode))
5198	simple_offset_destroy(octx: shmem_get_offset_ctx(inode));
5199	}
5200
5201	static void shmem_init_inode(void *foo)
5202	{
5203	struct shmem_inode_info *info = foo;
5204	inode_init_once(&info->vfs_inode);
5205	}
5206
5207	static void __init shmem_init_inodecache(void)
5208	{
5209	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
5210	sizeof(struct shmem_inode_info),
5211	`0`, SLAB_PANIC\|SLAB_ACCOUNT, shmem_init_inode);
5212	}
5213
5214	static void __init shmem_destroy_inodecache(void)
5215	{
5216	kmem_cache_destroy(s: shmem_inode_cachep);
5217	}
5218
/*
 * ->error_remove_folio(): keep the page in page cache instead of
 * truncating it. Always reports success without touching @mapping or
 * @folio.
 */
static int shmem_error_remove_folio(struct address_space *mapping,
				    struct folio *folio)
{
	return 0;
}
5225
5226	static const struct address_space_operations shmem_aops = {
5227	.dirty_folio = noop_dirty_folio,
5228	#ifdef CONFIG_TMPFS
5229	.write_begin = shmem_write_begin,
5230	.write_end = shmem_write_end,
5231	#endif
5232	#ifdef CONFIG_MIGRATION
5233	.migrate_folio = migrate_folio,
5234	#endif
5235	.error_remove_folio = shmem_error_remove_folio,
5236	};
5237
5238	static const struct file_operations shmem_file_operations = {
5239	.mmap_prepare = shmem_mmap_prepare,
5240	.open = shmem_file_open,
5241	.get_unmapped_area = shmem_get_unmapped_area,
5242	#ifdef CONFIG_TMPFS
5243	.llseek = shmem_file_llseek,
5244	.read_iter = shmem_file_read_iter,
5245	.write_iter = shmem_file_write_iter,
5246	.fsync = noop_fsync,
5247	.splice_read = shmem_file_splice_read,
5248	.splice_write = iter_file_splice_write,
5249	.fallocate = shmem_fallocate,
5250	#endif
5251	};
5252
5253	static const struct inode_operations shmem_inode_operations = {
5254	.getattr = shmem_getattr,
5255	.setattr = shmem_setattr,
5256	#ifdef CONFIG_TMPFS_XATTR
5257	.listxattr = shmem_listxattr,
5258	.set_acl = simple_set_acl,
5259	.fileattr_get = shmem_fileattr_get,
5260	.fileattr_set = shmem_fileattr_set,
5261	#endif
5262	};
5263
5264	static const struct inode_operations shmem_dir_inode_operations = {
5265	#ifdef CONFIG_TMPFS
5266	.getattr = shmem_getattr,
5267	.create = shmem_create,
5268	.lookup = simple_lookup,
5269	.link = shmem_link,
5270	.unlink = shmem_unlink,
5271	.symlink = shmem_symlink,
5272	.mkdir = shmem_mkdir,
5273	.rmdir = shmem_rmdir,
5274	.mknod = shmem_mknod,
5275	.rename = shmem_rename2,
5276	.tmpfile = shmem_tmpfile,
5277	.get_offset_ctx = shmem_get_offset_ctx,
5278	#endif
5279	#ifdef CONFIG_TMPFS_XATTR
5280	.listxattr = shmem_listxattr,
5281	.fileattr_get = shmem_fileattr_get,
5282	.fileattr_set = shmem_fileattr_set,
5283	#endif
5284	#ifdef CONFIG_TMPFS_POSIX_ACL
5285	.setattr = shmem_setattr,
5286	.set_acl = simple_set_acl,
5287	#endif
5288	};
5289
5290	static const struct inode_operations shmem_special_inode_operations = {
5291	.getattr = shmem_getattr,
5292	#ifdef CONFIG_TMPFS_XATTR
5293	.listxattr = shmem_listxattr,
5294	#endif
5295	#ifdef CONFIG_TMPFS_POSIX_ACL
5296	.setattr = shmem_setattr,
5297	.set_acl = simple_set_acl,
5298	#endif
5299	};
5300
5301	static const struct super_operations shmem_ops = {
5302	.alloc_inode = shmem_alloc_inode,
5303	.free_inode = shmem_free_in_core_inode,
5304	.destroy_inode = shmem_destroy_inode,
5305	#ifdef CONFIG_TMPFS
5306	.statfs = shmem_statfs,
5307	.show_options = shmem_show_options,
5308	#endif
5309	#ifdef CONFIG_TMPFS_QUOTA
5310	.get_dquots = shmem_get_dquots,
5311	#endif
5312	.evict_inode = shmem_evict_inode,
5313	.drop_inode = inode_just_drop,
5314	.put_super = shmem_put_super,
5315	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5316	.nr_cached_objects = shmem_unused_huge_count,
5317	.free_cached_objects = shmem_unused_huge_scan,
5318	#endif
5319	};
5320
5321	static const struct vm_operations_struct shmem_vm_ops = {
5322	.fault = shmem_fault,
5323	.map_pages = filemap_map_pages,
5324	#ifdef CONFIG_NUMA
5325	.set_policy = shmem_set_policy,
5326	.get_policy = shmem_get_policy,
5327	#endif
5328	};
5329
5330	static const struct vm_operations_struct shmem_anon_vm_ops = {
5331	.fault = shmem_fault,
5332	.map_pages = filemap_map_pages,
5333	#ifdef CONFIG_NUMA
5334	.set_policy = shmem_set_policy,
5335	.get_policy = shmem_get_policy,
5336	#endif
5337	};
5338
5339	int shmem_init_fs_context(struct fs_context *fc)
5340	{
5341	struct shmem_options *ctx;
5342
5343	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
5344	if (!ctx)
5345	return -ENOMEM;
5346
5347	ctx->mode = `0777` \| S_ISVTX;
5348	ctx->uid = current_fsuid();
5349	ctx->gid = current_fsgid();
5350
5351	#if IS_ENABLED(CONFIG_UNICODE)
5352	ctx->encoding = NULL;
5353	#endif
5354
5355	fc->fs_private = ctx;
5356	fc->ops = &shmem_fs_context_ops;
5357	#ifdef CONFIG_TMPFS
5358	fc->sb_flags \|= SB_I_VERSION;
5359	#endif
5360	return `0`;
5361	}
5362
5363	static struct file_system_type shmem_fs_type = {
5364	.owner = THIS_MODULE,
5365	.name = "tmpfs",
5366	.init_fs_context = shmem_init_fs_context,
5367	#ifdef CONFIG_TMPFS
5368	.parameters = shmem_fs_parameters,
5369	#endif
5370	.kill_sb = kill_anon_super,
5371	.fs_flags = FS_USERNS_MOUNT \| FS_ALLOW_IDMAP \| FS_MGTIME,
5372	};
5373
5374	#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
5375
/* Initializer for a struct kobj_attribute with the given name, mode and handlers. */
#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)			\
{									\
	.attr	= { .name = __stringify(_name), .mode = _mode },	\
	.show	= _show,						\
	.store	= _store,						\
}

/* Write-only (0200) attribute named tmpfs_attr_<name>. */
#define TMPFS_ATTR_W(_name, _store)				\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0200, NULL, _store)

/* Read-write (0644) attribute named tmpfs_attr_<name>. */
#define TMPFS_ATTR_RW(_name, _show, _store)			\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0644, _show, _store)

/* Read-only (0444) attribute named tmpfs_attr_<name>. */
#define TMPFS_ATTR_RO(_name, _show)				\
	static struct kobj_attribute tmpfs_attr_##_name =	\
			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
5394
5395	#if IS_ENABLED(CONFIG_UNICODE)
5396	static ssize_t casefold_show(struct kobject kobj, struct* kobj_attribute *a,
5397	char *buf)
5398	{
5399	return sysfs_emit(buf, fmt: "supported\n");
5400	}
5401	TMPFS_ATTR_RO(casefold, casefold_show);
5402	#endif
5403
5404	static struct attribute *tmpfs_attributes[] = {
5405	#if IS_ENABLED(CONFIG_UNICODE)
5406	&tmpfs_attr_casefold.attr,
5407	#endif
5408	NULL
5409	};
5410
5411	static const struct attribute_group tmpfs_attribute_group = {
5412	.attrs = tmpfs_attributes,
5413	.name = "features"
5414	};
5415
5416	static struct kobject *tmpfs_kobj;
5417
5418	static int __init tmpfs_sysfs_init(void)
5419	{
5420	int ret;
5421
5422	tmpfs_kobj = kobject_create_and_add(name: "tmpfs", parent: fs_kobj);
5423	if (!tmpfs_kobj)
5424	return -ENOMEM;
5425
5426	ret = sysfs_create_group(kobj: tmpfs_kobj, grp: &tmpfs_attribute_group);
5427	if (ret)
5428	kobject_put(kobj: tmpfs_kobj);
5429
5430	return ret;
5431	}
5432	#endif /* CONFIG_SYSFS && CONFIG_TMPFS */
5433
5434	void __init shmem_init(void)
5435	{
5436	int error;
5437
5438	shmem_init_inodecache();
5439
5440	#ifdef CONFIG_TMPFS_QUOTA
5441	register_quota_format(fmt: &shmem_quota_format);
5442	#endif
5443
5444	error = register_filesystem(&shmem_fs_type);
5445	if (error) {
5446	pr_err("Could not register tmpfs\n");
5447	goto out2;
5448	}
5449
5450	shm_mnt = kern_mount(&shmem_fs_type);
5451	if (IS_ERR(ptr: shm_mnt)) {
5452	error = PTR_ERR(ptr: shm_mnt);
5453	pr_err("Could not kern_mount tmpfs\n");
5454	goto out1;
5455	}
5456
5457	#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
5458	error = tmpfs_sysfs_init();
5459	if (error) {
5460	pr_err("Could not init tmpfs sysfs\n");
5461	goto out1;
5462	}
5463	#endif
5464
5465	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5466	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
5467	SHMEM_SB(sb: shm_mnt->mnt_sb)->huge = shmem_huge;
5468	else
5469	shmem_huge = SHMEM_HUGE_NEVER; / just in case it was patched /
5470
5471	/*
5472	* Default to setting PMD-sized THP to inherit the global setting and
5473	* disable all other multi-size THPs.
5474	*/
5475	if (!shmem_orders_configured)
5476	huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
5477	#endif
5478	return;
5479
5480	out1:
5481	unregister_filesystem(&shmem_fs_type);
5482	out2:
5483	#ifdef CONFIG_TMPFS_QUOTA
5484	unregister_quota_format(fmt: &shmem_quota_format);
5485	#endif
5486	shmem_destroy_inodecache();
5487	shm_mnt = ERR_PTR(error);
5488	}
5489
5490	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
5491	static ssize_t shmem_enabled_show(struct kobject *kobj,
5492	struct kobj_attribute attr, char* *buf)
5493	{
5494	static const int values[] = {
5495	SHMEM_HUGE_ALWAYS,
5496	SHMEM_HUGE_WITHIN_SIZE,
5497	SHMEM_HUGE_ADVISE,
5498	SHMEM_HUGE_NEVER,
5499	SHMEM_HUGE_DENY,
5500	SHMEM_HUGE_FORCE,
5501	};
5502	int len = `0`;
5503	int i;
5504
5505	for (i = `0`; i < ARRAY_SIZE(values); i++) {
5506	len += sysfs_emit_at(buf, at: len,
5507	fmt: shmem_huge == values[i] ? "%s[%s]" : "%s%s",
5508	i ? " " : "", shmem_format_huge(huge: values[i]));
5509	}
5510	len += sysfs_emit_at(buf, at: len, fmt: "\n");
5511
5512	return len;
5513	}
5514
5515	static ssize_t shmem_enabled_store(struct kobject *kobj,
5516	struct kobj_attribute attr, const* char *buf, size_t count)
5517	{
5518	char tmp[`16`];
5519	int huge, err;
5520
5521	if (count + `1` > sizeof(tmp))
5522	return -EINVAL;
5523	memcpy(tmp, buf, count);
5524	tmp[count] = `'\0'`;
5525	if (count && tmp[count - `1`] == `'\n'`)
5526	tmp[count - `1`] = `'\0'`;
5527
5528	huge = shmem_parse_huge(str: tmp);
5529	if (huge == -EINVAL)
5530	return huge;
5531
5532	shmem_huge = huge;
5533	if (shmem_huge > SHMEM_HUGE_DENY)
5534	SHMEM_SB(sb: shm_mnt->mnt_sb)->huge = shmem_huge;
5535
5536	err = start_stop_khugepaged();
5537	return err ? err : count;
5538	}
5539
5540	struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
5541	static DEFINE_SPINLOCK(huge_shmem_orders_lock);
5542
5543	static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
5544	struct kobj_attribute attr, char* *buf)
5545	{
5546	int order = to_thpsize(kobj)->order;
5547	const char *output;
5548
5549	if (test_bit(order, &huge_shmem_orders_always))
5550	output = "[always] inherit within_size advise never";
5551	else if (test_bit(order, &huge_shmem_orders_inherit))
5552	output = "always [inherit] within_size advise never";
5553	else if (test_bit(order, &huge_shmem_orders_within_size))
5554	output = "always inherit [within_size] advise never";
5555	else if (test_bit(order, &huge_shmem_orders_madvise))
5556	output = "always inherit within_size [advise] never";
5557	else
5558	output = "always inherit within_size advise [never]";
5559
5560	return sysfs_emit(buf, fmt: "%s\n", output);
5561	}
5562
5563	static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
5564	struct kobj_attribute *attr,
5565	const char *buf, size_t count)
5566	{
5567	int order = to_thpsize(kobj)->order;
5568	ssize_t ret = count;
5569
5570	if (sysfs_streq(s1: buf, s2: "always")) {
5571	spin_lock(lock: &huge_shmem_orders_lock);
5572	clear_bit(nr: order, addr: &huge_shmem_orders_inherit);
5573	clear_bit(nr: order, addr: &huge_shmem_orders_madvise);
5574	clear_bit(nr: order, addr: &huge_shmem_orders_within_size);
5575	set_bit(nr: order, addr: &huge_shmem_orders_always);
5576	spin_unlock(lock: &huge_shmem_orders_lock);
5577	} else if (sysfs_streq(s1: buf, s2: "inherit")) {
5578	/ Do not override huge allocation policy with non-PMD sized mTHP /
5579	if (shmem_huge == SHMEM_HUGE_FORCE &&
5580	order != HPAGE_PMD_ORDER)
5581	return -EINVAL;
5582
5583	spin_lock(lock: &huge_shmem_orders_lock);
5584	clear_bit(nr: order, addr: &huge_shmem_orders_always);
5585	clear_bit(nr: order, addr: &huge_shmem_orders_madvise);
5586	clear_bit(nr: order, addr: &huge_shmem_orders_within_size);
5587	set_bit(nr: order, addr: &huge_shmem_orders_inherit);
5588	spin_unlock(lock: &huge_shmem_orders_lock);
5589	} else if (sysfs_streq(s1: buf, s2: "within_size")) {
5590	spin_lock(lock: &huge_shmem_orders_lock);
5591	clear_bit(nr: order, addr: &huge_shmem_orders_always);
5592	clear_bit(nr: order, addr: &huge_shmem_orders_inherit);
5593	clear_bit(nr: order, addr: &huge_shmem_orders_madvise);
5594	set_bit(nr: order, addr: &huge_shmem_orders_within_size);
5595	spin_unlock(lock: &huge_shmem_orders_lock);
5596	} else if (sysfs_streq(s1: buf, s2: "advise")) {
5597	spin_lock(lock: &huge_shmem_orders_lock);
5598	clear_bit(nr: order, addr: &huge_shmem_orders_always);
5599	clear_bit(nr: order, addr: &huge_shmem_orders_inherit);
5600	clear_bit(nr: order, addr: &huge_shmem_orders_within_size);
5601	set_bit(nr: order, addr: &huge_shmem_orders_madvise);
5602	spin_unlock(lock: &huge_shmem_orders_lock);
5603	} else if (sysfs_streq(s1: buf, s2: "never")) {
5604	spin_lock(lock: &huge_shmem_orders_lock);
5605	clear_bit(nr: order, addr: &huge_shmem_orders_always);
5606	clear_bit(nr: order, addr: &huge_shmem_orders_inherit);
5607	clear_bit(nr: order, addr: &huge_shmem_orders_within_size);
5608	clear_bit(nr: order, addr: &huge_shmem_orders_madvise);
5609	spin_unlock(lock: &huge_shmem_orders_lock);
5610	} else {
5611	ret = -EINVAL;
5612	}
5613
5614	if (ret > `0`) {
5615	int err = start_stop_khugepaged();
5616
5617	if (err)
5618	ret = err;
5619	}
5620	return ret;
5621	}
5622
5623	struct kobj_attribute thpsize_shmem_enabled_attr =
5624	__ATTR(shmem_enabled, `0644`, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
5625	#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
5626
5627	#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
5628
5629	static int __init setup_transparent_hugepage_shmem(char *str)
5630	{
5631	int huge;
5632
5633	huge = shmem_parse_huge(str);
5634	if (huge == -EINVAL) {
5635	pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n");
5636	return huge;
5637	}
5638
5639	shmem_huge = huge;
5640	return `1`;
5641	}
5642	__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
5643
5644	static int __init setup_transparent_hugepage_tmpfs(char *str)
5645	{
5646	int huge;
5647
5648	huge = shmem_parse_huge(str);
5649	if (huge < `0`) {
5650	pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
5651	return huge;
5652	}
5653
5654	tmpfs_huge = huge;
5655	return `1`;
5656	}
5657	__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);
5658
5659	static char str_dup[PAGE_SIZE] __initdata;
5660	static int __init setup_thp_shmem(char *str)
5661	{
5662	char token, range, policy, subtoken;
5663	unsigned long always, inherit, madvise, within_size;
5664	char start_size, end_size;
5665	int start, end, nr;
5666	char *p;
5667
5668	if (!str \|\| strlen(str) + `1` > PAGE_SIZE)
5669	goto err;
5670	strscpy(str_dup, str);
5671
5672	always = huge_shmem_orders_always;
5673	inherit = huge_shmem_orders_inherit;
5674	madvise = huge_shmem_orders_madvise;
5675	within_size = huge_shmem_orders_within_size;
5676	p = str_dup;
5677	while ((token = strsep(&p, ";")) != NULL) {
5678	range = strsep(&token, ":");
5679	policy = token;
5680
5681	if (!policy)
5682	goto err;
5683
5684	while ((subtoken = strsep(&range, ",")) != NULL) {
5685	if (strchr(subtoken, `'-'`)) {
5686	start_size = strsep(&subtoken, "-");
5687	end_size = subtoken;
5688
5689	start = get_order_from_str(size_str: start_size,
5690	THP_ORDERS_ALL_FILE_DEFAULT);
5691	end = get_order_from_str(size_str: end_size,
5692	THP_ORDERS_ALL_FILE_DEFAULT);
5693	} else {
5694	start_size = end_size = subtoken;
5695	start = end = get_order_from_str(size_str: subtoken,
5696	THP_ORDERS_ALL_FILE_DEFAULT);
5697	}
5698
5699	if (start < `0`) {
5700	pr_err("invalid size %s in thp_shmem boot parameter\n",
5701	start_size);
5702	goto err;
5703	}
5704
5705	if (end < `0`) {
5706	pr_err("invalid size %s in thp_shmem boot parameter\n",
5707	end_size);
5708	goto err;
5709	}
5710
5711	if (start > end)
5712	goto err;
5713
5714	nr = end - start + `1`;
5715	if (!strcmp(policy, "always")) {
5716	bitmap_set(map: &always, start, nbits: nr);
5717	bitmap_clear(map: &inherit, start, nbits: nr);
5718	bitmap_clear(map: &madvise, start, nbits: nr);
5719	bitmap_clear(map: &within_size, start, nbits: nr);
5720	} else if (!strcmp(policy, "advise")) {
5721	bitmap_set(map: &madvise, start, nbits: nr);
5722	bitmap_clear(map: &inherit, start, nbits: nr);
5723	bitmap_clear(map: &always, start, nbits: nr);
5724	bitmap_clear(map: &within_size, start, nbits: nr);
5725	} else if (!strcmp(policy, "inherit")) {
5726	bitmap_set(map: &inherit, start, nbits: nr);
5727	bitmap_clear(map: &madvise, start, nbits: nr);
5728	bitmap_clear(map: &always, start, nbits: nr);
5729	bitmap_clear(map: &within_size, start, nbits: nr);
5730	} else if (!strcmp(policy, "within_size")) {
5731	bitmap_set(map: &within_size, start, nbits: nr);
5732	bitmap_clear(map: &inherit, start, nbits: nr);
5733	bitmap_clear(map: &madvise, start, nbits: nr);
5734	bitmap_clear(map: &always, start, nbits: nr);
5735	} else if (!strcmp(policy, "never")) {
5736	bitmap_clear(map: &inherit, start, nbits: nr);
5737	bitmap_clear(map: &madvise, start, nbits: nr);
5738	bitmap_clear(map: &always, start, nbits: nr);
5739	bitmap_clear(map: &within_size, start, nbits: nr);
5740	} else {
5741	pr_err("invalid policy %s in thp_shmem boot parameter\n", policy);
5742	goto err;
5743	}
5744	}
5745	}
5746
5747	huge_shmem_orders_always = always;
5748	huge_shmem_orders_madvise = madvise;
5749	huge_shmem_orders_inherit = inherit;
5750	huge_shmem_orders_within_size = within_size;
5751	shmem_orders_configured = true;
5752	return `1`;
5753
5754	err:
5755	pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
5756	return `0`;
5757	}
5758	__setup("thp_shmem=", setup_thp_shmem);
5759
5760	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
5761
5762	#else /* !CONFIG_SHMEM */
5763
5764	/*
5765	* tiny-shmem: simple shmemfs and tmpfs using ramfs code
5766	*
5767	* This is intended for small system where the benefits of the full
5768	* shmem code (swap-backed and resource-limited) are outweighed by
5769	* their complexity. On systems without swap this code should be
5770	* effectively equivalent, but much lighter weight.
5771	*/
5772
5773	static struct file_system_type shmem_fs_type = {
5774	.name = "tmpfs",
5775	.init_fs_context = ramfs_init_fs_context,
5776	.parameters = ramfs_fs_parameters,
5777	.kill_sb = ramfs_kill_sb,
5778	.fs_flags = FS_USERNS_MOUNT,
5779	};
5780
5781	void __init shmem_init(void)
5782	{
5783	BUG_ON(register_filesystem(&shmem_fs_type) != `0`);
5784
5785	shm_mnt = kern_mount(&shmem_fs_type);
5786	BUG_ON(IS_ERR(shm_mnt));
5787	}
5788
/* tiny-shmem stub: does nothing and always returns 0. */
int shmem_unuse(unsigned int type)
{
	return 0;
}
5793
/* tiny-shmem stub: ignores its arguments and always returns 0. */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}
5798
/* tiny-shmem stub: intentionally empty. */
void shmem_unlock_mapping(struct address_space *mapping)
{
	/* nothing to do in this configuration */
}
5802
#ifdef CONFIG_MMU
/* tiny-shmem: forward straight to mm_get_unmapped_area(). */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	unsigned long area = mm_get_unmapped_area(file, addr, len, pgoff, flags);

	return area;
}
#endif
5811
5812	void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
5813	{
5814	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
5815	}
5816	EXPORT_SYMBOL_GPL(shmem_truncate_range);
5817
/* tiny-shmem: map the shmem operation tables onto the generic/ramfs ones. */
#define shmem_vm_ops				generic_file_vm_ops
#define shmem_anon_vm_ops			generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
5821
5822	static inline int shmem_acct_size(unsigned long flags, loff_t size)
5823	{
5824	return `0`;
5825	}
5826
5827	static inline void shmem_unacct_size(unsigned long flags, loff_t size)
5828	{
5829	}
5830
5831	static inline struct inode shmem_get_inode(struct* mnt_idmap *idmap,
5832	struct super_block sb, struct* inode *dir,
5833	umode_t mode, dev_t dev, unsigned long flags)
5834	{
5835	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
5836	return inode ? inode : ERR_PTR(-ENOSPC);
5837	}
5838
5839	#endif /* CONFIG_SHMEM */
5840
/* common code */
5842
5843	static struct file __shmem_file_setup(struct* vfsmount mnt, const* char *name,
5844	loff_t size, unsigned long vm_flags,
5845	unsigned int i_flags)
5846	{
5847	unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : `0`;
5848	struct inode *inode;
5849	struct file *res;
5850
5851	if (IS_ERR(ptr: mnt))
5852	return ERR_CAST(ptr: mnt);
5853
5854	if (size < `0` \|\| size > MAX_LFS_FILESIZE)
5855	return ERR_PTR(error: -EINVAL);
5856
5857	if (is_idmapped_mnt(mnt))
5858	return ERR_PTR(error: -EINVAL);
5859
5860	if (shmem_acct_size(flags, size))
5861	return ERR_PTR(error: -ENOMEM);
5862
5863	inode = shmem_get_inode(idmap: &nop_mnt_idmap, sb: mnt->mnt_sb, NULL,
5864	S_IFREG \| S_IRWXUGO, dev: `0`, flags: vm_flags);
5865	if (IS_ERR(ptr: inode)) {
5866	shmem_unacct_size(flags, size);
5867	return ERR_CAST(ptr: inode);
5868	}
5869	inode->i_flags \|= i_flags;
5870	inode->i_size = size;
5871	clear_nlink(inode); / It is unlinked /
5872	res = ERR_PTR(error: ramfs_nommu_expand_for_mapping(inode, newsize: size));
5873	if (!IS_ERR(ptr: res))
5874	res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
5875	&shmem_file_operations);
5876	if (IS_ERR(ptr: res))
5877	iput(inode);
5878	return res;
5879	}
5880
5881	/**
5882	* shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
5883	* kernel internal. There will be NO LSM permission checks against the
5884	* underlying inode. So users of this interface must do LSM checks at a
5885	* higher layer. The users are the big_key and shm implementations. LSM
5886	* checks are provided at the key or shm level rather than the inode.
5887	* @name: name for dentry (to be seen in /proc/<pid>/maps)
5888	* @size: size to be set for the file
5889	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5890	*/
5891	struct file shmem_kernel_file_setup(const* char name, loff_t size, unsigned* long flags)
5892	{
5893	return __shmem_file_setup(mnt: shm_mnt, name, size, vm_flags: flags, S_PRIVATE);
5894	}
5895	EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
5896
5897	/**
5898	* shmem_file_setup - get an unlinked file living in tmpfs
5899	* @name: name for dentry (to be seen in /proc/<pid>/maps)
5900	* @size: size to be set for the file
5901	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5902	*/
5903	struct file shmem_file_setup(const* char name, loff_t size, unsigned* long flags)
5904	{
5905	return __shmem_file_setup(mnt: shm_mnt, name, size, vm_flags: flags, i_flags: `0`);
5906	}
5907	EXPORT_SYMBOL_GPL(shmem_file_setup);
5908
5909	/**
5910	* shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
5911	* @mnt: the tmpfs mount where the file will be created
5912	* @name: name for dentry (to be seen in /proc/<pid>/maps)
5913	* @size: size to be set for the file
5914	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5915	*/
5916	struct file shmem_file_setup_with_mnt(struct* vfsmount mnt, const* char *name,
5917	loff_t size, unsigned long flags)
5918	{
5919	return __shmem_file_setup(mnt, name, size, vm_flags: flags, i_flags: `0`);
5920	}
5921	EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
5922
5923	static struct file __shmem_zero_setup(unsigned* long start, unsigned long end, vm_flags_t vm_flags)
5924	{
5925	loff_t size = end - start;
5926
5927	/*
5928	* Cloning a new file under mmap_lock leads to a lock ordering conflict
5929	* between XFS directory reading and selinux: since this file is only
5930	* accessible to the user through its mapping, use S_PRIVATE flag to
5931	* bypass file security, in the same way as shmem_kernel_file_setup().
5932	*/
5933	return shmem_kernel_file_setup("dev/zero", size, vm_flags);
5934	}
5935
5936	/**
5937	* shmem_zero_setup - setup a shared anonymous mapping
5938	* @vma: the vma to be mmapped is prepared by do_mmap
5939	* Returns: 0 on success, or error
5940	*/
5941	int shmem_zero_setup(struct vm_area_struct *vma)
5942	{
5943	struct file *file = __shmem_zero_setup(start: vma->vm_start, end: vma->vm_end, vm_flags: vma->vm_flags);
5944
5945	if (IS_ERR(ptr: file))
5946	return PTR_ERR(ptr: file);
5947
5948	if (vma->vm_file)
5949	fput(vma->vm_file);
5950	vma->vm_file = file;
5951	vma->vm_ops = &shmem_anon_vm_ops;
5952
5953	return `0`;
5954	}
5955
5956	/**
5957	* shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA
5958	* descriptor for convenience.
5959	* @desc: Describes VMA
5960	* Returns: 0 on success, or error
5961	*/
5962	int shmem_zero_setup_desc(struct vm_area_desc *desc)
5963	{
5964	struct file *file = __shmem_zero_setup(start: desc->start, end: desc->end, vm_flags: desc->vm_flags);
5965
5966	if (IS_ERR(ptr: file))
5967	return PTR_ERR(ptr: file);
5968
5969	desc->vm_file = file;
5970	desc->vm_ops = &shmem_anon_vm_ops;
5971
5972	return `0`;
5973	}
5974
5975	/**
5976	* shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
5977	* @mapping: the folio's address_space
5978	* @index: the folio index
5979	* @gfp: the page allocator flags to use if allocating
5980	*
5981	* This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
5982	* with any new page allocations done using the specified allocation flags.
5983	* But read_cache_page_gfp() uses the ->read_folio() method: which does not
5984	* suit tmpfs, since it may have pages in swapcache, and needs to find those
5985	* for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
5986	*
5987	* i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY \| __GFP_NOWARN in
5988	* with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
5989	*/
5990	struct folio shmem_read_folio_gfp(struct* address_space *mapping,
5991	pgoff_t index, gfp_t gfp)
5992	{
5993	#ifdef CONFIG_SHMEM
5994	struct inode *inode = mapping->host;
5995	struct folio *folio;
5996	int error;
5997
5998	error = shmem_get_folio_gfp(inode, index, write_end: i_size_read(inode),
5999	foliop: &folio, sgp: SGP_CACHE, gfp, NULL, NULL);
6000	if (error)
6001	return ERR_PTR(error);
6002
6003	folio_unlock(folio);
6004	return folio;
6005	#else
6006	/*
6007	* The tiny !SHMEM case uses ramfs without swap
6008	*/
6009	return mapping_read_folio_gfp(mapping, index, gfp);
6010	#endif
6011	}
6012	EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
6013
6014	struct page shmem_read_mapping_page_gfp(struct* address_space *mapping,
6015	pgoff_t index, gfp_t gfp)
6016	{
6017	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
6018	struct page *page;
6019
6020	if (IS_ERR(ptr: folio))
6021	return &folio->page;
6022
6023	page = folio_file_page(folio, index);
6024	if (PageHWPoison(page)) {
6025	folio_put(folio);
6026	return ERR_PTR(error: -EIO);
6027	}
6028
6029	return page;
6030	}
6031	EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
6032

source code of linux/mm/shmem.c