inode.c source code [linux/fs/hugetlbfs/inode.c]

1	/*
2	* hugetlbpage-backed filesystem. Based on ramfs.
3	*
4	* Nadia Yvette Chambers, 2002
5	*
6	* Copyright (C) 2002 Linus Torvalds.
7	* License: GPL
8	*/
9
10	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12	#include <linux/thread_info.h>
13	#include <asm/current.h>
14	#include <linux/falloc.h>
15	#include <linux/fs.h>
16	#include <linux/mount.h>
17	#include <linux/file.h>
18	#include <linux/kernel.h>
19	#include <linux/writeback.h>
20	#include <linux/pagemap.h>
21	#include <linux/highmem.h>
22	#include <linux/init.h>
23	#include <linux/string.h>
24	#include <linux/capability.h>
25	#include <linux/ctype.h>
26	#include <linux/backing-dev.h>
27	#include <linux/hugetlb.h>
28	#include <linux/pagevec.h>
29	#include <linux/fs_parser.h>
30	#include <linux/mman.h>
31	#include <linux/slab.h>
32	#include <linux/dnotify.h>
33	#include <linux/statfs.h>
34	#include <linux/security.h>
35	#include <linux/magic.h>
36	#include <linux/migrate.h>
37	#include <linux/uio.h>
38
39	#include <linux/uaccess.h>
40	#include <linux/sched/mm.h>
41
42	static const struct address_space_operations hugetlbfs_aops;
43	const struct file_operations hugetlbfs_file_operations;
44	static const struct inode_operations hugetlbfs_dir_inode_operations;
45	static const struct inode_operations hugetlbfs_inode_operations;
46
47	enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };
48
49	struct hugetlbfs_fs_context {
50	struct hstate *hstate;
51	unsigned long long max_size_opt;
52	unsigned long long min_size_opt;
53	long max_hpages;
54	long nr_inodes;
55	long min_hpages;
56	enum hugetlbfs_size_type max_val_type;
57	enum hugetlbfs_size_type min_val_type;
58	kuid_t uid;
59	kgid_t gid;
60	umode_t mode;
61	};
62
/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * backing variable of a sysctl naming a group id -- confirm at the users.
 */
int sysctl_hugetlb_shm_group;
64
/* Identifiers for the mount parameters listed in hugetlb_fs_parameters[]. */
enum hugetlb_param {
	Opt_gid,
	Opt_min_size,
	Opt_mode,
	Opt_nr_inodes,
	Opt_pagesize,
	Opt_size,
	Opt_uid,
};
74
75	static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
76	fsparam_u32 ("gid", Opt_gid),
77	fsparam_string("min_size", Opt_min_size),
78	fsparam_u32oct("mode", Opt_mode),
79	fsparam_string("nr_inodes", Opt_nr_inodes),
80	fsparam_string("pagesize", Opt_pagesize),
81	fsparam_string("size", Opt_size),
82	fsparam_u32 ("uid", Opt_uid),
83	{}
84	};
85
/*
 * Mask used when checking the page offset value passed in via system
 * calls.  This value will be converted to a loff_t which is signed.
 * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
 * value: the PAGE_SHIFT bits that would be shifted out when the page
 * offset is turned into a byte offset, plus one extra bit to take the
 * sign bit of the resulting loff_t into account.
 */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
95
96	static int hugetlbfs_file_mmap(struct file file, struct* vm_area_struct *vma)
97	{
98	struct inode *inode = file_inode(f: file);
99	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
100	loff_t len, vma_len;
101	int ret;
102	struct hstate *h = hstate_file(f: file);
103	vm_flags_t vm_flags;
104
105	/*
106	* vma address alignment (but not the pgoff alignment) has
107	* already been checked by prepare_hugepage_range. If you add
108	* any error returns here, do so after setting VM_HUGETLB, so
109	* is_vm_hugetlb_page tests below unmap_region go the right
110	* way when do_mmap unwinds (may be important on powerpc
111	* and ia64).
112	*/
113	vm_flags_set(vma, VM_HUGETLB \| VM_DONTEXPAND);
114	vma->vm_ops = &hugetlb_vm_ops;
115
116	ret = seal_check_write(seals: info->seals, vma);
117	if (ret)
118	return ret;
119
120	/*
121	* page based offset in vm_pgoff could be sufficiently large to
122	* overflow a loff_t when converted to byte offset. This can
123	* only happen on architectures where sizeof(loff_t) ==
124	* sizeof(unsigned long). So, only check in those instances.
125	*/
126	if (sizeof(unsigned long) == sizeof(loff_t)) {
127	if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
128	return -EINVAL;
129	}
130
131	/ must be huge page aligned /
132	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
133	return -EINVAL;
134
135	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
136	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
137	/ check for overflow /
138	if (len < vma_len)
139	return -EINVAL;
140
141	inode_lock(inode);
142	file_accessed(file);
143
144	ret = -ENOMEM;
145
146	vm_flags = vma->vm_flags;
147	/*
148	* for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
149	* reserving here. Note: only for SHM hugetlbfs file, the inode
150	* flag S_PRIVATE is set.
151	*/
152	if (inode->i_flags & S_PRIVATE)
153	vm_flags \|= VM_NORESERVE;
154
155	if (!hugetlb_reserve_pages(inode,
156	from: vma->vm_pgoff >> huge_page_order(h),
157	to: len >> huge_page_shift(h), vma,
158	vm_flags))
159	goto out;
160
161	ret = `0`;
162	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
163	i_size_write(inode, i_size: len);
164	out:
165	inode_unlock(inode);
166
167	return ret;
168	}
169
170	/*
171	* Called under mmap_write_lock(mm).
172	*/
173
174	static unsigned long
175	hugetlb_get_unmapped_area_bottomup(struct file file, unsigned* long addr,
176	unsigned long len, unsigned long pgoff, unsigned long flags)
177	{
178	struct hstate *h = hstate_file(f: file);
179	struct vm_unmapped_area_info info;
180
181	info.flags = `0`;
182	info.length = len;
183	info.low_limit = current->mm->mmap_base;
184	info.high_limit = arch_get_mmap_end(addr, len, flags);
185	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
186	info.align_offset = `0`;
187	return vm_unmapped_area(info: &info);
188	}
189
190	static unsigned long
191	hugetlb_get_unmapped_area_topdown(struct file file, unsigned* long addr,
192	unsigned long len, unsigned long pgoff, unsigned long flags)
193	{
194	struct hstate *h = hstate_file(f: file);
195	struct vm_unmapped_area_info info;
196
197	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
198	info.length = len;
199	info.low_limit = PAGE_SIZE;
200	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
201	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
202	info.align_offset = `0`;
203	addr = vm_unmapped_area(info: &info);
204
205	/*
206	* A failed mmap() very likely causes application failure,
207	* so fall back to the bottom-up function here. This scenario
208	* can happen with large stack limits and large mmap()
209	* allocations.
210	*/
211	if (unlikely(offset_in_page(addr))) {
212	VM_BUG_ON(addr != -ENOMEM);
213	info.flags = `0`;
214	info.low_limit = current->mm->mmap_base;
215	info.high_limit = arch_get_mmap_end(addr, len, flags);
216	addr = vm_unmapped_area(info: &info);
217	}
218
219	return addr;
220	}
221
222	unsigned long
223	generic_hugetlb_get_unmapped_area(struct file file, unsigned* long addr,
224	unsigned long len, unsigned long pgoff,
225	unsigned long flags)
226	{
227	struct mm_struct *mm = current->mm;
228	struct vm_area_struct *vma;
229	struct hstate *h = hstate_file(f: file);
230	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
231
232	if (len & ~huge_page_mask(h))
233	return -EINVAL;
234	if (len > TASK_SIZE)
235	return -ENOMEM;
236
237	if (flags & MAP_FIXED) {
238	if (prepare_hugepage_range(file, addr, len))
239	return -EINVAL;
240	return addr;
241	}
242
243	if (addr) {
244	addr = ALIGN(addr, huge_page_size(h));
245	vma = find_vma(mm, addr);
246	if (mmap_end - len >= addr &&
247	(!vma \|\| addr + len <= vm_start_gap(vma)))
248	return addr;
249	}
250
251	/*
252	* Use mm->get_unmapped_area value as a hint to use topdown routine.
253	* If architectures have special needs, they should define their own
254	* version of hugetlb_get_unmapped_area.
255	*/
256	if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
257	return hugetlb_get_unmapped_area_topdown(file, addr, len,
258	pgoff, flags);
259	return hugetlb_get_unmapped_area_bottomup(file, addr, len,
260	pgoff, flags);
261	}
262
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
/*
 * Default implementation, used when HAVE_ARCH_HUGETLB_UNMAPPED_AREA is not
 * defined: simply forwards to generic_hugetlb_get_unmapped_area().
 */
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
			  unsigned long len, unsigned long pgoff,
			  unsigned long flags)
{
	return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif
272
273	/*
274	* Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
275	* Returns the maximum number of bytes one can read without touching the 1st raw
276	* HWPOISON subpage.
277	*
278	* The implementation borrows the iteration logic from copy_page_to_iter*.
279	*/
280	static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
281	{
282	size_t n = `0`;
283	size_t res = `0`;
284
285	/ First subpage to start the loop. /
286	page = nth_page(page, offset / PAGE_SIZE);
287	offset %= PAGE_SIZE;
288	while (`1`) {
289	if (is_raw_hwpoison_page_in_hugepage(page))
290	break;
291
292	/ Safe to read n bytes without touching HWPOISON subpage. /
293	n = min(bytes, (size_t)PAGE_SIZE - offset);
294	res += n;
295	bytes -= n;
296	if (!bytes \|\| !n)
297	break;
298	offset += n;
299	if (offset == PAGE_SIZE) {
300	page = nth_page(page, `1`);
301	offset = `0`;
302	}
303	}
304
305	return res;
306	}
307
308	/*
309	* Support for read() - Find the page attached to f_mapping and copy out the
310	* data. This provides functionality similar to filemap_read().
311	*/
312	static ssize_t hugetlbfs_read_iter(struct kiocb iocb, struct* iov_iter *to)
313	{
314	struct file *file = iocb->ki_filp;
315	struct hstate *h = hstate_file(f: file);
316	struct address_space *mapping = file->f_mapping;
317	struct inode *inode = mapping->host;
318	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
319	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
320	unsigned long end_index;
321	loff_t isize;
322	ssize_t retval = `0`;
323
324	while (iov_iter_count(i: to)) {
325	struct folio *folio;
326	size_t nr, copied, want;
327
328	/ nr is the maximum number of bytes to copy from this page /
329	nr = huge_page_size(h);
330	isize = i_size_read(inode);
331	if (!isize)
332	break;
333	end_index = (isize - `1`) >> huge_page_shift(h);
334	if (index > end_index)
335	break;
336	if (index == end_index) {
337	nr = ((isize - `1`) & ~huge_page_mask(h)) + `1`;
338	if (nr <= offset)
339	break;
340	}
341	nr = nr - offset;
342
343	/ Find the folio /
344	folio = filemap_lock_hugetlb_folio(h, mapping, idx: index);
345	if (IS_ERR(ptr: folio)) {
346	/*
347	* We have a HOLE, zero out the user-buffer for the
348	* length of the hole or request.
349	*/
350	copied = iov_iter_zero(bytes: nr, to);
351	} else {
352	folio_unlock(folio);
353
354	if (!folio_test_hwpoison(folio))
355	want = nr;
356	else {
357	/*
358	* Adjust how many bytes safe to read without
359	* touching the 1st raw HWPOISON subpage after
360	* offset.
361	*/
362	want = adjust_range_hwpoison(page: &folio->page, offset, bytes: nr);
363	if (want == `0`) {
364	folio_put(folio);
365	retval = -EIO;
366	break;
367	}
368	}
369
370	/*
371	* We have the folio, copy it to user space buffer.
372	*/
373	copied = copy_folio_to_iter(folio, offset, bytes: want, i: to);
374	folio_put(folio);
375	}
376	offset += copied;
377	retval += copied;
378	if (copied != nr && iov_iter_count(i: to)) {
379	if (!retval)
380	retval = -EFAULT;
381	break;
382	}
383	index += offset >> huge_page_shift(h);
384	offset &= ~huge_page_mask(h);
385	}
386	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
387	return retval;
388	}
389
390	static int hugetlbfs_write_begin(struct file *file,
391	struct address_space *mapping,
392	loff_t pos, unsigned len,
393	struct page *pagep, void* **fsdata)
394	{
395	return -EINVAL;
396	}
397
398	static int hugetlbfs_write_end(struct file file, struct* address_space *mapping,
399	loff_t pos, unsigned len, unsigned copied,
400	struct page page, void* *fsdata)
401	{
402	BUG();
403	return -EINVAL;
404	}
405
/*
 * Clear the dirty and uptodate flags of @folio, then remove it from the
 * page cache with filemap_remove_folio().
 */
static void hugetlb_delete_from_page_cache(struct folio *folio)
{
	folio_clear_dirty(folio);
	folio_clear_uptodate(folio);
	filemap_remove_folio(folio);
}
412
413	/*
414	* Called with i_mmap_rwsem held for inode based vma maps. This makes
415	* sure vma (and vm_mm) will not go away. We also hold the hugetlb fault
416	* mutex for the page in the mapping. So, we can not race with page being
417	* faulted into the vma.
418	*/
419	static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
420	unsigned long addr, struct page *page)
421	{
422	pte_t *ptep, pte;
423
424	ptep = hugetlb_walk(vma, addr, sz: huge_page_size(h: hstate_vma(vma)));
425	if (!ptep)
426	return false;
427
428	pte = huge_ptep_get(ptep);
429	if (huge_pte_none(pte) \|\| !pte_present(a: pte))
430	return false;
431
432	if (pte_page(pte) == page)
433	return true;
434
435	return false;
436	}
437
438	/*
439	* Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
440	* No, because the interval tree returns us only those vmas
441	* which overlap the truncated area starting at pgoff,
442	* and no vma on a 32-bit arch can span beyond the 4GB.
443	*/
444	static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
445	{
446	unsigned long offset = `0`;
447
448	if (vma->vm_pgoff < start)
449	offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
450
451	return vma->vm_start + offset;
452	}
453
454	static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
455	{
456	unsigned long t_end;
457
458	if (!end)
459	return vma->vm_end;
460
461	t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
462	if (t_end > vma->vm_end)
463	t_end = vma->vm_end;
464	return t_end;
465	}
466
467	/*
468	* Called with hugetlb fault mutex held. Therefore, no more mappings to
469	* this folio can be created while executing the routine.
470	*/
471	static void hugetlb_unmap_file_folio(struct hstate *h,
472	struct address_space *mapping,
473	struct folio *folio, pgoff_t index)
474	{
475	struct rb_root_cached *root = &mapping->i_mmap;
476	struct hugetlb_vma_lock *vma_lock;
477	struct page *page = &folio->page;
478	struct vm_area_struct *vma;
479	unsigned long v_start;
480	unsigned long v_end;
481	pgoff_t start, end;
482
483	start = index * pages_per_huge_page(h);
484	end = (index + `1`) * pages_per_huge_page(h);
485
486	i_mmap_lock_write(mapping);
487	retry:
488	vma_lock = NULL;
489	vma_interval_tree_foreach(vma, root, start, end - `1`) {
490	v_start = vma_offset_start(vma, start);
491	v_end = vma_offset_end(vma, end);
492
493	if (!hugetlb_vma_maps_page(vma, addr: v_start, page))
494	continue;
495
496	if (!hugetlb_vma_trylock_write(vma)) {
497	vma_lock = vma->vm_private_data;
498	/*
499	* If we can not get vma lock, we need to drop
500	* immap_sema and take locks in order. First,
501	* take a ref on the vma_lock structure so that
502	* we can be guaranteed it will not go away when
503	* dropping immap_sema.
504	*/
505	kref_get(kref: &vma_lock->refs);
506	break;
507	}
508
509	unmap_hugepage_range(vma, v_start, v_end, NULL,
510	ZAP_FLAG_DROP_MARKER);
511	hugetlb_vma_unlock_write(vma);
512	}
513
514	i_mmap_unlock_write(mapping);
515
516	if (vma_lock) {
517	/*
518	* Wait on vma_lock. We know it is still valid as we have
519	* a reference. We must 'open code' vma locking as we do
520	* not know if vma_lock is still attached to vma.
521	*/
522	down_write(sem: &vma_lock->rw_sema);
523	i_mmap_lock_write(mapping);
524
525	vma = vma_lock->vma;
526	if (!vma) {
527	/*
528	* If lock is no longer attached to vma, then just
529	* unlock, drop our reference and retry looking for
530	* other vmas.
531	*/
532	up_write(sem: &vma_lock->rw_sema);
533	kref_put(kref: &vma_lock->refs, release: hugetlb_vma_lock_release);
534	goto retry;
535	}
536
537	/*
538	* vma_lock is still attached to vma. Check to see if vma
539	* still maps page and if so, unmap.
540	*/
541	v_start = vma_offset_start(vma, start);
542	v_end = vma_offset_end(vma, end);
543	if (hugetlb_vma_maps_page(vma, addr: v_start, page))
544	unmap_hugepage_range(vma, v_start, v_end, NULL,
545	ZAP_FLAG_DROP_MARKER);
546
547	kref_put(kref: &vma_lock->refs, release: hugetlb_vma_lock_release);
548	hugetlb_vma_unlock_write(vma);
549
550	goto retry;
551	}
552	}
553
554	static void
555	hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
556	zap_flags_t zap_flags)
557	{
558	struct vm_area_struct *vma;
559
560	/*
561	* end == 0 indicates that the entire range after start should be
562	* unmapped. Note, end is exclusive, whereas the interval tree takes
563	* an inclusive "last".
564	*/
565	vma_interval_tree_foreach(vma, root, start, end ? end - `1` : ULONG_MAX) {
566	unsigned long v_start;
567	unsigned long v_end;
568
569	if (!hugetlb_vma_trylock_write(vma))
570	continue;
571
572	v_start = vma_offset_start(vma, start);
573	v_end = vma_offset_end(vma, end);
574
575	unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);
576
577	/*
578	* Note that vma lock only exists for shared/non-private
579	* vmas. Therefore, lock is not held when calling
580	* unmap_hugepage_range for private vmas.
581	*/
582	hugetlb_vma_unlock_write(vma);
583	}
584	}
585
586	/*
587	* Called with hugetlb fault mutex held.
588	* Returns true if page was actually removed, false otherwise.
589	*/
590	static bool remove_inode_single_folio(struct hstate h, struct* inode *inode,
591	struct address_space *mapping,
592	struct folio *folio, pgoff_t index,
593	bool truncate_op)
594	{
595	bool ret = false;
596
597	/*
598	* If folio is mapped, it was faulted in after being
599	* unmapped in caller. Unmap (again) while holding
600	* the fault mutex. The mutex will prevent faults
601	* until we finish removing the folio.
602	*/
603	if (unlikely(folio_mapped(folio)))
604	hugetlb_unmap_file_folio(h, mapping, folio, index);
605
606	folio_lock(folio);
607	/*
608	* We must remove the folio from page cache before removing
609	* the region/ reserve map (hugetlb_unreserve_pages). In
610	* rare out of memory conditions, removal of the region/reserve
611	* map could fail. Correspondingly, the subpool and global
612	* reserve usage count can need to be adjusted.
613	*/
614	VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
615	hugetlb_delete_from_page_cache(folio);
616	ret = true;
617	if (!truncate_op) {
618	if (unlikely(hugetlb_unreserve_pages(inode, index,
619	index + `1`, `1`)))
620	hugetlb_fix_reserve_counts(inode);
621	}
622
623	folio_unlock(folio);
624	return ret;
625	}
626
627	/*
628	* remove_inode_hugepages handles two distinct cases: truncation and hole
629	* punch. There are subtle differences in operation for each case.
630	*
631	* truncation is indicated by end of range being LLONG_MAX
632	* In this case, we first scan the range and release found pages.
633	* After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
634	* maps and global counts. Page faults can race with truncation.
635	* During faults, hugetlb_no_page() checks i_size before page allocation,
636	* and again after obtaining page table lock. It will 'back out'
637	* allocations in the truncated range.
638	* hole punch is indicated if end is not LLONG_MAX
639	* In the hole punch case we scan the range and release found pages.
640	* Only when releasing a page is the associated region/reserve map
641	* deleted. The region/reserve map for ranges without associated
642	* pages are not modified. Page faults can race with hole punch.
643	* This is indicated if we find a mapped page.
644	* Note: If the passed end of range value is beyond the end of file, but
645	* not LLONG_MAX this routine still performs a hole punch operation.
646	*/
647	static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
648	loff_t lend)
649	{
650	struct hstate *h = hstate_inode(i: inode);
651	struct address_space *mapping = &inode->i_data;
652	const pgoff_t end = lend >> PAGE_SHIFT;
653	struct folio_batch fbatch;
654	pgoff_t next, index;
655	int i, freed = `0`;
656	bool truncate_op = (lend == LLONG_MAX);
657
658	folio_batch_init(fbatch: &fbatch);
659	next = lstart >> PAGE_SHIFT;
660	while (filemap_get_folios(mapping, start: &next, end: end - `1`, fbatch: &fbatch)) {
661	for (i = `0`; i < folio_batch_count(fbatch: &fbatch); ++i) {
662	struct folio *folio = fbatch.folios[i];
663	u32 hash = `0`;
664
665	index = folio->index >> huge_page_order(h);
666	hash = hugetlb_fault_mutex_hash(mapping, idx: index);
667	mutex_lock(&hugetlb_fault_mutex_table[hash]);
668
669	/*
670	* Remove folio that was part of folio_batch.
671	*/
672	if (remove_inode_single_folio(h, inode, mapping, folio,
673	index, truncate_op))
674	freed++;
675
676	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
677	}
678	folio_batch_release(fbatch: &fbatch);
679	cond_resched();
680	}
681
682	if (truncate_op)
683	(void)hugetlb_unreserve_pages(inode,
684	start: lstart >> huge_page_shift(h),
685	LONG_MAX, freed);
686	}
687
688	static void hugetlbfs_evict_inode(struct inode *inode)
689	{
690	struct resv_map *resv_map;
691
692	remove_inode_hugepages(inode, lstart: `0`, LLONG_MAX);
693
694	/*
695	* Get the resv_map from the address space embedded in the inode.
696	* This is the address space which points to any resv_map allocated
697	* at inode creation time. If this is a device special inode,
698	* i_mapping may not point to the original address space.
699	*/
700	resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
701	/ Only regular and link inodes have associated reserve maps /
702	if (resv_map)
703	resv_map_release(ref: &resv_map->refs);
704	clear_inode(inode);
705	}
706
707	static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
708	{
709	pgoff_t pgoff;
710	struct address_space *mapping = inode->i_mapping;
711	struct hstate *h = hstate_inode(i: inode);
712
713	BUG_ON(offset & ~huge_page_mask(h));
714	pgoff = offset >> PAGE_SHIFT;
715
716	i_size_write(inode, i_size: offset);
717	i_mmap_lock_write(mapping);
718	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
719	hugetlb_vmdelete_list(root: &mapping->i_mmap, start: pgoff, end: `0`,
720	ZAP_FLAG_DROP_MARKER);
721	i_mmap_unlock_write(mapping);
722	remove_inode_hugepages(inode, lstart: offset, LLONG_MAX);
723	}
724
725	static void hugetlbfs_zero_partial_page(struct hstate *h,
726	struct address_space *mapping,
727	loff_t start,
728	loff_t end)
729	{
730	pgoff_t idx = start >> huge_page_shift(h);
731	struct folio *folio;
732
733	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
734	if (IS_ERR(ptr: folio))
735	return;
736
737	start = start & ~huge_page_mask(h);
738	end = end & ~huge_page_mask(h);
739	if (!end)
740	end = huge_page_size(h);
741
742	folio_zero_segment(folio, start: (size_t)start, xend: (size_t)end);
743
744	folio_unlock(folio);
745	folio_put(folio);
746	}
747
748	static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
749	{
750	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
751	struct address_space *mapping = inode->i_mapping;
752	struct hstate *h = hstate_inode(i: inode);
753	loff_t hpage_size = huge_page_size(h);
754	loff_t hole_start, hole_end;
755
756	/*
757	* hole_start and hole_end indicate the full pages within the hole.
758	*/
759	hole_start = round_up(offset, hpage_size);
760	hole_end = round_down(offset + len, hpage_size);
761
762	inode_lock(inode);
763
764	/ protected by i_rwsem /
765	if (info->seals & (F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE)) {
766	inode_unlock(inode);
767	return -EPERM;
768	}
769
770	i_mmap_lock_write(mapping);
771
772	/ If range starts before first full page, zero partial page. /
773	if (offset < hole_start)
774	hugetlbfs_zero_partial_page(h, mapping,
775	start: offset, min(offset + len, hole_start));
776
777	/ Unmap users of full pages in the hole. /
778	if (hole_end > hole_start) {
779	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
780	hugetlb_vmdelete_list(root: &mapping->i_mmap,
781	start: hole_start >> PAGE_SHIFT,
782	end: hole_end >> PAGE_SHIFT, zap_flags: `0`);
783	}
784
785	/ If range extends beyond last full page, zero partial page. /
786	if ((offset + len) > hole_end && (offset + len) > hole_start)
787	hugetlbfs_zero_partial_page(h, mapping,
788	start: hole_end, end: offset + len);
789
790	i_mmap_unlock_write(mapping);
791
792	/ Remove full pages from the file. /
793	if (hole_end > hole_start)
794	remove_inode_hugepages(inode, lstart: hole_start, lend: hole_end);
795
796	inode_unlock(inode);
797
798	return `0`;
799	}
800
801	static long hugetlbfs_fallocate(struct file file, int* mode, loff_t offset,
802	loff_t len)
803	{
804	struct inode *inode = file_inode(f: file);
805	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
806	struct address_space *mapping = inode->i_mapping;
807	struct hstate *h = hstate_inode(i: inode);
808	struct vm_area_struct pseudo_vma;
809	struct mm_struct *mm = current->mm;
810	loff_t hpage_size = huge_page_size(h);
811	unsigned long hpage_shift = huge_page_shift(h);
812	pgoff_t start, index, end;
813	int error;
814	u32 hash;
815
816	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
817	return -EOPNOTSUPP;
818
819	if (mode & FALLOC_FL_PUNCH_HOLE)
820	return hugetlbfs_punch_hole(inode, offset, len);
821
822	/*
823	* Default preallocate case.
824	* For this range, start is rounded down and end is rounded up
825	* as well as being converted to page offsets.
826	*/
827	start = offset >> hpage_shift;
828	end = (offset + len + hpage_size - `1`) >> hpage_shift;
829
830	inode_lock(inode);
831
832	/ We need to check rlimit even when FALLOC_FL_KEEP_SIZE /
833	error = inode_newsize_ok(inode, offset: offset + len);
834	if (error)
835	goto out;
836
837	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
838	error = -EPERM;
839	goto out;
840	}
841
842	/*
843	* Initialize a pseudo vma as this is required by the huge page
844	* allocation routines.
845	*/
846	vma_init(vma: &pseudo_vma, mm);
847	vm_flags_init(vma: &pseudo_vma, VM_HUGETLB \| VM_MAYSHARE \| VM_SHARED);
848	pseudo_vma.vm_file = file;
849
850	for (index = start; index < end; index++) {
851	/*
852	* This is supposed to be the vaddr where the page is being
853	* faulted in, but we have no vaddr here.
854	*/
855	struct folio *folio;
856	unsigned long addr;
857
858	cond_resched();
859
860	/*
861	* fallocate(2) manpage permits EINTR; we may have been
862	* interrupted because we are using up too much memory.
863	*/
864	if (signal_pending(current)) {
865	error = -EINTR;
866	break;
867	}
868
869	/ addr is the offset within the file (zero based) /
870	addr = index * hpage_size;
871
872	/ mutex taken here, fault path and hole punch /
873	hash = hugetlb_fault_mutex_hash(mapping, idx: index);
874	mutex_lock(&hugetlb_fault_mutex_table[hash]);
875
876	/ See if already present in mapping to avoid alloc/free /
877	folio = filemap_get_folio(mapping, index: index << huge_page_order(h));
878	if (!IS_ERR(ptr: folio)) {
879	folio_put(folio);
880	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
881	continue;
882	}
883
884	/*
885	* Allocate folio without setting the avoid_reserve argument.
886	* There certainly are no reserves associated with the
887	* pseudo_vma. However, there could be shared mappings with
888	* reserves for the file at the inode level. If we fallocate
889	* folios in these areas, we need to consume the reserves
890	* to keep reservation accounting consistent.
891	*/
892	folio = alloc_hugetlb_folio(vma: &pseudo_vma, addr, avoid_reserve: `0`);
893	if (IS_ERR(ptr: folio)) {
894	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
895	error = PTR_ERR(ptr: folio);
896	goto out;
897	}
898	clear_huge_page(page: &folio->page, addr_hint: addr, pages_per_huge_page: pages_per_huge_page(h));
899	__folio_mark_uptodate(folio);
900	error = hugetlb_add_to_page_cache(folio, mapping, idx: index);
901	if (unlikely(error)) {
902	restore_reserve_on_error(h, vma: &pseudo_vma, address: addr, folio);
903	folio_put(folio);
904	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
905	goto out;
906	}
907
908	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
909
910	folio_set_hugetlb_migratable(folio);
911	/*
912	* folio_unlock because locked by hugetlb_add_to_page_cache()
913	* folio_put() due to reference from alloc_hugetlb_folio()
914	*/
915	folio_unlock(folio);
916	folio_put(folio);
917	}
918
919	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
920	i_size_write(inode, i_size: offset + len);
921	inode_set_ctime_current(inode);
922	out:
923	inode_unlock(inode);
924	return error;
925	}
926
927	static int hugetlbfs_setattr(struct mnt_idmap *idmap,
928	struct dentry dentry, struct* iattr *attr)
929	{
930	struct inode *inode = d_inode(dentry);
931	struct hstate *h = hstate_inode(i: inode);
932	int error;
933	unsigned int ia_valid = attr->ia_valid;
934	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
935
936	error = setattr_prepare(idmap, dentry, attr);
937	if (error)
938	return error;
939
940	if (ia_valid & ATTR_SIZE) {
941	loff_t oldsize = inode->i_size;
942	loff_t newsize = attr->ia_size;
943
944	if (newsize & ~huge_page_mask(h))
945	return -EINVAL;
946	/ protected by i_rwsem /
947	if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) \|\|
948	(newsize > oldsize && (info->seals & F_SEAL_GROW)))
949	return -EPERM;
950	hugetlb_vmtruncate(inode, offset: newsize);
951	}
952
953	setattr_copy(idmap, inode, attr);
954	mark_inode_dirty(inode);
955	return `0`;
956	}
957
958	static struct inode hugetlbfs_get_root(struct* super_block *sb,
959	struct hugetlbfs_fs_context *ctx)
960	{
961	struct inode *inode;
962
963	inode = new_inode(sb);
964	if (inode) {
965	inode->i_ino = get_next_ino();
966	inode->i_mode = S_IFDIR \| ctx->mode;
967	inode->i_uid = ctx->uid;
968	inode->i_gid = ctx->gid;
969	simple_inode_init_ts(inode);
970	inode->i_op = &hugetlbfs_dir_inode_operations;
971	inode->i_fop = &simple_dir_operations;
972	/ directory inodes start off with i_nlink == 2 (for "." entry) /
973	inc_nlink(inode);
974	lockdep_annotate_inode_mutex_key(inode);
975	}
976	return inode;
977	}
978
979	/*
980	* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
981	* be taken from reclaim -- unlike regular filesystems. This needs an
982	* annotation because huge_pmd_share() does an allocation under hugetlb's
983	* i_mmap_rwsem.
984	*/
985	static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
986
987	static struct inode hugetlbfs_get_inode(struct* super_block *sb,
988	struct mnt_idmap *idmap,
989	struct inode *dir,
990	umode_t mode, dev_t dev)
991	{
992	struct inode *inode;
993	struct resv_map *resv_map = NULL;
994
995	/*
996	* Reserve maps are only needed for inodes that can have associated
997	* page allocations.
998	*/
999	if (S_ISREG(mode) \|\| S_ISLNK(mode)) {
1000	resv_map = resv_map_alloc();
1001	if (!resv_map)
1002	return NULL;
1003	}
1004
1005	inode = new_inode(sb);
1006	if (inode) {
1007	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
1008
1009	inode->i_ino = get_next_ino();
1010	inode_init_owner(idmap, inode, dir, mode);
1011	lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
1012	&hugetlbfs_i_mmap_rwsem_key);
1013	inode->i_mapping->a_ops = &hugetlbfs_aops;
1014	simple_inode_init_ts(inode);
1015	inode->i_mapping->i_private_data = resv_map;
1016	info->seals = F_SEAL_SEAL;
1017	switch (mode & S_IFMT) {
1018	default:
1019	init_special_inode(inode, mode, dev);
1020	break;
1021	case S_IFREG:
1022	inode->i_op = &hugetlbfs_inode_operations;
1023	inode->i_fop = &hugetlbfs_file_operations;
1024	break;
1025	case S_IFDIR:
1026	inode->i_op = &hugetlbfs_dir_inode_operations;
1027	inode->i_fop = &simple_dir_operations;
1028
1029	/ directory inodes start off with i_nlink == 2 (for "." entry) /
1030	inc_nlink(inode);
1031	break;
1032	case S_IFLNK:
1033	inode->i_op = &page_symlink_inode_operations;
1034	inode_nohighmem(inode);
1035	break;
1036	}
1037	lockdep_annotate_inode_mutex_key(inode);
1038	} else {
1039	if (resv_map)
1040	kref_put(kref: &resv_map->refs, release: resv_map_release);
1041	}
1042
1043	return inode;
1044	}
1045
1046	/*
1047	* File creation. Allocate an inode, and we're done..
1048	*/
1049	static int hugetlbfs_mknod(struct mnt_idmap idmap, struct* inode *dir,
1050	struct dentry *dentry, umode_t mode, dev_t dev)
1051	{
1052	struct inode *inode;
1053
1054	inode = hugetlbfs_get_inode(sb: dir->i_sb, idmap, dir, mode, dev);
1055	if (!inode)
1056	return -ENOSPC;
1057	inode_set_mtime_to_ts(inode: dir, ts: inode_set_ctime_current(inode: dir));
1058	d_instantiate(dentry, inode);
1059	dget(dentry);/ Extra count - pin the dentry in core /
1060	return `0`;
1061	}
1062
1063	static int hugetlbfs_mkdir(struct mnt_idmap idmap, struct* inode *dir,
1064	struct dentry *dentry, umode_t mode)
1065	{
1066	int retval = hugetlbfs_mknod(idmap, dir, dentry,
1067	mode: mode \| S_IFDIR, dev: `0`);
1068	if (!retval)
1069	inc_nlink(inode: dir);
1070	return retval;
1071	}
1072
1073	static int hugetlbfs_create(struct mnt_idmap *idmap,
1074	struct inode dir, struct* dentry *dentry,
1075	umode_t mode, bool excl)
1076	{
1077	return hugetlbfs_mknod(idmap, dir, dentry, mode: mode \| S_IFREG, dev: `0`);
1078	}
1079
1080	static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
1081	struct inode dir, struct* file *file,
1082	umode_t mode)
1083	{
1084	struct inode *inode;
1085
1086	inode = hugetlbfs_get_inode(sb: dir->i_sb, idmap, dir, mode: mode \| S_IFREG, dev: `0`);
1087	if (!inode)
1088	return -ENOSPC;
1089	inode_set_mtime_to_ts(inode: dir, ts: inode_set_ctime_current(inode: dir));
1090	d_tmpfile(file, inode);
1091	return finish_open_simple(file, error: `0`);
1092	}
1093
1094	static int hugetlbfs_symlink(struct mnt_idmap *idmap,
1095	struct inode dir, struct* dentry *dentry,
1096	const char *symname)
1097	{
1098	const umode_t mode = S_IFLNK\|S_IRWXUGO;
1099	struct inode *inode;
1100	int error = -ENOSPC;
1101
1102	inode = hugetlbfs_get_inode(sb: dir->i_sb, idmap, dir, mode, dev: `0`);
1103	if (inode) {
1104	int l = strlen(symname)+`1`;
1105	error = page_symlink(inode, symname, len: l);
1106	if (!error) {
1107	d_instantiate(dentry, inode);
1108	dget(dentry);
1109	} else
1110	iput(inode);
1111	}
1112	inode_set_mtime_to_ts(inode: dir, ts: inode_set_ctime_current(inode: dir));
1113
1114	return error;
1115	}
1116
#ifdef CONFIG_MIGRATION
/*
 * Migrate a hugetlbfs folio from @src to @dst within @mapping.
 *
 * After the mapping has been moved, any subpool association is handed
 * over from @src to @dst. Depending on @mode either the full folio
 * contents or only the flags are carried across.
 *
 * Returns MIGRATEPAGE_SUCCESS, or the code returned by
 * migrate_huge_page_move_mapping() when that step fails.
 */
static int hugetlbfs_migrate_folio(struct address_space *mapping,
				struct folio *dst, struct folio *src,
				enum migrate_mode mode)
{
	struct hugepage_subpool *spool;
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, dst, src);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	spool = hugetlb_folio_subpool(src);
	if (spool) {
		hugetlb_set_folio_subpool(dst, spool);
		hugetlb_set_folio_subpool(src, NULL);
	}

	if (mode == MIGRATE_SYNC_NO_COPY)
		folio_migrate_flags(dst, src);
	else
		folio_migrate_copy(dst, src);

	return MIGRATEPAGE_SUCCESS;
}
#else
#define hugetlbfs_migrate_folio NULL
#endif
1144
/*
 * ->error_remove_folio hook for hugetlbfs mappings: performs no work
 * here and always reports success.
 */
static int hugetlbfs_error_remove_folio(struct address_space *mapping,
					struct folio *folio)
{
	return 0;
}
1150
1151	/*
1152	* Display the mount options in /proc/mounts.
1153	*/
1154	static int hugetlbfs_show_options(struct seq_file m, struct* dentry *root)
1155	{
1156	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb: root->d_sb);
1157	struct hugepage_subpool *spool = sbinfo->spool;
1158	unsigned long hpage_size = huge_page_size(h: sbinfo->hstate);
1159	unsigned hpage_shift = huge_page_shift(h: sbinfo->hstate);
1160	char mod;
1161
1162	if (!uid_eq(left: sbinfo->uid, GLOBAL_ROOT_UID))
1163	seq_printf(m, ",uid=%u",
1164	from_kuid_munged(&init_user_ns, sbinfo->uid));
1165	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
1166	seq_printf(m, ",gid=%u",
1167	from_kgid_munged(&init_user_ns, sbinfo->gid));
1168	if (sbinfo->mode != `0755`)
1169	seq_printf(m, ",mode=%o", sbinfo->mode);
1170	if (sbinfo->max_inodes != -`1`)
1171	seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);
1172
1173	hpage_size /= `1024`;
1174	mod = `'K'`;
1175	if (hpage_size >= `1024`) {
1176	hpage_size /= `1024`;
1177	mod = `'M'`;
1178	}
1179	seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
1180	if (spool) {
1181	if (spool->max_hpages != -`1`)
1182	seq_printf(m, ",size=%llu",
1183	(unsigned long long)spool->max_hpages << hpage_shift);
1184	if (spool->min_hpages != -`1`)
1185	seq_printf(m, ",min_size=%llu",
1186	(unsigned long long)spool->min_hpages << hpage_shift);
1187	}
1188	return `0`;
1189	}
1190
1191	static int hugetlbfs_statfs(struct dentry dentry, struct* kstatfs *buf)
1192	{
1193	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb: dentry->d_sb);
1194	struct hstate *h = hstate_inode(i: d_inode(dentry));
1195	u64 id = huge_encode_dev(dev: dentry->d_sb->s_dev);
1196
1197	buf->f_fsid = u64_to_fsid(v: id);
1198	buf->f_type = HUGETLBFS_MAGIC;
1199	buf->f_bsize = huge_page_size(h);
1200	if (sbinfo) {
1201	spin_lock(lock: &sbinfo->stat_lock);
1202	/ If no limits set, just report 0 or -1 for max/free/used*
1203	* blocks, like simple_statfs() */
1204	if (sbinfo->spool) {
1205	long free_pages;
1206
1207	spin_lock_irq(lock: &sbinfo->spool->lock);
1208	buf->f_blocks = sbinfo->spool->max_hpages;
1209	free_pages = sbinfo->spool->max_hpages
1210	- sbinfo->spool->used_hpages;
1211	buf->f_bavail = buf->f_bfree = free_pages;
1212	spin_unlock_irq(lock: &sbinfo->spool->lock);
1213	buf->f_files = sbinfo->max_inodes;
1214	buf->f_ffree = sbinfo->free_inodes;
1215	}
1216	spin_unlock(lock: &sbinfo->stat_lock);
1217	}
1218	buf->f_namelen = NAME_MAX;
1219	return `0`;
1220	}
1221
1222	static void hugetlbfs_put_super(struct super_block *sb)
1223	{
1224	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
1225
1226	if (sbi) {
1227	sb->s_fs_info = NULL;
1228
1229	if (sbi->spool)
1230	hugepage_put_subpool(spool: sbi->spool);
1231
1232	kfree(objp: sbi);
1233	}
1234	}
1235
1236	static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
1237	{
1238	if (sbinfo->free_inodes >= `0`) {
1239	spin_lock(lock: &sbinfo->stat_lock);
1240	if (unlikely(!sbinfo->free_inodes)) {
1241	spin_unlock(lock: &sbinfo->stat_lock);
1242	return `0`;
1243	}
1244	sbinfo->free_inodes--;
1245	spin_unlock(lock: &sbinfo->stat_lock);
1246	}
1247
1248	return `1`;
1249	}
1250
1251	static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
1252	{
1253	if (sbinfo->free_inodes >= `0`) {
1254	spin_lock(lock: &sbinfo->stat_lock);
1255	sbinfo->free_inodes++;
1256	spin_unlock(lock: &sbinfo->stat_lock);
1257	}
1258	}
1259
1260
1261	static struct kmem_cache *hugetlbfs_inode_cachep;
1262
1263	static struct inode hugetlbfs_alloc_inode(struct* super_block *sb)
1264	{
1265	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
1266	struct hugetlbfs_inode_info *p;
1267
1268	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
1269	return NULL;
1270	p = alloc_inode_sb(sb, cache: hugetlbfs_inode_cachep, GFP_KERNEL);
1271	if (unlikely(!p)) {
1272	hugetlbfs_inc_free_inodes(sbinfo);
1273	return NULL;
1274	}
1275	return &p->vfs_inode;
1276	}
1277
1278	static void hugetlbfs_free_inode(struct inode *inode)
1279	{
1280	kmem_cache_free(s: hugetlbfs_inode_cachep, objp: HUGETLBFS_I(inode));
1281	}
1282
1283	static void hugetlbfs_destroy_inode(struct inode *inode)
1284	{
1285	hugetlbfs_inc_free_inodes(sbinfo: HUGETLBFS_SB(sb: inode->i_sb));
1286	}
1287
1288	static const struct address_space_operations hugetlbfs_aops = {
1289	.write_begin = hugetlbfs_write_begin,
1290	.write_end = hugetlbfs_write_end,
1291	.dirty_folio = noop_dirty_folio,
1292	.migrate_folio = hugetlbfs_migrate_folio,
1293	.error_remove_folio = hugetlbfs_error_remove_folio,
1294	};
1295
1296
1297	static void init_once(void *foo)
1298	{
1299	struct hugetlbfs_inode_info *ei = foo;
1300
1301	inode_init_once(&ei->vfs_inode);
1302	}
1303
1304	const struct file_operations hugetlbfs_file_operations = {
1305	.read_iter = hugetlbfs_read_iter,
1306	.mmap = hugetlbfs_file_mmap,
1307	.fsync = noop_fsync,
1308	.get_unmapped_area = hugetlb_get_unmapped_area,
1309	.llseek = default_llseek,
1310	.fallocate = hugetlbfs_fallocate,
1311	};
1312
1313	static const struct inode_operations hugetlbfs_dir_inode_operations = {
1314	.create = hugetlbfs_create,
1315	.lookup = simple_lookup,
1316	.link = simple_link,
1317	.unlink = simple_unlink,
1318	.symlink = hugetlbfs_symlink,
1319	.mkdir = hugetlbfs_mkdir,
1320	.rmdir = simple_rmdir,
1321	.mknod = hugetlbfs_mknod,
1322	.rename = simple_rename,
1323	.setattr = hugetlbfs_setattr,
1324	.tmpfile = hugetlbfs_tmpfile,
1325	};
1326
1327	static const struct inode_operations hugetlbfs_inode_operations = {
1328	.setattr = hugetlbfs_setattr,
1329	};
1330
1331	static const struct super_operations hugetlbfs_ops = {
1332	.alloc_inode = hugetlbfs_alloc_inode,
1333	.free_inode = hugetlbfs_free_inode,
1334	.destroy_inode = hugetlbfs_destroy_inode,
1335	.evict_inode = hugetlbfs_evict_inode,
1336	.statfs = hugetlbfs_statfs,
1337	.put_super = hugetlbfs_put_super,
1338	.show_options = hugetlbfs_show_options,
1339	};
1340
1341	/*
1342	* Convert size option passed from command line to number of huge pages
1343	* in the pool specified by hstate. Size option could be in bytes
1344	* (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
1345	*/
1346	static long
1347	hugetlbfs_size_to_hpages(struct hstate h, unsigned* long long size_opt,
1348	enum hugetlbfs_size_type val_type)
1349	{
1350	if (val_type == NO_SIZE)
1351	return -`1`;
1352
1353	if (val_type == SIZE_PERCENT) {
1354	size_opt <<= huge_page_shift(h);
1355	size_opt *= h->max_huge_pages;
1356	do_div(size_opt, `100`);
1357	}
1358
1359	size_opt >>= huge_page_shift(h);
1360	return size_opt;
1361	}
1362
1363	/*
1364	* Parse one mount parameter.
1365	*/
1366	static int hugetlbfs_parse_param(struct fs_context fc, struct* fs_parameter *param)
1367	{
1368	struct hugetlbfs_fs_context *ctx = fc->fs_private;
1369	struct fs_parse_result result;
1370	struct hstate *h;
1371	char *rest;
1372	unsigned long ps;
1373	int opt;
1374
1375	opt = fs_parse(fc, desc: hugetlb_fs_parameters, param, result: &result);
1376	if (opt < `0`)
1377	return opt;
1378
1379	switch (opt) {
1380	case Opt_uid:
1381	ctx->uid = make_kuid(current_user_ns(), uid: result.uint_32);
1382	if (!uid_valid(uid: ctx->uid))
1383	goto bad_val;
1384	return `0`;
1385
1386	case Opt_gid:
1387	ctx->gid = make_kgid(current_user_ns(), gid: result.uint_32);
1388	if (!gid_valid(gid: ctx->gid))
1389	goto bad_val;
1390	return `0`;
1391
1392	case Opt_mode:
1393	ctx->mode = result.uint_32 & `01777U`;
1394	return `0`;
1395
1396	case Opt_size:
1397	/ memparse() will accept a K/M/G without a digit /
1398	if (!param->string \|\| !isdigit(c: param->string[`0`]))
1399	goto bad_val;
1400	ctx->max_size_opt = memparse(ptr: param->string, retptr: &rest);
1401	ctx->max_val_type = SIZE_STD;
1402	if (*rest == `'%'`)
1403	ctx->max_val_type = SIZE_PERCENT;
1404	return `0`;
1405
1406	case Opt_nr_inodes:
1407	/ memparse() will accept a K/M/G without a digit /
1408	if (!param->string \|\| !isdigit(c: param->string[`0`]))
1409	goto bad_val;
1410	ctx->nr_inodes = memparse(ptr: param->string, retptr: &rest);
1411	return `0`;
1412
1413	case Opt_pagesize:
1414	ps = memparse(ptr: param->string, retptr: &rest);
1415	h = size_to_hstate(size: ps);
1416	if (!h) {
1417	pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
1418	return -EINVAL;
1419	}
1420	ctx->hstate = h;
1421	return `0`;
1422
1423	case Opt_min_size:
1424	/ memparse() will accept a K/M/G without a digit /
1425	if (!param->string \|\| !isdigit(c: param->string[`0`]))
1426	goto bad_val;
1427	ctx->min_size_opt = memparse(ptr: param->string, retptr: &rest);
1428	ctx->min_val_type = SIZE_STD;
1429	if (*rest == `'%'`)
1430	ctx->min_val_type = SIZE_PERCENT;
1431	return `0`;
1432
1433	default:
1434	return -EINVAL;
1435	}
1436
1437	bad_val:
1438	return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
1439	param->string, param->key);
1440	}
1441
1442	/*
1443	* Validate the parsed options.
1444	*/
1445	static int hugetlbfs_validate(struct fs_context *fc)
1446	{
1447	struct hugetlbfs_fs_context *ctx = fc->fs_private;
1448
1449	/*
1450	* Use huge page pool size (in hstate) to convert the size
1451	* options to number of huge pages. If NO_SIZE, -1 is returned.
1452	*/
1453	ctx->max_hpages = hugetlbfs_size_to_hpages(h: ctx->hstate,
1454	size_opt: ctx->max_size_opt,
1455	val_type: ctx->max_val_type);
1456	ctx->min_hpages = hugetlbfs_size_to_hpages(h: ctx->hstate,
1457	size_opt: ctx->min_size_opt,
1458	val_type: ctx->min_val_type);
1459
1460	/*
1461	* If max_size was specified, then min_size must be smaller
1462	*/
1463	if (ctx->max_val_type > NO_SIZE &&
1464	ctx->min_hpages > ctx->max_hpages) {
1465	pr_err("Minimum size can not be greater than maximum size\n");
1466	return -EINVAL;
1467	}
1468
1469	return `0`;
1470	}
1471
1472	static int
1473	hugetlbfs_fill_super(struct super_block sb, struct* fs_context *fc)
1474	{
1475	struct hugetlbfs_fs_context *ctx = fc->fs_private;
1476	struct hugetlbfs_sb_info *sbinfo;
1477
1478	sbinfo = kmalloc(size: sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
1479	if (!sbinfo)
1480	return -ENOMEM;
1481	sb->s_fs_info = sbinfo;
1482	spin_lock_init(&sbinfo->stat_lock);
1483	sbinfo->hstate = ctx->hstate;
1484	sbinfo->max_inodes = ctx->nr_inodes;
1485	sbinfo->free_inodes = ctx->nr_inodes;
1486	sbinfo->spool = NULL;
1487	sbinfo->uid = ctx->uid;
1488	sbinfo->gid = ctx->gid;
1489	sbinfo->mode = ctx->mode;
1490
1491	/*
1492	* Allocate and initialize subpool if maximum or minimum size is
1493	* specified. Any needed reservations (for minimum size) are taken
1494	* when the subpool is created.
1495	*/
1496	if (ctx->max_hpages != -`1` \|\| ctx->min_hpages != -`1`) {
1497	sbinfo->spool = hugepage_new_subpool(h: ctx->hstate,
1498	max_hpages: ctx->max_hpages,
1499	min_hpages: ctx->min_hpages);
1500	if (!sbinfo->spool)
1501	goto out_free;
1502	}
1503	sb->s_maxbytes = MAX_LFS_FILESIZE;
1504	sb->s_blocksize = huge_page_size(h: ctx->hstate);
1505	sb->s_blocksize_bits = huge_page_shift(h: ctx->hstate);
1506	sb->s_magic = HUGETLBFS_MAGIC;
1507	sb->s_op = &hugetlbfs_ops;
1508	sb->s_time_gran = `1`;
1509
1510	/*
1511	* Due to the special and limited functionality of hugetlbfs, it does
1512	* not work well as a stacking filesystem.
1513	*/
1514	sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
1515	sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
1516	if (!sb->s_root)
1517	goto out_free;
1518	return `0`;
1519	out_free:
1520	kfree(objp: sbinfo->spool);
1521	kfree(objp: sbinfo);
1522	return -ENOMEM;
1523	}
1524
1525	static int hugetlbfs_get_tree(struct fs_context *fc)
1526	{
1527	int err = hugetlbfs_validate(fc);
1528	if (err)
1529	return err;
1530	return get_tree_nodev(fc, fill_super: hugetlbfs_fill_super);
1531	}
1532
1533	static void hugetlbfs_fs_context_free(struct fs_context *fc)
1534	{
1535	kfree(objp: fc->fs_private);
1536	}
1537
1538	static const struct fs_context_operations hugetlbfs_fs_context_ops = {
1539	.free = hugetlbfs_fs_context_free,
1540	.parse_param = hugetlbfs_parse_param,
1541	.get_tree = hugetlbfs_get_tree,
1542	};
1543
1544	static int hugetlbfs_init_fs_context(struct fs_context *fc)
1545	{
1546	struct hugetlbfs_fs_context *ctx;
1547
1548	ctx = kzalloc(size: sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
1549	if (!ctx)
1550	return -ENOMEM;
1551
1552	ctx->max_hpages = -`1`; / No limit on size by default /
1553	ctx->nr_inodes = -`1`; / No limit on number of inodes by default /
1554	ctx->uid = current_fsuid();
1555	ctx->gid = current_fsgid();
1556	ctx->mode = `0755`;
1557	ctx->hstate = &default_hstate;
1558	ctx->min_hpages = -`1`; / No default minimum size /
1559	ctx->max_val_type = NO_SIZE;
1560	ctx->min_val_type = NO_SIZE;
1561	fc->fs_private = ctx;
1562	fc->ops = &hugetlbfs_fs_context_ops;
1563	return `0`;
1564	}
1565
1566	static struct file_system_type hugetlbfs_fs_type = {
1567	.name = "hugetlbfs",
1568	.init_fs_context = hugetlbfs_init_fs_context,
1569	.parameters = hugetlb_fs_parameters,
1570	.kill_sb = kill_litter_super,
1571	.fs_flags = FS_ALLOW_IDMAP,
1572	};
1573
1574	static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
1575
1576	static int can_do_hugetlb_shm(void)
1577	{
1578	kgid_t shm_group;
1579	shm_group = make_kgid(from: &init_user_ns, gid: sysctl_hugetlb_shm_group);
1580	return capable(CAP_IPC_LOCK) \|\| in_group_p(shm_group);
1581	}
1582
/*
 * Map an encoded page size (@page_size_log) to the index of the
 * matching hstate, or -1 when hstate_sizelog() finds none.
 */
static int get_hstate_idx(int page_size_log)
{
	struct hstate *h = hstate_sizelog(page_size_log);

	return h ? hstate_index(h) : -1;
}
1591
1592	/*
1593	* Note that size should be aligned to proper hugepage size in caller side,
1594	* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
1595	*/
1596	struct file hugetlb_file_setup(const* char *name, size_t size,
1597	vm_flags_t acctflag, int creat_flags,
1598	int page_size_log)
1599	{
1600	struct inode *inode;
1601	struct vfsmount *mnt;
1602	int hstate_idx;
1603	struct file *file;
1604
1605	hstate_idx = get_hstate_idx(page_size_log);
1606	if (hstate_idx < `0`)
1607	return ERR_PTR(error: -ENODEV);
1608
1609	mnt = hugetlbfs_vfsmount[hstate_idx];
1610	if (!mnt)
1611	return ERR_PTR(error: -ENOENT);
1612
1613	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
1614	struct ucounts *ucounts = current_ucounts();
1615
1616	if (user_shm_lock(size, ucounts)) {
1617	pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
1618	current->comm, current->pid);
1619	user_shm_unlock(size, ucounts);
1620	}
1621	return ERR_PTR(error: -EPERM);
1622	}
1623
1624	file = ERR_PTR(error: -ENOSPC);
1625	/ hugetlbfs_vfsmount[] mounts do not use idmapped mounts. /
1626	inode = hugetlbfs_get_inode(sb: mnt->mnt_sb, idmap: &nop_mnt_idmap, NULL,
1627	S_IFREG \| S_IRWXUGO, dev: `0`);
1628	if (!inode)
1629	goto out;
1630	if (creat_flags == HUGETLB_SHMFS_INODE)
1631	inode->i_flags \|= S_PRIVATE;
1632
1633	inode->i_size = size;
1634	clear_nlink(inode);
1635
1636	if (!hugetlb_reserve_pages(inode, from: `0`,
1637	to: size >> huge_page_shift(h: hstate_inode(i: inode)), NULL,
1638	vm_flags: acctflag))
1639	file = ERR_PTR(error: -ENOMEM);
1640	else
1641	file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
1642	&hugetlbfs_file_operations);
1643	if (!IS_ERR(ptr: file))
1644	return file;
1645
1646	iput(inode);
1647	out:
1648	return file;
1649	}
1650
1651	static struct vfsmount __init mount_one_hugetlbfs(struct* hstate *h)
1652	{
1653	struct fs_context *fc;
1654	struct vfsmount *mnt;
1655
1656	fc = fs_context_for_mount(fs_type: &hugetlbfs_fs_type, SB_KERNMOUNT);
1657	if (IS_ERR(ptr: fc)) {
1658	mnt = ERR_CAST(ptr: fc);
1659	} else {
1660	struct hugetlbfs_fs_context *ctx = fc->fs_private;
1661	ctx->hstate = h;
1662	mnt = fc_mount(fc);
1663	put_fs_context(fc);
1664	}
1665	if (IS_ERR(ptr: mnt))
1666	pr_err("Cannot mount internal hugetlbfs for page size %luK",
1667	huge_page_size(h) / SZ_1K);
1668	return mnt;
1669	}
1670
1671	static int __init init_hugetlbfs_fs(void)
1672	{
1673	struct vfsmount *mnt;
1674	struct hstate *h;
1675	int error;
1676	int i;
1677
1678	if (!hugepages_supported()) {
1679	pr_info("disabling because there are no supported hugepage sizes\n");
1680	return -ENOTSUPP;
1681	}
1682
1683	error = -ENOMEM;
1684	hugetlbfs_inode_cachep = kmem_cache_create(name: "hugetlbfs_inode_cache",
1685	size: sizeof(struct hugetlbfs_inode_info),
1686	align: `0`, SLAB_ACCOUNT, ctor: init_once);
1687	if (hugetlbfs_inode_cachep == NULL)
1688	goto out;
1689
1690	error = register_filesystem(&hugetlbfs_fs_type);
1691	if (error)
1692	goto out_free;
1693
1694	/ default hstate mount is required /
1695	mnt = mount_one_hugetlbfs(h: &default_hstate);
1696	if (IS_ERR(ptr: mnt)) {
1697	error = PTR_ERR(ptr: mnt);
1698	goto out_unreg;
1699	}
1700	hugetlbfs_vfsmount[default_hstate_idx] = mnt;
1701
1702	/ other hstates are optional /
1703	i = `0`;
1704	for_each_hstate(h) {
1705	if (i == default_hstate_idx) {
1706	i++;
1707	continue;
1708	}
1709
1710	mnt = mount_one_hugetlbfs(h);
1711	if (IS_ERR(ptr: mnt))
1712	hugetlbfs_vfsmount[i] = NULL;
1713	else
1714	hugetlbfs_vfsmount[i] = mnt;
1715	i++;
1716	}
1717
1718	return `0`;
1719
1720	out_unreg:
1721	(void)unregister_filesystem(&hugetlbfs_fs_type);
1722	out_free:
1723	kmem_cache_destroy(s: hugetlbfs_inode_cachep);
1724	out:
1725	return error;
1726	}
1727	fs_initcall(init_hugetlbfs_fs)
1728

source code of linux/fs/hugetlbfs/inode.c