memory.c source code [linux/mm/memory.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/mm/memory.c
4	*
5	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6	*/
7
8	/*
9	* demand-loading started 01.12.91 - seems it is high on the list of
10	* things wanted, and it should be easy to implement. - Linus
11	*/
12
13	/*
14	* Ok, demand-loading was easy, shared pages a little bit tricker. Shared
15	* pages started 02.12.91, seems to work. - Linus.
16	*
17	* Tested sharing by executing about 30 /bin/sh: under the old kernel it
18	* would have taken more than the 6M I have free, but it worked well as
19	* far as I could see.
20	*
21	* Also corrected some "invalidate()"s - I wasn't doing enough of them.
22	*/
23
24	/*
25	* Real VM (paging to/from disk) started 18.12.91. Much more work and
26	* thought has to go into this. Oh, well..
27	* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
28	* Found it. Everything seems to work now.
29	* 20.12.91 - Ok, making the swap-device changeable like the root.
30	*/
31
32	/*
33	* 05.04.94 - Multi-page memory management added for v1.1.
34	* Idea by Alex Bligh (alex@cconcepts.co.uk)
35	*
36	* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
37	* (Gerhard.Wichert@pdb.siemens.de)
38	*
39	* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
40	*/
41
42	#include <linux/kernel_stat.h>
43	#include <linux/mm.h>
44	#include <linux/mm_inline.h>
45	#include <linux/sched/mm.h>
46	#include <linux/sched/numa_balancing.h>
47	#include <linux/sched/task.h>
48	#include <linux/hugetlb.h>
49	#include <linux/mman.h>
50	#include <linux/swap.h>
51	#include <linux/highmem.h>
52	#include <linux/pagemap.h>
53	#include <linux/memremap.h>
54	#include <linux/kmsan.h>
55	#include <linux/ksm.h>
56	#include <linux/rmap.h>
57	#include <linux/export.h>
58	#include <linux/delayacct.h>
59	#include <linux/init.h>
60	#include <linux/writeback.h>
61	#include <linux/memcontrol.h>
62	#include <linux/mmu_notifier.h>
63	#include <linux/leafops.h>
64	#include <linux/elf.h>
65	#include <linux/gfp.h>
66	#include <linux/migrate.h>
67	#include <linux/string.h>
68	#include <linux/shmem_fs.h>
69	#include <linux/memory-tiers.h>
70	#include <linux/debugfs.h>
71	#include <linux/userfaultfd_k.h>
72	#include <linux/dax.h>
73	#include <linux/oom.h>
74	#include <linux/numa.h>
75	#include <linux/perf_event.h>
76	#include <linux/ptrace.h>
77	#include <linux/vmalloc.h>
78	#include <linux/sched/sysctl.h>
79	#include <linux/pgalloc.h>
80	#include <linux/uaccess.h>
81
82	#include <trace/events/kmem.h>
83
84	#include <asm/io.h>
85	#include <asm/mmu_context.h>
86	#include <asm/tlb.h>
87	#include <asm/tlbflush.h>
88
89	#include "pgalloc-track.h"
90	#include "internal.h"
91	#include "swap.h"
92
93	#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
94	#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
95	#endif
96
97	static vm_fault_t do_fault(struct vm_fault *vmf);
98	static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
99	static bool vmf_pte_changed(struct vm_fault *vmf);
100
101	/*
102	* Return true if the original pte was a uffd-wp pte marker (so the pte was
103	* wr-protected).
104	*/
105	static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
106	{
107	if (!userfaultfd_wp(vma: vmf->vma))
108	return false;
109	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
110	return false;
111
112	return pte_is_uffd_wp_marker(pte: vmf->orig_pte);
113	}
114
115	/*
116	* Randomize the address space (stacks, mmaps, brk, etc.).
117	*
118	* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
119	* as ancient (libc5 based) binaries can segfault. )
120	*/
121	int randomize_va_space __read_mostly =
122	#ifdef CONFIG_COMPAT_BRK
123	`1`;
124	#else
125	`2`;
126	#endif
127
128	static const struct ctl_table mmu_sysctl_table[] = {
129	{
130	.procname = "randomize_va_space",
131	.data = &randomize_va_space,
132	.maxlen = sizeof(int),
133	.mode = `0644`,
134	.proc_handler = proc_dointvec,
135	},
136	};
137
138	static int __init init_mm_sysctl(void)
139	{
140	register_sysctl_init("kernel", mmu_sysctl_table);
141	return `0`;
142	}
143
144	subsys_initcall(init_mm_sysctl);
145
146	#ifndef arch_wants_old_prefaulted_pte
147	static inline bool arch_wants_old_prefaulted_pte(void)
148	{
149	/*
150	* Transitioning a PTE from 'old' to 'young' can be expensive on
151	* some architectures, even if it's performed in hardware. By
152	* default, "false" means prefaulted entries will be 'young'.
153	*/
154	return false;
155	}
156	#endif
157
158	static int __init disable_randmaps(char *s)
159	{
160	randomize_va_space = `0`;
161	return `1`;
162	}
163	__setup("norandmaps", disable_randmaps);
164
165	unsigned long zero_pfn __read_mostly;
166	EXPORT_SYMBOL(zero_pfn);
167
168	unsigned long highest_memmap_pfn __read_mostly;
169
170	/*
171	* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
172	*/
173	static int __init init_zero_pfn(void)
174	{
175	zero_pfn = page_to_pfn(ZERO_PAGE(`0`));
176	return `0`;
177	}
178	early_initcall(init_zero_pfn);
179
/* Out-of-line wrapper that emits the rss_stat tracepoint for @mm / @member. */
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
	trace_rss_stat(mm, member);
}
184
185	/*
186	* Note: this doesn't free the actual pages themselves. That
187	* has been handled earlier when unmapping all the memory regions.
188	*/
189	static void free_pte_range(struct mmu_gather tlb, pmd_t pmd,
190	unsigned long addr)
191	{
192	pgtable_t token = pmd_pgtable(*pmd);
193	pmd_clear(pmdp: pmd);
194	pte_free_tlb(tlb, token, addr);
195	mm_dec_nr_ptes(mm: tlb->mm);
196	}
197
198	static inline void free_pmd_range(struct mmu_gather tlb, pud_t pud,
199	unsigned long addr, unsigned long end,
200	unsigned long floor, unsigned long ceiling)
201	{
202	pmd_t *pmd;
203	unsigned long next;
204	unsigned long start;
205
206	start = addr;
207	pmd = pmd_offset(pud, address: addr);
208	do {
209	next = pmd_addr_end(addr, end);
210	if (pmd_none_or_clear_bad(pmd))
211	continue;
212	free_pte_range(tlb, pmd, addr);
213	} while (pmd++, addr = next, addr != end);
214
215	start &= PUD_MASK;
216	if (start < floor)
217	return;
218	if (ceiling) {
219	ceiling &= PUD_MASK;
220	if (!ceiling)
221	return;
222	}
223	if (end - `1` > ceiling - `1`)
224	return;
225
226	pmd = pmd_offset(pud, address: start);
227	pud_clear(pudp: pud);
228	pmd_free_tlb(tlb, pmd, start);
229	mm_dec_nr_pmds(mm: tlb->mm);
230	}
231
232	static inline void free_pud_range(struct mmu_gather tlb, p4d_t p4d,
233	unsigned long addr, unsigned long end,
234	unsigned long floor, unsigned long ceiling)
235	{
236	pud_t *pud;
237	unsigned long next;
238	unsigned long start;
239
240	start = addr;
241	pud = pud_offset(p4d, address: addr);
242	do {
243	next = pud_addr_end(addr, end);
244	if (pud_none_or_clear_bad(pud))
245	continue;
246	free_pmd_range(tlb, pud, addr, end: next, floor, ceiling);
247	} while (pud++, addr = next, addr != end);
248
249	start &= P4D_MASK;
250	if (start < floor)
251	return;
252	if (ceiling) {
253	ceiling &= P4D_MASK;
254	if (!ceiling)
255	return;
256	}
257	if (end - `1` > ceiling - `1`)
258	return;
259
260	pud = pud_offset(p4d, address: start);
261	p4d_clear(p4dp: p4d);
262	pud_free_tlb(tlb, pud, start);
263	mm_dec_nr_puds(mm: tlb->mm);
264	}
265
266	static inline void free_p4d_range(struct mmu_gather tlb, pgd_t pgd,
267	unsigned long addr, unsigned long end,
268	unsigned long floor, unsigned long ceiling)
269	{
270	p4d_t *p4d;
271	unsigned long next;
272	unsigned long start;
273
274	start = addr;
275	p4d = p4d_offset(pgd, address: addr);
276	do {
277	next = p4d_addr_end(addr, end);
278	if (p4d_none_or_clear_bad(p4d))
279	continue;
280	free_pud_range(tlb, p4d, addr, end: next, floor, ceiling);
281	} while (p4d++, addr = next, addr != end);
282
283	start &= PGDIR_MASK;
284	if (start < floor)
285	return;
286	if (ceiling) {
287	ceiling &= PGDIR_MASK;
288	if (!ceiling)
289	return;
290	}
291	if (end - `1` > ceiling - `1`)
292	return;
293
294	p4d = p4d_offset(pgd, address: start);
295	pgd_clear(pgd);
296	p4d_free_tlb(tlb, p4d, start);
297	}
298
299	/**
300	* free_pgd_range - Unmap and free page tables in the range
301	* @tlb: the mmu_gather containing pending TLB flush info
302	* @addr: virtual address start
303	* @end: virtual address end
304	* @floor: lowest address boundary
305	* @ceiling: highest address boundary
306	*
307	* This function tears down all user-level page tables in the
308	* specified virtual address range [@addr..@end). It is part of
309	* the memory unmap flow.
310	*/
311	void free_pgd_range(struct mmu_gather *tlb,
312	unsigned long addr, unsigned long end,
313	unsigned long floor, unsigned long ceiling)
314	{
315	pgd_t *pgd;
316	unsigned long next;
317
318	/*
319	* The next few lines have given us lots of grief...
320	*
321	* Why are we testing PMD* at this top level? Because often
322	* there will be no work to do at all, and we'd prefer not to
323	* go all the way down to the bottom just to discover that.
324	*
325	* Why all these "- 1"s? Because 0 represents both the bottom
326	* of the address space and the top of it (using -1 for the
327	* top wouldn't help much: the masks would do the wrong thing).
328	* The rule is that addr 0 and floor 0 refer to the bottom of
329	* the address space, but end 0 and ceiling 0 refer to the top
330	* Comparisons need to use "end - 1" and "ceiling - 1" (though
331	* that end 0 case should be mythical).
332	*
333	* Wherever addr is brought up or ceiling brought down, we must
334	* be careful to reject "the opposite 0" before it confuses the
335	* subsequent tests. But what about where end is brought down
336	* by PMD_SIZE below? no, end can't go down to 0 there.
337	*
338	* Whereas we round start (addr) and ceiling down, by different
339	* masks at different levels, in order to test whether a table
340	* now has no other vmas using it, so can be freed, we don't
341	* bother to round floor or end up - the tests don't need that.
342	*/
343
344	addr &= PMD_MASK;
345	if (addr < floor) {
346	addr += PMD_SIZE;
347	if (!addr)
348	return;
349	}
350	if (ceiling) {
351	ceiling &= PMD_MASK;
352	if (!ceiling)
353	return;
354	}
355	if (end - `1` > ceiling - `1`)
356	end -= PMD_SIZE;
357	if (addr > end - `1`)
358	return;
359	/*
360	* We add page table cache pages with PAGE_SIZE,
361	* (see pte_free_tlb()), flush the tlb if we need
362	*/
363	tlb_change_page_size(tlb, PAGE_SIZE);
364	pgd = pgd_offset(tlb->mm, addr);
365	do {
366	next = pgd_addr_end(addr, end);
367	if (pgd_none_or_clear_bad(pgd))
368	continue;
369	free_p4d_range(tlb, pgd, addr, end: next, floor, ceiling);
370	} while (pgd++, addr = next, addr != end);
371	}
372
373	void free_pgtables(struct mmu_gather tlb, struct* ma_state *mas,
374	struct vm_area_struct vma, unsigned* long floor,
375	unsigned long ceiling, bool mm_wr_locked)
376	{
377	struct unlink_vma_file_batch vb;
378
379	tlb_free_vmas(tlb);
380
381	do {
382	unsigned long addr = vma->vm_start;
383	struct vm_area_struct *next;
384
385	/*
386	* Note: USER_PGTABLES_CEILING may be passed as ceiling and may
387	* be 0. This will underflow and is okay.
388	*/
389	next = mas_find(mas, max: ceiling - `1`);
390	if (unlikely(xa_is_zero(next)))
391	next = NULL;
392
393	/*
394	* Hide vma from rmap and truncate_pagecache before freeing
395	* pgtables
396	*/
397	if (mm_wr_locked)
398	vma_start_write(vma);
399	unlink_anon_vmas(vma);
400
401	unlink_file_vma_batch_init(vb: &vb);
402	unlink_file_vma_batch_add(vb: &vb, vma);
403
404	/*
405	* Optimization: gather nearby vmas into one call down
406	*/
407	while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
408	vma = next;
409	next = mas_find(mas, max: ceiling - `1`);
410	if (unlikely(xa_is_zero(next)))
411	next = NULL;
412	if (mm_wr_locked)
413	vma_start_write(vma);
414	unlink_anon_vmas(vma);
415	unlink_file_vma_batch_add(vb: &vb, vma);
416	}
417	unlink_file_vma_batch_final(vb: &vb);
418
419	free_pgd_range(tlb, addr, end: vma->vm_end,
420	floor, ceiling: next ? next->vm_start : ceiling);
421	vma = next;
422	} while (vma);
423	}
424
425	void pmd_install(struct mm_struct mm, pmd_t pmd, pgtable_t *pte)
426	{
427	spinlock_t *ptl = pmd_lock(mm, pmd);
428
429	if (likely(pmd_none(pmd))) { /* Has another populated it ? /
430	mm_inc_nr_ptes(mm);
431	/*
432	* Ensure all pte setup (eg. pte page lock and page clearing) are
433	* visible before the pte is made visible to other CPUs by being
434	* put into page tables.
435	*
436	* The other side of the story is the pointer chasing in the page
437	* table walking code (when walking the page table without locking;
438	* ie. most of the time). Fortunately, these data accesses consist
439	* of a chain of data-dependent loads, meaning most CPUs (alpha
440	* being the notable exception) will already guarantee loads are
441	* seen in-order. See the alpha page table accessors for the
442	* smp_rmb() barriers in page table walking code.
443	*/
444	smp_wmb(); / Could be smp_wmb__xxx(before\|after)_spin_lock /
445	pmd_populate(mm, pmd, pte: *pte);
446	*pte = NULL;
447	}
448	spin_unlock(lock: ptl);
449	}
450
451	int __pte_alloc(struct mm_struct mm, pmd_t pmd)
452	{
453	pgtable_t new = pte_alloc_one(mm);
454	if (!new)
455	return -ENOMEM;
456
457	pmd_install(mm, pmd, pte: &new);
458	if (new)
459	pte_free(mm, pte_page: new);
460	return `0`;
461	}
462
463	int __pte_alloc_kernel(pmd_t *pmd)
464	{
465	pte_t *new = pte_alloc_one_kernel(&init_mm);
466	if (!new)
467	return -ENOMEM;
468
469	spin_lock(lock: &init_mm.page_table_lock);
470	if (likely(pmd_none(pmd))) { /* Has another populated it ? /
471	smp_wmb(); / See comment in pmd_install() /
472	pmd_populate_kernel(mm: &init_mm, pmd, pte: new);
473	new = NULL;
474	}
475	spin_unlock(lock: &init_mm.page_table_lock);
476	if (new)
477	pte_free_kernel(mm: &init_mm, pte: new);
478	return `0`;
479	}
480
481	static inline void init_rss_vec(int *rss)
482	{
483	memset(rss, `0`, sizeof(int) * NR_MM_COUNTERS);
484	}
485
486	static inline void add_mm_rss_vec(struct mm_struct mm, int* *rss)
487	{
488	int i;
489
490	for (i = `0`; i < NR_MM_COUNTERS; i++)
491	if (rss[i])
492	add_mm_counter(mm, member: i, value: rss[i]);
493	}
494
495	static bool is_bad_page_map_ratelimited(void)
496	{
497	static unsigned long resume;
498	static unsigned long nr_shown;
499	static unsigned long nr_unshown;
500
501	/*
502	* Allow a burst of 60 reports, then keep quiet for that minute;
503	* or allow a steady drip of one report per second.
504	*/
505	if (nr_shown == `60`) {
506	if (time_before(jiffies, resume)) {
507	nr_unshown++;
508	return true;
509	}
510	if (nr_unshown) {
511	pr_alert("BUG: Bad page map: %lu messages suppressed\n",
512	nr_unshown);
513	nr_unshown = `0`;
514	}
515	nr_shown = `0`;
516	}
517	if (nr_shown++ == `0`)
518	resume = jiffies + `60` * HZ;
519	return false;
520	}
521
522	static void __print_bad_page_map_pgtable(struct mm_struct mm, unsigned* long addr)
523	{
524	unsigned long long pgdv, p4dv, pudv, pmdv;
525	p4d_t p4d, *p4dp;
526	pud_t pud, *pudp;
527	pmd_t pmd, *pmdp;
528	pgd_t *pgdp;
529
530	/*
531	* Although this looks like a fully lockless pgtable walk, it is not:
532	* see locking requirements for print_bad_page_map().
533	*/
534	pgdp = pgd_offset(mm, addr);
535	pgdv = pgd_val(pgd: *pgdp);
536
537	if (!pgd_present(pgd: pgdp) \|\| pgd_leaf(pgdp)) {
538	pr_alert("pgd:%08llx\n", pgdv);
539	return;
540	}
541
542	p4dp = p4d_offset(pgd: pgdp, address: addr);
543	p4d = p4dp_get(p4dp);
544	p4dv = p4d_val(p4d);
545
546	if (!p4d_present(p4d) \|\| p4d_leaf(p4d)) {
547	pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
548	return;
549	}
550
551	pudp = pud_offset(p4d: p4dp, address: addr);
552	pud = pudp_get(pudp);
553	pudv = pud_val(pud);
554
555	if (!pud_present(pud) \|\| pud_leaf(pud)) {
556	pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
557	return;
558	}
559
560	pmdp = pmd_offset(pud: pudp, address: addr);
561	pmd = pmdp_get(pmdp);
562	pmdv = pmd_val(pmd);
563
564	/*
565	* Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
566	* because the table should already be mapped by the caller and
567	* doing another map would be bad. print_bad_page_map() should
568	* already take care of printing the PTE.
569	*/
570	pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
571	p4dv, pudv, pmdv);
572	}
573
574	/*
575	* This function is called to print an error when a bad page table entry (e.g.,
576	* corrupted page table entry) is found. For example, we might have a
577	* PFN-mapped pte in a region that doesn't allow it.
578	*
579	* The calling function must still handle the error.
580	*
581	* This function must be called during a proper page table walk, as it will
582	* re-walk the page table to dump information: the caller MUST prevent page
583	* table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
584	* page table lock.
585	*/
586	static void print_bad_page_map(struct vm_area_struct *vma,
587	unsigned long addr, unsigned long long entry, struct page *page,
588	enum pgtable_level level)
589	{
590	struct address_space *mapping;
591	pgoff_t index;
592
593	if (is_bad_page_map_ratelimited())
594	return;
595
596	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
597	index = linear_page_index(vma, address: addr);
598
599	pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
600	pgtable_level_to_str(level), entry);
601	__print_bad_page_map_pgtable(mm: vma->vm_mm, addr);
602	if (page)
603	dump_page(page, reason: "bad page map");
604	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
605	(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
606	pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
607	vma->vm_file,
608	vma->vm_ops ? vma->vm_ops->fault : NULL,
609	vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
610	vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
611	mapping ? mapping->a_ops->read_folio : NULL);
612	dump_stack();
613	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
614	}
615	#define print_bad_pte(vma, addr, pte, page) \
616	print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
617
618	/**
619	* __vm_normal_page() - Get the "struct page" associated with a page table entry.
620	* @vma: The VMA mapping the page table entry.
621	* @addr: The address where the page table entry is mapped.
622	* @pfn: The PFN stored in the page table entry.
623	* @special: Whether the page table entry is marked "special".
624	* @level: The page table level for error reporting purposes only.
625	* @entry: The page table entry value for error reporting purposes only.
626	*
627	* "Special" mappings do not wish to be associated with a "struct page" (either
628	* it doesn't exist, or it exists but they don't want to touch it). In this
629	* case, NULL is returned here. "Normal" mappings do have a struct page and
630	* are ordinarily refcounted.
631	*
632	* Page mappings of the shared zero folios are always considered "special", as
633	* they are not ordinarily refcounted: neither the refcount nor the mapcount
634	* of these folios is adjusted when mapping them into user page tables.
635	* Selected page table walkers (such as GUP) can still identify mappings of the
636	* shared zero folios and work with the underlying "struct page".
637	*
638	* There are 2 broad cases. Firstly, an architecture may define a "special"
639	* page table entry bit, such as pte_special(), in which case this function is
640	* trivial. Secondly, an architecture may not have a spare page table
641	* entry bit, which requires a more complicated scheme, described below.
642	*
643	* With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
644	* page table entries that actually map "normal" pages: however, that page
645	* cannot be looked up through the PFN stored in the page table entry, but
646	* instead will be looked up through vm_ops->find_normal_page(). So far, this
647	* only applies to PTEs.
648	*
649	* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
650	* special mapping (even if there are underlying and valid "struct pages").
651	* COWed pages of a VM_PFNMAP are always normal.
652	*
653	* The way we recognize COWed pages within VM_PFNMAP mappings is through the
654	* rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
655	* set, and the vm_pgoff will point to the first PFN mapped: thus every special
656	* mapping will always honor the rule
657	*
658	* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
659	*
660	* And for normal mappings this is false.
661	*
662	* This restricts such mappings to be a linear translation from virtual address
663	* to pfn. To get around this restriction, we allow arbitrary mappings so long
664	* as the vma is not a COW mapping; in that case, we know that all ptes are
665	* special (because none can have been COWed).
666	*
667	*
668	* In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
669	*
670	* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
671	* page" backing, however the difference is that _all_ pages with a struct
672	* page (that is, those where pfn_valid is true, except the shared zero
673	* folios) are refcounted and considered normal pages by the VM.
674	*
675	* The disadvantage is that pages are refcounted (which can be slower and
676	* simply not an option for some PFNMAP users). The advantage is that we
677	* don't have to follow the strict linearity rule of PFNMAP mappings in
678	* order to support COWable mappings.
679	*
680	* Return: Returns the "struct page" if this is a "normal" mapping. Returns
681	* NULL if this is a "special" mapping.
682	*/
683	static inline struct page __vm_normal_page(struct* vm_area_struct *vma,
684	unsigned long addr, unsigned long pfn, bool special,
685	unsigned long long entry, enum pgtable_level level)
686	{
687	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
688	if (unlikely(special)) {
689	#ifdef CONFIG_FIND_NORMAL_PAGE
690	if (vma->vm_ops && vma->vm_ops->find_normal_page)
691	return vma->vm_ops->find_normal_page(vma, addr);
692	#endif /* CONFIG_FIND_NORMAL_PAGE */
693	if (vma->vm_flags & (VM_PFNMAP \| VM_MIXEDMAP))
694	return NULL;
695	if (is_zero_pfn(pfn) \|\| is_huge_zero_pfn(pfn))
696	return NULL;
697
698	print_bad_page_map(vma, addr, entry, NULL, level);
699	return NULL;
700	}
701	/*
702	* With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
703	* mappings (incl. shared zero folios) are marked accordingly.
704	*/
705	} else {
706	if (unlikely(vma->vm_flags & (VM_PFNMAP \| VM_MIXEDMAP))) {
707	if (vma->vm_flags & VM_MIXEDMAP) {
708	/ If it has a "struct page", it's "normal". /
709	if (!pfn_valid(pfn))
710	return NULL;
711	} else {
712	unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
713
714	/ Only CoW'ed anon folios are "normal". /
715	if (pfn == vma->vm_pgoff + off)
716	return NULL;
717	if (!is_cow_mapping(flags: vma->vm_flags))
718	return NULL;
719	}
720	}
721
722	if (is_zero_pfn(pfn) \|\| is_huge_zero_pfn(pfn))
723	return NULL;
724	}
725
726	if (unlikely(pfn > highest_memmap_pfn)) {
727	/ Corrupted page table entry. /
728	print_bad_page_map(vma, addr, entry, NULL, level);
729	return NULL;
730	}
731	/*
732	* NOTE! We still have PageReserved() pages in the page tables.
733	* For example, VDSO mappings can cause them to exist.
734	*/
735	VM_WARN_ON_ONCE(is_zero_pfn(pfn) \|\| is_huge_zero_pfn(pfn));
736	return pfn_to_page(pfn);
737	}
738
739	/**
740	* vm_normal_page() - Get the "struct page" associated with a PTE
741	* @vma: The VMA mapping the @pte.
742	* @addr: The address where the @pte is mapped.
743	* @pte: The PTE.
744	*
745	* Get the "struct page" associated with a PTE. See __vm_normal_page()
746	* for details on "normal" and "special" mappings.
747	*
748	* Return: Returns the "struct page" if this is a "normal" mapping. Returns
749	* NULL if this is a "special" mapping.
750	*/
751	struct page vm_normal_page(struct* vm_area_struct vma, unsigned* long addr,
752	pte_t pte)
753	{
754	return __vm_normal_page(vma, addr, pfn: pte_pfn(pte), special: pte_special(pte),
755	entry: pte_val(pte), level: PGTABLE_LEVEL_PTE);
756	}
757
758	/**
759	* vm_normal_folio() - Get the "struct folio" associated with a PTE
760	* @vma: The VMA mapping the @pte.
761	* @addr: The address where the @pte is mapped.
762	* @pte: The PTE.
763	*
764	* Get the "struct folio" associated with a PTE. See __vm_normal_page()
765	* for details on "normal" and "special" mappings.
766	*
767	* Return: Returns the "struct folio" if this is a "normal" mapping. Returns
768	* NULL if this is a "special" mapping.
769	*/
770	struct folio vm_normal_folio(struct* vm_area_struct vma, unsigned* long addr,
771	pte_t pte)
772	{
773	struct page *page = vm_normal_page(vma, addr, pte);
774
775	if (page)
776	return page_folio(page);
777	return NULL;
778	}
779
780	#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
781	/**
782	* vm_normal_page_pmd() - Get the "struct page" associated with a PMD
783	* @vma: The VMA mapping the @pmd.
784	* @addr: The address where the @pmd is mapped.
785	* @pmd: The PMD.
786	*
787	* Get the "struct page" associated with a PMD. See __vm_normal_page()
788	* for details on "normal" and "special" mappings.
789	*
790	* Return: Returns the "struct page" if this is a "normal" mapping. Returns
791	* NULL if this is a "special" mapping.
792	*/
793	struct page vm_normal_page_pmd(struct* vm_area_struct vma, unsigned* long addr,
794	pmd_t pmd)
795	{
796	return __vm_normal_page(vma, addr, pfn: pmd_pfn(pmd), special: pmd_special(pmd),
797	entry: pmd_val(pmd), level: PGTABLE_LEVEL_PMD);
798	}
799
800	/**
801	* vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
802	* @vma: The VMA mapping the @pmd.
803	* @addr: The address where the @pmd is mapped.
804	* @pmd: The PMD.
805	*
806	* Get the "struct folio" associated with a PMD. See __vm_normal_page()
807	* for details on "normal" and "special" mappings.
808	*
809	* Return: Returns the "struct folio" if this is a "normal" mapping. Returns
810	* NULL if this is a "special" mapping.
811	*/
812	struct folio vm_normal_folio_pmd(struct* vm_area_struct *vma,
813	unsigned long addr, pmd_t pmd)
814	{
815	struct page *page = vm_normal_page_pmd(vma, addr, pmd);
816
817	if (page)
818	return page_folio(page);
819	return NULL;
820	}
821
822	/**
823	* vm_normal_page_pud() - Get the "struct page" associated with a PUD
824	* @vma: The VMA mapping the @pud.
825	* @addr: The address where the @pud is mapped.
826	* @pud: The PUD.
827	*
828	* Get the "struct page" associated with a PUD. See __vm_normal_page()
829	* for details on "normal" and "special" mappings.
830	*
831	* Return: Returns the "struct page" if this is a "normal" mapping. Returns
832	* NULL if this is a "special" mapping.
833	*/
834	struct page vm_normal_page_pud(struct* vm_area_struct *vma,
835	unsigned long addr, pud_t pud)
836	{
837	return __vm_normal_page(vma, addr, pud_pfn(pud), special: pud_special(pud),
838	entry: pud_val(pud), level: PGTABLE_LEVEL_PUD);
839	}
840	#endif
841
842	/**
843	* restore_exclusive_pte - Restore a device-exclusive entry
844	* @vma: VMA covering @address
845	* @folio: the mapped folio
846	* @page: the mapped folio page
847	* @address: the virtual address
848	* @ptep: pte pointer into the locked page table mapping the folio page
849	* @orig_pte: pte value at @ptep
850	*
851	* Restore a device-exclusive non-swap entry to an ordinary present pte.
852	*
853	* The folio and the page table must be locked, and MMU notifiers must have
854	* been called to invalidate any (exclusive) device mappings.
855	*
856	* Locking the folio makes sure that anybody who just converted the pte to
857	* a device-exclusive entry can map it into the device to make forward
858	* progress without others converting it back until the folio was unlocked.
859	*
860	* If the folio lock ever becomes an issue, we can stop relying on the folio
861	* lock; it might make some scenarios with heavy thrashing less likely to
862	* make forward progress, but these scenarios might not be valid use cases.
863	*
864	* Note that the folio lock does not protect against all cases of concurrent
865	* page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
866	* must use MMU notifiers to sync against any concurrent changes.
867	*/
868	static void restore_exclusive_pte(struct vm_area_struct *vma,
869	struct folio folio, struct* page page, unsigned* long address,
870	pte_t *ptep, pte_t orig_pte)
871	{
872	pte_t pte;
873
874	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
875
876	pte = pte_mkold(pte: mk_pte(page, READ_ONCE(vma->vm_page_prot)));
877	if (pte_swp_soft_dirty(pte: orig_pte))
878	pte = pte_mksoft_dirty(pte);
879
880	if (pte_swp_uffd_wp(pte: orig_pte))
881	pte = pte_mkuffd_wp(pte);
882
883	if ((vma->vm_flags & VM_WRITE) &&
884	can_change_pte_writable(vma, addr: address, pte)) {
885	if (folio_test_dirty(folio))
886	pte = pte_mkdirty(pte);
887	pte = pte_mkwrite(pte, vma);
888	}
889	set_pte_at(vma->vm_mm, address, ptep, pte);
890
891	/*
892	* No need to invalidate - it was non-present before. However
893	* secondary CPUs may have mappings that need invalidating.
894	*/
895	update_mmu_cache(vma, addr: address, ptep);
896	}
897
898	/*
899	* Tries to restore an exclusive pte if the page lock can be acquired without
900	* sleeping.
901	*/
902	static int try_restore_exclusive_pte(struct vm_area_struct *vma,
903	unsigned long addr, pte_t *ptep, pte_t orig_pte)
904	{
905	const softleaf_t entry = softleaf_from_pte(pte: orig_pte);
906	struct page *page = softleaf_to_page(entry);
907	struct folio *folio = page_folio(page);
908
909	if (folio_trylock(folio)) {
910	restore_exclusive_pte(vma, folio, page, address: addr, ptep, orig_pte);
911	folio_unlock(folio);
912	return `0`;
913	}
914
915	return -EBUSY;
916	}
917
918	/*
919	* copy one vm_area from one task to the other. Assumes the page tables
920	* already present in the new task to be cleared in the whole range
921	* covered by this vma.
922	*/
923
924	static unsigned long
925	copy_nonpresent_pte(struct mm_struct dst_mm, struct* mm_struct *src_mm,
926	pte_t dst_pte, pte_t src_pte, struct vm_area_struct *dst_vma,
927	struct vm_area_struct src_vma, unsigned* long addr, int *rss)
928	{
929	vm_flags_t vm_flags = dst_vma->vm_flags;
930	pte_t orig_pte = ptep_get(ptep: src_pte);
931	softleaf_t entry = softleaf_from_pte(pte: orig_pte);
932	pte_t pte = orig_pte;
933	struct folio *folio;
934	struct page *page;
935
936	if (likely(softleaf_is_swap(entry))) {
937	if (swap_duplicate(entry) < `0`)
938	return -EIO;
939
940	/ make sure dst_mm is on swapoff's mmlist. /
941	if (unlikely(list_empty(&dst_mm->mmlist))) {
942	spin_lock(lock: &mmlist_lock);
943	if (list_empty(head: &dst_mm->mmlist))
944	list_add(new: &dst_mm->mmlist,
945	head: &src_mm->mmlist);
946	spin_unlock(lock: &mmlist_lock);
947	}
948	/ Mark the swap entry as shared. /
949	if (pte_swp_exclusive(pte: orig_pte)) {
950	pte = pte_swp_clear_exclusive(pte: orig_pte);
951	set_pte_at(src_mm, addr, src_pte, pte);
952	}
953	rss[MM_SWAPENTS]++;
954	} else if (softleaf_is_migration(entry)) {
955	folio = softleaf_to_folio(entry);
956
957	rss[mm_counter(folio)]++;
958
959	if (!softleaf_is_migration_read(entry) &&
960	is_cow_mapping(flags: vm_flags)) {
961	/*
962	* COW mappings require pages in both parent and child
963	* to be set to read. A previously exclusive entry is
964	* now shared.
965	*/
966	entry = make_readable_migration_entry(
967	offset: swp_offset(entry));
968	pte = softleaf_to_pte(entry);
969	if (pte_swp_soft_dirty(pte: orig_pte))
970	pte = pte_swp_mksoft_dirty(pte);
971	if (pte_swp_uffd_wp(pte: orig_pte))
972	pte = pte_swp_mkuffd_wp(pte);
973	set_pte_at(src_mm, addr, src_pte, pte);
974	}
975	} else if (softleaf_is_device_private(entry)) {
976	page = softleaf_to_page(entry);
977	folio = page_folio(page);
978
979	/*
980	* Update rss count even for unaddressable pages, as
981	* they should treated just like normal pages in this
982	* respect.
983	*
984	* We will likely want to have some new rss counters
985	* for unaddressable pages, at some point. But for now
986	* keep things as they are.
987	*/
988	folio_get(folio);
989	rss[mm_counter(folio)]++;
990	/ Cannot fail as these pages cannot get pinned. /
991	folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);
992
993	/*
994	* We do not preserve soft-dirty information, because so
995	* far, checkpoint/restore is the only feature that
996	* requires that. And checkpoint/restore does not work
997	* when a device driver is involved (you cannot easily
998	* save and restore device driver state).
999	*/
1000	if (softleaf_is_device_private_write(entry) &&
1001	is_cow_mapping(flags: vm_flags)) {
1002	entry = make_readable_device_private_entry(
1003	offset: swp_offset(entry));
1004	pte = swp_entry_to_pte(entry);
1005	if (pte_swp_uffd_wp(pte: orig_pte))
1006	pte = pte_swp_mkuffd_wp(pte);
1007	set_pte_at(src_mm, addr, src_pte, pte);
1008	}
1009	} else if (softleaf_is_device_exclusive(entry)) {
1010	/*
1011	* Make device exclusive entries present by restoring the
1012	* original entry then copying as for a present pte. Device
1013	* exclusive entries currently only support private writable
1014	* (ie. COW) mappings.
1015	*/
1016	VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
1017	if (try_restore_exclusive_pte(vma: src_vma, addr, ptep: src_pte, orig_pte))
1018	return -EBUSY;
1019	return -ENOENT;
1020	} else if (softleaf_is_marker(entry)) {
1021	pte_marker marker = copy_pte_marker(entry, dst_vma);
1022
1023	if (marker)
1024	set_pte_at(dst_mm, addr, dst_pte,
1025	make_pte_marker(marker));
1026	return `0`;
1027	}
1028	if (!userfaultfd_wp(vma: dst_vma))
1029	pte = pte_swp_clear_uffd_wp(pte);
1030	set_pte_at(dst_mm, addr, dst_pte, pte);
1031	return `0`;
1032	}
1033
1034	/*
1035	* Copy a present and normal page.
1036	*
1037	* NOTE! The usual case is that this isn't required;
1038	* instead, the caller can just increase the page refcount
1039	* and re-use the pte the traditional way.
1040	*
1041	* And if we need a pre-allocated page but don't yet have
1042	* one, return a negative error to let the preallocation
1043	* code know so that it can do so outside the page table
1044	* lock.
1045	*/
1046	static inline int
1047	copy_present_page(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1048	pte_t dst_pte, pte_t src_pte, unsigned long addr, int *rss,
1049	struct folio prealloc, struct** page *page)
1050	{
1051	struct folio *new_folio;
1052	pte_t pte;
1053
1054	new_folio = *prealloc;
1055	if (!new_folio)
1056	return -EAGAIN;
1057
1058	/*
1059	* We have a prealloc page, all good! Take it
1060	* over and copy the page & arm it.
1061	*/
1062
1063	if (copy_mc_user_highpage(to: &new_folio->page, from: page, vaddr: addr, vma: src_vma))
1064	return -EHWPOISON;
1065
1066	*prealloc = NULL;
1067	__folio_mark_uptodate(folio: new_folio);
1068	folio_add_new_anon_rmap(new_folio, dst_vma, address: addr, RMAP_EXCLUSIVE);
1069	folio_add_lru_vma(new_folio, dst_vma);
1070	rss[MM_ANONPAGES]++;
1071
1072	/ All done, just insert the new page copy in the child /
1073	pte = folio_mk_pte(folio: new_folio, pgprot: dst_vma->vm_page_prot);
1074	pte = maybe_mkwrite(pte: pte_mkdirty(pte), vma: dst_vma);
1075	if (userfaultfd_pte_wp(vma: dst_vma, pte: ptep_get(ptep: src_pte)))
1076	/ Uffd-wp needs to be delivered to dest pte as well /
1077	pte = pte_mkuffd_wp(pte);
1078	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
1079	return `0`;
1080	}
1081
1082	static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
1083	struct vm_area_struct src_vma, pte_t dst_pte, pte_t *src_pte,
1084	pte_t pte, unsigned long addr, int nr)
1085	{
1086	struct mm_struct *src_mm = src_vma->vm_mm;
1087
1088	/ If it's a COW mapping, write protect it both processes. /
1089	if (is_cow_mapping(flags: src_vma->vm_flags) && pte_write(pte)) {
1090	wrprotect_ptes(mm: src_mm, addr, ptep: src_pte, nr);
1091	pte = pte_wrprotect(pte);
1092	}
1093
1094	/ If it's a shared mapping, mark it clean in the child. /
1095	if (src_vma->vm_flags & VM_SHARED)
1096	pte = pte_mkclean(pte);
1097	pte = pte_mkold(pte);
1098
1099	if (!userfaultfd_wp(vma: dst_vma))
1100	pte = pte_clear_uffd_wp(pte);
1101
1102	set_ptes(mm: dst_vma->vm_mm, addr, ptep: dst_pte, pte, nr);
1103	}
1104
1105	/*
1106	* Copy one present PTE, trying to batch-process subsequent PTEs that map
1107	* consecutive pages of the same folio by copying them as well.
1108	*
1109	* Returns -EAGAIN if one preallocated page is required to copy the next PTE.
1110	* Otherwise, returns the number of copied PTEs (at least 1).
1111	*/
1112	static inline int
1113	copy_present_ptes(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1114	pte_t dst_pte, pte_t src_pte, pte_t pte, unsigned long addr,
1115	int max_nr, int rss, struct* folio **prealloc)
1116	{
1117	fpb_t flags = FPB_MERGE_WRITE;
1118	struct page *page;
1119	struct folio *folio;
1120	int err, nr;
1121
1122	page = vm_normal_page(vma: src_vma, addr, pte);
1123	if (unlikely(!page))
1124	goto copy_pte;
1125
1126	folio = page_folio(page);
1127
1128	/*
1129	* If we likely have to copy, just don't bother with batching. Make
1130	* sure that the common "small folio" case is as fast as possible
1131	* by keeping the batching logic separate.
1132	*/
1133	if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != `1`)) {
1134	if (!(src_vma->vm_flags & VM_SHARED))
1135	flags \|= FPB_RESPECT_DIRTY;
1136	if (vma_soft_dirty_enabled(vma: src_vma))
1137	flags \|= FPB_RESPECT_SOFT_DIRTY;
1138
1139	nr = folio_pte_batch_flags(folio, vma: src_vma, ptep: src_pte, ptentp: &pte, max_nr, flags);
1140	folio_ref_add(folio, nr);
1141	if (folio_test_anon(folio)) {
1142	if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
1143	nr, dst_vma, src_vma))) {
1144	folio_ref_sub(folio, nr);
1145	return -EAGAIN;
1146	}
1147	rss[MM_ANONPAGES] += nr;
1148	VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
1149	} else {
1150	folio_dup_file_rmap_ptes(folio, page, nr_pages: nr, dst_vma);
1151	rss[mm_counter_file(folio)] += nr;
1152	}
1153	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
1154	addr, nr);
1155	return nr;
1156	}
1157
1158	folio_get(folio);
1159	if (folio_test_anon(folio)) {
1160	/*
1161	* If this page may have been pinned by the parent process,
1162	* copy the page immediately for the child so that we'll always
1163	* guarantee the pinned page won't be randomly replaced in the
1164	* future.
1165	*/
1166	if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
1167	/ Page may be pinned, we have to copy. /
1168	folio_put(folio);
1169	err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
1170	addr, rss, prealloc, page);
1171	return err ? err : `1`;
1172	}
1173	rss[MM_ANONPAGES]++;
1174	VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
1175	} else {
1176	folio_dup_file_rmap_pte(folio, page, dst_vma);
1177	rss[mm_counter_file(folio)]++;
1178	}
1179
1180	copy_pte:
1181	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr: `1`);
1182	return `1`;
1183	}
1184
1185	static inline struct folio folio_prealloc(struct* mm_struct *src_mm,
1186	struct vm_area_struct vma, unsigned* long addr, bool need_zero)
1187	{
1188	struct folio *new_folio;
1189
1190	if (need_zero)
1191	new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
1192	else
1193	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, `0`, vma, addr);
1194
1195	if (!new_folio)
1196	return NULL;
1197
1198	if (mem_cgroup_charge(folio: new_folio, mm: src_mm, GFP_KERNEL)) {
1199	folio_put(folio: new_folio);
1200	return NULL;
1201	}
1202	folio_throttle_swaprate(folio: new_folio, GFP_KERNEL);
1203
1204	return new_folio;
1205	}
1206
1207	static int
1208	copy_pte_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1209	pmd_t dst_pmd, pmd_t src_pmd, unsigned long addr,
1210	unsigned long end)
1211	{
1212	struct mm_struct *dst_mm = dst_vma->vm_mm;
1213	struct mm_struct *src_mm = src_vma->vm_mm;
1214	pte_t orig_src_pte, orig_dst_pte;
1215	pte_t src_pte, dst_pte;
1216	pmd_t dummy_pmdval;
1217	pte_t ptent;
1218	spinlock_t src_ptl, dst_ptl;
1219	int progress, max_nr, ret = `0`;
1220	int rss[NR_MM_COUNTERS];
1221	softleaf_t entry = softleaf_mk_none();
1222	struct folio *prealloc = NULL;
1223	int nr;
1224
1225	again:
1226	progress = `0`;
1227	init_rss_vec(rss);
1228
1229	/*
1230	* copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
1231	* error handling here, assume that exclusive mmap_lock on dst and src
1232	* protects anon from unexpected THP transitions; with shmem and file
1233	* protected by mmap_lock-less collapse skipping areas with anon_vma
1234	* (whereas vma_needs_copy() skips areas without anon_vma). A rework
1235	* can remove such assumptions later, but this is good enough for now.
1236	*/
1237	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1238	if (!dst_pte) {
1239	ret = -ENOMEM;
1240	goto out;
1241	}
1242
1243	/*
1244	* We already hold the exclusive mmap_lock, the copy_pte_range() and
1245	* retract_page_tables() are using vma->anon_vma to be exclusive, so
1246	* the PTE page is stable, and there is no need to get pmdval and do
1247	* pmd_same() check.
1248	*/
1249	src_pte = pte_offset_map_rw_nolock(mm: src_mm, pmd: src_pmd, addr, pmdvalp: &dummy_pmdval,
1250	ptlp: &src_ptl);
1251	if (!src_pte) {
1252	pte_unmap_unlock(dst_pte, dst_ptl);
1253	/ ret == 0 /
1254	goto out;
1255	}
1256	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1257	orig_src_pte = src_pte;
1258	orig_dst_pte = dst_pte;
1259	arch_enter_lazy_mmu_mode();
1260
1261	do {
1262	nr = `1`;
1263
1264	/*
1265	* We are holding two locks at this point - either of them
1266	* could generate latencies in another task on another CPU.
1267	*/
1268	if (progress >= `32`) {
1269	progress = `0`;
1270	if (need_resched() \|\|
1271	spin_needbreak(lock: src_ptl) \|\| spin_needbreak(lock: dst_ptl))
1272	break;
1273	}
1274	ptent = ptep_get(ptep: src_pte);
1275	if (pte_none(pte: ptent)) {
1276	progress++;
1277	continue;
1278	}
1279	if (unlikely(!pte_present(ptent))) {
1280	ret = copy_nonpresent_pte(dst_mm, src_mm,
1281	dst_pte, src_pte,
1282	dst_vma, src_vma,
1283	addr, rss);
1284	if (ret == -EIO) {
1285	entry = softleaf_from_pte(pte: ptep_get(ptep: src_pte));
1286	break;
1287	} else if (ret == -EBUSY) {
1288	break;
1289	} else if (!ret) {
1290	progress += `8`;
1291	continue;
1292	}
1293	ptent = ptep_get(ptep: src_pte);
1294	VM_WARN_ON_ONCE(!pte_present(ptent));
1295
1296	/*
1297	* Device exclusive entry restored, continue by copying
1298	* the now present pte.
1299	*/
1300	WARN_ON_ONCE(ret != -ENOENT);
1301	}
1302	/ copy_present_ptes() will clear `prealloc' if consumed /*
1303	max_nr = (end - addr) / PAGE_SIZE;
1304	ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
1305	pte: ptent, addr, max_nr, rss, prealloc: &prealloc);
1306	/*
1307	* If we need a pre-allocated page for this pte, drop the
1308	* locks, allocate, and try again.
1309	* If copy failed due to hwpoison in source page, break out.
1310	*/
1311	if (unlikely(ret == -EAGAIN \|\| ret == -EHWPOISON))
1312	break;
1313	if (unlikely(prealloc)) {
1314	/*
1315	* pre-alloc page cannot be reused by next time so as
1316	* to strictly follow mempolicy (e.g., alloc_page_vma()
1317	* will allocate page according to address). This
1318	* could only happen if one pinned pte changed.
1319	*/
1320	folio_put(folio: prealloc);
1321	prealloc = NULL;
1322	}
1323	nr = ret;
1324	progress += `8` * nr;
1325	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
1326	addr != end);
1327
1328	arch_leave_lazy_mmu_mode();
1329	pte_unmap_unlock(orig_src_pte, src_ptl);
1330	add_mm_rss_vec(mm: dst_mm, rss);
1331	pte_unmap_unlock(orig_dst_pte, dst_ptl);
1332	cond_resched();
1333
1334	if (ret == -EIO) {
1335	VM_WARN_ON_ONCE(!entry.val);
1336	if (add_swap_count_continuation(entry, GFP_KERNEL) < `0`) {
1337	ret = -ENOMEM;
1338	goto out;
1339	}
1340	entry.val = `0`;
1341	} else if (ret == -EBUSY \|\| unlikely(ret == -EHWPOISON)) {
1342	goto out;
1343	} else if (ret == -EAGAIN) {
1344	prealloc = folio_prealloc(src_mm, vma: src_vma, addr, need_zero: false);
1345	if (!prealloc)
1346	return -ENOMEM;
1347	} else if (ret < `0`) {
1348	VM_WARN_ON_ONCE(`1`);
1349	}
1350
1351	/ We've captured and resolved the error. Reset, try again. /
1352	ret = `0`;
1353
1354	if (addr != end)
1355	goto again;
1356	out:
1357	if (unlikely(prealloc))
1358	folio_put(folio: prealloc);
1359	return ret;
1360	}
1361
1362	static inline int
1363	copy_pmd_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1364	pud_t dst_pud, pud_t src_pud, unsigned long addr,
1365	unsigned long end)
1366	{
1367	struct mm_struct *dst_mm = dst_vma->vm_mm;
1368	struct mm_struct *src_mm = src_vma->vm_mm;
1369	pmd_t src_pmd, dst_pmd;
1370	unsigned long next;
1371
1372	dst_pmd = pmd_alloc(mm: dst_mm, pud: dst_pud, address: addr);
1373	if (!dst_pmd)
1374	return -ENOMEM;
1375	src_pmd = pmd_offset(pud: src_pud, address: addr);
1376	do {
1377	next = pmd_addr_end(addr, end);
1378	if (pmd_is_huge(pmd: *src_pmd)) {
1379	int err;
1380
1381	VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1382	err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
1383	addr, dst_vma, src_vma);
1384	if (err == -ENOMEM)
1385	return -ENOMEM;
1386	if (!err)
1387	continue;
1388	/ fall through /
1389	}
1390	if (pmd_none_or_clear_bad(pmd: src_pmd))
1391	continue;
1392	if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1393	addr, end: next))
1394	return -ENOMEM;
1395	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
1396	return `0`;
1397	}
1398
1399	static inline int
1400	copy_pud_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1401	p4d_t dst_p4d, p4d_t src_p4d, unsigned long addr,
1402	unsigned long end)
1403	{
1404	struct mm_struct *dst_mm = dst_vma->vm_mm;
1405	struct mm_struct *src_mm = src_vma->vm_mm;
1406	pud_t src_pud, dst_pud;
1407	unsigned long next;
1408
1409	dst_pud = pud_alloc(mm: dst_mm, p4d: dst_p4d, address: addr);
1410	if (!dst_pud)
1411	return -ENOMEM;
1412	src_pud = pud_offset(p4d: src_p4d, address: addr);
1413	do {
1414	next = pud_addr_end(addr, end);
1415	if (pud_trans_huge(pud: *src_pud)) {
1416	int err;
1417
1418	VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1419	err = copy_huge_pud(dst_mm, src_mm,
1420	dst_pud, src_pud, addr, vma: src_vma);
1421	if (err == -ENOMEM)
1422	return -ENOMEM;
1423	if (!err)
1424	continue;
1425	/ fall through /
1426	}
1427	if (pud_none_or_clear_bad(pud: src_pud))
1428	continue;
1429	if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1430	addr, end: next))
1431	return -ENOMEM;
1432	} while (dst_pud++, src_pud++, addr = next, addr != end);
1433	return `0`;
1434	}
1435
1436	static inline int
1437	copy_p4d_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1438	pgd_t dst_pgd, pgd_t src_pgd, unsigned long addr,
1439	unsigned long end)
1440	{
1441	struct mm_struct *dst_mm = dst_vma->vm_mm;
1442	p4d_t src_p4d, dst_p4d;
1443	unsigned long next;
1444
1445	dst_p4d = p4d_alloc(mm: dst_mm, pgd: dst_pgd, address: addr);
1446	if (!dst_p4d)
1447	return -ENOMEM;
1448	src_p4d = p4d_offset(pgd: src_pgd, address: addr);
1449	do {
1450	next = p4d_addr_end(addr, end);
1451	if (p4d_none_or_clear_bad(p4d: src_p4d))
1452	continue;
1453	if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1454	addr, end: next))
1455	return -ENOMEM;
1456	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
1457	return `0`;
1458	}
1459
1460	/*
1461	* Return true if the vma needs to copy the pgtable during this fork(). Return
1462	* false when we can speed up fork() by allowing lazy page faults later until
1463	* when the child accesses the memory range.
1464	*/
1465	static bool
1466	vma_needs_copy(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma)
1467	{
1468	/*
1469	* We check against dst_vma as while sane VMA flags will have been
1470	* copied, VM_UFFD_WP may be set only on dst_vma.
1471	*/
1472	if (dst_vma->vm_flags & VM_COPY_ON_FORK)
1473	return true;
1474	/*
1475	* The presence of an anon_vma indicates an anonymous VMA has page
1476	* tables which naturally cannot be reconstituted on page fault.
1477	*/
1478	if (src_vma->anon_vma)
1479	return true;
1480
1481	/*
1482	* Don't copy ptes where a page fault will fill them correctly. Fork
1483	* becomes much lighter when there are big shared or private readonly
1484	* mappings. The tradeoff is that copy_page_range is more efficient
1485	* than faulting.
1486	*/
1487	return false;
1488	}
1489
1490	int
1491	copy_page_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma)
1492	{
1493	pgd_t src_pgd, dst_pgd;
1494	unsigned long addr = src_vma->vm_start;
1495	unsigned long end = src_vma->vm_end;
1496	struct mm_struct *dst_mm = dst_vma->vm_mm;
1497	struct mm_struct *src_mm = src_vma->vm_mm;
1498	struct mmu_notifier_range range;
1499	unsigned long next;
1500	bool is_cow;
1501	int ret;
1502
1503	if (!vma_needs_copy(dst_vma, src_vma))
1504	return `0`;
1505
1506	if (is_vm_hugetlb_page(vma: src_vma))
1507	return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
1508
1509	/*
1510	* We need to invalidate the secondary MMU mappings only when
1511	* there could be a permission downgrade on the ptes of the
1512	* parent mm. And a permission downgrade will only happen if
1513	* is_cow_mapping() returns true.
1514	*/
1515	is_cow = is_cow_mapping(flags: src_vma->vm_flags);
1516
1517	if (is_cow) {
1518	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_PROTECTION_PAGE,
1519	flags: `0`, mm: src_mm, start: addr, end);
1520	mmu_notifier_invalidate_range_start(range: &range);
1521	/*
1522	* Disabling preemption is not needed for the write side, as
1523	* the read side doesn't spin, but goes to the mmap_lock.
1524	*
1525	* Use the raw variant of the seqcount_t write API to avoid
1526	* lockdep complaining about preemptibility.
1527	*/
1528	vma_assert_write_locked(vma: src_vma);
1529	raw_write_seqcount_begin(&src_mm->write_protect_seq);
1530	}
1531
1532	ret = `0`;
1533	dst_pgd = pgd_offset(dst_mm, addr);
1534	src_pgd = pgd_offset(src_mm, addr);
1535	do {
1536	next = pgd_addr_end(addr, end);
1537	if (pgd_none_or_clear_bad(pgd: src_pgd))
1538	continue;
1539	if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1540	addr, next))) {
1541	ret = -ENOMEM;
1542	break;
1543	}
1544	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
1545
1546	if (is_cow) {
1547	raw_write_seqcount_end(&src_mm->write_protect_seq);
1548	mmu_notifier_invalidate_range_end(range: &range);
1549	}
1550	return ret;
1551	}
1552
1553	/ Whether we should zap all COWed (private) pages too /
1554	static inline bool should_zap_cows(struct zap_details *details)
1555	{
1556	/ By default, zap all pages /
1557	if (!details \|\| details->reclaim_pt)
1558	return true;
1559
1560	/ Or, we zap COWed pages only if the caller wants to /
1561	return details->even_cows;
1562	}
1563
/*
 * Decides whether we should zap this folio with the folio pointer specified.
 *
 * Everything is zapped when should_zap_cows() allows it; otherwise only
 * folios that are not anonymous are zapped.
 */
static inline bool should_zap_folio(struct zap_details *details,
				    struct folio *folio)
{
	/* If we can make a decision without looking at the folio.. */
	if (should_zap_cows(details))
		return true;

	/* Otherwise we should only zap non-anon folios */
	return !folio_test_anon(folio);
}
1575
1576	static inline bool zap_drop_markers(struct zap_details *details)
1577	{
1578	if (!details)
1579	return false;
1580
1581	return details->zap_flags & ZAP_FLAG_DROP_MARKER;
1582	}
1583
1584	/*
1585	* This function makes sure that we'll replace the none pte with an uffd-wp
1586	* swap special pte marker when necessary. Must be with the pgtable lock held.
1587	*
1588	* Returns true if uffd-wp ptes was installed, false otherwise.
1589	*/
1590	static inline bool
1591	zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
1592	unsigned long addr, pte_t pte, int* nr,
1593	struct zap_details *details, pte_t pteval)
1594	{
1595	bool was_installed = false;
1596
1597	if (!uffd_supports_wp_marker())
1598	return false;
1599
1600	/ Zap on anonymous always means dropping everything /
1601	if (vma_is_anonymous(vma))
1602	return false;
1603
1604	if (zap_drop_markers(details))
1605	return false;
1606
1607	for (;;) {
1608	/ the PFN in the PTE is irrelevant. /
1609	if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
1610	was_installed = true;
1611	if (--nr == `0`)
1612	break;
1613	pte++;
1614	addr += PAGE_SIZE;
1615	}
1616
1617	return was_installed;
1618	}
1619
1620	static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
1621	struct vm_area_struct vma, struct* folio *folio,
1622	struct page page, pte_t pte, pte_t ptent, unsigned int nr,
1623	unsigned long addr, struct zap_details details, int* *rss,
1624	bool force_flush, bool force_break, bool *any_skipped)
1625	{
1626	struct mm_struct *mm = tlb->mm;
1627	bool delay_rmap = false;
1628
1629	if (!folio_test_anon(folio)) {
1630	ptent = get_and_clear_full_ptes(mm, addr, ptep: pte, nr, full: tlb->fullmm);
1631	if (pte_dirty(pte: ptent)) {
1632	folio_mark_dirty(folio);
1633	if (tlb_delay_rmap(tlb)) {
1634	delay_rmap = true;
1635	*force_flush = true;
1636	}
1637	}
1638	if (pte_young(pte: ptent) && likely(vma_has_recency(vma)))
1639	folio_mark_accessed(folio);
1640	rss[mm_counter(folio)] -= nr;
1641	} else {
1642	/ We don't need up-to-date accessed/dirty bits. /
1643	clear_full_ptes(mm, addr, ptep: pte, nr, full: tlb->fullmm);
1644	rss[MM_ANONPAGES] -= nr;
1645	}
1646	/ Checking a single PTE in a batch is sufficient. /
1647	arch_check_zapped_pte(vma, pte: ptent);
1648	tlb_remove_tlb_entries(tlb, ptep: pte, nr, address: addr);
1649	if (unlikely(userfaultfd_pte_wp(vma, ptent)))
1650	*any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
1651	nr, details, pteval: ptent);
1652
1653	if (!delay_rmap) {
1654	folio_remove_rmap_ptes(folio, page, nr_pages: nr, vma);
1655
1656	if (unlikely(folio_mapcount(folio) < `0`))
1657	print_bad_pte(vma, addr, ptent, page);
1658	}
1659	if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
1660	*force_flush = true;
1661	*force_break = true;
1662	}
1663	}
1664
1665	/*
1666	* Zap or skip at least one present PTE, trying to batch-process subsequent
1667	* PTEs that map consecutive pages of the same folio.
1668	*
1669	* Returns the number of processed (skipped or zapped) PTEs (at least 1).
1670	*/
1671	static inline int zap_present_ptes(struct mmu_gather *tlb,
1672	struct vm_area_struct vma, pte_t pte, pte_t ptent,
1673	unsigned int max_nr, unsigned long addr,
1674	struct zap_details details, int* rss, bool force_flush,
1675	bool force_break, bool any_skipped)
1676	{
1677	struct mm_struct *mm = tlb->mm;
1678	struct folio *folio;
1679	struct page *page;
1680	int nr;
1681
1682	page = vm_normal_page(vma, addr, pte: ptent);
1683	if (!page) {
1684	/ We don't need up-to-date accessed/dirty bits. /
1685	ptep_get_and_clear_full(mm, addr, ptep: pte, full: tlb->fullmm);
1686	arch_check_zapped_pte(vma, pte: ptent);
1687	tlb_remove_tlb_entry(tlb, pte, addr);
1688	if (userfaultfd_pte_wp(vma, pte: ptent))
1689	*any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
1690	pte, nr: `1`, details, pteval: ptent);
1691	ksm_might_unmap_zero_page(mm, pte: ptent);
1692	return `1`;
1693	}
1694
1695	folio = page_folio(page);
1696	if (unlikely(!should_zap_folio(details, folio))) {
1697	*any_skipped = true;
1698	return `1`;
1699	}
1700
1701	/*
1702	* Make sure that the common "small folio" case is as fast as possible
1703	* by keeping the batching logic separate.
1704	*/
1705	if (unlikely(folio_test_large(folio) && max_nr != `1`)) {
1706	nr = folio_pte_batch(folio, ptep: pte, pte: ptent, max_nr);
1707	zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
1708	addr, details, rss, force_flush,
1709	force_break, any_skipped);
1710	return nr;
1711	}
1712	zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr: `1`, addr,
1713	details, rss, force_flush, force_break, any_skipped);
1714	return `1`;
1715	}
1716
1717	static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
1718	struct vm_area_struct vma, pte_t pte, pte_t ptent,
1719	unsigned int max_nr, unsigned long addr,
1720	struct zap_details details, int* rss, bool any_skipped)
1721	{
1722	softleaf_t entry;
1723	int nr = `1`;
1724
1725	*any_skipped = true;
1726	entry = softleaf_from_pte(pte: ptent);
1727	if (softleaf_is_device_private(entry) \|\|
1728	softleaf_is_device_exclusive(entry)) {
1729	struct page *page = softleaf_to_page(entry);
1730	struct folio *folio = page_folio(page);
1731
1732	if (unlikely(!should_zap_folio(details, folio)))
1733	return `1`;
1734	/*
1735	* Both device private/exclusive mappings should only
1736	* work with anonymous page so far, so we don't need to
1737	* consider uffd-wp bit when zap. For more information,
1738	* see zap_install_uffd_wp_if_needed().
1739	*/
1740	WARN_ON_ONCE(!vma_is_anonymous(vma));
1741	rss[mm_counter(folio)]--;
1742	folio_remove_rmap_pte(folio, page, vma);
1743	folio_put(folio);
1744	} else if (softleaf_is_swap(entry)) {
1745	/ Genuine swap entries, hence a private anon pages /
1746	if (!should_zap_cows(details))
1747	return `1`;
1748
1749	nr = swap_pte_batch(start_ptep: pte, max_nr, pte: ptent);
1750	rss[MM_SWAPENTS] -= nr;
1751	free_swap_and_cache_nr(entry, nr);
1752	} else if (softleaf_is_migration(entry)) {
1753	struct folio *folio = softleaf_to_folio(entry);
1754
1755	if (!should_zap_folio(details, folio))
1756	return `1`;
1757	rss[mm_counter(folio)]--;
1758	} else if (softleaf_is_uffd_wp_marker(entry)) {
1759	/*
1760	* For anon: always drop the marker; for file: only
1761	* drop the marker if explicitly requested.
1762	*/
1763	if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
1764	return `1`;
1765	} else if (softleaf_is_guard_marker(entry)) {
1766	/*
1767	* Ordinary zapping should not remove guard PTE
1768	* markers. Only do so if we should remove PTE markers
1769	* in general.
1770	*/
1771	if (!zap_drop_markers(details))
1772	return `1`;
1773	} else if (softleaf_is_hwpoison(entry) \|\|
1774	softleaf_is_poison_marker(entry)) {
1775	if (!should_zap_cows(details))
1776	return `1`;
1777	} else {
1778	/ We should have covered all the swap entry types /
1779	pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
1780	WARN_ON_ONCE(`1`);
1781	}
1782	clear_not_present_full_ptes(mm: vma->vm_mm, addr, ptep: pte, nr, full: tlb->fullmm);
1783	*any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, pteval: ptent);
1784
1785	return nr;
1786	}
1787
1788	static inline int do_zap_pte_range(struct mmu_gather *tlb,
1789	struct vm_area_struct vma, pte_t pte,
1790	unsigned long addr, unsigned long end,
1791	struct zap_details details, int* *rss,
1792	bool force_flush, bool force_break,
1793	bool *any_skipped)
1794	{
1795	pte_t ptent = ptep_get(ptep: pte);
1796	int max_nr = (end - addr) / PAGE_SIZE;
1797	int nr = `0`;
1798
1799	/ Skip all consecutive none ptes /
1800	if (pte_none(pte: ptent)) {
1801	for (nr = `1`; nr < max_nr; nr++) {
1802	ptent = ptep_get(ptep: pte + nr);
1803	if (!pte_none(pte: ptent))
1804	break;
1805	}
1806	max_nr -= nr;
1807	if (!max_nr)
1808	return nr;
1809	pte += nr;
1810	addr += nr * PAGE_SIZE;
1811	}
1812
1813	if (pte_present(a: ptent))
1814	nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
1815	details, rss, force_flush, force_break,
1816	any_skipped);
1817	else
1818	nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
1819	details, rss, any_skipped);
1820
1821	return nr;
1822	}
1823
1824	static unsigned long zap_pte_range(struct mmu_gather *tlb,
1825	struct vm_area_struct vma, pmd_t pmd,
1826	unsigned long addr, unsigned long end,
1827	struct zap_details *details)
1828	{
1829	bool force_flush = false, force_break = false;
1830	struct mm_struct *mm = tlb->mm;
1831	int rss[NR_MM_COUNTERS];
1832	spinlock_t *ptl;
1833	pte_t *start_pte;
1834	pte_t *pte;
1835	pmd_t pmdval;
1836	unsigned long start = addr;
1837	bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
1838	bool direct_reclaim = true;
1839	int nr;
1840
1841	retry:
1842	tlb_change_page_size(tlb, PAGE_SIZE);
1843	init_rss_vec(rss);
1844	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, ptlp: &ptl);
1845	if (!pte)
1846	return addr;
1847
1848	flush_tlb_batched_pending(mm);
1849	arch_enter_lazy_mmu_mode();
1850	do {
1851	bool any_skipped = false;
1852
1853	if (need_resched()) {
1854	direct_reclaim = false;
1855	break;
1856	}
1857
1858	nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
1859	force_flush: &force_flush, force_break: &force_break, any_skipped: &any_skipped);
1860	if (any_skipped)
1861	can_reclaim_pt = false;
1862	if (unlikely(force_break)) {
1863	addr += nr * PAGE_SIZE;
1864	direct_reclaim = false;
1865	break;
1866	}
1867	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
1868
1869	/*
1870	* Fast path: try to hold the pmd lock and unmap the PTE page.
1871	*
1872	* If the pte lock was released midway (retry case), or if the attempt
1873	* to hold the pmd lock failed, then we need to recheck all pte entries
1874	* to ensure they are still none, thereby preventing the pte entries
1875	* from being repopulated by another thread.
1876	*/
1877	if (can_reclaim_pt && direct_reclaim && addr == end)
1878	direct_reclaim = try_get_and_clear_pmd(mm, pmd, pmdval: &pmdval);
1879
1880	add_mm_rss_vec(mm, rss);
1881	arch_leave_lazy_mmu_mode();
1882
1883	/ Do the actual TLB flush before dropping ptl /
1884	if (force_flush) {
1885	tlb_flush_mmu_tlbonly(tlb);
1886	tlb_flush_rmaps(tlb, vma);
1887	}
1888	pte_unmap_unlock(start_pte, ptl);
1889
1890	/*
1891	* If we forced a TLB flush (either due to running out of
1892	* batch buffers or because we needed to flush dirty TLB
1893	* entries before releasing the ptl), free the batched
1894	* memory too. Come back again if we didn't do everything.
1895	*/
1896	if (force_flush)
1897	tlb_flush_mmu(tlb);
1898
1899	if (addr != end) {
1900	cond_resched();
1901	force_flush = false;
1902	force_break = false;
1903	goto retry;
1904	}
1905
1906	if (can_reclaim_pt) {
1907	if (direct_reclaim)
1908	free_pte(mm, addr: start, tlb, pmdval);
1909	else
1910	try_to_free_pte(mm, pmd, addr: start, tlb);
1911	}
1912
1913	return addr;
1914	}
1915
1916	static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1917	struct vm_area_struct vma, pud_t pud,
1918	unsigned long addr, unsigned long end,
1919	struct zap_details *details)
1920	{
1921	pmd_t *pmd;
1922	unsigned long next;
1923
1924	pmd = pmd_offset(pud, address: addr);
1925	do {
1926	next = pmd_addr_end(addr, end);
1927	if (pmd_is_huge(pmd: *pmd)) {
1928	if (next - addr != HPAGE_PMD_SIZE)
1929	__split_huge_pmd(vma, pmd, address: addr, freeze: false);
1930	else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
1931	addr = next;
1932	continue;
1933	}
1934	/ fall through /
1935	} else if (details && details->single_folio &&
1936	folio_test_pmd_mappable(folio: details->single_folio) &&
1937	next - addr == HPAGE_PMD_SIZE && pmd_none(pmd: *pmd)) {
1938	spinlock_t *ptl = pmd_lock(mm: tlb->mm, pmd);
1939	/*
1940	* Take and drop THP pmd lock so that we cannot return
1941	* prematurely, while zap_huge_pmd() has cleared *pmd,
1942	* but not yet decremented compound_mapcount().
1943	*/
1944	spin_unlock(lock: ptl);
1945	}
1946	if (pmd_none(pmd: *pmd)) {
1947	addr = next;
1948	continue;
1949	}
1950	addr = zap_pte_range(tlb, vma, pmd, addr, end: next, details);
1951	if (addr != next)
1952	pmd--;
1953	} while (pmd++, cond_resched(), addr != end);
1954
1955	return addr;
1956	}
1957
1958	static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1959	struct vm_area_struct vma, p4d_t p4d,
1960	unsigned long addr, unsigned long end,
1961	struct zap_details *details)
1962	{
1963	pud_t *pud;
1964	unsigned long next;
1965
1966	pud = pud_offset(p4d, address: addr);
1967	do {
1968	next = pud_addr_end(addr, end);
1969	if (pud_trans_huge(pud: *pud)) {
1970	if (next - addr != HPAGE_PUD_SIZE)
1971	split_huge_pud(vma, pud, addr);
1972	else if (zap_huge_pud(tlb, vma, pud, addr))
1973	goto next;
1974	/ fall through /
1975	}
1976	if (pud_none_or_clear_bad(pud))
1977	continue;
1978	next = zap_pmd_range(tlb, vma, pud, addr, end: next, details);
1979	next:
1980	cond_resched();
1981	} while (pud++, addr = next, addr != end);
1982
1983	return addr;
1984	}
1985
1986	static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1987	struct vm_area_struct vma, pgd_t pgd,
1988	unsigned long addr, unsigned long end,
1989	struct zap_details *details)
1990	{
1991	p4d_t *p4d;
1992	unsigned long next;
1993
1994	p4d = p4d_offset(pgd, address: addr);
1995	do {
1996	next = p4d_addr_end(addr, end);
1997	if (p4d_none_or_clear_bad(p4d))
1998	continue;
1999	next = zap_pud_range(tlb, vma, p4d, addr, end: next, details);
2000	} while (p4d++, addr = next, addr != end);
2001
2002	return addr;
2003	}
2004
2005	void unmap_page_range(struct mmu_gather *tlb,
2006	struct vm_area_struct *vma,
2007	unsigned long addr, unsigned long end,
2008	struct zap_details *details)
2009	{
2010	pgd_t *pgd;
2011	unsigned long next;
2012
2013	BUG_ON(addr >= end);
2014	tlb_start_vma(tlb, vma);
2015	pgd = pgd_offset(vma->vm_mm, addr);
2016	do {
2017	next = pgd_addr_end(addr, end);
2018	if (pgd_none_or_clear_bad(pgd))
2019	continue;
2020	next = zap_p4d_range(tlb, vma, pgd, addr, end: next, details);
2021	} while (pgd++, addr = next, addr != end);
2022	tlb_end_vma(tlb, vma);
2023	}
2024
2025
2026	static void unmap_single_vma(struct mmu_gather *tlb,
2027	struct vm_area_struct vma, unsigned* long start_addr,
2028	unsigned long end_addr, struct zap_details *details)
2029	{
2030	unsigned long start = max(vma->vm_start, start_addr);
2031	unsigned long end;
2032
2033	if (start >= vma->vm_end)
2034	return;
2035	end = min(vma->vm_end, end_addr);
2036	if (end <= vma->vm_start)
2037	return;
2038
2039	if (vma->vm_file)
2040	uprobe_munmap(vma, start, end);
2041
2042	if (start != end) {
2043	if (unlikely(is_vm_hugetlb_page(vma))) {
2044	/*
2045	* It is undesirable to test vma->vm_file as it
2046	* should be non-null for valid hugetlb area.
2047	* However, vm_file will be NULL in the error
2048	* cleanup path of mmap_region. When
2049	* hugetlbfs ->mmap method fails,
2050	* mmap_region() nullifies vma->vm_file
2051	* before calling this function to clean up.
2052	* Since no pte has actually been setup, it is
2053	* safe to do nothing in this case.
2054	*/
2055	if (vma->vm_file) {
2056	zap_flags_t zap_flags = details ?
2057	details->zap_flags : `0`;
2058	__unmap_hugepage_range(tlb, vma, start, end,
2059	NULL, zap_flags);
2060	}
2061	} else
2062	unmap_page_range(tlb, vma, addr: start, end, details);
2063	}
2064	}
2065
2066	/**
2067	* unmap_vmas - unmap a range of memory covered by a list of vma's
2068	* @tlb: address of the caller's struct mmu_gather
2069	* @mas: the maple state
2070	* @vma: the starting vma
2071	* @start_addr: virtual address at which to start unmapping
2072	* @end_addr: virtual address at which to end unmapping
2073	* @tree_end: The maximum index to check
2074	*
2075	* Unmap all pages in the vma list.
2076	*
2077	* Only addresses between `start' and `end' will be unmapped.
2078	*
2079	* The VMA list must be sorted in ascending virtual address order.
2080	*
2081	* unmap_vmas() assumes that the caller will flush the whole unmapped address
2082	* range after unmap_vmas() returns. So the only responsibility here is to
2083	* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
2084	* drops the lock and schedules.
2085	*/
2086	void unmap_vmas(struct mmu_gather tlb, struct* ma_state *mas,
2087	struct vm_area_struct vma, unsigned* long start_addr,
2088	unsigned long end_addr, unsigned long tree_end)
2089	{
2090	struct mmu_notifier_range range;
2091	struct zap_details details = {
2092	.zap_flags = ZAP_FLAG_DROP_MARKER \| ZAP_FLAG_UNMAP,
2093	/ Careful - we need to zap private pages too! /
2094	.even_cows = true,
2095	};
2096
2097	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_UNMAP, flags: `0`, mm: vma->vm_mm,
2098	start: start_addr, end: end_addr);
2099	mmu_notifier_invalidate_range_start(range: &range);
2100	do {
2101	unsigned long start = start_addr;
2102	unsigned long end = end_addr;
2103	hugetlb_zap_begin(vma, start: &start, end: &end);
2104	unmap_single_vma(tlb, vma, start_addr: start, end_addr: end, details: &details);
2105	hugetlb_zap_end(vma, details: &details);
2106	vma = mas_find(mas, max: tree_end - `1`);
2107	} while (vma && likely(!xa_is_zero(vma)));
2108	mmu_notifier_invalidate_range_end(range: &range);
2109	}
2110
2111	/**
2112	* zap_page_range_single_batched - remove user pages in a given range
2113	* @tlb: pointer to the caller's struct mmu_gather
2114	* @vma: vm_area_struct holding the applicable pages
2115	* @address: starting address of pages to remove
2116	* @size: number of bytes to remove
2117	* @details: details of shared cache invalidation
2118	*
2119	* @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for
2120	* hugetlb, @tlb is flushed and re-initialized by this function.
2121	*/
2122	void zap_page_range_single_batched(struct mmu_gather *tlb,
2123	struct vm_area_struct vma, unsigned* long address,
2124	unsigned long size, struct zap_details *details)
2125	{
2126	const unsigned long end = address + size;
2127	struct mmu_notifier_range range;
2128
2129	VM_WARN_ON_ONCE(!tlb \|\| tlb->mm != vma->vm_mm);
2130
2131	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
2132	start: address, end);
2133	hugetlb_zap_begin(vma, start: &range.start, end: &range.end);
2134	update_hiwater_rss(mm: vma->vm_mm);
2135	mmu_notifier_invalidate_range_start(range: &range);
2136	/*
2137	* unmap 'address-end' not 'range.start-range.end' as range
2138	* could have been expanded for hugetlb pmd sharing.
2139	*/
2140	unmap_single_vma(tlb, vma, start_addr: address, end_addr: end, details);
2141	mmu_notifier_invalidate_range_end(range: &range);
2142	if (is_vm_hugetlb_page(vma)) {
2143	/*
2144	* flush tlb and free resources before hugetlb_zap_end(), to
2145	* avoid concurrent page faults' allocation failure.
2146	*/
2147	tlb_finish_mmu(tlb);
2148	hugetlb_zap_end(vma, details);
2149	tlb_gather_mmu(tlb, mm: vma->vm_mm);
2150	}
2151	}
2152
2153	/**
2154	* zap_page_range_single - remove user pages in a given range
2155	* @vma: vm_area_struct holding the applicable pages
2156	* @address: starting address of pages to zap
2157	* @size: number of bytes to zap
2158	* @details: details of shared cache invalidation
2159	*
2160	* The range must fit into one VMA.
2161	*/
2162	void zap_page_range_single(struct vm_area_struct vma, unsigned* long address,
2163	unsigned long size, struct zap_details *details)
2164	{
2165	struct mmu_gather tlb;
2166
2167	tlb_gather_mmu(tlb: &tlb, mm: vma->vm_mm);
2168	zap_page_range_single_batched(tlb: &tlb, vma, address, size, details);
2169	tlb_finish_mmu(tlb: &tlb);
2170	}
2171
2172	/**
2173	* zap_vma_ptes - remove ptes mapping the vma
2174	* @vma: vm_area_struct holding ptes to be zapped
2175	* @address: starting address of pages to zap
2176	* @size: number of bytes to zap
2177	*
2178	* This function only unmaps ptes assigned to VM_PFNMAP vmas.
2179	*
2180	* The entire address range must be fully contained within the vma.
2181	*
2182	*/
2183	void zap_vma_ptes(struct vm_area_struct vma, unsigned* long address,
2184	unsigned long size)
2185	{
2186	if (!range_in_vma(vma, start: address, end: address + size) \|\|
2187	!(vma->vm_flags & VM_PFNMAP))
2188	return;
2189
2190	zap_page_range_single(vma, address, size, NULL);
2191	}
2192	EXPORT_SYMBOL_GPL(zap_vma_ptes);
2193
2194	static pmd_t walk_to_pmd(struct* mm_struct mm, unsigned* long addr)
2195	{
2196	pgd_t *pgd;
2197	p4d_t *p4d;
2198	pud_t *pud;
2199	pmd_t *pmd;
2200
2201	pgd = pgd_offset(mm, addr);
2202	p4d = p4d_alloc(mm, pgd, address: addr);
2203	if (!p4d)
2204	return NULL;
2205	pud = pud_alloc(mm, p4d, address: addr);
2206	if (!pud)
2207	return NULL;
2208	pmd = pmd_alloc(mm, pud, address: addr);
2209	if (!pmd)
2210	return NULL;
2211
2212	VM_BUG_ON(pmd_trans_huge(*pmd));
2213	return pmd;
2214	}
2215
2216	pte_t __get_locked_pte(struct* mm_struct mm, unsigned* long addr,
2217	spinlock_t **ptl)
2218	{
2219	pmd_t *pmd = walk_to_pmd(mm, addr);
2220
2221	if (!pmd)
2222	return NULL;
2223	return pte_alloc_map_lock(mm, pmd, addr, ptl);
2224	}
2225
2226	static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma)
2227	{
2228	VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP);
2229	/*
2230	* Whoever wants to forbid the zeropage after some zeropages
2231	* might already have been mapped has to scan the page tables and
2232	* bail out on any zeropages. Zeropages in COW mappings can
2233	* be unshared using FAULT_FLAG_UNSHARE faults.
2234	*/
2235	if (mm_forbids_zeropage(vma->vm_mm))
2236	return false;
2237	/ zeropages in COW mappings are common and unproblematic. /
2238	if (is_cow_mapping(flags: vma->vm_flags))
2239	return true;
2240	/ Mappings that do not allow for writable PTEs are unproblematic. /
2241	if (!(vma->vm_flags & (VM_WRITE \| VM_MAYWRITE)))
2242	return true;
2243	/*
2244	* Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could
2245	* find the shared zeropage and longterm-pin it, which would
2246	* be problematic as soon as the zeropage gets replaced by a different
2247	* page due to vma->vm_ops->pfn_mkwrite, because what's mapped would
2248	* now differ to what GUP looked up. FSDAX is incompatible to
2249	* FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see
2250	* check_vma_flags).
2251	*/
2252	return vma->vm_ops && vma->vm_ops->pfn_mkwrite &&
2253	(vma_is_fsdax(vma) \|\| vma->vm_flags & VM_IO);
2254	}
2255
2256	static int validate_page_before_insert(struct vm_area_struct *vma,
2257	struct page *page)
2258	{
2259	struct folio *folio = page_folio(page);
2260
2261	if (!folio_ref_count(folio))
2262	return -EINVAL;
2263	if (unlikely(is_zero_folio(folio))) {
2264	if (!vm_mixed_zeropage_allowed(vma))
2265	return -EINVAL;
2266	return `0`;
2267	}
2268	if (folio_test_anon(folio) \|\| page_has_type(page))
2269	return -EINVAL;
2270	flush_dcache_folio(folio);
2271	return `0`;
2272	}
2273
2274	static int insert_page_into_pte_locked(struct vm_area_struct vma, pte_t pte,
2275	unsigned long addr, struct page *page,
2276	pgprot_t prot, bool mkwrite)
2277	{
2278	struct folio *folio = page_folio(page);
2279	pte_t pteval = ptep_get(ptep: pte);
2280
2281	if (!pte_none(pte: pteval)) {
2282	if (!mkwrite)
2283	return -EBUSY;
2284
2285	/ see insert_pfn(). /
2286	if (pte_pfn(pte: pteval) != page_to_pfn(page)) {
2287	WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
2288	return -EFAULT;
2289	}
2290	pteval = maybe_mkwrite(pte: pteval, vma);
2291	pteval = pte_mkyoung(pte: pteval);
2292	if (ptep_set_access_flags(vma, address: addr, ptep: pte, entry: pteval, dirty: `1`))
2293	update_mmu_cache(vma, addr, ptep: pte);
2294	return `0`;
2295	}
2296
2297	/ Ok, finally just insert the thing.. /
2298	pteval = mk_pte(page, pgprot: prot);
2299	if (unlikely(is_zero_folio(folio))) {
2300	pteval = pte_mkspecial(pte: pteval);
2301	} else {
2302	folio_get(folio);
2303	pteval = mk_pte(page, pgprot: prot);
2304	if (mkwrite) {
2305	pteval = pte_mkyoung(pte: pteval);
2306	pteval = maybe_mkwrite(pte: pte_mkdirty(pte: pteval), vma);
2307	}
2308	inc_mm_counter(mm: vma->vm_mm, member: mm_counter_file(folio));
2309	folio_add_file_rmap_pte(folio, page, vma);
2310	}
2311	set_pte_at(vma->vm_mm, addr, pte, pteval);
2312	return `0`;
2313	}
2314
2315	static int insert_page(struct vm_area_struct vma, unsigned* long addr,
2316	struct page *page, pgprot_t prot, bool mkwrite)
2317	{
2318	int retval;
2319	pte_t *pte;
2320	spinlock_t *ptl;
2321
2322	retval = validate_page_before_insert(vma, page);
2323	if (retval)
2324	goto out;
2325	retval = -ENOMEM;
2326	pte = get_locked_pte(mm: vma->vm_mm, addr, ptl: &ptl);
2327	if (!pte)
2328	goto out;
2329	retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
2330	mkwrite);
2331	pte_unmap_unlock(pte, ptl);
2332	out:
2333	return retval;
2334	}
2335
2336	static int insert_page_in_batch_locked(struct vm_area_struct vma, pte_t pte,
2337	unsigned long addr, struct page *page, pgprot_t prot)
2338	{
2339	int err;
2340
2341	err = validate_page_before_insert(vma, page);
2342	if (err)
2343	return err;
2344	return insert_page_into_pte_locked(vma, pte, addr, page, prot, mkwrite: false);
2345	}
2346
2347	/ insert_pages() amortizes the cost of spinlock operations*
2348	* when inserting pages in a loop.
2349	*/
2350	static int insert_pages(struct vm_area_struct vma, unsigned* long addr,
2351	struct page *pages, unsigned* long *num, pgprot_t prot)
2352	{
2353	pmd_t *pmd = NULL;
2354	pte_t start_pte, pte;
2355	spinlock_t *pte_lock;
2356	struct mm_struct *const mm = vma->vm_mm;
2357	unsigned long curr_page_idx = `0`;
2358	unsigned long remaining_pages_total = *num;
2359	unsigned long pages_to_write_in_pmd;
2360	int ret;
2361	more:
2362	ret = -EFAULT;
2363	pmd = walk_to_pmd(mm, addr);
2364	if (!pmd)
2365	goto out;
2366
2367	pages_to_write_in_pmd = min_t(unsigned long,
2368	remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
2369
2370	/ Allocate the PTE if necessary; takes PMD lock once only. /
2371	ret = -ENOMEM;
2372	if (pte_alloc(mm, pmd))
2373	goto out;
2374
2375	while (pages_to_write_in_pmd) {
2376	int pte_idx = `0`;
2377	const int batch_size = min_t(int, pages_to_write_in_pmd, `8`);
2378
2379	start_pte = pte_offset_map_lock(mm, pmd, addr, ptlp: &pte_lock);
2380	if (!start_pte) {
2381	ret = -EFAULT;
2382	goto out;
2383	}
2384	for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
2385	int err = insert_page_in_batch_locked(vma, pte,
2386	addr, page: pages[curr_page_idx], prot);
2387	if (unlikely(err)) {
2388	pte_unmap_unlock(start_pte, pte_lock);
2389	ret = err;
2390	remaining_pages_total -= pte_idx;
2391	goto out;
2392	}
2393	addr += PAGE_SIZE;
2394	++curr_page_idx;
2395	}
2396	pte_unmap_unlock(start_pte, pte_lock);
2397	pages_to_write_in_pmd -= batch_size;
2398	remaining_pages_total -= batch_size;
2399	}
2400	if (remaining_pages_total)
2401	goto more;
2402	ret = `0`;
2403	out:
2404	*num = remaining_pages_total;
2405	return ret;
2406	}
2407
2408	/**
2409	* vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
2410	* @vma: user vma to map to
2411	* @addr: target start user address of these pages
2412	* @pages: source kernel pages
2413	* @num: in: number of pages to map. out: number of pages that were not
2414	* mapped. (0 means all pages were successfully mapped).
2415	*
2416	* Preferred over vm_insert_page() when inserting multiple pages.
2417	*
2418	* In case of error, we may have mapped a subset of the provided
2419	* pages. It is the caller's responsibility to account for this case.
2420	*
2421	* The same restrictions apply as in vm_insert_page().
2422	*/
2423	int vm_insert_pages(struct vm_area_struct vma, unsigned* long addr,
2424	struct page *pages, unsigned* long *num)
2425	{
2426	const unsigned long end_addr = addr + (num PAGE_SIZE) - `1`;
2427
2428	if (addr < vma->vm_start \|\| end_addr >= vma->vm_end)
2429	return -EFAULT;
2430	if (!(vma->vm_flags & VM_MIXEDMAP)) {
2431	BUG_ON(mmap_read_trylock(vma->vm_mm));
2432	BUG_ON(vma->vm_flags & VM_PFNMAP);
2433	vm_flags_set(vma, VM_MIXEDMAP);
2434	}
2435	/ Defer page refcount checking till we're about to map that page. /
2436	return insert_pages(vma, addr, pages, num, prot: vma->vm_page_prot);
2437	}
2438	EXPORT_SYMBOL(vm_insert_pages);
2439
2440	/**
2441	* vm_insert_page - insert single page into user vma
2442	* @vma: user vma to map to
2443	* @addr: target user address of this page
2444	* @page: source kernel page
2445	*
2446	* This allows drivers to insert individual pages they've allocated
2447	* into a user vma. The zeropage is supported in some VMAs,
2448	* see vm_mixed_zeropage_allowed().
2449	*
2450	* The page has to be a nice clean _individual_ kernel allocation.
2451	* If you allocate a compound page, you need to have marked it as
2452	* such (__GFP_COMP), or manually just split the page up yourself
2453	* (see split_page()).
2454	*
2455	* NOTE! Traditionally this was done with "remap_pfn_range()" which
2456	* took an arbitrary page protection parameter. This doesn't allow
2457	* that. Your vma protection will have to be set up correctly, which
2458	* means that if you want a shared writable mapping, you'd better
2459	* ask for a shared writable mapping!
2460	*
2461	* The page does not need to be reserved.
2462	*
2463	* Usually this function is called from f_op->mmap() handler
2464	* under mm->mmap_lock write-lock, so it can change vma->vm_flags.
2465	* Caller must set VM_MIXEDMAP on vma if it wants to call this
2466	* function from other places, for example from page-fault handler.
2467	*
2468	* Return: %0 on success, negative error code otherwise.
2469	*/
2470	int vm_insert_page(struct vm_area_struct vma, unsigned* long addr,
2471	struct page *page)
2472	{
2473	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
2474	return -EFAULT;
2475	if (!(vma->vm_flags & VM_MIXEDMAP)) {
2476	BUG_ON(mmap_read_trylock(vma->vm_mm));
2477	BUG_ON(vma->vm_flags & VM_PFNMAP);
2478	vm_flags_set(vma, VM_MIXEDMAP);
2479	}
2480	return insert_page(vma, addr, page, prot: vma->vm_page_prot, mkwrite: false);
2481	}
2482	EXPORT_SYMBOL(vm_insert_page);
2483
2484	/*
2485	* __vm_map_pages - maps range of kernel pages into user vma
2486	* @vma: user vma to map to
2487	* @pages: pointer to array of source kernel pages
2488	* @num: number of pages in page array
2489	* @offset: user's requested vm_pgoff
2490	*
2491	* This allows drivers to map range of kernel pages into a user vma.
2492	* The zeropage is supported in some VMAs, see
2493	* vm_mixed_zeropage_allowed().
2494	*
2495	* Return: 0 on success and error code otherwise.
2496	*/
2497	static int __vm_map_pages(struct vm_area_struct vma, struct* page **pages,
2498	unsigned long num, unsigned long offset)
2499	{
2500	unsigned long count = vma_pages(vma);
2501	unsigned long uaddr = vma->vm_start;
2502	int ret, i;
2503
2504	/ Fail if the user requested offset is beyond the end of the object /
2505	if (offset >= num)
2506	return -ENXIO;
2507
2508	/ Fail if the user requested size exceeds available object size /
2509	if (count > num - offset)
2510	return -ENXIO;
2511
2512	for (i = `0`; i < count; i++) {
2513	ret = vm_insert_page(vma, uaddr, pages[offset + i]);
2514	if (ret < `0`)
2515	return ret;
2516	uaddr += PAGE_SIZE;
2517	}
2518
2519	return `0`;
2520	}
2521
2522	/**
2523	* vm_map_pages - maps range of kernel pages starts with non zero offset
2524	* @vma: user vma to map to
2525	* @pages: pointer to array of source kernel pages
2526	* @num: number of pages in page array
2527	*
2528	* Maps an object consisting of @num pages, catering for the user's
2529	* requested vm_pgoff
2530	*
2531	* If we fail to insert any page into the vma, the function will return
2532	* immediately leaving any previously inserted pages present. Callers
2533	* from the mmap handler may immediately return the error as their caller
2534	* will destroy the vma, removing any successfully inserted pages. Other
2535	* callers should make their own arrangements for calling unmap_region().
2536	*
2537	* Context: Process context. Called by mmap handlers.
2538	* Return: 0 on success and error code otherwise.
2539	*/
2540	int vm_map_pages(struct vm_area_struct vma, struct* page **pages,
2541	unsigned long num)
2542	{
2543	return __vm_map_pages(vma, pages, num, offset: vma->vm_pgoff);
2544	}
2545	EXPORT_SYMBOL(vm_map_pages);
2546
/**
 * vm_map_pages_zero - map range of kernel pages starts with zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for the drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);
2566
2567	static vm_fault_t insert_pfn(struct vm_area_struct vma, unsigned* long addr,
2568	unsigned long pfn, pgprot_t prot, bool mkwrite)
2569	{
2570	struct mm_struct *mm = vma->vm_mm;
2571	pte_t *pte, entry;
2572	spinlock_t *ptl;
2573
2574	pte = get_locked_pte(mm, addr, ptl: &ptl);
2575	if (!pte)
2576	return VM_FAULT_OOM;
2577	entry = ptep_get(ptep: pte);
2578	if (!pte_none(pte: entry)) {
2579	if (mkwrite) {
2580	/*
2581	* For read faults on private mappings the PFN passed
2582	* in may not match the PFN we have mapped if the
2583	* mapped PFN is a writeable COW page. In the mkwrite
2584	* case we are creating a writable PTE for a shared
2585	* mapping and we expect the PFNs to match. If they
2586	* don't match, we are likely racing with block
2587	* allocation and mapping invalidation so just skip the
2588	* update.
2589	*/
2590	if (pte_pfn(pte: entry) != pfn) {
2591	WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
2592	goto out_unlock;
2593	}
2594	entry = pte_mkyoung(pte: entry);
2595	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
2596	if (ptep_set_access_flags(vma, address: addr, ptep: pte, entry, dirty: `1`))
2597	update_mmu_cache(vma, addr, ptep: pte);
2598	}
2599	goto out_unlock;
2600	}
2601
2602	/ Ok, finally just insert the thing.. /
2603	entry = pte_mkspecial(pte: pfn_pte(page_nr: pfn, pgprot: prot));
2604
2605	if (mkwrite) {
2606	entry = pte_mkyoung(pte: entry);
2607	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
2608	}
2609
2610	set_pte_at(mm, addr, pte, entry);
2611	update_mmu_cache(vma, addr, ptep: pte); / XXX: why not for insert_page? /
2612
2613	out_unlock:
2614	pte_unmap_unlock(pte, ptl);
2615	return VM_FAULT_NOPAGE;
2616	}
2617
2618	/**
2619	* vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
2620	* @vma: user vma to map to
2621	* @addr: target user address of this page
2622	* @pfn: source kernel pfn
2623	* @pgprot: pgprot flags for the inserted page
2624	*
2625	* This is exactly like vmf_insert_pfn(), except that it allows drivers
2626	* to override pgprot on a per-page basis.
2627	*
2628	* This only makes sense for IO mappings, and it makes no sense for
2629	* COW mappings. In general, using multiple vmas is preferable;
2630	* vmf_insert_pfn_prot should only be used if using multiple VMAs is
2631	* impractical.
2632	*
2633	* pgprot typically only differs from @vma->vm_page_prot when drivers set
2634	* caching- and encryption bits different than those of @vma->vm_page_prot,
2635	* because the caching- or encryption mode may not be known at mmap() time.
2636	*
2637	* This is ok as long as @vma->vm_page_prot is not used by the core vm
2638	* to set caching and encryption bits for those vmas (except for COW pages).
2639	* This is ensured by core vm only modifying these page table entries using
2640	* functions that don't touch caching- or encryption bits, using pte_modify()
2641	* if needed. (See for example mprotect()).
2642	*
2643	* Also when new page-table entries are created, this is only done using the
2644	* fault() callback, and never using the value of vma->vm_page_prot,
2645	* except for page-table entries that point to anonymous pages as the result
2646	* of COW.
2647	*
2648	* Context: Process context. May allocate using %GFP_KERNEL.
2649	* Return: vm_fault_t value.
2650	*/
2651	vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct vma, unsigned* long addr,
2652	unsigned long pfn, pgprot_t pgprot)
2653	{
2654	/*
2655	* Technically, architectures with pte_special can avoid all these
2656	* restrictions (same for remap_pfn_range). However we would like
2657	* consistency in testing and feature parity among all, so we should
2658	* try to keep these invariants in place for everybody.
2659	*/
2660	BUG_ON(!(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)));
2661	BUG_ON((vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) ==
2662	(VM_PFNMAP\|VM_MIXEDMAP));
2663	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2664	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2665
2666	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
2667	return VM_FAULT_SIGBUS;
2668
2669	if (!pfn_modify_allowed(pfn, prot: pgprot))
2670	return VM_FAULT_SIGBUS;
2671
2672	pfnmap_setup_cachemode_pfn(pfn, prot: &pgprot);
2673
2674	return insert_pfn(vma, addr, pfn, prot: pgprot, mkwrite: false);
2675	}
2676	EXPORT_SYMBOL(vmf_insert_pfn_prot);
2677
2678	/**
2679	* vmf_insert_pfn - insert single pfn into user vma
2680	* @vma: user vma to map to
2681	* @addr: target user address of this page
2682	* @pfn: source kernel pfn
2683	*
2684	* Similar to vm_insert_page, this allows drivers to insert individual pages
2685	* they've allocated into a user vma. Same comments apply.
2686	*
2687	* This function should only be called from a vm_ops->fault handler, and
2688	* in that case the handler should return the result of this function.
2689	*
2690	* vma cannot be a COW mapping.
2691	*
2692	* As this is called only for pages that do not currently exist, we
2693	* do not need to flush old virtual caches or the TLB.
2694	*
2695	* Context: Process context. May allocate using %GFP_KERNEL.
2696	* Return: vm_fault_t value.
2697	*/
2698	vm_fault_t vmf_insert_pfn(struct vm_area_struct vma, unsigned* long addr,
2699	unsigned long pfn)
2700	{
2701	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2702	}
2703	EXPORT_SYMBOL(vmf_insert_pfn);
2704
2705	static bool vm_mixed_ok(struct vm_area_struct vma, unsigned* long pfn,
2706	bool mkwrite)
2707	{
2708	if (unlikely(is_zero_pfn(pfn)) &&
2709	(mkwrite \|\| !vm_mixed_zeropage_allowed(vma)))
2710	return false;
2711	/ these checks mirror the abort conditions in vm_normal_page /
2712	if (vma->vm_flags & VM_MIXEDMAP)
2713	return true;
2714	if (is_zero_pfn(pfn))
2715	return true;
2716	return false;
2717	}
2718
2719	static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2720	unsigned long addr, unsigned long pfn, bool mkwrite)
2721	{
2722	pgprot_t pgprot = vma->vm_page_prot;
2723	int err;
2724
2725	if (!vm_mixed_ok(vma, pfn, mkwrite))
2726	return VM_FAULT_SIGBUS;
2727
2728	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
2729	return VM_FAULT_SIGBUS;
2730
2731	pfnmap_setup_cachemode_pfn(pfn, prot: &pgprot);
2732
2733	if (!pfn_modify_allowed(pfn, prot: pgprot))
2734	return VM_FAULT_SIGBUS;
2735
2736	/*
2737	* If we don't have pte special, then we have to use the pfn_valid()
2738	* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we must
2739	* refcount the page if pfn_valid is true (hence insert_page rather
2740	* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
2741	* without pte special, it would there be refcounted as a normal page.
2742	*/
2743	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) {
2744	struct page *page;
2745
2746	/*
2747	* At this point we are committed to insert_page()
2748	* regardless of whether the caller specified flags that
2749	* result in pfn_t_has_page() == false.
2750	*/
2751	page = pfn_to_page(pfn);
2752	err = insert_page(vma, addr, page, prot: pgprot, mkwrite);
2753	} else {
2754	return insert_pfn(vma, addr, pfn, prot: pgprot, mkwrite);
2755	}
2756
2757	if (err == -ENOMEM)
2758	return VM_FAULT_OOM;
2759	if (err < `0` && err != -EBUSY)
2760	return VM_FAULT_SIGBUS;
2761
2762	return VM_FAULT_NOPAGE;
2763	}
2764
2765	vm_fault_t vmf_insert_page_mkwrite(struct vm_fault vmf, struct* page *page,
2766	bool write)
2767	{
2768	pgprot_t pgprot = vmf->vma->vm_page_prot;
2769	unsigned long addr = vmf->address;
2770	int err;
2771
2772	if (addr < vmf->vma->vm_start \|\| addr >= vmf->vma->vm_end)
2773	return VM_FAULT_SIGBUS;
2774
2775	err = insert_page(vma: vmf->vma, addr, page, prot: pgprot, mkwrite: write);
2776	if (err == -ENOMEM)
2777	return VM_FAULT_OOM;
2778	if (err < `0` && err != -EBUSY)
2779	return VM_FAULT_SIGBUS;
2780
2781	return VM_FAULT_NOPAGE;
2782	}
2783	EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
2784
2785	vm_fault_t vmf_insert_mixed(struct vm_area_struct vma, unsigned* long addr,
2786	unsigned long pfn)
2787	{
2788	return __vm_insert_mixed(vma, addr, pfn, mkwrite: false);
2789	}
2790	EXPORT_SYMBOL(vmf_insert_mixed);
2791
2792	/*
2793	* If the insertion of PTE failed because someone else already added a
2794	* different entry in the mean time, we treat that as success as we assume
2795	* the same entry was actually inserted.
2796	*/
2797	vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2798	unsigned long addr, unsigned long pfn)
2799	{
2800	return __vm_insert_mixed(vma, addr, pfn, mkwrite: true);
2801	}
2802
2803	/*
2804	* maps a range of physical memory into the requested pages. the old
2805	* mappings are removed. any references to nonexistent pages results
2806	* in null mappings (currently treated as "copy-on-access")
2807	*/
2808	static int remap_pte_range(struct mm_struct mm, pmd_t pmd,
2809	unsigned long addr, unsigned long end,
2810	unsigned long pfn, pgprot_t prot)
2811	{
2812	pte_t pte, mapped_pte;
2813	spinlock_t *ptl;
2814	int err = `0`;
2815
2816	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2817	if (!pte)
2818	return -ENOMEM;
2819	arch_enter_lazy_mmu_mode();
2820	do {
2821	BUG_ON(!pte_none(ptep_get(pte)));
2822	if (!pfn_modify_allowed(pfn, prot)) {
2823	err = -EACCES;
2824	break;
2825	}
2826	set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2827	pfn++;
2828	} while (pte++, addr += PAGE_SIZE, addr != end);
2829	arch_leave_lazy_mmu_mode();
2830	pte_unmap_unlock(mapped_pte, ptl);
2831	return err;
2832	}
2833
2834	static inline int remap_pmd_range(struct mm_struct mm, pud_t pud,
2835	unsigned long addr, unsigned long end,
2836	unsigned long pfn, pgprot_t prot)
2837	{
2838	pmd_t *pmd;
2839	unsigned long next;
2840	int err;
2841
2842	pfn -= addr >> PAGE_SHIFT;
2843	pmd = pmd_alloc(mm, pud, address: addr);
2844	if (!pmd)
2845	return -ENOMEM;
2846	VM_BUG_ON(pmd_trans_huge(*pmd));
2847	do {
2848	next = pmd_addr_end(addr, end);
2849	err = remap_pte_range(mm, pmd, addr, end: next,
2850	pfn: pfn + (addr >> PAGE_SHIFT), prot);
2851	if (err)
2852	return err;
2853	} while (pmd++, addr = next, addr != end);
2854	return `0`;
2855	}
2856
2857	static inline int remap_pud_range(struct mm_struct mm, p4d_t p4d,
2858	unsigned long addr, unsigned long end,
2859	unsigned long pfn, pgprot_t prot)
2860	{
2861	pud_t *pud;
2862	unsigned long next;
2863	int err;
2864
2865	pfn -= addr >> PAGE_SHIFT;
2866	pud = pud_alloc(mm, p4d, address: addr);
2867	if (!pud)
2868	return -ENOMEM;
2869	do {
2870	next = pud_addr_end(addr, end);
2871	err = remap_pmd_range(mm, pud, addr, end: next,
2872	pfn: pfn + (addr >> PAGE_SHIFT), prot);
2873	if (err)
2874	return err;
2875	} while (pud++, addr = next, addr != end);
2876	return `0`;
2877	}
2878
2879	static inline int remap_p4d_range(struct mm_struct mm, pgd_t pgd,
2880	unsigned long addr, unsigned long end,
2881	unsigned long pfn, pgprot_t prot)
2882	{
2883	p4d_t *p4d;
2884	unsigned long next;
2885	int err;
2886
2887	pfn -= addr >> PAGE_SHIFT;
2888	p4d = p4d_alloc(mm, pgd, address: addr);
2889	if (!p4d)
2890	return -ENOMEM;
2891	do {
2892	next = p4d_addr_end(addr, end);
2893	err = remap_pud_range(mm, p4d, addr, end: next,
2894	pfn: pfn + (addr >> PAGE_SHIFT), prot);
2895	if (err)
2896	return err;
2897	} while (p4d++, addr = next, addr != end);
2898	return `0`;
2899	}
2900
2901	static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
2902	unsigned long end, unsigned long vm_start, unsigned long vm_end,
2903	unsigned long pfn, pgoff_t *vm_pgoff_p)
2904	{
2905	/*
2906	* There's a horrible special case to handle copy-on-write
2907	* behaviour that some programs depend on. We mark the "original"
2908	* un-COW'ed pages by matching them up with "vma->vm_pgoff".
2909	* See vm_normal_page() for details.
2910	*/
2911	if (is_cow_mapping(flags: vm_flags)) {
2912	if (addr != vm_start \|\| end != vm_end)
2913	return -EINVAL;
2914	*vm_pgoff_p = pfn;
2915	}
2916
2917	return `0`;
2918	}
2919
2920	static int remap_pfn_range_internal(struct vm_area_struct vma, unsigned* long addr,
2921	unsigned long pfn, unsigned long size, pgprot_t prot)
2922	{
2923	pgd_t *pgd;
2924	unsigned long next;
2925	unsigned long end = addr + PAGE_ALIGN(size);
2926	struct mm_struct *mm = vma->vm_mm;
2927	int err;
2928
2929	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2930	return -EINVAL;
2931
2932	VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS);
2933
2934	BUG_ON(addr >= end);
2935	pfn -= addr >> PAGE_SHIFT;
2936	pgd = pgd_offset(mm, addr);
2937	flush_cache_range(vma, start: addr, end);
2938	do {
2939	next = pgd_addr_end(addr, end);
2940	err = remap_p4d_range(mm, pgd, addr, end: next,
2941	pfn: pfn + (addr >> PAGE_SHIFT), prot);
2942	if (err)
2943	return err;
2944	} while (pgd++, addr = next, addr != end);
2945
2946	return `0`;
2947	}
2948
2949	/*
2950	* Variant of remap_pfn_range that does not call track_pfn_remap. The caller
2951	* must have pre-validated the caching bits of the pgprot_t.
2952	*/
2953	static int remap_pfn_range_notrack(struct vm_area_struct vma, unsigned* long addr,
2954	unsigned long pfn, unsigned long size, pgprot_t prot)
2955	{
2956	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
2957
2958	if (!error)
2959	return `0`;
2960
2961	/*
2962	* A partial pfn range mapping is dangerous: it does not
2963	* maintain page reference counts, and callers may free
2964	* pages due to the error. So zap it early.
2965	*/
2966	zap_page_range_single(vma, address: addr, size, NULL);
2967	return error;
2968	}
2969
2970	#ifdef __HAVE_PFNMAP_TRACKING
2971	static inline struct pfnmap_track_ctx pfnmap_track_ctx_alloc(unsigned* long pfn,
2972	unsigned long size, pgprot_t *prot)
2973	{
2974	struct pfnmap_track_ctx *ctx;
2975
2976	if (pfnmap_track(pfn, size, prot))
2977	return ERR_PTR(error: -EINVAL);
2978
2979	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
2980	if (unlikely(!ctx)) {
2981	pfnmap_untrack(pfn, size);
2982	return ERR_PTR(error: -ENOMEM);
2983	}
2984
2985	ctx->pfn = pfn;
2986	ctx->size = size;
2987	kref_init(kref: &ctx->kref);
2988	return ctx;
2989	}
2990
2991	void pfnmap_track_ctx_release(struct kref *ref)
2992	{
2993	struct pfnmap_track_ctx ctx = container_of(ref, struct* pfnmap_track_ctx, kref);
2994
2995	pfnmap_untrack(pfn: ctx->pfn, size: ctx->size);
2996	kfree(objp: ctx);
2997	}
2998
2999	static int remap_pfn_range_track(struct vm_area_struct vma, unsigned* long addr,
3000	unsigned long pfn, unsigned long size, pgprot_t prot)
3001	{
3002	struct pfnmap_track_ctx *ctx = NULL;
3003	int err;
3004
3005	size = PAGE_ALIGN(size);
3006
3007	/*
3008	* If we cover the full VMA, we'll perform actual tracking, and
3009	* remember to untrack when the last reference to our tracking
3010	* context from a VMA goes away. We'll keep tracking the whole pfn
3011	* range even during VMA splits and partial unmapping.
3012	*
3013	* If we only cover parts of the VMA, we'll only setup the cachemode
3014	* in the pgprot for the pfn range.
3015	*/
3016	if (addr == vma->vm_start && addr + size == vma->vm_end) {
3017	if (vma->pfnmap_track_ctx)
3018	return -EINVAL;
3019	ctx = pfnmap_track_ctx_alloc(pfn, size, prot: &prot);
3020	if (IS_ERR(ptr: ctx))
3021	return PTR_ERR(ptr: ctx);
3022	} else if (pfnmap_setup_cachemode(pfn, size, prot: &prot)) {
3023	return -EINVAL;
3024	}
3025
3026	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
3027	if (ctx) {
3028	if (err)
3029	kref_put(kref: &ctx->kref, release: pfnmap_track_ctx_release);
3030	else
3031	vma->pfnmap_track_ctx = ctx;
3032	}
3033	return err;
3034	}
3035
3036	static int do_remap_pfn_range(struct vm_area_struct vma, unsigned* long addr,
3037	unsigned long pfn, unsigned long size, pgprot_t prot)
3038	{
3039	return remap_pfn_range_track(vma, addr, pfn, size, prot);
3040	}
3041	#else
3042	static int do_remap_pfn_range(struct vm_area_struct vma, unsigned* long addr,
3043	unsigned long pfn, unsigned long size, pgprot_t prot)
3044	{
3045	return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
3046	}
3047	#endif
3048
3049	void remap_pfn_range_prepare(struct vm_area_desc desc, unsigned* long pfn)
3050	{
3051	/*
3052	* We set addr=VMA start, end=VMA end here, so this won't fail, but we
3053	* check it again on complete and will fail there if specified addr is
3054	* invalid.
3055	*/
3056	get_remap_pgoff(vm_flags: desc->vm_flags, addr: desc->start, end: desc->end,
3057	vm_start: desc->start, vm_end: desc->end, pfn, vm_pgoff_p: &desc->pgoff);
3058	desc->vm_flags \|= VM_REMAP_FLAGS;
3059	}
3060
3061	static int remap_pfn_range_prepare_vma(struct vm_area_struct vma, unsigned* long addr,
3062	unsigned long pfn, unsigned long size)
3063	{
3064	unsigned long end = addr + PAGE_ALIGN(size);
3065	int err;
3066
3067	err = get_remap_pgoff(vm_flags: vma->vm_flags, addr, end,
3068	vm_start: vma->vm_start, vm_end: vma->vm_end,
3069	pfn, vm_pgoff_p: &vma->vm_pgoff);
3070	if (err)
3071	return err;
3072
3073	vm_flags_set(vma, VM_REMAP_FLAGS);
3074	return `0`;
3075	}
3076
3077	/**
3078	* remap_pfn_range - remap kernel memory to userspace
3079	* @vma: user vma to map to
3080	* @addr: target page aligned user address to start at
3081	* @pfn: page frame number of kernel physical memory address
3082	* @size: size of mapping area
3083	* @prot: page protection flags for this mapping
3084	*
3085	* Note: this is only safe if the mm semaphore is held when called.
3086	*
3087	* Return: %0 on success, negative error code otherwise.
3088	*/
3089	int remap_pfn_range(struct vm_area_struct vma, unsigned* long addr,
3090	unsigned long pfn, unsigned long size, pgprot_t prot)
3091	{
3092	int err;
3093
3094	err = remap_pfn_range_prepare_vma(vma, addr, pfn, size);
3095	if (err)
3096	return err;
3097
3098	return do_remap_pfn_range(vma, addr, pfn, size, prot);
3099	}
3100	EXPORT_SYMBOL(remap_pfn_range);
3101
3102	int remap_pfn_range_complete(struct vm_area_struct vma, unsigned* long addr,
3103	unsigned long pfn, unsigned long size, pgprot_t prot)
3104	{
3105	return do_remap_pfn_range(vma, addr, pfn, size, prot);
3106	}
3107
3108	/**
3109	* vm_iomap_memory - remap memory to userspace
3110	* @vma: user vma to map to
3111	* @start: start of the physical memory to be mapped
3112	* @len: size of area
3113	*
3114	* This is a simplified io_remap_pfn_range() for common driver use. The
3115	* driver just needs to give us the physical memory range to be mapped,
3116	* we'll figure out the rest from the vma information.
3117	*
3118	* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
3119	* whatever write-combining details or similar.
3120	*
3121	* Return: %0 on success, negative error code otherwise.
3122	*/
3123	int vm_iomap_memory(struct vm_area_struct vma, phys_addr_t start, unsigned* long len)
3124	{
3125	unsigned long vm_len, pfn, pages;
3126
3127	/ Check that the physical memory area passed in looks valid /
3128	if (start + len < start)
3129	return -EINVAL;
3130	/*
3131	* You really shouldn't map things that aren't page-aligned,
3132	* but we've historically allowed it because IO memory might
3133	* just have smaller alignment.
3134	*/
3135	len += start & ~PAGE_MASK;
3136	pfn = start >> PAGE_SHIFT;
3137	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
3138	if (pfn + pages < pfn)
3139	return -EINVAL;
3140
3141	/ We start the mapping 'vm_pgoff' pages into the area /
3142	if (vma->vm_pgoff > pages)
3143	return -EINVAL;
3144	pfn += vma->vm_pgoff;
3145	pages -= vma->vm_pgoff;
3146
3147	/ Can we fit all of the mapping? /
3148	vm_len = vma->vm_end - vma->vm_start;
3149	if (vm_len >> PAGE_SHIFT > pages)
3150	return -EINVAL;
3151
3152	/ Ok, let it rip /
3153	return io_remap_pfn_range(vma, addr: vma->vm_start, orig_pfn: pfn, size: vm_len, orig_prot: vma->vm_page_prot);
3154	}
3155	EXPORT_SYMBOL(vm_iomap_memory);
3156
3157	static int apply_to_pte_range(struct mm_struct mm, pmd_t pmd,
3158	unsigned long addr, unsigned long end,
3159	pte_fn_t fn, void *data, bool create,
3160	pgtbl_mod_mask *mask)
3161	{
3162	pte_t pte, mapped_pte;
3163	int err = `0`;
3164	spinlock_t *ptl;
3165
3166	if (create) {
3167	mapped_pte = pte = (mm == &init_mm) ?
3168	pte_alloc_kernel_track(pmd, addr, mask) :
3169	pte_alloc_map_lock(mm, pmd, addr, &ptl);
3170	if (!pte)
3171	return -ENOMEM;
3172	} else {
3173	mapped_pte = pte = (mm == &init_mm) ?
3174	pte_offset_kernel(pmd, address: addr) :
3175	pte_offset_map_lock(mm, pmd, addr, ptlp: &ptl);
3176	if (!pte)
3177	return -EINVAL;
3178	}
3179
3180	arch_enter_lazy_mmu_mode();
3181
3182	if (fn) {
3183	do {
3184	if (create \|\| !pte_none(pte: ptep_get(ptep: pte))) {
3185	err = fn(pte, addr, data);
3186	if (err)
3187	break;
3188	}
3189	} while (pte++, addr += PAGE_SIZE, addr != end);
3190	}
3191	*mask \|= PGTBL_PTE_MODIFIED;
3192
3193	arch_leave_lazy_mmu_mode();
3194
3195	if (mm != &init_mm)
3196	pte_unmap_unlock(mapped_pte, ptl);
3197	return err;
3198	}
3199
3200	static int apply_to_pmd_range(struct mm_struct mm, pud_t pud,
3201	unsigned long addr, unsigned long end,
3202	pte_fn_t fn, void *data, bool create,
3203	pgtbl_mod_mask *mask)
3204	{
3205	pmd_t *pmd;
3206	unsigned long next;
3207	int err = `0`;
3208
3209	BUG_ON(pud_leaf(*pud));
3210
3211	if (create) {
3212	pmd = pmd_alloc_track(mm, pud, address: addr, mod_mask: mask);
3213	if (!pmd)
3214	return -ENOMEM;
3215	} else {
3216	pmd = pmd_offset(pud, address: addr);
3217	}
3218	do {
3219	next = pmd_addr_end(addr, end);
3220	if (pmd_none(pmd: *pmd) && !create)
3221	continue;
3222	if (WARN_ON_ONCE(pmd_leaf(*pmd)))
3223	return -EINVAL;
3224	if (!pmd_none(pmd: pmd) && WARN_ON_ONCE(pmd_bad(pmd))) {
3225	if (!create)
3226	continue;
3227	pmd_clear_bad(pmd);
3228	}
3229	err = apply_to_pte_range(mm, pmd, addr, end: next,
3230	fn, data, create, mask);
3231	if (err)
3232	break;
3233	} while (pmd++, addr = next, addr != end);
3234
3235	return err;
3236	}
3237
3238	static int apply_to_pud_range(struct mm_struct mm, p4d_t p4d,
3239	unsigned long addr, unsigned long end,
3240	pte_fn_t fn, void *data, bool create,
3241	pgtbl_mod_mask *mask)
3242	{
3243	pud_t *pud;
3244	unsigned long next;
3245	int err = `0`;
3246
3247	if (create) {
3248	pud = pud_alloc_track(mm, p4d, address: addr, mod_mask: mask);
3249	if (!pud)
3250	return -ENOMEM;
3251	} else {
3252	pud = pud_offset(p4d, address: addr);
3253	}
3254	do {
3255	next = pud_addr_end(addr, end);
3256	if (pud_none(pud: *pud) && !create)
3257	continue;
3258	if (WARN_ON_ONCE(pud_leaf(*pud)))
3259	return -EINVAL;
3260	if (!pud_none(pud: pud) && WARN_ON_ONCE(pud_bad(pud))) {
3261	if (!create)
3262	continue;
3263	pud_clear_bad(pud);
3264	}
3265	err = apply_to_pmd_range(mm, pud, addr, end: next,
3266	fn, data, create, mask);
3267	if (err)
3268	break;
3269	} while (pud++, addr = next, addr != end);
3270
3271	return err;
3272	}
3273
3274	static int apply_to_p4d_range(struct mm_struct mm, pgd_t pgd,
3275	unsigned long addr, unsigned long end,
3276	pte_fn_t fn, void *data, bool create,
3277	pgtbl_mod_mask *mask)
3278	{
3279	p4d_t *p4d;
3280	unsigned long next;
3281	int err = `0`;
3282
3283	if (create) {
3284	p4d = p4d_alloc_track(mm, pgd, address: addr, mod_mask: mask);
3285	if (!p4d)
3286	return -ENOMEM;
3287	} else {
3288	p4d = p4d_offset(pgd, address: addr);
3289	}
3290	do {
3291	next = p4d_addr_end(addr, end);
3292	if (p4d_none(p4d: *p4d) && !create)
3293	continue;
3294	if (WARN_ON_ONCE(p4d_leaf(*p4d)))
3295	return -EINVAL;
3296	if (!p4d_none(p4d: p4d) && WARN_ON_ONCE(p4d_bad(p4d))) {
3297	if (!create)
3298	continue;
3299	p4d_clear_bad(p4d);
3300	}
3301	err = apply_to_pud_range(mm, p4d, addr, end: next,
3302	fn, data, create, mask);
3303	if (err)
3304	break;
3305	} while (p4d++, addr = next, addr != end);
3306
3307	return err;
3308	}
3309
3310	static int __apply_to_page_range(struct mm_struct mm, unsigned* long addr,
3311	unsigned long size, pte_fn_t fn,
3312	void *data, bool create)
3313	{
3314	pgd_t *pgd;
3315	unsigned long start = addr, next;
3316	unsigned long end = addr + size;
3317	pgtbl_mod_mask mask = `0`;
3318	int err = `0`;
3319
3320	if (WARN_ON(addr >= end))
3321	return -EINVAL;
3322
3323	pgd = pgd_offset(mm, addr);
3324	do {
3325	next = pgd_addr_end(addr, end);
3326	if (pgd_none(pgd: *pgd) && !create)
3327	continue;
3328	if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
3329	err = -EINVAL;
3330	break;
3331	}
3332	if (!pgd_none(pgd: pgd) && WARN_ON_ONCE(pgd_bad(pgd))) {
3333	if (!create)
3334	continue;
3335	pgd_clear_bad(pgd);
3336	}
3337	err = apply_to_p4d_range(mm, pgd, addr, end: next,
3338	fn, data, create, mask: &mask);
3339	if (err)
3340	break;
3341	} while (pgd++, addr = next, addr != end);
3342
3343	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
3344	arch_sync_kernel_mappings(start, end: start + size);
3345
3346	return err;
3347	}
3348
3349	/*
3350	* Scan a region of virtual memory, filling in page tables as necessary
3351	* and calling a provided function on each leaf page table.
3352	*/
3353	int apply_to_page_range(struct mm_struct mm, unsigned* long addr,
3354	unsigned long size, pte_fn_t fn, void *data)
3355	{
3356	return __apply_to_page_range(mm, addr, size, fn, data, create: true);
3357	}
3358	EXPORT_SYMBOL_GPL(apply_to_page_range);
3359
3360	/*
3361	* Scan a region of virtual memory, calling a provided function on
3362	* each leaf page table where it exists.
3363	*
3364	* Unlike apply_to_page_range, this does _not_ fill in page tables
3365	* where they are absent.
3366	*/
3367	int apply_to_existing_page_range(struct mm_struct mm, unsigned* long addr,
3368	unsigned long size, pte_fn_t fn, void *data)
3369	{
3370	return __apply_to_page_range(mm, addr, size, fn, data, create: false);
3371	}
3372
3373	/*
3374	* handle_pte_fault chooses page fault handler according to an entry which was
3375	* read non-atomically. Before making any commitment, on those architectures
3376	* or configurations (e.g. i386 with PAE) which might give a mix of unmatched
3377	* parts, do_swap_page must check under lock before unmapping the pte and
3378	* proceeding (but do_wp_page is only called after already making such a check;
3379	* and do_anonymous_page can safely check later on).
3380	*/
3381	static inline int pte_unmap_same(struct vm_fault *vmf)
3382	{
3383	int same = `1`;
3384	#if defined(CONFIG_SMP) \|\| defined(CONFIG_PREEMPTION)
3385	if (sizeof(pte_t) > sizeof(unsigned long)) {
3386	spin_lock(lock: vmf->ptl);
3387	same = pte_same(a: ptep_get(ptep: vmf->pte), b: vmf->orig_pte);
3388	spin_unlock(lock: vmf->ptl);
3389	}
3390	#endif
3391	pte_unmap(pte: vmf->pte);
3392	vmf->pte = NULL;
3393	return same;
3394	}
3395
3396	/*
3397	* Return:
3398	* 0: copied succeeded
3399	* -EHWPOISON: copy failed due to hwpoison in source page
3400	* -EAGAIN: copied failed (some other reason)
3401	*/
3402	static inline int __wp_page_copy_user(struct page dst, struct* page *src,
3403	struct vm_fault *vmf)
3404	{
3405	int ret;
3406	void *kaddr;
3407	void __user *uaddr;
3408	struct vm_area_struct *vma = vmf->vma;
3409	struct mm_struct *mm = vma->vm_mm;
3410	unsigned long addr = vmf->address;
3411
3412	if (likely(src)) {
3413	if (copy_mc_user_highpage(to: dst, from: src, vaddr: addr, vma))
3414	return -EHWPOISON;
3415	return `0`;
3416	}
3417
3418	/*
3419	* If the source page was a PFN mapping, we don't have
3420	* a "struct page" for it. We do a best-effort copy by
3421	* just copying from the original user address. If that
3422	* fails, we just zero-fill it. Live with it.
3423	*/
3424	kaddr = kmap_local_page(page: dst);
3425	pagefault_disable();
3426	uaddr = (void __user *)(addr & PAGE_MASK);
3427
3428	/*
3429	* On architectures with software "accessed" bits, we would
3430	* take a double page fault, so mark it accessed here.
3431	*/
3432	vmf->pte = NULL;
3433	if (!arch_has_hw_pte_young() && !pte_young(pte: vmf->orig_pte)) {
3434	pte_t entry;
3435
3436	vmf->pte = pte_offset_map_lock(mm, pmd: vmf->pmd, addr, ptlp: &vmf->ptl);
3437	if (unlikely(!vmf->pte \|\| !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3438	/*
3439	* Other thread has already handled the fault
3440	* and update local tlb only
3441	*/
3442	if (vmf->pte)
3443	update_mmu_tlb(vma, address: addr, ptep: vmf->pte);
3444	ret = -EAGAIN;
3445	goto pte_unlock;
3446	}
3447
3448	entry = pte_mkyoung(pte: vmf->orig_pte);
3449	if (ptep_set_access_flags(vma, address: addr, ptep: vmf->pte, entry, dirty: `0`))
3450	update_mmu_cache_range(vmf, vma, addr, ptep: vmf->pte, nr: `1`);
3451	}
3452
3453	/*
3454	* This really shouldn't fail, because the page is there
3455	* in the page tables. But it might just be unreadable,
3456	* in which case we just give up and fill the result with
3457	* zeroes.
3458	*/
3459	if (__copy_from_user_inatomic(to: kaddr, from: uaddr, PAGE_SIZE)) {
3460	if (vmf->pte)
3461	goto warn;
3462
3463	/ Re-validate under PTL if the page is still mapped /
3464	vmf->pte = pte_offset_map_lock(mm, pmd: vmf->pmd, addr, ptlp: &vmf->ptl);
3465	if (unlikely(!vmf->pte \|\| !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3466	/ The PTE changed under us, update local tlb /
3467	if (vmf->pte)
3468	update_mmu_tlb(vma, address: addr, ptep: vmf->pte);
3469	ret = -EAGAIN;
3470	goto pte_unlock;
3471	}
3472
3473	/*
3474	* The same page can be mapped back since last copy attempt.
3475	* Try to copy again under PTL.
3476	*/
3477	if (__copy_from_user_inatomic(to: kaddr, from: uaddr, PAGE_SIZE)) {
3478	/*
3479	* Give a warn in case there can be some obscure
3480	* use-case
3481	*/
3482	warn:
3483	WARN_ON_ONCE(`1`);
3484	clear_page(page: kaddr);
3485	}
3486	}
3487
3488	ret = `0`;
3489
3490	pte_unlock:
3491	if (vmf->pte)
3492	pte_unmap_unlock(vmf->pte, vmf->ptl);
3493	pagefault_enable();
3494	kunmap_local(kaddr);
3495	flush_dcache_page(page: dst);
3496
3497	return ret;
3498	}
3499
3500	static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
3501	{
3502	struct file *vm_file = vma->vm_file;
3503
3504	if (vm_file)
3505	return mapping_gfp_mask(mapping: vm_file->f_mapping) \| __GFP_FS \| __GFP_IO;
3506
3507	/*
3508	* Special mappings (e.g. VDSO) do not have any file so fake
3509	* a default GFP_KERNEL for them.
3510	*/
3511	return GFP_KERNEL;
3512	}
3513
3514	/*
3515	* Notify the address space that the page is about to become writable so that
3516	* it can prohibit this or wait for the page to get into an appropriate state.
3517	*
3518	* We do this without the lock held, so that it can sleep if it needs to.
3519	*/
3520	static vm_fault_t do_page_mkwrite(struct vm_fault vmf, struct* folio *folio)
3521	{
3522	vm_fault_t ret;
3523	unsigned int old_flags = vmf->flags;
3524
3525	vmf->flags = FAULT_FLAG_WRITE\|FAULT_FLAG_MKWRITE;
3526
3527	if (vmf->vma->vm_file &&
3528	IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
3529	return VM_FAULT_SIGBUS;
3530
3531	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
3532	/ Restore original flags so that caller is not surprised /
3533	vmf->flags = old_flags;
3534	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))
3535	return ret;
3536	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
3537	folio_lock(folio);
3538	if (!folio->mapping) {
3539	folio_unlock(folio);
3540	return `0`; / retry /
3541	}
3542	ret \|= VM_FAULT_LOCKED;
3543	} else
3544	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3545	return ret;
3546	}
3547
3548	/*
3549	* Handle dirtying of a page in shared file mapping on a write fault.
3550	*
3551	* The function expects the page to be locked and unlocks it.
3552	*/
3553	static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
3554	{
3555	struct vm_area_struct *vma = vmf->vma;
3556	struct address_space *mapping;
3557	struct folio *folio = page_folio(vmf->page);
3558	bool dirtied;
3559	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
3560
3561	dirtied = folio_mark_dirty(folio);
3562	VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
3563	/*
3564	* Take a local copy of the address_space - folio.mapping may be zeroed
3565	* by truncate after folio_unlock(). The address_space itself remains
3566	* pinned by vma->vm_file's reference. We rely on folio_unlock()'s
3567	* release semantics to prevent the compiler from undoing this copying.
3568	*/
3569	mapping = folio_raw_mapping(folio);
3570	folio_unlock(folio);
3571
3572	if (!page_mkwrite)
3573	file_update_time(file: vma->vm_file);
3574
3575	/*
3576	* Throttle page dirtying rate down to writeback speed.
3577	*
3578	* mapping may be NULL here because some device drivers do not
3579	* set page.mapping but still dirty their pages
3580	*
3581	* Drop the mmap_lock before waiting on IO, if we can. The file
3582	* is pinning the mapping, as per above.
3583	*/
3584	if ((dirtied \|\| page_mkwrite) && mapping) {
3585	struct file *fpin;
3586
3587	fpin = maybe_unlock_mmap_for_io(vmf, NULL);
3588	balance_dirty_pages_ratelimited(mapping);
3589	if (fpin) {
3590	fput(fpin);
3591	return VM_FAULT_COMPLETED;
3592	}
3593	}
3594
3595	return `0`;
3596	}
3597
3598	/*
3599	* Handle write page faults for pages that can be reused in the current vma
3600	*
3601	* This can happen either due to the mapping being with the VM_SHARED flag,
3602	* or due to us being the last reference standing to the page. In either
3603	* case, all we need to do here is to mark the page as writable and update
3604	* any related book-keeping.
3605	*/
3606	static inline void wp_page_reuse(struct vm_fault vmf, struct* folio *folio)
3607	__releases(vmf->ptl)
3608	{
3609	struct vm_area_struct *vma = vmf->vma;
3610	pte_t entry;
3611
3612	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
3613	VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));
3614
3615	if (folio) {
3616	VM_BUG_ON(folio_test_anon(folio) &&
3617	!PageAnonExclusive(vmf->page));
3618	/*
3619	* Clear the folio's cpupid information as the existing
3620	* information potentially belongs to a now completely
3621	* unrelated process.
3622	*/
3623	folio_xchg_last_cpupid(folio, (`1` << LAST_CPUPID_SHIFT) - `1`);
3624	}
3625
3626	flush_cache_page(vma, vmaddr: vmf->address, pfn: pte_pfn(pte: vmf->orig_pte));
3627	entry = pte_mkyoung(pte: vmf->orig_pte);
3628	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
3629	if (ptep_set_access_flags(vma, address: vmf->address, ptep: vmf->pte, entry, dirty: `1`))
3630	update_mmu_cache_range(vmf, vma, addr: vmf->address, ptep: vmf->pte, nr: `1`);
3631	pte_unmap_unlock(vmf->pte, vmf->ptl);
3632	count_vm_event(item: PGREUSE);
3633	}
3634
3635	/*
3636	* We could add a bitflag somewhere, but for now, we know that all
3637	* vm_ops that have a ->map_pages have been audited and don't need
3638	* the mmap_lock to be held.
3639	*/
3640	static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
3641	{
3642	struct vm_area_struct *vma = vmf->vma;
3643
3644	if (vma->vm_ops->map_pages \|\| !(vmf->flags & FAULT_FLAG_VMA_LOCK))
3645	return `0`;
3646	vma_end_read(vma);
3647	return VM_FAULT_RETRY;
3648	}
3649
3650	/**
3651	* __vmf_anon_prepare - Prepare to handle an anonymous fault.
3652	* @vmf: The vm_fault descriptor passed from the fault handler.
3653	*
3654	* When preparing to insert an anonymous page into a VMA from a
3655	* fault handler, call this function rather than anon_vma_prepare().
3656	* If this vma does not already have an associated anon_vma and we are
3657	* only protected by the per-VMA lock, the caller must retry with the
3658	* mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to
3659	* determine if this VMA can share its anon_vma, and that's not safe to
3660	* do with only the per-VMA lock held for this VMA.
3661	*
3662	* Return: 0 if fault handling can proceed. Any other value should be
3663	* returned to the caller.
3664	*/
3665	vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
3666	{
3667	struct vm_area_struct *vma = vmf->vma;
3668	vm_fault_t ret = `0`;
3669
3670	if (likely(vma->anon_vma))
3671	return `0`;
3672	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
3673	if (!mmap_read_trylock(mm: vma->vm_mm))
3674	return VM_FAULT_RETRY;
3675	}
3676	if (__anon_vma_prepare(vma))
3677	ret = VM_FAULT_OOM;
3678	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
3679	mmap_read_unlock(mm: vma->vm_mm);
3680	return ret;
3681	}
3682
3683	/*
3684	* Handle the case of a page which we actually need to copy to a new page,
3685	* either due to COW or unsharing.
3686	*
3687	* Called with mmap_lock locked and the old page referenced, but
3688	* without the ptl held.
3689	*
3690	* High level logic flow:
3691	*
3692	* - Allocate a page, copy the content of the old page to the new one.
3693	* - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
3694	* - Take the PTL. If the pte changed, bail out and release the allocated page
3695	* - If the pte is still the way we remember it, update the page table and all
3696	* relevant references. This includes dropping the reference the page-table
3697	* held to the old page, as well as updating the rmap.
3698	* - In any case, unlock the PTL and drop the reference we took to the old page.
3699	*/
3700	static vm_fault_t wp_page_copy(struct vm_fault *vmf)
3701	{
3702	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
3703	struct vm_area_struct *vma = vmf->vma;
3704	struct mm_struct *mm = vma->vm_mm;
3705	struct folio *old_folio = NULL;
3706	struct folio *new_folio = NULL;
3707	pte_t entry;
3708	int page_copied = `0`;
3709	struct mmu_notifier_range range;
3710	vm_fault_t ret;
3711	bool pfn_is_zero;
3712
3713	delayacct_wpcopy_start();
3714
3715	if (vmf->page)
3716	old_folio = page_folio(vmf->page);
3717	ret = vmf_anon_prepare(vmf);
3718	if (unlikely(ret))
3719	goto out;
3720
3721	pfn_is_zero = is_zero_pfn(pfn: pte_pfn(pte: vmf->orig_pte));
3722	new_folio = folio_prealloc(src_mm: mm, vma, addr: vmf->address, need_zero: pfn_is_zero);
3723	if (!new_folio)
3724	goto oom;
3725
3726	if (!pfn_is_zero) {
3727	int err;
3728
3729	err = __wp_page_copy_user(dst: &new_folio->page, src: vmf->page, vmf);
3730	if (err) {
3731	/*
3732	* COW failed, if the fault was solved by other,
3733	* it's fine. If not, userspace would re-fault on
3734	* the same address and we will handle the fault
3735	* from the second attempt.
3736	* The -EHWPOISON case will not be retried.
3737	*/
3738	folio_put(folio: new_folio);
3739	if (old_folio)
3740	folio_put(folio: old_folio);
3741
3742	delayacct_wpcopy_end();
3743	return err == -EHWPOISON ? VM_FAULT_HWPOISON : `0`;
3744	}
3745	kmsan_copy_page_meta(dst: &new_folio->page, src: vmf->page);
3746	}
3747
3748	__folio_mark_uptodate(folio: new_folio);
3749
3750	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm,
3751	start: vmf->address & PAGE_MASK,
3752	end: (vmf->address & PAGE_MASK) + PAGE_SIZE);
3753	mmu_notifier_invalidate_range_start(range: &range);
3754
3755	/*
3756	* Re-check the pte - we dropped the lock
3757	*/
3758	vmf->pte = pte_offset_map_lock(mm, pmd: vmf->pmd, addr: vmf->address, ptlp: &vmf->ptl);
3759	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3760	if (old_folio) {
3761	if (!folio_test_anon(folio: old_folio)) {
3762	dec_mm_counter(mm, member: mm_counter_file(folio: old_folio));
3763	inc_mm_counter(mm, member: MM_ANONPAGES);
3764	}
3765	} else {
3766	ksm_might_unmap_zero_page(mm, pte: vmf->orig_pte);
3767	inc_mm_counter(mm, member: MM_ANONPAGES);
3768	}
3769	flush_cache_page(vma, vmaddr: vmf->address, pfn: pte_pfn(pte: vmf->orig_pte));
3770	entry = folio_mk_pte(folio: new_folio, pgprot: vma->vm_page_prot);
3771	entry = pte_sw_mkyoung(pte: entry);
3772	if (unlikely(unshare)) {
3773	if (pte_soft_dirty(pte: vmf->orig_pte))
3774	entry = pte_mksoft_dirty(pte: entry);
3775	if (pte_uffd_wp(pte: vmf->orig_pte))
3776	entry = pte_mkuffd_wp(pte: entry);
3777	} else {
3778	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
3779	}
3780
3781	/*
3782	* Clear the pte entry and flush it first, before updating the
3783	* pte with the new entry, to keep TLBs on different CPUs in
3784	* sync. This code used to set the new PTE then flush TLBs, but
3785	* that left a window where the new PTE could be loaded into
3786	* some TLBs while the old PTE remains in others.
3787	*/
3788	ptep_clear_flush(vma, address: vmf->address, ptep: vmf->pte);
3789	folio_add_new_anon_rmap(new_folio, vma, address: vmf->address, RMAP_EXCLUSIVE);
3790	folio_add_lru_vma(new_folio, vma);
3791	BUG_ON(unshare && pte_write(entry));
3792	set_pte_at(mm, vmf->address, vmf->pte, entry);
3793	update_mmu_cache_range(vmf, vma, addr: vmf->address, ptep: vmf->pte, nr: `1`);
3794	if (old_folio) {
3795	/*
3796	* Only after switching the pte to the new page may
3797	* we remove the mapcount here. Otherwise another
3798	* process may come and find the rmap count decremented
3799	* before the pte is switched to the new page, and
3800	* "reuse" the old page writing into it while our pte
3801	* here still points into it and can be read by other
3802	* threads.
3803	*
3804	* The critical issue is to order this
3805	* folio_remove_rmap_pte() with the ptp_clear_flush
3806	* above. Those stores are ordered by (if nothing else,)
3807	* the barrier present in the atomic_add_negative
3808	* in folio_remove_rmap_pte();
3809	*
3810	* Then the TLB flush in ptep_clear_flush ensures that
3811	* no process can access the old page before the
3812	* decremented mapcount is visible. And the old page
3813	* cannot be reused until after the decremented
3814	* mapcount is visible. So transitively, TLBs to
3815	* old page will be flushed before it can be reused.
3816	*/
3817	folio_remove_rmap_pte(old_folio, vmf->page, vma);
3818	}
3819
3820	/ Free the old page.. /
3821	new_folio = old_folio;
3822	page_copied = `1`;
3823	pte_unmap_unlock(vmf->pte, vmf->ptl);
3824	} else if (vmf->pte) {
3825	update_mmu_tlb(vma, address: vmf->address, ptep: vmf->pte);
3826	pte_unmap_unlock(vmf->pte, vmf->ptl);
3827	}
3828
3829	mmu_notifier_invalidate_range_end(range: &range);
3830
3831	if (new_folio)
3832	folio_put(folio: new_folio);
3833	if (old_folio) {
3834	if (page_copied)
3835	free_swap_cache(folio: old_folio);
3836	folio_put(folio: old_folio);
3837	}
3838
3839	delayacct_wpcopy_end();
3840	return `0`;
3841	oom:
3842	ret = VM_FAULT_OOM;
3843	out:
3844	if (old_folio)
3845	folio_put(folio: old_folio);
3846
3847	delayacct_wpcopy_end();
3848	return ret;
3849	}
3850
3851	/**
3852	* finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
3853	* writeable once the page is prepared
3854	*
3855	* @vmf: structure describing the fault
3856	* @folio: the folio of vmf->page
3857	*
3858	* This function handles all that is needed to finish a write page fault in a
3859	* shared mapping due to PTE being read-only once the mapped page is prepared.
3860	* It handles locking of PTE and modifying it.
3861	*
3862	* The function expects the page to be locked or other protection against
3863	* concurrent faults / writeback (such as DAX radix tree locks).
3864	*
3865	* Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
3866	* we acquired PTE lock.
3867	*/
3868	static vm_fault_t finish_mkwrite_fault(struct vm_fault vmf, struct* folio *folio)
3869	{
3870	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3871	vmf->pte = pte_offset_map_lock(mm: vmf->vma->vm_mm, pmd: vmf->pmd, addr: vmf->address,
3872	ptlp: &vmf->ptl);
3873	if (!vmf->pte)
3874	return VM_FAULT_NOPAGE;
3875	/*
3876	* We might have raced with another page fault while we released the
3877	* pte_offset_map_lock.
3878	*/
3879	if (!pte_same(a: ptep_get(ptep: vmf->pte), b: vmf->orig_pte)) {
3880	update_mmu_tlb(vma: vmf->vma, address: vmf->address, ptep: vmf->pte);
3881	pte_unmap_unlock(vmf->pte, vmf->ptl);
3882	return VM_FAULT_NOPAGE;
3883	}
3884	wp_page_reuse(vmf, folio);
3885	return `0`;
3886	}
3887
3888	/*
3889	* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
3890	* mapping
3891	*/
3892	static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3893	{
3894	struct vm_area_struct *vma = vmf->vma;
3895
3896	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3897	vm_fault_t ret;
3898
3899	pte_unmap_unlock(vmf->pte, vmf->ptl);
3900	ret = vmf_can_call_fault(vmf);
3901	if (ret)
3902	return ret;
3903
3904	vmf->flags \|= FAULT_FLAG_MKWRITE;
3905	ret = vma->vm_ops->pfn_mkwrite(vmf);
3906	if (ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE))
3907	return ret;
3908	return finish_mkwrite_fault(vmf, NULL);
3909	}
3910	wp_page_reuse(vmf, NULL);
3911	return `0`;
3912	}
3913
3914	static vm_fault_t wp_page_shared(struct vm_fault vmf, struct* folio *folio)
3915	__releases(vmf->ptl)
3916	{
3917	struct vm_area_struct *vma = vmf->vma;
3918	vm_fault_t ret = `0`;
3919
3920	folio_get(folio);
3921
3922	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3923	vm_fault_t tmp;
3924
3925	pte_unmap_unlock(vmf->pte, vmf->ptl);
3926	tmp = vmf_can_call_fault(vmf);
3927	if (tmp) {
3928	folio_put(folio);
3929	return tmp;
3930	}
3931
3932	tmp = do_page_mkwrite(vmf, folio);
3933	if (unlikely(!tmp \|\| (tmp &
3934	(VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))) {
3935	folio_put(folio);
3936	return tmp;
3937	}
3938	tmp = finish_mkwrite_fault(vmf, folio);
3939	if (unlikely(tmp & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE))) {
3940	folio_unlock(folio);
3941	folio_put(folio);
3942	return tmp;
3943	}
3944	} else {
3945	wp_page_reuse(vmf, folio);
3946	folio_lock(folio);
3947	}
3948	ret \|= fault_dirty_shared_page(vmf);
3949	folio_put(folio);
3950
3951	return ret;
3952	}
3953
3954	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3955	static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
3956	struct vm_area_struct *vma)
3957	{
3958	bool exclusive = false;
3959
3960	/ Let's just free up a large folio if only a single page is mapped. /
3961	if (folio_large_mapcount(folio) <= `1`)
3962	return false;
3963
3964	/*
3965	* The assumption for anonymous folios is that each page can only get
3966	* mapped once into each MM. The only exception are KSM folios, which
3967	* are always small.
3968	*
3969	* Each taken mapcount must be paired with exactly one taken reference,
3970	* whereby the refcount must be incremented before the mapcount when
3971	* mapping a page, and the refcount must be decremented after the
3972	* mapcount when unmapping a page.
3973	*
3974	* If all folio references are from mappings, and all mappings are in
3975	* the page tables of this MM, then this folio is exclusive to this MM.
3976	*/
3977	if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
3978	return false;
3979
3980	VM_WARN_ON_ONCE(folio_test_ksm(folio));
3981
3982	if (unlikely(folio_test_swapcache(folio))) {
3983	/*
3984	* Note: freeing up the swapcache will fail if some PTEs are
3985	* still swap entries.
3986	*/
3987	if (!folio_trylock(folio))
3988	return false;
3989	folio_free_swap(folio);
3990	folio_unlock(folio);
3991	}
3992
3993	if (folio_large_mapcount(folio) != folio_ref_count(folio))
3994	return false;
3995
3996	/ Stabilize the mapcount vs. refcount and recheck. /
3997	folio_lock_large_mapcount(folio);
3998	VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
3999
4000	if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
4001	goto unlock;
4002	if (folio_large_mapcount(folio) != folio_ref_count(folio))
4003	goto unlock;
4004
4005	VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
4006	VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
4007	VM_WARN_ON_ONCE(folio_mm_id(folio, `0`) != vma->vm_mm->mm_id &&
4008	folio_mm_id(folio, `1`) != vma->vm_mm->mm_id);
4009
4010	/*
4011	* Do we need the folio lock? Likely not. If there would have been
4012	* references from page migration/swapout, we would have detected
4013	* an additional folio reference and never ended up here.
4014	*/
4015	exclusive = true;
4016	unlock:
4017	folio_unlock_large_mapcount(folio);
4018	return exclusive;
4019	}
4020	#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
4021	static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
4022	struct vm_area_struct *vma)
4023	{
4024	BUILD_BUG();
4025	}
4026	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4027
4028	static bool wp_can_reuse_anon_folio(struct folio *folio,
4029	struct vm_area_struct *vma)
4030	{
4031	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
4032	return __wp_can_reuse_large_anon_folio(folio, vma);
4033
4034	/*
4035	* We have to verify under folio lock: these early checks are
4036	* just an optimization to avoid locking the folio and freeing
4037	* the swapcache if there is little hope that we can reuse.
4038	*
4039	* KSM doesn't necessarily raise the folio refcount.
4040	*/
4041	if (folio_test_ksm(folio) \|\| folio_ref_count(folio) > `3`)
4042	return false;
4043	if (!folio_test_lru(folio))
4044	/*
4045	* We cannot easily detect+handle references from
4046	* remote LRU caches or references to LRU folios.
4047	*/
4048	lru_add_drain();
4049	if (folio_ref_count(folio) > `1` + folio_test_swapcache(folio))
4050	return false;
4051	if (!folio_trylock(folio))
4052	return false;
4053	if (folio_test_swapcache(folio))
4054	folio_free_swap(folio);
4055	if (folio_test_ksm(folio) \|\| folio_ref_count(folio) != `1`) {
4056	folio_unlock(folio);
4057	return false;
4058	}
4059	/*
4060	* Ok, we've got the only folio reference from our mapping
4061	* and the folio is locked, it's dark out, and we're wearing
4062	* sunglasses. Hit it.
4063	*/
4064	folio_move_anon_rmap(folio, vma);
4065	folio_unlock(folio);
4066	return true;
4067	}
4068
4069	/*
4070	* This routine handles present pages, when
4071	* * users try to write to a shared page (FAULT_FLAG_WRITE)
4072	* * GUP wants to take a R/O pin on a possibly shared anonymous page
4073	* (FAULT_FLAG_UNSHARE)
4074	*
4075	* It is done by copying the page to a new address and decrementing the
4076	* shared-page counter for the old page.
4077	*
4078	* Note that this routine assumes that the protection checks have been
4079	* done by the caller (the low-level page fault routine in most cases).
4080	* Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
4081	* done any necessary COW.
4082	*
4083	* In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
4084	* though the page will change only once the write actually happens. This
4085	* avoids a few races, and potentially makes it more efficient.
4086	*
4087	* We enter with non-exclusive mmap_lock (to exclude vma changes,
4088	* but allow concurrent faults), with pte both mapped and locked.
4089	* We return with mmap_lock still held, but pte unmapped and unlocked.
4090	*/
4091	static vm_fault_t do_wp_page(struct vm_fault *vmf)
4092	__releases(vmf->ptl)
4093	{
4094	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
4095	struct vm_area_struct *vma = vmf->vma;
4096	struct folio *folio = NULL;
4097	pte_t pte;
4098
4099	if (likely(!unshare)) {
4100	if (userfaultfd_pte_wp(vma, pte: ptep_get(ptep: vmf->pte))) {
4101	if (!userfaultfd_wp_async(vma)) {
4102	pte_unmap_unlock(vmf->pte, vmf->ptl);
4103	return handle_userfault(vmf, VM_UFFD_WP);
4104	}
4105
4106	/*
4107	* Nothing needed (cache flush, TLB invalidations,
4108	* etc.) because we're only removing the uffd-wp bit,
4109	* which is completely invisible to the user.
4110	*/
4111	pte = pte_clear_uffd_wp(pte: ptep_get(ptep: vmf->pte));
4112
4113	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
4114	/*
4115	* Update this to be prepared for following up CoW
4116	* handling
4117	*/
4118	vmf->orig_pte = pte;
4119	}
4120
4121	/*
4122	* Userfaultfd write-protect can defer flushes. Ensure the TLB
4123	* is flushed in this case before copying.
4124	*/
4125	if (unlikely(userfaultfd_wp(vmf->vma) &&
4126	mm_tlb_flush_pending(vmf->vma->vm_mm)))
4127	flush_tlb_page(vma: vmf->vma, a: vmf->address);
4128	}
4129
4130	vmf->page = vm_normal_page(vma, addr: vmf->address, pte: vmf->orig_pte);
4131
4132	if (vmf->page)
4133	folio = page_folio(vmf->page);
4134
4135	/*
4136	* Shared mapping: we are guaranteed to have VM_WRITE and
4137	* FAULT_FLAG_WRITE set at this point.
4138	*/
4139	if (vma->vm_flags & (VM_SHARED \| VM_MAYSHARE)) {
4140	/*
4141	* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
4142	* VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
4143	*
4144	* We should not cow pages in a shared writeable mapping.
4145	* Just mark the pages writable and/or call ops->pfn_mkwrite.
4146	*/
4147	if (!vmf->page \|\| is_fsdax_page(page: vmf->page)) {
4148	vmf->page = NULL;
4149	return wp_pfn_shared(vmf);
4150	}
4151	return wp_page_shared(vmf, folio);
4152	}
4153
4154	/*
4155	* Private mapping: create an exclusive anonymous page copy if reuse
4156	* is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
4157	*
4158	* If we encounter a page that is marked exclusive, we must reuse
4159	* the page without further checks.
4160	*/
4161	if (folio && folio_test_anon(folio) &&
4162	(PageAnonExclusive(page: vmf->page) \|\| wp_can_reuse_anon_folio(folio, vma))) {
4163	if (!PageAnonExclusive(page: vmf->page))
4164	SetPageAnonExclusive(vmf->page);
4165	if (unlikely(unshare)) {
4166	pte_unmap_unlock(vmf->pte, vmf->ptl);
4167	return `0`;
4168	}
4169	wp_page_reuse(vmf, folio);
4170	return `0`;
4171	}
4172	/*
4173	* Ok, we need to copy. Oh, well..
4174	*/
4175	if (folio)
4176	folio_get(folio);
4177
4178	pte_unmap_unlock(vmf->pte, vmf->ptl);
4179	#ifdef CONFIG_KSM
4180	if (folio && folio_test_ksm(folio))
4181	count_vm_event(item: COW_KSM);
4182	#endif
4183	return wp_page_copy(vmf);
4184	}
4185
/*
 * Zap the user address range [start_addr, end_addr) of a single VMA by
 * handing it to zap_page_range_single() as a (start, size) pair.
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
4192
4193	static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
4194	pgoff_t first_index,
4195	pgoff_t last_index,
4196	struct zap_details *details)
4197	{
4198	struct vm_area_struct *vma;
4199	pgoff_t vba, vea, zba, zea;
4200
4201	vma_interval_tree_foreach(vma, root, first_index, last_index) {
4202	vba = vma->vm_pgoff;
4203	vea = vba + vma_pages(vma) - `1`;
4204	zba = max(first_index, vba);
4205	zea = min(last_index, vea);
4206
4207	unmap_mapping_range_vma(vma,
4208	start_addr: ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
4209	end_addr: ((zea - vba + `1`) << PAGE_SHIFT) + vma->vm_start,
4210	details);
4211	}
4212	}
4213
4214	/**
4215	* unmap_mapping_folio() - Unmap single folio from processes.
4216	* @folio: The locked folio to be unmapped.
4217	*
4218	* Unmap this folio from any userspace process which still has it mmaped.
4219	* Typically, for efficiency, the range of nearby pages has already been
4220	* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
4221	* truncation or invalidation holds the lock on a folio, it may find that
4222	* the page has been remapped again: and then uses unmap_mapping_folio()
4223	* to unmap it finally.
4224	*/
4225	void unmap_mapping_folio(struct folio *folio)
4226	{
4227	struct address_space *mapping = folio->mapping;
4228	struct zap_details details = { };
4229	pgoff_t first_index;
4230	pgoff_t last_index;
4231
4232	VM_BUG_ON(!folio_test_locked(folio));
4233
4234	first_index = folio->index;
4235	last_index = folio_next_index(folio) - `1`;
4236
4237	details.even_cows = false;
4238	details.single_folio = folio;
4239	details.zap_flags = ZAP_FLAG_DROP_MARKER;
4240
4241	i_mmap_lock_read(mapping);
4242	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
4243	unmap_mapping_range_tree(root: &mapping->i_mmap, first_index,
4244	last_index, details: &details);
4245	i_mmap_unlock_read(mapping);
4246	}
4247
4248	/**
4249	* unmap_mapping_pages() - Unmap pages from processes.
4250	* @mapping: The address space containing pages to be unmapped.
4251	* @start: Index of first page to be unmapped.
4252	* @nr: Number of pages to be unmapped. 0 to unmap to end of file.
4253	* @even_cows: Whether to unmap even private COWed pages.
4254	*
4255	* Unmap the pages in this address space from any userspace process which
4256	* has them mmaped. Generally, you want to remove COWed pages as well when
4257	* a file is being truncated, but not when invalidating pages from the page
4258	* cache.
4259	*/
4260	void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
4261	pgoff_t nr, bool even_cows)
4262	{
4263	struct zap_details details = { };
4264	pgoff_t first_index = start;
4265	pgoff_t last_index = start + nr - `1`;
4266
4267	details.even_cows = even_cows;
4268	if (last_index < first_index)
4269	last_index = ULONG_MAX;
4270
4271	i_mmap_lock_read(mapping);
4272	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
4273	unmap_mapping_range_tree(root: &mapping->i_mmap, first_index,
4274	last_index, details: &details);
4275	i_mmap_unlock_read(mapping);
4276	}
4277	EXPORT_SYMBOL_GPL(unmap_mapping_pages);
4278
4279	/**
4280	* unmap_mapping_range - unmap the portion of all mmaps in the specified
4281	* address_space corresponding to the specified byte range in the underlying
4282	* file.
4283	*
4284	* @mapping: the address space containing mmaps to be unmapped.
4285	* @holebegin: byte in first page to unmap, relative to the start of
4286	* the underlying file. This will be rounded down to a PAGE_SIZE
4287	* boundary. Note that this is different from truncate_pagecache(), which
4288	* must keep the partial page. In contrast, we must get rid of
4289	* partial pages.
4290	* @holelen: size of prospective hole in bytes. This will be rounded
4291	* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
4292	* end of the file.
4293	* @even_cows: 1 when truncating a file, unmap even private COWed pages;
4294	* but 0 when invalidating pagecache, don't throw away private data.
4295	*/
4296	void unmap_mapping_range(struct address_space *mapping,
4297	loff_t const holebegin, loff_t const holelen, int even_cows)
4298	{
4299	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
4300	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - `1`) >> PAGE_SHIFT;
4301
4302	/ Check for overflow. /
4303	if (sizeof(holelen) > sizeof(hlen)) {
4304	long long holeend =
4305	(holebegin + holelen + PAGE_SIZE - `1`) >> PAGE_SHIFT;
4306	if (holeend & ~(long long)ULONG_MAX)
4307	hlen = ULONG_MAX - hba + `1`;
4308	}
4309
4310	unmap_mapping_pages(mapping, hba, hlen, even_cows);
4311	}
4312	EXPORT_SYMBOL(unmap_mapping_range);
4313
4314	/*
4315	* Restore a potential device exclusive pte to a working pte entry
4316	*/
4317	static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
4318	{
4319	struct folio *folio = page_folio(vmf->page);
4320	struct vm_area_struct *vma = vmf->vma;
4321	struct mmu_notifier_range range;
4322	vm_fault_t ret;
4323
4324	/*
4325	* We need a reference to lock the folio because we don't hold
4326	* the PTL so a racing thread can remove the device-exclusive
4327	* entry and unmap it. If the folio is free the entry must
4328	* have been removed already. If it happens to have already
4329	* been re-allocated after being freed all we do is lock and
4330	* unlock it.
4331	*/
4332	if (!folio_try_get(folio))
4333	return `0`;
4334
4335	ret = folio_lock_or_retry(folio, vmf);
4336	if (ret) {
4337	folio_put(folio);
4338	return ret;
4339	}
4340	mmu_notifier_range_init_owner(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`,
4341	mm: vma->vm_mm, start: vmf->address & PAGE_MASK,
4342	end: (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
4343	mmu_notifier_invalidate_range_start(range: &range);
4344
4345	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd, addr: vmf->address,
4346	ptlp: &vmf->ptl);
4347	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4348	restore_exclusive_pte(vma, folio, page: vmf->page, address: vmf->address,
4349	ptep: vmf->pte, orig_pte: vmf->orig_pte);
4350
4351	if (vmf->pte)
4352	pte_unmap_unlock(vmf->pte, vmf->ptl);
4353	folio_unlock(folio);
4354	folio_put(folio);
4355
4356	mmu_notifier_invalidate_range_end(range: &range);
4357	return `0`;
4358	}
4359
4360	static inline bool should_try_to_free_swap(struct folio *folio,
4361	struct vm_area_struct *vma,
4362	unsigned int fault_flags)
4363	{
4364	if (!folio_test_swapcache(folio))
4365	return false;
4366	if (mem_cgroup_swap_full(folio) \|\| (vma->vm_flags & VM_LOCKED) \|\|
4367	folio_test_mlocked(folio))
4368	return true;
4369	/*
4370	* If we want to map a page that's in the swapcache writable, we
4371	* have to detect via the refcount if we're really the exclusive
4372	* user. Try freeing the swapcache to get rid of the swapcache
4373	* reference only in case it's likely that we'll be the exclusive user.
4374	*/
4375	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
4376	folio_ref_count(folio) == (`1` + folio_nr_pages(folio));
4377	}
4378
4379	static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
4380	{
4381	vmf->pte = pte_offset_map_lock(mm: vmf->vma->vm_mm, pmd: vmf->pmd,
4382	addr: vmf->address, ptlp: &vmf->ptl);
4383	if (!vmf->pte)
4384	return `0`;
4385	/*
4386	* Be careful so that we will only recover a special uffd-wp pte into a
4387	* none pte. Otherwise it means the pte could have changed, so retry.
4388	*
4389	* This should also cover the case where e.g. the pte changed
4390	* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
4391	* So pte_is_marker() check is not enough to safely drop the pte.
4392	*/
4393	if (pte_same(a: vmf->orig_pte, b: ptep_get(ptep: vmf->pte)))
4394	pte_clear(mm: vmf->vma->vm_mm, addr: vmf->address, ptep: vmf->pte);
4395	pte_unmap_unlock(vmf->pte, vmf->ptl);
4396	return `0`;
4397	}
4398
4399	static vm_fault_t do_pte_missing(struct vm_fault *vmf)
4400	{
4401	if (vma_is_anonymous(vma: vmf->vma))
4402	return do_anonymous_page(vmf);
4403	else
4404	return do_fault(vmf);
4405	}
4406
4407	/*
4408	* This is actually a page-missing access, but with uffd-wp special pte
4409	* installed. It means this pte was wr-protected before being unmapped.
4410	*/
4411	static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
4412	{
4413	/*
4414	* Just in case there're leftover special ptes even after the region
4415	* got unregistered - we can simply clear them.
4416	*/
4417	if (unlikely(!userfaultfd_wp(vmf->vma)))
4418	return pte_marker_clear(vmf);
4419
4420	return do_pte_missing(vmf);
4421	}
4422
4423	static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
4424	{
4425	const softleaf_t entry = softleaf_from_pte(pte: vmf->orig_pte);
4426	const pte_marker marker = softleaf_to_marker(entry);
4427
4428	/*
4429	* PTE markers should never be empty. If anything weird happened,
4430	* the best thing to do is to kill the process along with its mm.
4431	*/
4432	if (WARN_ON_ONCE(!marker))
4433	return VM_FAULT_SIGBUS;
4434
4435	/ Higher priority than uffd-wp when data corrupted /
4436	if (marker & PTE_MARKER_POISONED)
4437	return VM_FAULT_HWPOISON;
4438
4439	/ Hitting a guard page is always a fatal condition. /
4440	if (marker & PTE_MARKER_GUARD)
4441	return VM_FAULT_SIGSEGV;
4442
4443	if (softleaf_is_uffd_wp_marker(entry))
4444	return pte_marker_handle_uffd_wp(vmf);
4445
4446	/ This is an unknown pte marker /
4447	return VM_FAULT_SIGBUS;
4448	}
4449
4450	static struct folio __alloc_swap_folio(struct* vm_fault *vmf)
4451	{
4452	struct vm_area_struct *vma = vmf->vma;
4453	struct folio *folio;
4454	softleaf_t entry;
4455
4456	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, `0`, vma, vmf->address);
4457	if (!folio)
4458	return NULL;
4459
4460	entry = softleaf_from_pte(pte: vmf->orig_pte);
4461	if (mem_cgroup_swapin_charge_folio(folio, mm: vma->vm_mm,
4462	GFP_KERNEL, entry)) {
4463	folio_put(folio);
4464	return NULL;
4465	}
4466
4467	return folio;
4468	}
4469
4470	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4471	/*
4472	* Check if the PTEs within a range are contiguous swap entries
4473	* and have consistent swapcache, zeromap.
4474	*/
4475	static bool can_swapin_thp(struct vm_fault vmf, pte_t ptep, int nr_pages)
4476	{
4477	unsigned long addr;
4478	softleaf_t entry;
4479	int idx;
4480	pte_t pte;
4481
4482	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
4483	idx = (vmf->address - addr) / PAGE_SIZE;
4484	pte = ptep_get(ptep);
4485
4486	if (!pte_same(a: pte, b: pte_move_swp_offset(pte: vmf->orig_pte, delta: -idx)))
4487	return false;
4488	entry = softleaf_from_pte(pte);
4489	if (swap_pte_batch(start_ptep: ptep, max_nr: nr_pages, pte) != nr_pages)
4490	return false;
4491
4492	/*
4493	* swap_read_folio() can't handle the case a large folio is hybridly
4494	* from different backends. And they are likely corner cases. Similar
4495	* things might be added once zswap support large folios.
4496	*/
4497	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
4498	return false;
4499	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
4500	return false;
4501
4502	return true;
4503	}
4504
4505	static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
4506	unsigned long addr,
4507	unsigned long orders)
4508	{
4509	int order, nr;
4510
4511	order = highest_order(orders);
4512
4513	/*
4514	* To swap in a THP with nr pages, we require that its first swap_offset
4515	* is aligned with that number, as it was when the THP was swapped out.
4516	* This helps filter out most invalid entries.
4517	*/
4518	while (orders) {
4519	nr = `1` << order;
4520	if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
4521	break;
4522	order = next_order(orders: &orders, prev: order);
4523	}
4524
4525	return orders;
4526	}
4527
4528	static struct folio alloc_swap_folio(struct* vm_fault *vmf)
4529	{
4530	struct vm_area_struct *vma = vmf->vma;
4531	unsigned long orders;
4532	struct folio *folio;
4533	unsigned long addr;
4534	softleaf_t entry;
4535	spinlock_t *ptl;
4536	pte_t *pte;
4537	gfp_t gfp;
4538	int order;
4539
4540	/*
4541	* If uffd is active for the vma we need per-page fault fidelity to
4542	* maintain the uffd semantics.
4543	*/
4544	if (unlikely(userfaultfd_armed(vma)))
4545	goto fallback;
4546
4547	/*
4548	* A large swapped out folio could be partially or fully in zswap. We
4549	* lack handling for such cases, so fallback to swapping in order-0
4550	* folio.
4551	*/
4552	if (!zswap_never_enabled())
4553	goto fallback;
4554
4555	entry = softleaf_from_pte(pte: vmf->orig_pte);
4556	/*
4557	* Get a list of all the (large) orders below PMD_ORDER that are enabled
4558	* and suitable for swapping THP.
4559	*/
4560	orders = thp_vma_allowable_orders(vma, vm_flags: vma->vm_flags, type: TVA_PAGEFAULT,
4561	BIT(PMD_ORDER) - `1`);
4562	orders = thp_vma_suitable_orders(vma, addr: vmf->address, orders);
4563	orders = thp_swap_suitable_orders(swp_offset: swp_offset(entry),
4564	addr: vmf->address, orders);
4565
4566	if (!orders)
4567	goto fallback;
4568
4569	pte = pte_offset_map_lock(mm: vmf->vma->vm_mm, pmd: vmf->pmd,
4570	addr: vmf->address & PMD_MASK, ptlp: &ptl);
4571	if (unlikely(!pte))
4572	goto fallback;
4573
4574	/*
4575	* For do_swap_page, find the highest order where the aligned range is
4576	* completely swap entries with contiguous swap offsets.
4577	*/
4578	order = highest_order(orders);
4579	while (orders) {
4580	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
4581	if (can_swapin_thp(vmf, ptep: pte + pte_index(address: addr), nr_pages: `1` << order))
4582	break;
4583	order = next_order(orders: &orders, prev: order);
4584	}
4585
4586	pte_unmap_unlock(pte, ptl);
4587
4588	/ Try allocating the highest of the remaining orders. /
4589	gfp = vma_thp_gfp_mask(vma);
4590	while (orders) {
4591	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
4592	folio = vma_alloc_folio(gfp, order, vma, addr);
4593	if (folio) {
4594	if (!mem_cgroup_swapin_charge_folio(folio, mm: vma->vm_mm,
4595	gfp, entry))
4596	return folio;
4597	count_mthp_stat(order, item: MTHP_STAT_SWPIN_FALLBACK_CHARGE);
4598	folio_put(folio);
4599	}
4600	count_mthp_stat(order, item: MTHP_STAT_SWPIN_FALLBACK);
4601	order = next_order(orders: &orders, prev: order);
4602	}
4603
4604	fallback:
4605	return __alloc_swap_folio(vmf);
4606	}
4607	#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
4608	static struct folio alloc_swap_folio(struct* vm_fault *vmf)
4609	{
4610	return __alloc_swap_folio(vmf);
4611	}
4612	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4613
/*
 * Wait queue that do_swap_page() briefly sleeps on when swapcache_prepare()
 * fails, to avoid rapid repeated page faults.
 */
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
4615
4616	/*
4617	* We enter with non-exclusive mmap_lock (to exclude vma changes,
4618	* but allow concurrent faults), and pte mapped but not yet locked.
4619	* We return with pte unmapped and unlocked.
4620	*
4621	* We return with the mmap_lock locked or unlocked in the same cases
4622	* as does filemap_fault().
4623	*/
4624	vm_fault_t do_swap_page(struct vm_fault *vmf)
4625	{
4626	struct vm_area_struct *vma = vmf->vma;
4627	struct folio swapcache, folio = NULL;
4628	DECLARE_WAITQUEUE(wait, current);
4629	struct page *page;
4630	struct swap_info_struct *si = NULL;
4631	rmap_t rmap_flags = RMAP_NONE;
4632	bool need_clear_cache = false;
4633	bool exclusive = false;
4634	softleaf_t entry;
4635	pte_t pte;
4636	vm_fault_t ret = `0`;
4637	void *shadow = NULL;
4638	int nr_pages;
4639	unsigned long page_idx;
4640	unsigned long address;
4641	pte_t *ptep;
4642
4643	if (!pte_unmap_same(vmf))
4644	goto out;
4645
4646	entry = softleaf_from_pte(pte: vmf->orig_pte);
4647	if (unlikely(!softleaf_is_swap(entry))) {
4648	if (softleaf_is_migration(entry)) {
4649	migration_entry_wait(mm: vma->vm_mm, pmd: vmf->pmd,
4650	address: vmf->address);
4651	} else if (softleaf_is_device_exclusive(entry)) {
4652	vmf->page = softleaf_to_page(entry);
4653	ret = remove_device_exclusive_entry(vmf);
4654	} else if (softleaf_is_device_private(entry)) {
4655	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
4656	/*
4657	* migrate_to_ram is not yet ready to operate
4658	* under VMA lock.
4659	*/
4660	vma_end_read(vma);
4661	ret = VM_FAULT_RETRY;
4662	goto out;
4663	}
4664
4665	vmf->page = softleaf_to_page(entry);
4666	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
4667	addr: vmf->address, ptlp: &vmf->ptl);
4668	if (unlikely(!vmf->pte \|\|
4669	!pte_same(ptep_get(vmf->pte),
4670	vmf->orig_pte)))
4671	goto unlock;
4672
4673	/*
4674	* Get a page reference while we know the page can't be
4675	* freed.
4676	*/
4677	if (trylock_page(page: vmf->page)) {
4678	struct dev_pagemap *pgmap;
4679
4680	get_page(page: vmf->page);
4681	pte_unmap_unlock(vmf->pte, vmf->ptl);
4682	pgmap = page_pgmap(page: vmf->page);
4683	ret = pgmap->ops->migrate_to_ram(vmf);
4684	unlock_page(page: vmf->page);
4685	put_page(page: vmf->page);
4686	} else {
4687	pte_unmap_unlock(vmf->pte, vmf->ptl);
4688	}
4689	} else if (softleaf_is_hwpoison(entry)) {
4690	ret = VM_FAULT_HWPOISON;
4691	} else if (softleaf_is_marker(entry)) {
4692	ret = handle_pte_marker(vmf);
4693	} else {
4694	print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
4695	ret = VM_FAULT_SIGBUS;
4696	}
4697	goto out;
4698	}
4699
4700	/ Prevent swapoff from happening to us. /
4701	si = get_swap_device(entry);
4702	if (unlikely(!si))
4703	goto out;
4704
4705	folio = swap_cache_get_folio(entry);
4706	if (folio)
4707	swap_update_readahead(folio, vma, addr: vmf->address);
4708	swapcache = folio;
4709
4710	if (!folio) {
4711	if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
4712	__swap_count(entry) == `1`) {
4713	/ skip swapcache /
4714	folio = alloc_swap_folio(vmf);
4715	if (folio) {
4716	__folio_set_locked(folio);
4717	__folio_set_swapbacked(folio);
4718
4719	nr_pages = folio_nr_pages(folio);
4720	if (folio_test_large(folio))
4721	entry.val = ALIGN_DOWN(entry.val, nr_pages);
4722	/*
4723	* Prevent parallel swapin from proceeding with
4724	* the cache flag. Otherwise, another thread
4725	* may finish swapin first, free the entry, and
4726	* swapout reusing the same entry. It's
4727	* undetectable as pte_same() returns true due
4728	* to entry reuse.
4729	*/
4730	if (swapcache_prepare(entry, nr: nr_pages)) {
4731	/*
4732	* Relax a bit to prevent rapid
4733	* repeated page faults.
4734	*/
4735	add_wait_queue(wq_head: &swapcache_wq, wq_entry: &wait);
4736	schedule_timeout_uninterruptible(timeout: `1`);
4737	remove_wait_queue(wq_head: &swapcache_wq, wq_entry: &wait);
4738	goto out_page;
4739	}
4740	need_clear_cache = true;
4741
4742	memcg1_swapin(entry, nr_pages);
4743
4744	shadow = swap_cache_get_shadow(entry);
4745	if (shadow)
4746	workingset_refault(folio, shadow);
4747
4748	folio_add_lru(folio);
4749
4750	/ To provide entry to swap_read_folio() /
4751	folio->swap = entry;
4752	swap_read_folio(folio, NULL);
4753	folio->private = NULL;
4754	}
4755	} else {
4756	folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
4757	vmf);
4758	swapcache = folio;
4759	}
4760
4761	if (!folio) {
4762	/*
4763	* Back out if somebody else faulted in this pte
4764	* while we released the pte lock.
4765	*/
4766	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
4767	addr: vmf->address, ptlp: &vmf->ptl);
4768	if (likely(vmf->pte &&
4769	pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4770	ret = VM_FAULT_OOM;
4771	goto unlock;
4772	}
4773
4774	/ Had to read the page from swap area: Major fault /
4775	ret = VM_FAULT_MAJOR;
4776	count_vm_event(item: PGMAJFAULT);
4777	count_memcg_event_mm(mm: vma->vm_mm, idx: PGMAJFAULT);
4778	}
4779
4780	ret \|= folio_lock_or_retry(folio, vmf);
4781	if (ret & VM_FAULT_RETRY)
4782	goto out_release;
4783
4784	page = folio_file_page(folio, index: swp_offset(entry));
4785	if (swapcache) {
4786	/*
4787	* Make sure folio_free_swap() or swapoff did not release the
4788	* swapcache from under us. The page pin, and pte_same test
4789	* below, are not enough to exclude that. Even if it is still
4790	* swapcache, we need to check that the page's swap has not
4791	* changed.
4792	*/
4793	if (unlikely(!folio_matches_swap_entry(folio, entry)))
4794	goto out_page;
4795
4796	if (unlikely(PageHWPoison(page))) {
4797	/*
4798	* hwpoisoned dirty swapcache pages are kept for killing
4799	* owner processes (which may be unknown at hwpoison time)
4800	*/
4801	ret = VM_FAULT_HWPOISON;
4802	goto out_page;
4803	}
4804
4805	/*
4806	* KSM sometimes has to copy on read faults, for example, if
4807	* folio->index of non-ksm folios would be nonlinear inside the
4808	* anon VMA -- the ksm flag is lost on actual swapout.
4809	*/
4810	folio = ksm_might_need_to_copy(folio, vma, addr: vmf->address);
4811	if (unlikely(!folio)) {
4812	ret = VM_FAULT_OOM;
4813	folio = swapcache;
4814	goto out_page;
4815	} else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
4816	ret = VM_FAULT_HWPOISON;
4817	folio = swapcache;
4818	goto out_page;
4819	}
4820	if (folio != swapcache)
4821	page = folio_page(folio, `0`);
4822
4823	/*
4824	* If we want to map a page that's in the swapcache writable, we
4825	* have to detect via the refcount if we're really the exclusive
4826	* owner. Try removing the extra reference from the local LRU
4827	* caches if required.
4828	*/
4829	if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
4830	!folio_test_ksm(folio) && !folio_test_lru(folio))
4831	lru_add_drain();
4832	}
4833
4834	folio_throttle_swaprate(folio, GFP_KERNEL);
4835
4836	/*
4837	* Back out if somebody else already faulted in this pte.
4838	*/
4839	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd, addr: vmf->address,
4840	ptlp: &vmf->ptl);
4841	if (unlikely(!vmf->pte \|\| !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4842	goto out_nomap;
4843
4844	if (unlikely(!folio_test_uptodate(folio))) {
4845	ret = VM_FAULT_SIGBUS;
4846	goto out_nomap;
4847	}
4848
4849	/ allocated large folios for SWP_SYNCHRONOUS_IO /
4850	if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
4851	unsigned long nr = folio_nr_pages(folio);
4852	unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
4853	unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
4854	pte_t *folio_ptep = vmf->pte - idx;
4855	pte_t folio_pte = ptep_get(ptep: folio_ptep);
4856
4857	if (!pte_same(a: folio_pte, b: pte_move_swp_offset(pte: vmf->orig_pte, delta: -idx)) \|\|
4858	swap_pte_batch(start_ptep: folio_ptep, max_nr: nr, pte: folio_pte) != nr)
4859	goto out_nomap;
4860
4861	page_idx = idx;
4862	address = folio_start;
4863	ptep = folio_ptep;
4864	goto check_folio;
4865	}
4866
4867	nr_pages = `1`;
4868	page_idx = `0`;
4869	address = vmf->address;
4870	ptep = vmf->pte;
4871	if (folio_test_large(folio) && folio_test_swapcache(folio)) {
4872	int nr = folio_nr_pages(folio);
4873	unsigned long idx = folio_page_idx(folio, page);
4874	unsigned long folio_start = address - idx * PAGE_SIZE;
4875	unsigned long folio_end = folio_start + nr * PAGE_SIZE;
4876	pte_t *folio_ptep;
4877	pte_t folio_pte;
4878
4879	if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
4880	goto check_folio;
4881	if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
4882	goto check_folio;
4883
4884	folio_ptep = vmf->pte - idx;
4885	folio_pte = ptep_get(ptep: folio_ptep);
4886	if (!pte_same(a: folio_pte, b: pte_move_swp_offset(pte: vmf->orig_pte, delta: -idx)) \|\|
4887	swap_pte_batch(start_ptep: folio_ptep, max_nr: nr, pte: folio_pte) != nr)
4888	goto check_folio;
4889
4890	page_idx = idx;
4891	address = folio_start;
4892	ptep = folio_ptep;
4893	nr_pages = nr;
4894	entry = folio->swap;
4895	page = &folio->page;
4896	}
4897
4898	check_folio:
4899	/*
4900	* PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
4901	* must never point at an anonymous page in the swapcache that is
4902	* PG_anon_exclusive. Sanity check that this holds and especially, that
4903	* no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
4904	* check after taking the PT lock and making sure that nobody
4905	* concurrently faulted in this page and set PG_anon_exclusive.
4906	*/
4907	BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
4908	BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
4909
4910	/*
4911	* Check under PT lock (to protect against concurrent fork() sharing
4912	* the swap entry concurrently) for certainly exclusive pages.
4913	*/
4914	if (!folio_test_ksm(folio)) {
4915	exclusive = pte_swp_exclusive(pte: vmf->orig_pte);
4916	if (folio != swapcache) {
4917	/*
4918	* We have a fresh page that is not exposed to the
4919	* swapcache -> certainly exclusive.
4920	*/
4921	exclusive = true;
4922	} else if (exclusive && folio_test_writeback(folio) &&
4923	data_race(si->flags & SWP_STABLE_WRITES)) {
4924	/*
4925	* This is tricky: not all swap backends support
4926	* concurrent page modifications while under writeback.
4927	*
4928	* So if we stumble over such a page in the swapcache
4929	* we must not set the page exclusive, otherwise we can
4930	* map it writable without further checks and modify it
4931	* while still under writeback.
4932	*
4933	* For these problematic swap backends, simply drop the
4934	* exclusive marker: this is perfectly fine as we start
4935	* writeback only if we fully unmapped the page and
4936	* there are no unexpected references on the page after
4937	* unmapping succeeded. After fully unmapped, no
4938	* further GUP references (FOLL_GET and FOLL_PIN) can
4939	* appear, so dropping the exclusive marker and mapping
4940	* it only R/O is fine.
4941	*/
4942	exclusive = false;
4943	}
4944	}
4945
4946	/*
4947	* Some architectures may have to restore extra metadata to the page
4948	* when reading from swap. This metadata may be indexed by swap entry
4949	* so this must be called before swap_free().
4950	*/
4951	arch_swap_restore(entry: folio_swap(entry, folio), folio);
4952
4953	/*
4954	* Remove the swap entry and conditionally try to free up the swapcache.
4955	* We're already holding a reference on the page but haven't mapped it
4956	* yet.
4957	*/
4958	swap_free_nr(entry, nr_pages);
4959	if (should_try_to_free_swap(folio, vma, fault_flags: vmf->flags))
4960	folio_free_swap(folio);
4961
4962	add_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES, value: nr_pages);
4963	add_mm_counter(mm: vma->vm_mm, member: MM_SWAPENTS, value: -nr_pages);
4964	pte = mk_pte(page, pgprot: vma->vm_page_prot);
4965	if (pte_swp_soft_dirty(pte: vmf->orig_pte))
4966	pte = pte_mksoft_dirty(pte);
4967	if (pte_swp_uffd_wp(pte: vmf->orig_pte))
4968	pte = pte_mkuffd_wp(pte);
4969
4970	/*
4971	* Same logic as in do_wp_page(); however, optimize for pages that are
4972	* certainly not shared either because we just allocated them without
4973	* exposing them to the swapcache or because the swap entry indicates
4974	* exclusivity.
4975	*/
4976	if (!folio_test_ksm(folio) &&
4977	(exclusive \|\| folio_ref_count(folio) == `1`)) {
4978	if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) &&
4979	!pte_needs_soft_dirty_wp(vma, pte)) {
4980	pte = pte_mkwrite(pte, vma);
4981	if (vmf->flags & FAULT_FLAG_WRITE) {
4982	pte = pte_mkdirty(pte);
4983	vmf->flags &= ~FAULT_FLAG_WRITE;
4984	}
4985	}
4986	rmap_flags \|= RMAP_EXCLUSIVE;
4987	}
4988	folio_ref_add(folio, nr: nr_pages - `1`);
4989	flush_icache_pages(vma, page, nr: nr_pages);
4990	vmf->orig_pte = pte_advance_pfn(pte, nr: page_idx);
4991
4992	/ ksm created a completely new copy /
4993	if (unlikely(folio != swapcache && swapcache)) {
4994	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
4995	folio_add_lru_vma(folio, vma);
4996	} else if (!folio_test_anon(folio)) {
4997	/*
4998	* We currently only expect small !anon folios which are either
4999	* fully exclusive or fully shared, or new allocated large
5000	* folios which are fully exclusive. If we ever get large
5001	* folios within swapcache here, we have to be careful.
5002	*/
5003	VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
5004	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
5005	folio_add_new_anon_rmap(folio, vma, address, flags: rmap_flags);
5006	} else {
5007	folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
5008	flags: rmap_flags);
5009	}
5010
5011	VM_BUG_ON(!folio_test_anon(folio) \|\|
5012	(pte_write(pte) && !PageAnonExclusive(page)));
5013	set_ptes(mm: vma->vm_mm, addr: address, ptep, pte, nr: nr_pages);
5014	arch_do_swap_page_nr(mm: vma->vm_mm, vma, addr: address,
5015	pte, oldpte: pte, nr: nr_pages);
5016
5017	folio_unlock(folio);
5018	if (folio != swapcache && swapcache) {
5019	/*
5020	* Hold the lock to avoid the swap entry to be reused
5021	* until we take the PT lock for the pte_same() check
5022	* (to avoid false positives from pte_same). For
5023	* further safety release the lock after the swap_free
5024	* so that the swap count won't change under a
5025	* parallel locked swapcache.
5026	*/
5027	folio_unlock(folio: swapcache);
5028	folio_put(folio: swapcache);
5029	}
5030
5031	if (vmf->flags & FAULT_FLAG_WRITE) {
5032	ret \|= do_wp_page(vmf);
5033	if (ret & VM_FAULT_ERROR)
5034	ret &= VM_FAULT_ERROR;
5035	goto out;
5036	}
5037
5038	/ No need to invalidate - it was non-present before /
5039	update_mmu_cache_range(vmf, vma, addr: address, ptep, nr: nr_pages);
5040	unlock:
5041	if (vmf->pte)
5042	pte_unmap_unlock(vmf->pte, vmf->ptl);
5043	out:
5044	/ Clear the swap cache pin for direct swapin after PTL unlock /
5045	if (need_clear_cache) {
5046	swapcache_clear(si, entry, nr: nr_pages);
5047	if (waitqueue_active(wq_head: &swapcache_wq))
5048	wake_up(&swapcache_wq);
5049	}
5050	if (si)
5051	put_swap_device(si);
5052	return ret;
5053	out_nomap:
5054	if (vmf->pte)
5055	pte_unmap_unlock(vmf->pte, vmf->ptl);
5056	out_page:
5057	folio_unlock(folio);
5058	out_release:
5059	folio_put(folio);
5060	if (folio != swapcache && swapcache) {
5061	folio_unlock(folio: swapcache);
5062	folio_put(folio: swapcache);
5063	}
5064	if (need_clear_cache) {
5065	swapcache_clear(si, entry, nr: nr_pages);
5066	if (waitqueue_active(wq_head: &swapcache_wq))
5067	wake_up(&swapcache_wq);
5068	}
5069	if (si)
5070	put_swap_device(si);
5071	return ret;
5072	}
5073
5074	static bool pte_range_none(pte_t pte, int* nr_pages)
5075	{
5076	int i;
5077
5078	for (i = `0`; i < nr_pages; i++) {
5079	if (!pte_none(pte: ptep_get_lockless(ptep: pte + i)))
5080	return false;
5081	}
5082
5083	return true;
5084	}
5085
5086	static struct folio alloc_anon_folio(struct* vm_fault *vmf)
5087	{
5088	struct vm_area_struct *vma = vmf->vma;
5089	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5090	unsigned long orders;
5091	struct folio *folio;
5092	unsigned long addr;
5093	pte_t *pte;
5094	gfp_t gfp;
5095	int order;
5096
5097	/*
5098	* If uffd is active for the vma we need per-page fault fidelity to
5099	* maintain the uffd semantics.
5100	*/
5101	if (unlikely(userfaultfd_armed(vma)))
5102	goto fallback;
5103
5104	/*
5105	* Get a list of all the (large) orders below PMD_ORDER that are enabled
5106	* for this vma. Then filter out the orders that can't be allocated over
5107	* the faulting address and still be fully contained in the vma.
5108	*/
5109	orders = thp_vma_allowable_orders(vma, vm_flags: vma->vm_flags, type: TVA_PAGEFAULT,
5110	BIT(PMD_ORDER) - `1`);
5111	orders = thp_vma_suitable_orders(vma, addr: vmf->address, orders);
5112
5113	if (!orders)
5114	goto fallback;
5115
5116	pte = pte_offset_map(pmd: vmf->pmd, addr: vmf->address & PMD_MASK);
5117	if (!pte)
5118	return ERR_PTR(error: -EAGAIN);
5119
5120	/*
5121	* Find the highest order where the aligned range is completely
5122	* pte_none(). Note that all remaining orders will be completely
5123	* pte_none().
5124	*/
5125	order = highest_order(orders);
5126	while (orders) {
5127	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
5128	if (pte_range_none(pte: pte + pte_index(address: addr), nr_pages: `1` << order))
5129	break;
5130	order = next_order(orders: &orders, prev: order);
5131	}
5132
5133	pte_unmap(pte);
5134
5135	if (!orders)
5136	goto fallback;
5137
5138	/ Try allocating the highest of the remaining orders. /
5139	gfp = vma_thp_gfp_mask(vma);
5140	while (orders) {
5141	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
5142	folio = vma_alloc_folio(gfp, order, vma, addr);
5143	if (folio) {
5144	if (mem_cgroup_charge(folio, mm: vma->vm_mm, gfp)) {
5145	count_mthp_stat(order, item: MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
5146	folio_put(folio);
5147	goto next;
5148	}
5149	folio_throttle_swaprate(folio, gfp);
5150	/*
5151	* When a folio is not zeroed during allocation
5152	* (__GFP_ZERO not used) or user folios require special
5153	* handling, folio_zero_user() is used to make sure
5154	* that the page corresponding to the faulting address
5155	* will be hot in the cache after zeroing.
5156	*/
5157	if (user_alloc_needs_zeroing())
5158	folio_zero_user(folio, addr_hint: vmf->address);
5159	return folio;
5160	}
5161	next:
5162	count_mthp_stat(order, item: MTHP_STAT_ANON_FAULT_FALLBACK);
5163	order = next_order(orders: &orders, prev: order);
5164	}
5165
5166	fallback:
5167	#endif
5168	return folio_prealloc(src_mm: vma->vm_mm, vma, addr: vmf->address, need_zero: true);
5169	}
5170
5171	/*
5172	* We enter with non-exclusive mmap_lock (to exclude vma changes,
5173	* but allow concurrent faults), and pte mapped but not yet locked.
5174	* We return with mmap_lock still held, but pte unmapped and unlocked.
5175	*/
5176	static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
5177	{
5178	struct vm_area_struct *vma = vmf->vma;
5179	unsigned long addr = vmf->address;
5180	struct folio *folio;
5181	vm_fault_t ret = `0`;
5182	int nr_pages = `1`;
5183	pte_t entry;
5184
5185	/ File mapping without ->vm_ops ? /
5186	if (vma->vm_flags & VM_SHARED)
5187	return VM_FAULT_SIGBUS;
5188
5189	/*
5190	* Use pte_alloc() instead of pte_alloc_map(), so that OOM can
5191	* be distinguished from a transient failure of pte_offset_map().
5192	*/
5193	if (pte_alloc(vma->vm_mm, vmf->pmd))
5194	return VM_FAULT_OOM;
5195
5196	/ Use the zero-page for reads /
5197	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
5198	!mm_forbids_zeropage(vma->vm_mm)) {
5199	entry = pte_mkspecial(pte: pfn_pte(page_nr: my_zero_pfn(addr: vmf->address),
5200	pgprot: vma->vm_page_prot));
5201	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
5202	addr: vmf->address, ptlp: &vmf->ptl);
5203	if (!vmf->pte)
5204	goto unlock;
5205	if (vmf_pte_changed(vmf)) {
5206	update_mmu_tlb(vma, address: vmf->address, ptep: vmf->pte);
5207	goto unlock;
5208	}
5209	ret = check_stable_address_space(mm: vma->vm_mm);
5210	if (ret)
5211	goto unlock;
5212	/ Deliver the page fault to userland, check inside PT lock /
5213	if (userfaultfd_missing(vma)) {
5214	pte_unmap_unlock(vmf->pte, vmf->ptl);
5215	return handle_userfault(vmf, VM_UFFD_MISSING);
5216	}
5217	goto setpte;
5218	}
5219
5220	/ Allocate our own private page. /
5221	ret = vmf_anon_prepare(vmf);
5222	if (ret)
5223	return ret;
5224	/ Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault /
5225	folio = alloc_anon_folio(vmf);
5226	if (IS_ERR(ptr: folio))
5227	return `0`;
5228	if (!folio)
5229	goto oom;
5230
5231	nr_pages = folio_nr_pages(folio);
5232	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
5233
5234	/*
5235	* The memory barrier inside __folio_mark_uptodate makes sure that
5236	* preceding stores to the page contents become visible before
5237	* the set_pte_at() write.
5238	*/
5239	__folio_mark_uptodate(folio);
5240
5241	entry = folio_mk_pte(folio, pgprot: vma->vm_page_prot);
5242	entry = pte_sw_mkyoung(pte: entry);
5243	if (vma->vm_flags & VM_WRITE)
5244	entry = pte_mkwrite(pte: pte_mkdirty(pte: entry), vma);
5245
5246	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd, addr, ptlp: &vmf->ptl);
5247	if (!vmf->pte)
5248	goto release;
5249	if (nr_pages == `1` && vmf_pte_changed(vmf)) {
5250	update_mmu_tlb(vma, address: addr, ptep: vmf->pte);
5251	goto release;
5252	} else if (nr_pages > `1` && !pte_range_none(pte: vmf->pte, nr_pages)) {
5253	update_mmu_tlb_range(vma, address: addr, ptep: vmf->pte, nr: nr_pages);
5254	goto release;
5255	}
5256
5257	ret = check_stable_address_space(mm: vma->vm_mm);
5258	if (ret)
5259	goto release;
5260
5261	/ Deliver the page fault to userland, check inside PT lock /
5262	if (userfaultfd_missing(vma)) {
5263	pte_unmap_unlock(vmf->pte, vmf->ptl);
5264	folio_put(folio);
5265	return handle_userfault(vmf, VM_UFFD_MISSING);
5266	}
5267
5268	folio_ref_add(folio, nr: nr_pages - `1`);
5269	add_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES, value: nr_pages);
5270	count_mthp_stat(order: folio_order(folio), item: MTHP_STAT_ANON_FAULT_ALLOC);
5271	folio_add_new_anon_rmap(folio, vma, address: addr, RMAP_EXCLUSIVE);
5272	folio_add_lru_vma(folio, vma);
5273	setpte:
5274	if (vmf_orig_pte_uffd_wp(vmf))
5275	entry = pte_mkuffd_wp(pte: entry);
5276	set_ptes(mm: vma->vm_mm, addr, ptep: vmf->pte, pte: entry, nr: nr_pages);
5277
5278	/ No need to invalidate - it was non-present before /
5279	update_mmu_cache_range(vmf, vma, addr, ptep: vmf->pte, nr: nr_pages);
5280	unlock:
5281	if (vmf->pte)
5282	pte_unmap_unlock(vmf->pte, vmf->ptl);
5283	return ret;
5284	release:
5285	folio_put(folio);
5286	goto unlock;
5287	oom:
5288	return VM_FAULT_OOM;
5289	}
5290
5291	/*
5292	* The mmap_lock must have been held on entry, and may have been
5293	* released depending on flags and vma->vm_ops->fault() return value.
5294	* See filemap_fault() and __lock_page_retry().
5295	*/
5296	static vm_fault_t __do_fault(struct vm_fault *vmf)
5297	{
5298	struct vm_area_struct *vma = vmf->vma;
5299	struct folio *folio;
5300	vm_fault_t ret;
5301
5302	/*
5303	* Preallocate pte before we take page_lock because this might lead to
5304	* deadlocks for memcg reclaim which waits for pages under writeback:
5305	* lock_page(A)
5306	* SetPageWriteback(A)
5307	* unlock_page(A)
5308	* lock_page(B)
5309	* lock_page(B)
5310	* pte_alloc_one
5311	* shrink_folio_list
5312	* wait_on_page_writeback(A)
5313	* SetPageWriteback(B)
5314	* unlock_page(B)
5315	* # flush A, B to clear the writeback
5316	*/
5317	if (pmd_none(pmd: *vmf->pmd) && !vmf->prealloc_pte) {
5318	vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
5319	if (!vmf->prealloc_pte)
5320	return VM_FAULT_OOM;
5321	}
5322
5323	ret = vma->vm_ops->fault(vmf);
5324	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY \|
5325	VM_FAULT_DONE_COW)))
5326	return ret;
5327
5328	folio = page_folio(vmf->page);
5329	if (unlikely(PageHWPoison(vmf->page))) {
5330	vm_fault_t poisonret = VM_FAULT_HWPOISON;
5331	if (ret & VM_FAULT_LOCKED) {
5332	if (page_mapped(page: vmf->page))
5333	unmap_mapping_folio(folio);
5334	/ Retry if a clean folio was removed from the cache. /
5335	if (mapping_evict_folio(mapping: folio->mapping, folio))
5336	poisonret = VM_FAULT_NOPAGE;
5337	folio_unlock(folio);
5338	}
5339	folio_put(folio);
5340	vmf->page = NULL;
5341	return poisonret;
5342	}
5343
5344	if (unlikely(!(ret & VM_FAULT_LOCKED)))
5345	folio_lock(folio);
5346	else
5347	VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);
5348
5349	return ret;
5350	}
5351
5352	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5353	static void deposit_prealloc_pte(struct vm_fault *vmf)
5354	{
5355	struct vm_area_struct *vma = vmf->vma;
5356
5357	pgtable_trans_huge_deposit(mm: vma->vm_mm, pmdp: vmf->pmd, pgtable: vmf->prealloc_pte);
5358	/*
5359	* We are going to consume the prealloc table,
5360	* count that as nr_ptes.
5361	*/
5362	mm_inc_nr_ptes(mm: vma->vm_mm);
5363	vmf->prealloc_pte = NULL;
5364	}
5365
5366	vm_fault_t do_set_pmd(struct vm_fault vmf, struct* folio folio, struct* page *page)
5367	{
5368	struct vm_area_struct *vma = vmf->vma;
5369	bool write = vmf->flags & FAULT_FLAG_WRITE;
5370	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
5371	pmd_t entry;
5372	vm_fault_t ret = VM_FAULT_FALLBACK;
5373
5374	/*
5375	* It is too late to allocate a small folio, we already have a large
5376	* folio in the pagecache: especially s390 KVM cannot tolerate any
5377	* PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
5378	* PMD mappings if THPs are disabled. As we already have a THP,
5379	* behave as if we are forcing a collapse.
5380	*/
5381	if (thp_disabled_by_hw() \|\| vma_thp_disabled(vma, vm_flags: vma->vm_flags,
5382	/ forced_collapse=/ true))
5383	return ret;
5384
5385	if (!thp_vma_suitable_order(vma, addr: haddr, PMD_ORDER))
5386	return ret;
5387
5388	if (folio_order(folio) != HPAGE_PMD_ORDER)
5389	return ret;
5390	page = &folio->page;
5391
5392	/*
5393	* Just backoff if any subpage of a THP is corrupted otherwise
5394	* the corrupted page may mapped by PMD silently to escape the
5395	* check. This kind of THP just can be PTE mapped. Access to
5396	* the corrupted subpage should trigger SIGBUS as expected.
5397	*/
5398	if (unlikely(folio_test_has_hwpoisoned(folio)))
5399	return ret;
5400
5401	/*
5402	* Archs like ppc64 need additional space to store information
5403	* related to pte entry. Use the preallocated table for that.
5404	*/
5405	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
5406	vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
5407	if (!vmf->prealloc_pte)
5408	return VM_FAULT_OOM;
5409	}
5410
5411	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
5412	if (unlikely(!pmd_none(*vmf->pmd)))
5413	goto out;
5414
5415	flush_icache_pages(vma, page, HPAGE_PMD_NR);
5416
5417	entry = folio_mk_pmd(folio, pgprot: vma->vm_page_prot);
5418	if (write)
5419	entry = maybe_pmd_mkwrite(pmd: pmd_mkdirty(pmd: entry), vma);
5420
5421	add_mm_counter(mm: vma->vm_mm, member: mm_counter_file(folio), HPAGE_PMD_NR);
5422	folio_add_file_rmap_pmd(folio, page, vma);
5423
5424	/*
5425	* deposit and withdraw with pmd lock held
5426	*/
5427	if (arch_needs_pgtable_deposit())
5428	deposit_prealloc_pte(vmf);
5429
5430	set_pmd_at(mm: vma->vm_mm, addr: haddr, pmdp: vmf->pmd, pmd: entry);
5431
5432	update_mmu_cache_pmd(vma, addr: haddr, pmd: vmf->pmd);
5433
5434	/ fault is handled /
5435	ret = `0`;
5436	count_vm_event(item: THP_FILE_MAPPED);
5437	out:
5438	spin_unlock(lock: vmf->ptl);
5439	return ret;
5440	}
5441	#else
5442	vm_fault_t do_set_pmd(struct vm_fault vmf, struct* folio folio, struct* page *page)
5443	{
5444	return VM_FAULT_FALLBACK;
5445	}
5446	#endif
5447
5448	/**
5449	* set_pte_range - Set a range of PTEs to point to pages in a folio.
5450	* @vmf: Fault description.
5451	* @folio: The folio that contains @page.
5452	* @page: The first page to create a PTE for.
5453	* @nr: The number of PTEs to create.
5454	* @addr: The first address to create a PTE for.
5455	*/
5456	void set_pte_range(struct vm_fault vmf, struct* folio *folio,
5457	struct page page, unsigned* int nr, unsigned long addr)
5458	{
5459	struct vm_area_struct *vma = vmf->vma;
5460	bool write = vmf->flags & FAULT_FLAG_WRITE;
5461	bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
5462	pte_t entry;
5463
5464	flush_icache_pages(vma, page, nr);
5465	entry = mk_pte(page, pgprot: vma->vm_page_prot);
5466
5467	if (prefault && arch_wants_old_prefaulted_pte())
5468	entry = pte_mkold(pte: entry);
5469	else
5470	entry = pte_sw_mkyoung(pte: entry);
5471
5472	if (write)
5473	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
5474	else if (pte_write(pte: entry) && folio_test_dirty(folio))
5475	entry = pte_mkdirty(pte: entry);
5476	if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
5477	entry = pte_mkuffd_wp(pte: entry);
5478	/ copy-on-write page /
5479	if (write && !(vma->vm_flags & VM_SHARED)) {
5480	VM_BUG_ON_FOLIO(nr != `1`, folio);
5481	folio_add_new_anon_rmap(folio, vma, address: addr, RMAP_EXCLUSIVE);
5482	folio_add_lru_vma(folio, vma);
5483	} else {
5484	folio_add_file_rmap_ptes(folio, page, nr_pages: nr, vma);
5485	}
5486	set_ptes(mm: vma->vm_mm, addr, ptep: vmf->pte, pte: entry, nr);
5487
5488	/ no need to invalidate: a not-present page won't be cached /
5489	update_mmu_cache_range(vmf, vma, addr, ptep: vmf->pte, nr);
5490	}
5491
5492	static bool vmf_pte_changed(struct vm_fault *vmf)
5493	{
5494	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
5495	return !pte_same(a: ptep_get(ptep: vmf->pte), b: vmf->orig_pte);
5496
5497	return !pte_none(pte: ptep_get(ptep: vmf->pte));
5498	}
5499
5500	/**
5501	* finish_fault - finish page fault once we have prepared the page to fault
5502	*
5503	* @vmf: structure describing the fault
5504	*
5505	* This function handles all that is needed to finish a page fault once the
5506	* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
5507	* given page, adds reverse page mapping, handles memcg charges and LRU
5508	* addition.
5509	*
5510	* The function expects the page to be locked and on success it consumes a
5511	* reference of a page being mapped (for the PTE which maps it).
5512	*
5513	* Return: %0 on success, %VM_FAULT_ code in case of error.
5514	*/
5515	vm_fault_t finish_fault(struct vm_fault *vmf)
5516	{
5517	struct vm_area_struct *vma = vmf->vma;
5518	struct page *page;
5519	struct folio *folio;
5520	vm_fault_t ret;
5521	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
5522	!(vma->vm_flags & VM_SHARED);
5523	int type, nr_pages;
5524	unsigned long addr;
5525	bool needs_fallback = false;
5526
5527	fallback:
5528	addr = vmf->address;
5529
5530	/ Did we COW the page? /
5531	if (is_cow)
5532	page = vmf->cow_page;
5533	else
5534	page = vmf->page;
5535
5536	folio = page_folio(page);
5537	/*
5538	* check even for read faults because we might have lost our CoWed
5539	* page
5540	*/
5541	if (!(vma->vm_flags & VM_SHARED)) {
5542	ret = check_stable_address_space(mm: vma->vm_mm);
5543	if (ret)
5544	return ret;
5545	}
5546
5547	if (!needs_fallback && vma->vm_file) {
5548	struct address_space *mapping = vma->vm_file->f_mapping;
5549	pgoff_t file_end;
5550
5551	file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
5552
5553	/*
5554	* Do not allow to map with PTEs beyond i_size and with PMD
5555	* across i_size to preserve SIGBUS semantics.
5556	*
5557	* Make an exception for shmem/tmpfs that for long time
5558	* intentionally mapped with PMDs across i_size.
5559	*/
5560	needs_fallback = !shmem_mapping(mapping) &&
5561	file_end < folio_next_index(folio);
5562	}
5563
5564	if (pmd_none(pmd: *vmf->pmd)) {
5565	if (!needs_fallback && folio_test_pmd_mappable(folio)) {
5566	ret = do_set_pmd(vmf, folio, page);
5567	if (ret != VM_FAULT_FALLBACK)
5568	return ret;
5569	}
5570
5571	if (vmf->prealloc_pte)
5572	pmd_install(mm: vma->vm_mm, pmd: vmf->pmd, pte: &vmf->prealloc_pte);
5573	else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
5574	return VM_FAULT_OOM;
5575	}
5576
5577	nr_pages = folio_nr_pages(folio);
5578
5579	/ Using per-page fault to maintain the uffd semantics /
5580	if (unlikely(userfaultfd_armed(vma)) \|\| unlikely(needs_fallback)) {
5581	nr_pages = `1`;
5582	} else if (nr_pages > `1`) {
5583	pgoff_t idx = folio_page_idx(folio, page);
5584	/ The page offset of vmf->address within the VMA. /
5585	pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
5586	/ The index of the entry in the pagetable for fault page. /
5587	pgoff_t pte_off = pte_index(address: vmf->address);
5588
5589	/*
5590	* Fallback to per-page fault in case the folio size in page
5591	* cache beyond the VMA limits and PMD pagetable limits.
5592	*/
5593	if (unlikely(vma_off < idx \|\|
5594	vma_off + (nr_pages - idx) > vma_pages(vma) \|\|
5595	pte_off < idx \|\|
5596	pte_off + (nr_pages - idx) > PTRS_PER_PTE)) {
5597	nr_pages = `1`;
5598	} else {
5599	/ Now we can set mappings for the whole large folio. /
5600	addr = vmf->address - idx * PAGE_SIZE;
5601	page = &folio->page;
5602	}
5603	}
5604
5605	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
5606	addr, ptlp: &vmf->ptl);
5607	if (!vmf->pte)
5608	return VM_FAULT_NOPAGE;
5609
5610	/ Re-check under ptl /
5611	if (nr_pages == `1` && unlikely(vmf_pte_changed(vmf))) {
5612	update_mmu_tlb(vma, address: addr, ptep: vmf->pte);
5613	ret = VM_FAULT_NOPAGE;
5614	goto unlock;
5615	} else if (nr_pages > `1` && !pte_range_none(pte: vmf->pte, nr_pages)) {
5616	needs_fallback = true;
5617	pte_unmap_unlock(vmf->pte, vmf->ptl);
5618	goto fallback;
5619	}
5620
5621	folio_ref_add(folio, nr: nr_pages - `1`);
5622	set_pte_range(vmf, folio, page, nr: nr_pages, addr);
5623	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
5624	add_mm_counter(mm: vma->vm_mm, member: type, value: nr_pages);
5625	ret = `0`;
5626
5627	unlock:
5628	pte_unmap_unlock(vmf->pte, vmf->ptl);
5629	return ret;
5630	}
5631
5632	static unsigned long fault_around_pages __read_mostly =
5633	`65536` >> PAGE_SHIFT;
5634
#ifdef CONFIG_DEBUG_FS
/* debugfs getter: report fault_around_pages converted to bytes. */
static int fault_around_bytes_get(void *data, u64 *val)
{
	*val = fault_around_pages << PAGE_SHIFT;
	return 0;
}

/*
 * fault_around_bytes must be rounded down to the nearest page order as it's
 * what do_fault_around() expects to see.
 *
 * Values covering more than PTRS_PER_PTE pages are rejected with -EINVAL.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
	if (val / PAGE_SIZE > PTRS_PER_PTE)
		return -EINVAL;

	/*
	 * The minimum value is 1 page, however this results in no fault-around
	 * at all. See should_fault_around().
	 */
	val = max(val, PAGE_SIZE);
	fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;

	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

/* Register the "fault_around_bytes" debugfs file. */
static int __init fault_around_debugfs(void)
{
	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
				   &fault_around_bytes_fops);
	return 0;
}
late_initcall(fault_around_debugfs);
#endif
5671
5672	/*
5673	* do_fault_around() tries to map few pages around the fault address. The hope
5674	* is that the pages will be needed soon and this will lower the number of
5675	* faults to handle.
5676	*
5677	* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
5678	* not ready to be mapped: not up-to-date, locked, etc.
5679	*
5680	* This function doesn't cross VMA or page table boundaries, in order to call
5681	* map_pages() and acquire a PTE lock only once.
5682	*
5683	* fault_around_pages defines how many pages we'll try to map.
5684	* do_fault_around() expects it to be set to a power of two less than or equal
5685	* to PTRS_PER_PTE.
5686	*
5687	* The virtual address of the area that we map is naturally aligned to
5688	* fault_around_pages * PAGE_SIZE rounded down to the machine page size
5689	* (and therefore to page order). This way it's easier to guarantee
5690	* that we don't cross page table boundaries.
5691	*/
5692	static vm_fault_t do_fault_around(struct vm_fault *vmf)
5693	{
5694	pgoff_t nr_pages = READ_ONCE(fault_around_pages);
5695	pgoff_t pte_off = pte_index(address: vmf->address);
5696	/ The page offset of vmf->address within the VMA. /
5697	pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
5698	pgoff_t from_pte, to_pte;
5699	vm_fault_t ret;
5700
5701	/ The PTE offset of the start address, clamped to the VMA. /
5702	from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
5703	pte_off - min(pte_off, vma_off));
5704
5705	/ The PTE offset of the end address, clamped to the VMA and PTE. /
5706	to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
5707	pte_off + vma_pages(vmf->vma) - vma_off) - `1`;
5708
5709	if (pmd_none(pmd: *vmf->pmd)) {
5710	vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
5711	if (!vmf->prealloc_pte)
5712	return VM_FAULT_OOM;
5713	}
5714
5715	rcu_read_lock();
5716	ret = vmf->vma->vm_ops->map_pages(vmf,
5717	vmf->pgoff + from_pte - pte_off,
5718	vmf->pgoff + to_pte - pte_off);
5719	rcu_read_unlock();
5720
5721	return ret;
5722	}
5723
5724	/ Return true if we should do read fault-around, false otherwise /
5725	static inline bool should_fault_around(struct vm_fault *vmf)
5726	{
5727	/ No ->map_pages? No way to fault around... /
5728	if (!vmf->vma->vm_ops->map_pages)
5729	return false;
5730
5731	if (uffd_disable_fault_around(vma: vmf->vma))
5732	return false;
5733
5734	/ A single page implies no faulting 'around' at all. /
5735	return fault_around_pages > `1`;
5736	}
5737
5738	static vm_fault_t do_read_fault(struct vm_fault *vmf)
5739	{
5740	vm_fault_t ret = `0`;
5741	struct folio *folio;
5742
5743	/*
5744	* Let's call ->map_pages() first and use ->fault() as fallback
5745	* if page by the offset is not ready to be mapped (cold cache or
5746	* something).
5747	*/
5748	if (should_fault_around(vmf)) {
5749	ret = do_fault_around(vmf);
5750	if (ret)
5751	return ret;
5752	}
5753
5754	ret = vmf_can_call_fault(vmf);
5755	if (ret)
5756	return ret;
5757
5758	ret = __do_fault(vmf);
5759	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5760	return ret;
5761
5762	ret \|= finish_fault(vmf);
5763	folio = page_folio(vmf->page);
5764	folio_unlock(folio);
5765	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5766	folio_put(folio);
5767	return ret;
5768	}
5769
5770	static vm_fault_t do_cow_fault(struct vm_fault *vmf)
5771	{
5772	struct vm_area_struct *vma = vmf->vma;
5773	struct folio *folio;
5774	vm_fault_t ret;
5775
5776	ret = vmf_can_call_fault(vmf);
5777	if (!ret)
5778	ret = vmf_anon_prepare(vmf);
5779	if (ret)
5780	return ret;
5781
5782	folio = folio_prealloc(src_mm: vma->vm_mm, vma, addr: vmf->address, need_zero: false);
5783	if (!folio)
5784	return VM_FAULT_OOM;
5785
5786	vmf->cow_page = &folio->page;
5787
5788	ret = __do_fault(vmf);
5789	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5790	goto uncharge_out;
5791	if (ret & VM_FAULT_DONE_COW)
5792	return ret;
5793
5794	if (copy_mc_user_highpage(to: vmf->cow_page, from: vmf->page, vaddr: vmf->address, vma)) {
5795	ret = VM_FAULT_HWPOISON;
5796	goto unlock;
5797	}
5798	__folio_mark_uptodate(folio);
5799
5800	ret \|= finish_fault(vmf);
5801	unlock:
5802	unlock_page(page: vmf->page);
5803	put_page(page: vmf->page);
5804	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5805	goto uncharge_out;
5806	return ret;
5807	uncharge_out:
5808	folio_put(folio);
5809	return ret;
5810	}
5811
5812	static vm_fault_t do_shared_fault(struct vm_fault *vmf)
5813	{
5814	struct vm_area_struct *vma = vmf->vma;
5815	vm_fault_t ret, tmp;
5816	struct folio *folio;
5817
5818	ret = vmf_can_call_fault(vmf);
5819	if (ret)
5820	return ret;
5821
5822	ret = __do_fault(vmf);
5823	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5824	return ret;
5825
5826	folio = page_folio(vmf->page);
5827
5828	/*
5829	* Check if the backing address space wants to know that the page is
5830	* about to become writable
5831	*/
5832	if (vma->vm_ops->page_mkwrite) {
5833	folio_unlock(folio);
5834	tmp = do_page_mkwrite(vmf, folio);
5835	if (unlikely(!tmp \|\|
5836	(tmp & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))) {
5837	folio_put(folio);
5838	return tmp;
5839	}
5840	}
5841
5842	ret \|= finish_fault(vmf);
5843	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \|
5844	VM_FAULT_RETRY))) {
5845	folio_unlock(folio);
5846	folio_put(folio);
5847	return ret;
5848	}
5849
5850	ret \|= fault_dirty_shared_page(vmf);
5851	return ret;
5852	}
5853
5854	/*
5855	* We enter with non-exclusive mmap_lock (to exclude vma changes,
5856	* but allow concurrent faults).
5857	* The mmap_lock may have been released depending on flags and our
5858	* return value. See filemap_fault() and __folio_lock_or_retry().
5859	* If mmap_lock is released, vma may become invalid (for example
5860	* by other thread calling munmap()).
5861	*/
5862	static vm_fault_t do_fault(struct vm_fault *vmf)
5863	{
5864	struct vm_area_struct *vma = vmf->vma;
5865	struct mm_struct *vm_mm = vma->vm_mm;
5866	vm_fault_t ret;
5867
5868	/*
5869	* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
5870	*/
5871	if (!vma->vm_ops->fault) {
5872	vmf->pte = pte_offset_map_lock(mm: vmf->vma->vm_mm, pmd: vmf->pmd,
5873	addr: vmf->address, ptlp: &vmf->ptl);
5874	if (unlikely(!vmf->pte))
5875	ret = VM_FAULT_SIGBUS;
5876	else {
5877	/*
5878	* Make sure this is not a temporary clearing of pte
5879	* by holding ptl and checking again. A R/M/W update
5880	* of pte involves: take ptl, clearing the pte so that
5881	* we don't have concurrent modification by hardware
5882	* followed by an update.
5883	*/
5884	if (unlikely(pte_none(ptep_get(vmf->pte))))
5885	ret = VM_FAULT_SIGBUS;
5886	else
5887	ret = VM_FAULT_NOPAGE;
5888
5889	pte_unmap_unlock(vmf->pte, vmf->ptl);
5890	}
5891	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
5892	ret = do_read_fault(vmf);
5893	else if (!(vma->vm_flags & VM_SHARED))
5894	ret = do_cow_fault(vmf);
5895	else
5896	ret = do_shared_fault(vmf);
5897
5898	/ preallocated pagetable is unused: free it /
5899	if (vmf->prealloc_pte) {
5900	pte_free(mm: vm_mm, pte_page: vmf->prealloc_pte);
5901	vmf->prealloc_pte = NULL;
5902	}
5903	return ret;
5904	}
5905
5906	int numa_migrate_check(struct folio folio, struct* vm_fault *vmf,
5907	unsigned long addr, int *flags,
5908	bool writable, int *last_cpupid)
5909	{
5910	struct vm_area_struct *vma = vmf->vma;
5911
5912	/*
5913	* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
5914	* much anyway since they can be in shared cache state. This misses
5915	* the case where a mapping is writable but the process never writes
5916	* to it but pte_write gets cleared during protection updates and
5917	* pte_dirty has unpredictable behaviour between PTE scan updates,
5918	* background writeback, dirty balancing and application behaviour.
5919	*/
5920	if (!writable)
5921	*flags \|= TNF_NO_GROUP;
5922
5923	/*
5924	* Flag if the folio is shared between multiple address spaces. This
5925	* is later used when determining whether to group tasks together
5926	*/
5927	if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
5928	*flags \|= TNF_SHARED;
5929	/*
5930	* For memory tiering mode, cpupid of slow memory page is used
5931	* to record page access time. So use default value.
5932	*/
5933	if (folio_use_access_time(folio))
5934	*last_cpupid = (-`1` & LAST_CPUPID_MASK);
5935	else
5936	*last_cpupid = folio_last_cpupid(folio);
5937
5938	/ Record the current PID acceesing VMA /
5939	vma_set_access_pid_bit(vma);
5940
5941	count_vm_numa_event(NUMA_HINT_FAULTS);
5942	#ifdef CONFIG_NUMA_BALANCING
5943	count_memcg_folio_events(folio, idx: NUMA_HINT_FAULTS, nr: `1`);
5944	#endif
5945	if (folio_nid(folio) == numa_node_id()) {
5946	count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
5947	*flags \|= TNF_FAULT_LOCAL;
5948	}
5949
5950	return mpol_misplaced(folio, vmf, addr);
5951	}
5952
5953	static void numa_rebuild_single_mapping(struct vm_fault vmf, struct* vm_area_struct *vma,
5954	unsigned long fault_addr, pte_t *fault_pte,
5955	bool writable)
5956	{
5957	pte_t pte, old_pte;
5958
5959	old_pte = ptep_modify_prot_start(vma, addr: fault_addr, ptep: fault_pte);
5960	pte = pte_modify(pte: old_pte, newprot: vma->vm_page_prot);
5961	pte = pte_mkyoung(pte);
5962	if (writable)
5963	pte = pte_mkwrite(pte, vma);
5964	ptep_modify_prot_commit(vma, addr: fault_addr, ptep: fault_pte, old_pte, pte);
5965	update_mmu_cache_range(vmf, vma, addr: fault_addr, ptep: fault_pte, nr: `1`);
5966	}
5967
5968	static void numa_rebuild_large_mapping(struct vm_fault vmf, struct* vm_area_struct *vma,
5969	struct folio *folio, pte_t fault_pte,
5970	bool ignore_writable, bool pte_write_upgrade)
5971	{
5972	int nr = pte_pfn(pte: fault_pte) - folio_pfn(folio);
5973	unsigned long start, end, addr = vmf->address;
5974	unsigned long addr_start = addr - (nr << PAGE_SHIFT);
5975	unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
5976	pte_t *start_ptep;
5977
5978	/ Stay within the VMA and within the page table. /
5979	start = max3(addr_start, pt_start, vma->vm_start);
5980	end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE,
5981	vma->vm_end);
5982	start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);
5983
5984	/ Restore all PTEs' mapping of the large folio /
5985	for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
5986	pte_t ptent = ptep_get(ptep: start_ptep);
5987	bool writable = false;
5988
5989	if (!pte_present(a: ptent) \|\| !pte_protnone(pte: ptent))
5990	continue;
5991
5992	if (pfn_folio(pfn: pte_pfn(pte: ptent)) != folio)
5993	continue;
5994
5995	if (!ignore_writable) {
5996	ptent = pte_modify(pte: ptent, newprot: vma->vm_page_prot);
5997	writable = pte_write(pte: ptent);
5998	if (!writable && pte_write_upgrade &&
5999	can_change_pte_writable(vma, addr, pte: ptent))
6000	writable = true;
6001	}
6002
6003	numa_rebuild_single_mapping(vmf, vma, fault_addr: addr, fault_pte: start_ptep, writable);
6004	}
6005	}
6006
6007	static vm_fault_t do_numa_page(struct vm_fault *vmf)
6008	{
6009	struct vm_area_struct *vma = vmf->vma;
6010	struct folio *folio = NULL;
6011	int nid = NUMA_NO_NODE;
6012	bool writable = false, ignore_writable = false;
6013	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
6014	int last_cpupid;
6015	int target_nid;
6016	pte_t pte, old_pte;
6017	int flags = `0`, nr_pages;
6018
6019	/*
6020	* The pte cannot be used safely until we verify, while holding the page
6021	* table lock, that its contents have not changed during fault handling.
6022	*/
6023	spin_lock(lock: vmf->ptl);
6024	/ Read the live PTE from the page tables: /
6025	old_pte = ptep_get(ptep: vmf->pte);
6026
6027	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
6028	pte_unmap_unlock(vmf->pte, vmf->ptl);
6029	return `0`;
6030	}
6031
6032	pte = pte_modify(pte: old_pte, newprot: vma->vm_page_prot);
6033
6034	/*
6035	* Detect now whether the PTE could be writable; this information
6036	* is only valid while holding the PT lock.
6037	*/
6038	writable = pte_write(pte);
6039	if (!writable && pte_write_upgrade &&
6040	can_change_pte_writable(vma, addr: vmf->address, pte))
6041	writable = true;
6042
6043	folio = vm_normal_folio(vma, addr: vmf->address, pte);
6044	if (!folio \|\| folio_is_zone_device(folio))
6045	goto out_map;
6046
6047	nid = folio_nid(folio);
6048	nr_pages = folio_nr_pages(folio);
6049
6050	target_nid = numa_migrate_check(folio, vmf, addr: vmf->address, flags: &flags,
6051	writable, last_cpupid: &last_cpupid);
6052	if (target_nid == NUMA_NO_NODE)
6053	goto out_map;
6054	if (migrate_misplaced_folio_prepare(folio, vma, node: target_nid)) {
6055	flags \|= TNF_MIGRATE_FAIL;
6056	goto out_map;
6057	}
6058	/ The folio is isolated and isolation code holds a folio reference. /
6059	pte_unmap_unlock(vmf->pte, vmf->ptl);
6060	writable = false;
6061	ignore_writable = true;
6062
6063	/ Migrate to the requested node /
6064	if (!migrate_misplaced_folio(folio, node: target_nid)) {
6065	nid = target_nid;
6066	flags \|= TNF_MIGRATED;
6067	task_numa_fault(last_node: last_cpupid, node: nid, pages: nr_pages, flags);
6068	return `0`;
6069	}
6070
6071	flags \|= TNF_MIGRATE_FAIL;
6072	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
6073	addr: vmf->address, ptlp: &vmf->ptl);
6074	if (unlikely(!vmf->pte))
6075	return `0`;
6076	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
6077	pte_unmap_unlock(vmf->pte, vmf->ptl);
6078	return `0`;
6079	}
6080	out_map:
6081	/*
6082	* Make it present again, depending on how arch implements
6083	* non-accessible ptes, some can allow access by kernel mode.
6084	*/
6085	if (folio && folio_test_large(folio))
6086	numa_rebuild_large_mapping(vmf, vma, folio, fault_pte: pte, ignore_writable,
6087	pte_write_upgrade);
6088	else
6089	numa_rebuild_single_mapping(vmf, vma, fault_addr: vmf->address, fault_pte: vmf->pte,
6090	writable);
6091	pte_unmap_unlock(vmf->pte, vmf->ptl);
6092
6093	if (nid != NUMA_NO_NODE)
6094	task_numa_fault(last_node: last_cpupid, node: nid, pages: nr_pages, flags);
6095	return `0`;
6096	}
6097
6098	static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
6099	{
6100	struct vm_area_struct *vma = vmf->vma;
6101	if (vma_is_anonymous(vma))
6102	return do_huge_pmd_anonymous_page(vmf);
6103	if (vma->vm_ops->huge_fault)
6104	return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
6105	return VM_FAULT_FALLBACK;
6106	}
6107
6108	/ `inline' is required to avoid gcc 4.1.2 build error /
6109	static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
6110	{
6111	struct vm_area_struct *vma = vmf->vma;
6112	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
6113	vm_fault_t ret;
6114
6115	if (vma_is_anonymous(vma)) {
6116	if (likely(!unshare) &&
6117	userfaultfd_huge_pmd_wp(vma, pmd: vmf->orig_pmd)) {
6118	if (userfaultfd_wp_async(vma: vmf->vma))
6119	goto split;
6120	return handle_userfault(vmf, VM_UFFD_WP);
6121	}
6122	return do_huge_pmd_wp_page(vmf);
6123	}
6124
6125	if (vma->vm_flags & (VM_SHARED \| VM_MAYSHARE)) {
6126	if (vma->vm_ops->huge_fault) {
6127	ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
6128	if (!(ret & VM_FAULT_FALLBACK))
6129	return ret;
6130	}
6131	}
6132
6133	split:
6134	/ COW or write-notify handled on pte level: split pmd. /
6135	__split_huge_pmd(vma, pmd: vmf->pmd, address: vmf->address, freeze: false);
6136
6137	return VM_FAULT_FALLBACK;
6138	}
6139
6140	static vm_fault_t create_huge_pud(struct vm_fault *vmf)
6141	{
6142	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
6143	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
6144	struct vm_area_struct *vma = vmf->vma;
6145	/ No support for anonymous transparent PUD pages yet /
6146	if (vma_is_anonymous(vma))
6147	return VM_FAULT_FALLBACK;
6148	if (vma->vm_ops->huge_fault)
6149	return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
6150	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
6151	return VM_FAULT_FALLBACK;
6152	}
6153
6154	static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
6155	{
6156	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
6157	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
6158	struct vm_area_struct *vma = vmf->vma;
6159	vm_fault_t ret;
6160
6161	/ No support for anonymous transparent PUD pages yet /
6162	if (vma_is_anonymous(vma))
6163	goto split;
6164	if (vma->vm_flags & (VM_SHARED \| VM_MAYSHARE)) {
6165	if (vma->vm_ops->huge_fault) {
6166	ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
6167	if (!(ret & VM_FAULT_FALLBACK))
6168	return ret;
6169	}
6170	}
6171	split:
6172	/ COW or write-notify not handled on PUD level: split pud./
6173	__split_huge_pud(vma, pud: vmf->pud, address: vmf->address);
6174	#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
6175	return VM_FAULT_FALLBACK;
6176	}
6177
6178	/*
6179	* The page faults may be spurious because of the racy access to the
6180	* page table. For example, a non-populated virtual page is accessed
6181	* on 2 CPUs simultaneously, thus the page faults are triggered on
6182	* both CPUs. However, it's possible that one CPU (say CPU A) cannot
6183	* find the reason for the page fault if the other CPU (say CPU B) has
6184	* changed the page table before the PTE is checked on CPU A. Most of
6185	* the time, the spurious page faults can be ignored safely. However,
6186	* if the page fault is for the write access, it's possible that a
6187	* stale read-only TLB entry exists in the local CPU and needs to be
6188	* flushed on some architectures. This is called the spurious page
6189	* fault fixing.
6190	*
6191	* Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
6192	* by default and used as such on most architectures, while
6193	* flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and
6194	* used as such on most architectures.
6195	*/
6196	static void fix_spurious_fault(struct vm_fault *vmf,
6197	enum pgtable_level ptlevel)
6198	{
6199	/ Skip spurious TLB flush for retried page fault /
6200	if (vmf->flags & FAULT_FLAG_TRIED)
6201	return;
6202	/*
6203	* This is needed only for protection faults but the arch code
6204	* is not yet telling us if this is a protection fault or not.
6205	* This still avoids useless tlb flushes for .text page faults
6206	* with threads.
6207	*/
6208	if (vmf->flags & FAULT_FLAG_WRITE) {
6209	if (ptlevel == PGTABLE_LEVEL_PTE)
6210	flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
6211	vmf->pte);
6212	else
6213	flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
6214	vmf->pmd);
6215	}
6216	}
6217	/*
6218	* These routines also need to handle stuff like marking pages dirty
6219	* and/or accessed for architectures that don't do it in hardware (most
6220	* RISC architectures). The early dirtying is also good on the i386.
6221	*
6222	* There is also a hook called "update_mmu_cache()" that architectures
6223	* with external mmu caches can use to update those (ie the Sparc or
6224	* PowerPC hashed page tables that act as extended TLBs).
6225	*
6226	* We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
6227	* concurrent faults).
6228	*
6229	* The mmap_lock may have been released depending on flags and our return value.
6230	* See filemap_fault() and __folio_lock_or_retry().
6231	*/
6232	static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
6233	{
6234	pte_t entry;
6235
6236	if (unlikely(pmd_none(*vmf->pmd))) {
6237	/*
6238	* Leave __pte_alloc() until later: because vm_ops->fault may
6239	* want to allocate huge page, and if we expose page table
6240	* for an instant, it will be difficult to retract from
6241	* concurrent faults and from rmap lookups.
6242	*/
6243	vmf->pte = NULL;
6244	vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
6245	} else {
6246	pmd_t dummy_pmdval;
6247
6248	/*
6249	* A regular pmd is established and it can't morph into a huge
6250	* pmd by anon khugepaged, since that takes mmap_lock in write
6251	* mode; but shmem or file collapse to THP could still morph
6252	* it into a huge pmd: just retry later if so.
6253	*
6254	* Use the maywrite version to indicate that vmf->pte may be
6255	* modified, but since we will use pte_same() to detect the
6256	* change of the !pte_none() entry, there is no need to recheck
6257	* the pmdval. Here we chooes to pass a dummy variable instead
6258	* of NULL, which helps new user think about why this place is
6259	* special.
6260	*/
6261	vmf->pte = pte_offset_map_rw_nolock(mm: vmf->vma->vm_mm, pmd: vmf->pmd,
6262	addr: vmf->address, pmdvalp: &dummy_pmdval,
6263	ptlp: &vmf->ptl);
6264	if (unlikely(!vmf->pte))
6265	return `0`;
6266	vmf->orig_pte = ptep_get_lockless(ptep: vmf->pte);
6267	vmf->flags \|= FAULT_FLAG_ORIG_PTE_VALID;
6268
6269	if (pte_none(pte: vmf->orig_pte)) {
6270	pte_unmap(pte: vmf->pte);
6271	vmf->pte = NULL;
6272	}
6273	}
6274
6275	if (!vmf->pte)
6276	return do_pte_missing(vmf);
6277
6278	if (!pte_present(a: vmf->orig_pte))
6279	return do_swap_page(vmf);
6280
6281	if (pte_protnone(pte: vmf->orig_pte) && vma_is_accessible(vma: vmf->vma))
6282	return do_numa_page(vmf);
6283
6284	spin_lock(lock: vmf->ptl);
6285	entry = vmf->orig_pte;
6286	if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
6287	update_mmu_tlb(vma: vmf->vma, address: vmf->address, ptep: vmf->pte);
6288	goto unlock;
6289	}
6290	if (vmf->flags & (FAULT_FLAG_WRITE\|FAULT_FLAG_UNSHARE)) {
6291	if (!pte_write(pte: entry))
6292	return do_wp_page(vmf);
6293	else if (likely(vmf->flags & FAULT_FLAG_WRITE))
6294	entry = pte_mkdirty(pte: entry);
6295	}
6296	entry = pte_mkyoung(pte: entry);
6297	if (ptep_set_access_flags(vma: vmf->vma, address: vmf->address, ptep: vmf->pte, entry,
6298	dirty: vmf->flags & FAULT_FLAG_WRITE))
6299	update_mmu_cache_range(vmf, vma: vmf->vma, addr: vmf->address,
6300	ptep: vmf->pte, nr: `1`);
6301	else
6302	fix_spurious_fault(vmf, ptlevel: PGTABLE_LEVEL_PTE);
6303	unlock:
6304	pte_unmap_unlock(vmf->pte, vmf->ptl);
6305	return `0`;
6306	}
6307
6308	/*
6309	* On entry, we hold either the VMA lock or the mmap_lock
6310	* (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
6311	* the result, the mmap_lock is not held on exit. See filemap_fault()
6312	* and __folio_lock_or_retry().
6313	*/
6314	static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
6315	unsigned long address, unsigned int flags)
6316	{
6317	struct vm_fault vmf = {
6318	.vma = vma,
6319	.address = address & PAGE_MASK,
6320	.real_address = address,
6321	.flags = flags,
6322	.pgoff = linear_page_index(vma, address),
6323	.gfp_mask = __get_fault_gfp_mask(vma),
6324	};
6325	struct mm_struct *mm = vma->vm_mm;
6326	vm_flags_t vm_flags = vma->vm_flags;
6327	pgd_t *pgd;
6328	p4d_t *p4d;
6329	vm_fault_t ret;
6330
6331	pgd = pgd_offset(mm, address);
6332	p4d = p4d_alloc(mm, pgd, address);
6333	if (!p4d)
6334	return VM_FAULT_OOM;
6335
6336	vmf.pud = pud_alloc(mm, p4d, address);
6337	if (!vmf.pud)
6338	return VM_FAULT_OOM;
6339	retry_pud:
6340	if (pud_none(pud: *vmf.pud) &&
6341	thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
6342	ret = create_huge_pud(vmf: &vmf);
6343	if (!(ret & VM_FAULT_FALLBACK))
6344	return ret;
6345	} else {
6346	pud_t orig_pud = *vmf.pud;
6347
6348	barrier();
6349	if (pud_trans_huge(pud: orig_pud)) {
6350
6351	/*
6352	* TODO once we support anonymous PUDs: NUMA case and
6353	* FAULT_FLAG_UNSHARE handling.
6354	*/
6355	if ((flags & FAULT_FLAG_WRITE) && !pud_write(pud: orig_pud)) {
6356	ret = wp_huge_pud(vmf: &vmf, orig_pud);
6357	if (!(ret & VM_FAULT_FALLBACK))
6358	return ret;
6359	} else {
6360	huge_pud_set_accessed(vmf: &vmf, orig_pud);
6361	return `0`;
6362	}
6363	}
6364	}
6365
6366	vmf.pmd = pmd_alloc(mm, pud: vmf.pud, address);
6367	if (!vmf.pmd)
6368	return VM_FAULT_OOM;
6369
6370	/ Huge pud page fault raced with pmd_alloc? /
6371	if (pud_trans_unstable(pud: vmf.pud))
6372	goto retry_pud;
6373
6374	if (pmd_none(pmd: *vmf.pmd) &&
6375	thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
6376	ret = create_huge_pmd(vmf: &vmf);
6377	if (ret & VM_FAULT_FALLBACK)
6378	goto fallback;
6379	else
6380	return ret;
6381	}
6382
6383	vmf.orig_pmd = pmdp_get_lockless(pmdp: vmf.pmd);
6384	if (pmd_none(pmd: vmf.orig_pmd))
6385	goto fallback;
6386
6387	if (unlikely(!pmd_present(vmf.orig_pmd))) {
6388	if (pmd_is_device_private_entry(pmd: vmf.orig_pmd))
6389	return do_huge_pmd_device_private(vmf: &vmf);
6390
6391	if (pmd_is_migration_entry(pmd: vmf.orig_pmd))
6392	pmd_migration_entry_wait(mm, pmd: vmf.pmd);
6393	return `0`;
6394	}
6395	if (pmd_trans_huge(pmd: vmf.orig_pmd)) {
6396	if (pmd_protnone(pmd: vmf.orig_pmd) && vma_is_accessible(vma))
6397	return do_huge_pmd_numa_page(vmf: &vmf);
6398
6399	if ((flags & (FAULT_FLAG_WRITE\|FAULT_FLAG_UNSHARE)) &&
6400	!pmd_write(pmd: vmf.orig_pmd)) {
6401	ret = wp_huge_pmd(vmf: &vmf);
6402	if (!(ret & VM_FAULT_FALLBACK))
6403	return ret;
6404	} else {
6405	vmf.ptl = pmd_lock(mm, pmd: vmf.pmd);
6406	if (!huge_pmd_set_accessed(vmf: &vmf))
6407	fix_spurious_fault(vmf: &vmf, ptlevel: PGTABLE_LEVEL_PMD);
6408	spin_unlock(lock: vmf.ptl);
6409	return `0`;
6410	}
6411	}
6412
6413	fallback:
6414	return handle_pte_fault(vmf: &vmf);
6415	}
6416
6417	/**
6418	* mm_account_fault - Do page fault accounting
6419	* @mm: mm from which memcg should be extracted. It can be NULL.
6420	* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
6421	* of perf event counters, but we'll still do the per-task accounting to
6422	* the task who triggered this page fault.
6423	* @address: the faulted address.
6424	* @flags: the fault flags.
6425	* @ret: the fault retcode.
6426	*
6427	* This will take care of most of the page fault accounting. Meanwhile, it
6428	* will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ\|MIN] perf counter
6429	* updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
6430	* still be in per-arch page fault handlers at the entry of page fault.
6431	*/
6432	static inline void mm_account_fault(struct mm_struct mm, struct* pt_regs *regs,
6433	unsigned long address, unsigned int flags,
6434	vm_fault_t ret)
6435	{
6436	bool major;
6437
6438	/ Incomplete faults will be accounted upon completion. /
6439	if (ret & VM_FAULT_RETRY)
6440	return;
6441
6442	/*
6443	* To preserve the behavior of older kernels, PGFAULT counters record
6444	* both successful and failed faults, as opposed to perf counters,
6445	* which ignore failed cases.
6446	*/
6447	count_vm_event(item: PGFAULT);
6448	count_memcg_event_mm(mm, idx: PGFAULT);
6449
6450	/*
6451	* Do not account for unsuccessful faults (e.g. when the address wasn't
6452	* valid). That includes arch_vma_access_permitted() failing before
6453	* reaching here. So this is not a "this many hardware page faults"
6454	* counter. We should use the hw profiling for that.
6455	*/
6456	if (ret & VM_FAULT_ERROR)
6457	return;
6458
6459	/*
6460	* We define the fault as a major fault when the final successful fault
6461	* is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
6462	* handle it immediately previously).
6463	*/
6464	major = (ret & VM_FAULT_MAJOR) \|\| (flags & FAULT_FLAG_TRIED);
6465
6466	if (major)
6467	current->maj_flt++;
6468	else
6469	current->min_flt++;
6470
6471	/*
6472	* If the fault is done for GUP, regs will be NULL. We only do the
6473	* accounting for the per thread fault counters who triggered the
6474	* fault, and we skip the perf event updates.
6475	*/
6476	if (!regs)
6477	return;
6478
6479	if (major)
6480	perf_sw_event(event_id: PERF_COUNT_SW_PAGE_FAULTS_MAJ, nr: `1`, regs, addr: address);
6481	else
6482	perf_sw_event(event_id: PERF_COUNT_SW_PAGE_FAULTS_MIN, nr: `1`, regs, addr: address);
6483	}
6484
/*
 * Bracket fault handling so that current->in_lru_fault is set for its
 * duration (CONFIG_LRU_GEN only; empty stubs otherwise).
 */
#ifdef CONFIG_LRU_GEN
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
	/* the LRU algorithm only applies to accesses with recency */
	current->in_lru_fault = vma_has_recency(vma);
}

static void lru_gen_exit_fault(void)
{
	current->in_lru_fault = false;
}
#else
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
}

static void lru_gen_exit_fault(void)
{
}
#endif /* CONFIG_LRU_GEN */
6505
6506	static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
6507	unsigned int *flags)
6508	{
6509	if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
6510	if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
6511	return VM_FAULT_SIGSEGV;
6512	/*
6513	* FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
6514	* just treat it like an ordinary read-fault otherwise.
6515	*/
6516	if (!is_cow_mapping(flags: vma->vm_flags))
6517	*flags &= ~FAULT_FLAG_UNSHARE;
6518	} else if (*flags & FAULT_FLAG_WRITE) {
6519	/ Write faults on read-only mappings are impossible ... /
6520	if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
6521	return VM_FAULT_SIGSEGV;
6522	/ ... and FOLL_FORCE only applies to COW mappings. /
6523	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
6524	!is_cow_mapping(vma->vm_flags)))
6525	return VM_FAULT_SIGSEGV;
6526	}
6527	#ifdef CONFIG_PER_VMA_LOCK
6528	/*
6529	* Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
6530	* the assumption that lock is dropped on VM_FAULT_RETRY.
6531	*/
6532	if (WARN_ON_ONCE((*flags &
6533	(FAULT_FLAG_VMA_LOCK \| FAULT_FLAG_RETRY_NOWAIT)) ==
6534	(FAULT_FLAG_VMA_LOCK \| FAULT_FLAG_RETRY_NOWAIT)))
6535	return VM_FAULT_SIGSEGV;
6536	#endif
6537
6538	return `0`;
6539	}
6540
6541	/*
6542	* By the time we get here, we already hold either the VMA lock or the
6543	* mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
6544	*
6545	* The mmap_lock may have been released depending on flags and our
6546	* return value. See filemap_fault() and __folio_lock_or_retry().
6547	*/
6548	vm_fault_t handle_mm_fault(struct vm_area_struct vma, unsigned* long address,
6549	unsigned int flags, struct pt_regs *regs)
6550	{
6551	/ If the fault handler drops the mmap_lock, vma may be freed /
6552	struct mm_struct *mm = vma->vm_mm;
6553	vm_fault_t ret;
6554	bool is_droppable;
6555
6556	__set_current_state(TASK_RUNNING);
6557
6558	ret = sanitize_fault_flags(vma, flags: &flags);
6559	if (ret)
6560	goto out;
6561
6562	if (!arch_vma_access_permitted(vma, write: flags & FAULT_FLAG_WRITE,
6563	execute: flags & FAULT_FLAG_INSTRUCTION,
6564	foreign: flags & FAULT_FLAG_REMOTE)) {
6565	ret = VM_FAULT_SIGSEGV;
6566	goto out;
6567	}
6568
6569	is_droppable = !!(vma->vm_flags & VM_DROPPABLE);
6570
6571	/*
6572	* Enable the memcg OOM handling for faults triggered in user
6573	* space. Kernel faults are handled more gracefully.
6574	*/
6575	if (flags & FAULT_FLAG_USER)
6576	mem_cgroup_enter_user_fault();
6577
6578	lru_gen_enter_fault(vma);
6579
6580	if (unlikely(is_vm_hugetlb_page(vma)))
6581	ret = hugetlb_fault(mm: vma->vm_mm, vma, address, flags);
6582	else
6583	ret = __handle_mm_fault(vma, address, flags);
6584
6585	/*
6586	* Warning: It is no longer safe to dereference vma-> after this point,
6587	* because mmap_lock might have been dropped by __handle_mm_fault(), so
6588	* vma might be destroyed from underneath us.
6589	*/
6590
6591	lru_gen_exit_fault();
6592
6593	/ If the mapping is droppable, then errors due to OOM aren't fatal. /
6594	if (is_droppable)
6595	ret &= ~VM_FAULT_OOM;
6596
6597	if (flags & FAULT_FLAG_USER) {
6598	mem_cgroup_exit_user_fault();
6599	/*
6600	* The task may have entered a memcg OOM situation but
6601	* if the allocation error was handled gracefully (no
6602	* VM_FAULT_OOM), there is no need to kill anything.
6603	* Just clean up the OOM state peacefully.
6604	*/
6605	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
6606	mem_cgroup_oom_synchronize(wait: false);
6607	}
6608	out:
6609	mm_account_fault(mm, regs, address, flags, ret);
6610
6611	return ret;
6612	}
6613	EXPORT_SYMBOL_GPL(handle_mm_fault);
6614
6615	#ifndef __PAGETABLE_P4D_FOLDED
6616	/*
6617	* Allocate p4d page table.
6618	* We've already handled the fast-path in-line.
6619	*/
6620	int __p4d_alloc(struct mm_struct mm, pgd_t pgd, unsigned long address)
6621	{
6622	p4d_t *new = p4d_alloc_one(mm, address);
6623	if (!new)
6624	return -ENOMEM;
6625
6626	spin_lock(lock: &mm->page_table_lock);
6627	if (pgd_present(pgd: pgd)) { /* Another has populated it /
6628	p4d_free(mm, p4d: new);
6629	} else {
6630	smp_wmb(); / See comment in pmd_install() /
6631	pgd_populate(mm, pgd, p4d: new);
6632	}
6633	spin_unlock(lock: &mm->page_table_lock);
6634	return `0`;
6635	}
6636	#endif /* __PAGETABLE_P4D_FOLDED */
6637
6638	#ifndef __PAGETABLE_PUD_FOLDED
6639	/*
6640	* Allocate page upper directory.
6641	* We've already handled the fast-path in-line.
6642	*/
6643	int __pud_alloc(struct mm_struct mm, p4d_t p4d, unsigned long address)
6644	{
6645	pud_t *new = pud_alloc_one(mm, address);
6646	if (!new)
6647	return -ENOMEM;
6648
6649	spin_lock(lock: &mm->page_table_lock);
6650	if (!p4d_present(p4d: *p4d)) {
6651	mm_inc_nr_puds(mm);
6652	smp_wmb(); / See comment in pmd_install() /
6653	p4d_populate(mm, p4d, pud: new);
6654	} else / Another has populated it /
6655	pud_free(mm, pud: new);
6656	spin_unlock(lock: &mm->page_table_lock);
6657	return `0`;
6658	}
6659	#endif /* __PAGETABLE_PUD_FOLDED */
6660
6661	#ifndef __PAGETABLE_PMD_FOLDED
6662	/*
6663	* Allocate page middle directory.
6664	* We've already handled the fast-path in-line.
6665	*/
6666	int __pmd_alloc(struct mm_struct mm, pud_t pud, unsigned long address)
6667	{
6668	spinlock_t *ptl;
6669	pmd_t *new = pmd_alloc_one(mm, address);
6670	if (!new)
6671	return -ENOMEM;
6672
6673	ptl = pud_lock(mm, pud);
6674	if (!pud_present(pud: *pud)) {
6675	mm_inc_nr_pmds(mm);
6676	smp_wmb(); / See comment in pmd_install() /
6677	pud_populate(mm, pud, pmd: new);
6678	} else { / Another has populated it /
6679	pmd_free(mm, pmd: new);
6680	}
6681	spin_unlock(lock: ptl);
6682	return `0`;
6683	}
6684	#endif /* __PAGETABLE_PMD_FOLDED */
6685
6686	static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
6687	spinlock_t lock, pte_t ptep,
6688	pgprot_t pgprot, unsigned long pfn_base,
6689	unsigned long addr_mask, bool writable,
6690	bool special)
6691	{
6692	args->lock = lock;
6693	args->ptep = ptep;
6694	args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
6695	args->addr_mask = addr_mask;
6696	args->pgprot = pgprot;
6697	args->writable = writable;
6698	args->special = special;
6699	}
6700
/*
 * With CONFIG_LOCKDEP, assert that the caller holds the mm's mmap_lock or,
 * for a file-backed VMA, alternatively the mapping's i_mmap_rwsem.
 * Compiles to nothing otherwise.
 */
static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
{
#ifdef CONFIG_LOCKDEP
	struct file *file = vma->vm_file;
	struct address_space *mapping = file ? file->f_mapping : NULL;

	if (mapping)
		lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
			       lockdep_is_held(&vma->vm_mm->mmap_lock));
	else
		lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
#endif
}
6714
6715	/**
6716	* follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
6717	* @args: Pointer to struct @follow_pfnmap_args
6718	*
6719	* The caller needs to setup args->vma and args->address to point to the
6720	* virtual address as the target of such lookup. On a successful return,
6721	* the results will be put into other output fields.
6722	*
6723	* After the caller finished using the fields, the caller must invoke
6724	* another follow_pfnmap_end() to proper releases the locks and resources
6725	* of such look up request.
6726	*
6727	* During the start() and end() calls, the results in @args will be valid
6728	* as proper locks will be held. After the end() is called, all the fields
6729	* in @follow_pfnmap_args will be invalid to be further accessed. Further
6730	* use of such information after end() may require proper synchronizations
6731	* by the caller with page table updates, otherwise it can create a
6732	* security bug.
6733	*
6734	* If the PTE maps a refcounted page, callers are responsible to protect
6735	* against invalidation with MMU notifiers; otherwise access to the PFN at
6736	* a later point in time can trigger use-after-free.
6737	*
6738	* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
6739	* should be taken for read, and the mmap semaphore cannot be released
6740	* before the end() is invoked.
6741	*
6742	* This function must not be used to modify PTE content.
6743	*
6744	* Return: zero on success, negative otherwise.
6745	*/
6746	int follow_pfnmap_start(struct follow_pfnmap_args *args)
6747	{
6748	struct vm_area_struct *vma = args->vma;
6749	unsigned long address = args->address;
6750	struct mm_struct *mm = vma->vm_mm;
6751	spinlock_t *lock;
6752	pgd_t *pgdp;
6753	p4d_t *p4dp, p4d;
6754	pud_t *pudp, pud;
6755	pmd_t *pmdp, pmd;
6756	pte_t *ptep, pte;
6757
6758	pfnmap_lockdep_assert(vma);
6759
6760	if (unlikely(address < vma->vm_start \|\| address >= vma->vm_end))
6761	goto out;
6762
6763	if (!(vma->vm_flags & (VM_IO \| VM_PFNMAP)))
6764	goto out;
6765	retry:
6766	pgdp = pgd_offset(mm, address);
6767	if (pgd_none(pgd: pgdp) \|\| unlikely(pgd_bad(pgdp)))
6768	goto out;
6769
6770	p4dp = p4d_offset(pgd: pgdp, address);
6771	p4d = p4dp_get(p4dp);
6772	if (p4d_none(p4d) \|\| unlikely(p4d_bad(p4d)))
6773	goto out;
6774
6775	pudp = pud_offset(p4d: p4dp, address);
6776	pud = pudp_get(pudp);
6777	if (pud_none(pud))
6778	goto out;
6779	if (pud_leaf(pud)) {
6780	lock = pud_lock(mm, pud: pudp);
6781	if (!unlikely(pud_leaf(pud))) {
6782	spin_unlock(lock);
6783	goto retry;
6784	}
6785	pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
6786	pud_pfn(pud), PUD_MASK, pud_write(pud),
6787	special: pud_special(pud));
6788	return `0`;
6789	}
6790
6791	pmdp = pmd_offset(pud: pudp, address);
6792	pmd = pmdp_get_lockless(pmdp);
6793	if (pmd_leaf(pte: pmd)) {
6794	lock = pmd_lock(mm, pmd: pmdp);
6795	if (!unlikely(pmd_leaf(pmd))) {
6796	spin_unlock(lock);
6797	goto retry;
6798	}
6799	pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
6800	pfn_base: pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
6801	special: pmd_special(pmd));
6802	return `0`;
6803	}
6804
6805	ptep = pte_offset_map_lock(mm, pmd: pmdp, addr: address, ptlp: &lock);
6806	if (!ptep)
6807	goto out;
6808	pte = ptep_get(ptep);
6809	if (!pte_present(a: pte))
6810	goto unlock;
6811	pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
6812	pfn_base: pte_pfn(pte), PAGE_MASK, writable: pte_write(pte),
6813	special: pte_special(pte));
6814	return `0`;
6815	unlock:
6816	pte_unmap_unlock(ptep, lock);
6817	out:
6818	return -EINVAL;
6819	}
6820	EXPORT_SYMBOL_GPL(follow_pfnmap_start);
6821
6822	/**
6823	* follow_pfnmap_end(): End a follow_pfnmap_start() process
6824	* @args: Pointer to struct @follow_pfnmap_args
6825	*
6826	* Must be used in pair of follow_pfnmap_start(). See the start() function
6827	* above for more information.
6828	*/
6829	void follow_pfnmap_end(struct follow_pfnmap_args *args)
6830	{
6831	if (args->lock)
6832	spin_unlock(lock: args->lock);
6833	if (args->ptep)
6834	pte_unmap(pte: args->ptep);
6835	}
6836	EXPORT_SYMBOL_GPL(follow_pfnmap_end);
6837
6838	#ifdef CONFIG_HAVE_IOREMAP_PROT
6839	/**
6840	* generic_access_phys - generic implementation for iomem mmap access
6841	* @vma: the vma to access
6842	* @addr: userspace address, not relative offset within @vma
6843	* @buf: buffer to read/write
6844	* @len: length of transfer
6845	* @write: set to FOLL_WRITE when writing, otherwise reading
6846	*
6847	* This is a generic implementation for &vm_operations_struct.access for an
6848	* iomem mapping. This callback is used by access_process_vm() when the @vma is
6849	* not page based.
6850	*/
6851	int generic_access_phys(struct vm_area_struct vma, unsigned* long addr,
6852	void buf, int* len, int write)
6853	{
6854	resource_size_t phys_addr;
6855	pgprot_t prot = __pgprot(`0`);
6856	void __iomem *maddr;
6857	int offset = offset_in_page(addr);
6858	int ret = -EINVAL;
6859	bool writable;
6860	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
6861
6862	retry:
6863	if (follow_pfnmap_start(&args))
6864	return -EINVAL;
6865	prot = args.pgprot;
6866	phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
6867	writable = args.writable;
6868	follow_pfnmap_end(&args);
6869
6870	if ((write & FOLL_WRITE) && !writable)
6871	return -EINVAL;
6872
6873	maddr = ioremap_prot(offset: phys_addr, PAGE_ALIGN(len + offset), prot);
6874	if (!maddr)
6875	return -ENOMEM;
6876
6877	if (follow_pfnmap_start(&args))
6878	goto out_unmap;
6879
6880	if ((pgprot_val(prot) != pgprot_val(args.pgprot)) \|\|
6881	(phys_addr != (args.pfn << PAGE_SHIFT)) \|\|
6882	(writable != args.writable)) {
6883	follow_pfnmap_end(&args);
6884	iounmap(addr: maddr);
6885	goto retry;
6886	}
6887
6888	if (write)
6889	memcpy_toio(maddr + offset, buf, len);
6890	else
6891	memcpy_fromio(buf, maddr + offset, len);
6892	ret = len;
6893	follow_pfnmap_end(&args);
6894	out_unmap:
6895	iounmap(addr: maddr);
6896
6897	return ret;
6898	}
6899	EXPORT_SYMBOL_GPL(generic_access_phys);
6900	#endif
6901
6902	/*
6903	* Access another process' address space as given in mm.
6904	*/
6905	static int __access_remote_vm(struct mm_struct mm, unsigned* long addr,
6906	void buf, int* len, unsigned int gup_flags)
6907	{
6908	void *old_buf = buf;
6909	int write = gup_flags & FOLL_WRITE;
6910
6911	if (mmap_read_lock_killable(mm))
6912	return `0`;
6913
6914	/ Untag the address before looking up the VMA /
6915	addr = untagged_addr_remote(mm, addr);
6916
6917	/ Avoid triggering the temporary warning in __get_user_pages /
6918	if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
6919	return `0`;
6920
6921	/ ignore errors, just check how much was successfully transferred /
6922	while (len) {
6923	int bytes, offset;
6924	void *maddr;
6925	struct folio *folio;
6926	struct vm_area_struct *vma = NULL;
6927	struct page *page = get_user_page_vma_remote(mm, addr,
6928	gup_flags, vmap: &vma);
6929
6930	if (IS_ERR(ptr: page)) {
6931	/ We might need to expand the stack to access it /
6932	vma = vma_lookup(mm, addr);
6933	if (!vma) {
6934	vma = expand_stack(mm, addr);
6935
6936	/ mmap_lock was dropped on failure /
6937	if (!vma)
6938	return buf - old_buf;
6939
6940	/ Try again if stack expansion worked /
6941	continue;
6942	}
6943
6944	/*
6945	* Check if this is a VM_IO \| VM_PFNMAP VMA, which
6946	* we can access using slightly different code.
6947	*/
6948	bytes = `0`;
6949	#ifdef CONFIG_HAVE_IOREMAP_PROT
6950	if (vma->vm_ops && vma->vm_ops->access)
6951	bytes = vma->vm_ops->access(vma, addr, buf,
6952	len, write);
6953	#endif
6954	if (bytes <= `0`)
6955	break;
6956	} else {
6957	folio = page_folio(page);
6958	bytes = len;
6959	offset = addr & (PAGE_SIZE-`1`);
6960	if (bytes > PAGE_SIZE-offset)
6961	bytes = PAGE_SIZE-offset;
6962
6963	maddr = kmap_local_folio(folio, offset: folio_page_idx(folio, page) * PAGE_SIZE);
6964	if (write) {
6965	copy_to_user_page(vma, page, addr,
6966	maddr + offset, buf, bytes);
6967	folio_mark_dirty_lock(folio);
6968	} else {
6969	copy_from_user_page(vma, page, addr,
6970	buf, maddr + offset, bytes);
6971	}
6972	folio_release_kmap(folio, addr: maddr);
6973	}
6974	len -= bytes;
6975	buf += bytes;
6976	addr += bytes;
6977	}
6978	mmap_read_unlock(mm);
6979
6980	return buf - old_buf;
6981	}
6982
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * Thin public entry point that forwards all arguments unchanged to
 * __access_remote_vm().
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		     void *buf, int len, unsigned int gup_flags)
{
	int copied = __access_remote_vm(mm, addr, buf, len, gup_flags);

	return copied;
}
7000
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 *
 * Takes a reference on @tsk's mm for the duration of the transfer and
 * returns the number of bytes transferred, or 0 when the task has no mm.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		      void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm = get_task_mm(tsk);
	int copied = 0;

	if (mm) {
		copied = __access_remote_vm(mm, addr, buf, len, gup_flags);
		mmput(mm);
	}

	return copied;
}
EXPORT_SYMBOL_GPL(access_process_vm);
7023
7024	#ifdef CONFIG_BPF_SYSCALL
7025	/*
7026	* Copy a string from another process's address space as given in mm.
7027	* If there is any error return -EFAULT.
7028	*/
7029	static int __copy_remote_vm_str(struct mm_struct mm, unsigned* long addr,
7030	void buf, int* len, unsigned int gup_flags)
7031	{
7032	void *old_buf = buf;
7033	int err = `0`;
7034
7035	(char* *)buf = `'\0'`;
7036
7037	if (mmap_read_lock_killable(mm))
7038	return -EFAULT;
7039
7040	addr = untagged_addr_remote(mm, addr);
7041
7042	/ Avoid triggering the temporary warning in __get_user_pages /
7043	if (!vma_lookup(mm, addr)) {
7044	err = -EFAULT;
7045	goto out;
7046	}
7047
7048	while (len) {
7049	int bytes, offset, retval;
7050	void *maddr;
7051	struct folio *folio;
7052	struct page *page;
7053	struct vm_area_struct *vma = NULL;
7054
7055	page = get_user_page_vma_remote(mm, addr, gup_flags, vmap: &vma);
7056	if (IS_ERR(ptr: page)) {
7057	/*
7058	* Treat as a total failure for now until we decide how
7059	* to handle the CONFIG_HAVE_IOREMAP_PROT case and
7060	* stack expansion.
7061	*/
7062	(char* *)buf = `'\0'`;
7063	err = -EFAULT;
7064	goto out;
7065	}
7066
7067	folio = page_folio(page);
7068	bytes = len;
7069	offset = addr & (PAGE_SIZE - `1`);
7070	if (bytes > PAGE_SIZE - offset)
7071	bytes = PAGE_SIZE - offset;
7072
7073	maddr = kmap_local_folio(folio, offset: folio_page_idx(folio, page) * PAGE_SIZE);
7074	retval = strscpy(buf, maddr + offset, bytes);
7075	if (retval >= `0`) {
7076	/ Found the end of the string /
7077	buf += retval;
7078	folio_release_kmap(folio, addr: maddr);
7079	break;
7080	}
7081
7082	buf += bytes - `1`;
7083	/*
7084	* Because strscpy always NUL terminates we need to
7085	* copy the last byte in the page if we are going to
7086	* load more pages
7087	*/
7088	if (bytes != len) {
7089	addr += bytes - `1`;
7090	copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - `1`), `1`);
7091	buf += `1`;
7092	addr += `1`;
7093	}
7094	len -= bytes;
7095
7096	folio_release_kmap(folio, addr: maddr);
7097	}
7098
7099	out:
7100	mmap_read_unlock(mm);
7101	if (err)
7102	return err;
7103	return buf - old_buf;
7104	}
7105
7106	/**
7107	* copy_remote_vm_str - copy a string from another process's address space.
7108	* @tsk: the task of the target address space
7109	* @addr: start address to read from
7110	* @buf: destination buffer
7111	* @len: number of bytes to copy
7112	* @gup_flags: flags modifying lookup behaviour
7113	*
7114	* The caller must hold a reference on @mm.
7115	*
7116	* Return: number of bytes copied from @addr (source) to @buf (destination);
7117	* not including the trailing NUL. Always guaranteed to leave NUL-terminated
7118	* buffer. On any error, return -EFAULT.
7119	*/
7120	int copy_remote_vm_str(struct task_struct tsk, unsigned* long addr,
7121	void buf, int* len, unsigned int gup_flags)
7122	{
7123	struct mm_struct *mm;
7124	int ret;
7125
7126	if (unlikely(len == `0`))
7127	return `0`;
7128
7129	mm = get_task_mm(task: tsk);
7130	if (!mm) {
7131	(char* *)buf = `'\0'`;
7132	return -EFAULT;
7133	}
7134
7135	ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);
7136
7137	mmput(mm);
7138
7139	return ret;
7140	}
7141	EXPORT_SYMBOL_GPL(copy_remote_vm_str);
7142	#endif /* CONFIG_BPF_SYSCALL */
7143
7144	/*
7145	* Print the name of a VMA.
7146	*/
7147	void print_vma_addr(char prefix, unsigned* long ip)
7148	{
7149	struct mm_struct *mm = current->mm;
7150	struct vm_area_struct *vma;
7151
7152	/*
7153	* we might be running from an atomic context so we cannot sleep
7154	*/
7155	if (!mmap_read_trylock(mm))
7156	return;
7157
7158	vma = vma_lookup(mm, addr: ip);
7159	if (vma && vma->vm_file) {
7160	struct file *f = vma->vm_file;
7161	ip -= vma->vm_start;
7162	ip += vma->vm_pgoff << PAGE_SHIFT;
7163	printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip,
7164	vma->vm_start,
7165	vma->vm_end - vma->vm_start);
7166	}
7167	mmap_read_unlock(mm);
7168	}
7169
7170	#if defined(CONFIG_PROVE_LOCKING) \|\| defined(CONFIG_DEBUG_ATOMIC_SLEEP)
7171	void __might_fault(const char file, int* line)
7172	{
7173	if (pagefault_disabled())
7174	return;
7175	__might_sleep(file, line);
7176	if (current->mm)
7177	might_lock_read(&current->mm->mmap_lock);
7178	}
7179	EXPORT_SYMBOL(__might_fault);
7180	#endif
7181
7182	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) \|\| defined(CONFIG_HUGETLBFS)
7183	/*
7184	* Process all subpages of the specified huge page with the specified
7185	* operation. The target subpage will be processed last to keep its
7186	* cache lines hot.
7187	*/
7188	static inline int process_huge_page(
7189	unsigned long addr_hint, unsigned int nr_pages,
7190	int (process_subpage)(unsigned* long addr, int idx, void *arg),
7191	void *arg)
7192	{
7193	int i, n, base, l, ret;
7194	unsigned long addr = addr_hint &
7195	~(((unsigned long)nr_pages << PAGE_SHIFT) - `1`);
7196
7197	/ Process target subpage last to keep its cache lines hot /
7198	might_sleep();
7199	n = (addr_hint - addr) / PAGE_SIZE;
7200	if (`2` * n <= nr_pages) {
7201	/ If target subpage in first half of huge page /
7202	base = `0`;
7203	l = n;
7204	/ Process subpages at the end of huge page /
7205	for (i = nr_pages - `1`; i >= `2` * n; i--) {
7206	cond_resched();
7207	ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
7208	if (ret)
7209	return ret;
7210	}
7211	} else {
7212	/ If target subpage in second half of huge page /
7213	base = nr_pages - `2` * (nr_pages - n);
7214	l = nr_pages - n;
7215	/ Process subpages at the begin of huge page /
7216	for (i = `0`; i < base; i++) {
7217	cond_resched();
7218	ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
7219	if (ret)
7220	return ret;
7221	}
7222	}
7223	/*
7224	* Process remaining subpages in left-right-left-right pattern
7225	* towards the target subpage
7226	*/
7227	for (i = `0`; i < l; i++) {
7228	int left_idx = base + i;
7229	int right_idx = base + `2` * l - `1` - i;
7230
7231	cond_resched();
7232	ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
7233	if (ret)
7234	return ret;
7235	cond_resched();
7236	ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
7237	if (ret)
7238	return ret;
7239	}
7240	return `0`;
7241	}
7242
7243	static void clear_gigantic_page(struct folio folio, unsigned* long addr_hint,
7244	unsigned int nr_pages)
7245	{
7246	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
7247	int i;
7248
7249	might_sleep();
7250	for (i = `0`; i < nr_pages; i++) {
7251	cond_resched();
7252	clear_user_highpage(folio_page(folio, i), vaddr: addr + i * PAGE_SIZE);
7253	}
7254	}
7255
/*
 * process_huge_page() callback: clear subpage @idx of the folio passed in
 * @arg, which will be mapped at user address @addr. Always returns 0.
 */
static int clear_subpage(unsigned long addr, int idx, void *arg)
{
	struct folio *folio = arg;
	struct page *subpage = folio_page(folio, idx);

	clear_user_highpage(subpage, addr);

	return 0;
}
7263
7264	/**
7265	* folio_zero_user - Zero a folio which will be mapped to userspace.
7266	* @folio: The folio to zero.
7267	* @addr_hint: The address will be accessed or the base address if uncelar.
7268	*/
7269	void folio_zero_user(struct folio folio, unsigned* long addr_hint)
7270	{
7271	unsigned int nr_pages = folio_nr_pages(folio);
7272
7273	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
7274	clear_gigantic_page(folio, addr_hint, nr_pages);
7275	else
7276	process_huge_page(addr_hint, nr_pages, process_subpage: clear_subpage, arg: folio);
7277	}
7278
7279	static int copy_user_gigantic_page(struct folio dst, struct* folio *src,
7280	unsigned long addr_hint,
7281	struct vm_area_struct *vma,
7282	unsigned int nr_pages)
7283	{
7284	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst));
7285	struct page *dst_page;
7286	struct page *src_page;
7287	int i;
7288
7289	for (i = `0`; i < nr_pages; i++) {
7290	dst_page = folio_page(dst, i);
7291	src_page = folio_page(src, i);
7292
7293	cond_resched();
7294	if (copy_mc_user_highpage(to: dst_page, from: src_page,
7295	vaddr: addr + i*PAGE_SIZE, vma))
7296	return -EHWPOISON;
7297	}
7298	return `0`;
7299	}
7300
/* Argument bundle handed to copy_subpage() through process_huge_page(). */
struct copy_subpage_arg {
	struct folio *dst;		/* folio being written */
	struct folio *src;		/* folio being read */
	struct vm_area_struct *vma;	/* passed on to copy_mc_user_highpage() */
};
7306
7307	static int copy_subpage(unsigned long addr, int idx, void *arg)
7308	{
7309	struct copy_subpage_arg *copy_arg = arg;
7310	struct page *dst = folio_page(copy_arg->dst, idx);
7311	struct page *src = folio_page(copy_arg->src, idx);
7312
7313	if (copy_mc_user_highpage(to: dst, from: src, vaddr: addr, vma: copy_arg->vma))
7314	return -EHWPOISON;
7315	return `0`;
7316	}
7317
7318	int copy_user_large_folio(struct folio dst, struct* folio *src,
7319	unsigned long addr_hint, struct vm_area_struct *vma)
7320	{
7321	unsigned int nr_pages = folio_nr_pages(folio: dst);
7322	struct copy_subpage_arg arg = {
7323	.dst = dst,
7324	.src = src,
7325	.vma = vma,
7326	};
7327
7328	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
7329	return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages);
7330
7331	return process_huge_page(addr_hint, nr_pages, process_subpage: copy_subpage, arg: &arg);
7332	}
7333
7334	long copy_folio_from_user(struct folio *dst_folio,
7335	const void __user *usr_src,
7336	bool allow_pagefault)
7337	{
7338	void *kaddr;
7339	unsigned long i, rc = `0`;
7340	unsigned int nr_pages = folio_nr_pages(folio: dst_folio);
7341	unsigned long ret_val = nr_pages * PAGE_SIZE;
7342	struct page *subpage;
7343
7344	for (i = `0`; i < nr_pages; i++) {
7345	subpage = folio_page(dst_folio, i);
7346	kaddr = kmap_local_page(page: subpage);
7347	if (!allow_pagefault)
7348	pagefault_disable();
7349	rc = copy_from_user(to: kaddr, from: usr_src + i * PAGE_SIZE, PAGE_SIZE);
7350	if (!allow_pagefault)
7351	pagefault_enable();
7352	kunmap_local(kaddr);
7353
7354	ret_val -= (PAGE_SIZE - rc);
7355	if (rc)
7356	break;
7357
7358	flush_dcache_page(page: subpage);
7359
7360	cond_resched();
7361	}
7362	return ret_val;
7363	}
7364	#endif /* CONFIG_TRANSPARENT_HUGEPAGE \|\| CONFIG_HUGETLBFS */
7365
7366	#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS
7367
7368	static struct kmem_cache *page_ptl_cachep;
7369
7370	void __init ptlock_cache_init(void)
7371	{
7372	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), `0`,
7373	SLAB_PANIC, NULL);
7374	}
7375
7376	bool ptlock_alloc(struct ptdesc *ptdesc)
7377	{
7378	spinlock_t *ptl;
7379
7380	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
7381	if (!ptl)
7382	return false;
7383	ptdesc->ptl = ptl;
7384	return true;
7385	}
7386
7387	void ptlock_free(struct ptdesc *ptdesc)
7388	{
7389	if (ptdesc->ptl)
7390	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
7391	}
7392	#endif
7393
/*
 * Begin a page table walk over @vma: for hugetlb VMAs this takes the
 * hugetlb VMA lock for read, for every other VMA it does nothing.
 */
void vma_pgtable_walk_begin(struct vm_area_struct *vma)
{
	if (!is_vm_hugetlb_page(vma))
		return;

	hugetlb_vma_lock_read(vma);
}
7399
/*
 * End a page table walk over @vma: for hugetlb VMAs this drops the read
 * side of the hugetlb VMA lock, for every other VMA it does nothing.
 */
void vma_pgtable_walk_end(struct vm_area_struct *vma)
{
	if (!is_vm_hugetlb_page(vma))
		return;

	hugetlb_vma_unlock_read(vma);
}
7405

source code of linux/mm/memory.c