huge_memory.c source code [linux/mm/huge_memory.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 2009 Red Hat, Inc.
4	*/
5
6	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8	#include <linux/mm.h>
9	#include <linux/sched.h>
10	#include <linux/sched/mm.h>
11	#include <linux/sched/coredump.h>
12	#include <linux/sched/numa_balancing.h>
13	#include <linux/highmem.h>
14	#include <linux/hugetlb.h>
15	#include <linux/mmu_notifier.h>
16	#include <linux/rmap.h>
17	#include <linux/swap.h>
18	#include <linux/shrinker.h>
19	#include <linux/mm_inline.h>
20	#include <linux/swapops.h>
21	#include <linux/backing-dev.h>
22	#include <linux/dax.h>
23	#include <linux/khugepaged.h>
24	#include <linux/freezer.h>
25	#include <linux/pfn_t.h>
26	#include <linux/mman.h>
27	#include <linux/memremap.h>
28	#include <linux/pagemap.h>
29	#include <linux/debugfs.h>
30	#include <linux/migrate.h>
31	#include <linux/hashtable.h>
32	#include <linux/userfaultfd_k.h>
33	#include <linux/page_idle.h>
34	#include <linux/shmem_fs.h>
35	#include <linux/oom.h>
36	#include <linux/numa.h>
37	#include <linux/page_owner.h>
38	#include <linux/sched/sysctl.h>
39	#include <linux/memory-tiers.h>
40
41	#include <asm/tlb.h>
42	#include <asm/pgalloc.h>
43	#include "internal.h"
44	#include "swap.h"
45
46	#define CREATE_TRACE_POINTS
47	#include <trace/events/thp.h>
48
49	/*
50	* By default, transparent hugepage support is disabled in order to avoid
51	* risking an increased memory footprint for applications that are not
52	* guaranteed to benefit from it. When transparent hugepage support is
53	* enabled, it is for all mappings, and khugepaged scans all mappings.
54	* Defrag is invoked by khugepaged hugepage allocations and by page faults
55	* for all hugepage allocations.
56	*/
57	unsigned long transparent_hugepage_flags __read_mostly =
58	#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
59	(`1`<<TRANSPARENT_HUGEPAGE_FLAG)\|
60	#endif
61	#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
62	(`1`<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)\|
63	#endif
64	(`1`<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)\|
65	(`1`<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)\|
66	(`1`<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
67
68	static struct shrinker *deferred_split_shrinker;
69	static unsigned long deferred_split_count(struct shrinker *shrink,
70	struct shrink_control *sc);
71	static unsigned long deferred_split_scan(struct shrinker *shrink,
72	struct shrink_control *sc);
73
74	static atomic_t huge_zero_refcount;
75	struct page *huge_zero_page __read_mostly;
76	unsigned long huge_zero_pfn __read_mostly = ~`0UL`;
77
78	bool hugepage_vma_check(struct vm_area_struct vma, unsigned* long vm_flags,
79	bool smaps, bool in_pf, bool enforce_sysfs)
80	{
81	if (!vma->vm_mm) / vdso /
82	return false;
83
84	/*
85	* Explicitly disabled through madvise or prctl, or some
86	* architectures may disable THP for some mappings, for
87	* example, s390 kvm.
88	* */
89	if ((vm_flags & VM_NOHUGEPAGE) \|\|
90	test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
91	return false;
92	/*
93	* If the hardware/firmware marked hugepage support disabled.
94	*/
95	if (transparent_hugepage_flags & (`1` << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
96	return false;
97
98	/ khugepaged doesn't collapse DAX vma, but page fault is fine. /
99	if (vma_is_dax(vma))
100	return in_pf;
101
102	/*
103	* khugepaged special VMA and hugetlb VMA.
104	* Must be checked after dax since some dax mappings may have
105	* VM_MIXEDMAP set.
106	*/
107	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
108	return false;
109
110	/*
111	* Check alignment for file vma and size for both file and anon vma.
112	*
113	* Skip the check for page fault. Huge fault does the check in fault
114	* handlers. And this check is not suitable for huge PUD fault.
115	*/
116	if (!in_pf &&
117	!transhuge_vma_suitable(vma, addr: (vma->vm_end - HPAGE_PMD_SIZE)))
118	return false;
119
120	/*
121	* Enabled via shmem mount options or sysfs settings.
122	* Must be done before hugepage flags check since shmem has its
123	* own flags.
124	*/
125	if (!in_pf && shmem_file(file: vma->vm_file))
126	return shmem_is_huge(inode: file_inode(f: vma->vm_file), index: vma->vm_pgoff,
127	shmem_huge_force: !enforce_sysfs, mm: vma->vm_mm, vm_flags);
128
129	/ Enforce sysfs THP requirements as necessary /
130	if (enforce_sysfs &&
131	(!hugepage_flags_enabled() \|\| (!(vm_flags & VM_HUGEPAGE) &&
132	!hugepage_flags_always())))
133	return false;
134
135	if (!vma_is_anonymous(vma)) {
136	/*
137	* Trust that ->huge_fault() handlers know what they are doing
138	* in fault path.
139	*/
140	if (((in_pf \|\| smaps)) && vma->vm_ops->huge_fault)
141	return true;
142	/ Only regular file is valid in collapse path /
143	if (((!in_pf \|\| smaps)) && file_thp_enabled(vma))
144	return true;
145	return false;
146	}
147
148	if (vma_is_temporary_stack(vma))
149	return false;
150
151	/*
152	* THPeligible bit of smaps should show 1 for proper VMAs even
153	* though anon_vma is not initialized yet.
154	*
155	* Allow page fault since anon_vma may be not initialized until
156	* the first page fault.
157	*/
158	if (!vma->anon_vma)
159	return (smaps \|\| in_pf);
160
161	return true;
162	}
163
164	static bool get_huge_zero_page(void)
165	{
166	struct page *zero_page;
167	retry:
168	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
169	return true;
170
171	zero_page = alloc_pages(gfp: (GFP_TRANSHUGE \| __GFP_ZERO) & ~__GFP_MOVABLE,
172	HPAGE_PMD_ORDER);
173	if (!zero_page) {
174	count_vm_event(item: THP_ZERO_PAGE_ALLOC_FAILED);
175	return false;
176	}
177	preempt_disable();
178	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
179	preempt_enable();
180	__free_pages(page: zero_page, order: compound_order(page: zero_page));
181	goto retry;
182	}
183	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
184
185	/ We take additional reference here. It will be put back by shrinker /
186	atomic_set(v: &huge_zero_refcount, i: `2`);
187	preempt_enable();
188	count_vm_event(item: THP_ZERO_PAGE_ALLOC);
189	return true;
190	}
191
192	static void put_huge_zero_page(void)
193	{
194	/*
195	* Counter should never go to zero here. Only shrinker can put
196	* last reference.
197	*/
198	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
199	}
200
201	struct page mm_get_huge_zero_page(struct* mm_struct *mm)
202	{
203	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
204	return READ_ONCE(huge_zero_page);
205
206	if (!get_huge_zero_page())
207	return NULL;
208
209	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, addr: &mm->flags))
210	put_huge_zero_page();
211
212	return READ_ONCE(huge_zero_page);
213	}
214
215	void mm_put_huge_zero_page(struct mm_struct *mm)
216	{
217	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
218	put_huge_zero_page();
219	}
220
221	static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
222	struct shrink_control *sc)
223	{
224	/ we can free zero page only if last reference remains /
225	return atomic_read(v: &huge_zero_refcount) == `1` ? HPAGE_PMD_NR : `0`;
226	}
227
228	static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
229	struct shrink_control *sc)
230	{
231	if (atomic_cmpxchg(v: &huge_zero_refcount, old: `1`, new: `0`) == `1`) {
232	struct page *zero_page = xchg(&huge_zero_page, NULL);
233	BUG_ON(zero_page == NULL);
234	WRITE_ONCE(huge_zero_pfn, ~`0UL`);
235	__free_pages(page: zero_page, order: compound_order(page: zero_page));
236	return HPAGE_PMD_NR;
237	}
238
239	return `0`;
240	}
241
242	static struct shrinker *huge_zero_page_shrinker;
243
244	#ifdef CONFIG_SYSFS
245	static ssize_t enabled_show(struct kobject *kobj,
246	struct kobj_attribute attr, char* *buf)
247	{
248	const char *output;
249
250	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
251	output = "[always] madvise never";
252	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
253	&transparent_hugepage_flags))
254	output = "always [madvise] never";
255	else
256	output = "always madvise [never]";
257
258	return sysfs_emit(buf, fmt: "%s\n", output);
259	}
260
261	static ssize_t enabled_store(struct kobject *kobj,
262	struct kobj_attribute *attr,
263	const char *buf, size_t count)
264	{
265	ssize_t ret = count;
266
267	if (sysfs_streq(s1: buf, s2: "always")) {
268	clear_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
269	set_bit(nr: TRANSPARENT_HUGEPAGE_FLAG, addr: &transparent_hugepage_flags);
270	} else if (sysfs_streq(s1: buf, s2: "madvise")) {
271	clear_bit(nr: TRANSPARENT_HUGEPAGE_FLAG, addr: &transparent_hugepage_flags);
272	set_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
273	} else if (sysfs_streq(s1: buf, s2: "never")) {
274	clear_bit(nr: TRANSPARENT_HUGEPAGE_FLAG, addr: &transparent_hugepage_flags);
275	clear_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
276	} else
277	ret = -EINVAL;
278
279	if (ret > `0`) {
280	int err = start_stop_khugepaged();
281	if (err)
282	ret = err;
283	}
284	return ret;
285	}
286
287	static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
288
289	ssize_t single_hugepage_flag_show(struct kobject *kobj,
290	struct kobj_attribute attr, char* *buf,
291	enum transparent_hugepage_flag flag)
292	{
293	return sysfs_emit(buf, fmt: "%d\n",
294	!!test_bit(flag, &transparent_hugepage_flags));
295	}
296
297	ssize_t single_hugepage_flag_store(struct kobject *kobj,
298	struct kobj_attribute *attr,
299	const char *buf, size_t count,
300	enum transparent_hugepage_flag flag)
301	{
302	unsigned long value;
303	int ret;
304
305	ret = kstrtoul(s: buf, base: `10`, res: &value);
306	if (ret < `0`)
307	return ret;
308	if (value > `1`)
309	return -EINVAL;
310
311	if (value)
312	set_bit(nr: flag, addr: &transparent_hugepage_flags);
313	else
314	clear_bit(nr: flag, addr: &transparent_hugepage_flags);
315
316	return count;
317	}
318
319	static ssize_t defrag_show(struct kobject *kobj,
320	struct kobj_attribute attr, char* *buf)
321	{
322	const char *output;
323
324	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
325	&transparent_hugepage_flags))
326	output = "[always] defer defer+madvise madvise never";
327	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
328	&transparent_hugepage_flags))
329	output = "always [defer] defer+madvise madvise never";
330	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
331	&transparent_hugepage_flags))
332	output = "always defer [defer+madvise] madvise never";
333	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
334	&transparent_hugepage_flags))
335	output = "always defer defer+madvise [madvise] never";
336	else
337	output = "always defer defer+madvise madvise [never]";
338
339	return sysfs_emit(buf, fmt: "%s\n", output);
340	}
341
342	static ssize_t defrag_store(struct kobject *kobj,
343	struct kobj_attribute *attr,
344	const char *buf, size_t count)
345	{
346	if (sysfs_streq(s1: buf, s2: "always")) {
347	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
348	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
349	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
350	set_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
351	} else if (sysfs_streq(s1: buf, s2: "defer+madvise")) {
352	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
353	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
354	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
355	set_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
356	} else if (sysfs_streq(s1: buf, s2: "defer")) {
357	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
358	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
359	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
360	set_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
361	} else if (sysfs_streq(s1: buf, s2: "madvise")) {
362	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
363	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
364	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
365	set_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
366	} else if (sysfs_streq(s1: buf, s2: "never")) {
367	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, addr: &transparent_hugepage_flags);
368	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, addr: &transparent_hugepage_flags);
369	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, addr: &transparent_hugepage_flags);
370	clear_bit(nr: TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, addr: &transparent_hugepage_flags);
371	} else
372	return -EINVAL;
373
374	return count;
375	}
376	static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
377
378	static ssize_t use_zero_page_show(struct kobject *kobj,
379	struct kobj_attribute attr, char* *buf)
380	{
381	return single_hugepage_flag_show(kobj, attr, buf,
382	flag: TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
383	}
384	static ssize_t use_zero_page_store(struct kobject *kobj,
385	struct kobj_attribute attr, const* char *buf, size_t count)
386	{
387	return single_hugepage_flag_store(kobj, attr, buf, count,
388	flag: TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
389	}
390	static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
391
392	static ssize_t hpage_pmd_size_show(struct kobject *kobj,
393	struct kobj_attribute attr, char* *buf)
394	{
395	return sysfs_emit(buf, fmt: "%lu\n", HPAGE_PMD_SIZE);
396	}
397	static struct kobj_attribute hpage_pmd_size_attr =
398	__ATTR_RO(hpage_pmd_size);
399
400	static struct attribute *hugepage_attr[] = {
401	&enabled_attr.attr,
402	&defrag_attr.attr,
403	&use_zero_page_attr.attr,
404	&hpage_pmd_size_attr.attr,
405	#ifdef CONFIG_SHMEM
406	&shmem_enabled_attr.attr,
407	#endif
408	NULL,
409	};
410
411	static const struct attribute_group hugepage_attr_group = {
412	.attrs = hugepage_attr,
413	};
414
415	static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
416	{
417	int err;
418
419	*hugepage_kobj = kobject_create_and_add(name: "transparent_hugepage", parent: mm_kobj);
420	if (unlikely(!*hugepage_kobj)) {
421	pr_err("failed to create transparent hugepage kobject\n");
422	return -ENOMEM;
423	}
424
425	err = sysfs_create_group(kobj: *hugepage_kobj, grp: &hugepage_attr_group);
426	if (err) {
427	pr_err("failed to register transparent hugepage group\n");
428	goto delete_obj;
429	}
430
431	err = sysfs_create_group(kobj: *hugepage_kobj, grp: &khugepaged_attr_group);
432	if (err) {
433	pr_err("failed to register transparent hugepage group\n");
434	goto remove_hp_group;
435	}
436
437	return `0`;
438
439	remove_hp_group:
440	sysfs_remove_group(kobj: *hugepage_kobj, grp: &hugepage_attr_group);
441	delete_obj:
442	kobject_put(kobj: *hugepage_kobj);
443	return err;
444	}
445
446	static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
447	{
448	sysfs_remove_group(kobj: hugepage_kobj, grp: &khugepaged_attr_group);
449	sysfs_remove_group(kobj: hugepage_kobj, grp: &hugepage_attr_group);
450	kobject_put(kobj: hugepage_kobj);
451	}
452	#else
/* Stubs used when CONFIG_SYSFS is disabled: nothing to create or remove. */
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
461	#endif /* CONFIG_SYSFS */
462
463	static int __init thp_shrinker_init(void)
464	{
465	huge_zero_page_shrinker = shrinker_alloc(flags: `0`, fmt: "thp-zero");
466	if (!huge_zero_page_shrinker)
467	return -ENOMEM;
468
469	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE \|
470	SHRINKER_MEMCG_AWARE \|
471	SHRINKER_NONSLAB,
472	fmt: "thp-deferred_split");
473	if (!deferred_split_shrinker) {
474	shrinker_free(shrinker: huge_zero_page_shrinker);
475	return -ENOMEM;
476	}
477
478	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
479	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
480	shrinker_register(shrinker: huge_zero_page_shrinker);
481
482	deferred_split_shrinker->count_objects = deferred_split_count;
483	deferred_split_shrinker->scan_objects = deferred_split_scan;
484	shrinker_register(shrinker: deferred_split_shrinker);
485
486	return `0`;
487	}
488
489	static void __init thp_shrinker_exit(void)
490	{
491	shrinker_free(shrinker: huge_zero_page_shrinker);
492	shrinker_free(shrinker: deferred_split_shrinker);
493	}
494
495	static int __init hugepage_init(void)
496	{
497	int err;
498	struct kobject *hugepage_kobj;
499
500	if (!has_transparent_hugepage()) {
501	transparent_hugepage_flags = `1` << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
502	return -EINVAL;
503	}
504
505	/*
506	* hugepages can't be allocated by the buddy allocator
507	*/
508	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
509	/*
510	* we use page->mapping and page->index in second tail page
511	* as list_head: assuming THP order >= 2
512	*/
513	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < `2`);
514
515	err = hugepage_init_sysfs(hugepage_kobj: &hugepage_kobj);
516	if (err)
517	goto err_sysfs;
518
519	err = khugepaged_init();
520	if (err)
521	goto err_slab;
522
523	err = thp_shrinker_init();
524	if (err)
525	goto err_shrinker;
526
527	/*
528	* By default disable transparent hugepages on smaller systems,
529	* where the extra memory used could hurt more than TLB overhead
530	* is likely to save. The admin can still enable it through /sys.
531	*/
532	if (totalram_pages() < (`512` << (`20` - PAGE_SHIFT))) {
533	transparent_hugepage_flags = `0`;
534	return `0`;
535	}
536
537	err = start_stop_khugepaged();
538	if (err)
539	goto err_khugepaged;
540
541	return `0`;
542	err_khugepaged:
543	thp_shrinker_exit();
544	err_shrinker:
545	khugepaged_destroy();
546	err_slab:
547	hugepage_exit_sysfs(hugepage_kobj);
548	err_sysfs:
549	return err;
550	}
551	subsys_initcall(hugepage_init);
552
553	static int __init setup_transparent_hugepage(char *str)
554	{
555	int ret = `0`;
556	if (!str)
557	goto out;
558	if (!strcmp(str, "always")) {
559	set_bit(nr: TRANSPARENT_HUGEPAGE_FLAG,
560	addr: &transparent_hugepage_flags);
561	clear_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
562	addr: &transparent_hugepage_flags);
563	ret = `1`;
564	} else if (!strcmp(str, "madvise")) {
565	clear_bit(nr: TRANSPARENT_HUGEPAGE_FLAG,
566	addr: &transparent_hugepage_flags);
567	set_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
568	addr: &transparent_hugepage_flags);
569	ret = `1`;
570	} else if (!strcmp(str, "never")) {
571	clear_bit(nr: TRANSPARENT_HUGEPAGE_FLAG,
572	addr: &transparent_hugepage_flags);
573	clear_bit(nr: TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
574	addr: &transparent_hugepage_flags);
575	ret = `1`;
576	}
577	out:
578	if (!ret)
579	pr_warn("transparent_hugepage= cannot parse, ignored\n");
580	return ret;
581	}
582	__setup("transparent_hugepage=", setup_transparent_hugepage);
583
584	pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
585	{
586	if (likely(vma->vm_flags & VM_WRITE))
587	pmd = pmd_mkwrite(pmd, vma);
588	return pmd;
589	}
590
591	#ifdef CONFIG_MEMCG
592	static inline
593	struct deferred_split get_deferred_split_queue(struct* folio *folio)
594	{
595	struct mem_cgroup *memcg = folio_memcg(folio);
596	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
597
598	if (memcg)
599	return &memcg->deferred_split_queue;
600	else
601	return &pgdat->deferred_split_queue;
602	}
603	#else
604	static inline
605	struct deferred_split get_deferred_split_queue(struct* folio *folio)
606	{
607	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
608
609	return &pgdat->deferred_split_queue;
610	}
611	#endif
612
613	void folio_prep_large_rmappable(struct folio *folio)
614	{
615	VM_BUG_ON_FOLIO(folio_order(folio) < `2`, folio);
616	INIT_LIST_HEAD(list: &folio->_deferred_list);
617	folio_set_large_rmappable(folio);
618	}
619
620	static inline bool is_transparent_hugepage(struct folio *folio)
621	{
622	if (!folio_test_large(folio))
623	return false;
624
625	return is_huge_zero_page(page: &folio->page) \|\|
626	folio_test_large_rmappable(folio);
627	}
628
629	static unsigned long __thp_get_unmapped_area(struct file *filp,
630	unsigned long addr, unsigned long len,
631	loff_t off, unsigned long flags, unsigned long size)
632	{
633	loff_t off_end = off + len;
634	loff_t off_align = round_up(off, size);
635	unsigned long len_pad, ret;
636
637	if (off_end <= off_align \|\| (off_end - off_align) < size)
638	return `0`;
639
640	len_pad = len + size;
641	if (len_pad < len \|\| (off + len_pad) < off)
642	return `0`;
643
644	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
645	off >> PAGE_SHIFT, flags);
646
647	/*
648	* The failure might be due to length padding. The caller will retry
649	* without the padding.
650	*/
651	if (IS_ERR_VALUE(ret))
652	return `0`;
653
654	/*
655	* Do not try to align to THP boundary if allocation at the address
656	* hint succeeds.
657	*/
658	if (ret == addr)
659	return addr;
660
661	ret += (off - ret) & (size - `1`);
662	return ret;
663	}
664
665	unsigned long thp_get_unmapped_area(struct file filp, unsigned* long addr,
666	unsigned long len, unsigned long pgoff, unsigned long flags)
667	{
668	unsigned long ret;
669	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
670
671	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
672	if (ret)
673	return ret;
674
675	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
676	}
677	EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
678
679	static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
680	struct page *page, gfp_t gfp)
681	{
682	struct vm_area_struct *vma = vmf->vma;
683	struct folio *folio = page_folio(page);
684	pgtable_t pgtable;
685	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
686	vm_fault_t ret = `0`;
687
688	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
689
690	if (mem_cgroup_charge(folio, mm: vma->vm_mm, gfp)) {
691	folio_put(folio);
692	count_vm_event(item: THP_FAULT_FALLBACK);
693	count_vm_event(item: THP_FAULT_FALLBACK_CHARGE);
694	return VM_FAULT_FALLBACK;
695	}
696	folio_throttle_swaprate(folio, gfp);
697
698	pgtable = pte_alloc_one(vma->vm_mm);
699	if (unlikely(!pgtable)) {
700	ret = VM_FAULT_OOM;
701	goto release;
702	}
703
704	clear_huge_page(page, addr_hint: vmf->address, HPAGE_PMD_NR);
705	/*
706	* The memory barrier inside __folio_mark_uptodate makes sure that
707	* clear_huge_page writes become visible before the set_pmd_at()
708	* write.
709	*/
710	__folio_mark_uptodate(folio);
711
712	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
713	if (unlikely(!pmd_none(*vmf->pmd))) {
714	goto unlock_release;
715	} else {
716	pmd_t entry;
717
718	ret = check_stable_address_space(mm: vma->vm_mm);
719	if (ret)
720	goto unlock_release;
721
722	/ Deliver the page fault to userland /
723	if (userfaultfd_missing(vma)) {
724	spin_unlock(lock: vmf->ptl);
725	folio_put(folio);
726	pte_free(mm: vma->vm_mm, pte_page: pgtable);
727	ret = handle_userfault(vmf, VM_UFFD_MISSING);
728	VM_BUG_ON(ret & VM_FAULT_FALLBACK);
729	return ret;
730	}
731
732	entry = mk_huge_pmd(page, vma->vm_page_prot);
733	entry = maybe_pmd_mkwrite(pmd: pmd_mkdirty(pmd: entry), vma);
734	folio_add_new_anon_rmap(folio, vma, address: haddr);
735	folio_add_lru_vma(folio, vma);
736	pgtable_trans_huge_deposit(mm: vma->vm_mm, pmdp: vmf->pmd, pgtable);
737	set_pmd_at(mm: vma->vm_mm, addr: haddr, pmdp: vmf->pmd, pmd: entry);
738	update_mmu_cache_pmd(vma, addr: vmf->address, pmd: vmf->pmd);
739	add_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES, HPAGE_PMD_NR);
740	mm_inc_nr_ptes(mm: vma->vm_mm);
741	spin_unlock(lock: vmf->ptl);
742	count_vm_event(item: THP_FAULT_ALLOC);
743	count_memcg_event_mm(mm: vma->vm_mm, idx: THP_FAULT_ALLOC);
744	}
745
746	return `0`;
747	unlock_release:
748	spin_unlock(lock: vmf->ptl);
749	release:
750	if (pgtable)
751	pte_free(mm: vma->vm_mm, pte_page: pgtable);
752	folio_put(folio);
753	return ret;
754
755	}
756
757	/*
758	* always: directly stall for all thp allocations
759	* defer: wake kswapd and fail if not immediately available
760	* defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
761	* fail if not immediately available
762	* madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
763	* available
764	* never: never stall for any thp allocation
765	*/
766	gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
767	{
768	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
769
770	/ Always do synchronous compaction /
771	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
772	return GFP_TRANSHUGE \| (vma_madvised ? `0` : __GFP_NORETRY);
773
774	/ Kick kcompactd and fail quickly /
775	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
776	return GFP_TRANSHUGE_LIGHT \| __GFP_KSWAPD_RECLAIM;
777
778	/ Synchronous compaction if madvised, otherwise kick kcompactd /
779	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
780	return GFP_TRANSHUGE_LIGHT \|
781	(vma_madvised ? __GFP_DIRECT_RECLAIM :
782	__GFP_KSWAPD_RECLAIM);
783
784	/ Only do synchronous compaction if madvised /
785	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
786	return GFP_TRANSHUGE_LIGHT \|
787	(vma_madvised ? __GFP_DIRECT_RECLAIM : `0`);
788
789	return GFP_TRANSHUGE_LIGHT;
790	}
791
792	/ Caller must hold page table lock. /
793	static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
794	struct vm_area_struct vma, unsigned* long haddr, pmd_t *pmd,
795	struct page *zero_page)
796	{
797	pmd_t entry;
798	if (!pmd_none(pmd: *pmd))
799	return;
800	entry = mk_pmd(zero_page, vma->vm_page_prot);
801	entry = pmd_mkhuge(pmd: entry);
802	pgtable_trans_huge_deposit(mm, pmdp: pmd, pgtable);
803	set_pmd_at(mm, addr: haddr, pmdp: pmd, pmd: entry);
804	mm_inc_nr_ptes(mm);
805	}
806
807	vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
808	{
809	struct vm_area_struct *vma = vmf->vma;
810	gfp_t gfp;
811	struct folio *folio;
812	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
813
814	if (!transhuge_vma_suitable(vma, addr: haddr))
815	return VM_FAULT_FALLBACK;
816	if (unlikely(anon_vma_prepare(vma)))
817	return VM_FAULT_OOM;
818	khugepaged_enter_vma(vma, vm_flags: vma->vm_flags);
819
820	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
821	!mm_forbids_zeropage(vma->vm_mm) &&
822	transparent_hugepage_use_zero_page()) {
823	pgtable_t pgtable;
824	struct page *zero_page;
825	vm_fault_t ret;
826	pgtable = pte_alloc_one(vma->vm_mm);
827	if (unlikely(!pgtable))
828	return VM_FAULT_OOM;
829	zero_page = mm_get_huge_zero_page(mm: vma->vm_mm);
830	if (unlikely(!zero_page)) {
831	pte_free(mm: vma->vm_mm, pte_page: pgtable);
832	count_vm_event(item: THP_FAULT_FALLBACK);
833	return VM_FAULT_FALLBACK;
834	}
835	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
836	ret = `0`;
837	if (pmd_none(pmd: *vmf->pmd)) {
838	ret = check_stable_address_space(mm: vma->vm_mm);
839	if (ret) {
840	spin_unlock(lock: vmf->ptl);
841	pte_free(mm: vma->vm_mm, pte_page: pgtable);
842	} else if (userfaultfd_missing(vma)) {
843	spin_unlock(lock: vmf->ptl);
844	pte_free(mm: vma->vm_mm, pte_page: pgtable);
845	ret = handle_userfault(vmf, VM_UFFD_MISSING);
846	VM_BUG_ON(ret & VM_FAULT_FALLBACK);
847	} else {
848	set_huge_zero_page(pgtable, mm: vma->vm_mm, vma,
849	haddr, pmd: vmf->pmd, zero_page);
850	update_mmu_cache_pmd(vma, addr: vmf->address, pmd: vmf->pmd);
851	spin_unlock(lock: vmf->ptl);
852	}
853	} else {
854	spin_unlock(lock: vmf->ptl);
855	pte_free(mm: vma->vm_mm, pte_page: pgtable);
856	}
857	return ret;
858	}
859	gfp = vma_thp_gfp_mask(vma);
860	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, addr: haddr, hugepage: true);
861	if (unlikely(!folio)) {
862	count_vm_event(item: THP_FAULT_FALLBACK);
863	return VM_FAULT_FALLBACK;
864	}
865	return __do_huge_pmd_anonymous_page(vmf, page: &folio->page, gfp);
866	}
867
868	static void insert_pfn_pmd(struct vm_area_struct vma, unsigned* long addr,
869	pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
870	pgtable_t pgtable)
871	{
872	struct mm_struct *mm = vma->vm_mm;
873	pmd_t entry;
874	spinlock_t *ptl;
875
876	ptl = pmd_lock(mm, pmd);
877	if (!pmd_none(pmd: *pmd)) {
878	if (write) {
879	if (pmd_pfn(pmd: *pmd) != pfn_t_to_pfn(pfn)) {
880	WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
881	goto out_unlock;
882	}
883	entry = pmd_mkyoung(pmd: *pmd);
884	entry = maybe_pmd_mkwrite(pmd: pmd_mkdirty(pmd: entry), vma);
885	if (pmdp_set_access_flags(vma, address: addr, pmdp: pmd, entry, dirty: `1`))
886	update_mmu_cache_pmd(vma, addr, pmd);
887	}
888
889	goto out_unlock;
890	}
891
892	entry = pmd_mkhuge(pmd: pfn_t_pmd(pfn, pgprot: prot));
893	if (pfn_t_devmap(pfn))
894	entry = pmd_mkdevmap(pmd: entry);
895	if (write) {
896	entry = pmd_mkyoung(pmd: pmd_mkdirty(pmd: entry));
897	entry = maybe_pmd_mkwrite(pmd: entry, vma);
898	}
899
900	if (pgtable) {
901	pgtable_trans_huge_deposit(mm, pmdp: pmd, pgtable);
902	mm_inc_nr_ptes(mm);
903	pgtable = NULL;
904	}
905
906	set_pmd_at(mm, addr, pmdp: pmd, pmd: entry);
907	update_mmu_cache_pmd(vma, addr, pmd);
908
909	out_unlock:
910	spin_unlock(lock: ptl);
911	if (pgtable)
912	pte_free(mm, pte_page: pgtable);
913	}
914
915	/**
916	* vmf_insert_pfn_pmd - insert a pmd size pfn
917	* @vmf: Structure describing the fault
918	* @pfn: pfn to insert
919	* @write: whether it's a write fault
920	*
921	* Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
922	*
923	* Return: vm_fault_t value.
924	*/
925	vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
926	{
927	unsigned long addr = vmf->address & PMD_MASK;
928	struct vm_area_struct *vma = vmf->vma;
929	pgprot_t pgprot = vma->vm_page_prot;
930	pgtable_t pgtable = NULL;
931
932	/*
933	* If we had pmd_special, we could avoid all these restrictions,
934	* but we need to be consistent with PTEs and architectures that
935	* can't support a 'special' bit.
936	*/
937	BUG_ON(!(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) &&
938	!pfn_t_devmap(pfn));
939	BUG_ON((vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) ==
940	(VM_PFNMAP\|VM_MIXEDMAP));
941	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
942
943	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
944	return VM_FAULT_SIGBUS;
945
946	if (arch_needs_pgtable_deposit()) {
947	pgtable = pte_alloc_one(vma->vm_mm);
948	if (!pgtable)
949	return VM_FAULT_OOM;
950	}
951
952	track_pfn_insert(vma, prot: &pgprot, pfn);
953
954	insert_pfn_pmd(vma, addr, pmd: vmf->pmd, pfn, prot: pgprot, write, pgtable);
955	return VM_FAULT_NOPAGE;
956	}
957	EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
958
959	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
960	static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
961	{
962	if (likely(vma->vm_flags & VM_WRITE))
963	pud = pud_mkwrite(pud);
964	return pud;
965	}
966
967	static void insert_pfn_pud(struct vm_area_struct vma, unsigned* long addr,
968	pud_t *pud, pfn_t pfn, bool write)
969	{
970	struct mm_struct *mm = vma->vm_mm;
971	pgprot_t prot = vma->vm_page_prot;
972	pud_t entry;
973	spinlock_t *ptl;
974
975	ptl = pud_lock(mm, pud);
976	if (!pud_none(pud: *pud)) {
977	if (write) {
978	if (pud_pfn(pud: *pud) != pfn_t_to_pfn(pfn)) {
979	WARN_ON_ONCE(!is_huge_zero_pud(*pud));
980	goto out_unlock;
981	}
982	entry = pud_mkyoung(pud: *pud);
983	entry = maybe_pud_mkwrite(pud: pud_mkdirty(pud: entry), vma);
984	if (pudp_set_access_flags(vma, address: addr, pudp: pud, entry, dirty: `1`))
985	update_mmu_cache_pud(vma, addr, pud);
986	}
987	goto out_unlock;
988	}
989
990	entry = pud_mkhuge(pud: pfn_t_pud(pfn, pgprot: prot));
991	if (pfn_t_devmap(pfn))
992	entry = pud_mkdevmap(pud: entry);
993	if (write) {
994	entry = pud_mkyoung(pud: pud_mkdirty(pud: entry));
995	entry = maybe_pud_mkwrite(pud: entry, vma);
996	}
997	set_pud_at(mm, addr, pudp: pud, pud: entry);
998	update_mmu_cache_pud(vma, addr, pud);
999
1000	out_unlock:
1001	spin_unlock(lock: ptl);
1002	}
1003
1004	/**
1005	* vmf_insert_pfn_pud - insert a pud size pfn
1006	* @vmf: Structure describing the fault
1007	* @pfn: pfn to insert
1008	* @write: whether it's a write fault
1009	*
1010	* Insert a pud size pfn. See vmf_insert_pfn() for additional info.
1011	*
1012	* Return: vm_fault_t value.
1013	*/
1014	vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
1015	{
1016	unsigned long addr = vmf->address & PUD_MASK;
1017	struct vm_area_struct *vma = vmf->vma;
1018	pgprot_t pgprot = vma->vm_page_prot;
1019
1020	/*
1021	* If we had pud_special, we could avoid all these restrictions,
1022	* but we need to be consistent with PTEs and architectures that
1023	* can't support a 'special' bit.
1024	*/
1025	BUG_ON(!(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) &&
1026	!pfn_t_devmap(pfn));
1027	BUG_ON((vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) ==
1028	(VM_PFNMAP\|VM_MIXEDMAP));
1029	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1030
1031	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
1032	return VM_FAULT_SIGBUS;
1033
1034	track_pfn_insert(vma, prot: &pgprot, pfn);
1035
1036	insert_pfn_pud(vma, addr, pud: vmf->pud, pfn, write);
1037	return VM_FAULT_NOPAGE;
1038	}
1039	EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1040	#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1041
1042	static void touch_pmd(struct vm_area_struct vma, unsigned* long addr,
1043	pmd_t *pmd, bool write)
1044	{
1045	pmd_t _pmd;
1046
1047	_pmd = pmd_mkyoung(pmd: *pmd);
1048	if (write)
1049	_pmd = pmd_mkdirty(pmd: _pmd);
1050	if (pmdp_set_access_flags(vma, address: addr & HPAGE_PMD_MASK,
1051	pmdp: pmd, entry: _pmd, dirty: write))
1052	update_mmu_cache_pmd(vma, addr, pmd);
1053	}
1054
1055	struct page follow_devmap_pmd(struct* vm_area_struct vma, unsigned* long addr,
1056	pmd_t pmd, int* flags, struct dev_pagemap **pgmap)
1057	{
1058	unsigned long pfn = pmd_pfn(pmd: *pmd);
1059	struct mm_struct *mm = vma->vm_mm;
1060	struct page *page;
1061	int ret;
1062
1063	assert_spin_locked(pmd_lockptr(mm, pmd));
1064
1065	if (flags & FOLL_WRITE && !pmd_write(pmd: *pmd))
1066	return NULL;
1067
1068	if (pmd_present(pmd: pmd) && pmd_devmap(pmd: pmd))
1069	/ pass /;
1070	else
1071	return NULL;
1072
1073	if (flags & FOLL_TOUCH)
1074	touch_pmd(vma, addr, pmd, write: flags & FOLL_WRITE);
1075
1076	/*
1077	* device mapped pages can only be returned if the
1078	* caller will manage the page reference count.
1079	*/
1080	if (!(flags & (FOLL_GET \| FOLL_PIN)))
1081	return ERR_PTR(error: -EEXIST);
1082
1083	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
1084	pgmap = get_dev_pagemap(pfn, pgmap: pgmap);
1085	if (!*pgmap)
1086	return ERR_PTR(error: -EFAULT);
1087	page = pfn_to_page(pfn);
1088	ret = try_grab_page(page, flags);
1089	if (ret)
1090	page = ERR_PTR(error: ret);
1091
1092	return page;
1093	}
1094
1095	int copy_huge_pmd(struct mm_struct dst_mm, struct* mm_struct *src_mm,
1096	pmd_t dst_pmd, pmd_t src_pmd, unsigned long addr,
1097	struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma)
1098	{
1099	spinlock_t dst_ptl, src_ptl;
1100	struct page *src_page;
1101	pmd_t pmd;
1102	pgtable_t pgtable = NULL;
1103	int ret = -ENOMEM;
1104
1105	/ Skip if can be re-fill on fault /
1106	if (!vma_is_anonymous(vma: dst_vma))
1107	return `0`;
1108
1109	pgtable = pte_alloc_one(dst_mm);
1110	if (unlikely(!pgtable))
1111	goto out;
1112
1113	dst_ptl = pmd_lock(mm: dst_mm, pmd: dst_pmd);
1114	src_ptl = pmd_lockptr(mm: src_mm, pmd: src_pmd);
1115	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1116
1117	ret = -EAGAIN;
1118	pmd = *src_pmd;
1119
1120	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1121	if (unlikely(is_swap_pmd(pmd))) {
1122	swp_entry_t entry = pmd_to_swp_entry(pmd);
1123
1124	VM_BUG_ON(!is_pmd_migration_entry(pmd));
1125	if (!is_readable_migration_entry(entry)) {
1126	entry = make_readable_migration_entry(
1127	offset: swp_offset(entry));
1128	pmd = swp_entry_to_pmd(entry);
1129	if (pmd_swp_soft_dirty(pmd: *src_pmd))
1130	pmd = pmd_swp_mksoft_dirty(pmd);
1131	if (pmd_swp_uffd_wp(pmd: *src_pmd))
1132	pmd = pmd_swp_mkuffd_wp(pmd);
1133	set_pmd_at(mm: src_mm, addr, pmdp: src_pmd, pmd);
1134	}
1135	add_mm_counter(mm: dst_mm, member: MM_ANONPAGES, HPAGE_PMD_NR);
1136	mm_inc_nr_ptes(mm: dst_mm);
1137	pgtable_trans_huge_deposit(mm: dst_mm, pmdp: dst_pmd, pgtable);
1138	if (!userfaultfd_wp(vma: dst_vma))
1139	pmd = pmd_swp_clear_uffd_wp(pmd);
1140	set_pmd_at(mm: dst_mm, addr, pmdp: dst_pmd, pmd);
1141	ret = `0`;
1142	goto out_unlock;
1143	}
1144	#endif
1145
1146	if (unlikely(!pmd_trans_huge(pmd))) {
1147	pte_free(mm: dst_mm, pte_page: pgtable);
1148	goto out_unlock;
1149	}
1150	/*
1151	* When page table lock is held, the huge zero pmd should not be
1152	* under splitting since we don't split the page itself, only pmd to
1153	* a page table.
1154	*/
1155	if (is_huge_zero_pmd(pmd)) {
1156	/*
1157	* get_huge_zero_page() will never allocate a new page here,
1158	* since we already have a zero page to copy. It just takes a
1159	* reference.
1160	*/
1161	mm_get_huge_zero_page(mm: dst_mm);
1162	goto out_zero_page;
1163	}
1164
1165	src_page = pmd_page(pmd);
1166	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1167
1168	get_page(page: src_page);
1169	if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
1170	/ Page maybe pinned: split and retry the fault on PTEs. /
1171	put_page(page: src_page);
1172	pte_free(mm: dst_mm, pte_page: pgtable);
1173	spin_unlock(lock: src_ptl);
1174	spin_unlock(lock: dst_ptl);
1175	__split_huge_pmd(vma: src_vma, pmd: src_pmd, address: addr, freeze: false, NULL);
1176	return -EAGAIN;
1177	}
1178	add_mm_counter(mm: dst_mm, member: MM_ANONPAGES, HPAGE_PMD_NR);
1179	out_zero_page:
1180	mm_inc_nr_ptes(mm: dst_mm);
1181	pgtable_trans_huge_deposit(mm: dst_mm, pmdp: dst_pmd, pgtable);
1182	pmdp_set_wrprotect(mm: src_mm, addr, pmdp: src_pmd);
1183	if (!userfaultfd_wp(vma: dst_vma))
1184	pmd = pmd_clear_uffd_wp(pmd);
1185	pmd = pmd_mkold(pmd: pmd_wrprotect(pmd));
1186	set_pmd_at(mm: dst_mm, addr, pmdp: dst_pmd, pmd);
1187
1188	ret = `0`;
1189	out_unlock:
1190	spin_unlock(lock: src_ptl);
1191	spin_unlock(lock: dst_ptl);
1192	out:
1193	return ret;
1194	}
1195
1196	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1197	static void touch_pud(struct vm_area_struct vma, unsigned* long addr,
1198	pud_t *pud, bool write)
1199	{
1200	pud_t _pud;
1201
1202	_pud = pud_mkyoung(pud: *pud);
1203	if (write)
1204	_pud = pud_mkdirty(pud: _pud);
1205	if (pudp_set_access_flags(vma, address: addr & HPAGE_PUD_MASK,
1206	pudp: pud, entry: _pud, dirty: write))
1207	update_mmu_cache_pud(vma, addr, pud);
1208	}
1209
1210	struct page follow_devmap_pud(struct* vm_area_struct vma, unsigned* long addr,
1211	pud_t pud, int* flags, struct dev_pagemap **pgmap)
1212	{
1213	unsigned long pfn = pud_pfn(pud: *pud);
1214	struct mm_struct *mm = vma->vm_mm;
1215	struct page *page;
1216	int ret;
1217
1218	assert_spin_locked(pud_lockptr(mm, pud));
1219
1220	if (flags & FOLL_WRITE && !pud_write(pud: *pud))
1221	return NULL;
1222
1223	if (pud_present(pud: pud) && pud_devmap(pud: pud))
1224	/ pass /;
1225	else
1226	return NULL;
1227
1228	if (flags & FOLL_TOUCH)
1229	touch_pud(vma, addr, pud, write: flags & FOLL_WRITE);
1230
1231	/*
1232	* device mapped pages can only be returned if the
1233	* caller will manage the page reference count.
1234	*
1235	* At least one of FOLL_GET \| FOLL_PIN must be set, so assert that here:
1236	*/
1237	if (!(flags & (FOLL_GET \| FOLL_PIN)))
1238	return ERR_PTR(error: -EEXIST);
1239
1240	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
1241	pgmap = get_dev_pagemap(pfn, pgmap: pgmap);
1242	if (!*pgmap)
1243	return ERR_PTR(error: -EFAULT);
1244	page = pfn_to_page(pfn);
1245
1246	ret = try_grab_page(page, flags);
1247	if (ret)
1248	page = ERR_PTR(error: ret);
1249
1250	return page;
1251	}
1252
1253	int copy_huge_pud(struct mm_struct dst_mm, struct* mm_struct *src_mm,
1254	pud_t dst_pud, pud_t src_pud, unsigned long addr,
1255	struct vm_area_struct *vma)
1256	{
1257	spinlock_t dst_ptl, src_ptl;
1258	pud_t pud;
1259	int ret;
1260
1261	dst_ptl = pud_lock(mm: dst_mm, pud: dst_pud);
1262	src_ptl = pud_lockptr(mm: src_mm, pud: src_pud);
1263	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1264
1265	ret = -EAGAIN;
1266	pud = *src_pud;
1267	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1268	goto out_unlock;
1269
1270	/*
1271	* When page table lock is held, the huge zero pud should not be
1272	* under splitting since we don't split the page itself, only pud to
1273	* a page table.
1274	*/
1275	if (is_huge_zero_pud(pud)) {
1276	/ No huge zero pud yet /
1277	}
1278
1279	/*
1280	* TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
1281	* and split if duplicating fails.
1282	*/
1283	pudp_set_wrprotect(mm: src_mm, address: addr, pudp: src_pud);
1284	pud = pud_mkold(pud: pud_wrprotect(pud));
1285	set_pud_at(mm: dst_mm, addr, pudp: dst_pud, pud);
1286
1287	ret = `0`;
1288	out_unlock:
1289	spin_unlock(lock: src_ptl);
1290	spin_unlock(lock: dst_ptl);
1291	return ret;
1292	}
1293
1294	void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1295	{
1296	bool write = vmf->flags & FAULT_FLAG_WRITE;
1297
1298	vmf->ptl = pud_lock(mm: vmf->vma->vm_mm, pud: vmf->pud);
1299	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1300	goto unlock;
1301
1302	touch_pud(vma: vmf->vma, addr: vmf->address, pud: vmf->pud, write);
1303	unlock:
1304	spin_unlock(lock: vmf->ptl);
1305	}
1306	#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1307
1308	void huge_pmd_set_accessed(struct vm_fault *vmf)
1309	{
1310	bool write = vmf->flags & FAULT_FLAG_WRITE;
1311
1312	vmf->ptl = pmd_lock(mm: vmf->vma->vm_mm, pmd: vmf->pmd);
1313	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1314	goto unlock;
1315
1316	touch_pmd(vma: vmf->vma, addr: vmf->address, pmd: vmf->pmd, write);
1317
1318	unlock:
1319	spin_unlock(lock: vmf->ptl);
1320	}
1321
1322	vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
1323	{
1324	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
1325	struct vm_area_struct *vma = vmf->vma;
1326	struct folio *folio;
1327	struct page *page;
1328	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1329	pmd_t orig_pmd = vmf->orig_pmd;
1330
1331	vmf->ptl = pmd_lockptr(mm: vma->vm_mm, pmd: vmf->pmd);
1332	VM_BUG_ON_VMA(!vma->anon_vma, vma);
1333
1334	if (is_huge_zero_pmd(pmd: orig_pmd))
1335	goto fallback;
1336
1337	spin_lock(lock: vmf->ptl);
1338
1339	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1340	spin_unlock(lock: vmf->ptl);
1341	return `0`;
1342	}
1343
1344	page = pmd_page(orig_pmd);
1345	folio = page_folio(page);
1346	VM_BUG_ON_PAGE(!PageHead(page), page);
1347
1348	/ Early check when only holding the PT lock. /
1349	if (PageAnonExclusive(page))
1350	goto reuse;
1351
1352	if (!folio_trylock(folio)) {
1353	folio_get(folio);
1354	spin_unlock(lock: vmf->ptl);
1355	folio_lock(folio);
1356	spin_lock(lock: vmf->ptl);
1357	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1358	spin_unlock(lock: vmf->ptl);
1359	folio_unlock(folio);
1360	folio_put(folio);
1361	return `0`;
1362	}
1363	folio_put(folio);
1364	}
1365
1366	/ Recheck after temporarily dropping the PT lock. /
1367	if (PageAnonExclusive(page)) {
1368	folio_unlock(folio);
1369	goto reuse;
1370	}
1371
1372	/*
1373	* See do_wp_page(): we can only reuse the folio exclusively if
1374	* there are no additional references. Note that we always drain
1375	* the LRU cache immediately after adding a THP.
1376	*/
1377	if (folio_ref_count(folio) >
1378	`1` + folio_test_swapcache(folio) * folio_nr_pages(folio))
1379	goto unlock_fallback;
1380	if (folio_test_swapcache(folio))
1381	folio_free_swap(folio);
1382	if (folio_ref_count(folio) == `1`) {
1383	pmd_t entry;
1384
1385	folio_move_anon_rmap(folio, vma);
1386	SetPageAnonExclusive(page);
1387	folio_unlock(folio);
1388	reuse:
1389	if (unlikely(unshare)) {
1390	spin_unlock(lock: vmf->ptl);
1391	return `0`;
1392	}
1393	entry = pmd_mkyoung(pmd: orig_pmd);
1394	entry = maybe_pmd_mkwrite(pmd: pmd_mkdirty(pmd: entry), vma);
1395	if (pmdp_set_access_flags(vma, address: haddr, pmdp: vmf->pmd, entry, dirty: `1`))
1396	update_mmu_cache_pmd(vma, addr: vmf->address, pmd: vmf->pmd);
1397	spin_unlock(lock: vmf->ptl);
1398	return `0`;
1399	}
1400
1401	unlock_fallback:
1402	folio_unlock(folio);
1403	spin_unlock(lock: vmf->ptl);
1404	fallback:
1405	__split_huge_pmd(vma, pmd: vmf->pmd, address: vmf->address, freeze: false, NULL);
1406	return VM_FAULT_FALLBACK;
1407	}
1408
1409	static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1410	unsigned long addr, pmd_t pmd)
1411	{
1412	struct page *page;
1413
1414	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1415	return false;
1416
1417	/ Don't touch entries that are not even readable (NUMA hinting). /
1418	if (pmd_protnone(pmd))
1419	return false;
1420
1421	/ Do we need write faults for softdirty tracking? /
1422	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
1423	return false;
1424
1425	/ Do we need write faults for uffd-wp tracking? /
1426	if (userfaultfd_huge_pmd_wp(vma, pmd))
1427	return false;
1428
1429	if (!(vma->vm_flags & VM_SHARED)) {
1430	/ See can_change_pte_writable(). /
1431	page = vm_normal_page_pmd(vma, addr, pmd);
1432	return page && PageAnon(page) && PageAnonExclusive(page);
1433	}
1434
1435	/ See can_change_pte_writable(). /
1436	return pmd_dirty(pmd);
1437	}
1438
1439	/ FOLL_FORCE can write to even unwritable PMDs in COW mappings. /
1440	static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
1441	struct vm_area_struct *vma,
1442	unsigned int flags)
1443	{
1444	/ If the pmd is writable, we can write to the page. /
1445	if (pmd_write(pmd))
1446	return true;
1447
1448	/ Maybe FOLL_FORCE is set to override it? /
1449	if (!(flags & FOLL_FORCE))
1450	return false;
1451
1452	/ But FOLL_FORCE has no effect on shared mappings /
1453	if (vma->vm_flags & (VM_MAYSHARE \| VM_SHARED))
1454	return false;
1455
1456	/ ... or read-only private ones /
1457	if (!(vma->vm_flags & VM_MAYWRITE))
1458	return false;
1459
1460	/ ... or already writable ones that just need to take a write fault /
1461	if (vma->vm_flags & VM_WRITE)
1462	return false;
1463
1464	/*
1465	* See can_change_pte_writable(): we broke COW and could map the page
1466	* writable if we have an exclusive anonymous page ...
1467	*/
1468	if (!page \|\| !PageAnon(page) \|\| !PageAnonExclusive(page))
1469	return false;
1470
1471	/ ... and a write-fault isn't required for other reasons. /
1472	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
1473	return false;
1474	return !userfaultfd_huge_pmd_wp(vma, pmd);
1475	}
1476
1477	struct page follow_trans_huge_pmd(struct* vm_area_struct *vma,
1478	unsigned long addr,
1479	pmd_t *pmd,
1480	unsigned int flags)
1481	{
1482	struct mm_struct *mm = vma->vm_mm;
1483	struct page *page;
1484	int ret;
1485
1486	assert_spin_locked(pmd_lockptr(mm, pmd));
1487
1488	page = pmd_page(*pmd);
1489	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
1490
1491	if ((flags & FOLL_WRITE) &&
1492	!can_follow_write_pmd(pmd: *pmd, page, vma, flags))
1493	return NULL;
1494
1495	/ Avoid dumping huge zero page /
1496	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmd: *pmd))
1497	return ERR_PTR(error: -EFAULT);
1498
1499	if (pmd_protnone(pmd: *pmd) && !gup_can_follow_protnone(vma, flags))
1500	return NULL;
1501
1502	if (!pmd_write(pmd: *pmd) && gup_must_unshare(vma, flags, page))
1503	return ERR_PTR(error: -EMLINK);
1504
1505	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
1506	!PageAnonExclusive(page), page);
1507
1508	ret = try_grab_page(page, flags);
1509	if (ret)
1510	return ERR_PTR(error: ret);
1511
1512	if (flags & FOLL_TOUCH)
1513	touch_pmd(vma, addr, pmd, write: flags & FOLL_WRITE);
1514
1515	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1516	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
1517
1518	return page;
1519	}
1520
1521	/ NUMA hinting page fault entry point for trans huge pmds /
1522	vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
1523	{
1524	struct vm_area_struct *vma = vmf->vma;
1525	pmd_t oldpmd = vmf->orig_pmd;
1526	pmd_t pmd;
1527	struct folio *folio;
1528	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1529	int nid = NUMA_NO_NODE;
1530	int target_nid, last_cpupid = (-`1` & LAST_CPUPID_MASK);
1531	bool migrated = false, writable = false;
1532	int flags = `0`;
1533
1534	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
1535	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
1536	spin_unlock(lock: vmf->ptl);
1537	goto out;
1538	}
1539
1540	pmd = pmd_modify(pmd: oldpmd, newprot: vma->vm_page_prot);
1541
1542	/*
1543	* Detect now whether the PMD could be writable; this information
1544	* is only valid while holding the PT lock.
1545	*/
1546	writable = pmd_write(pmd);
1547	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
1548	can_change_pmd_writable(vma, addr: vmf->address, pmd))
1549	writable = true;
1550
1551	folio = vm_normal_folio_pmd(vma, addr: haddr, pmd);
1552	if (!folio)
1553	goto out_map;
1554
1555	/ See similar comment in do_numa_page for explanation /
1556	if (!writable)
1557	flags \|= TNF_NO_GROUP;
1558
1559	nid = folio_nid(folio);
1560	/*
1561	* For memory tiering mode, cpupid of slow memory page is used
1562	* to record page access time. So use default value.
1563	*/
1564	if (node_is_toptier(node: nid))
1565	last_cpupid = folio_last_cpupid(folio);
1566	target_nid = numa_migrate_prep(folio, vma, addr: haddr, page_nid: nid, flags: &flags);
1567	if (target_nid == NUMA_NO_NODE) {
1568	folio_put(folio);
1569	goto out_map;
1570	}
1571
1572	spin_unlock(lock: vmf->ptl);
1573	writable = false;
1574
1575	migrated = migrate_misplaced_folio(folio, vma, node: target_nid);
1576	if (migrated) {
1577	flags \|= TNF_MIGRATED;
1578	nid = target_nid;
1579	} else {
1580	flags \|= TNF_MIGRATE_FAIL;
1581	vmf->ptl = pmd_lock(mm: vma->vm_mm, pmd: vmf->pmd);
1582	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
1583	spin_unlock(lock: vmf->ptl);
1584	goto out;
1585	}
1586	goto out_map;
1587	}
1588
1589	out:
1590	if (nid != NUMA_NO_NODE)
1591	task_numa_fault(last_node: last_cpupid, node: nid, HPAGE_PMD_NR, flags);
1592
1593	return `0`;
1594
1595	out_map:
1596	/ Restore the PMD /
1597	pmd = pmd_modify(pmd: oldpmd, newprot: vma->vm_page_prot);
1598	pmd = pmd_mkyoung(pmd);
1599	if (writable)
1600	pmd = pmd_mkwrite(pmd, vma);
1601	set_pmd_at(mm: vma->vm_mm, addr: haddr, pmdp: vmf->pmd, pmd);
1602	update_mmu_cache_pmd(vma, addr: vmf->address, pmd: vmf->pmd);
1603	spin_unlock(lock: vmf->ptl);
1604	goto out;
1605	}
1606
1607	/*
1608	* Return true if we do MADV_FREE successfully on entire pmd page.
1609	* Otherwise, return false.
1610	*/
1611	bool madvise_free_huge_pmd(struct mmu_gather tlb, struct* vm_area_struct *vma,
1612	pmd_t pmd, unsigned* long addr, unsigned long next)
1613	{
1614	spinlock_t *ptl;
1615	pmd_t orig_pmd;
1616	struct folio *folio;
1617	struct mm_struct *mm = tlb->mm;
1618	bool ret = false;
1619
1620	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1621
1622	ptl = pmd_trans_huge_lock(pmd, vma);
1623	if (!ptl)
1624	goto out_unlocked;
1625
1626	orig_pmd = *pmd;
1627	if (is_huge_zero_pmd(pmd: orig_pmd))
1628	goto out;
1629
1630	if (unlikely(!pmd_present(orig_pmd))) {
1631	VM_BUG_ON(thp_migration_supported() &&
1632	!is_pmd_migration_entry(orig_pmd));
1633	goto out;
1634	}
1635
1636	folio = pfn_folio(pfn: pmd_pfn(pmd: orig_pmd));
1637	/*
1638	* If other processes are mapping this folio, we couldn't discard
1639	* the folio unless they all do MADV_FREE so let's skip the folio.
1640	*/
1641	if (folio_estimated_sharers(folio) != `1`)
1642	goto out;
1643
1644	if (!folio_trylock(folio))
1645	goto out;
1646
1647	/*
1648	* If user want to discard part-pages of THP, split it so MADV_FREE
1649	* will deactivate only them.
1650	*/
1651	if (next - addr != HPAGE_PMD_SIZE) {
1652	folio_get(folio);
1653	spin_unlock(lock: ptl);
1654	split_folio(folio);
1655	folio_unlock(folio);
1656	folio_put(folio);
1657	goto out_unlocked;
1658	}
1659
1660	if (folio_test_dirty(folio))
1661	folio_clear_dirty(folio);
1662	folio_unlock(folio);
1663
1664	if (pmd_young(pmd: orig_pmd) \|\| pmd_dirty(pmd: orig_pmd)) {
1665	pmdp_invalidate(vma, address: addr, pmdp: pmd);
1666	orig_pmd = pmd_mkold(pmd: orig_pmd);
1667	orig_pmd = pmd_mkclean(pmd: orig_pmd);
1668
1669	set_pmd_at(mm, addr, pmdp: pmd, pmd: orig_pmd);
1670	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1671	}
1672
1673	folio_mark_lazyfree(folio);
1674	ret = true;
1675	out:
1676	spin_unlock(lock: ptl);
1677	out_unlocked:
1678	return ret;
1679	}
1680
1681	static inline void zap_deposited_table(struct mm_struct mm, pmd_t pmd)
1682	{
1683	pgtable_t pgtable;
1684
1685	pgtable = pgtable_trans_huge_withdraw(mm, pmdp: pmd);
1686	pte_free(mm, pte_page: pgtable);
1687	mm_dec_nr_ptes(mm);
1688	}
1689
1690	int zap_huge_pmd(struct mmu_gather tlb, struct* vm_area_struct *vma,
1691	pmd_t pmd, unsigned* long addr)
1692	{
1693	pmd_t orig_pmd;
1694	spinlock_t *ptl;
1695
1696	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1697
1698	ptl = __pmd_trans_huge_lock(pmd, vma);
1699	if (!ptl)
1700	return `0`;
1701	/*
1702	* For architectures like ppc64 we look at deposited pgtable
1703	* when calling pmdp_huge_get_and_clear. So do the
1704	* pgtable_trans_huge_withdraw after finishing pmdp related
1705	* operations.
1706	*/
1707	orig_pmd = pmdp_huge_get_and_clear_full(vma, address: addr, pmdp: pmd,
1708	full: tlb->fullmm);
1709	arch_check_zapped_pmd(vma, pmd: orig_pmd);
1710	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1711	if (vma_is_special_huge(vma)) {
1712	if (arch_needs_pgtable_deposit())
1713	zap_deposited_table(mm: tlb->mm, pmd);
1714	spin_unlock(lock: ptl);
1715	} else if (is_huge_zero_pmd(pmd: orig_pmd)) {
1716	zap_deposited_table(mm: tlb->mm, pmd);
1717	spin_unlock(lock: ptl);
1718	} else {
1719	struct page *page = NULL;
1720	int flush_needed = `1`;
1721
1722	if (pmd_present(pmd: orig_pmd)) {
1723	page = pmd_page(orig_pmd);
1724	page_remove_rmap(page, vma, compound: true);
1725	VM_BUG_ON_PAGE(page_mapcount(page) < `0`, page);
1726	VM_BUG_ON_PAGE(!PageHead(page), page);
1727	} else if (thp_migration_supported()) {
1728	swp_entry_t entry;
1729
1730	VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1731	entry = pmd_to_swp_entry(pmd: orig_pmd);
1732	page = pfn_swap_entry_to_page(entry);
1733	flush_needed = `0`;
1734	} else
1735	WARN_ONCE(`1`, "Non present huge pmd without pmd migration enabled!");
1736
1737	if (PageAnon(page)) {
1738	zap_deposited_table(mm: tlb->mm, pmd);
1739	add_mm_counter(mm: tlb->mm, member: MM_ANONPAGES, value: -HPAGE_PMD_NR);
1740	} else {
1741	if (arch_needs_pgtable_deposit())
1742	zap_deposited_table(mm: tlb->mm, pmd);
1743	add_mm_counter(mm: tlb->mm, member: mm_counter_file(page), value: -HPAGE_PMD_NR);
1744	}
1745
1746	spin_unlock(lock: ptl);
1747	if (flush_needed)
1748	tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
1749	}
1750	return `1`;
1751	}
1752
1753	#ifndef pmd_move_must_withdraw
1754	static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1755	spinlock_t *old_pmd_ptl,
1756	struct vm_area_struct *vma)
1757	{
1758	/*
1759	* With split pmd lock we also need to move preallocated
1760	* PTE page table if new_pmd is on different PMD page table.
1761	*
1762	* We also don't deposit and withdraw tables for file pages.
1763	*/
1764	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1765	}
1766	#endif
1767
1768	static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1769	{
1770	#ifdef CONFIG_MEM_SOFT_DIRTY
1771	if (unlikely(is_pmd_migration_entry(pmd)))
1772	pmd = pmd_swp_mksoft_dirty(pmd);
1773	else if (pmd_present(pmd))
1774	pmd = pmd_mksoft_dirty(pmd);
1775	#endif
1776	return pmd;
1777	}
1778
1779	bool move_huge_pmd(struct vm_area_struct vma, unsigned* long old_addr,
1780	unsigned long new_addr, pmd_t old_pmd, pmd_t new_pmd)
1781	{
1782	spinlock_t old_ptl, new_ptl;
1783	pmd_t pmd;
1784	struct mm_struct *mm = vma->vm_mm;
1785	bool force_flush = false;
1786
1787	/*
1788	* The destination pmd shouldn't be established, free_pgtables()
1789	* should have released it; but move_page_tables() might have already
1790	* inserted a page table, if racing against shmem/file collapse.
1791	*/
1792	if (!pmd_none(pmd: *new_pmd)) {
1793	VM_BUG_ON(pmd_trans_huge(*new_pmd));
1794	return false;
1795	}
1796
1797	/*
1798	* We don't have to worry about the ordering of src and dst
1799	* ptlocks because exclusive mmap_lock prevents deadlock.
1800	*/
1801	old_ptl = __pmd_trans_huge_lock(pmd: old_pmd, vma);
1802	if (old_ptl) {
1803	new_ptl = pmd_lockptr(mm, pmd: new_pmd);
1804	if (new_ptl != old_ptl)
1805	spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1806	pmd = pmdp_huge_get_and_clear(mm, addr: old_addr, pmdp: old_pmd);
1807	if (pmd_present(pmd))
1808	force_flush = true;
1809	VM_BUG_ON(!pmd_none(*new_pmd));
1810
1811	if (pmd_move_must_withdraw(new_pmd_ptl: new_ptl, old_pmd_ptl: old_ptl, vma)) {
1812	pgtable_t pgtable;
1813	pgtable = pgtable_trans_huge_withdraw(mm, pmdp: old_pmd);
1814	pgtable_trans_huge_deposit(mm, pmdp: new_pmd, pgtable);
1815	}
1816	pmd = move_soft_dirty_pmd(pmd);
1817	set_pmd_at(mm, addr: new_addr, pmdp: new_pmd, pmd);
1818	if (force_flush)
1819	flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1820	if (new_ptl != old_ptl)
1821	spin_unlock(lock: new_ptl);
1822	spin_unlock(lock: old_ptl);
1823	return true;
1824	}
1825	return false;
1826	}
1827
1828	/*
1829	* Returns
1830	* - 0 if PMD could not be locked
1831	* - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
1832	* or if prot_numa but THP migration is not supported
1833	* - HPAGE_PMD_NR if protections changed and TLB flush necessary
1834	*/
1835	int change_huge_pmd(struct mmu_gather tlb, struct* vm_area_struct *vma,
1836	pmd_t pmd, unsigned* long addr, pgprot_t newprot,
1837	unsigned long cp_flags)
1838	{
1839	struct mm_struct *mm = vma->vm_mm;
1840	spinlock_t *ptl;
1841	pmd_t oldpmd, entry;
1842	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
1843	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
1844	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
1845	int ret = `1`;
1846
1847	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1848
1849	if (prot_numa && !thp_migration_supported())
1850	return `1`;
1851
1852	ptl = __pmd_trans_huge_lock(pmd, vma);
1853	if (!ptl)
1854	return `0`;
1855
1856	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1857	if (is_swap_pmd(pmd: *pmd)) {
1858	swp_entry_t entry = pmd_to_swp_entry(pmd: *pmd);
1859	struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
1860	pmd_t newpmd;
1861
1862	VM_BUG_ON(!is_pmd_migration_entry(*pmd));
1863	if (is_writable_migration_entry(entry)) {
1864	/*
1865	* A protection check is difficult so
1866	* just be safe and disable write
1867	*/
1868	if (folio_test_anon(folio))
1869	entry = make_readable_exclusive_migration_entry(offset: swp_offset(entry));
1870	else
1871	entry = make_readable_migration_entry(offset: swp_offset(entry));
1872	newpmd = swp_entry_to_pmd(entry);
1873	if (pmd_swp_soft_dirty(pmd: *pmd))
1874	newpmd = pmd_swp_mksoft_dirty(pmd: newpmd);
1875	} else {
1876	newpmd = *pmd;
1877	}
1878
1879	if (uffd_wp)
1880	newpmd = pmd_swp_mkuffd_wp(pmd: newpmd);
1881	else if (uffd_wp_resolve)
1882	newpmd = pmd_swp_clear_uffd_wp(pmd: newpmd);
1883	if (!pmd_same(pmd_a: *pmd, pmd_b: newpmd))
1884	set_pmd_at(mm, addr, pmdp: pmd, pmd: newpmd);
1885	goto unlock;
1886	}
1887	#endif
1888
1889	if (prot_numa) {
1890	struct folio *folio;
1891	bool toptier;
1892	/*
1893	* Avoid trapping faults against the zero page. The read-only
1894	* data is likely to be read-cached on the local CPU and
1895	* local/remote hits to the zero page are not interesting.
1896	*/
1897	if (is_huge_zero_pmd(pmd: *pmd))
1898	goto unlock;
1899
1900	if (pmd_protnone(pmd: *pmd))
1901	goto unlock;
1902
1903	folio = page_folio(pmd_page(*pmd));
1904	toptier = node_is_toptier(node: folio_nid(folio));
1905	/*
1906	* Skip scanning top tier node if normal numa
1907	* balancing is disabled
1908	*/
1909	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
1910	toptier)
1911	goto unlock;
1912
1913	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
1914	!toptier)
1915	folio_xchg_access_time(folio,
1916	time: jiffies_to_msecs(j: jiffies));
1917	}
1918	/*
1919	* In case prot_numa, we are under mmap_read_lock(mm). It's critical
1920	* to not clear pmd intermittently to avoid race with MADV_DONTNEED
1921	* which is also under mmap_read_lock(mm):
1922	*
1923	* CPU0: CPU1:
1924	* change_huge_pmd(prot_numa=1)
1925	* pmdp_huge_get_and_clear_notify()
1926	* madvise_dontneed()
1927	* zap_pmd_range()
1928	* pmd_trans_huge(*pmd) == 0 (without ptl)
1929	* // skip the pmd
1930	* set_pmd_at();
1931	* // pmd is re-established
1932	*
1933	* The race makes MADV_DONTNEED miss the huge pmd and don't clear it
1934	* which may break userspace.
1935	*
1936	* pmdp_invalidate_ad() is required to make sure we don't miss
1937	* dirty/young flags set by hardware.
1938	*/
1939	oldpmd = pmdp_invalidate_ad(vma, address: addr, pmdp: pmd);
1940
1941	entry = pmd_modify(pmd: oldpmd, newprot);
1942	if (uffd_wp)
1943	entry = pmd_mkuffd_wp(pmd: entry);
1944	else if (uffd_wp_resolve)
1945	/*
1946	* Leave the write bit to be handled by PF interrupt
1947	* handler, then things like COW could be properly
1948	* handled.
1949	*/
1950	entry = pmd_clear_uffd_wp(pmd: entry);
1951
1952	/ See change_pte_range(). /
1953	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(pmd: entry) &&
1954	can_change_pmd_writable(vma, addr, pmd: entry))
1955	entry = pmd_mkwrite(pmd: entry, vma);
1956
1957	ret = HPAGE_PMD_NR;
1958	set_pmd_at(mm, addr, pmdp: pmd, pmd: entry);
1959
1960	if (huge_pmd_needs_flush(oldpmd, newpmd: entry))
1961	tlb_flush_pmd_range(tlb, address: addr, HPAGE_PMD_SIZE);
1962	unlock:
1963	spin_unlock(lock: ptl);
1964	return ret;
1965	}
1966
1967	/*
1968	* Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
1969	*
1970	* Note that if it returns page table lock pointer, this routine returns without
1971	* unlocking page table lock. So callers must unlock it.
1972	*/
1973	spinlock_t __pmd_trans_huge_lock(pmd_t pmd, struct vm_area_struct *vma)
1974	{
1975	spinlock_t *ptl;
1976	ptl = pmd_lock(mm: vma->vm_mm, pmd);
1977	if (likely(is_swap_pmd(pmd) \|\| pmd_trans_huge(pmd) \|\|
1978	pmd_devmap(*pmd)))
1979	return ptl;
1980	spin_unlock(lock: ptl);
1981	return NULL;
1982	}
1983
1984	/*
1985	* Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
1986	*
1987	* Note that if it returns page table lock pointer, this routine returns without
1988	* unlocking page table lock. So callers must unlock it.
1989	*/
1990	spinlock_t __pud_trans_huge_lock(pud_t pud, struct vm_area_struct *vma)
1991	{
1992	spinlock_t *ptl;
1993
1994	ptl = pud_lock(mm: vma->vm_mm, pud);
1995	if (likely(pud_trans_huge(pud) \|\| pud_devmap(pud)))
1996	return ptl;
1997	spin_unlock(lock: ptl);
1998	return NULL;
1999	}
2000
2001	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2002	int zap_huge_pud(struct mmu_gather tlb, struct* vm_area_struct *vma,
2003	pud_t pud, unsigned* long addr)
2004	{
2005	spinlock_t *ptl;
2006
2007	ptl = __pud_trans_huge_lock(pud, vma);
2008	if (!ptl)
2009	return `0`;
2010
2011	pudp_huge_get_and_clear_full(vma, address: addr, pudp: pud, full: tlb->fullmm);
2012	tlb_remove_pud_tlb_entry(tlb, pud, addr);
2013	if (vma_is_special_huge(vma)) {
2014	spin_unlock(lock: ptl);
2015	/ No zero page support yet /
2016	} else {
2017	/ No support for anonymous PUD pages yet /
2018	BUG();
2019	}
2020	return `1`;
2021	}
2022
2023	static void __split_huge_pud_locked(struct vm_area_struct vma, pud_t pud,
2024	unsigned long haddr)
2025	{
2026	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2027	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2028	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2029	VM_BUG_ON(!pud_trans_huge(pud) && !pud_devmap(pud));
2030
2031	count_vm_event(item: THP_SPLIT_PUD);
2032
2033	pudp_huge_clear_flush(vma, address: haddr, pudp: pud);
2034	}
2035
2036	void __split_huge_pud(struct vm_area_struct vma, pud_t pud,
2037	unsigned long address)
2038	{
2039	spinlock_t *ptl;
2040	struct mmu_notifier_range range;
2041
2042	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
2043	start: address & HPAGE_PUD_MASK,
2044	end: (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2045	mmu_notifier_invalidate_range_start(range: &range);
2046	ptl = pud_lock(mm: vma->vm_mm, pud);
2047	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
2048	goto out;
2049	__split_huge_pud_locked(vma, pud, haddr: range.start);
2050
2051	out:
2052	spin_unlock(lock: ptl);
2053	mmu_notifier_invalidate_range_end(range: &range);
2054	}
2055	#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2056
2057	static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2058	unsigned long haddr, pmd_t *pmd)
2059	{
2060	struct mm_struct *mm = vma->vm_mm;
2061	pgtable_t pgtable;
2062	pmd_t _pmd, old_pmd;
2063	unsigned long addr;
2064	pte_t *pte;
2065	int i;
2066
2067	/*
2068	* Leave pmd empty until pte is filled note that it is fine to delay
2069	* notification until mmu_notifier_invalidate_range_end() as we are
2070	* replacing a zero pmd write protected page with a zero pte write
2071	* protected page.
2072	*
2073	* See Documentation/mm/mmu_notifier.rst
2074	*/
2075	old_pmd = pmdp_huge_clear_flush(vma, address: haddr, pmdp: pmd);
2076
2077	pgtable = pgtable_trans_huge_withdraw(mm, pmdp: pmd);
2078	pmd_populate(mm, pmd: &_pmd, pte: pgtable);
2079
2080	pte = pte_offset_map(pmd: &_pmd, addr: haddr);
2081	VM_BUG_ON(!pte);
2082	for (i = `0`, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2083	pte_t entry;
2084
2085	entry = pfn_pte(page_nr: my_zero_pfn(addr), pgprot: vma->vm_page_prot);
2086	entry = pte_mkspecial(pte: entry);
2087	if (pmd_uffd_wp(pmd: old_pmd))
2088	entry = pte_mkuffd_wp(pte: entry);
2089	VM_BUG_ON(!pte_none(ptep_get(pte)));
2090	set_pte_at(mm, addr, pte, entry);
2091	pte++;
2092	}
2093	pte_unmap(pte: pte - `1`);
2094	smp_wmb(); / make pte visible before pmd /
2095	pmd_populate(mm, pmd, pte: pgtable);
2096	}
2097
2098	static void __split_huge_pmd_locked(struct vm_area_struct vma, pmd_t pmd,
2099	unsigned long haddr, bool freeze)
2100	{
2101	struct mm_struct *mm = vma->vm_mm;
2102	struct page *page;
2103	pgtable_t pgtable;
2104	pmd_t old_pmd, _pmd;
2105	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2106	bool anon_exclusive = false, dirty = false;
2107	unsigned long addr;
2108	pte_t *pte;
2109	int i;
2110
2111	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2112	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2113	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2114	VM_BUG_ON(!is_pmd_migration_entry(pmd) && !pmd_trans_huge(pmd)
2115	&& !pmd_devmap(*pmd));
2116
2117	count_vm_event(item: THP_SPLIT_PMD);
2118
2119	if (!vma_is_anonymous(vma)) {
2120	old_pmd = pmdp_huge_clear_flush(vma, address: haddr, pmdp: pmd);
2121	/*
2122	* We are going to unmap this huge page. So
2123	* just go ahead and zap it
2124	*/
2125	if (arch_needs_pgtable_deposit())
2126	zap_deposited_table(mm, pmd);
2127	if (vma_is_special_huge(vma))
2128	return;
2129	if (unlikely(is_pmd_migration_entry(old_pmd))) {
2130	swp_entry_t entry;
2131
2132	entry = pmd_to_swp_entry(pmd: old_pmd);
2133	page = pfn_swap_entry_to_page(entry);
2134	} else {
2135	page = pmd_page(old_pmd);
2136	if (!PageDirty(page) && pmd_dirty(pmd: old_pmd))
2137	set_page_dirty(page);
2138	if (!PageReferenced(page) && pmd_young(pmd: old_pmd))
2139	SetPageReferenced(page);
2140	page_remove_rmap(page, vma, compound: true);
2141	put_page(page);
2142	}
2143	add_mm_counter(mm, member: mm_counter_file(page), value: -HPAGE_PMD_NR);
2144	return;
2145	}
2146
2147	if (is_huge_zero_pmd(pmd: *pmd)) {
2148	/*
2149	* FIXME: Do we want to invalidate secondary mmu by calling
2150	* mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
2151	* inside __split_huge_pmd() ?
2152	*
2153	* We are going from a zero huge page write protected to zero
2154	* small page also write protected so it does not seems useful
2155	* to invalidate secondary mmu at this time.
2156	*/
2157	return __split_huge_zero_page_pmd(vma, haddr, pmd);
2158	}
2159
2160	/*
2161	* Up to this point the pmd is present and huge and userland has the
2162	* whole access to the hugepage during the split (which happens in
2163	* place). If we overwrite the pmd with the not-huge version pointing
2164	* to the pte here (which of course we could if all CPUs were bug
2165	* free), userland could trigger a small page size TLB miss on the
2166	* small sized TLB while the hugepage TLB entry is still established in
2167	* the huge TLB. Some CPU doesn't like that.
2168	* See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
2169	* 383 on page 105. Intel should be safe but is also warns that it's
2170	* only safe if the permission and cache attributes of the two entries
2171	* loaded in the two TLB is identical (which should be the case here).
2172	* But it is generally safer to never allow small and huge TLB entries
2173	* for the same virtual address to be loaded simultaneously. So instead
2174	* of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
2175	* current pmd notpresent (atomically because here the pmd_trans_huge
2176	* must remain set at all times on the pmd until the split is complete
2177	* for this pmd), then we flush the SMP TLB and finally we write the
2178	* non-huge version of the pmd entry with pmd_populate.
2179	*/
2180	old_pmd = pmdp_invalidate(vma, address: haddr, pmdp: pmd);
2181
2182	pmd_migration = is_pmd_migration_entry(pmd: old_pmd);
2183	if (unlikely(pmd_migration)) {
2184	swp_entry_t entry;
2185
2186	entry = pmd_to_swp_entry(pmd: old_pmd);
2187	page = pfn_swap_entry_to_page(entry);
2188	write = is_writable_migration_entry(entry);
2189	if (PageAnon(page))
2190	anon_exclusive = is_readable_exclusive_migration_entry(entry);
2191	young = is_migration_entry_young(entry);
2192	dirty = is_migration_entry_dirty(entry);
2193	soft_dirty = pmd_swp_soft_dirty(pmd: old_pmd);
2194	uffd_wp = pmd_swp_uffd_wp(pmd: old_pmd);
2195	} else {
2196	page = pmd_page(old_pmd);
2197	if (pmd_dirty(pmd: old_pmd)) {
2198	dirty = true;
2199	SetPageDirty(page);
2200	}
2201	write = pmd_write(pmd: old_pmd);
2202	young = pmd_young(pmd: old_pmd);
2203	soft_dirty = pmd_soft_dirty(pmd: old_pmd);
2204	uffd_wp = pmd_uffd_wp(pmd: old_pmd);
2205
2206	VM_BUG_ON_PAGE(!page_count(page), page);
2207
2208	/*
2209	* Without "freeze", we'll simply split the PMD, propagating the
2210	* PageAnonExclusive() flag for each PTE by setting it for
2211	* each subpage -- no need to (temporarily) clear.
2212	*
2213	* With "freeze" we want to replace mapped pages by
2214	* migration entries right away. This is only possible if we
2215	* managed to clear PageAnonExclusive() -- see
2216	* set_pmd_migration_entry().
2217	*
2218	* In case we cannot clear PageAnonExclusive(), split the PMD
2219	* only and let try_to_migrate_one() fail later.
2220	*
2221	* See page_try_share_anon_rmap(): invalidate PMD first.
2222	*/
2223	anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
2224	if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
2225	freeze = false;
2226	if (!freeze)
2227	page_ref_add(page, HPAGE_PMD_NR - `1`);
2228	}
2229
2230	/*
2231	* Withdraw the table only after we mark the pmd entry invalid.
2232	* This's critical for some architectures (Power).
2233	*/
2234	pgtable = pgtable_trans_huge_withdraw(mm, pmdp: pmd);
2235	pmd_populate(mm, pmd: &_pmd, pte: pgtable);
2236
2237	pte = pte_offset_map(pmd: &_pmd, addr: haddr);
2238	VM_BUG_ON(!pte);
2239	for (i = `0`, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2240	pte_t entry;
2241	/*
2242	* Note that NUMA hinting access restrictions are not
2243	* transferred to avoid any possibility of altering
2244	* permissions across VMAs.
2245	*/
2246	if (freeze \|\| pmd_migration) {
2247	swp_entry_t swp_entry;
2248	if (write)
2249	swp_entry = make_writable_migration_entry(
2250	page_to_pfn(page + i));
2251	else if (anon_exclusive)
2252	swp_entry = make_readable_exclusive_migration_entry(
2253	page_to_pfn(page + i));
2254	else
2255	swp_entry = make_readable_migration_entry(
2256	page_to_pfn(page + i));
2257	if (young)
2258	swp_entry = make_migration_entry_young(entry: swp_entry);
2259	if (dirty)
2260	swp_entry = make_migration_entry_dirty(entry: swp_entry);
2261	entry = swp_entry_to_pte(entry: swp_entry);
2262	if (soft_dirty)
2263	entry = pte_swp_mksoft_dirty(pte: entry);
2264	if (uffd_wp)
2265	entry = pte_swp_mkuffd_wp(pte: entry);
2266	} else {
2267	entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
2268	if (write)
2269	entry = pte_mkwrite(pte: entry, vma);
2270	if (anon_exclusive)
2271	SetPageAnonExclusive(page + i);
2272	if (!young)
2273	entry = pte_mkold(pte: entry);
2274	/ NOTE: this may set soft-dirty too on some archs /
2275	if (dirty)
2276	entry = pte_mkdirty(pte: entry);
2277	if (soft_dirty)
2278	entry = pte_mksoft_dirty(pte: entry);
2279	if (uffd_wp)
2280	entry = pte_mkuffd_wp(pte: entry);
2281	page_add_anon_rmap(page + i, vma, address: addr, RMAP_NONE);
2282	}
2283	VM_BUG_ON(!pte_none(ptep_get(pte)));
2284	set_pte_at(mm, addr, pte, entry);
2285	pte++;
2286	}
2287	pte_unmap(pte: pte - `1`);
2288
2289	if (!pmd_migration)
2290	page_remove_rmap(page, vma, compound: true);
2291	if (freeze)
2292	put_page(page);
2293
2294	smp_wmb(); / make pte visible before pmd /
2295	pmd_populate(mm, pmd, pte: pgtable);
2296	}
2297
2298	void __split_huge_pmd(struct vm_area_struct vma, pmd_t pmd,
2299	unsigned long address, bool freeze, struct folio *folio)
2300	{
2301	spinlock_t *ptl;
2302	struct mmu_notifier_range range;
2303
2304	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
2305	start: address & HPAGE_PMD_MASK,
2306	end: (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2307	mmu_notifier_invalidate_range_start(range: &range);
2308	ptl = pmd_lock(mm: vma->vm_mm, pmd);
2309
2310	/*
2311	* If caller asks to setup a migration entry, we need a folio to check
2312	* pmd against. Otherwise we can end up replacing wrong folio.
2313	*/
2314	VM_BUG_ON(freeze && !folio);
2315	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
2316
2317	if (pmd_trans_huge(pmd: pmd) \|\| pmd_devmap(pmd: pmd) \|\|
2318	is_pmd_migration_entry(pmd: *pmd)) {
2319	/*
2320	* It's safe to call pmd_page when folio is set because it's
2321	* guaranteed that pmd is present.
2322	*/
2323	if (folio && folio != page_folio(pmd_page(*pmd)))
2324	goto out;
2325	__split_huge_pmd_locked(vma, pmd, haddr: range.start, freeze);
2326	}
2327
2328	out:
2329	spin_unlock(lock: ptl);
2330	mmu_notifier_invalidate_range_end(range: &range);
2331	}
2332
2333	void split_huge_pmd_address(struct vm_area_struct vma, unsigned* long address,
2334	bool freeze, struct folio *folio)
2335	{
2336	pmd_t *pmd = mm_find_pmd(mm: vma->vm_mm, address);
2337
2338	if (!pmd)
2339	return;
2340
2341	__split_huge_pmd(vma, pmd, address, freeze, folio);
2342	}
2343
2344	static inline void split_huge_pmd_if_needed(struct vm_area_struct vma, unsigned* long address)
2345	{
2346	/*
2347	* If the new address isn't hpage aligned and it could previously
2348	* contain an hugepage: check if we need to split an huge pmd.
2349	*/
2350	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
2351	range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
2352	ALIGN(address, HPAGE_PMD_SIZE)))
2353	split_huge_pmd_address(vma, address, freeze: false, NULL);
2354	}
2355
2356	void vma_adjust_trans_huge(struct vm_area_struct *vma,
2357	unsigned long start,
2358	unsigned long end,
2359	long adjust_next)
2360	{
2361	/ Check if we need to split start first. /
2362	split_huge_pmd_if_needed(vma, address: start);
2363
2364	/ Check if we need to split end next. /
2365	split_huge_pmd_if_needed(vma, address: end);
2366
2367	/*
2368	* If we're also updating the next vma vm_start,
2369	* check if we need to split it.
2370	*/
2371	if (adjust_next > `0`) {
2372	struct vm_area_struct *next = find_vma(mm: vma->vm_mm, addr: vma->vm_end);
2373	unsigned long nstart = next->vm_start;
2374	nstart += adjust_next;
2375	split_huge_pmd_if_needed(vma: next, address: nstart);
2376	}
2377	}
2378
2379	static void unmap_folio(struct folio *folio)
2380	{
2381	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED \| TTU_SPLIT_HUGE_PMD \|
2382	TTU_SYNC;
2383
2384	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2385
2386	/*
2387	* Anon pages need migration entries to preserve them, but file
2388	* pages can simply be left unmapped, then faulted back on demand.
2389	* If that is ever changed (perhaps for mlock), update remap_page().
2390	*/
2391	if (folio_test_anon(folio))
2392	try_to_migrate(folio, flags: ttu_flags);
2393	else
2394	try_to_unmap(folio, flags: ttu_flags \| TTU_IGNORE_MLOCK);
2395	}
2396
2397	static void remap_page(struct folio folio, unsigned* long nr)
2398	{
2399	int i = `0`;
2400
2401	/ If unmap_folio() uses try_to_migrate() on file, remove this check /
2402	if (!folio_test_anon(folio))
2403	return;
2404	for (;;) {
2405	remove_migration_ptes(src: folio, dst: folio, locked: true);
2406	i += folio_nr_pages(folio);
2407	if (i >= nr)
2408	break;
2409	folio = folio_next(folio);
2410	}
2411	}
2412
2413	static void lru_add_page_tail(struct page head, struct* page *tail,
2414	struct lruvec lruvec, struct* list_head *list)
2415	{
2416	VM_BUG_ON_PAGE(!PageHead(head), head);
2417	VM_BUG_ON_PAGE(PageCompound(tail), head);
2418	VM_BUG_ON_PAGE(PageLRU(tail), head);
2419	lockdep_assert_held(&lruvec->lru_lock);
2420
2421	if (list) {
2422	/ page reclaim is reclaiming a huge page /
2423	VM_WARN_ON(PageLRU(head));
2424	get_page(page: tail);
2425	list_add_tail(new: &tail->lru, head: list);
2426	} else {
2427	/ head is still on lru (and we have it frozen) /
2428	VM_WARN_ON(!PageLRU(head));
2429	if (PageUnevictable(page: tail))
2430	tail->mlock_count = `0`;
2431	else
2432	list_add_tail(new: &tail->lru, head: &head->lru);
2433	SetPageLRU(tail);
2434	}
2435	}
2436
2437	static void __split_huge_page_tail(struct folio folio, int* tail,
2438	struct lruvec lruvec, struct* list_head *list)
2439	{
2440	struct page *head = &folio->page;
2441	struct page *page_tail = head + tail;
2442	/*
2443	* Careful: new_folio is not a "real" folio before we cleared PageTail.
2444	* Don't pass it around before clear_compound_head().
2445	*/
2446	struct folio new_folio = (struct* folio *)page_tail;
2447
2448	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -`1`, page_tail);
2449
2450	/*
2451	* Clone page flags before unfreezing refcount.
2452	*
2453	* After successful get_page_unless_zero() might follow flags change,
2454	* for example lock_page() which set PG_waiters.
2455	*
2456	* Note that for mapped sub-pages of an anonymous THP,
2457	* PG_anon_exclusive has been cleared in unmap_folio() and is stored in
2458	* the migration entry instead from where remap_page() will restore it.
2459	* We can still have PG_anon_exclusive set on effectively unmapped and
2460	* unreferenced sub-pages of an anonymous THP: we can simply drop
2461	* PG_anon_exclusive (-> PG_mappedtodisk) for these here.
2462	*/
2463	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2464	page_tail->flags \|= (head->flags &
2465	((`1L` << PG_referenced) \|
2466	(`1L` << PG_swapbacked) \|
2467	(`1L` << PG_swapcache) \|
2468	(`1L` << PG_mlocked) \|
2469	(`1L` << PG_uptodate) \|
2470	(`1L` << PG_active) \|
2471	(`1L` << PG_workingset) \|
2472	(`1L` << PG_locked) \|
2473	(`1L` << PG_unevictable) \|
2474	#ifdef CONFIG_ARCH_USES_PG_ARCH_X
2475	(`1L` << PG_arch_2) \|
2476	(`1L` << PG_arch_3) \|
2477	#endif
2478	(`1L` << PG_dirty) \|
2479	LRU_GEN_MASK \| LRU_REFS_MASK));
2480
2481	/ ->mapping in first and second tail page is replaced by other uses /
2482	VM_BUG_ON_PAGE(tail > `2` && page_tail->mapping != TAIL_MAPPING,
2483	page_tail);
2484	page_tail->mapping = head->mapping;
2485	page_tail->index = head->index + tail;
2486
2487	/*
2488	* page->private should not be set in tail pages. Fix up and warn once
2489	* if private is unexpectedly set.
2490	*/
2491	if (unlikely(page_tail->private)) {
2492	VM_WARN_ON_ONCE_PAGE(true, page_tail);
2493	page_tail->private = `0`;
2494	}
2495	if (folio_test_swapcache(folio))
2496	new_folio->swap.val = folio->swap.val + tail;
2497
2498	/ Page flags must be visible before we make the page non-compound. /
2499	smp_wmb();
2500
2501	/*
2502	* Clear PageTail before unfreezing page refcount.
2503	*
2504	* After successful get_page_unless_zero() might follow put_page()
2505	* which needs correct compound_head().
2506	*/
2507	clear_compound_head(page: page_tail);
2508
2509	/ Finally unfreeze refcount. Additional reference from page cache. /
2510	page_ref_unfreeze(page: page_tail, count: `1` + (!PageAnon(page: head) \|\|
2511	PageSwapCache(page: head)));
2512
2513	if (page_is_young(page: head))
2514	set_page_young(page_tail);
2515	if (page_is_idle(page: head))
2516	set_page_idle(page_tail);
2517
2518	folio_xchg_last_cpupid(folio: new_folio, cpupid: folio_last_cpupid(folio));
2519
2520	/*
2521	* always add to the tail because some iterators expect new
2522	* pages to show after the currently processed elements - e.g.
2523	* migrate_pages
2524	*/
2525	lru_add_page_tail(head, tail: page_tail, lruvec, list);
2526	}
2527
2528	static void __split_huge_page(struct page page, struct* list_head *list,
2529	pgoff_t end)
2530	{
2531	struct folio *folio = page_folio(page);
2532	struct page *head = &folio->page;
2533	struct lruvec *lruvec;
2534	struct address_space *swap_cache = NULL;
2535	unsigned long offset = `0`;
2536	unsigned int nr = thp_nr_pages(page: head);
2537	int i, nr_dropped = `0`;
2538
2539	/ complete memcg works before add pages to LRU /
2540	split_page_memcg(head, nr);
2541
2542	if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
2543	offset = swp_offset(entry: folio->swap);
2544	swap_cache = swap_address_space(folio->swap);
2545	xa_lock(&swap_cache->i_pages);
2546	}
2547
2548	/ lock lru list/PageCompound, ref frozen by page_ref_freeze /
2549	lruvec = folio_lruvec_lock(folio);
2550
2551	ClearPageHasHWPoisoned(page: head);
2552
2553	for (i = nr - `1`; i >= `1`; i--) {
2554	__split_huge_page_tail(folio, tail: i, lruvec, list);
2555	/ Some pages can be beyond EOF: drop them from page cache /
2556	if (head[i].index >= end) {
2557	struct folio *tail = page_folio(head + i);
2558
2559	if (shmem_mapping(mapping: head->mapping))
2560	nr_dropped++;
2561	else if (folio_test_clear_dirty(folio: tail))
2562	folio_account_cleaned(folio: tail,
2563	wb: inode_to_wb(inode: folio->mapping->host));
2564	__filemap_remove_folio(folio: tail, NULL);
2565	folio_put(folio: tail);
2566	} else if (!PageAnon(page)) {
2567	__xa_store(&head->mapping->i_pages, index: head[i].index,
2568	entry: head + i, `0`);
2569	} else if (swap_cache) {
2570	__xa_store(&swap_cache->i_pages, index: offset + i,
2571	entry: head + i, `0`);
2572	}
2573	}
2574
2575	ClearPageCompound(page: head);
2576	unlock_page_lruvec(lruvec);
2577	/ Caller disabled irqs, so they are still disabled here /
2578
2579	split_page_owner(page: head, nr);
2580
2581	/ See comment in __split_huge_page_tail() /
2582	if (PageAnon(page: head)) {
2583	/ Additional pin to swap cache /
2584	if (PageSwapCache(page: head)) {
2585	page_ref_add(page: head, nr: `2`);
2586	xa_unlock(&swap_cache->i_pages);
2587	} else {
2588	page_ref_inc(page: head);
2589	}
2590	} else {
2591	/ Additional pin to page cache /
2592	page_ref_add(page: head, nr: `2`);
2593	xa_unlock(&head->mapping->i_pages);
2594	}
2595	local_irq_enable();
2596
2597	if (nr_dropped)
2598	shmem_uncharge(inode: head->mapping->host, pages: nr_dropped);
2599	remap_page(folio, nr);
2600
2601	if (folio_test_swapcache(folio))
2602	split_swap_cluster(entry: folio->swap);
2603
2604	for (i = `0`; i < nr; i++) {
2605	struct page *subpage = head + i;
2606	if (subpage == page)
2607	continue;
2608	unlock_page(page: subpage);
2609
2610	/*
2611	* Subpages may be freed if there wasn't any mapping
2612	* like if add_to_swap() is running on a lru page that
2613	* had its mapping zapped. And freeing these pages
2614	* requires taking the lru_lock so we do the put_page
2615	* of the tail pages after the split is complete.
2616	*/
2617	free_page_and_swap_cache(subpage);
2618	}
2619	}
2620
2621	/ Racy check whether the huge page can be split /
2622	bool can_split_folio(struct folio folio, int* *pextra_pins)
2623	{
2624	int extra_pins;
2625
2626	/ Additional pins from page cache /
2627	if (folio_test_anon(folio))
2628	extra_pins = folio_test_swapcache(folio) ?
2629	folio_nr_pages(folio) : `0`;
2630	else
2631	extra_pins = folio_nr_pages(folio);
2632	if (pextra_pins)
2633	*pextra_pins = extra_pins;
2634	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - `1`;
2635	}
2636
2637	/*
2638	* This function splits huge page into normal pages. @page can point to any
2639	* subpage of huge page to split. Split doesn't change the position of @page.
2640	*
2641	* Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
2642	* The huge page must be locked.
2643	*
2644	* If @list is null, tail pages will be added to LRU list, otherwise, to @list.
2645	*
2646	* Both head page and tail pages will inherit mapping, flags, and so on from
2647	* the hugepage.
2648	*
2649	* GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
2650	* they are not mapped.
2651	*
2652	* Returns 0 if the hugepage is split successfully.
2653	* Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
2654	* us.
2655	*/
2656	int split_huge_page_to_list(struct page page, struct* list_head *list)
2657	{
2658	struct folio *folio = page_folio(page);
2659	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
2660	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
2661	struct anon_vma *anon_vma = NULL;
2662	struct address_space *mapping = NULL;
2663	int extra_pins, ret;
2664	pgoff_t end;
2665	bool is_hzp;
2666
2667	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
2668	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
2669
2670	is_hzp = is_huge_zero_page(page: &folio->page);
2671	if (is_hzp) {
2672	pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
2673	return -EBUSY;
2674	}
2675
2676	if (folio_test_writeback(folio))
2677	return -EBUSY;
2678
2679	if (folio_test_anon(folio)) {
2680	/*
2681	* The caller does not necessarily hold an mmap_lock that would
2682	* prevent the anon_vma disappearing so we first we take a
2683	* reference to it and then lock the anon_vma for write. This
2684	* is similar to folio_lock_anon_vma_read except the write lock
2685	* is taken to serialise against parallel split or collapse
2686	* operations.
2687	*/
2688	anon_vma = folio_get_anon_vma(folio);
2689	if (!anon_vma) {
2690	ret = -EBUSY;
2691	goto out;
2692	}
2693	end = -`1`;
2694	mapping = NULL;
2695	anon_vma_lock_write(anon_vma);
2696	} else {
2697	gfp_t gfp;
2698
2699	mapping = folio->mapping;
2700
2701	/ Truncated ? /
2702	if (!mapping) {
2703	ret = -EBUSY;
2704	goto out;
2705	}
2706
2707	gfp = current_gfp_context(flags: mapping_gfp_mask(mapping) &
2708	GFP_RECLAIM_MASK);
2709
2710	if (!filemap_release_folio(folio, gfp)) {
2711	ret = -EBUSY;
2712	goto out;
2713	}
2714
2715	xas_split_alloc(&xas, entry: folio, order: folio_order(folio), gfp);
2716	if (xas_error(xas: &xas)) {
2717	ret = xas_error(xas: &xas);
2718	goto out;
2719	}
2720
2721	anon_vma = NULL;
2722	i_mmap_lock_read(mapping);
2723
2724	/*
2725	*__split_huge_page() may need to trim off pages beyond EOF:
2726	* but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
2727	* which cannot be nested inside the page tree lock. So note
2728	* end now: i_size itself may be changed at any moment, but
2729	* folio lock is good enough to serialize the trimming.
2730	*/
2731	end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
2732	if (shmem_mapping(mapping))
2733	end = shmem_fallocend(inode: mapping->host, eof: end);
2734	}
2735
2736	/*
2737	* Racy check if we can split the page, before unmap_folio() will
2738	* split PMDs
2739	*/
2740	if (!can_split_folio(folio, pextra_pins: &extra_pins)) {
2741	ret = -EAGAIN;
2742	goto out_unlock;
2743	}
2744
2745	unmap_folio(folio);
2746
2747	/ block interrupt reentry in xa_lock and spinlock /
2748	local_irq_disable();
2749	if (mapping) {
2750	/*
2751	* Check if the folio is present in page cache.
2752	* We assume all tail are present too, if folio is there.
2753	*/
2754	xas_lock(&xas);
2755	xas_reset(xas: &xas);
2756	if (xas_load(&xas) != folio)
2757	goto fail;
2758	}
2759
2760	/ Prevent deferred_split_scan() touching ->_refcount /
2761	spin_lock(lock: &ds_queue->split_queue_lock);
2762	if (folio_ref_freeze(folio, count: `1` + extra_pins)) {
2763	if (!list_empty(head: &folio->_deferred_list)) {
2764	ds_queue->split_queue_len--;
2765	list_del(entry: &folio->_deferred_list);
2766	}
2767	spin_unlock(lock: &ds_queue->split_queue_lock);
2768	if (mapping) {
2769	int nr = folio_nr_pages(folio);
2770
2771	xas_split(&xas, entry: folio, order: folio_order(folio));
2772	if (folio_test_swapbacked(folio)) {
2773	__lruvec_stat_mod_folio(folio, idx: NR_SHMEM_THPS,
2774	val: -nr);
2775	} else {
2776	__lruvec_stat_mod_folio(folio, idx: NR_FILE_THPS,
2777	val: -nr);
2778	filemap_nr_thps_dec(mapping);
2779	}
2780	}
2781
2782	__split_huge_page(page, list, end);
2783	ret = `0`;
2784	} else {
2785	spin_unlock(lock: &ds_queue->split_queue_lock);
2786	fail:
2787	if (mapping)
2788	xas_unlock(&xas);
2789	local_irq_enable();
2790	remap_page(folio, nr: folio_nr_pages(folio));
2791	ret = -EAGAIN;
2792	}
2793
2794	out_unlock:
2795	if (anon_vma) {
2796	anon_vma_unlock_write(anon_vma);
2797	put_anon_vma(anon_vma);
2798	}
2799	if (mapping)
2800	i_mmap_unlock_read(mapping);
2801	out:
2802	xas_destroy(&xas);
2803	count_vm_event(item: !ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
2804	return ret;
2805	}
2806
2807	void folio_undo_large_rmappable(struct folio *folio)
2808	{
2809	struct deferred_split *ds_queue;
2810	unsigned long flags;
2811
2812	/*
2813	* At this point, there is no one trying to add the folio to
2814	* deferred_list. If folio is not in deferred_list, it's safe
2815	* to check without acquiring the split_queue_lock.
2816	*/
2817	if (data_race(list_empty(&folio->_deferred_list)))
2818	return;
2819
2820	ds_queue = get_deferred_split_queue(folio);
2821	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2822	if (!list_empty(head: &folio->_deferred_list)) {
2823	ds_queue->split_queue_len--;
2824	list_del(entry: &folio->_deferred_list);
2825	}
2826	spin_unlock_irqrestore(lock: &ds_queue->split_queue_lock, flags);
2827	}
2828
2829	void deferred_split_folio(struct folio *folio)
2830	{
2831	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
2832	#ifdef CONFIG_MEMCG
2833	struct mem_cgroup *memcg = folio_memcg(folio);
2834	#endif
2835	unsigned long flags;
2836
2837	VM_BUG_ON_FOLIO(folio_order(folio) < `2`, folio);
2838
2839	/*
2840	* The try_to_unmap() in page reclaim path might reach here too,
2841	* this may cause a race condition to corrupt deferred split queue.
2842	* And, if page reclaim is already handling the same folio, it is
2843	* unnecessary to handle it again in shrinker.
2844	*
2845	* Check the swapcache flag to determine if the folio is being
2846	* handled by page reclaim since THP swap would add the folio into
2847	* swap cache before calling try_to_unmap().
2848	*/
2849	if (folio_test_swapcache(folio))
2850	return;
2851
2852	if (!list_empty(head: &folio->_deferred_list))
2853	return;
2854
2855	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2856	if (list_empty(head: &folio->_deferred_list)) {
2857	count_vm_event(item: THP_DEFERRED_SPLIT_PAGE);
2858	list_add_tail(new: &folio->_deferred_list, head: &ds_queue->split_queue);
2859	ds_queue->split_queue_len++;
2860	#ifdef CONFIG_MEMCG
2861	if (memcg)
2862	set_shrinker_bit(memcg, nid: folio_nid(folio),
2863	shrinker_id: deferred_split_shrinker->id);
2864	#endif
2865	}
2866	spin_unlock_irqrestore(lock: &ds_queue->split_queue_lock, flags);
2867	}
2868
2869	static unsigned long deferred_split_count(struct shrinker *shrink,
2870	struct shrink_control *sc)
2871	{
2872	struct pglist_data *pgdata = NODE_DATA(sc->nid);
2873	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
2874
2875	#ifdef CONFIG_MEMCG
2876	if (sc->memcg)
2877	ds_queue = &sc->memcg->deferred_split_queue;
2878	#endif
2879	return READ_ONCE(ds_queue->split_queue_len);
2880	}
2881
2882	static unsigned long deferred_split_scan(struct shrinker *shrink,
2883	struct shrink_control *sc)
2884	{
2885	struct pglist_data *pgdata = NODE_DATA(sc->nid);
2886	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
2887	unsigned long flags;
2888	LIST_HEAD(list);
2889	struct folio folio, next;
2890	int split = `0`;
2891
2892	#ifdef CONFIG_MEMCG
2893	if (sc->memcg)
2894	ds_queue = &sc->memcg->deferred_split_queue;
2895	#endif
2896
2897	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2898	/ Take pin on all head pages to avoid freeing them under us /
2899	list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
2900	_deferred_list) {
2901	if (folio_try_get(folio)) {
2902	list_move(list: &folio->_deferred_list, head: &list);
2903	} else {
2904	/ We lost race with folio_put() /
2905	list_del_init(entry: &folio->_deferred_list);
2906	ds_queue->split_queue_len--;
2907	}
2908	if (!--sc->nr_to_scan)
2909	break;
2910	}
2911	spin_unlock_irqrestore(lock: &ds_queue->split_queue_lock, flags);
2912
2913	list_for_each_entry_safe(folio, next, &list, _deferred_list) {
2914	if (!folio_trylock(folio))
2915	goto next;
2916	/ split_huge_page() removes page from list on success /
2917	if (!split_folio(folio))
2918	split++;
2919	folio_unlock(folio);
2920	next:
2921	folio_put(folio);
2922	}
2923
2924	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2925	list_splice_tail(list: &list, head: &ds_queue->split_queue);
2926	spin_unlock_irqrestore(lock: &ds_queue->split_queue_lock, flags);
2927
2928	/*
2929	* Stop shrinker if we didn't split any page, but the queue is empty.
2930	* This can happen if pages were freed under us.
2931	*/
2932	if (!split && list_empty(head: &ds_queue->split_queue))
2933	return SHRINK_STOP;
2934	return split;
2935	}
2936
2937	#ifdef CONFIG_DEBUG_FS
2938	static void split_huge_pages_all(void)
2939	{
2940	struct zone *zone;
2941	struct page *page;
2942	struct folio *folio;
2943	unsigned long pfn, max_zone_pfn;
2944	unsigned long total = `0`, split = `0`;
2945
2946	pr_debug("Split all THPs\n");
2947	for_each_zone(zone) {
2948	if (!managed_zone(zone))
2949	continue;
2950	max_zone_pfn = zone_end_pfn(zone);
2951	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
2952	int nr_pages;
2953
2954	page = pfn_to_online_page(pfn);
2955	if (!page \|\| PageTail(page))
2956	continue;
2957	folio = page_folio(page);
2958	if (!folio_try_get(folio))
2959	continue;
2960
2961	if (unlikely(page_folio(page) != folio))
2962	goto next;
2963
2964	if (zone != folio_zone(folio))
2965	goto next;
2966
2967	if (!folio_test_large(folio)
2968	\|\| folio_test_hugetlb(folio)
2969	\|\| !folio_test_lru(folio))
2970	goto next;
2971
2972	total++;
2973	folio_lock(folio);
2974	nr_pages = folio_nr_pages(folio);
2975	if (!split_folio(folio))
2976	split++;
2977	pfn += nr_pages - `1`;
2978	folio_unlock(folio);
2979	next:
2980	folio_put(folio);
2981	cond_resched();
2982	}
2983	}
2984
2985	pr_debug("%lu of %lu THP split\n", split, total);
2986	}
2987
2988	static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
2989	{
2990	return vma_is_special_huge(vma) \|\| (vma->vm_flags & VM_IO) \|\|
2991	is_vm_hugetlb_page(vma);
2992	}
2993
2994	static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
2995	unsigned long vaddr_end)
2996	{
2997	int ret = `0`;
2998	struct task_struct *task;
2999	struct mm_struct *mm;
3000	unsigned long total = `0`, split = `0`;
3001	unsigned long addr;
3002
3003	vaddr_start &= PAGE_MASK;
3004	vaddr_end &= PAGE_MASK;
3005
3006	/ Find the task_struct from pid /
3007	rcu_read_lock();
3008	task = find_task_by_vpid(nr: pid);
3009	if (!task) {
3010	rcu_read_unlock();
3011	ret = -ESRCH;
3012	goto out;
3013	}
3014	get_task_struct(t: task);
3015	rcu_read_unlock();
3016
3017	/ Find the mm_struct /
3018	mm = get_task_mm(task);
3019	put_task_struct(t: task);
3020
3021	if (!mm) {
3022	ret = -EINVAL;
3023	goto out;
3024	}
3025
3026	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
3027	pid, vaddr_start, vaddr_end);
3028
3029	mmap_read_lock(mm);
3030	/*
3031	* always increase addr by PAGE_SIZE, since we could have a PTE page
3032	* table filled with PTE-mapped THPs, each of which is distinct.
3033	*/
3034	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
3035	struct vm_area_struct *vma = vma_lookup(mm, addr);
3036	struct page *page;
3037	struct folio *folio;
3038
3039	if (!vma)
3040	break;
3041
3042	/ skip special VMA and hugetlb VMA /
3043	if (vma_not_suitable_for_thp_split(vma)) {
3044	addr = vma->vm_end;
3045	continue;
3046	}
3047
3048	/ FOLL_DUMP to ignore special (like zero) pages /
3049	page = follow_page(vma, address: addr, foll_flags: FOLL_GET \| FOLL_DUMP);
3050
3051	if (IS_ERR_OR_NULL(ptr: page))
3052	continue;
3053
3054	folio = page_folio(page);
3055	if (!is_transparent_hugepage(folio))
3056	goto next;
3057
3058	total++;
3059	if (!can_split_folio(folio, NULL))
3060	goto next;
3061
3062	if (!folio_trylock(folio))
3063	goto next;
3064
3065	if (!split_folio(folio))
3066	split++;
3067
3068	folio_unlock(folio);
3069	next:
3070	folio_put(folio);
3071	cond_resched();
3072	}
3073	mmap_read_unlock(mm);
3074	mmput(mm);
3075
3076	pr_debug("%lu of %lu THP split\n", split, total);
3077
3078	out:
3079	return ret;
3080	}
3081
3082	static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
3083	pgoff_t off_end)
3084	{
3085	struct filename *file;
3086	struct file *candidate;
3087	struct address_space *mapping;
3088	int ret = -EINVAL;
3089	pgoff_t index;
3090	int nr_pages = `1`;
3091	unsigned long total = `0`, split = `0`;
3092
3093	file = getname_kernel(file_path);
3094	if (IS_ERR(ptr: file))
3095	return ret;
3096
3097	candidate = file_open_name(file, O_RDONLY, `0`);
3098	if (IS_ERR(ptr: candidate))
3099	goto out;
3100
3101	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
3102	file_path, off_start, off_end);
3103
3104	mapping = candidate->f_mapping;
3105
3106	for (index = off_start; index < off_end; index += nr_pages) {
3107	struct folio *folio = filemap_get_folio(mapping, index);
3108
3109	nr_pages = `1`;
3110	if (IS_ERR(ptr: folio))
3111	continue;
3112
3113	if (!folio_test_large(folio))
3114	goto next;
3115
3116	total++;
3117	nr_pages = folio_nr_pages(folio);
3118
3119	if (!folio_trylock(folio))
3120	goto next;
3121
3122	if (!split_folio(folio))
3123	split++;
3124
3125	folio_unlock(folio);
3126	next:
3127	folio_put(folio);
3128	cond_resched();
3129	}
3130
3131	filp_close(candidate, NULL);
3132	ret = `0`;
3133
3134	pr_debug("%lu of %lu file-backed THP split\n", split, total);
3135	out:
3136	putname(name: file);
3137	return ret;
3138	}
3139
3140	#define MAX_INPUT_BUF_SZ 255
3141
3142	static ssize_t split_huge_pages_write(struct file file, const* char __user *buf,
3143	size_t count, loff_t *ppops)
3144	{
3145	static DEFINE_MUTEX(split_debug_mutex);
3146	ssize_t ret;
3147	/ hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end /
3148	char input_buf[MAX_INPUT_BUF_SZ];
3149	int pid;
3150	unsigned long vaddr_start, vaddr_end;
3151
3152	ret = mutex_lock_interruptible(&split_debug_mutex);
3153	if (ret)
3154	return ret;
3155
3156	ret = -EFAULT;
3157
3158	memset(input_buf, `0`, MAX_INPUT_BUF_SZ);
3159	if (copy_from_user(to: input_buf, from: buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
3160	goto out;
3161
3162	input_buf[MAX_INPUT_BUF_SZ - `1`] = `'\0'`;
3163
3164	if (input_buf[`0`] == `'/'`) {
3165	char *tok;
3166	char *buf = input_buf;
3167	char file_path[MAX_INPUT_BUF_SZ];
3168	pgoff_t off_start = `0`, off_end = `0`;
3169	size_t input_len = strlen(input_buf);
3170
3171	tok = strsep(&buf, ",");
3172	if (tok) {
3173	strcpy(p: file_path, q: tok);
3174	} else {
3175	ret = -EINVAL;
3176	goto out;
3177	}
3178
3179	ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
3180	if (ret != `2`) {
3181	ret = -EINVAL;
3182	goto out;
3183	}
3184	ret = split_huge_pages_in_file(file_path, off_start, off_end);
3185	if (!ret)
3186	ret = input_len;
3187
3188	goto out;
3189	}
3190
3191	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
3192	if (ret == `1` && pid == `1`) {
3193	split_huge_pages_all();
3194	ret = strlen(input_buf);
3195	goto out;
3196	} else if (ret != `3`) {
3197	ret = -EINVAL;
3198	goto out;
3199	}
3200
3201	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
3202	if (!ret)
3203	ret = strlen(input_buf);
3204	out:
3205	mutex_unlock(lock: &split_debug_mutex);
3206	return ret;
3207
3208	}
3209
3210	static const struct file_operations split_huge_pages_fops = {
3211	.owner = THIS_MODULE,
3212	.write = split_huge_pages_write,
3213	.llseek = no_llseek,
3214	};
3215
3216	static int __init split_huge_pages_debugfs(void)
3217	{
3218	debugfs_create_file(name: "split_huge_pages", mode: `0200`, NULL, NULL,
3219	fops: &split_huge_pages_fops);
3220	return `0`;
3221	}
3222	late_initcall(split_huge_pages_debugfs);
3223	#endif
3224
3225	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
3226	int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
3227	struct page *page)
3228	{
3229	struct vm_area_struct *vma = pvmw->vma;
3230	struct mm_struct *mm = vma->vm_mm;
3231	unsigned long address = pvmw->address;
3232	bool anon_exclusive;
3233	pmd_t pmdval;
3234	swp_entry_t entry;
3235	pmd_t pmdswp;
3236
3237	if (!(pvmw->pmd && !pvmw->pte))
3238	return `0`;
3239
3240	flush_cache_range(vma, start: address, end: address + HPAGE_PMD_SIZE);
3241	pmdval = pmdp_invalidate(vma, address, pmdp: pvmw->pmd);
3242
3243	/ See page_try_share_anon_rmap(): invalidate PMD first. /
3244	anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
3245	if (anon_exclusive && page_try_share_anon_rmap(page)) {
3246	set_pmd_at(mm, addr: address, pmdp: pvmw->pmd, pmd: pmdval);
3247	return -EBUSY;
3248	}
3249
3250	if (pmd_dirty(pmd: pmdval))
3251	set_page_dirty(page);
3252	if (pmd_write(pmd: pmdval))
3253	entry = make_writable_migration_entry(page_to_pfn(page));
3254	else if (anon_exclusive)
3255	entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
3256	else
3257	entry = make_readable_migration_entry(page_to_pfn(page));
3258	if (pmd_young(pmd: pmdval))
3259	entry = make_migration_entry_young(entry);
3260	if (pmd_dirty(pmd: pmdval))
3261	entry = make_migration_entry_dirty(entry);
3262	pmdswp = swp_entry_to_pmd(entry);
3263	if (pmd_soft_dirty(pmd: pmdval))
3264	pmdswp = pmd_swp_mksoft_dirty(pmd: pmdswp);
3265	if (pmd_uffd_wp(pmd: pmdval))
3266	pmdswp = pmd_swp_mkuffd_wp(pmd: pmdswp);
3267	set_pmd_at(mm, addr: address, pmdp: pvmw->pmd, pmd: pmdswp);
3268	page_remove_rmap(page, vma, compound: true);
3269	put_page(page);
3270	trace_set_migration_pmd(addr: address, pmd: pmd_val(pmd: pmdswp));
3271
3272	return `0`;
3273	}
3274
3275	void remove_migration_pmd(struct page_vma_mapped_walk pvmw, struct* page *new)
3276	{
3277	struct vm_area_struct *vma = pvmw->vma;
3278	struct mm_struct *mm = vma->vm_mm;
3279	unsigned long address = pvmw->address;
3280	unsigned long haddr = address & HPAGE_PMD_MASK;
3281	pmd_t pmde;
3282	swp_entry_t entry;
3283
3284	if (!(pvmw->pmd && !pvmw->pte))
3285	return;
3286
3287	entry = pmd_to_swp_entry(pmd: *pvmw->pmd);
3288	get_page(page: new);
3289	pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
3290	if (pmd_swp_soft_dirty(pmd: *pvmw->pmd))
3291	pmde = pmd_mksoft_dirty(pmd: pmde);
3292	if (is_writable_migration_entry(entry))
3293	pmde = pmd_mkwrite(pmd: pmde, vma);
3294	if (pmd_swp_uffd_wp(pmd: *pvmw->pmd))
3295	pmde = pmd_mkuffd_wp(pmd: pmde);
3296	if (!is_migration_entry_young(entry))
3297	pmde = pmd_mkold(pmd: pmde);
3298	/ NOTE: this may contain setting soft-dirty on some archs /
3299	if (PageDirty(page: new) && is_migration_entry_dirty(entry))
3300	pmde = pmd_mkdirty(pmd: pmde);
3301
3302	if (PageAnon(page: new)) {
3303	rmap_t rmap_flags = RMAP_COMPOUND;
3304
3305	if (!is_readable_migration_entry(entry))
3306	rmap_flags \|= RMAP_EXCLUSIVE;
3307
3308	page_add_anon_rmap(new, vma, address: haddr, flags: rmap_flags);
3309	} else {
3310	page_add_file_rmap(new, vma, compound: true);
3311	}
3312	VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
3313	set_pmd_at(mm, addr: haddr, pmdp: pvmw->pmd, pmd: pmde);
3314
3315	/ No need to invalidate - it was non-present before /
3316	update_mmu_cache_pmd(vma, addr: address, pmd: pvmw->pmd);
3317	trace_remove_migration_pmd(addr: address, pmd: pmd_val(pmd: pmde));
3318	}
3319	#endif
3320

source code of linux/mm/huge_memory.c