khugepaged.c source code [linux/mm/khugepaged.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4	#include <linux/mm.h>
5	#include <linux/sched.h>
6	#include <linux/sched/mm.h>
7	#include <linux/sched/coredump.h>
8	#include <linux/mmu_notifier.h>
9	#include <linux/rmap.h>
10	#include <linux/swap.h>
11	#include <linux/mm_inline.h>
12	#include <linux/kthread.h>
13	#include <linux/khugepaged.h>
14	#include <linux/freezer.h>
15	#include <linux/mman.h>
16	#include <linux/hashtable.h>
17	#include <linux/userfaultfd_k.h>
18	#include <linux/page_idle.h>
19	#include <linux/page_table_check.h>
20	#include <linux/swapops.h>
21	#include <linux/shmem_fs.h>
22	#include <linux/ksm.h>
23
24	#include <asm/tlb.h>
25	#include <asm/pgalloc.h>
26	#include "internal.h"
27	#include "mm_slot.h"
28
/*
 * Result codes produced by the scan/collapse helpers in this file (they
 * return these values as plain int). SCAN_SUCCEED is the success value;
 * the other enumerators name the reason an attempt stopped.
 */
enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_PMD_NONE,
	SCAN_PMD_MAPPED,
	SCAN_EXCEED_NONE_PTE,
	SCAN_EXCEED_SWAP_PTE,
	SCAN_EXCEED_SHARED_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PTE_UFFD_WP,
	SCAN_PTE_MAPPED_HUGEPAGE,
	SCAN_PAGE_RO,
	SCAN_LACK_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL,
	SCAN_TRUNCATED,
	SCAN_PAGE_HAS_PRIVATE,
	SCAN_STORE_FAILED,
	SCAN_COPY_MC,
	SCAN_PAGE_FILLED,
};
63
64	#define CREATE_TRACE_POINTS
65	#include <trace/events/huge_memory.h>
66
67	static struct task_struct *khugepaged_thread __read_mostly;
68	static DEFINE_MUTEX(khugepaged_mutex);
69
70	/ default scan 8512 pte (or vmas) every 30 second /*
71	static unsigned int khugepaged_pages_to_scan __read_mostly;
72	static unsigned int khugepaged_pages_collapsed;
73	static unsigned int khugepaged_full_scans;
74	static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = `10000`;
75	/ during fragmentation poll the hugepage allocator once every minute /
76	static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = `60000`;
77	static unsigned long khugepaged_sleep_expire;
78	static DEFINE_SPINLOCK(khugepaged_mm_lock);
79	static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
80	/*
81	* default collapse hugepages if there is at least one pte mapped like
82	* it would have happened if the vma was large enough during page
83	* fault.
84	*
85	* Note that these are only respected if collapse was initiated by khugepaged.
86	*/
87	static unsigned int khugepaged_max_ptes_none __read_mostly;
88	static unsigned int khugepaged_max_ptes_swap __read_mostly;
89	static unsigned int khugepaged_max_ptes_shared __read_mostly;
90
91	#define MM_SLOTS_HASH_BITS 10
92	static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
93
94	static struct kmem_cache *mm_slot_cache __ro_after_init;
95
96	struct collapse_control {
97	bool is_khugepaged;
98
99	/ Num pages scanned per node /
100	u32 node_load[MAX_NUMNODES];
101
102	/ nodemask for allocation fallback /
103	nodemask_t alloc_nmask;
104	};
105
/**
 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 *
 * Currently a thin wrapper: the embedded struct mm_slot is its only member.
 */
struct khugepaged_mm_slot {
	struct mm_slot slot;
};
113
/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only one instance of this cursor structure: the file-scope
 * variable that is also named khugepaged_scan.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct khugepaged_mm_slot *mm_slot;
	unsigned long address;
};
127
/* The scan cursor; mm_head starts out as an empty, self-linked list. */
static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
131
132	#ifdef CONFIG_SYSFS
133	static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
134	struct kobj_attribute *attr,
135	char *buf)
136	{
137	return sysfs_emit(buf, fmt: "%u\n", khugepaged_scan_sleep_millisecs);
138	}
139
140	static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
141	struct kobj_attribute *attr,
142	const char *buf, size_t count)
143	{
144	unsigned int msecs;
145	int err;
146
147	err = kstrtouint(s: buf, base: `10`, res: &msecs);
148	if (err)
149	return -EINVAL;
150
151	khugepaged_scan_sleep_millisecs = msecs;
152	khugepaged_sleep_expire = `0`;
153	wake_up_interruptible(&khugepaged_wait);
154
155	return count;
156	}
157	static struct kobj_attribute scan_sleep_millisecs_attr =
158	__ATTR_RW(scan_sleep_millisecs);
159
160	static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
161	struct kobj_attribute *attr,
162	char *buf)
163	{
164	return sysfs_emit(buf, fmt: "%u\n", khugepaged_alloc_sleep_millisecs);
165	}
166
167	static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
168	struct kobj_attribute *attr,
169	const char *buf, size_t count)
170	{
171	unsigned int msecs;
172	int err;
173
174	err = kstrtouint(s: buf, base: `10`, res: &msecs);
175	if (err)
176	return -EINVAL;
177
178	khugepaged_alloc_sleep_millisecs = msecs;
179	khugepaged_sleep_expire = `0`;
180	wake_up_interruptible(&khugepaged_wait);
181
182	return count;
183	}
184	static struct kobj_attribute alloc_sleep_millisecs_attr =
185	__ATTR_RW(alloc_sleep_millisecs);
186
187	static ssize_t pages_to_scan_show(struct kobject *kobj,
188	struct kobj_attribute *attr,
189	char *buf)
190	{
191	return sysfs_emit(buf, fmt: "%u\n", khugepaged_pages_to_scan);
192	}
193	static ssize_t pages_to_scan_store(struct kobject *kobj,
194	struct kobj_attribute *attr,
195	const char *buf, size_t count)
196	{
197	unsigned int pages;
198	int err;
199
200	err = kstrtouint(s: buf, base: `10`, res: &pages);
201	if (err \|\| !pages)
202	return -EINVAL;
203
204	khugepaged_pages_to_scan = pages;
205
206	return count;
207	}
208	static struct kobj_attribute pages_to_scan_attr =
209	__ATTR_RW(pages_to_scan);
210
211	static ssize_t pages_collapsed_show(struct kobject *kobj,
212	struct kobj_attribute *attr,
213	char *buf)
214	{
215	return sysfs_emit(buf, fmt: "%u\n", khugepaged_pages_collapsed);
216	}
217	static struct kobj_attribute pages_collapsed_attr =
218	__ATTR_RO(pages_collapsed);
219
220	static ssize_t full_scans_show(struct kobject *kobj,
221	struct kobj_attribute *attr,
222	char *buf)
223	{
224	return sysfs_emit(buf, fmt: "%u\n", khugepaged_full_scans);
225	}
226	static struct kobj_attribute full_scans_attr =
227	__ATTR_RO(full_scans);
228
229	static ssize_t defrag_show(struct kobject *kobj,
230	struct kobj_attribute attr, char* *buf)
231	{
232	return single_hugepage_flag_show(kobj, attr, buf,
233	flag: TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
234	}
235	static ssize_t defrag_store(struct kobject *kobj,
236	struct kobj_attribute *attr,
237	const char *buf, size_t count)
238	{
239	return single_hugepage_flag_store(kobj, attr, buf, count,
240	flag: TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
241	}
242	static struct kobj_attribute khugepaged_defrag_attr =
243	__ATTR_RW(defrag);
244
245	/*
246	* max_ptes_none controls if khugepaged should collapse hugepages over
247	* any unmapped ptes in turn potentially increasing the memory
248	* footprint of the vmas. When max_ptes_none is 0 khugepaged will not
249	* reduce the available free memory in the system as it
250	* runs. Increasing max_ptes_none will instead potentially reduce the
251	* free memory in the system during the khugepaged scan.
252	*/
253	static ssize_t max_ptes_none_show(struct kobject *kobj,
254	struct kobj_attribute *attr,
255	char *buf)
256	{
257	return sysfs_emit(buf, fmt: "%u\n", khugepaged_max_ptes_none);
258	}
259	static ssize_t max_ptes_none_store(struct kobject *kobj,
260	struct kobj_attribute *attr,
261	const char *buf, size_t count)
262	{
263	int err;
264	unsigned long max_ptes_none;
265
266	err = kstrtoul(s: buf, base: `10`, res: &max_ptes_none);
267	if (err \|\| max_ptes_none > HPAGE_PMD_NR - `1`)
268	return -EINVAL;
269
270	khugepaged_max_ptes_none = max_ptes_none;
271
272	return count;
273	}
274	static struct kobj_attribute khugepaged_max_ptes_none_attr =
275	__ATTR_RW(max_ptes_none);
276
277	static ssize_t max_ptes_swap_show(struct kobject *kobj,
278	struct kobj_attribute *attr,
279	char *buf)
280	{
281	return sysfs_emit(buf, fmt: "%u\n", khugepaged_max_ptes_swap);
282	}
283
284	static ssize_t max_ptes_swap_store(struct kobject *kobj,
285	struct kobj_attribute *attr,
286	const char *buf, size_t count)
287	{
288	int err;
289	unsigned long max_ptes_swap;
290
291	err = kstrtoul(s: buf, base: `10`, res: &max_ptes_swap);
292	if (err \|\| max_ptes_swap > HPAGE_PMD_NR - `1`)
293	return -EINVAL;
294
295	khugepaged_max_ptes_swap = max_ptes_swap;
296
297	return count;
298	}
299
300	static struct kobj_attribute khugepaged_max_ptes_swap_attr =
301	__ATTR_RW(max_ptes_swap);
302
303	static ssize_t max_ptes_shared_show(struct kobject *kobj,
304	struct kobj_attribute *attr,
305	char *buf)
306	{
307	return sysfs_emit(buf, fmt: "%u\n", khugepaged_max_ptes_shared);
308	}
309
310	static ssize_t max_ptes_shared_store(struct kobject *kobj,
311	struct kobj_attribute *attr,
312	const char *buf, size_t count)
313	{
314	int err;
315	unsigned long max_ptes_shared;
316
317	err = kstrtoul(s: buf, base: `10`, res: &max_ptes_shared);
318	if (err \|\| max_ptes_shared > HPAGE_PMD_NR - `1`)
319	return -EINVAL;
320
321	khugepaged_max_ptes_shared = max_ptes_shared;
322
323	return count;
324	}
325
326	static struct kobj_attribute khugepaged_max_ptes_shared_attr =
327	__ATTR_RW(max_ptes_shared);
328
/* NULL-terminated list of the sysfs attributes defined above. */
static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&khugepaged_max_ptes_swap_attr.attr,
	&khugepaged_max_ptes_shared_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

/* Attribute group named "khugepaged"; non-static, so registered elsewhere. */
struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};
346	#endif /* CONFIG_SYSFS */
347
348	int hugepage_madvise(struct vm_area_struct *vma,
349	unsigned long vm_flags, int* advice)
350	{
351	switch (advice) {
352	case MADV_HUGEPAGE:
353	#ifdef CONFIG_S390
354	/*
355	* qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
356	* can't handle this properly after s390_enable_sie, so we simply
357	* ignore the madvise to prevent qemu from causing a SIGSEGV.
358	*/
359	if (mm_has_pgste(vma->vm_mm))
360	return `0`;
361	#endif
362	*vm_flags &= ~VM_NOHUGEPAGE;
363	*vm_flags \|= VM_HUGEPAGE;
364	/*
365	* If the vma become good for khugepaged to scan,
366	* register it here without waiting a page fault that
367	* may not happen any time soon.
368	*/
369	khugepaged_enter_vma(vma, vm_flags: *vm_flags);
370	break;
371	case MADV_NOHUGEPAGE:
372	*vm_flags &= ~VM_HUGEPAGE;
373	*vm_flags \|= VM_NOHUGEPAGE;
374	/*
375	* Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
376	* this vma even if we leave the mm registered in khugepaged if
377	* it got registered before VM_NOHUGEPAGE was set.
378	*/
379	break;
380	}
381
382	return `0`;
383	}
384
385	int __init khugepaged_init(void)
386	{
387	mm_slot_cache = kmem_cache_create(name: "khugepaged_mm_slot",
388	size: sizeof(struct khugepaged_mm_slot),
389	align: __alignof__(struct khugepaged_mm_slot),
390	flags: `0`, NULL);
391	if (!mm_slot_cache)
392	return -ENOMEM;
393
394	khugepaged_pages_to_scan = HPAGE_PMD_NR * `8`;
395	khugepaged_max_ptes_none = HPAGE_PMD_NR - `1`;
396	khugepaged_max_ptes_swap = HPAGE_PMD_NR / `8`;
397	khugepaged_max_ptes_shared = HPAGE_PMD_NR / `2`;
398
399	return `0`;
400	}
401
402	void __init khugepaged_destroy(void)
403	{
404	kmem_cache_destroy(s: mm_slot_cache);
405	}
406
407	static inline int hpage_collapse_test_exit(struct mm_struct *mm)
408	{
409	return atomic_read(v: &mm->mm_users) == `0`;
410	}
411
412	void __khugepaged_enter(struct mm_struct *mm)
413	{
414	struct khugepaged_mm_slot *mm_slot;
415	struct mm_slot *slot;
416	int wakeup;
417
418	/ __khugepaged_exit() must not run from under us /
419	VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
420	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
421	return;
422
423	mm_slot = mm_slot_alloc(cache: mm_slot_cache);
424	if (!mm_slot)
425	return;
426
427	slot = &mm_slot->slot;
428
429	spin_lock(lock: &khugepaged_mm_lock);
430	mm_slot_insert(mm_slots_hash, mm, slot);
431	/*
432	* Insert just behind the scanning cursor, to let the area settle
433	* down a little.
434	*/
435	wakeup = list_empty(head: &khugepaged_scan.mm_head);
436	list_add_tail(new: &slot->mm_node, head: &khugepaged_scan.mm_head);
437	spin_unlock(lock: &khugepaged_mm_lock);
438
439	mmgrab(mm);
440	if (wakeup)
441	wake_up_interruptible(&khugepaged_wait);
442	}
443
444	void khugepaged_enter_vma(struct vm_area_struct *vma,
445	unsigned long vm_flags)
446	{
447	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
448	hugepage_flags_enabled()) {
449	if (hugepage_vma_check(vma, vm_flags, smaps: false, in_pf: false, enforce_sysfs: true))
450	__khugepaged_enter(mm: vma->vm_mm);
451	}
452	}
453
454	void __khugepaged_exit(struct mm_struct *mm)
455	{
456	struct khugepaged_mm_slot *mm_slot;
457	struct mm_slot *slot;
458	int free = `0`;
459
460	spin_lock(lock: &khugepaged_mm_lock);
461	slot = mm_slot_lookup(mm_slots_hash, mm);
462	mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
463	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
464	hash_del(node: &slot->hash);
465	list_del(entry: &slot->mm_node);
466	free = `1`;
467	}
468	spin_unlock(lock: &khugepaged_mm_lock);
469
470	if (free) {
471	clear_bit(MMF_VM_HUGEPAGE, addr: &mm->flags);
472	mm_slot_free(cache: mm_slot_cache, objp: mm_slot);
473	mmdrop(mm);
474	} else if (mm_slot) {
475	/*
476	* This is required to serialize against
477	* hpage_collapse_test_exit() (which is guaranteed to run
478	* under mmap sem read mode). Stop here (after we return all
479	* pagetables will be destroyed) until khugepaged has finished
480	* working on the pagetables under the mmap_lock.
481	*/
482	mmap_write_lock(mm);
483	mmap_write_unlock(mm);
484	}
485	}
486
487	static void release_pte_folio(struct folio *folio)
488	{
489	node_stat_mod_folio(folio,
490	item: NR_ISOLATED_ANON + folio_is_file_lru(folio),
491	nr: -folio_nr_pages(folio));
492	folio_unlock(folio);
493	folio_putback_lru(folio);
494	}
495
/* Page-based wrapper: release the folio that @page belongs to. */
static void release_pte_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	release_pte_folio(folio);
}
500
501	static void release_pte_pages(pte_t pte, pte_t _pte,
502	struct list_head *compound_pagelist)
503	{
504	struct folio folio, tmp;
505
506	while (--_pte >= pte) {
507	pte_t pteval = ptep_get(ptep: _pte);
508	unsigned long pfn;
509
510	if (pte_none(pte: pteval))
511	continue;
512	pfn = pte_pfn(pte: pteval);
513	if (is_zero_pfn(pfn))
514	continue;
515	folio = pfn_folio(pfn);
516	if (folio_test_large(folio))
517	continue;
518	release_pte_folio(folio);
519	}
520
521	list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
522	list_del(entry: &folio->lru);
523	release_pte_folio(folio);
524	}
525	}
526
527	static bool is_refcount_suitable(struct folio *folio)
528	{
529	int expected_refcount;
530
531	expected_refcount = folio_mapcount(folio);
532	if (folio_test_swapcache(folio))
533	expected_refcount += folio_nr_pages(folio);
534
535	return folio_ref_count(folio) == expected_refcount;
536	}
537
538	static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
539	unsigned long address,
540	pte_t *pte,
541	struct collapse_control *cc,
542	struct list_head *compound_pagelist)
543	{
544	struct page *page = NULL;
545	struct folio *folio = NULL;
546	pte_t *_pte;
547	int none_or_zero = `0`, shared = `0`, result = SCAN_FAIL, referenced = `0`;
548	bool writable = false;
549
550	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
551	_pte++, address += PAGE_SIZE) {
552	pte_t pteval = ptep_get(ptep: _pte);
553	if (pte_none(pte: pteval) \|\| (pte_present(a: pteval) &&
554	is_zero_pfn(pfn: pte_pfn(pte: pteval)))) {
555	++none_or_zero;
556	if (!userfaultfd_armed(vma) &&
557	(!cc->is_khugepaged \|\|
558	none_or_zero <= khugepaged_max_ptes_none)) {
559	continue;
560	} else {
561	result = SCAN_EXCEED_NONE_PTE;
562	count_vm_event(item: THP_SCAN_EXCEED_NONE_PTE);
563	goto out;
564	}
565	}
566	if (!pte_present(a: pteval)) {
567	result = SCAN_PTE_NON_PRESENT;
568	goto out;
569	}
570	if (pte_uffd_wp(pte: pteval)) {
571	result = SCAN_PTE_UFFD_WP;
572	goto out;
573	}
574	page = vm_normal_page(vma, addr: address, pte: pteval);
575	if (unlikely(!page) \|\| unlikely(is_zone_device_page(page))) {
576	result = SCAN_PAGE_NULL;
577	goto out;
578	}
579
580	folio = page_folio(page);
581	VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
582
583	if (page_mapcount(page) > `1`) {
584	++shared;
585	if (cc->is_khugepaged &&
586	shared > khugepaged_max_ptes_shared) {
587	result = SCAN_EXCEED_SHARED_PTE;
588	count_vm_event(item: THP_SCAN_EXCEED_SHARED_PTE);
589	goto out;
590	}
591	}
592
593	if (folio_test_large(folio)) {
594	struct folio *f;
595
596	/*
597	* Check if we have dealt with the compound page
598	* already
599	*/
600	list_for_each_entry(f, compound_pagelist, lru) {
601	if (folio == f)
602	goto next;
603	}
604	}
605
606	/*
607	* We can do it before isolate_lru_page because the
608	* page can't be freed from under us. NOTE: PG_lock
609	* is needed to serialize against split_huge_page
610	* when invoked from the VM.
611	*/
612	if (!folio_trylock(folio)) {
613	result = SCAN_PAGE_LOCK;
614	goto out;
615	}
616
617	/*
618	* Check if the page has any GUP (or other external) pins.
619	*
620	* The page table that maps the page has been already unlinked
621	* from the page table tree and this process cannot get
622	* an additional pin on the page.
623	*
624	* New pins can come later if the page is shared across fork,
625	* but not from this process. The other process cannot write to
626	* the page, only trigger CoW.
627	*/
628	if (!is_refcount_suitable(folio)) {
629	folio_unlock(folio);
630	result = SCAN_PAGE_COUNT;
631	goto out;
632	}
633
634	/*
635	* Isolate the page to avoid collapsing an hugepage
636	* currently in use by the VM.
637	*/
638	if (!folio_isolate_lru(folio)) {
639	folio_unlock(folio);
640	result = SCAN_DEL_PAGE_LRU;
641	goto out;
642	}
643	node_stat_mod_folio(folio,
644	item: NR_ISOLATED_ANON + folio_is_file_lru(folio),
645	nr: folio_nr_pages(folio));
646	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
647	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
648
649	if (folio_test_large(folio))
650	list_add_tail(new: &folio->lru, head: compound_pagelist);
651	next:
652	/*
653	* If collapse was initiated by khugepaged, check that there is
654	* enough young pte to justify collapsing the page
655	*/
656	if (cc->is_khugepaged &&
657	(pte_young(pte: pteval) \|\| folio_test_young(folio) \|\|
658	folio_test_referenced(folio) \|\| mmu_notifier_test_young(mm: vma->vm_mm,
659	address)))
660	referenced++;
661
662	if (pte_write(pte: pteval))
663	writable = true;
664	}
665
666	if (unlikely(!writable)) {
667	result = SCAN_PAGE_RO;
668	} else if (unlikely(cc->is_khugepaged && !referenced)) {
669	result = SCAN_LACK_REFERENCED_PAGE;
670	} else {
671	result = SCAN_SUCCEED;
672	trace_mm_collapse_huge_page_isolate(page: &folio->page, none_or_zero,
673	referenced, writable, status: result);
674	return result;
675	}
676	out:
677	release_pte_pages(pte, _pte, compound_pagelist);
678	trace_mm_collapse_huge_page_isolate(page: &folio->page, none_or_zero,
679	referenced, writable, status: result);
680	return result;
681	}
682
683	static void __collapse_huge_page_copy_succeeded(pte_t *pte,
684	struct vm_area_struct *vma,
685	unsigned long address,
686	spinlock_t *ptl,
687	struct list_head *compound_pagelist)
688	{
689	struct page *src_page;
690	struct page *tmp;
691	pte_t *_pte;
692	pte_t pteval;
693
694	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
695	_pte++, address += PAGE_SIZE) {
696	pteval = ptep_get(ptep: _pte);
697	if (pte_none(pte: pteval) \|\| is_zero_pfn(pfn: pte_pfn(pte: pteval))) {
698	add_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES, value: `1`);
699	if (is_zero_pfn(pfn: pte_pfn(pte: pteval))) {
700	/*
701	* ptl mostly unnecessary.
702	*/
703	spin_lock(lock: ptl);
704	ptep_clear(mm: vma->vm_mm, addr: address, ptep: _pte);
705	spin_unlock(lock: ptl);
706	ksm_might_unmap_zero_page(mm: vma->vm_mm, pte: pteval);
707	}
708	} else {
709	src_page = pte_page(pteval);
710	if (!PageCompound(page: src_page))
711	release_pte_page(page: src_page);
712	/*
713	* ptl mostly unnecessary, but preempt has to
714	* be disabled to update the per-cpu stats
715	* inside page_remove_rmap().
716	*/
717	spin_lock(lock: ptl);
718	ptep_clear(mm: vma->vm_mm, addr: address, ptep: _pte);
719	page_remove_rmap(src_page, vma, compound: false);
720	spin_unlock(lock: ptl);
721	free_page_and_swap_cache(src_page);
722	}
723	}
724
725	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
726	list_del(entry: &src_page->lru);
727	mod_node_page_state(page_pgdat(page: src_page),
728	NR_ISOLATED_ANON + page_is_file_lru(page: src_page),
729	-compound_nr(page: src_page));
730	unlock_page(page: src_page);
731	free_swap_cache(page: src_page);
732	putback_lru_page(page: src_page);
733	}
734	}
735
736	static void __collapse_huge_page_copy_failed(pte_t *pte,
737	pmd_t *pmd,
738	pmd_t orig_pmd,
739	struct vm_area_struct *vma,
740	struct list_head *compound_pagelist)
741	{
742	spinlock_t *pmd_ptl;
743
744	/*
745	* Re-establish the PMD to point to the original page table
746	* entry. Restoring PMD needs to be done prior to releasing
747	* pages. Since pages are still isolated and locked here,
748	* acquiring anon_vma_lock_write is unnecessary.
749	*/
750	pmd_ptl = pmd_lock(mm: vma->vm_mm, pmd);
751	pmd_populate(mm: vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
752	spin_unlock(lock: pmd_ptl);
753	/*
754	* Release both raw and compound pages isolated
755	* in __collapse_huge_page_isolate.
756	*/
757	release_pte_pages(pte, pte: pte + HPAGE_PMD_NR, compound_pagelist);
758	}
759
760	/*
761	* __collapse_huge_page_copy - attempts to copy memory contents from raw
762	* pages to a hugepage. Cleans up the raw pages if copying succeeds;
763	* otherwise restores the original page table and releases isolated raw pages.
764	* Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
765	*
766	* @pte: starting of the PTEs to copy from
767	* @page: the new hugepage to copy contents to
768	* @pmd: pointer to the new hugepage's PMD
769	* @orig_pmd: the original raw pages' PMD
770	* @vma: the original raw pages' virtual memory area
771	* @address: starting address to copy
772	* @ptl: lock on raw pages' PTEs
773	* @compound_pagelist: list that stores compound pages
774	*/
775	static int __collapse_huge_page_copy(pte_t *pte,
776	struct page *page,
777	pmd_t *pmd,
778	pmd_t orig_pmd,
779	struct vm_area_struct *vma,
780	unsigned long address,
781	spinlock_t *ptl,
782	struct list_head *compound_pagelist)
783	{
784	struct page *src_page;
785	pte_t *_pte;
786	pte_t pteval;
787	unsigned long _address;
788	int result = SCAN_SUCCEED;
789
790	/*
791	* Copying pages' contents is subject to memory poison at any iteration.
792	*/
793	for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
794	_pte++, page++, _address += PAGE_SIZE) {
795	pteval = ptep_get(ptep: _pte);
796	if (pte_none(pte: pteval) \|\| is_zero_pfn(pfn: pte_pfn(pte: pteval))) {
797	clear_user_highpage(page, vaddr: _address);
798	continue;
799	}
800	src_page = pte_page(pteval);
801	if (copy_mc_user_highpage(to: page, from: src_page, vaddr: _address, vma) > `0`) {
802	result = SCAN_COPY_MC;
803	break;
804	}
805	}
806
807	if (likely(result == SCAN_SUCCEED))
808	__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
809	compound_pagelist);
810	else
811	__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
812	compound_pagelist);
813
814	return result;
815	}
816
817	static void khugepaged_alloc_sleep(void)
818	{
819	DEFINE_WAIT(wait);
820
821	add_wait_queue(wq_head: &khugepaged_wait, wq_entry: &wait);
822	__set_current_state(TASK_INTERRUPTIBLE\|TASK_FREEZABLE);
823	schedule_timeout(timeout: msecs_to_jiffies(m: khugepaged_alloc_sleep_millisecs));
824	remove_wait_queue(wq_head: &khugepaged_wait, wq_entry: &wait);
825	}
826
/* Collapse control with is_khugepaged set, i.e. for khugepaged-initiated collapse. */
struct collapse_control khugepaged_collapse_control = {
	.is_khugepaged = true,
};
830
831	static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
832	{
833	int i;
834
835	/*
836	* If node_reclaim_mode is disabled, then no extra effort is made to
837	* allocate memory locally.
838	*/
839	if (!node_reclaim_enabled())
840	return false;
841
842	/ If there is a count for this node already, it must be acceptable /
843	if (cc->node_load[nid])
844	return false;
845
846	for (i = `0`; i < MAX_NUMNODES; i++) {
847	if (!cc->node_load[i])
848	continue;
849	if (node_distance(nid, i) > node_reclaim_distance)
850	return true;
851	}
852	return false;
853	}
854
855	#define khugepaged_defrag() \
856	(transparent_hugepage_flags & \
857	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
858
859	/ Defrag for khugepaged will enter direct reclaim/compaction if necessary /
860	static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
861	{
862	return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
863	}
864
#ifdef CONFIG_NUMA
/*
 * Pick the allocation node for a collapse: the first node with the highest
 * cc->node_load count. Every online node whose count equals that maximum
 * is also set in cc->alloc_nmask.
 */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
	int nid, target_node = 0, max_value = 0;

	/* find first node with max normal pages hit */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (cc->node_load[nid] > max_value) {
			max_value = cc->node_load[nid];
			target_node = nid;
		}

	for_each_online_node(nid) {
		if (max_value == cc->node_load[nid])
			node_set(nid, cc->alloc_nmask);
	}

	return target_node;
}
#else
/* Without CONFIG_NUMA the target node is always 0. */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
	return 0;
}
#endif
890
891	static bool hpage_collapse_alloc_folio(struct folio *folio, gfp_t gfp, int* node,
892	nodemask_t *nmask)
893	{
894	*folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, preferred_nid: node, nodemask: nmask);
895
896	if (unlikely(!*folio)) {
897	count_vm_event(item: THP_COLLAPSE_ALLOC_FAILED);
898	return false;
899	}
900
901	count_vm_event(item: THP_COLLAPSE_ALLOC);
902	return true;
903	}
904
905	/*
906	* If mmap_lock temporarily dropped, revalidate vma
907	* before taking mmap_lock.
908	* Returns enum scan_result value.
909	*/
910
911	static int hugepage_vma_revalidate(struct mm_struct mm, unsigned* long address,
912	bool expect_anon,
913	struct vm_area_struct **vmap,
914	struct collapse_control *cc)
915	{
916	struct vm_area_struct *vma;
917
918	if (unlikely(hpage_collapse_test_exit(mm)))
919	return SCAN_ANY_PROCESS;
920
921	*vmap = vma = find_vma(mm, addr: address);
922	if (!vma)
923	return SCAN_VMA_NULL;
924
925	if (!transhuge_vma_suitable(vma, addr: address))
926	return SCAN_ADDRESS_RANGE;
927	if (!hugepage_vma_check(vma, vm_flags: vma->vm_flags, smaps: false, in_pf: false,
928	enforce_sysfs: cc->is_khugepaged))
929	return SCAN_VMA_CHECK;
930	/*
931	* Anon VMA expected, the address may be unmapped then
932	* remapped to file after khugepaged reaquired the mmap_lock.
933	*
934	* hugepage_vma_check may return true for qualified file
935	* vmas.
936	*/
937	if (expect_anon && (!(vmap)->anon_vma \|\| !vma_is_anonymous(vma: vmap)))
938	return SCAN_PAGE_ANON;
939	return SCAN_SUCCEED;
940	}
941
942	static int find_pmd_or_thp_or_none(struct mm_struct *mm,
943	unsigned long address,
944	pmd_t **pmd)
945	{
946	pmd_t pmde;
947
948	*pmd = mm_find_pmd(mm, address);
949	if (!*pmd)
950	return SCAN_PMD_NULL;
951
952	pmde = pmdp_get_lockless(pmdp: *pmd);
953	if (pmd_none(pmd: pmde))
954	return SCAN_PMD_NONE;
955	if (!pmd_present(pmd: pmde))
956	return SCAN_PMD_NULL;
957	if (pmd_trans_huge(pmd: pmde))
958	return SCAN_PMD_MAPPED;
959	if (pmd_devmap(pmd: pmde))
960	return SCAN_PMD_NULL;
961	if (pmd_bad(pmd: pmde))
962	return SCAN_PMD_NULL;
963	return SCAN_SUCCEED;
964	}
965
966	static int check_pmd_still_valid(struct mm_struct *mm,
967	unsigned long address,
968	pmd_t *pmd)
969	{
970	pmd_t *new_pmd;
971	int result = find_pmd_or_thp_or_none(mm, address, pmd: &new_pmd);
972
973	if (result != SCAN_SUCCEED)
974	return result;
975	if (new_pmd != pmd)
976	return SCAN_FAIL;
977	return SCAN_SUCCEED;
978	}
979
980	/*
981	* Bring missing pages in from swap, to complete THP collapse.
982	* Only done if hpage_collapse_scan_pmd believes it is worthwhile.
983	*
984	* Called and returns without pte mapped or spinlocks held.
985	* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
986	*/
987	static int __collapse_huge_page_swapin(struct mm_struct *mm,
988	struct vm_area_struct *vma,
989	unsigned long haddr, pmd_t *pmd,
990	int referenced)
991	{
992	int swapped_in = `0`;
993	vm_fault_t ret = `0`;
994	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
995	int result;
996	pte_t *pte = NULL;
997	spinlock_t *ptl;
998
999	for (address = haddr; address < end; address += PAGE_SIZE) {
1000	struct vm_fault vmf = {
1001	.vma = vma,
1002	.address = address,
1003	.pgoff = linear_page_index(vma, address),
1004	.flags = FAULT_FLAG_ALLOW_RETRY,
1005	.pmd = pmd,
1006	};
1007
1008	if (!pte++) {
1009	pte = pte_offset_map_nolock(mm, pmd, addr: address, ptlp: &ptl);
1010	if (!pte) {
1011	mmap_read_unlock(mm);
1012	result = SCAN_PMD_NULL;
1013	goto out;
1014	}
1015	}
1016
1017	vmf.orig_pte = ptep_get_lockless(ptep: pte);
1018	if (!is_swap_pte(pte: vmf.orig_pte))
1019	continue;
1020
1021	vmf.pte = pte;
1022	vmf.ptl = ptl;
1023	ret = do_swap_page(vmf: &vmf);
1024	/ Which unmaps pte (after perhaps re-checking the entry) /
1025	pte = NULL;
1026
1027	/*
1028	* do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
1029	* Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
1030	* we do not retry here and swap entry will remain in pagetable
1031	* resulting in later failure.
1032	*/
1033	if (ret & VM_FAULT_RETRY) {
1034	/ Likely, but not guaranteed, that page lock failed /
1035	result = SCAN_PAGE_LOCK;
1036	goto out;
1037	}
1038	if (ret & VM_FAULT_ERROR) {
1039	mmap_read_unlock(mm);
1040	result = SCAN_FAIL;
1041	goto out;
1042	}
1043	swapped_in++;
1044	}
1045
1046	if (pte)
1047	pte_unmap(pte);
1048
1049	/ Drain LRU cache to remove extra pin on the swapped in pages /
1050	if (swapped_in)
1051	lru_add_drain();
1052
1053	result = SCAN_SUCCEED;
1054	out:
1055	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, ret: result);
1056	return result;
1057	}
1058
1059	static int alloc_charge_hpage(struct page hpage, struct** mm_struct *mm,
1060	struct collapse_control *cc)
1061	{
1062	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
1063	GFP_TRANSHUGE);
1064	int node = hpage_collapse_find_target_node(cc);
1065	struct folio *folio;
1066
1067	if (!hpage_collapse_alloc_folio(folio: &folio, gfp, node, nmask: &cc->alloc_nmask)) {
1068	*hpage = NULL;
1069	return SCAN_ALLOC_HUGE_PAGE_FAIL;
1070	}
1071
1072	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
1073	folio_put(folio);
1074	*hpage = NULL;
1075	return SCAN_CGROUP_CHARGE_FAIL;
1076	}
1077
1078	count_memcg_folio_events(folio, idx: THP_COLLAPSE_ALLOC, nr: `1`);
1079
1080	*hpage = folio_page(folio, `0`);
1081	return SCAN_SUCCEED;
1082	}
1083
1084	static int collapse_huge_page(struct mm_struct mm, unsigned* long address,
1085	int referenced, int unmapped,
1086	struct collapse_control *cc)
1087	{
1088	LIST_HEAD(compound_pagelist);
1089	pmd_t *pmd, _pmd;
1090	pte_t *pte;
1091	pgtable_t pgtable;
1092	struct page *hpage;
1093	spinlock_t pmd_ptl, pte_ptl;
1094	int result = SCAN_FAIL;
1095	struct vm_area_struct *vma;
1096	struct mmu_notifier_range range;
1097
1098	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1099
1100	/*
1101	* Before allocating the hugepage, release the mmap_lock read lock.
1102	* The allocation can take potentially a long time if it involves
1103	* sync compaction, and we do not need to hold the mmap_lock during
1104	* that. We will recheck the vma after taking it again in write mode.
1105	*/
1106	mmap_read_unlock(mm);
1107
1108	result = alloc_charge_hpage(hpage: &hpage, mm, cc);
1109	if (result != SCAN_SUCCEED)
1110	goto out_nolock;
1111
1112	mmap_read_lock(mm);
1113	result = hugepage_vma_revalidate(mm, address, expect_anon: true, vmap: &vma, cc);
1114	if (result != SCAN_SUCCEED) {
1115	mmap_read_unlock(mm);
1116	goto out_nolock;
1117	}
1118
1119	result = find_pmd_or_thp_or_none(mm, address, pmd: &pmd);
1120	if (result != SCAN_SUCCEED) {
1121	mmap_read_unlock(mm);
1122	goto out_nolock;
1123	}
1124
1125	if (unmapped) {
1126	/*
1127	* __collapse_huge_page_swapin will return with mmap_lock
1128	* released when it fails. So we jump out_nolock directly in
1129	* that case. Continuing to collapse causes inconsistency.
1130	*/
1131	result = __collapse_huge_page_swapin(mm, vma, haddr: address, pmd,
1132	referenced);
1133	if (result != SCAN_SUCCEED)
1134	goto out_nolock;
1135	}
1136
1137	mmap_read_unlock(mm);
1138	/*
1139	* Prevent all access to pagetables with the exception of
1140	* gup_fast later handled by the ptep_clear_flush and the VM
1141	* handled by the anon_vma lock + PG_lock.
1142	*/
1143	mmap_write_lock(mm);
1144	result = hugepage_vma_revalidate(mm, address, expect_anon: true, vmap: &vma, cc);
1145	if (result != SCAN_SUCCEED)
1146	goto out_up_write;
1147	/ check if the pmd is still valid /
1148	result = check_pmd_still_valid(mm, address, pmd);
1149	if (result != SCAN_SUCCEED)
1150	goto out_up_write;
1151
1152	vma_start_write(vma);
1153	anon_vma_lock_write(anon_vma: vma->anon_vma);
1154
1155	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm, start: address,
1156	end: address + HPAGE_PMD_SIZE);
1157	mmu_notifier_invalidate_range_start(range: &range);
1158
1159	pmd_ptl = pmd_lock(mm, pmd); / probably unnecessary /
1160	/*
1161	* This removes any huge TLB entry from the CPU so we won't allow
1162	* huge and small TLB entries for the same virtual address to
1163	* avoid the risk of CPU bugs in that area.
1164	*
1165	* Parallel fast GUP is fine since fast GUP will back off when
1166	* it detects PMD is changed.
1167	*/
1168	_pmd = pmdp_collapse_flush(vma, address, pmdp: pmd);
1169	spin_unlock(lock: pmd_ptl);
1170	mmu_notifier_invalidate_range_end(range: &range);
1171	tlb_remove_table_sync_one();
1172
1173	pte = pte_offset_map_lock(mm, pmd: &_pmd, addr: address, ptlp: &pte_ptl);
1174	if (pte) {
1175	result = __collapse_huge_page_isolate(vma, address, pte, cc,
1176	compound_pagelist: &compound_pagelist);
1177	spin_unlock(lock: pte_ptl);
1178	} else {
1179	result = SCAN_PMD_NULL;
1180	}
1181
1182	if (unlikely(result != SCAN_SUCCEED)) {
1183	if (pte)
1184	pte_unmap(pte);
1185	spin_lock(lock: pmd_ptl);
1186	BUG_ON(!pmd_none(*pmd));
1187	/*
1188	* We can only use set_pmd_at when establishing
1189	* hugepmds and never for establishing regular pmds that
1190	* points to regular pagetables. Use pmd_populate for that
1191	*/
1192	pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1193	spin_unlock(lock: pmd_ptl);
1194	anon_vma_unlock_write(anon_vma: vma->anon_vma);
1195	goto out_up_write;
1196	}
1197
1198	/*
1199	* All pages are isolated and locked so anon_vma rmap
1200	* can't run anymore.
1201	*/
1202	anon_vma_unlock_write(anon_vma: vma->anon_vma);
1203
1204	result = __collapse_huge_page_copy(pte, page: hpage, pmd, orig_pmd: _pmd,
1205	vma, address, ptl: pte_ptl,
1206	compound_pagelist: &compound_pagelist);
1207	pte_unmap(pte);
1208	if (unlikely(result != SCAN_SUCCEED))
1209	goto out_up_write;
1210
1211	/*
1212	* spin_lock() below is not the equivalent of smp_wmb(), but
1213	* the smp_wmb() inside __SetPageUptodate() can be reused to
1214	* avoid the copy_huge_page writes to become visible after
1215	* the set_pmd_at() write.
1216	*/
1217	__SetPageUptodate(page: hpage);
1218	pgtable = pmd_pgtable(_pmd);
1219
1220	_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
1221	_pmd = maybe_pmd_mkwrite(pmd: pmd_mkdirty(pmd: _pmd), vma);
1222
1223	spin_lock(lock: pmd_ptl);
1224	BUG_ON(!pmd_none(*pmd));
1225	page_add_new_anon_rmap(hpage, vma, address);
1226	lru_cache_add_inactive_or_unevictable(page: hpage, vma);
1227	pgtable_trans_huge_deposit(mm, pmdp: pmd, pgtable);
1228	set_pmd_at(mm, addr: address, pmdp: pmd, pmd: _pmd);
1229	update_mmu_cache_pmd(vma, addr: address, pmd);
1230	spin_unlock(lock: pmd_ptl);
1231
1232	hpage = NULL;
1233
1234	result = SCAN_SUCCEED;
1235	out_up_write:
1236	mmap_write_unlock(mm);
1237	out_nolock:
1238	if (hpage)
1239	put_page(page: hpage);
1240	trace_mm_collapse_huge_page(mm, isolated: result == SCAN_SUCCEED, status: result);
1241	return result;
1242	}
1243
1244	static int hpage_collapse_scan_pmd(struct mm_struct *mm,
1245	struct vm_area_struct *vma,
1246	unsigned long address, bool *mmap_locked,
1247	struct collapse_control *cc)
1248	{
1249	pmd_t *pmd;
1250	pte_t pte, _pte;
1251	int result = SCAN_FAIL, referenced = `0`;
1252	int none_or_zero = `0`, shared = `0`;
1253	struct page *page = NULL;
1254	struct folio *folio = NULL;
1255	unsigned long _address;
1256	spinlock_t *ptl;
1257	int node = NUMA_NO_NODE, unmapped = `0`;
1258	bool writable = false;
1259
1260	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1261
1262	result = find_pmd_or_thp_or_none(mm, address, pmd: &pmd);
1263	if (result != SCAN_SUCCEED)
1264	goto out;
1265
1266	memset(cc->node_load, `0`, sizeof(cc->node_load));
1267	nodes_clear(cc->alloc_nmask);
1268	pte = pte_offset_map_lock(mm, pmd, addr: address, ptlp: &ptl);
1269	if (!pte) {
1270	result = SCAN_PMD_NULL;
1271	goto out;
1272	}
1273
1274	for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
1275	_pte++, _address += PAGE_SIZE) {
1276	pte_t pteval = ptep_get(ptep: _pte);
1277	if (is_swap_pte(pte: pteval)) {
1278	++unmapped;
1279	if (!cc->is_khugepaged \|\|
1280	unmapped <= khugepaged_max_ptes_swap) {
1281	/*
1282	* Always be strict with uffd-wp
1283	* enabled swap entries. Please see
1284	* comment below for pte_uffd_wp().
1285	*/
1286	if (pte_swp_uffd_wp_any(pte: pteval)) {
1287	result = SCAN_PTE_UFFD_WP;
1288	goto out_unmap;
1289	}
1290	continue;
1291	} else {
1292	result = SCAN_EXCEED_SWAP_PTE;
1293	count_vm_event(item: THP_SCAN_EXCEED_SWAP_PTE);
1294	goto out_unmap;
1295	}
1296	}
1297	if (pte_none(pte: pteval) \|\| is_zero_pfn(pfn: pte_pfn(pte: pteval))) {
1298	++none_or_zero;
1299	if (!userfaultfd_armed(vma) &&
1300	(!cc->is_khugepaged \|\|
1301	none_or_zero <= khugepaged_max_ptes_none)) {
1302	continue;
1303	} else {
1304	result = SCAN_EXCEED_NONE_PTE;
1305	count_vm_event(item: THP_SCAN_EXCEED_NONE_PTE);
1306	goto out_unmap;
1307	}
1308	}
1309	if (pte_uffd_wp(pte: pteval)) {
1310	/*
1311	* Don't collapse the page if any of the small
1312	* PTEs are armed with uffd write protection.
1313	* Here we can also mark the new huge pmd as
1314	* write protected if any of the small ones is
1315	* marked but that could bring unknown
1316	* userfault messages that falls outside of
1317	* the registered range. So, just be simple.
1318	*/
1319	result = SCAN_PTE_UFFD_WP;
1320	goto out_unmap;
1321	}
1322	if (pte_write(pte: pteval))
1323	writable = true;
1324
1325	page = vm_normal_page(vma, addr: _address, pte: pteval);
1326	if (unlikely(!page) \|\| unlikely(is_zone_device_page(page))) {
1327	result = SCAN_PAGE_NULL;
1328	goto out_unmap;
1329	}
1330
1331	if (page_mapcount(page) > `1`) {
1332	++shared;
1333	if (cc->is_khugepaged &&
1334	shared > khugepaged_max_ptes_shared) {
1335	result = SCAN_EXCEED_SHARED_PTE;
1336	count_vm_event(item: THP_SCAN_EXCEED_SHARED_PTE);
1337	goto out_unmap;
1338	}
1339	}
1340
1341	folio = page_folio(page);
1342	/*
1343	* Record which node the original page is from and save this
1344	* information to cc->node_load[].
1345	* Khugepaged will allocate hugepage from the node has the max
1346	* hit record.
1347	*/
1348	node = folio_nid(folio);
1349	if (hpage_collapse_scan_abort(nid: node, cc)) {
1350	result = SCAN_SCAN_ABORT;
1351	goto out_unmap;
1352	}
1353	cc->node_load[node]++;
1354	if (!folio_test_lru(folio)) {
1355	result = SCAN_PAGE_LRU;
1356	goto out_unmap;
1357	}
1358	if (folio_test_locked(folio)) {
1359	result = SCAN_PAGE_LOCK;
1360	goto out_unmap;
1361	}
1362	if (!folio_test_anon(folio)) {
1363	result = SCAN_PAGE_ANON;
1364	goto out_unmap;
1365	}
1366
1367	/*
1368	* Check if the page has any GUP (or other external) pins.
1369	*
1370	* Here the check may be racy:
1371	* it may see total_mapcount > refcount in some cases?
1372	* But such case is ephemeral we could always retry collapse
1373	* later. However it may report false positive if the page
1374	* has excessive GUP pins (i.e. 512). Anyway the same check
1375	* will be done again later the risk seems low.
1376	*/
1377	if (!is_refcount_suitable(folio)) {
1378	result = SCAN_PAGE_COUNT;
1379	goto out_unmap;
1380	}
1381
1382	/*
1383	* If collapse was initiated by khugepaged, check that there is
1384	* enough young pte to justify collapsing the page
1385	*/
1386	if (cc->is_khugepaged &&
1387	(pte_young(pte: pteval) \|\| folio_test_young(folio) \|\|
1388	folio_test_referenced(folio) \|\| mmu_notifier_test_young(mm: vma->vm_mm,
1389	address)))
1390	referenced++;
1391	}
1392	if (!writable) {
1393	result = SCAN_PAGE_RO;
1394	} else if (cc->is_khugepaged &&
1395	(!referenced \|\|
1396	(unmapped && referenced < HPAGE_PMD_NR / `2`))) {
1397	result = SCAN_LACK_REFERENCED_PAGE;
1398	} else {
1399	result = SCAN_SUCCEED;
1400	}
1401	out_unmap:
1402	pte_unmap_unlock(pte, ptl);
1403	if (result == SCAN_SUCCEED) {
1404	result = collapse_huge_page(mm, address, referenced,
1405	unmapped, cc);
1406	/ collapse_huge_page will return with the mmap_lock released /
1407	*mmap_locked = false;
1408	}
1409	out:
1410	trace_mm_khugepaged_scan_pmd(mm, page: &folio->page, writable, referenced,
1411	none_or_zero, status: result, unmapped);
1412	return result;
1413	}
1414
1415	static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
1416	{
1417	struct mm_slot *slot = &mm_slot->slot;
1418	struct mm_struct *mm = slot->mm;
1419
1420	lockdep_assert_held(&khugepaged_mm_lock);
1421
1422	if (hpage_collapse_test_exit(mm)) {
1423	/ free mm_slot /
1424	hash_del(node: &slot->hash);
1425	list_del(entry: &slot->mm_node);
1426
1427	/*
1428	* Not strictly needed because the mm exited already.
1429	*
1430	* clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1431	*/
1432
1433	/ khugepaged_mm_lock actually not necessary for the below /
1434	mm_slot_free(cache: mm_slot_cache, objp: mm_slot);
1435	mmdrop(mm);
1436	}
1437	}
1438
1439	#ifdef CONFIG_SHMEM
1440	/ hpage must be locked, and mmap_lock must be held /
1441	static int set_huge_pmd(struct vm_area_struct vma, unsigned* long addr,
1442	pmd_t pmdp, struct* page *hpage)
1443	{
1444	struct vm_fault vmf = {
1445	.vma = vma,
1446	.address = addr,
1447	.flags = `0`,
1448	.pmd = pmdp,
1449	};
1450
1451	VM_BUG_ON(!PageTransHuge(hpage));
1452	mmap_assert_locked(mm: vma->vm_mm);
1453
1454	if (do_set_pmd(vmf: &vmf, page: hpage))
1455	return SCAN_FAIL;
1456
1457	get_page(page: hpage);
1458	return SCAN_SUCCEED;
1459	}
1460
1461	/**
1462	* collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
1463	* address haddr.
1464	*
1465	* @mm: process address space where collapse happens
1466	* @addr: THP collapse address
1467	* @install_pmd: If a huge PMD should be installed
1468	*
1469	* This function checks whether all the PTEs in the PMD are pointing to the
1470	* right THP. If so, retract the page table so the THP can refault in with
1471	* as pmd-mapped. Possibly install a huge PMD mapping the THP.
1472	*/
1473	int collapse_pte_mapped_thp(struct mm_struct mm, unsigned* long addr,
1474	bool install_pmd)
1475	{
1476	struct mmu_notifier_range range;
1477	bool notified = false;
1478	unsigned long haddr = addr & HPAGE_PMD_MASK;
1479	struct vm_area_struct *vma = vma_lookup(mm, addr: haddr);
1480	struct folio *folio;
1481	pte_t start_pte, pte;
1482	pmd_t *pmd, pgt_pmd;
1483	spinlock_t pml = NULL, ptl;
1484	int nr_ptes = `0`, result = SCAN_FAIL;
1485	int i;
1486
1487	mmap_assert_locked(mm);
1488
1489	/ First check VMA found, in case page tables are being torn down /
1490	if (!vma \|\| !vma->vm_file \|\|
1491	!range_in_vma(vma, start: haddr, end: haddr + HPAGE_PMD_SIZE))
1492	return SCAN_VMA_CHECK;
1493
1494	/ Fast check before locking page if already PMD-mapped /
1495	result = find_pmd_or_thp_or_none(mm, address: haddr, pmd: &pmd);
1496	if (result == SCAN_PMD_MAPPED)
1497	return result;
1498
1499	/*
1500	* If we are here, we've succeeded in replacing all the native pages
1501	* in the page cache with a single hugepage. If a mm were to fault-in
1502	* this memory (mapped by a suitably aligned VMA), we'd get the hugepage
1503	* and map it by a PMD, regardless of sysfs THP settings. As such, let's
1504	* analogously elide sysfs THP settings here.
1505	*/
1506	if (!hugepage_vma_check(vma, vm_flags: vma->vm_flags, smaps: false, in_pf: false, enforce_sysfs: false))
1507	return SCAN_VMA_CHECK;
1508
1509	/ Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() /
1510	if (userfaultfd_wp(vma))
1511	return SCAN_PTE_UFFD_WP;
1512
1513	folio = filemap_lock_folio(mapping: vma->vm_file->f_mapping,
1514	index: linear_page_index(vma, address: haddr));
1515	if (IS_ERR(ptr: folio))
1516	return SCAN_PAGE_NULL;
1517
1518	if (folio_order(folio) != HPAGE_PMD_ORDER) {
1519	result = SCAN_PAGE_COMPOUND;
1520	goto drop_folio;
1521	}
1522
1523	result = find_pmd_or_thp_or_none(mm, address: haddr, pmd: &pmd);
1524	switch (result) {
1525	case SCAN_SUCCEED:
1526	break;
1527	case SCAN_PMD_NONE:
1528	/*
1529	* All pte entries have been removed and pmd cleared.
1530	* Skip all the pte checks and just update the pmd mapping.
1531	*/
1532	goto maybe_install_pmd;
1533	default:
1534	goto drop_folio;
1535	}
1536
1537	result = SCAN_FAIL;
1538	start_pte = pte_offset_map_lock(mm, pmd, addr: haddr, ptlp: &ptl);
1539	if (!start_pte) / mmap_lock + page lock should prevent this /
1540	goto drop_folio;
1541
1542	/ step 1: check all mapped PTEs are to the right huge page /
1543	for (i = `0`, addr = haddr, pte = start_pte;
1544	i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1545	struct page *page;
1546	pte_t ptent = ptep_get(ptep: pte);
1547
1548	/ empty pte, skip /
1549	if (pte_none(pte: ptent))
1550	continue;
1551
1552	/ page swapped out, abort /
1553	if (!pte_present(a: ptent)) {
1554	result = SCAN_PTE_NON_PRESENT;
1555	goto abort;
1556	}
1557
1558	page = vm_normal_page(vma, addr, pte: ptent);
1559	if (WARN_ON_ONCE(page && is_zone_device_page(page)))
1560	page = NULL;
1561	/*
1562	* Note that uprobe, debugger, or MAP_PRIVATE may change the
1563	* page table, but the new page will not be a subpage of hpage.
1564	*/
1565	if (folio_page(folio, i) != page)
1566	goto abort;
1567	}
1568
1569	pte_unmap_unlock(start_pte, ptl);
1570	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm,
1571	start: haddr, end: haddr + HPAGE_PMD_SIZE);
1572	mmu_notifier_invalidate_range_start(range: &range);
1573	notified = true;
1574
1575	/*
1576	* pmd_lock covers a wider range than ptl, and (if split from mm's
1577	* page_table_lock) ptl nests inside pml. The less time we hold pml,
1578	* the better; but userfaultfd's mfill_atomic_pte() on a private VMA
1579	* inserts a valid as-if-COWed PTE without even looking up page cache.
1580	* So page lock of folio does not protect from it, so we must not drop
1581	* ptl before pgt_pmd is removed, so uffd private needs pml taken now.
1582	*/
1583	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
1584	pml = pmd_lock(mm, pmd);
1585
1586	start_pte = pte_offset_map_nolock(mm, pmd, addr: haddr, ptlp: &ptl);
1587	if (!start_pte) / mmap_lock + page lock should prevent this /
1588	goto abort;
1589	if (!pml)
1590	spin_lock(lock: ptl);
1591	else if (ptl != pml)
1592	spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1593
1594	/ step 2: clear page table and adjust rmap /
1595	for (i = `0`, addr = haddr, pte = start_pte;
1596	i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1597	struct page *page;
1598	pte_t ptent = ptep_get(ptep: pte);
1599
1600	if (pte_none(pte: ptent))
1601	continue;
1602	/*
1603	* We dropped ptl after the first scan, to do the mmu_notifier:
1604	* page lock stops more PTEs of the folio being faulted in, but
1605	* does not stop write faults COWing anon copies from existing
1606	* PTEs; and does not stop those being swapped out or migrated.
1607	*/
1608	if (!pte_present(a: ptent)) {
1609	result = SCAN_PTE_NON_PRESENT;
1610	goto abort;
1611	}
1612	page = vm_normal_page(vma, addr, pte: ptent);
1613	if (folio_page(folio, i) != page)
1614	goto abort;
1615
1616	/*
1617	* Must clear entry, or a racing truncate may re-remove it.
1618	* TLB flush can be left until pmdp_collapse_flush() does it.
1619	* PTE dirty? Shmem page is already dirty; file is read-only.
1620	*/
1621	ptep_clear(mm, addr, ptep: pte);
1622	page_remove_rmap(page, vma, compound: false);
1623	nr_ptes++;
1624	}
1625
1626	pte_unmap(pte: start_pte);
1627	if (!pml)
1628	spin_unlock(lock: ptl);
1629
1630	/ step 3: set proper refcount and mm_counters. /
1631	if (nr_ptes) {
1632	folio_ref_sub(folio, nr: nr_ptes);
1633	add_mm_counter(mm, member: mm_counter_file(page: &folio->page), value: -nr_ptes);
1634	}
1635
1636	/ step 4: remove empty page table /
1637	if (!pml) {
1638	pml = pmd_lock(mm, pmd);
1639	if (ptl != pml)
1640	spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1641	}
1642	pgt_pmd = pmdp_collapse_flush(vma, address: haddr, pmdp: pmd);
1643	pmdp_get_lockless_sync();
1644	if (ptl != pml)
1645	spin_unlock(lock: ptl);
1646	spin_unlock(lock: pml);
1647
1648	mmu_notifier_invalidate_range_end(range: &range);
1649
1650	mm_dec_nr_ptes(mm);
1651	page_table_check_pte_clear_range(mm, addr: haddr, pmd: pgt_pmd);
1652	pte_free_defer(mm, pmd_pgtable(pgt_pmd));
1653
1654	maybe_install_pmd:
1655	/ step 5: install pmd entry /
1656	result = install_pmd
1657	? set_huge_pmd(vma, addr: haddr, pmdp: pmd, hpage: &folio->page)
1658	: SCAN_SUCCEED;
1659	goto drop_folio;
1660	abort:
1661	if (nr_ptes) {
1662	flush_tlb_mm(mm);
1663	folio_ref_sub(folio, nr: nr_ptes);
1664	add_mm_counter(mm, member: mm_counter_file(page: &folio->page), value: -nr_ptes);
1665	}
1666	if (start_pte)
1667	pte_unmap_unlock(start_pte, ptl);
1668	if (pml && pml != ptl)
1669	spin_unlock(lock: pml);
1670	if (notified)
1671	mmu_notifier_invalidate_range_end(range: &range);
1672	drop_folio:
1673	folio_unlock(folio);
1674	folio_put(folio);
1675	return result;
1676	}
1677
1678	static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1679	{
1680	struct vm_area_struct *vma;
1681
1682	i_mmap_lock_read(mapping);
1683	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1684	struct mmu_notifier_range range;
1685	struct mm_struct *mm;
1686	unsigned long addr;
1687	pmd_t *pmd, pgt_pmd;
1688	spinlock_t *pml;
1689	spinlock_t *ptl;
1690	bool skipped_uffd = false;
1691
1692	/*
1693	* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
1694	* got written to. These VMAs are likely not worth removing
1695	* page tables from, as PMD-mapping is likely to be split later.
1696	*/
1697	if (READ_ONCE(vma->anon_vma))
1698	continue;
1699
1700	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1701	if (addr & ~HPAGE_PMD_MASK \|\|
1702	vma->vm_end < addr + HPAGE_PMD_SIZE)
1703	continue;
1704
1705	mm = vma->vm_mm;
1706	if (find_pmd_or_thp_or_none(mm, address: addr, pmd: &pmd) != SCAN_SUCCEED)
1707	continue;
1708
1709	if (hpage_collapse_test_exit(mm))
1710	continue;
1711	/*
1712	* When a vma is registered with uffd-wp, we cannot recycle
1713	* the page table because there may be pte markers installed.
1714	* Other vmas can still have the same file mapped hugely, but
1715	* skip this one: it will always be mapped in small page size
1716	* for uffd-wp registered ranges.
1717	*/
1718	if (userfaultfd_wp(vma))
1719	continue;
1720
1721	/ PTEs were notified when unmapped; but now for the PMD? /
1722	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm,
1723	start: addr, end: addr + HPAGE_PMD_SIZE);
1724	mmu_notifier_invalidate_range_start(range: &range);
1725
1726	pml = pmd_lock(mm, pmd);
1727	ptl = pte_lockptr(mm, pmd);
1728	if (ptl != pml)
1729	spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
1730
1731	/*
1732	* Huge page lock is still held, so normally the page table
1733	* must remain empty; and we have already skipped anon_vma
1734	* and userfaultfd_wp() vmas. But since the mmap_lock is not
1735	* held, it is still possible for a racing userfaultfd_ioctl()
1736	* to have inserted ptes or markers. Now that we hold ptlock,
1737	* repeating the anon_vma check protects from one category,
1738	* and repeating the userfaultfd_wp() check from another.
1739	*/
1740	if (unlikely(vma->anon_vma \|\| userfaultfd_wp(vma))) {
1741	skipped_uffd = true;
1742	} else {
1743	pgt_pmd = pmdp_collapse_flush(vma, address: addr, pmdp: pmd);
1744	pmdp_get_lockless_sync();
1745	}
1746
1747	if (ptl != pml)
1748	spin_unlock(lock: ptl);
1749	spin_unlock(lock: pml);
1750
1751	mmu_notifier_invalidate_range_end(range: &range);
1752
1753	if (!skipped_uffd) {
1754	mm_dec_nr_ptes(mm);
1755	page_table_check_pte_clear_range(mm, addr, pmd: pgt_pmd);
1756	pte_free_defer(mm, pmd_pgtable(pgt_pmd));
1757	}
1758	}
1759	i_mmap_unlock_read(mapping);
1760	}
1761
1762	/**
1763	* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
1764	*
1765	* @mm: process address space where collapse happens
1766	* @addr: virtual collapse start address
1767	* @file: file that collapse on
1768	* @start: collapse start address
1769	* @cc: collapse context and scratchpad
1770	*
1771	* Basic scheme is simple, details are more complex:
1772	* - allocate and lock a new huge page;
1773	* - scan page cache, locking old pages
1774	* + swap/gup in pages if necessary;
1775	* - copy data to new page
1776	* - handle shmem holes
1777	* + re-validate that holes weren't filled by someone else
1778	* + check for userfaultfd
1779	* - finalize updates to the page cache;
1780	* - if replacing succeeds:
1781	* + unlock huge page;
1782	* + free old pages;
1783	* - if replacing failed;
1784	* + unlock old pages
1785	* + unlock and free huge page;
1786	*/
1787	static int collapse_file(struct mm_struct mm, unsigned* long addr,
1788	struct file *file, pgoff_t start,
1789	struct collapse_control *cc)
1790	{
1791	struct address_space *mapping = file->f_mapping;
1792	struct page *hpage;
1793	struct page *page;
1794	struct page *tmp;
1795	struct folio *folio;
1796	pgoff_t index = `0`, end = start + HPAGE_PMD_NR;
1797	LIST_HEAD(pagelist);
1798	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
1799	int nr_none = `0`, result = SCAN_SUCCEED;
1800	bool is_shmem = shmem_file(file);
1801	int nr = `0`;
1802
1803	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
1804	VM_BUG_ON(start & (HPAGE_PMD_NR - `1`));
1805
1806	result = alloc_charge_hpage(hpage: &hpage, mm, cc);
1807	if (result != SCAN_SUCCEED)
1808	goto out;
1809
1810	__SetPageLocked(page: hpage);
1811	if (is_shmem)
1812	__SetPageSwapBacked(page: hpage);
1813	hpage->index = start;
1814	hpage->mapping = mapping;
1815
1816	/*
1817	* Ensure we have slots for all the pages in the range. This is
1818	* almost certainly a no-op because most of the pages must be present
1819	*/
1820	do {
1821	xas_lock_irq(&xas);
1822	xas_create_range(&xas);
1823	if (!xas_error(xas: &xas))
1824	break;
1825	xas_unlock_irq(&xas);
1826	if (!xas_nomem(&xas, GFP_KERNEL)) {
1827	result = SCAN_FAIL;
1828	goto rollback;
1829	}
1830	} while (`1`);
1831
1832	for (index = start; index < end; index++) {
1833	xas_set(xas: &xas, index);
1834	page = xas_load(&xas);
1835
1836	VM_BUG_ON(index != xas.xa_index);
1837	if (is_shmem) {
1838	if (!page) {
1839	/*
1840	* Stop if extent has been truncated or
1841	* hole-punched, and is now completely
1842	* empty.
1843	*/
1844	if (index == start) {
1845	if (!xas_next_entry(xas: &xas, max: end - `1`)) {
1846	result = SCAN_TRUNCATED;
1847	goto xa_locked;
1848	}
1849	}
1850	nr_none++;
1851	continue;
1852	}
1853
1854	if (xa_is_value(entry: page) \|\| !PageUptodate(page)) {
1855	xas_unlock_irq(&xas);
1856	/ swap in or instantiate fallocated page /
1857	if (shmem_get_folio(inode: mapping->host, index,
1858	foliop: &folio, sgp: SGP_NOALLOC)) {
1859	result = SCAN_FAIL;
1860	goto xa_unlocked;
1861	}
1862	/ drain lru cache to help isolate_lru_page() /
1863	lru_add_drain();
1864	page = folio_file_page(folio, index);
1865	} else if (trylock_page(page)) {
1866	get_page(page);
1867	xas_unlock_irq(&xas);
1868	} else {
1869	result = SCAN_PAGE_LOCK;
1870	goto xa_locked;
1871	}
1872	} else { / !is_shmem /
1873	if (!page \|\| xa_is_value(entry: page)) {
1874	xas_unlock_irq(&xas);
1875	page_cache_sync_readahead(mapping, ra: &file->f_ra,
1876	file, index,
1877	req_count: end - index);
1878	/ drain lru cache to help isolate_lru_page() /
1879	lru_add_drain();
1880	page = find_lock_page(mapping, index);
1881	if (unlikely(page == NULL)) {
1882	result = SCAN_FAIL;
1883	goto xa_unlocked;
1884	}
1885	} else if (PageDirty(page)) {
1886	/*
1887	* khugepaged only works on read-only fd,
1888	* so this page is dirty because it hasn't
1889	* been flushed since first write. There
1890	* won't be new dirty pages.
1891	*
1892	* Trigger async flush here and hope the
1893	* writeback is done when khugepaged
1894	* revisits this page.
1895	*
1896	* This is a one-off situation. We are not
1897	* forcing writeback in loop.
1898	*/
1899	xas_unlock_irq(&xas);
1900	filemap_flush(mapping);
1901	result = SCAN_FAIL;
1902	goto xa_unlocked;
1903	} else if (PageWriteback(page)) {
1904	xas_unlock_irq(&xas);
1905	result = SCAN_FAIL;
1906	goto xa_unlocked;
1907	} else if (trylock_page(page)) {
1908	get_page(page);
1909	xas_unlock_irq(&xas);
1910	} else {
1911	result = SCAN_PAGE_LOCK;
1912	goto xa_locked;
1913	}
1914	}
1915
1916	/*
1917	* The page must be locked, so we can drop the i_pages lock
1918	* without racing with truncate.
1919	*/
1920	VM_BUG_ON_PAGE(!PageLocked(page), page);
1921
1922	/ make sure the page is up to date /
1923	if (unlikely(!PageUptodate(page))) {
1924	result = SCAN_FAIL;
1925	goto out_unlock;
1926	}
1927
1928	/*
1929	* If file was truncated then extended, or hole-punched, before
1930	* we locked the first page, then a THP might be there already.
1931	* This will be discovered on the first iteration.
1932	*/
1933	if (PageTransCompound(page)) {
1934	struct page *head = compound_head(page);
1935
1936	result = compound_order(page: head) == HPAGE_PMD_ORDER &&
1937	head->index == start
1938	/ Maybe PMD-mapped /
1939	? SCAN_PTE_MAPPED_HUGEPAGE
1940	: SCAN_PAGE_COMPOUND;
1941	goto out_unlock;
1942	}
1943
1944	folio = page_folio(page);
1945
1946	if (folio_mapping(folio) != mapping) {
1947	result = SCAN_TRUNCATED;
1948	goto out_unlock;
1949	}
1950
1951	if (!is_shmem && (folio_test_dirty(folio) \|\|
1952	folio_test_writeback(folio))) {
1953	/*
1954	* khugepaged only works on read-only fd, so this
1955	* page is dirty because it hasn't been flushed
1956	* since first write.
1957	*/
1958	result = SCAN_FAIL;
1959	goto out_unlock;
1960	}
1961
1962	if (!folio_isolate_lru(folio)) {
1963	result = SCAN_DEL_PAGE_LRU;
1964	goto out_unlock;
1965	}
1966
1967	if (!filemap_release_folio(folio, GFP_KERNEL)) {
1968	result = SCAN_PAGE_HAS_PRIVATE;
1969	folio_putback_lru(folio);
1970	goto out_unlock;
1971	}
1972
1973	if (folio_mapped(folio))
1974	try_to_unmap(folio,
1975	flags: TTU_IGNORE_MLOCK \| TTU_BATCH_FLUSH);
1976
1977	xas_lock_irq(&xas);
1978
1979	VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);
1980
1981	/*
1982	* We control three references to the page:
1983	* - we hold a pin on it;
1984	* - one reference from page cache;
1985	* - one from isolate_lru_page;
1986	* If those are the only references, then any new usage of the
1987	* page will have to fetch it from the page cache. That requires
1988	* locking the page to handle truncate, so any new usage will be
1989	* blocked until we unlock page after collapse/during rollback.
1990	*/
1991	if (page_count(page) != `3`) {
1992	result = SCAN_PAGE_COUNT;
1993	xas_unlock_irq(&xas);
1994	putback_lru_page(page);
1995	goto out_unlock;
1996	}
1997
1998	/*
1999	* Accumulate the pages that are being collapsed.
2000	*/
2001	list_add_tail(new: &page->lru, head: &pagelist);
2002	continue;
2003	out_unlock:
2004	unlock_page(page);
2005	put_page(page);
2006	goto xa_unlocked;
2007	}
2008
2009	if (!is_shmem) {
2010	filemap_nr_thps_inc(mapping);
2011	/*
2012	* Paired with smp_mb() in do_dentry_open() to ensure
2013	* i_writecount is up to date and the update to nr_thps is
2014	* visible. Ensures the page cache will be truncated if the
2015	* file is opened writable.
2016	*/
2017	smp_mb();
2018	if (inode_is_open_for_write(inode: mapping->host)) {
2019	result = SCAN_FAIL;
2020	filemap_nr_thps_dec(mapping);
2021	}
2022	}
2023
2024	xa_locked:
2025	xas_unlock_irq(&xas);
2026	xa_unlocked:
2027
2028	/*
2029	* If collapse is successful, flush must be done now before copying.
2030	* If collapse is unsuccessful, does flush actually need to be done?
2031	* Do it anyway, to clear the state.
2032	*/
2033	try_to_unmap_flush();
2034
2035	if (result == SCAN_SUCCEED && nr_none &&
2036	!shmem_charge(inode: mapping->host, pages: nr_none))
2037	result = SCAN_FAIL;
2038	if (result != SCAN_SUCCEED) {
2039	nr_none = `0`;
2040	goto rollback;
2041	}
2042
2043	/*
2044	* The old pages are locked, so they won't change anymore.
2045	*/
2046	index = start;
2047	list_for_each_entry(page, &pagelist, lru) {
2048	while (index < page->index) {
2049	clear_highpage(page: hpage + (index % HPAGE_PMD_NR));
2050	index++;
2051	}
2052	if (copy_mc_highpage(to: hpage + (page->index % HPAGE_PMD_NR), from: page) > `0`) {
2053	result = SCAN_COPY_MC;
2054	goto rollback;
2055	}
2056	index++;
2057	}
2058	while (index < end) {
2059	clear_highpage(page: hpage + (index % HPAGE_PMD_NR));
2060	index++;
2061	}
2062
2063	if (nr_none) {
2064	struct vm_area_struct *vma;
2065	int nr_none_check = `0`;
2066
2067	i_mmap_lock_read(mapping);
2068	xas_lock_irq(&xas);
2069
2070	xas_set(xas: &xas, index: start);
2071	for (index = start; index < end; index++) {
2072	if (!xas_next(xas: &xas)) {
2073	xas_store(&xas, XA_RETRY_ENTRY);
2074	if (xas_error(xas: &xas)) {
2075	result = SCAN_STORE_FAILED;
2076	goto immap_locked;
2077	}
2078	nr_none_check++;
2079	}
2080	}
2081
2082	if (nr_none != nr_none_check) {
2083	result = SCAN_PAGE_FILLED;
2084	goto immap_locked;
2085	}
2086
2087	/*
2088	* If userspace observed a missing page in a VMA with a MODE_MISSING
2089	* userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
2090	* page. If so, we need to roll back to avoid suppressing such an
2091	* event. Since wp/minor userfaultfds don't give userspace any
2092	* guarantees that the kernel doesn't fill a missing page with a zero
2093	* page, so they don't matter here.
2094	*
2095	* Any userfaultfds registered after this point will not be able to
2096	* observe any missing pages due to the previously inserted retry
2097	* entries.
2098	*/
2099	vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
2100	if (userfaultfd_missing(vma)) {
2101	result = SCAN_EXCEED_NONE_PTE;
2102	goto immap_locked;
2103	}
2104	}
2105
2106	immap_locked:
2107	i_mmap_unlock_read(mapping);
2108	if (result != SCAN_SUCCEED) {
2109	xas_set(xas: &xas, index: start);
2110	for (index = start; index < end; index++) {
2111	if (xas_next(xas: &xas) == XA_RETRY_ENTRY)
2112	xas_store(&xas, NULL);
2113	}
2114
2115	xas_unlock_irq(&xas);
2116	goto rollback;
2117	}
2118	} else {
2119	xas_lock_irq(&xas);
2120	}
2121
2122	nr = thp_nr_pages(page: hpage);
2123	if (is_shmem)
2124	__mod_lruvec_page_state(page: hpage, idx: NR_SHMEM_THPS, val: nr);
2125	else
2126	__mod_lruvec_page_state(page: hpage, idx: NR_FILE_THPS, val: nr);
2127
2128	if (nr_none) {
2129	__mod_lruvec_page_state(page: hpage, idx: NR_FILE_PAGES, val: nr_none);
2130	/ nr_none is always 0 for non-shmem. /
2131	__mod_lruvec_page_state(page: hpage, idx: NR_SHMEM, val: nr_none);
2132	}
2133
2134	/*
2135	* Mark hpage as uptodate before inserting it into the page cache so
2136	* that it isn't mistaken for an fallocated but unwritten page.
2137	*/
2138	folio = page_folio(hpage);
2139	folio_mark_uptodate(folio);
2140	folio_ref_add(folio, HPAGE_PMD_NR - `1`);
2141
2142	if (is_shmem)
2143	folio_mark_dirty(folio);
2144	folio_add_lru(folio);
2145
2146	/ Join all the small entries into a single multi-index entry. /
2147	xas_set_order(xas: &xas, index: start, HPAGE_PMD_ORDER);
2148	xas_store(&xas, entry: hpage);
2149	WARN_ON_ONCE(xas_error(&xas));
2150	xas_unlock_irq(&xas);
2151
2152	/*
2153	* Remove pte page tables, so we can re-fault the page as huge.
2154	* If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
2155	*/
2156	retract_page_tables(mapping, pgoff: start);
2157	if (cc && !cc->is_khugepaged)
2158	result = SCAN_PTE_MAPPED_HUGEPAGE;
2159	unlock_page(page: hpage);
2160
2161	/*
2162	* The collapse has succeeded, so free the old pages.
2163	*/
2164	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
2165	list_del(entry: &page->lru);
2166	page->mapping = NULL;
2167	ClearPageActive(page);
2168	ClearPageUnevictable(page);
2169	unlock_page(page);
2170	folio_put_refs(page_folio(page), refs: `3`);
2171	}
2172
2173	goto out;
2174
2175	rollback:
2176	/ Something went wrong: roll back page cache changes /
2177	if (nr_none) {
2178	xas_lock_irq(&xas);
2179	mapping->nrpages -= nr_none;
2180	xas_unlock_irq(&xas);
2181	shmem_uncharge(inode: mapping->host, pages: nr_none);
2182	}
2183
2184	list_for_each_entry_safe(page, tmp, &pagelist, lru) {
2185	list_del(entry: &page->lru);
2186	unlock_page(page);
2187	putback_lru_page(page);
2188	put_page(page);
2189	}
2190	/*
2191	* Undo the updates of filemap_nr_thps_inc for non-SHMEM
2192	* file only. This undo is not needed unless failure is
2193	* due to SCAN_COPY_MC.
2194	*/
2195	if (!is_shmem && result == SCAN_COPY_MC) {
2196	filemap_nr_thps_dec(mapping);
2197	/*
2198	* Paired with smp_mb() in do_dentry_open() to
2199	* ensure the update to nr_thps is visible.
2200	*/
2201	smp_mb();
2202	}
2203
2204	hpage->mapping = NULL;
2205
2206	unlock_page(page: hpage);
2207	put_page(page: hpage);
2208	out:
2209	VM_BUG_ON(!list_empty(&pagelist));
2210	trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
2211	return result;
2212	}
2213
2214	static int hpage_collapse_scan_file(struct mm_struct mm, unsigned* long addr,
2215	struct file *file, pgoff_t start,
2216	struct collapse_control *cc)
2217	{
2218	struct page *page = NULL;
2219	struct address_space *mapping = file->f_mapping;
2220	XA_STATE(xas, &mapping->i_pages, start);
2221	int present, swap;
2222	int node = NUMA_NO_NODE;
2223	int result = SCAN_SUCCEED;
2224
2225	present = `0`;
2226	swap = `0`;
2227	memset(cc->node_load, `0`, sizeof(cc->node_load));
2228	nodes_clear(cc->alloc_nmask);
2229	rcu_read_lock();
2230	xas_for_each(&xas, page, start + HPAGE_PMD_NR - `1`) {
2231	if (xas_retry(xas: &xas, entry: page))
2232	continue;
2233
2234	if (xa_is_value(entry: page)) {
2235	++swap;
2236	if (cc->is_khugepaged &&
2237	swap > khugepaged_max_ptes_swap) {
2238	result = SCAN_EXCEED_SWAP_PTE;
2239	count_vm_event(item: THP_SCAN_EXCEED_SWAP_PTE);
2240	break;
2241	}
2242	continue;
2243	}
2244
2245	/*
2246	* TODO: khugepaged should compact smaller compound pages
2247	* into a PMD sized page
2248	*/
2249	if (PageTransCompound(page)) {
2250	struct page *head = compound_head(page);
2251
2252	result = compound_order(page: head) == HPAGE_PMD_ORDER &&
2253	head->index == start
2254	/ Maybe PMD-mapped /
2255	? SCAN_PTE_MAPPED_HUGEPAGE
2256	: SCAN_PAGE_COMPOUND;
2257	/*
2258	* For SCAN_PTE_MAPPED_HUGEPAGE, further processing
2259	* by the caller won't touch the page cache, and so
2260	* it's safe to skip LRU and refcount checks before
2261	* returning.
2262	*/
2263	break;
2264	}
2265
2266	node = page_to_nid(page);
2267	if (hpage_collapse_scan_abort(nid: node, cc)) {
2268	result = SCAN_SCAN_ABORT;
2269	break;
2270	}
2271	cc->node_load[node]++;
2272
2273	if (!PageLRU(page)) {
2274	result = SCAN_PAGE_LRU;
2275	break;
2276	}
2277
2278	if (page_count(page) !=
2279	`1` + page_mapcount(page) + page_has_private(page)) {
2280	result = SCAN_PAGE_COUNT;
2281	break;
2282	}
2283
2284	/*
2285	* We probably should check if the page is referenced here, but
2286	* nobody would transfer pte_young() to PageReferenced() for us.
2287	* And rmap walk here is just too costly...
2288	*/
2289
2290	present++;
2291
2292	if (need_resched()) {
2293	xas_pause(&xas);
2294	cond_resched_rcu();
2295	}
2296	}
2297	rcu_read_unlock();
2298
2299	if (result == SCAN_SUCCEED) {
2300	if (cc->is_khugepaged &&
2301	present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
2302	result = SCAN_EXCEED_NONE_PTE;
2303	count_vm_event(item: THP_SCAN_EXCEED_NONE_PTE);
2304	} else {
2305	result = collapse_file(mm, addr, file, start, cc);
2306	}
2307	}
2308
2309	trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
2310	return result;
2311	}
2312	#else
2313	static int hpage_collapse_scan_file(struct mm_struct mm, unsigned* long addr,
2314	struct file *file, pgoff_t start,
2315	struct collapse_control *cc)
2316	{
2317	BUILD_BUG();
2318	}
2319	#endif
2320
2321	static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
2322	struct collapse_control *cc)
2323	__releases(&khugepaged_mm_lock)
2324	__acquires(&khugepaged_mm_lock)
2325	{
2326	struct vma_iterator vmi;
2327	struct khugepaged_mm_slot *mm_slot;
2328	struct mm_slot *slot;
2329	struct mm_struct *mm;
2330	struct vm_area_struct *vma;
2331	int progress = `0`;
2332
2333	VM_BUG_ON(!pages);
2334	lockdep_assert_held(&khugepaged_mm_lock);
2335	*result = SCAN_FAIL;
2336
2337	if (khugepaged_scan.mm_slot) {
2338	mm_slot = khugepaged_scan.mm_slot;
2339	slot = &mm_slot->slot;
2340	} else {
2341	slot = list_entry(khugepaged_scan.mm_head.next,
2342	struct mm_slot, mm_node);
2343	mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
2344	khugepaged_scan.address = `0`;
2345	khugepaged_scan.mm_slot = mm_slot;
2346	}
2347	spin_unlock(lock: &khugepaged_mm_lock);
2348
2349	mm = slot->mm;
2350	/*
2351	* Don't wait for semaphore (to avoid long wait times). Just move to
2352	* the next mm on the list.
2353	*/
2354	vma = NULL;
2355	if (unlikely(!mmap_read_trylock(mm)))
2356	goto breakouterloop_mmap_lock;
2357
2358	progress++;
2359	if (unlikely(hpage_collapse_test_exit(mm)))
2360	goto breakouterloop;
2361
2362	vma_iter_init(vmi: &vmi, mm, addr: khugepaged_scan.address);
2363	for_each_vma(vmi, vma) {
2364	unsigned long hstart, hend;
2365
2366	cond_resched();
2367	if (unlikely(hpage_collapse_test_exit(mm))) {
2368	progress++;
2369	break;
2370	}
2371	if (!hugepage_vma_check(vma, vm_flags: vma->vm_flags, smaps: false, in_pf: false, enforce_sysfs: true)) {
2372	skip:
2373	progress++;
2374	continue;
2375	}
2376	hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2377	hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
2378	if (khugepaged_scan.address > hend)
2379	goto skip;
2380	if (khugepaged_scan.address < hstart)
2381	khugepaged_scan.address = hstart;
2382	VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2383
2384	while (khugepaged_scan.address < hend) {
2385	bool mmap_locked = true;
2386
2387	cond_resched();
2388	if (unlikely(hpage_collapse_test_exit(mm)))
2389	goto breakouterloop;
2390
2391	VM_BUG_ON(khugepaged_scan.address < hstart \|\|
2392	khugepaged_scan.address + HPAGE_PMD_SIZE >
2393	hend);
2394	if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
2395	struct file *file = get_file(f: vma->vm_file);
2396	pgoff_t pgoff = linear_page_index(vma,
2397	address: khugepaged_scan.address);
2398
2399	mmap_read_unlock(mm);
2400	mmap_locked = false;
2401	*result = hpage_collapse_scan_file(mm,
2402	addr: khugepaged_scan.address, file, start: pgoff, cc);
2403	fput(file);
2404	if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
2405	mmap_read_lock(mm);
2406	if (hpage_collapse_test_exit(mm))
2407	goto breakouterloop;
2408	*result = collapse_pte_mapped_thp(mm,
2409	addr: khugepaged_scan.address, install_pmd: false);
2410	if (*result == SCAN_PMD_MAPPED)
2411	*result = SCAN_SUCCEED;
2412	mmap_read_unlock(mm);
2413	}
2414	} else {
2415	*result = hpage_collapse_scan_pmd(mm, vma,
2416	address: khugepaged_scan.address, mmap_locked: &mmap_locked, cc);
2417	}
2418
2419	if (*result == SCAN_SUCCEED)
2420	++khugepaged_pages_collapsed;
2421
2422	/ move to next address /
2423	khugepaged_scan.address += HPAGE_PMD_SIZE;
2424	progress += HPAGE_PMD_NR;
2425	if (!mmap_locked)
2426	/*
2427	* We released mmap_lock so break loop. Note
2428	* that we drop mmap_lock before all hugepage
2429	* allocations, so if allocation fails, we are
2430	* guaranteed to break here and report the
2431	* correct result back to caller.
2432	*/
2433	goto breakouterloop_mmap_lock;
2434	if (progress >= pages)
2435	goto breakouterloop;
2436	}
2437	}
2438	breakouterloop:
2439	mmap_read_unlock(mm); / exit_mmap will destroy ptes after this /
2440	breakouterloop_mmap_lock:
2441
2442	spin_lock(lock: &khugepaged_mm_lock);
2443	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2444	/*
2445	* Release the current mm_slot if this mm is about to die, or
2446	* if we scanned all vmas of this mm.
2447	*/
2448	if (hpage_collapse_test_exit(mm) \|\| !vma) {
2449	/*
2450	* Make sure that if mm_users is reaching zero while
2451	* khugepaged runs here, khugepaged_exit will find
2452	* mm_slot not pointing to the exiting mm.
2453	*/
2454	if (slot->mm_node.next != &khugepaged_scan.mm_head) {
2455	slot = list_entry(slot->mm_node.next,
2456	struct mm_slot, mm_node);
2457	khugepaged_scan.mm_slot =
2458	mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
2459	khugepaged_scan.address = `0`;
2460	} else {
2461	khugepaged_scan.mm_slot = NULL;
2462	khugepaged_full_scans++;
2463	}
2464
2465	collect_mm_slot(mm_slot);
2466	}
2467
2468	return progress;
2469	}
2470
2471	static int khugepaged_has_work(void)
2472	{
2473	return !list_empty(head: &khugepaged_scan.mm_head) &&
2474	hugepage_flags_enabled();
2475	}
2476
2477	static int khugepaged_wait_event(void)
2478	{
2479	return !list_empty(head: &khugepaged_scan.mm_head) \|\|
2480	kthread_should_stop();
2481	}
2482
2483	static void khugepaged_do_scan(struct collapse_control *cc)
2484	{
2485	unsigned int progress = `0`, pass_through_head = `0`;
2486	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
2487	bool wait = true;
2488	int result = SCAN_SUCCEED;
2489
2490	lru_add_drain_all();
2491
2492	while (true) {
2493	cond_resched();
2494
2495	if (unlikely(kthread_should_stop() \|\| try_to_freeze()))
2496	break;
2497
2498	spin_lock(lock: &khugepaged_mm_lock);
2499	if (!khugepaged_scan.mm_slot)
2500	pass_through_head++;
2501	if (khugepaged_has_work() &&
2502	pass_through_head < `2`)
2503	progress += khugepaged_scan_mm_slot(pages: pages - progress,
2504	result: &result, cc);
2505	else
2506	progress = pages;
2507	spin_unlock(lock: &khugepaged_mm_lock);
2508
2509	if (progress >= pages)
2510	break;
2511
2512	if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
2513	/*
2514	* If fail to allocate the first time, try to sleep for
2515	* a while. When hit again, cancel the scan.
2516	*/
2517	if (!wait)
2518	break;
2519	wait = false;
2520	khugepaged_alloc_sleep();
2521	}
2522	}
2523	}
2524
2525	static bool khugepaged_should_wakeup(void)
2526	{
2527	return kthread_should_stop() \|\|
2528	time_after_eq(jiffies, khugepaged_sleep_expire);
2529	}
2530
2531	static void khugepaged_wait_work(void)
2532	{
2533	if (khugepaged_has_work()) {
2534	const unsigned long scan_sleep_jiffies =
2535	msecs_to_jiffies(m: khugepaged_scan_sleep_millisecs);
2536
2537	if (!scan_sleep_jiffies)
2538	return;
2539
2540	khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
2541	wait_event_freezable_timeout(khugepaged_wait,
2542	khugepaged_should_wakeup(),
2543	scan_sleep_jiffies);
2544	return;
2545	}
2546
2547	if (hugepage_flags_enabled())
2548	wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2549	}
2550
2551	static int khugepaged(void *none)
2552	{
2553	struct khugepaged_mm_slot *mm_slot;
2554
2555	set_freezable();
2556	set_user_nice(current, MAX_NICE);
2557
2558	while (!kthread_should_stop()) {
2559	khugepaged_do_scan(cc: &khugepaged_collapse_control);
2560	khugepaged_wait_work();
2561	}
2562
2563	spin_lock(lock: &khugepaged_mm_lock);
2564	mm_slot = khugepaged_scan.mm_slot;
2565	khugepaged_scan.mm_slot = NULL;
2566	if (mm_slot)
2567	collect_mm_slot(mm_slot);
2568	spin_unlock(lock: &khugepaged_mm_lock);
2569	return `0`;
2570	}
2571
2572	static void set_recommended_min_free_kbytes(void)
2573	{
2574	struct zone *zone;
2575	int nr_zones = `0`;
2576	unsigned long recommended_min;
2577
2578	if (!hugepage_flags_enabled()) {
2579	calculate_min_free_kbytes();
2580	goto update_wmarks;
2581	}
2582
2583	for_each_populated_zone(zone) {
2584	/*
2585	* We don't need to worry about fragmentation of
2586	* ZONE_MOVABLE since it only has movable pages.
2587	*/
2588	if (zone_idx(zone) > gfp_zone(GFP_USER))
2589	continue;
2590
2591	nr_zones++;
2592	}
2593
2594	/ Ensure 2 pageblocks are free to assist fragmentation avoidance /
2595	recommended_min = pageblock_nr_pages * nr_zones * `2`;
2596
2597	/*
2598	* Make sure that on average at least two pageblocks are almost free
2599	* of another type, one for a migratetype to fall back to and a
2600	* second to avoid subsequent fallbacks of other types There are 3
2601	* MIGRATE_TYPES we care about.
2602	*/
2603	recommended_min += pageblock_nr_pages * nr_zones *
2604	MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
2605
2606	/ don't ever allow to reserve more than 5% of the lowmem /
2607	recommended_min = min(recommended_min,
2608	(unsigned long) nr_free_buffer_pages() / `20`);
2609	recommended_min <<= (PAGE_SHIFT-`10`);
2610
2611	if (recommended_min > min_free_kbytes) {
2612	if (user_min_free_kbytes >= `0`)
2613	pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
2614	min_free_kbytes, recommended_min);
2615
2616	min_free_kbytes = recommended_min;
2617	}
2618
2619	update_wmarks:
2620	setup_per_zone_wmarks();
2621	}
2622
2623	int start_stop_khugepaged(void)
2624	{
2625	int err = `0`;
2626
2627	mutex_lock(&khugepaged_mutex);
2628	if (hugepage_flags_enabled()) {
2629	if (!khugepaged_thread)
2630	khugepaged_thread = kthread_run(khugepaged, NULL,
2631	"khugepaged");
2632	if (IS_ERR(ptr: khugepaged_thread)) {
2633	pr_err("khugepaged: kthread_run(khugepaged) failed\n");
2634	err = PTR_ERR(ptr: khugepaged_thread);
2635	khugepaged_thread = NULL;
2636	goto fail;
2637	}
2638
2639	if (!list_empty(head: &khugepaged_scan.mm_head))
2640	wake_up_interruptible(&khugepaged_wait);
2641	} else if (khugepaged_thread) {
2642	kthread_stop(k: khugepaged_thread);
2643	khugepaged_thread = NULL;
2644	}
2645	set_recommended_min_free_kbytes();
2646	fail:
2647	mutex_unlock(lock: &khugepaged_mutex);
2648	return err;
2649	}
2650
2651	void khugepaged_min_free_kbytes_update(void)
2652	{
2653	mutex_lock(&khugepaged_mutex);
2654	if (hugepage_flags_enabled() && khugepaged_thread)
2655	set_recommended_min_free_kbytes();
2656	mutex_unlock(lock: &khugepaged_mutex);
2657	}
2658
2659	bool current_is_khugepaged(void)
2660	{
2661	return kthread_func(current) == khugepaged;
2662	}
2663
2664	static int madvise_collapse_errno(enum scan_result r)
2665	{
2666	/*
2667	* MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
2668	* actionable feedback to caller, so they may take an appropriate
2669	* fallback measure depending on the nature of the failure.
2670	*/
2671	switch (r) {
2672	case SCAN_ALLOC_HUGE_PAGE_FAIL:
2673	return -ENOMEM;
2674	case SCAN_CGROUP_CHARGE_FAIL:
2675	case SCAN_EXCEED_NONE_PTE:
2676	return -EBUSY;
2677	/ Resource temporary unavailable - trying again might succeed /
2678	case SCAN_PAGE_COUNT:
2679	case SCAN_PAGE_LOCK:
2680	case SCAN_PAGE_LRU:
2681	case SCAN_DEL_PAGE_LRU:
2682	case SCAN_PAGE_FILLED:
2683	return -EAGAIN;
2684	/*
2685	* Other: Trying again likely not to succeed / error intrinsic to
2686	* specified memory range. khugepaged likely won't be able to collapse
2687	* either.
2688	*/
2689	default:
2690	return -EINVAL;
2691	}
2692	}
2693
2694	int madvise_collapse(struct vm_area_struct vma, struct* vm_area_struct **prev,
2695	unsigned long start, unsigned long end)
2696	{
2697	struct collapse_control *cc;
2698	struct mm_struct *mm = vma->vm_mm;
2699	unsigned long hstart, hend, addr;
2700	int thps = `0`, last_fail = SCAN_FAIL;
2701	bool mmap_locked = true;
2702
2703	BUG_ON(vma->vm_start > start);
2704	BUG_ON(vma->vm_end < end);
2705
2706	*prev = vma;
2707
2708	if (!hugepage_vma_check(vma, vm_flags: vma->vm_flags, smaps: false, in_pf: false, enforce_sysfs: false))
2709	return -EINVAL;
2710
2711	cc = kmalloc(size: sizeof(*cc), GFP_KERNEL);
2712	if (!cc)
2713	return -ENOMEM;
2714	cc->is_khugepaged = false;
2715
2716	mmgrab(mm);
2717	lru_add_drain_all();
2718
2719	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2720	hend = end & HPAGE_PMD_MASK;
2721
2722	for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
2723	int result = SCAN_FAIL;
2724
2725	if (!mmap_locked) {
2726	cond_resched();
2727	mmap_read_lock(mm);
2728	mmap_locked = true;
2729	result = hugepage_vma_revalidate(mm, address: addr, expect_anon: false, vmap: &vma,
2730	cc);
2731	if (result != SCAN_SUCCEED) {
2732	last_fail = result;
2733	goto out_nolock;
2734	}
2735
2736	hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
2737	}
2738	mmap_assert_locked(mm);
2739	memset(cc->node_load, `0`, sizeof(cc->node_load));
2740	nodes_clear(cc->alloc_nmask);
2741	if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
2742	struct file *file = get_file(f: vma->vm_file);
2743	pgoff_t pgoff = linear_page_index(vma, address: addr);
2744
2745	mmap_read_unlock(mm);
2746	mmap_locked = false;
2747	result = hpage_collapse_scan_file(mm, addr, file, start: pgoff,
2748	cc);
2749	fput(file);
2750	} else {
2751	result = hpage_collapse_scan_pmd(mm, vma, address: addr,
2752	mmap_locked: &mmap_locked, cc);
2753	}
2754	if (!mmap_locked)
2755	prev = NULL; /* Tell caller we dropped mmap_lock /
2756
2757	handle_result:
2758	switch (result) {
2759	case SCAN_SUCCEED:
2760	case SCAN_PMD_MAPPED:
2761	++thps;
2762	break;
2763	case SCAN_PTE_MAPPED_HUGEPAGE:
2764	BUG_ON(mmap_locked);
2765	BUG_ON(*prev);
2766	mmap_read_lock(mm);
2767	result = collapse_pte_mapped_thp(mm, addr, install_pmd: true);
2768	mmap_read_unlock(mm);
2769	goto handle_result;
2770	/ Whitelisted set of results where continuing OK /
2771	case SCAN_PMD_NULL:
2772	case SCAN_PTE_NON_PRESENT:
2773	case SCAN_PTE_UFFD_WP:
2774	case SCAN_PAGE_RO:
2775	case SCAN_LACK_REFERENCED_PAGE:
2776	case SCAN_PAGE_NULL:
2777	case SCAN_PAGE_COUNT:
2778	case SCAN_PAGE_LOCK:
2779	case SCAN_PAGE_COMPOUND:
2780	case SCAN_PAGE_LRU:
2781	case SCAN_DEL_PAGE_LRU:
2782	last_fail = result;
2783	break;
2784	default:
2785	last_fail = result;
2786	/ Other error, exit /
2787	goto out_maybelock;
2788	}
2789	}
2790
2791	out_maybelock:
2792	/ Caller expects us to hold mmap_lock on return /
2793	if (!mmap_locked)
2794	mmap_read_lock(mm);
2795	out_nolock:
2796	mmap_assert_locked(mm);
2797	mmdrop(mm);
2798	kfree(objp: cc);
2799
2800	return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? `0`
2801	: madvise_collapse_errno(r: last_fail);
2802	}
2803

/* source code of linux/mm/khugepaged.c */