internal.h source code [linux/mm/internal.h]

1	/ SPDX-License-Identifier: GPL-2.0-or-later /
2	/ internal.h: mm/ internal definitions*
3	*
4	* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
5	* Written by David Howells (dhowells@redhat.com)
6	*/
7	#ifndef __MM_INTERNAL_H
8	#define __MM_INTERNAL_H
9
10	#include <linux/fs.h>
11	#include <linux/khugepaged.h>
12	#include <linux/mm.h>
13	#include <linux/mm_inline.h>
14	#include <linux/pagemap.h>
15	#include <linux/pagewalk.h>
16	#include <linux/rmap.h>
17	#include <linux/swap.h>
18	#include <linux/swapops.h>
19	#include <linux/swap_cgroup.h>
20	#include <linux/tracepoint-defs.h>
21
22	/ Internal core VMA manipulation functions. /
23	#include "vma.h"
24
25	struct folio_batch;
26
27	/*
28	* Maintains state across a page table move. The operation assumes both source
29	* and destination VMAs already exist and are specified by the user.
30	*
31	* Partial moves are permitted, but the old and new ranges must both reside
32	* within a VMA.
33	*
34	* mmap lock must be held in write and VMA write locks must be held on any VMA
35	* that is visible.
36	*
37	* Use the PAGETABLE_MOVE() macro to initialise this struct.
38	*
39	* The old_addr and new_addr fields are updated as the page table move is
40	* executed.
41	*
42	* NOTE: The page table move is affected by reading from [old_addr, old_end),
43	* and old_addr may be updated for better page table alignment, so len_in
44	* represents the length of the range being copied as specified by the user.
45	*/
46	struct pagetable_move_control {
47	struct vm_area_struct old; /* Source VMA. /
48	struct vm_area_struct new; /* Destination VMA. /
49	unsigned long old_addr; / Address from which the move begins. /
50	unsigned long old_end; / Exclusive address at which old range ends. /
51	unsigned long new_addr; / Address to move page tables to. /
52	unsigned long len_in; / Bytes to remap specified by user. /
53
54	bool need_rmap_locks; / Do rmap locks need to be taken? /
55	bool for_stack; / Is this an early temp stack being moved? /
56	};
57
58	#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
59	struct pagetable_move_control name = { \
60	.old = old_, \
61	.new = new_, \
62	.old_addr = old_addr_, \
63	.old_end = (old_addr_) + (len_), \
64	.new_addr = new_addr_, \
65	.len_in = len_, \
66	}
67
68	/*
69	* The set of flags that only affect watermark checking and reclaim
70	* behaviour. This is used by the MM to obey the caller constraints
71	* about IO, FS and watermark checking while ignoring placement
72	* hints such as HIGHMEM usage.
73	*/
74	#define GFP_RECLAIM_MASK (__GFP_RECLAIM\|__GFP_HIGH\|__GFP_IO\|__GFP_FS\|\
75	__GFP_NOWARN\|__GFP_RETRY_MAYFAIL\|__GFP_NOFAIL\|\
76	__GFP_NORETRY\|__GFP_MEMALLOC\|__GFP_NOMEMALLOC\|\
77	__GFP_NOLOCKDEP)
78
79	/ The GFP flags allowed during early boot /
80	#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM\|__GFP_IO\|__GFP_FS))
81
82	/ Control allocation cpuset and node placement constraints /
83	#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL\|__GFP_THISNODE)
84
85	/ Do not use these with a slab allocator /
86	#define GFP_SLAB_BUG_MASK (__GFP_DMA32\|__GFP_HIGHMEM\|~__GFP_BITS_MASK)
87
88	/*
89	* Different from WARN_ON_ONCE(), no warning will be issued
90	* when we specify __GFP_NOWARN.
91	*/
92	#define WARN_ON_ONCE_GFP(cond, gfp) ({ \
93	static bool __section(".data..once") __warned; \
94	int __ret_warn_once = !!(cond); \
95	\
96	if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
97	__warned = true; \
98	WARN_ON(1); \
99	} \
100	unlikely(__ret_warn_once); \
101	})
102
103	void page_writeback_init(void);
104
105	/*
106	* If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
107	* its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
108	* above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
109	* leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
110	*/
111	#define ENTIRELY_MAPPED 0x800000
112	#define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1)
113
114	/*
115	* Flags passed to __show_mem() and show_free_areas() to suppress output in
116	* various contexts.
117	*/
118	#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
119
120	/*
121	* How many individual pages have an elevated _mapcount. Excludes
122	* the folio's entire_mapcount.
123	*
124	* Don't use this function outside of debugging code.
125	*/
126	static inline int folio_nr_pages_mapped(const struct folio *folio)
127	{
128	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
129	return -`1`;
130	return atomic_read(v: &folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
131	}
132
133	/*
134	* Retrieve the first entry of a folio based on a provided entry within the
135	* folio. We cannot rely on folio->swap as there is no guarantee that it has
136	* been initialized. Used for calling arch_swap_restore()
137	*/
138	static inline swp_entry_t folio_swap(swp_entry_t entry,
139	const struct folio *folio)
140	{
141	swp_entry_t swap = {
142	.val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
143	};
144
145	return swap;
146	}
147
148	static inline void folio_raw_mapping(const* struct folio *folio)
149	{
150	unsigned long mapping = (unsigned long)folio->mapping;
151
152	return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
153	}
154
155	/*
156	* This is a file-backed mapping, and is about to be memory mapped - invoke its
157	* mmap hook and safely handle error conditions. On error, VMA hooks will be
158	* mutated.
159	*
160	* @file: File which backs the mapping.
161	* @vma: VMA which we are mapping.
162	*
163	* Returns: 0 if success, error otherwise.
164	*/
165	static inline int mmap_file(struct file file, struct* vm_area_struct *vma)
166	{
167	int err = call_mmap(file, vma);
168
169	if (likely(!err))
170	return `0`;
171
172	/*
173	* OK, we tried to call the file hook for mmap(), but an error
174	* arose. The mapping is in an inconsistent state and we most not invoke
175	* any further hooks on it.
176	*/
177	vma->vm_ops = &vma_dummy_vm_ops;
178
179	return err;
180	}
181
182	/*
183	* If the VMA has a close hook then close it, and since closing it might leave
184	* it in an inconsistent state which makes the use of any hooks suspect, clear
185	* them down by installing dummy empty hooks.
186	*/
187	static inline void vma_close(struct vm_area_struct *vma)
188	{
189	if (vma->vm_ops && vma->vm_ops->close) {
190	vma->vm_ops->close(vma);
191
192	/*
193	* The mapping is in an inconsistent state, and no further hooks
194	* may be invoked upon it.
195	*/
196	vma->vm_ops = &vma_dummy_vm_ops;
197	}
198	}
199
200	#ifdef CONFIG_MMU
201
202	/ Flags for folio_pte_batch(). /
203	typedef int __bitwise fpb_t;
204
205	/ Compare PTEs after pte_mkclean(), ignoring the dirty bit. /
206	#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0))
207
208	/ Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. /
209	#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1))
210
211	static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
212	{
213	if (flags & FPB_IGNORE_DIRTY)
214	pte = pte_mkclean(pte);
215	if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
216	pte = pte_clear_soft_dirty(pte);
217	return pte_wrprotect(pte: pte_mkold(pte));
218	}
219
220	/**
221	* folio_pte_batch - detect a PTE batch for a large folio
222	* @folio: The large folio to detect a PTE batch for.
223	* @addr: The user virtual address the first page is mapped at.
224	* @start_ptep: Page table pointer for the first entry.
225	* @pte: Page table entry for the first page.
226	* @max_nr: The maximum number of table entries to consider.
227	* @flags: Flags to modify the PTE batch semantics.
228	* @any_writable: Optional pointer to indicate whether any entry except the
229	* first one is writable.
230	* @any_young: Optional pointer to indicate whether any entry except the
231	* first one is young.
232	* @any_dirty: Optional pointer to indicate whether any entry except the
233	* first one is dirty.
234	*
235	* Detect a PTE batch: consecutive (present) PTEs that map consecutive
236	* pages of the same large folio.
237	*
238	* All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
239	* the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
240	* soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
241	*
242	* start_ptep must map any page of the folio. max_nr must be at least one and
243	* must be limited by the caller so scanning cannot exceed a single page table.
244	*
245	* Return: the number of table entries in the batch.
246	*/
247	static inline int folio_pte_batch(struct folio folio, unsigned* long addr,
248	pte_t start_ptep, pte_t pte, int* max_nr, fpb_t flags,
249	bool any_writable, bool any_young, bool *any_dirty)
250	{
251	pte_t expected_pte, *ptep;
252	bool writable, young, dirty;
253	int nr, cur_nr;
254
255	if (any_writable)
256	*any_writable = false;
257	if (any_young)
258	*any_young = false;
259	if (any_dirty)
260	*any_dirty = false;
261
262	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
263	VM_WARN_ON_FOLIO(!folio_test_large(folio) \|\| max_nr < `1`, folio);
264	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
265
266	/ Limit max_nr to the actual remaining PFNs in the folio we could batch. /
267	max_nr = min_t(unsigned long, max_nr,
268	folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
269
270	nr = pte_batch_hint(ptep: start_ptep, pte);
271	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
272	ptep = start_ptep + nr;
273
274	while (nr < max_nr) {
275	pte = ptep_get(ptep);
276	if (any_writable)
277	writable = !!pte_write(pte);
278	if (any_young)
279	young = !!pte_young(pte);
280	if (any_dirty)
281	dirty = !!pte_dirty(pte);
282	pte = __pte_batch_clear_ignored(pte, flags);
283
284	if (!pte_same(a: pte, b: expected_pte))
285	break;
286
287	if (any_writable)
288	*any_writable \|= writable;
289	if (any_young)
290	*any_young \|= young;
291	if (any_dirty)
292	*any_dirty \|= dirty;
293
294	cur_nr = pte_batch_hint(ptep, pte);
295	expected_pte = pte_advance_pfn(pte: expected_pte, nr: cur_nr);
296	ptep += cur_nr;
297	nr += cur_nr;
298	}
299
300	return min(nr, max_nr);
301	}
302
303	/**
304	* pte_move_swp_offset - Move the swap entry offset field of a swap pte
305	* forward or backward by delta
306	* @pte: The initial pte state; is_swap_pte(pte) must be true and
307	* non_swap_entry() must be false.
308	* @delta: The direction and the offset we are moving; forward if delta
309	* is positive; backward if delta is negative
310	*
311	* Moves the swap offset, while maintaining all other fields, including
312	* swap type, and any swp pte bits. The resulting pte is returned.
313	*/
314	static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
315	{
316	swp_entry_t entry = pte_to_swp_entry(pte);
317	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
318	(swp_offset(entry) + delta)));
319
320	if (pte_swp_soft_dirty(pte))
321	new = pte_swp_mksoft_dirty(pte: new);
322	if (pte_swp_exclusive(pte))
323	new = pte_swp_mkexclusive(pte: new);
324	if (pte_swp_uffd_wp(pte))
325	new = pte_swp_mkuffd_wp(pte: new);
326
327	return new;
328	}
329
330
331	/**
332	* pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
333	* @pte: The initial pte state; is_swap_pte(pte) must be true and
334	* non_swap_entry() must be false.
335	*
336	* Increments the swap offset, while maintaining all other fields, including
337	* swap type, and any swp pte bits. The resulting pte is returned.
338	*/
339	static inline pte_t pte_next_swp_offset(pte_t pte)
340	{
341	return pte_move_swp_offset(pte, delta: `1`);
342	}
343
344	/**
345	* swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
346	* @start_ptep: Page table pointer for the first entry.
347	* @max_nr: The maximum number of table entries to consider.
348	* @pte: Page table entry for the first entry.
349	*
350	* Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
351	* containing swap entries all with consecutive offsets and targeting the same
352	* swap type, all with matching swp pte bits.
353	*
354	* max_nr must be at least one and must be limited by the caller so scanning
355	* cannot exceed a single page table.
356	*
357	* Return: the number of table entries in the batch.
358	*/
359	static inline int swap_pte_batch(pte_t start_ptep, int* max_nr, pte_t pte)
360	{
361	pte_t expected_pte = pte_next_swp_offset(pte);
362	const pte_t *end_ptep = start_ptep + max_nr;
363	swp_entry_t entry = pte_to_swp_entry(pte);
364	pte_t *ptep = start_ptep + `1`;
365	unsigned short cgroup_id;
366
367	VM_WARN_ON(max_nr < `1`);
368	VM_WARN_ON(!is_swap_pte(pte));
369	VM_WARN_ON(non_swap_entry(entry));
370
371	cgroup_id = lookup_swap_cgroup_id(ent: entry);
372	while (ptep < end_ptep) {
373	pte = ptep_get(ptep);
374
375	if (!pte_same(a: pte, b: expected_pte))
376	break;
377	if (lookup_swap_cgroup_id(ent: pte_to_swp_entry(pte)) != cgroup_id)
378	break;
379	expected_pte = pte_next_swp_offset(pte: expected_pte);
380	ptep++;
381	}
382
383	return ptep - start_ptep;
384	}
385	#endif /* CONFIG_MMU */
386
387	void __acct_reclaim_writeback(pg_data_t pgdat, struct* folio *folio,
388	int nr_throttled);
389	static inline void acct_reclaim_writeback(struct folio *folio)
390	{
391	pg_data_t *pgdat = folio_pgdat(folio);
392	int nr_throttled = atomic_read(v: &pgdat->nr_writeback_throttled);
393
394	if (nr_throttled)
395	__acct_reclaim_writeback(pgdat, folio, nr_throttled);
396	}
397
398	static inline void wake_throttle_isolated(pg_data_t *pgdat)
399	{
400	wait_queue_head_t *wqh;
401
402	wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
403	if (waitqueue_active(wq_head: wqh))
404	wake_up(wqh);
405	}
406
407	vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf);
408	static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
409	{
410	vm_fault_t ret = __vmf_anon_prepare(vmf);
411
412	if (unlikely(ret & VM_FAULT_RETRY))
413	vma_end_read(vma: vmf->vma);
414	return ret;
415	}
416
417	vm_fault_t do_swap_page(struct vm_fault *vmf);
418	void folio_rotate_reclaimable(struct folio *folio);
419	bool __folio_end_writeback(struct folio *folio);
420	void deactivate_file_folio(struct folio *folio);
421	void folio_activate(struct folio *folio);
422
423	void free_pgtables(struct mmu_gather tlb, struct* ma_state *mas,
424	struct vm_area_struct start_vma, unsigned* long floor,
425	unsigned long ceiling, bool mm_wr_locked);
426	void pmd_install(struct mm_struct mm, pmd_t pmd, pgtable_t *pte);
427
428	struct zap_details;
429	void unmap_page_range(struct mmu_gather *tlb,
430	struct vm_area_struct *vma,
431	unsigned long addr, unsigned long end,
432	struct zap_details *details);
433	void zap_page_range_single_batched(struct mmu_gather *tlb,
434	struct vm_area_struct vma, unsigned* long addr,
435	unsigned long size, struct zap_details *details);
436	int folio_unmap_invalidate(struct address_space mapping, struct* folio *folio,
437	gfp_t gfp);
438
439	void page_cache_ra_order(struct readahead_control , struct* file_ra_state *,
440	unsigned int order);
441	void force_page_cache_ra(struct readahead_control , unsigned* long nr);
442	static inline void force_page_cache_readahead(struct address_space *mapping,
443	struct file file, pgoff_t index, unsigned* long nr_to_read)
444	{
445	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
446	force_page_cache_ra(&ractl, nr: nr_to_read);
447	}
448
449	unsigned find_lock_entries(struct address_space mapping, pgoff_t start,
450	pgoff_t end, struct folio_batch fbatch, pgoff_t indices);
451	unsigned find_get_entries(struct address_space mapping, pgoff_t start,
452	pgoff_t end, struct folio_batch fbatch, pgoff_t indices);
453	void filemap_free_folio(struct address_space mapping, struct* folio *folio);
454	int truncate_inode_folio(struct address_space mapping, struct* folio *folio);
455	bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
456	loff_t end);
457	long mapping_evict_folio(struct address_space mapping, struct* folio *folio);
458	unsigned long mapping_try_invalidate(struct address_space *mapping,
459	pgoff_t start, pgoff_t end, unsigned long *nr_failed);
460
461	/**
462	* folio_evictable - Test whether a folio is evictable.
463	* @folio: The folio to test.
464	*
465	* Test whether @folio is evictable -- i.e., should be placed on
466	* active/inactive lists vs unevictable list.
467	*
468	* Reasons folio might not be evictable:
469	* 1. folio's mapping marked unevictable
470	* 2. One of the pages in the folio is part of an mlocked VMA
471	*/
472	static inline bool folio_evictable(struct folio *folio)
473	{
474	bool ret;
475
476	/ Prevent address_space of inode and swap cache from being freed /
477	rcu_read_lock();
478	ret = !mapping_unevictable(mapping: folio_mapping(folio)) &&
479	!folio_test_mlocked(folio);
480	rcu_read_unlock();
481	return ret;
482	}
483
484	/*
485	* Turn a non-refcounted page (->_refcount == 0) into refcounted with
486	* a count of one.
487	*/
488	static inline void set_page_refcounted(struct page *page)
489	{
490	VM_BUG_ON_PAGE(PageTail(page), page);
491	VM_BUG_ON_PAGE(page_ref_count(page), page);
492	set_page_count(page, v: `1`);
493	}
494
495	/*
496	* Return true if a folio needs ->release_folio() calling upon it.
497	*/
498	static inline bool folio_needs_release(struct folio *folio)
499	{
500	struct address_space *mapping = folio_mapping(folio);
501
502	return folio_has_private(folio) \|\|
503	(mapping && mapping_release_always(mapping));
504	}
505
506	extern unsigned long highest_memmap_pfn;
507
508	/*
509	* Maximum number of reclaim retries without progress before the OOM
510	* killer is consider the only way forward.
511	*/
512	#define MAX_RECLAIM_RETRIES 16
513
514	/*
515	* in mm/vmscan.c:
516	*/
517	bool folio_isolate_lru(struct folio *folio);
518	void folio_putback_lru(struct folio *folio);
519	extern void reclaim_throttle(pg_data_t pgdat, enum* vmscan_throttle_state reason);
520
521	/*
522	* in mm/rmap.c:
523	*/
524	pmd_t mm_find_pmd(struct* mm_struct mm, unsigned* long address);
525
526	/*
527	* in mm/page_alloc.c
528	*/
529	#define K(x) ((x) << (PAGE_SHIFT-10))
530
531	extern char * const zone_names[MAX_NR_ZONES];
532
533	/ perform sanity checks on struct pages being allocated or freed /
534	DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
535
536	extern int min_free_kbytes;
537	extern int defrag_mode;
538
539	void setup_per_zone_wmarks(void);
540	void calculate_min_free_kbytes(void);
541	int __meminit init_per_zone_wmark_min(void);
542	void page_alloc_sysctl_init(void);
543
544	/*
545	* Structure for holding the mostly immutable allocation parameters passed
546	* between functions involved in allocations, including the alloc_pages*
547	* family of functions.
548	*
549	* nodemask, migratetype and highest_zoneidx are initialized only once in
550	* __alloc_pages() and then never change.
551	*
552	* zonelist, preferred_zone and highest_zoneidx are set first in
553	* __alloc_pages() for the fast path, and might be later changed
554	* in __alloc_pages_slowpath(). All other functions pass the whole structure
555	* by a const pointer.
556	*/
557	struct alloc_context {
558	struct zonelist *zonelist;
559	nodemask_t *nodemask;
560	struct zoneref *preferred_zoneref;
561	int migratetype;
562
563	/*
564	* highest_zoneidx represents highest usable zone index of
565	* the allocation request. Due to the nature of the zone,
566	* memory on lower zone than the highest_zoneidx will be
567	* protected by lowmem_reserve[highest_zoneidx].
568	*
569	* highest_zoneidx is also used by reclaim/compaction to limit
570	* the target zone since higher zone than this index cannot be
571	* usable for this allocation request.
572	*/
573	enum zone_type highest_zoneidx;
574	bool spread_dirty_pages;
575	};
576
577	/*
578	* This function returns the order of a free page in the buddy system. In
579	* general, page_zone(page)->lock must be held by the caller to prevent the
580	* page from being allocated in parallel and returning garbage as the order.
581	* If a caller does not hold page_zone(page)->lock, it must guarantee that the
582	* page cannot be allocated or merged in parallel. Alternatively, it must
583	* handle invalid values gracefully, and use buddy_order_unsafe() below.
584	*/
585	static inline unsigned int buddy_order(struct page *page)
586	{
587	/ PageBuddy() must be checked by the caller /
588	return page_private(page);
589	}
590
591	/*
592	* Like buddy_order(), but for callers who cannot afford to hold the zone lock.
593	* PageBuddy() should be checked first by the caller to minimize race window,
594	* and invalid values must be handled gracefully.
595	*
596	* READ_ONCE is used so that if the caller assigns the result into a local
597	* variable and e.g. tests it for valid range before using, the compiler cannot
598	* decide to remove the variable and inline the page_private(page) multiple
599	* times, potentially observing different values in the tests and the actual
600	* use of the result.
601	*/
602	#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
603
604	/*
605	* This function checks whether a page is free && is the buddy
606	* we can coalesce a page and its buddy if
607	* (a) the buddy is not in a hole (check before calling!) &&
608	* (b) the buddy is in the buddy system &&
609	* (c) a page and its buddy have the same order &&
610	* (d) a page and its buddy are in the same zone.
611	*
612	* For recording whether a page is in the buddy system, we set PageBuddy.
613	* Setting, clearing, and testing PageBuddy is serialized by zone->lock.
614	*
615	* For recording page's order, we use page_private(page).
616	*/
617	static inline bool page_is_buddy(struct page page, struct* page *buddy,
618	unsigned int order)
619	{
620	if (!page_is_guard(page: buddy) && !PageBuddy(page: buddy))
621	return false;
622
623	if (buddy_order(page: buddy) != order)
624	return false;
625
626	/*
627	* zone check is done late to avoid uselessly calculating
628	* zone/node ids for pages that could never merge.
629	*/
630	if (page_zone_id(page) != page_zone_id(page: buddy))
631	return false;
632
633	VM_BUG_ON_PAGE(page_count(buddy) != `0`, buddy);
634
635	return true;
636	}
637
638	/*
639	* Locate the struct page for both the matching buddy in our
640	* pair (buddy1) and the combined O(n+1) page they form (page).
641	*
642	* 1) Any buddy B1 will have an order O twin B2 which satisfies
643	* the following equation:
644	* B2 = B1 ^ (1 << O)
645	* For example, if the starting buddy (buddy2) is #8 its order
646	* 1 buddy is #10:
647	* B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
648	*
649	* 2) Any buddy B will have an order O+1 parent P which
650	* satisfies the following equation:
651	* P = B & ~(1 << O)
652	*
653	* Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
654	*/
655	static inline unsigned long
656	__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
657	{
658	return page_pfn ^ (`1` << order);
659	}
660
661	/*
662	* Find the buddy of @page and validate it.
663	* @page: The input page
664	* @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
665	* function is used in the performance-critical __free_one_page().
666	* @order: The order of the page
667	* @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
668	* page_to_pfn().
669	*
670	* The found buddy can be a non PageBuddy, out of @page's zone, or its order is
671	* not the same as @page. The validation is necessary before use it.
672	*
673	* Return: the found buddy page or NULL if not found.
674	*/
675	static inline struct page find_buddy_page_pfn(struct* page *page,
676	unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
677	{
678	unsigned long __buddy_pfn = __find_buddy_pfn(page_pfn: pfn, order);
679	struct page *buddy;
680
681	buddy = page + (__buddy_pfn - pfn);
682	if (buddy_pfn)
683	*buddy_pfn = __buddy_pfn;
684
685	if (page_is_buddy(page, buddy, order))
686	return buddy;
687	return NULL;
688	}
689
690	extern struct page __pageblock_pfn_to_page(unsigned* long start_pfn,
691	unsigned long end_pfn, struct zone *zone);
692
693	static inline struct page pageblock_pfn_to_page(unsigned* long start_pfn,
694	unsigned long end_pfn, struct zone *zone)
695	{
696	if (zone->contiguous)
697	return pfn_to_page(start_pfn);
698
699	return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
700	}
701
702	void set_zone_contiguous(struct zone *zone);
703	bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
704	unsigned long nr_pages);
705
706	static inline void clear_zone_contiguous(struct zone *zone)
707	{
708	zone->contiguous = false;
709	}
710
711	extern int __isolate_free_page(struct page page, unsigned* int order);
712	extern void __putback_isolated_page(struct page page, unsigned* int order,
713	int mt);
714	extern void memblock_free_pages(struct page page, unsigned* long pfn,
715	unsigned int order);
716	extern void __free_pages_core(struct page page, unsigned* int order,
717	enum meminit_context context);
718
719	/*
720	* This will have no effect, other than possibly generating a warning, if the
721	* caller passes in a non-large folio.
722	*/
723	static inline void folio_set_order(struct folio folio, unsigned* int order)
724	{
725	if (WARN_ON_ONCE(!order \|\| !folio_test_large(folio)))
726	return;
727
728	folio->_flags_1 = (folio->_flags_1 & ~`0xffUL`) \| order;
729	#ifdef NR_PAGES_IN_LARGE_FOLIO
730	folio->_nr_pages = `1U` << order;
731	#endif
732	}
733
734	bool __folio_unqueue_deferred_split(struct folio *folio);
735	static inline bool folio_unqueue_deferred_split(struct folio *folio)
736	{
737	if (folio_order(folio) <= `1` \|\| !folio_test_large_rmappable(folio))
738	return false;
739
740	/*
741	* At this point, there is no one trying to add the folio to
742	* deferred_list. If folio is not in deferred_list, it's safe
743	* to check without acquiring the split_queue_lock.
744	*/
745	if (data_race(list_empty(&folio->_deferred_list)))
746	return false;
747
748	return __folio_unqueue_deferred_split(folio);
749	}
750
751	static inline struct folio page_rmappable_folio(struct* page *page)
752	{
753	struct folio folio = (struct* folio *)page;
754
755	if (folio && folio_test_large(folio))
756	folio_set_large_rmappable(folio);
757	return folio;
758	}
759
760	static inline void prep_compound_head(struct page page, unsigned* int order)
761	{
762	struct folio folio = (struct* folio *)page;
763
764	folio_set_order(folio, order);
765	atomic_set(v: &folio->_large_mapcount, i: -`1`);
766	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
767	atomic_set(v: &folio->_nr_pages_mapped, i: `0`);
768	if (IS_ENABLED(CONFIG_MM_ID)) {
769	folio->_mm_ids = `0`;
770	folio->_mm_id_mapcount[`0`] = -`1`;
771	folio->_mm_id_mapcount[`1`] = -`1`;
772	}
773	if (IS_ENABLED(CONFIG_64BIT) \|\| order > `1`) {
774	atomic_set(v: &folio->_pincount, i: `0`);
775	atomic_set(v: &folio->_entire_mapcount, i: -`1`);
776	}
777	if (order > `1`)
778	INIT_LIST_HEAD(list: &folio->_deferred_list);
779	}
780
781	static inline void prep_compound_tail(struct page head, int* tail_idx)
782	{
783	struct page *p = head + tail_idx;
784
785	p->mapping = TAIL_MAPPING;
786	set_compound_head(page: p, head);
787	set_page_private(page: p, private: `0`);
788	}
789
790	void post_alloc_hook(struct page page, unsigned* int order, gfp_t gfp_flags);
791	extern bool free_pages_prepare(struct page page, unsigned* int order);
792
793	extern int user_min_free_kbytes;
794
795	struct page __alloc_frozen_pages_noprof(gfp_t, unsigned* int order, int nid,
796	nodemask_t *);
797	#define __alloc_frozen_pages(...) \
798	alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
799	void free_frozen_pages(struct page page, unsigned* int order);
800	void free_unref_folios(struct folio_batch *fbatch);
801
802	#ifdef CONFIG_NUMA
803	struct page alloc_frozen_pages_noprof(gfp_t, unsigned* int order);
804	#else
805	static inline struct page alloc_frozen_pages_noprof(gfp_t gfp, unsigned* int order)
806	{
807	return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL);
808	}
809	#endif
810
811	#define alloc_frozen_pages(...) \
812	alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))
813
814	extern void zone_pcp_reset(struct zone *zone);
815	extern void zone_pcp_disable(struct zone *zone);
816	extern void zone_pcp_enable(struct zone *zone);
817	extern void zone_pcp_init(struct zone *zone);
818
819	extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
820	phys_addr_t min_addr,
821	int nid, bool exact_nid);
822
823	void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
824	unsigned long, enum meminit_context, struct vmem_altmap , int*);
825
826	#if defined CONFIG_COMPACTION \|\| defined CONFIG_CMA
827
828	/*
829	* in mm/compaction.c
830	*/
831	/*
832	* compact_control is used to track pages being migrated and the free pages
833	* they are being migrated to during memory compaction. The free_pfn starts
834	* at the end of a zone and migrate_pfn begins at the start. Movable pages
835	* are moved to the end of a zone during a compaction run and the run
836	* completes when free_pfn <= migrate_pfn
837	*/
838	struct compact_control {
839	struct list_head freepages[NR_PAGE_ORDERS]; / List of free pages to migrate to /
840	struct list_head migratepages; / List of pages being migrated /
841	unsigned int nr_freepages; / Number of isolated free pages /
842	unsigned int nr_migratepages; / Number of pages to migrate /
843	unsigned long free_pfn; / isolate_freepages search base /
844	/*
845	* Acts as an in/out parameter to page isolation for migration.
846	* isolate_migratepages uses it as a search base.
847	* isolate_migratepages_block will update the value to the next pfn
848	* after the last isolated one.
849	*/
850	unsigned long migrate_pfn;
851	unsigned long fast_start_pfn; / a pfn to start linear scan from /
852	struct zone *zone;
853	unsigned long total_migrate_scanned;
854	unsigned long total_free_scanned;
855	unsigned short fast_search_fail;/ failures to use free list searches /
856	short search_order; / order to start a fast search at /
857	const gfp_t gfp_mask; / gfp mask of a direct compactor /
858	int order; / order a direct compactor needs /
859	int migratetype; / migratetype of direct compactor /
860	const unsigned int alloc_flags; / alloc flags of a direct compactor /
861	const int highest_zoneidx; / zone index of a direct compactor /
862	enum migrate_mode mode; / Async or sync migration mode /
863	bool ignore_skip_hint; / Scan blocks even if marked skip /
864	bool no_set_skip_hint; / Don't mark blocks for skipping /
865	bool ignore_block_suitable; / Scan blocks considered unsuitable /
866	bool direct_compaction; / False from kcompactd or /proc/... /
867	bool proactive_compaction; / kcompactd proactive compaction /
868	bool whole_zone; / Whole zone should/has been scanned /
869	bool contended; / Signal lock contention /
870	bool finish_pageblock; / Scan the remainder of a pageblock. Used*
871	* when there are potentially transient
872	* isolation or migration failures to
873	* ensure forward progress.
874	*/
875	bool alloc_contig; / alloc_contig_range allocation /
876	};
877
878	/*
879	* Used in direct compaction when a page should be taken from the freelists
880	* immediately when one is created during the free path.
881	*/
882	struct capture_control {
883	struct compact_control *cc;
884	struct page *page;
885	};
886
887	unsigned long
888	isolate_freepages_range(struct compact_control *cc,
889	unsigned long start_pfn, unsigned long end_pfn);
890	int
891	isolate_migratepages_range(struct compact_control *cc,
892	unsigned long low_pfn, unsigned long end_pfn);
893
894	/ Free whole pageblock and set its migration type to MIGRATE_CMA. /
895	void init_cma_reserved_pageblock(struct page *page);
896
897	#endif /* CONFIG_COMPACTION \|\| CONFIG_CMA */
898
899	struct cma;
900
901	#ifdef CONFIG_CMA
902	void cma_reserve_early(struct* cma cma, unsigned* long size);
903	void init_cma_pageblock(struct page *page);
904	#else
905	static inline void cma_reserve_early(struct* cma cma, unsigned* long size)
906	{
907	return NULL;
908	}
909	static inline void init_cma_pageblock(struct page *page)
910	{
911	}
912	#endif
913
914
915	int find_suitable_fallback(struct free_area area, unsigned* int order,
916	int migratetype, bool claimable);
917
918	static inline bool free_area_empty(struct free_area area, int* migratetype)
919	{
920	return list_empty(head: &area->free_list[migratetype]);
921	}
922
923	/ mm/util.c /
924	struct anon_vma folio_anon_vma(const* struct folio *folio);
925
926	#ifdef CONFIG_MMU
927	void unmap_mapping_folio(struct folio *folio);
928	extern long populate_vma_page_range(struct vm_area_struct *vma,
929	unsigned long start, unsigned long end, int *locked);
930	extern long faultin_page_range(struct mm_struct mm, unsigned* long start,
931	unsigned long end, bool write, int *locked);
932	extern bool mlock_future_ok(struct mm_struct mm, unsigned* long flags,
933	unsigned long bytes);
934
935	/*
936	* NOTE: This function can't tell whether the folio is "fully mapped" in the
937	* range.
938	* "fully mapped" means all the pages of folio is associated with the page
939	* table of range while this function just check whether the folio range is
940	* within the range [start, end). Function caller needs to do page table
941	* check if it cares about the page table association.
942	*
943	* Typical usage (like mlock or madvise) is:
944	* Caller knows at least 1 page of folio is associated with page table of VMA
945	* and the range [start, end) is intersect with the VMA range. Caller wants
946	* to know whether the folio is fully associated with the range. It calls
947	* this function to check whether the folio is in the range first. Then checks
948	* the page table to know whether the folio is fully mapped to the range.
949	*/
950	static inline bool
951	folio_within_range(struct folio folio, struct* vm_area_struct *vma,
952	unsigned long start, unsigned long end)
953	{
954	pgoff_t pgoff, addr;
955	unsigned long vma_pglen = vma_pages(vma);
956
957	VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
958	if (start > end)
959	return false;
960
961	if (start < vma->vm_start)
962	start = vma->vm_start;
963
964	if (end > vma->vm_end)
965	end = vma->vm_end;
966
967	pgoff = folio_pgoff(folio);
968
969	/ if folio start address is not in vma range /
970	if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
971	return false;
972
973	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
974
975	return !(addr < start \|\| end - addr < folio_size(folio));
976	}
977
978	static inline bool
979	folio_within_vma(struct folio folio, struct* vm_area_struct *vma)
980	{
981	return folio_within_range(folio, vma, start: vma->vm_start, end: vma->vm_end);
982	}
983
984	/*
985	* mlock_vma_folio() and munlock_vma_folio():
986	* should be called with vma's mmap_lock held for read or write,
987	* under page table lock for the pte/pmd being added or removed.
988	*
989	* mlock is usually called at the end of folio_add__rmap_(), munlock at
990	* the end of folio_remove_rmap_*(); but new anon folios are managed by
991	* folio_add_lru_vma() calling mlock_new_folio().
992	*/
993	void mlock_folio(struct folio *folio);
994	static inline void mlock_vma_folio(struct folio *folio,
995	struct vm_area_struct *vma)
996	{
997	/*
998	* The VM_SPECIAL check here serves two purposes.
999	* 1) VM_IO check prevents migration from double-counting during mlock.
1000	* 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
1001	* is never left set on a VM_SPECIAL vma, there is an interval while
1002	* file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
1003	* still be set while VM_SPECIAL bits are added: so ignore it then.
1004	*/
1005	if (unlikely((vma->vm_flags & (VM_LOCKED\|VM_SPECIAL)) == VM_LOCKED))
1006	mlock_folio(folio);
1007	}
1008
1009	void munlock_folio(struct folio *folio);
1010	static inline void munlock_vma_folio(struct folio *folio,
1011	struct vm_area_struct *vma)
1012	{
1013	/*
1014	* munlock if the function is called. Ideally, we should only
1015	* do munlock if any page of folio is unmapped from VMA and
1016	* cause folio not fully mapped to VMA.
1017	*
1018	* But it's not easy to confirm that's the situation. So we
1019	* always munlock the folio and page reclaim will correct it
1020	* if it's wrong.
1021	*/
1022	if (unlikely(vma->vm_flags & VM_LOCKED))
1023	munlock_folio(folio);
1024	}
1025
1026	void mlock_new_folio(struct folio *folio);
1027	bool need_mlock_drain(int cpu);
1028	void mlock_drain_local(void);
1029	void mlock_drain_remote(int cpu);
1030
1031	extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
1032
1033	/**
1034	* vma_address - Find the virtual address a page range is mapped at
1035	* @vma: The vma which maps this object.
1036	* @pgoff: The page offset within its object.
1037	* @nr_pages: The number of pages to consider.
1038	*
1039	* If any page in this range is mapped by this VMA, return the first address
1040	* where any of these pages appear. Otherwise, return -EFAULT.
1041	*/
1042	static inline unsigned long vma_address(const struct vm_area_struct *vma,
1043	pgoff_t pgoff, unsigned long nr_pages)
1044	{
1045	unsigned long address;
1046
1047	if (pgoff >= vma->vm_pgoff) {
1048	address = vma->vm_start +
1049	((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1050	/ Check for address beyond vma (or wrapped through 0?) /
1051	if (address < vma->vm_start \|\| address >= vma->vm_end)
1052	address = -EFAULT;
1053	} else if (pgoff + nr_pages - `1` >= vma->vm_pgoff) {
1054	/ Test above avoids possibility of wrap to 0 on 32-bit /
1055	address = vma->vm_start;
1056	} else {
1057	address = -EFAULT;
1058	}
1059	return address;
1060	}
1061
1062	/*
1063	* Then at what user virtual address will none of the range be found in vma?
1064	* Assumes that vma_address() already returned a good starting address.
1065	*/
1066	static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
1067	{
1068	struct vm_area_struct *vma = pvmw->vma;
1069	pgoff_t pgoff;
1070	unsigned long address;
1071
1072	/ Common case, plus ->pgoff is invalid for KSM /
1073	if (pvmw->nr_pages == `1`)
1074	return pvmw->address + PAGE_SIZE;
1075
1076	pgoff = pvmw->pgoff + pvmw->nr_pages;
1077	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1078	/ Check for address beyond vma (or wrapped through 0?) /
1079	if (address < vma->vm_start \|\| address > vma->vm_end)
1080	address = vma->vm_end;
1081	return address;
1082	}
1083
1084	static inline struct file maybe_unlock_mmap_for_io(struct* vm_fault *vmf,
1085	struct file *fpin)
1086	{
1087	int flags = vmf->flags;
1088
1089	if (fpin)
1090	return fpin;
1091
1092	/*
1093	* FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
1094	* anything, so we only pin the file and drop the mmap_lock if only
1095	* FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
1096	*/
1097	if (fault_flag_allow_retry_first(flags) &&
1098	!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
1099	fpin = get_file(f: vmf->vma->vm_file);
1100	release_fault_lock(vmf);
1101	}
1102	return fpin;
1103	}
1104	#else /* !CONFIG_MMU */
1105	static inline void unmap_mapping_folio(struct folio *folio) { }
1106	static inline void mlock_new_folio(struct folio *folio) { }
1107	static inline bool need_mlock_drain(int cpu) { return false; }
1108	static inline void mlock_drain_local(void) { }
1109	static inline void mlock_drain_remote(int cpu) { }
1110	static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
1111	{
1112	}
1113	#endif /* !CONFIG_MMU */
1114
1115	/ Memory initialisation debug and verification /
1116	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1117	DECLARE_STATIC_KEY_TRUE(deferred_pages);
1118
1119	bool __init deferred_grow_zone(struct zone zone, unsigned* int order);
1120	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1121
1122	void init_deferred_page(unsigned long pfn, int nid);
1123
1124	enum mminit_level {
1125	MMINIT_WARNING,
1126	MMINIT_VERIFY,
1127	MMINIT_TRACE
1128	};
1129
1130	#ifdef CONFIG_DEBUG_MEMORY_INIT
1131
1132	extern int mminit_loglevel;
1133
1134	#define mminit_dprintk(level, prefix, fmt, arg...) \
1135	do { \
1136	if (level < mminit_loglevel) { \
1137	if (level <= MMINIT_WARNING) \
1138	pr_warn("mminit::" prefix " " fmt, ##arg); \
1139	else \
1140	printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
1141	} \
1142	} while (0)
1143
1144	extern void mminit_verify_pageflags_layout(void);
1145	extern void mminit_verify_zonelist(void);
1146	#else
1147
1148	static inline void mminit_dprintk(enum mminit_level level,
1149	const char prefix, const* char *fmt, ...)
1150	{
1151	}
1152
1153	static inline void mminit_verify_pageflags_layout(void)
1154	{
1155	}
1156
1157	static inline void mminit_verify_zonelist(void)
1158	{
1159	}
1160	#endif /* CONFIG_DEBUG_MEMORY_INIT */
1161
1162	#define NODE_RECLAIM_NOSCAN -2
1163	#define NODE_RECLAIM_FULL -1
1164	#define NODE_RECLAIM_SOME 0
1165	#define NODE_RECLAIM_SUCCESS 1
1166
1167	#ifdef CONFIG_NUMA
1168	extern int node_reclaim_mode;
1169
1170	extern int node_reclaim(struct pglist_data , gfp_t, unsigned* int);
1171	extern int find_next_best_node(int node, nodemask_t *used_node_mask);
1172	#else
1173	#define node_reclaim_mode 0
1174
1175	static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
1176	unsigned int order)
1177	{
1178	return NODE_RECLAIM_NOSCAN;
1179	}
1180	static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
1181	{
1182	return NUMA_NO_NODE;
1183	}
1184	#endif
1185
1186	static inline bool node_reclaim_enabled(void)
1187	{
1188	/ Is any node_reclaim_mode bit set? /
1189	return node_reclaim_mode & (RECLAIM_ZONE\|RECLAIM_WRITE\|RECLAIM_UNMAP);
1190	}
1191
1192	/*
1193	* mm/memory-failure.c
1194	*/
1195	#ifdef CONFIG_MEMORY_FAILURE
1196	int unmap_poisoned_folio(struct folio folio, unsigned* long pfn, bool must_kill);
1197	void shake_folio(struct folio *folio);
1198	extern int hwpoison_filter(struct page *p);
1199
1200	extern u32 hwpoison_filter_dev_major;
1201	extern u32 hwpoison_filter_dev_minor;
1202	extern u64 hwpoison_filter_flags_mask;
1203	extern u64 hwpoison_filter_flags_value;
1204	extern u64 hwpoison_filter_memcg;
1205	extern u32 hwpoison_filter_enable;
1206	#define MAGIC_HWPOISON 0x48575053U /* HWPS */
1207	void SetPageHWPoisonTakenOff(struct page *page);
1208	void ClearPageHWPoisonTakenOff(struct page *page);
1209	bool take_page_off_buddy(struct page *page);
1210	bool put_page_back_buddy(struct page *page);
1211	struct task_struct task_early_kill(struct* task_struct tsk, int* force_early);
1212	void add_to_kill_ksm(struct task_struct tsk, const* struct page *p,
1213	struct vm_area_struct vma, struct* list_head *to_kill,
1214	unsigned long ksm_addr);
1215	unsigned long page_mapped_in_vma(const struct page *page,
1216	struct vm_area_struct *vma);
1217
1218	#else
1219	static inline int unmap_poisoned_folio(struct folio folio, unsigned* long pfn, bool must_kill)
1220	{
1221	return -EBUSY;
1222	}
1223	#endif
1224
1225	extern unsigned long __must_check vm_mmap_pgoff(struct file , unsigned* long,
1226	unsigned long, unsigned long,
1227	unsigned long, unsigned long);
1228
1229	extern void set_pageblock_order(void);
1230	struct folio alloc_migrate_folio(struct* folio src, unsigned* long private);
1231	unsigned long reclaim_pages(struct list_head *folio_list);
1232	unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1233	struct list_head *folio_list);
1234	/ The ALLOC_WMARK bits are used as an index to zone->watermark /
1235	#define ALLOC_WMARK_MIN WMARK_MIN
1236	#define ALLOC_WMARK_LOW WMARK_LOW
1237	#define ALLOC_WMARK_HIGH WMARK_HIGH
1238	#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1239
1240	/ Mask to get the watermark bits /
1241	#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1242
1243	/*
1244	* Only MMU archs have async oom victim reclaim - aka oom_reaper so we
1245	* cannot assume a reduced access to memory reserves is sufficient for
1246	* !MMU
1247	*/
1248	#ifdef CONFIG_MMU
1249	#define ALLOC_OOM 0x08
1250	#else
1251	#define ALLOC_OOM ALLOC_NO_WATERMARKS
1252	#endif
1253
1254	#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access
1255	* to 25% of the min watermark or
1256	* 62.5% if __GFP_HIGH is set.
1257	*/
1258	#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50%
1259	* of the min watermark.
1260	*/
1261	#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1262	#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
1263	#ifdef CONFIG_ZONE_DMA32
1264	#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
1265	#else
1266	#define ALLOC_NOFRAGMENT 0x0
1267	#endif
1268	#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
1269	#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */
1270	#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
1271
1272	/ Flags that allow allocations below the min watermark. /
1273	#define ALLOC_RESERVES (ALLOC_NON_BLOCK\|ALLOC_MIN_RESERVE\|ALLOC_HIGHATOMIC\|ALLOC_OOM)
1274
1275	enum ttu_flags;
1276	struct tlbflush_unmap_batch;
1277
1278
1279	/*
1280	* only for MM internal work items which do not depend on
1281	* any allocations or locks which might depend on allocations
1282	*/
1283	extern struct workqueue_struct *mm_percpu_wq;
1284
1285	#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
1286	void try_to_unmap_flush(void);
1287	void try_to_unmap_flush_dirty(void);
1288	void flush_tlb_batched_pending(struct mm_struct *mm);
1289	#else
1290	static inline void try_to_unmap_flush(void)
1291	{
1292	}
1293	static inline void try_to_unmap_flush_dirty(void)
1294	{
1295	}
1296	static inline void flush_tlb_batched_pending(struct mm_struct *mm)
1297	{
1298	}
1299	#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
1300
1301	extern const struct trace_print_flags pageflag_names[];
1302	extern const struct trace_print_flags vmaflag_names[];
1303	extern const struct trace_print_flags gfpflag_names[];
1304
1305	static inline bool is_migrate_highatomic(enum migratetype migratetype)
1306	{
1307	return migratetype == MIGRATE_HIGHATOMIC;
1308	}
1309
1310	void setup_zone_pageset(struct zone *zone);
1311
1312	struct migration_target_control {
1313	int nid; / preferred node id /
1314	nodemask_t *nmask;
1315	gfp_t gfp_mask;
1316	enum migrate_reason reason;
1317	};
1318
1319	/*
1320	* mm/filemap.c
1321	*/
1322	size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
1323	struct folio *folio, loff_t fpos, size_t size);
1324
1325	/*
1326	* mm/vmalloc.c
1327	*/
1328	#ifdef CONFIG_MMU
1329	void __init vmalloc_init(void);
1330	int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
1331	pgprot_t prot, struct page *pages, unsigned* int page_shift);
1332	unsigned int get_vm_area_page_order(struct vm_struct *vm);
1333	#else
1334	static inline void vmalloc_init(void)
1335	{
1336	}
1337
1338	static inline
1339	int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
1340	pgprot_t prot, struct page *pages, unsigned* int page_shift)
1341	{
1342	return -EINVAL;
1343	}
1344	#endif
1345
1346	int __must_check __vmap_pages_range_noflush(unsigned long addr,
1347	unsigned long end, pgprot_t prot,
1348	struct page *pages, unsigned* int page_shift);
1349
1350	void vunmap_range_noflush(unsigned long start, unsigned long end);
1351
1352	void __vunmap_range_noflush(unsigned long start, unsigned long end);
1353
1354	int numa_migrate_check(struct folio folio, struct* vm_fault *vmf,
1355	unsigned long addr, int *flags, bool writable,
1356	int *last_cpupid);
1357
1358	void free_zone_device_folio(struct folio *folio);
1359	int migrate_device_coherent_folio(struct folio *folio);
1360
1361	struct vm_struct __get_vm_area_node(unsigned* long size,
1362	unsigned long align, unsigned long shift,
1363	unsigned long flags, unsigned long start,
1364	unsigned long end, int node, gfp_t gfp_mask,
1365	const void *caller);
1366
1367	/*
1368	* mm/gup.c
1369	*/
1370	int __must_check try_grab_folio(struct folio folio, int* refs,
1371	unsigned int flags);
1372
1373	/*
1374	* mm/huge_memory.c
1375	*/
1376	void touch_pud(struct vm_area_struct vma, unsigned* long addr,
1377	pud_t *pud, bool write);
1378	void touch_pmd(struct vm_area_struct vma, unsigned* long addr,
1379	pmd_t *pmd, bool write);
1380
1381	/*
1382	* Parses a string with mem suffixes into its order. Useful to parse kernel
1383	* parameters.
1384	*/
1385	static inline int get_order_from_str(const char *size_str,
1386	unsigned long valid_orders)
1387	{
1388	unsigned long size;
1389	char *endptr;
1390	int order;
1391
1392	size = memparse(ptr: size_str, retptr: &endptr);
1393
1394	if (!is_power_of_2(n: size))
1395	return -EINVAL;
1396	order = get_order(size);
1397	if (BIT(order) & ~valid_orders)
1398	return -EINVAL;
1399
1400	return order;
1401	}
1402
1403	enum {
1404	/ mark page accessed /
1405	FOLL_TOUCH = `1` << `16`,
1406	/ a retry, previous pass started an IO /
1407	FOLL_TRIED = `1` << `17`,
1408	/ we are working on non-current tsk/mm /
1409	FOLL_REMOTE = `1` << `18`,
1410	/ pages must be released via unpin_user_page /
1411	FOLL_PIN = `1` << `19`,
1412	/ gup_fast: prevent fall-back to slow gup /
1413	FOLL_FAST_ONLY = `1` << `20`,
1414	/ allow unlocking the mmap lock /
1415	FOLL_UNLOCKABLE = `1` << `21`,
1416	/ VMA lookup+checks compatible with MADV_POPULATE_(READ\|WRITE) /
1417	FOLL_MADV_POPULATE = `1` << `22`,
1418	};
1419
1420	#define INTERNAL_GUP_FLAGS (FOLL_TOUCH \| FOLL_TRIED \| FOLL_REMOTE \| FOLL_PIN \| \
1421	FOLL_FAST_ONLY \| FOLL_UNLOCKABLE \| \
1422	FOLL_MADV_POPULATE)
1423
1424	/*
1425	* Indicates for which pages that are write-protected in the page table,
1426	* whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
1427	* GUP pin will remain consistent with the pages mapped into the page tables
1428	* of the MM.
1429	*
1430	* Temporary unmapping of PageAnonExclusive() pages or clearing of
1431	* PageAnonExclusive() has to protect against concurrent GUP:
1432	* * Ordinary GUP: Using the PT lock
1433	* * GUP-fast and fork(): mm->write_protect_seq
1434	* * GUP-fast and KSM or temporary unmapping (swap, migration): see
1435	* folio_try_share_anon_rmap_*()
1436	*
1437	* Must be called with the (sub)page that's actually referenced via the
1438	* page table entry, which might not necessarily be the head page for a
1439	* PTE-mapped THP.
1440	*
1441	* If the vma is NULL, we're coming from the GUP-fast path and might have
1442	* to fallback to the slow path just to lookup the vma.
1443	*/
1444	static inline bool gup_must_unshare(struct vm_area_struct *vma,
1445	unsigned int flags, struct page *page)
1446	{
1447	/*
1448	* FOLL_WRITE is implicitly handled correctly as the page table entry
1449	* has to be writable -- and if it references (part of) an anonymous
1450	* folio, that part is required to be marked exclusive.
1451	*/
1452	if ((flags & (FOLL_WRITE \| FOLL_PIN)) != FOLL_PIN)
1453	return false;
1454	/*
1455	* Note: PageAnon(page) is stable until the page is actually getting
1456	* freed.
1457	*/
1458	if (!PageAnon(page)) {
1459	/*
1460	* We only care about R/O long-term pining: R/O short-term
1461	* pinning does not have the semantics to observe successive
1462	* changes through the process page tables.
1463	*/
1464	if (!(flags & FOLL_LONGTERM))
1465	return false;
1466
1467	/ We really need the vma ... /
1468	if (!vma)
1469	return true;
1470
1471	/*
1472	* ... because we only care about writable private ("COW")
1473	* mappings where we have to break COW early.
1474	*/
1475	return is_cow_mapping(flags: vma->vm_flags);
1476	}
1477
1478	/ Paired with a memory barrier in folio_try_share_anon_rmap_(). /*
1479	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
1480	smp_rmb();
1481
1482	/*
1483	* Note that KSM pages cannot be exclusive, and consequently,
1484	* cannot get pinned.
1485	*/
1486	return !PageAnonExclusive(page);
1487	}
1488
1489	extern bool mirrored_kernelcore;
1490	bool memblock_has_mirror(void);
1491	void memblock_free_all(void);
1492
1493	static __always_inline void vma_set_range(struct vm_area_struct *vma,
1494	unsigned long start, unsigned long end,
1495	pgoff_t pgoff)
1496	{
1497	vma->vm_start = start;
1498	vma->vm_end = end;
1499	vma->vm_pgoff = pgoff;
1500	}
1501
1502	static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
1503	{
1504	/*
1505	* NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
1506	* enablements, because when without soft-dirty being compiled in,
1507	* VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
1508	* will be constantly true.
1509	*/
1510	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
1511	return false;
1512
1513	/*
1514	* Soft-dirty is kind of special: its tracking is enabled when the
1515	* vma flags not set.
1516	*/
1517	return !(vma->vm_flags & VM_SOFTDIRTY);
1518	}
1519
1520	static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd)
1521	{
1522	return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd);
1523	}
1524
1525	static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte)
1526	{
1527	return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte);
1528	}
1529
1530	void __meminit __init_single_page(struct page page, unsigned* long pfn,
1531	unsigned long zone, int nid);
1532	void __meminit __init_page_from_nid(unsigned long pfn, int nid);
1533
1534	/ shrinker related functions /
1535	unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
1536	int priority);
1537
1538	#ifdef CONFIG_SHRINKER_DEBUG
1539	static inline __printf(`2`, `0`) int shrinker_debugfs_name_alloc(
1540	struct shrinker shrinker, const* char *fmt, va_list ap)
1541	{
1542	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, args: ap);
1543
1544	return shrinker->name ? `0` : -ENOMEM;
1545	}
1546
1547	static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
1548	{
1549	kfree_const(x: shrinker->name);
1550	shrinker->name = NULL;
1551	}
1552
1553	extern int shrinker_debugfs_add(struct shrinker *shrinker);
1554	extern struct dentry shrinker_debugfs_detach(struct* shrinker *shrinker,
1555	int *debugfs_id);
1556	extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
1557	int debugfs_id);
1558	#else /* CONFIG_SHRINKER_DEBUG */
1559	static inline int shrinker_debugfs_add(struct shrinker *shrinker)
1560	{
1561	return `0`;
1562	}
1563	static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
1564	const char *fmt, va_list ap)
1565	{
1566	return `0`;
1567	}
1568	static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
1569	{
1570	}
1571	static inline struct dentry shrinker_debugfs_detach(struct* shrinker *shrinker,
1572	int *debugfs_id)
1573	{
1574	*debugfs_id = -`1`;
1575	return NULL;
1576	}
1577	static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
1578	int debugfs_id)
1579	{
1580	}
1581	#endif /* CONFIG_SHRINKER_DEBUG */
1582
1583	/ Only track the nodes of mappings with shadow entries /
1584	void workingset_update_node(struct xa_node *node);
1585	extern struct list_lru shadow_nodes;
1586	#define mapping_set_update(xas, mapping) do { \
1587	if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \
1588	xas_set_update(xas, workingset_update_node); \
1589	xas_set_lru(xas, &shadow_nodes); \
1590	} \
1591	} while (0)
1592
1593	/ mremap.c /
1594	unsigned long move_page_tables(struct pagetable_move_control *pmc);
1595
1596	#ifdef CONFIG_UNACCEPTED_MEMORY
1597	void accept_page(struct page *page);
1598	#else /* CONFIG_UNACCEPTED_MEMORY */
1599	static inline void accept_page(struct page *page)
1600	{
1601	}
1602	#endif /* CONFIG_UNACCEPTED_MEMORY */
1603
1604	/ pagewalk.c /
1605	int walk_page_range_mm(struct mm_struct mm, unsigned* long start,
1606	unsigned long end, const struct mm_walk_ops *ops,
1607	void *private);
1608
1609	/ pt_reclaim.c /
1610	bool try_get_and_clear_pmd(struct mm_struct mm, pmd_t pmd, pmd_t *pmdval);
1611	void free_pte(struct mm_struct mm, unsigned* long addr, struct mmu_gather *tlb,
1612	pmd_t pmdval);
1613	void try_to_free_pte(struct mm_struct mm, pmd_t pmd, unsigned long addr,
1614	struct mmu_gather *tlb);
1615
1616	#ifdef CONFIG_PT_RECLAIM
1617	bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
1618	struct zap_details *details);
1619	#else
1620	static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
1621	struct zap_details *details)
1622	{
1623	return false;
1624	}
1625	#endif /* CONFIG_PT_RECLAIM */
1626
1627	void dup_mm_exe_file(struct mm_struct mm, struct* mm_struct *oldmm);
1628	int dup_mmap(struct mm_struct mm, struct* mm_struct *oldmm);
1629
1630	#endif /* __MM_INTERNAL_H */
1631

source code of linux/mm/internal.h