pagewalk.c source code [linux/mm/pagewalk.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/pagewalk.h>
3	#include <linux/highmem.h>
4	#include <linux/sched.h>
5	#include <linux/hugetlb.h>
6
7	/*
8	* We want to know the real level where a entry is located ignoring any
9	* folding of levels which may be happening. For example if p4d is folded then
10	* a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
11	*/
12	static int real_depth(int depth)
13	{
14	if (depth == `3` && PTRS_PER_PMD == `1`)
15	depth = `2`;
16	if (depth == `2` && PTRS_PER_PUD == `1`)
17	depth = `1`;
18	if (depth == `1` && PTRS_PER_P4D == `1`)
19	depth = `0`;
20	return depth;
21	}
22
23	static int walk_pte_range_inner(pte_t pte, unsigned* long addr,
24	unsigned long end, struct mm_walk *walk)
25	{
26	const struct mm_walk_ops *ops = walk->ops;
27	int err = `0`;
28
29	for (;;) {
30	err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
31	if (err)
32	break;
33	if (addr >= end - PAGE_SIZE)
34	break;
35	addr += PAGE_SIZE;
36	pte++;
37	}
38	return err;
39	}
40
41	static int walk_pte_range(pmd_t pmd, unsigned* long addr, unsigned long end,
42	struct mm_walk *walk)
43	{
44	pte_t *pte;
45	int err = `0`;
46	spinlock_t *ptl;
47
48	if (walk->no_vma) {
49	/*
50	* pte_offset_map() might apply user-specific validation.
51	* Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
52	* fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
53	* and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
54	*/
55	if (walk->mm == &init_mm \|\| addr >= TASK_SIZE)
56	pte = pte_offset_kernel(pmd, address: addr);
57	else
58	pte = pte_offset_map(pmd, addr);
59	if (pte) {
60	err = walk_pte_range_inner(pte, addr, end, walk);
61	if (walk->mm != &init_mm && addr < TASK_SIZE)
62	pte_unmap(pte);
63	}
64	} else {
65	pte = pte_offset_map_lock(mm: walk->mm, pmd, addr, ptlp: &ptl);
66	if (pte) {
67	err = walk_pte_range_inner(pte, addr, end, walk);
68	pte_unmap_unlock(pte, ptl);
69	}
70	}
71	if (!pte)
72	walk->action = ACTION_AGAIN;
73	return err;
74	}
75
76	#ifdef CONFIG_ARCH_HAS_HUGEPD
77	static int walk_hugepd_range(hugepd_t phpd, unsigned* long addr,
78	unsigned long end, struct mm_walk walk, int* pdshift)
79	{
80	int err = `0`;
81	const struct mm_walk_ops *ops = walk->ops;
82	int shift = hugepd_shift(*phpd);
83	int page_size = `1` << shift;
84
85	if (!ops->pte_entry)
86	return `0`;
87
88	if (addr & (page_size - `1`))
89	return `0`;
90
91	for (;;) {
92	pte_t *pte;
93
94	spin_lock(&walk->mm->page_table_lock);
95	pte = hugepte_offset(*phpd, addr, pdshift);
96	err = ops->pte_entry(pte, addr, addr + page_size, walk);
97	spin_unlock(&walk->mm->page_table_lock);
98
99	if (err)
100	break;
101	if (addr >= end - page_size)
102	break;
103	addr += page_size;
104	}
105	return err;
106	}
107	#else
108	static int walk_hugepd_range(hugepd_t phpd, unsigned* long addr,
109	unsigned long end, struct mm_walk walk, int* pdshift)
110	{
111	return `0`;
112	}
113	#endif
114
115	static int walk_pmd_range(pud_t pud, unsigned* long addr, unsigned long end,
116	struct mm_walk *walk)
117	{
118	pmd_t *pmd;
119	unsigned long next;
120	const struct mm_walk_ops *ops = walk->ops;
121	int err = `0`;
122	int depth = real_depth(depth: `3`);
123
124	pmd = pmd_offset(pud, address: addr);
125	do {
126	again:
127	next = pmd_addr_end(addr, end);
128	if (pmd_none(pmd: *pmd)) {
129	if (ops->pte_hole)
130	err = ops->pte_hole(addr, next, depth, walk);
131	if (err)
132	break;
133	continue;
134	}
135
136	walk->action = ACTION_SUBTREE;
137
138	/*
139	* This implies that each ->pmd_entry() handler
140	* needs to know about pmd_trans_huge() pmds
141	*/
142	if (ops->pmd_entry)
143	err = ops->pmd_entry(pmd, addr, next, walk);
144	if (err)
145	break;
146
147	if (walk->action == ACTION_AGAIN)
148	goto again;
149
150	/*
151	* Check this here so we only break down trans_huge
152	* pages when we _need_ to
153	*/
154	if ((!walk->vma && (pmd_leaf(pte: pmd) \|\| !pmd_present(pmd: pmd))) \|\|
155	walk->action == ACTION_CONTINUE \|\|
156	!(ops->pte_entry))
157	continue;
158
159	if (walk->vma)
160	split_huge_pmd(walk->vma, pmd, addr);
161
162	if (is_hugepd(__hugepd(pmd_val(*pmd))))
163	err = walk_hugepd_range(phpd: (hugepd_t *)pmd, addr, end: next, walk, PMD_SHIFT);
164	else
165	err = walk_pte_range(pmd, addr, end: next, walk);
166	if (err)
167	break;
168
169	if (walk->action == ACTION_AGAIN)
170	goto again;
171
172	} while (pmd++, addr = next, addr != end);
173
174	return err;
175	}
176
177	static int walk_pud_range(p4d_t p4d, unsigned* long addr, unsigned long end,
178	struct mm_walk *walk)
179	{
180	pud_t *pud;
181	unsigned long next;
182	const struct mm_walk_ops *ops = walk->ops;
183	int err = `0`;
184	int depth = real_depth(depth: `2`);
185
186	pud = pud_offset(p4d, address: addr);
187	do {
188	again:
189	next = pud_addr_end(addr, end);
190	if (pud_none(pud: *pud)) {
191	if (ops->pte_hole)
192	err = ops->pte_hole(addr, next, depth, walk);
193	if (err)
194	break;
195	continue;
196	}
197
198	walk->action = ACTION_SUBTREE;
199
200	if (ops->pud_entry)
201	err = ops->pud_entry(pud, addr, next, walk);
202	if (err)
203	break;
204
205	if (walk->action == ACTION_AGAIN)
206	goto again;
207
208	if ((!walk->vma && (pud_leaf(pud: pud) \|\| !pud_present(pud: pud))) \|\|
209	walk->action == ACTION_CONTINUE \|\|
210	!(ops->pmd_entry \|\| ops->pte_entry))
211	continue;
212
213	if (walk->vma)
214	split_huge_pud(walk->vma, pud, addr);
215	if (pud_none(pud: *pud))
216	goto again;
217
218	if (is_hugepd(__hugepd(pud_val(*pud))))
219	err = walk_hugepd_range(phpd: (hugepd_t *)pud, addr, end: next, walk, PUD_SHIFT);
220	else
221	err = walk_pmd_range(pud, addr, end: next, walk);
222	if (err)
223	break;
224	} while (pud++, addr = next, addr != end);
225
226	return err;
227	}
228
229	static int walk_p4d_range(pgd_t pgd, unsigned* long addr, unsigned long end,
230	struct mm_walk *walk)
231	{
232	p4d_t *p4d;
233	unsigned long next;
234	const struct mm_walk_ops *ops = walk->ops;
235	int err = `0`;
236	int depth = real_depth(depth: `1`);
237
238	p4d = p4d_offset(pgd, address: addr);
239	do {
240	next = p4d_addr_end(addr, end);
241	if (p4d_none_or_clear_bad(p4d)) {
242	if (ops->pte_hole)
243	err = ops->pte_hole(addr, next, depth, walk);
244	if (err)
245	break;
246	continue;
247	}
248	if (ops->p4d_entry) {
249	err = ops->p4d_entry(p4d, addr, next, walk);
250	if (err)
251	break;
252	}
253	if (is_hugepd(__hugepd(p4d_val(*p4d))))
254	err = walk_hugepd_range(phpd: (hugepd_t *)p4d, addr, end: next, walk, P4D_SHIFT);
255	else if (ops->pud_entry \|\| ops->pmd_entry \|\| ops->pte_entry)
256	err = walk_pud_range(p4d, addr, end: next, walk);
257	if (err)
258	break;
259	} while (p4d++, addr = next, addr != end);
260
261	return err;
262	}
263
264	static int walk_pgd_range(unsigned long addr, unsigned long end,
265	struct mm_walk *walk)
266	{
267	pgd_t *pgd;
268	unsigned long next;
269	const struct mm_walk_ops *ops = walk->ops;
270	int err = `0`;
271
272	if (walk->pgd)
273	pgd = walk->pgd + pgd_index(addr);
274	else
275	pgd = pgd_offset(walk->mm, addr);
276	do {
277	next = pgd_addr_end(addr, end);
278	if (pgd_none_or_clear_bad(pgd)) {
279	if (ops->pte_hole)
280	err = ops->pte_hole(addr, next, `0`, walk);
281	if (err)
282	break;
283	continue;
284	}
285	if (ops->pgd_entry) {
286	err = ops->pgd_entry(pgd, addr, next, walk);
287	if (err)
288	break;
289	}
290	if (is_hugepd(__hugepd(pgd_val(*pgd))))
291	err = walk_hugepd_range(phpd: (hugepd_t *)pgd, addr, end: next, walk, PGDIR_SHIFT);
292	else if (ops->p4d_entry \|\| ops->pud_entry \|\| ops->pmd_entry \|\| ops->pte_entry)
293	err = walk_p4d_range(pgd, addr, end: next, walk);
294	if (err)
295	break;
296	} while (pgd++, addr = next, addr != end);
297
298	return err;
299	}
300
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Return the end of the huge page (of hstate @h) containing @addr, clamped
 * to @end.
 */
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

/*
 * Walk the hugetlb vma walk->vma over [@addr, @end), one huge page at a time.
 *
 * For each huge page, ops->hugetlb_entry() is called when hugetlb_walk()
 * finds an entry; the callback is invoked unconditionally in that case, so
 * the caller must only get here when it is set. Otherwise ops->pte_hole()
 * (if set) is called with depth -1. The loop runs between
 * hugetlb_vma_lock_read() and hugetlb_vma_unlock_read().
 *
 * Returns 0 or the first non-zero value returned by a callback.
 */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	hugetlb_vma_lock_read(vma);
	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = hugetlb_walk(vma, addr & hmask, sz);
		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);
		if (err)
			break;
	} while (addr = next, addr != end);
	hugetlb_vma_unlock_read(vma);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
/* Without CONFIG_HUGETLB_PAGE there is nothing to walk: always succeeds. */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */
345
346	/*
347	* Decide whether we really walk over the current vma on [@start, @end)
348	* or skip it via the returned value. Return 0 if we do walk over the
349	* current vma, and return 1 if we skip the vma. Negative values means
350	* error, where we abort the current walk.
351	*/
352	static int walk_page_test(unsigned long start, unsigned long end,
353	struct mm_walk *walk)
354	{
355	struct vm_area_struct *vma = walk->vma;
356	const struct mm_walk_ops *ops = walk->ops;
357
358	if (ops->test_walk)
359	return ops->test_walk(start, end, walk);
360
361	/*
362	* vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
363	* range, so we don't walk over it as we do for normal vmas. However,
364	* Some callers are interested in handling hole range and they don't
365	* want to just ignore any single address range. Such users certainly
366	* define their ->pte_hole() callbacks, so let's delegate them to handle
367	* vma(VM_PFNMAP).
368	*/
369	if (vma->vm_flags & VM_PFNMAP) {
370	int err = `1`;
371	if (ops->pte_hole)
372	err = ops->pte_hole(start, end, -`1`, walk);
373	return err ? err : `1`;
374	}
375	return `0`;
376	}
377
378	static int __walk_page_range(unsigned long start, unsigned long end,
379	struct mm_walk *walk)
380	{
381	int err = `0`;
382	struct vm_area_struct *vma = walk->vma;
383	const struct mm_walk_ops *ops = walk->ops;
384
385	if (ops->pre_vma) {
386	err = ops->pre_vma(start, end, walk);
387	if (err)
388	return err;
389	}
390
391	if (is_vm_hugetlb_page(vma)) {
392	if (ops->hugetlb_entry)
393	err = walk_hugetlb_range(addr: start, end, walk);
394	} else
395	err = walk_pgd_range(addr: start, end, walk);
396
397	if (ops->post_vma)
398	ops->post_vma(walk);
399
400	return err;
401	}
402
403	static inline void process_mm_walk_lock(struct mm_struct *mm,
404	enum page_walk_lock walk_lock)
405	{
406	if (walk_lock == PGWALK_RDLOCK)
407	mmap_assert_locked(mm);
408	else
409	mmap_assert_write_locked(mm);
410	}
411
412	static inline void process_vma_walk_lock(struct vm_area_struct *vma,
413	enum page_walk_lock walk_lock)
414	{
415	#ifdef CONFIG_PER_VMA_LOCK
416	switch (walk_lock) {
417	case PGWALK_WRLOCK:
418	vma_start_write(vma);
419	break;
420	case PGWALK_WRLOCK_VERIFY:
421	vma_assert_write_locked(vma);
422	break;
423	case PGWALK_RDLOCK:
424	/ PGWALK_RDLOCK is handled by process_mm_walk_lock /
425	break;
426	}
427	#endif
428	}
429
430	/**
431	* walk_page_range - walk page table with caller specific callbacks
432	* @mm: mm_struct representing the target process of page table walk
433	* @start: start address of the virtual address range
434	* @end: end address of the virtual address range
435	* @ops: operation to call during the walk
436	* @private: private data for callbacks' usage
437	*
438	* Recursively walk the page table tree of the process represented by @mm
439	* within the virtual address range [@start, @end). During walking, we can do
440	* some caller-specific works for each entry, by setting up pmd_entry(),
441	* pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
442	* callbacks, the associated entries/pages are just ignored.
443	* The return values of these callbacks are commonly defined like below:
444	*
445	* - 0 : succeeded to handle the current entry, and if you don't reach the
446	* end address yet, continue to walk.
447	* - >0 : succeeded to handle the current entry, and return to the caller
448	* with caller specific value.
449	* - <0 : failed to handle the current entry, and return to the caller
450	* with error code.
451	*
452	* Before starting to walk page table, some callers want to check whether
453	* they really want to walk over the current vma, typically by checking
454	* its vm_flags. walk_page_test() and @ops->test_walk() are used for this
455	* purpose.
456	*
457	* If operations need to be staged before and committed after a vma is walked,
458	* there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
459	* since it is intended to handle commit-type operations, can't return any
460	* errors.
461	*
462	* struct mm_walk keeps current values of some common data like vma and pmd,
463	* which are useful for the access from callbacks. If you want to pass some
464	* caller-specific data to callbacks, @private should be helpful.
465	*
466	* Locking:
467	* Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
468	* because these function traverse vma list and/or access to vma's data.
469	*/
470	int walk_page_range(struct mm_struct mm, unsigned* long start,
471	unsigned long end, const struct mm_walk_ops *ops,
472	void *private)
473	{
474	int err = `0`;
475	unsigned long next;
476	struct vm_area_struct *vma;
477	struct mm_walk walk = {
478	.ops = ops,
479	.mm = mm,
480	.private = private,
481	};
482
483	if (start >= end)
484	return -EINVAL;
485
486	if (!walk.mm)
487	return -EINVAL;
488
489	process_mm_walk_lock(mm: walk.mm, walk_lock: ops->walk_lock);
490
491	vma = find_vma(mm: walk.mm, addr: start);
492	do {
493	if (!vma) { / after the last vma /
494	walk.vma = NULL;
495	next = end;
496	if (ops->pte_hole)
497	err = ops->pte_hole(start, next, -`1`, &walk);
498	} else if (start < vma->vm_start) { / outside vma /
499	walk.vma = NULL;
500	next = min(end, vma->vm_start);
501	if (ops->pte_hole)
502	err = ops->pte_hole(start, next, -`1`, &walk);
503	} else { / inside vma /
504	process_vma_walk_lock(vma, walk_lock: ops->walk_lock);
505	walk.vma = vma;
506	next = min(end, vma->vm_end);
507	vma = find_vma(mm, addr: vma->vm_end);
508
509	err = walk_page_test(start, end: next, walk: &walk);
510	if (err > `0`) {
511	/*
512	* positive return values are purely for
513	* controlling the pagewalk, so should never
514	* be passed to the callers.
515	*/
516	err = `0`;
517	continue;
518	}
519	if (err < `0`)
520	break;
521	err = __walk_page_range(start, end: next, walk: &walk);
522	}
523	if (err)
524	break;
525	} while (start = next, start < end);
526	return err;
527	}
528
529	/**
530	* walk_page_range_novma - walk a range of pagetables not backed by a vma
531	* @mm: mm_struct representing the target process of page table walk
532	* @start: start address of the virtual address range
533	* @end: end address of the virtual address range
534	* @ops: operation to call during the walk
535	* @pgd: pgd to walk if different from mm->pgd
536	* @private: private data for callbacks' usage
537	*
538	* Similar to walk_page_range() but can walk any page tables even if they are
539	* not backed by VMAs. Because 'unusual' entries may be walked this function
540	* will also not lock the PTEs for the pte_entry() callback. This is useful for
541	* walking the kernel pages tables or page tables for firmware.
542	*/
543	int walk_page_range_novma(struct mm_struct mm, unsigned* long start,
544	unsigned long end, const struct mm_walk_ops *ops,
545	pgd_t *pgd,
546	void *private)
547	{
548	struct mm_walk walk = {
549	.ops = ops,
550	.mm = mm,
551	.pgd = pgd,
552	.private = private,
553	.no_vma = true
554	};
555
556	if (start >= end \|\| !walk.mm)
557	return -EINVAL;
558
559	mmap_assert_write_locked(mm: walk.mm);
560
561	return walk_pgd_range(addr: start, end, walk: &walk);
562	}
563
564	int walk_page_range_vma(struct vm_area_struct vma, unsigned* long start,
565	unsigned long end, const struct mm_walk_ops *ops,
566	void *private)
567	{
568	struct mm_walk walk = {
569	.ops = ops,
570	.mm = vma->vm_mm,
571	.vma = vma,
572	.private = private,
573	};
574
575	if (start >= end \|\| !walk.mm)
576	return -EINVAL;
577	if (start < vma->vm_start \|\| end > vma->vm_end)
578	return -EINVAL;
579
580	process_mm_walk_lock(mm: walk.mm, walk_lock: ops->walk_lock);
581	process_vma_walk_lock(vma, walk_lock: ops->walk_lock);
582	return __walk_page_range(start, end, walk: &walk);
583	}
584
585	int walk_page_vma(struct vm_area_struct vma, const* struct mm_walk_ops *ops,
586	void *private)
587	{
588	struct mm_walk walk = {
589	.ops = ops,
590	.mm = vma->vm_mm,
591	.vma = vma,
592	.private = private,
593	};
594
595	if (!walk.mm)
596	return -EINVAL;
597
598	process_mm_walk_lock(mm: walk.mm, walk_lock: ops->walk_lock);
599	process_vma_walk_lock(vma, walk_lock: ops->walk_lock);
600	return __walk_page_range(start: vma->vm_start, end: vma->vm_end, walk: &walk);
601	}
602
603	/**
604	* walk_page_mapping - walk all memory areas mapped into a struct address_space.
605	* @mapping: Pointer to the struct address_space
606	* @first_index: First page offset in the address_space
607	* @nr: Number of incremental page offsets to cover
608	* @ops: operation to call during the walk
609	* @private: private data for callbacks' usage
610	*
611	* This function walks all memory areas mapped into a struct address_space.
612	* The walk is limited to only the given page-size index range, but if
613	* the index boundaries cross a huge page-table entry, that entry will be
614	* included.
615	*
616	* Also see walk_page_range() for additional information.
617	*
618	* Locking:
619	* This function can't require that the struct mm_struct::mmap_lock is held,
620	* since @mapping may be mapped by multiple processes. Instead
621	* @mapping->i_mmap_rwsem must be held. This might have implications in the
622	* callbacks, and it's up tho the caller to ensure that the
623	* struct mm_struct::mmap_lock is not needed.
624	*
625	* Also this means that a caller can't rely on the struct
626	* vm_area_struct::vm_flags to be constant across a call,
627	* except for immutable flags. Callers requiring this shouldn't use
628	* this function.
629	*
630	* Return: 0 on success, negative error code on failure, positive number on
631	* caller defined premature termination.
632	*/
633	int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
634	pgoff_t nr, const struct mm_walk_ops *ops,
635	void *private)
636	{
637	struct mm_walk walk = {
638	.ops = ops,
639	.private = private,
640	};
641	struct vm_area_struct *vma;
642	pgoff_t vba, vea, cba, cea;
643	unsigned long start_addr, end_addr;
644	int err = `0`;
645
646	lockdep_assert_held(&mapping->i_mmap_rwsem);
647	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
648	first_index + nr - `1`) {
649	/ Clip to the vma /
650	vba = vma->vm_pgoff;
651	vea = vba + vma_pages(vma);
652	cba = first_index;
653	cba = max(cba, vba);
654	cea = first_index + nr;
655	cea = min(cea, vea);
656
657	start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
658	end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
659	if (start_addr >= end_addr)
660	continue;
661
662	walk.vma = vma;
663	walk.mm = vma->vm_mm;
664
665	err = walk_page_test(start: vma->vm_start, end: vma->vm_end, walk: &walk);
666	if (err > `0`) {
667	err = `0`;
668	break;
669	} else if (err < `0`)
670	break;
671
672	err = __walk_page_range(start: start_addr, end: end_addr, walk: &walk);
673	if (err)
674	break;
675	}
676
677	return err;
678	}
679

source code of linux/mm/pagewalk.c