1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/pagewalk.h> |
3 | #include <linux/mm_inline.h> |
4 | #include <linux/hugetlb.h> |
5 | #include <linux/huge_mm.h> |
6 | #include <linux/mount.h> |
7 | #include <linux/ksm.h> |
8 | #include <linux/seq_file.h> |
9 | #include <linux/highmem.h> |
10 | #include <linux/ptrace.h> |
11 | #include <linux/slab.h> |
12 | #include <linux/pagemap.h> |
13 | #include <linux/mempolicy.h> |
14 | #include <linux/rmap.h> |
15 | #include <linux/swap.h> |
16 | #include <linux/sched/mm.h> |
17 | #include <linux/swapops.h> |
18 | #include <linux/mmu_notifier.h> |
19 | #include <linux/page_idle.h> |
20 | #include <linux/shmem_fs.h> |
21 | #include <linux/uaccess.h> |
22 | #include <linux/pkeys.h> |
23 | #include <linux/minmax.h> |
24 | #include <linux/overflow.h> |
25 | |
26 | #include <asm/elf.h> |
27 | #include <asm/tlb.h> |
28 | #include <asm/tlbflush.h> |
29 | #include "internal.h" |
30 | |
31 | #define SEQ_PUT_DEC(str, val) \ |
32 | seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) |
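/*
 * SEQ_PUT_DEC() prints @val, a count of pages, as a width-8 decimal
 * number of kB: (val << (PAGE_SHIFT - 10)) converts pages to KiB
 * (e.g. 3 pages -> 12 with 4K pages).
 */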
33 | void task_mem(struct seq_file *m, struct mm_struct *mm) |
34 | { |
35 | unsigned long text, lib, swap, anon, file, shmem; |
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	anon = get_mm_counter(mm, MM_ANONPAGES);
	file = get_mm_counter(mm, MM_FILEPAGES);
	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
41 | |
42 | /* |
43 | * Note: to minimize their overhead, mm maintains hiwater_vm and |
44 | * hiwater_rss only when about to *lower* total_vm or rss. Any |
45 | * collector of these hiwater stats must therefore get total_vm |
46 | * and rss too, which will usually be the higher. Barriers? not |
47 | * worth the effort, such snapshots can always be inconsistent. |
48 | */ |
49 | hiwater_vm = total_vm = mm->total_vm; |
50 | if (hiwater_vm < mm->hiwater_vm) |
51 | hiwater_vm = mm->hiwater_vm; |
52 | hiwater_rss = total_rss = anon + file + shmem; |
53 | if (hiwater_rss < mm->hiwater_rss) |
54 | hiwater_rss = mm->hiwater_rss; |
55 | |
56 | /* split executable areas between text and lib */ |
57 | text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); |
58 | text = min(text, mm->exec_vm << PAGE_SHIFT); |
59 | lib = (mm->exec_vm << PAGE_SHIFT) - text; |
60 | |
	swap = get_mm_counter(mm, MM_SWAPENTS);
	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
	SEQ_PUT_DEC(" kB\nRssFile:\t", file);
	SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
	SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
	SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
	seq_put_decimal_ull_width(m,
			" kB\nVmExe:\t", text >> 10, 8);
	seq_put_decimal_ull_width(m,
			" kB\nVmLib:\t", lib >> 10, 8);
	seq_put_decimal_ull_width(m,
			" kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
	seq_puts(m, " kB\n");
81 | hugetlb_report_usage(m, mm); |
82 | } |
83 | #undef SEQ_PUT_DEC |
84 | |
85 | unsigned long task_vsize(struct mm_struct *mm) |
86 | { |
87 | return PAGE_SIZE * mm->total_vm; |
88 | } |
89 | |
90 | unsigned long task_statm(struct mm_struct *mm, |
91 | unsigned long *shared, unsigned long *text, |
92 | unsigned long *data, unsigned long *resident) |
93 | { |
	*shared = get_mm_counter(mm, MM_FILEPAGES) +
			get_mm_counter(mm, MM_SHMEMPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->data_vm + mm->stack_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
100 | return mm->total_vm; |
101 | } |
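/*
 * These values back the seven /proc/pid/statm fields, printed as
 * "size resident shared text lib data dt" (lib and dt are always 0);
 * see Documentation/filesystems/proc.rst.
 */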
102 | |
103 | #ifdef CONFIG_NUMA |
104 | /* |
105 | * Save get_task_policy() for show_numa_map(). |
106 | */ |
107 | static void hold_task_mempolicy(struct proc_maps_private *priv) |
108 | { |
109 | struct task_struct *task = priv->task; |
110 | |
	task_lock(task);
	priv->task_mempolicy = get_task_policy(task);
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
119 | } |
120 | #else |
121 | static void hold_task_mempolicy(struct proc_maps_private *priv) |
122 | { |
123 | } |
124 | static void release_task_mempolicy(struct proc_maps_private *priv) |
125 | { |
126 | } |
127 | #endif |
128 | |
129 | static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, |
130 | loff_t *ppos) |
131 | { |
	struct vm_area_struct *vma = vma_next(&priv->iter);

	if (vma) {
		*ppos = vma->vm_start;
	} else {
		*ppos = -2UL;
		vma = get_gate_vma(priv->mm);
139 | } |
140 | |
141 | return vma; |
142 | } |
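/*
 * *ppos doubles as the cursor for the walk: it holds the start address
 * of the VMA just returned, -2UL when only the gate VMA is left, and
 * -1UL once the walk is complete (see m_start() and m_next()).
 */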
143 | |
144 | static void *m_start(struct seq_file *m, loff_t *ppos) |
145 | { |
146 | struct proc_maps_private *priv = m->private; |
147 | unsigned long last_addr = *ppos; |
148 | struct mm_struct *mm; |
149 | |
150 | /* See m_next(). Zero at the start or after lseek. */ |
151 | if (last_addr == -1UL) |
152 | return NULL; |
153 | |
	priv->task = get_proc_task(priv->inode);
	if (!priv->task)
		return ERR_PTR(-ESRCH);
157 | |
158 | mm = priv->mm; |
159 | if (!mm || !mmget_not_zero(mm)) { |
		put_task_struct(priv->task);
		priv->task = NULL;
		return NULL;
	}

	if (mmap_read_lock_killable(mm)) {
		mmput(mm);
		put_task_struct(priv->task);
		priv->task = NULL;
		return ERR_PTR(-EINTR);
	}

	vma_iter_init(&priv->iter, mm, last_addr);
173 | hold_task_mempolicy(priv); |
174 | if (last_addr == -2UL) |
175 | return get_gate_vma(mm); |
176 | |
177 | return proc_get_vma(priv, ppos); |
178 | } |
179 | |
180 | static void *m_next(struct seq_file *m, void *v, loff_t *ppos) |
181 | { |
182 | if (*ppos == -2UL) { |
183 | *ppos = -1UL; |
184 | return NULL; |
185 | } |
	return proc_get_vma(m->private, ppos);
187 | } |
188 | |
189 | static void m_stop(struct seq_file *m, void *v) |
190 | { |
191 | struct proc_maps_private *priv = m->private; |
192 | struct mm_struct *mm = priv->mm; |
193 | |
194 | if (!priv->task) |
195 | return; |
196 | |
197 | release_task_mempolicy(priv); |
198 | mmap_read_unlock(mm); |
199 | mmput(mm); |
	put_task_struct(priv->task);
201 | priv->task = NULL; |
202 | } |
203 | |
204 | static int proc_maps_open(struct inode *inode, struct file *file, |
205 | const struct seq_operations *ops, int psize) |
206 | { |
207 | struct proc_maps_private *priv = __seq_open_private(file, ops, psize); |
208 | |
209 | if (!priv) |
210 | return -ENOMEM; |
211 | |
212 | priv->inode = inode; |
213 | priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); |
	if (IS_ERR(priv->mm)) {
		int err = PTR_ERR(priv->mm);
216 | |
217 | seq_release_private(inode, file); |
218 | return err; |
219 | } |
220 | |
221 | return 0; |
222 | } |
223 | |
224 | static int proc_map_release(struct inode *inode, struct file *file) |
225 | { |
226 | struct seq_file *seq = file->private_data; |
227 | struct proc_maps_private *priv = seq->private; |
228 | |
229 | if (priv->mm) |
		mmdrop(priv->mm);
231 | |
232 | return seq_release_private(inode, file); |
233 | } |
234 | |
235 | static int do_maps_open(struct inode *inode, struct file *file, |
236 | const struct seq_operations *ops) |
237 | { |
	return proc_maps_open(inode, file, ops,
				sizeof(struct proc_maps_private));
240 | } |
241 | |
static void show_vma_header_prefix(struct seq_file *m,
				   unsigned long start, unsigned long end,
				   vm_flags_t flags, unsigned long long pgoff,
				   dev_t dev, unsigned long ino)
{
	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
	seq_put_hex_ll(m, NULL, start, 8);
	seq_put_hex_ll(m, "-", end, 8);
	seq_putc(m, ' ');
	seq_putc(m, flags & VM_READ ? 'r' : '-');
	seq_putc(m, flags & VM_WRITE ? 'w' : '-');
	seq_putc(m, flags & VM_EXEC ? 'x' : '-');
	seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p');
	seq_put_hex_ll(m, " ", pgoff, 8);
	seq_put_hex_ll(m, " ", MAJOR(dev), 2);
	seq_put_hex_ll(m, ":", MINOR(dev), 2);
	seq_put_decimal_ull(m, " ", ino);
	seq_putc(m, ' ');
260 | } |
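/*
 * The prefix printed above is the familiar /proc/pid/maps line header,
 * e.g.:
 *
 *	00400000-00452000 r-xp 00000000 08:02 173521
 */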
261 | |
262 | static void |
263 | show_map_vma(struct seq_file *m, struct vm_area_struct *vma) |
264 | { |
265 | struct anon_vma_name *anon_name = NULL; |
266 | struct mm_struct *mm = vma->vm_mm; |
267 | struct file *file = vma->vm_file; |
268 | vm_flags_t flags = vma->vm_flags; |
269 | unsigned long ino = 0; |
270 | unsigned long long pgoff = 0; |
271 | unsigned long start, end; |
272 | dev_t dev = 0; |
273 | const char *name = NULL; |
274 | |
275 | if (file) { |
		const struct inode *inode = file_user_inode(vma->vm_file);
277 | |
278 | dev = inode->i_sb->s_dev; |
279 | ino = inode->i_ino; |
280 | pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; |
281 | } |
282 | |
283 | start = vma->vm_start; |
284 | end = vma->vm_end; |
285 | show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); |
286 | if (mm) |
287 | anon_name = anon_vma_name(vma); |
288 | |
289 | /* |
290 | * Print the dentry name for named mappings, and a |
291 | * special [heap] marker for the heap: |
292 | */ |
293 | if (file) { |
		seq_pad(m, ' ');
		/*
		 * If user named this anon shared memory via
		 * prctl(PR_SET_VMA, ...), use the provided name.
		 */
		if (anon_name)
			seq_printf(m, "[anon_shmem:%s]", anon_name->name);
		else
			seq_path(m, file_user_path(file), "\n");
303 | goto done; |
304 | } |
305 | |
306 | if (vma->vm_ops && vma->vm_ops->name) { |
307 | name = vma->vm_ops->name(vma); |
308 | if (name) |
309 | goto done; |
310 | } |
311 | |
312 | name = arch_vma_name(vma); |
313 | if (!name) { |
314 | if (!mm) { |
315 | name = "[vdso]" ; |
316 | goto done; |
317 | } |
318 | |
319 | if (vma_is_initial_heap(vma)) { |
320 | name = "[heap]" ; |
321 | goto done; |
322 | } |
323 | |
324 | if (vma_is_initial_stack(vma)) { |
325 | name = "[stack]" ; |
326 | goto done; |
327 | } |
328 | |
329 | if (anon_name) { |
			seq_pad(m, ' ');
			seq_printf(m, "[anon:%s]", anon_name->name);
332 | } |
333 | } |
334 | |
335 | done: |
336 | if (name) { |
		seq_pad(m, ' ');
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
341 | } |
342 | |
343 | static int show_map(struct seq_file *m, void *v) |
344 | { |
	show_map_vma(m, v);
346 | return 0; |
347 | } |
348 | |
349 | static const struct seq_operations proc_pid_maps_op = { |
350 | .start = m_start, |
351 | .next = m_next, |
352 | .stop = m_stop, |
353 | .show = show_map |
354 | }; |
355 | |
356 | static int pid_maps_open(struct inode *inode, struct file *file) |
357 | { |
	return do_maps_open(inode, file, &proc_pid_maps_op);
359 | } |
360 | |
361 | const struct file_operations proc_pid_maps_operations = { |
362 | .open = pid_maps_open, |
363 | .read = seq_read, |
364 | .llseek = seq_lseek, |
365 | .release = proc_map_release, |
366 | }; |
367 | |
368 | /* |
 * Proportional Set Size (PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it. So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep accumulated division errors low, we use a 64-bit fixed-point
 * pss counter, so (pss >> PSS_SHIFT) is the real byte count.
379 | * |
380 | * A shift of 12 before division means (assuming 4K page size): |
381 | * - 1M 3-user-pages add up to 8KB errors; |
382 | * - supports mapcount up to 2^24, or 16M; |
383 | * - supports PSS up to 2^52 bytes, or 4PB. |
384 | */ |
385 | #define PSS_SHIFT 12 |
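/*
 * Worked example: one 4 KiB page shared by three processes contributes
 * (4096 << PSS_SHIFT) / 3 = 5592405 fixed-point units to pss, i.e.
 * 5592405 >> PSS_SHIFT = 1365 bytes of PSS per sharer.
 */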
386 | |
387 | #ifdef CONFIG_PROC_PAGE_MONITOR |
388 | struct mem_size_stats { |
389 | unsigned long resident; |
390 | unsigned long shared_clean; |
391 | unsigned long shared_dirty; |
392 | unsigned long private_clean; |
393 | unsigned long private_dirty; |
394 | unsigned long referenced; |
395 | unsigned long anonymous; |
396 | unsigned long lazyfree; |
397 | unsigned long anonymous_thp; |
398 | unsigned long shmem_thp; |
399 | unsigned long file_thp; |
400 | unsigned long swap; |
401 | unsigned long shared_hugetlb; |
402 | unsigned long private_hugetlb; |
403 | unsigned long ksm; |
404 | u64 pss; |
405 | u64 pss_anon; |
406 | u64 pss_file; |
407 | u64 pss_shmem; |
408 | u64 pss_dirty; |
409 | u64 pss_locked; |
410 | u64 swap_pss; |
411 | }; |
412 | |
413 | static void smaps_page_accumulate(struct mem_size_stats *mss, |
414 | struct page *page, unsigned long size, unsigned long pss, |
415 | bool dirty, bool locked, bool private) |
416 | { |
417 | mss->pss += pss; |
418 | |
419 | if (PageAnon(page)) |
420 | mss->pss_anon += pss; |
421 | else if (PageSwapBacked(page)) |
422 | mss->pss_shmem += pss; |
423 | else |
424 | mss->pss_file += pss; |
425 | |
426 | if (locked) |
427 | mss->pss_locked += pss; |
428 | |
429 | if (dirty || PageDirty(page)) { |
430 | mss->pss_dirty += pss; |
431 | if (private) |
432 | mss->private_dirty += size; |
433 | else |
434 | mss->shared_dirty += size; |
435 | } else { |
436 | if (private) |
437 | mss->private_clean += size; |
438 | else |
439 | mss->shared_clean += size; |
440 | } |
441 | } |
442 | |
443 | static void smaps_account(struct mem_size_stats *mss, struct page *page, |
444 | bool compound, bool young, bool dirty, bool locked, |
445 | bool migration) |
446 | { |
447 | int i, nr = compound ? compound_nr(page) : 1; |
448 | unsigned long size = nr * PAGE_SIZE; |
449 | |
450 | /* |
451 | * First accumulate quantities that depend only on |size| and the type |
452 | * of the compound page. |
453 | */ |
454 | if (PageAnon(page)) { |
455 | mss->anonymous += size; |
456 | if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) |
457 | mss->lazyfree += size; |
458 | } |
459 | |
460 | if (PageKsm(page)) |
461 | mss->ksm += size; |
462 | |
463 | mss->resident += size; |
464 | /* Accumulate the size in pages that have been accessed. */ |
465 | if (young || page_is_young(page) || PageReferenced(page)) |
466 | mss->referenced += size; |
467 | |
468 | /* |
469 | * Then accumulate quantities that may depend on sharing, or that may |
470 | * differ page-by-page. |
471 | * |
	 * page_count(page) == 1 guarantees the page is mapped exactly once.
	 * If any subpage of the compound page is mapped with a PTE, it would
	 * elevate page_count().
475 | * |
476 | * The page_mapcount() is called to get a snapshot of the mapcount. |
477 | * Without holding the page lock this snapshot can be slightly wrong as |
478 | * we cannot always read the mapcount atomically. It is not safe to |
479 | * call page_mapcount() even with PTL held if the page is not mapped, |
480 | * especially for migration entries. Treat regular migration entries |
481 | * as mapcount == 1. |
482 | */ |
	if ((page_count(page) == 1) || migration) {
		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
				      locked, true);
		return;
	}
	for (i = 0; i < nr; i++, page++) {
		int mapcount = page_mapcount(page);
		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
		if (mapcount >= 2)
			pss /= mapcount;
		smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
				      mapcount < 2);
495 | } |
496 | } |
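/*
 * Example: a PMD-mapped THP (512 subpages on x86-64) whose subpages all
 * have mapcount 2 adds PAGE_SIZE to shared_* and PAGE_SIZE/2 of PSS per
 * subpage, i.e. 1 MiB of PSS for its 2 MiB of RSS.
 */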
497 | |
498 | #ifdef CONFIG_SHMEM |
499 | static int smaps_pte_hole(unsigned long addr, unsigned long end, |
500 | __always_unused int depth, struct mm_walk *walk) |
501 | { |
502 | struct mem_size_stats *mss = walk->private; |
503 | struct vm_area_struct *vma = walk->vma; |
504 | |
	mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
					      linear_page_index(vma, addr),
					      linear_page_index(vma, end));
508 | |
509 | return 0; |
510 | } |
511 | #else |
512 | #define smaps_pte_hole NULL |
513 | #endif /* CONFIG_SHMEM */ |
514 | |
515 | static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) |
516 | { |
517 | #ifdef CONFIG_SHMEM |
518 | if (walk->ops->pte_hole) { |
519 | /* depth is not used */ |
		smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
521 | } |
522 | #endif |
523 | } |
524 | |
525 | static void smaps_pte_entry(pte_t *pte, unsigned long addr, |
526 | struct mm_walk *walk) |
527 | { |
528 | struct mem_size_stats *mss = walk->private; |
529 | struct vm_area_struct *vma = walk->vma; |
530 | bool locked = !!(vma->vm_flags & VM_LOCKED); |
531 | struct page *page = NULL; |
532 | bool migration = false, young = false, dirty = false; |
	pte_t ptent = ptep_get(pte);

	if (pte_present(ptent)) {
		page = vm_normal_page(vma, addr, ptent);
		young = pte_young(ptent);
		dirty = pte_dirty(ptent);
	} else if (is_swap_pte(ptent)) {
		swp_entry_t swpent = pte_to_swp_entry(ptent);

		if (!non_swap_entry(swpent)) {
			int mapcount;

			mss->swap += PAGE_SIZE;
			mapcount = swp_swapcount(swpent);
			if (mapcount >= 2) {
				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;

				do_div(pss_delta, mapcount);
				mss->swap_pss += pss_delta;
			} else {
				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
			}
		} else if (is_pfn_swap_entry(swpent)) {
			if (is_migration_entry(swpent))
				migration = true;
			page = pfn_swap_entry_to_page(swpent);
		}
	} else {
		smaps_pte_hole_lookup(addr, walk);
562 | return; |
563 | } |
564 | |
565 | if (!page) |
566 | return; |
567 | |
	smaps_account(mss, page, false, young, dirty, locked, migration);
569 | } |
570 | |
571 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
572 | static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, |
573 | struct mm_walk *walk) |
574 | { |
575 | struct mem_size_stats *mss = walk->private; |
576 | struct vm_area_struct *vma = walk->vma; |
577 | bool locked = !!(vma->vm_flags & VM_LOCKED); |
578 | struct page *page = NULL; |
579 | bool migration = false; |
580 | |
	if (pmd_present(*pmd)) {
		page = vm_normal_page_pmd(vma, addr, *pmd);
	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(*pmd);

		if (is_migration_entry(entry)) {
			migration = true;
			page = pfn_swap_entry_to_page(entry);
		}
	}
	if (IS_ERR_OR_NULL(page))
		return;
	if (PageAnon(page))
		mss->anonymous_thp += HPAGE_PMD_SIZE;
	else if (PageSwapBacked(page))
		mss->shmem_thp += HPAGE_PMD_SIZE;
	else if (is_zone_device_page(page))
		/* pass */;
	else
		mss->file_thp += HPAGE_PMD_SIZE;

	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
		      locked, migration);
604 | } |
605 | #else |
606 | static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, |
607 | struct mm_walk *walk) |
608 | { |
609 | } |
610 | #endif |
611 | |
612 | static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
613 | struct mm_walk *walk) |
614 | { |
615 | struct vm_area_struct *vma = walk->vma; |
616 | pte_t *pte; |
617 | spinlock_t *ptl; |
618 | |
619 | ptl = pmd_trans_huge_lock(pmd, vma); |
620 | if (ptl) { |
621 | smaps_pmd_entry(pmd, addr, walk); |
		spin_unlock(ptl);
		goto out;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
627 | if (!pte) { |
628 | walk->action = ACTION_AGAIN; |
629 | return 0; |
630 | } |
631 | for (; addr != end; pte++, addr += PAGE_SIZE) |
632 | smaps_pte_entry(pte, addr, walk); |
633 | pte_unmap_unlock(pte - 1, ptl); |
634 | out: |
635 | cond_resched(); |
636 | return 0; |
637 | } |
638 | |
639 | static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) |
640 | { |
641 | /* |
642 | * Don't forget to update Documentation/ on changes. |
643 | */ |
644 | static const char mnemonics[BITS_PER_LONG][2] = { |
		/*
		 * In case we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",
649 | |
		[ilog2(VM_READ)]	= "rd",
		[ilog2(VM_WRITE)]	= "wr",
		[ilog2(VM_EXEC)]	= "ex",
		[ilog2(VM_SHARED)]	= "sh",
		[ilog2(VM_MAYREAD)]	= "mr",
		[ilog2(VM_MAYWRITE)]	= "mw",
		[ilog2(VM_MAYEXEC)]	= "me",
		[ilog2(VM_MAYSHARE)]	= "ms",
		[ilog2(VM_GROWSDOWN)]	= "gd",
		[ilog2(VM_PFNMAP)]	= "pf",
		[ilog2(VM_LOCKED)]	= "lo",
		[ilog2(VM_IO)]		= "io",
		[ilog2(VM_SEQ_READ)]	= "sr",
		[ilog2(VM_RAND_READ)]	= "rr",
		[ilog2(VM_DONTCOPY)]	= "dc",
		[ilog2(VM_DONTEXPAND)]	= "de",
		[ilog2(VM_LOCKONFAULT)]	= "lf",
		[ilog2(VM_ACCOUNT)]	= "ac",
		[ilog2(VM_NORESERVE)]	= "nr",
		[ilog2(VM_HUGETLB)]	= "ht",
		[ilog2(VM_SYNC)]	= "sf",
		[ilog2(VM_ARCH_1)]	= "ar",
		[ilog2(VM_WIPEONFORK)]	= "wf",
		[ilog2(VM_DONTDUMP)]	= "dd",
#ifdef CONFIG_ARM64_BTI
		[ilog2(VM_ARM64_BTI)]	= "bt",
#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
		[ilog2(VM_SOFTDIRTY)]	= "sd",
#endif
		[ilog2(VM_MIXEDMAP)]	= "mm",
		[ilog2(VM_HUGEPAGE)]	= "hg",
		[ilog2(VM_NOHUGEPAGE)]	= "nh",
		[ilog2(VM_MERGEABLE)]	= "mg",
		[ilog2(VM_UFFD_MISSING)] = "um",
		[ilog2(VM_UFFD_WP)]	= "uw",
#ifdef CONFIG_ARM64_MTE
		[ilog2(VM_MTE)]		= "mt",
		[ilog2(VM_MTE_ALLOWED)]	= "",
#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
		/* These come out via ProtectionKey: */
		[ilog2(VM_PKEY_BIT0)]	= "",
		[ilog2(VM_PKEY_BIT1)]	= "",
		[ilog2(VM_PKEY_BIT2)]	= "",
		[ilog2(VM_PKEY_BIT3)]	= "",
#if VM_PKEY_BIT4
		[ilog2(VM_PKEY_BIT4)]	= "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		[ilog2(VM_UFFD_MINOR)]	= "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
#ifdef CONFIG_X86_USER_SHADOW_STACK
		[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
706 | }; |
707 | size_t i; |
708 | |
	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		if (!mnemonics[i][0])
			continue;
		if (vma->vm_flags & (1UL << i)) {
			seq_putc(m, mnemonics[i][0]);
			seq_putc(m, mnemonics[i][1]);
			seq_putc(m, ' ');
		}
	}
	seq_putc(m, '\n');
720 | } |
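/*
 * Example (illustrative): a typical read-write private anonymous
 * mapping would print "VmFlags: rd wr mr mw me ac" here.
 */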
721 | |
722 | #ifdef CONFIG_HUGETLB_PAGE |
723 | static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, |
724 | unsigned long addr, unsigned long end, |
725 | struct mm_walk *walk) |
726 | { |
727 | struct mem_size_stats *mss = walk->private; |
728 | struct vm_area_struct *vma = walk->vma; |
729 | struct page *page = NULL; |
	pte_t ptent = ptep_get(pte);

	if (pte_present(ptent)) {
		page = vm_normal_page(vma, addr, ptent);
	} else if (is_swap_pte(ptent)) {
		swp_entry_t swpent = pte_to_swp_entry(ptent);

		if (is_pfn_swap_entry(swpent))
			page = pfn_swap_entry_to_page(swpent);
	}
	if (page) {
		if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
		else
			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
745 | } |
746 | return 0; |
747 | } |
748 | #else |
749 | #define smaps_hugetlb_range NULL |
750 | #endif /* HUGETLB_PAGE */ |
751 | |
752 | static const struct mm_walk_ops smaps_walk_ops = { |
753 | .pmd_entry = smaps_pte_range, |
754 | .hugetlb_entry = smaps_hugetlb_range, |
755 | .walk_lock = PGWALK_RDLOCK, |
756 | }; |
757 | |
758 | static const struct mm_walk_ops smaps_shmem_walk_ops = { |
759 | .pmd_entry = smaps_pte_range, |
760 | .hugetlb_entry = smaps_hugetlb_range, |
761 | .pte_hole = smaps_pte_hole, |
762 | .walk_lock = PGWALK_RDLOCK, |
763 | }; |
764 | |
765 | /* |
766 | * Gather mem stats from @vma with the indicated beginning |
767 | * address @start, and keep them in @mss. |
768 | * |
769 | * Use vm_start of @vma as the beginning address if @start is 0. |
770 | */ |
771 | static void smap_gather_stats(struct vm_area_struct *vma, |
772 | struct mem_size_stats *mss, unsigned long start) |
773 | { |
774 | const struct mm_walk_ops *ops = &smaps_walk_ops; |
775 | |
776 | /* Invalid start */ |
777 | if (start >= vma->vm_end) |
778 | return; |
779 | |
	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
781 | /* |
782 | * For shared or readonly shmem mappings we know that all |
783 | * swapped out pages belong to the shmem object, and we can |
784 | * obtain the swap value much more efficiently. For private |
785 | * writable mappings, we might have COW pages that are |
786 | * not affected by the parent swapped out pages of the shmem |
787 | * object, so we have to distinguish them during the page walk. |
788 | * Unless we know that the shmem object (or the part mapped by |
789 | * our VMA) has no swapped out pages at all. |
790 | */ |
791 | unsigned long shmem_swapped = shmem_swap_usage(vma); |
792 | |
793 | if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || |
794 | !(vma->vm_flags & VM_WRITE))) { |
795 | mss->swap += shmem_swapped; |
796 | } else { |
797 | ops = &smaps_shmem_walk_ops; |
798 | } |
799 | } |
800 | |
801 | /* mmap_lock is held in m_start */ |
802 | if (!start) |
		walk_page_vma(vma, ops, mss);
	else
		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
806 | } |
807 | |
808 | #define SEQ_PUT_DEC(str, val) \ |
809 | seq_put_decimal_ull_width(m, str, (val) >> 10, 8) |
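/*
 * Unlike the SEQ_PUT_DEC() in task_mem() above, which shifts page
 * counts up to kB, this variant takes byte counts and shifts them down.
 */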
810 | |
811 | /* Show the contents common for smaps and smaps_rollup */ |
812 | static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, |
813 | bool rollup_mode) |
814 | { |
	SEQ_PUT_DEC("Rss: ", mss->resident);
	SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
	SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
	if (rollup_mode) {
		/*
		 * These are meaningful only for smaps_rollup, otherwise two of
		 * them are zero, and the other one is the same as Pss.
		 */
		SEQ_PUT_DEC(" kB\nPss_Anon: ",
			mss->pss_anon >> PSS_SHIFT);
		SEQ_PUT_DEC(" kB\nPss_File: ",
			mss->pss_file >> PSS_SHIFT);
		SEQ_PUT_DEC(" kB\nPss_Shmem: ",
			mss->pss_shmem >> PSS_SHIFT);
	}
	SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
	SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
	SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
	SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
	SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
	SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
	SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
	SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
	SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
	SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
				  mss->private_hugetlb >> 10, 7);
	SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
	SEQ_PUT_DEC(" kB\nSwapPss: ",
		    mss->swap_pss >> PSS_SHIFT);
	SEQ_PUT_DEC(" kB\nLocked: ",
		    mss->pss_locked >> PSS_SHIFT);
	seq_puts(m, " kB\n");
850 | } |
851 | |
852 | static int show_smap(struct seq_file *m, void *v) |
853 | { |
854 | struct vm_area_struct *vma = v; |
855 | struct mem_size_stats mss = {}; |
856 | |
	smap_gather_stats(vma, &mss, 0);
858 | |
859 | show_map_vma(m, vma); |
860 | |
	SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
	SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
	SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
	seq_puts(m, " kB\n");
865 | |
	__show_smap(m, &mss, false);

	seq_printf(m, "THPeligible: %8u\n",
		   !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
					      true, THP_ORDERS_ALL));

	if (arch_pkeys_enabled())
		seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
874 | show_smap_vma_flags(m, vma); |
875 | |
876 | return 0; |
877 | } |
878 | |
879 | static int show_smaps_rollup(struct seq_file *m, void *v) |
880 | { |
881 | struct proc_maps_private *priv = m->private; |
882 | struct mem_size_stats mss = {}; |
883 | struct mm_struct *mm = priv->mm; |
884 | struct vm_area_struct *vma; |
885 | unsigned long vma_start = 0, last_vma_end = 0; |
886 | int ret = 0; |
887 | VMA_ITERATOR(vmi, mm, 0); |
888 | |
	priv->task = get_proc_task(priv->inode);
890 | if (!priv->task) |
891 | return -ESRCH; |
892 | |
893 | if (!mm || !mmget_not_zero(mm)) { |
894 | ret = -ESRCH; |
895 | goto out_put_task; |
896 | } |
897 | |
898 | ret = mmap_read_lock_killable(mm); |
899 | if (ret) |
900 | goto out_put_mm; |
901 | |
902 | hold_task_mempolicy(priv); |
	vma = vma_next(&vmi);
904 | |
905 | if (unlikely(!vma)) |
906 | goto empty_set; |
907 | |
908 | vma_start = vma->vm_start; |
909 | do { |
		smap_gather_stats(vma, &mss, 0);
911 | last_vma_end = vma->vm_end; |
912 | |
913 | /* |
914 | * Release mmap_lock temporarily if someone wants to |
915 | * access it for write request. |
916 | */ |
917 | if (mmap_lock_is_contended(mm)) { |
			vma_iter_invalidate(&vmi);
919 | mmap_read_unlock(mm); |
920 | ret = mmap_read_lock_killable(mm); |
921 | if (ret) { |
922 | release_task_mempolicy(priv); |
923 | goto out_put_mm; |
924 | } |
925 | |
926 | /* |
927 | * After dropping the lock, there are four cases to |
928 | * consider. See the following example for explanation. |
929 | * |
930 | * +------+------+-----------+ |
931 | * | VMA1 | VMA2 | VMA3 | |
932 | * +------+------+-----------+ |
933 | * | | | | |
934 | * 4k 8k 16k 400k |
935 | * |
936 | * Suppose we drop the lock after reading VMA2 due to |
937 | * contention, then we get: |
938 | * |
939 | * last_vma_end = 16k |
940 | * |
941 | * 1) VMA2 is freed, but VMA3 exists: |
942 | * |
943 | * vma_next(vmi) will return VMA3. |
944 | * In this case, just continue from VMA3. |
945 | * |
946 | * 2) VMA2 still exists: |
947 | * |
948 | * vma_next(vmi) will return VMA3. |
949 | * In this case, just continue from VMA3. |
950 | * |
951 | * 3) No more VMAs can be found: |
952 | * |
953 | * vma_next(vmi) will return NULL. |
954 | * No more things to do, just break. |
955 | * |
956 | * 4) (last_vma_end - 1) is the middle of a vma (VMA'): |
957 | * |
958 | * vma_next(vmi) will return VMA' whose range |
959 | * contains last_vma_end. |
960 | * Iterate VMA' from last_vma_end. |
961 | */ |
			vma = vma_next(&vmi);
963 | /* Case 3 above */ |
964 | if (!vma) |
965 | break; |
966 | |
967 | /* Case 1 and 2 above */ |
968 | if (vma->vm_start >= last_vma_end) |
969 | continue; |
970 | |
971 | /* Case 4 above */ |
972 | if (vma->vm_end > last_vma_end) |
				smap_gather_stats(vma, &mss, last_vma_end);
974 | } |
975 | } for_each_vma(vmi, vma); |
976 | |
977 | empty_set: |
	show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
	seq_pad(m, ' ');
	seq_puts(m, "[rollup]\n");

	__show_smap(m, &mss, true);
983 | |
984 | release_task_mempolicy(priv); |
985 | mmap_read_unlock(mm); |
986 | |
987 | out_put_mm: |
988 | mmput(mm); |
989 | out_put_task: |
	put_task_struct(priv->task);
991 | priv->task = NULL; |
992 | |
993 | return ret; |
994 | } |
995 | #undef SEQ_PUT_DEC |
996 | |
997 | static const struct seq_operations proc_pid_smaps_op = { |
998 | .start = m_start, |
999 | .next = m_next, |
1000 | .stop = m_stop, |
1001 | .show = show_smap |
1002 | }; |
1003 | |
1004 | static int pid_smaps_open(struct inode *inode, struct file *file) |
1005 | { |
	return do_maps_open(inode, file, &proc_pid_smaps_op);
1007 | } |
1008 | |
1009 | static int smaps_rollup_open(struct inode *inode, struct file *file) |
1010 | { |
1011 | int ret; |
1012 | struct proc_maps_private *priv; |
1013 | |
	priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
1015 | if (!priv) |
1016 | return -ENOMEM; |
1017 | |
1018 | ret = single_open(file, show_smaps_rollup, priv); |
1019 | if (ret) |
1020 | goto out_free; |
1021 | |
1022 | priv->inode = inode; |
1023 | priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); |
	if (IS_ERR(priv->mm)) {
		ret = PTR_ERR(priv->mm);
1026 | |
1027 | single_release(inode, file); |
1028 | goto out_free; |
1029 | } |
1030 | |
1031 | return 0; |
1032 | |
1033 | out_free: |
	kfree(priv);
1035 | return ret; |
1036 | } |
1037 | |
1038 | static int smaps_rollup_release(struct inode *inode, struct file *file) |
1039 | { |
1040 | struct seq_file *seq = file->private_data; |
1041 | struct proc_maps_private *priv = seq->private; |
1042 | |
	if (priv->mm)
		mmdrop(priv->mm);

	kfree(priv);
1047 | return single_release(inode, file); |
1048 | } |
1049 | |
1050 | const struct file_operations proc_pid_smaps_operations = { |
1051 | .open = pid_smaps_open, |
1052 | .read = seq_read, |
1053 | .llseek = seq_lseek, |
1054 | .release = proc_map_release, |
1055 | }; |
1056 | |
1057 | const struct file_operations proc_pid_smaps_rollup_operations = { |
1058 | .open = smaps_rollup_open, |
1059 | .read = seq_read, |
1060 | .llseek = seq_lseek, |
1061 | .release = smaps_rollup_release, |
1062 | }; |
1063 | |
1064 | enum clear_refs_types { |
1065 | CLEAR_REFS_ALL = 1, |
1066 | CLEAR_REFS_ANON, |
1067 | CLEAR_REFS_MAPPED, |
1068 | CLEAR_REFS_SOFT_DIRTY, |
	CLEAR_REFS_MM_HIWATER_RSS,
1070 | CLEAR_REFS_LAST, |
1071 | }; |
1072 | |
1073 | struct clear_refs_private { |
1074 | enum clear_refs_types type; |
1075 | }; |
1076 | |
1077 | #ifdef CONFIG_MEM_SOFT_DIRTY |
1078 | |
1079 | static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) |
1080 | { |
1081 | struct page *page; |
1082 | |
1083 | if (!pte_write(pte)) |
1084 | return false; |
	if (!is_cow_mapping(vma->vm_flags))
1086 | return false; |
1087 | if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) |
1088 | return false; |
1089 | page = vm_normal_page(vma, addr, pte); |
1090 | if (!page) |
1091 | return false; |
1092 | return page_maybe_dma_pinned(page); |
1093 | } |
1094 | |
1095 | static inline void clear_soft_dirty(struct vm_area_struct *vma, |
1096 | unsigned long addr, pte_t *pte) |
1097 | { |
1098 | /* |
1099 | * The soft-dirty tracker uses #PF-s to catch writes |
1100 | * to pages, so write-protect the pte as well. See the |
1101 | * Documentation/admin-guide/mm/soft-dirty.rst for full description |
1102 | * of how soft-dirty works. |
1103 | */ |
	pte_t ptent = ptep_get(pte);
1105 | |
	if (pte_present(ptent)) {
		pte_t old_pte;

		if (pte_is_pinned(vma, addr, ptent))
			return;
		old_pte = ptep_modify_prot_start(vma, addr, pte);
		ptent = pte_wrprotect(old_pte);
		ptent = pte_clear_soft_dirty(ptent);
		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_clear_soft_dirty(ptent);
		set_pte_at(vma->vm_mm, addr, pte, ptent);
1118 | } |
1119 | } |
1120 | #else |
1121 | static inline void clear_soft_dirty(struct vm_area_struct *vma, |
1122 | unsigned long addr, pte_t *pte) |
1123 | { |
1124 | } |
1125 | #endif |
1126 | |
1127 | #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) |
1128 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, |
1129 | unsigned long addr, pmd_t *pmdp) |
1130 | { |
1131 | pmd_t old, pmd = *pmdp; |
1132 | |
1133 | if (pmd_present(pmd)) { |
1134 | /* See comment in change_huge_pmd() */ |
		old = pmdp_invalidate(vma, addr, pmdp);
		if (pmd_dirty(old))
			pmd = pmd_mkdirty(pmd);
		if (pmd_young(old))
			pmd = pmd_mkyoung(pmd);

		pmd = pmd_wrprotect(pmd);
		pmd = pmd_clear_soft_dirty(pmd);

		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
		pmd = pmd_swp_clear_soft_dirty(pmd);
		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1148 | } |
1149 | } |
1150 | #else |
1151 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, |
1152 | unsigned long addr, pmd_t *pmdp) |
1153 | { |
1154 | } |
1155 | #endif |
1156 | |
1157 | static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, |
1158 | unsigned long end, struct mm_walk *walk) |
1159 | { |
1160 | struct clear_refs_private *cp = walk->private; |
1161 | struct vm_area_struct *vma = walk->vma; |
1162 | pte_t *pte, ptent; |
1163 | spinlock_t *ptl; |
1164 | struct page *page; |
1165 | |
1166 | ptl = pmd_trans_huge_lock(pmd, vma); |
1167 | if (ptl) { |
1168 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { |
			clear_soft_dirty_pmd(vma, addr, pmd);
1170 | goto out; |
1171 | } |
1172 | |
		if (!pmd_present(*pmd))
1174 | goto out; |
1175 | |
1176 | page = pmd_page(*pmd); |
1177 | |
1178 | /* Clear accessed and referenced bits. */ |
		pmdp_test_and_clear_young(vma, addr, pmd);
		test_and_clear_page_young(page);
		ClearPageReferenced(page);
out:
		spin_unlock(ptl);
1184 | return 0; |
1185 | } |
1186 | |
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1188 | if (!pte) { |
1189 | walk->action = ACTION_AGAIN; |
1190 | return 0; |
1191 | } |
1192 | for (; addr != end; pte++, addr += PAGE_SIZE) { |
		ptent = ptep_get(pte);
1194 | |
1195 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { |
1196 | clear_soft_dirty(vma, addr, pte); |
1197 | continue; |
1198 | } |
1199 | |
		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/* Clear accessed and referenced bits. */
		ptep_test_and_clear_young(vma, addr, pte);
1209 | test_and_clear_page_young(page); |
1210 | ClearPageReferenced(page); |
1211 | } |
1212 | pte_unmap_unlock(pte - 1, ptl); |
1213 | cond_resched(); |
1214 | return 0; |
1215 | } |
1216 | |
1217 | static int clear_refs_test_walk(unsigned long start, unsigned long end, |
1218 | struct mm_walk *walk) |
1219 | { |
1220 | struct clear_refs_private *cp = walk->private; |
1221 | struct vm_area_struct *vma = walk->vma; |
1222 | |
1223 | if (vma->vm_flags & VM_PFNMAP) |
1224 | return 1; |
1225 | |
1226 | /* |
1227 | * Writing 1 to /proc/pid/clear_refs affects all pages. |
1228 | * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. |
1229 | * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. |
1230 | * Writing 4 to /proc/pid/clear_refs affects all pages. |
1231 | */ |
1232 | if (cp->type == CLEAR_REFS_ANON && vma->vm_file) |
1233 | return 1; |
1234 | if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) |
1235 | return 1; |
1236 | return 0; |
1237 | } |
1238 | |
1239 | static const struct mm_walk_ops clear_refs_walk_ops = { |
1240 | .pmd_entry = clear_refs_pte_range, |
1241 | .test_walk = clear_refs_test_walk, |
1242 | .walk_lock = PGWALK_WRLOCK, |
1243 | }; |
1244 | |
1245 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, |
1246 | size_t count, loff_t *ppos) |
1247 | { |
1248 | struct task_struct *task; |
1249 | char buffer[PROC_NUMBUF] = {}; |
1250 | struct mm_struct *mm; |
1251 | struct vm_area_struct *vma; |
1252 | enum clear_refs_types type; |
1253 | int itype; |
1254 | int rv; |
1255 | |
1256 | if (count > sizeof(buffer) - 1) |
1257 | count = sizeof(buffer) - 1; |
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 10, &itype);
1261 | if (rv < 0) |
1262 | return rv; |
1263 | type = (enum clear_refs_types)itype; |
1264 | if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) |
1265 | return -EINVAL; |
1266 | |
	task = get_proc_task(file_inode(file));
1268 | if (!task) |
1269 | return -ESRCH; |
1270 | mm = get_task_mm(task); |
1271 | if (mm) { |
1272 | VMA_ITERATOR(vmi, mm, 0); |
1273 | struct mmu_notifier_range range; |
1274 | struct clear_refs_private cp = { |
1275 | .type = type, |
1276 | }; |
1277 | |
1278 | if (mmap_write_lock_killable(mm)) { |
1279 | count = -EINTR; |
1280 | goto out_mm; |
1281 | } |
1282 | if (type == CLEAR_REFS_MM_HIWATER_RSS) { |
1283 | /* |
1284 | * Writing 5 to /proc/pid/clear_refs resets the peak |
1285 | * resident set size to this mm's current rss value. |
1286 | */ |
1287 | reset_mm_hiwater_rss(mm); |
1288 | goto out_unlock; |
1289 | } |
1290 | |
1291 | if (type == CLEAR_REFS_SOFT_DIRTY) { |
1292 | for_each_vma(vmi, vma) { |
1293 | if (!(vma->vm_flags & VM_SOFTDIRTY)) |
1294 | continue; |
1295 | vm_flags_clear(vma, VM_SOFTDIRTY); |
1296 | vma_set_page_prot(vma); |
1297 | } |
1298 | |
1299 | inc_tlb_flush_pending(mm); |
			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
						0, mm, 0, -1UL);
			mmu_notifier_invalidate_range_start(&range);
		}
		walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
		if (type == CLEAR_REFS_SOFT_DIRTY) {
			mmu_notifier_invalidate_range_end(&range);
1307 | flush_tlb_mm(mm); |
1308 | dec_tlb_flush_pending(mm); |
1309 | } |
1310 | out_unlock: |
1311 | mmap_write_unlock(mm); |
1312 | out_mm: |
1313 | mmput(mm); |
1314 | } |
	put_task_struct(task);
1316 | |
1317 | return count; |
1318 | } |
1319 | |
1320 | const struct file_operations proc_clear_refs_operations = { |
1321 | .write = clear_refs_write, |
1322 | .llseek = noop_llseek, |
1323 | }; |
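/*
 * Typical usage from user space, e.g. to start a new soft-dirty
 * tracking cycle (4 == CLEAR_REFS_SOFT_DIRTY, see
 * Documentation/admin-guide/mm/soft-dirty.rst):
 *
 *	echo 4 > /proc/$PID/clear_refs
 */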
1324 | |
1325 | typedef struct { |
1326 | u64 pme; |
1327 | } pagemap_entry_t; |
1328 | |
1329 | struct pagemapread { |
1330 | int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ |
1331 | pagemap_entry_t *buffer; |
1332 | bool show_pfn; |
1333 | }; |
1334 | |
1335 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) |
1336 | #define PAGEMAP_WALK_MASK (PMD_MASK) |
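/*
 * pagemap_read() walks the address space in PMD-sized, PMD-aligned
 * windows: one pagemap_entry_t per page of the window is buffered
 * before the batch is copied out to user space.
 */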
1337 | |
1338 | #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) |
1339 | #define PM_PFRAME_BITS 55 |
1340 | #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) |
1341 | #define PM_SOFT_DIRTY BIT_ULL(55) |
1342 | #define PM_MMAP_EXCLUSIVE BIT_ULL(56) |
1343 | #define PM_UFFD_WP BIT_ULL(57) |
1344 | #define PM_FILE BIT_ULL(61) |
1345 | #define PM_SWAP BIT_ULL(62) |
1346 | #define PM_PRESENT BIT_ULL(63) |
1347 | |
1348 | #define PM_END_OF_BUFFER 1 |
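/*
 * Example: a present, exclusively mapped, file-backed page at PFN
 * 0x1234 is reported as PM_PRESENT | PM_FILE | PM_MMAP_EXCLUSIVE |
 * 0x1234 (bits 63, 61 and 56 set, PFN in bits 0-54).
 */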
1349 | |
1350 | static inline pagemap_entry_t make_pme(u64 frame, u64 flags) |
1351 | { |
1352 | return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; |
1353 | } |
1354 | |
1355 | static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) |
1356 | { |
1357 | pm->buffer[pm->pos++] = *pme; |
1358 | if (pm->pos >= pm->len) |
1359 | return PM_END_OF_BUFFER; |
1360 | return 0; |
1361 | } |
1362 | |
1363 | static int pagemap_pte_hole(unsigned long start, unsigned long end, |
1364 | __always_unused int depth, struct mm_walk *walk) |
1365 | { |
1366 | struct pagemapread *pm = walk->private; |
1367 | unsigned long addr = start; |
1368 | int err = 0; |
1369 | |
1370 | while (addr < end) { |
		struct vm_area_struct *vma = find_vma(walk->mm, addr);
		pagemap_entry_t pme = make_pme(0, 0);
1373 | /* End of address space hole, which we mark as non-present. */ |
1374 | unsigned long hole_end; |
1375 | |
1376 | if (vma) |
1377 | hole_end = min(end, vma->vm_start); |
1378 | else |
1379 | hole_end = end; |
1380 | |
1381 | for (; addr < hole_end; addr += PAGE_SIZE) { |
			err = add_to_pagemap(&pme, pm);
1383 | if (err) |
1384 | goto out; |
1385 | } |
1386 | |
1387 | if (!vma) |
1388 | break; |
1389 | |
1390 | /* Addresses in the VMA. */ |
1391 | if (vma->vm_flags & VM_SOFTDIRTY) |
			pme = make_pme(0, PM_SOFT_DIRTY);
		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
			err = add_to_pagemap(&pme, pm);
1395 | if (err) |
1396 | goto out; |
1397 | } |
1398 | } |
1399 | out: |
1400 | return err; |
1401 | } |
1402 | |
1403 | static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, |
1404 | struct vm_area_struct *vma, unsigned long addr, pte_t pte) |
1405 | { |
1406 | u64 frame = 0, flags = 0; |
1407 | struct page *page = NULL; |
1408 | bool migration = false; |
1409 | |
	if (pte_present(pte)) {
1411 | if (pm->show_pfn) |
1412 | frame = pte_pfn(pte); |
1413 | flags |= PM_PRESENT; |
1414 | page = vm_normal_page(vma, addr, pte); |
1415 | if (pte_soft_dirty(pte)) |
1416 | flags |= PM_SOFT_DIRTY; |
1417 | if (pte_uffd_wp(pte)) |
1418 | flags |= PM_UFFD_WP; |
1419 | } else if (is_swap_pte(pte)) { |
1420 | swp_entry_t entry; |
1421 | if (pte_swp_soft_dirty(pte)) |
1422 | flags |= PM_SOFT_DIRTY; |
1423 | if (pte_swp_uffd_wp(pte)) |
1424 | flags |= PM_UFFD_WP; |
1425 | entry = pte_to_swp_entry(pte); |
1426 | if (pm->show_pfn) { |
1427 | pgoff_t offset; |
1428 | /* |
			 * For PFN swap entries, keep the offset field as the
			 * PFN, to stay compatible with old smaps.
1431 | */ |
1432 | if (is_pfn_swap_entry(entry)) |
1433 | offset = swp_offset_pfn(entry); |
1434 | else |
1435 | offset = swp_offset(entry); |
1436 | frame = swp_type(entry) | |
1437 | (offset << MAX_SWAPFILES_SHIFT); |
1438 | } |
1439 | flags |= PM_SWAP; |
1440 | migration = is_migration_entry(entry); |
1441 | if (is_pfn_swap_entry(entry)) |
1442 | page = pfn_swap_entry_to_page(entry); |
1443 | if (pte_marker_entry_uffd_wp(entry)) |
1444 | flags |= PM_UFFD_WP; |
1445 | } |
1446 | |
1447 | if (page && !PageAnon(page)) |
1448 | flags |= PM_FILE; |
1449 | if (page && !migration && page_mapcount(page) == 1) |
1450 | flags |= PM_MMAP_EXCLUSIVE; |
1451 | if (vma->vm_flags & VM_SOFTDIRTY) |
1452 | flags |= PM_SOFT_DIRTY; |
1453 | |
1454 | return make_pme(frame, flags); |
1455 | } |
1456 | |
1457 | static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, |
1458 | struct mm_walk *walk) |
1459 | { |
1460 | struct vm_area_struct *vma = walk->vma; |
1461 | struct pagemapread *pm = walk->private; |
1462 | spinlock_t *ptl; |
1463 | pte_t *pte, *orig_pte; |
1464 | int err = 0; |
1465 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1466 | bool migration = false; |
1467 | |
	ptl = pmd_trans_huge_lock(pmdp, vma);
1469 | if (ptl) { |
1470 | u64 flags = 0, frame = 0; |
1471 | pmd_t pmd = *pmdp; |
1472 | struct page *page = NULL; |
1473 | |
1474 | if (vma->vm_flags & VM_SOFTDIRTY) |
1475 | flags |= PM_SOFT_DIRTY; |
1476 | |
1477 | if (pmd_present(pmd)) { |
1478 | page = pmd_page(pmd); |
1479 | |
1480 | flags |= PM_PRESENT; |
1481 | if (pmd_soft_dirty(pmd)) |
1482 | flags |= PM_SOFT_DIRTY; |
1483 | if (pmd_uffd_wp(pmd)) |
1484 | flags |= PM_UFFD_WP; |
1485 | if (pm->show_pfn) |
1486 | frame = pmd_pfn(pmd) + |
1487 | ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
1488 | } |
1489 | #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION |
1490 | else if (is_swap_pmd(pmd)) { |
1491 | swp_entry_t entry = pmd_to_swp_entry(pmd); |
1492 | unsigned long offset; |
1493 | |
1494 | if (pm->show_pfn) { |
1495 | if (is_pfn_swap_entry(entry)) |
1496 | offset = swp_offset_pfn(entry); |
1497 | else |
1498 | offset = swp_offset(entry); |
1499 | offset = offset + |
1500 | ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
1501 | frame = swp_type(entry) | |
1502 | (offset << MAX_SWAPFILES_SHIFT); |
1503 | } |
1504 | flags |= PM_SWAP; |
1505 | if (pmd_swp_soft_dirty(pmd)) |
1506 | flags |= PM_SOFT_DIRTY; |
1507 | if (pmd_swp_uffd_wp(pmd)) |
1508 | flags |= PM_UFFD_WP; |
1509 | VM_BUG_ON(!is_pmd_migration_entry(pmd)); |
1510 | migration = is_migration_entry(entry); |
1511 | page = pfn_swap_entry_to_page(entry); |
1512 | } |
1513 | #endif |
1514 | |
1515 | if (page && !migration && page_mapcount(page) == 1) |
1516 | flags |= PM_MMAP_EXCLUSIVE; |
1517 | |
1518 | for (; addr != end; addr += PAGE_SIZE) { |
1519 | pagemap_entry_t pme = make_pme(frame, flags); |
1520 | |
			err = add_to_pagemap(&pme, pm);
1522 | if (err) |
1523 | break; |
1524 | if (pm->show_pfn) { |
1525 | if (flags & PM_PRESENT) |
1526 | frame++; |
1527 | else if (flags & PM_SWAP) |
1528 | frame += (1 << MAX_SWAPFILES_SHIFT); |
1529 | } |
1530 | } |
		spin_unlock(ptl);
1532 | return err; |
1533 | } |
1534 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
1535 | |
1536 | /* |
1537 | * We can assume that @vma always points to a valid one and @end never |
1538 | * goes beyond vma->vm_end. |
1539 | */ |
	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
1541 | if (!pte) { |
1542 | walk->action = ACTION_AGAIN; |
1543 | return err; |
1544 | } |
1545 | for (; addr < end; pte++, addr += PAGE_SIZE) { |
1546 | pagemap_entry_t pme; |
1547 | |
		pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
		err = add_to_pagemap(&pme, pm);
1550 | if (err) |
1551 | break; |
1552 | } |
1553 | pte_unmap_unlock(orig_pte, ptl); |
1554 | |
1555 | cond_resched(); |
1556 | |
1557 | return err; |
1558 | } |
1559 | |
1560 | #ifdef CONFIG_HUGETLB_PAGE |
1561 | /* This function walks within one hugetlb entry in the single call */ |
1562 | static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, |
1563 | unsigned long addr, unsigned long end, |
1564 | struct mm_walk *walk) |
1565 | { |
1566 | struct pagemapread *pm = walk->private; |
1567 | struct vm_area_struct *vma = walk->vma; |
1568 | u64 flags = 0, frame = 0; |
1569 | int err = 0; |
1570 | pte_t pte; |
1571 | |
1572 | if (vma->vm_flags & VM_SOFTDIRTY) |
1573 | flags |= PM_SOFT_DIRTY; |
1574 | |
1575 | pte = huge_ptep_get(ptep); |
	if (pte_present(pte)) {
1577 | struct page *page = pte_page(pte); |
1578 | |
1579 | if (!PageAnon(page)) |
1580 | flags |= PM_FILE; |
1581 | |
1582 | if (page_mapcount(page) == 1) |
1583 | flags |= PM_MMAP_EXCLUSIVE; |
1584 | |
1585 | if (huge_pte_uffd_wp(pte)) |
1586 | flags |= PM_UFFD_WP; |
1587 | |
1588 | flags |= PM_PRESENT; |
1589 | if (pm->show_pfn) |
1590 | frame = pte_pfn(pte) + |
1591 | ((addr & ~hmask) >> PAGE_SHIFT); |
1592 | } else if (pte_swp_uffd_wp_any(pte)) { |
1593 | flags |= PM_UFFD_WP; |
1594 | } |
1595 | |
1596 | for (; addr != end; addr += PAGE_SIZE) { |
1597 | pagemap_entry_t pme = make_pme(frame, flags); |
1598 | |
		err = add_to_pagemap(&pme, pm);
1600 | if (err) |
1601 | return err; |
1602 | if (pm->show_pfn && (flags & PM_PRESENT)) |
1603 | frame++; |
1604 | } |
1605 | |
1606 | cond_resched(); |
1607 | |
1608 | return err; |
1609 | } |
1610 | #else |
1611 | #define pagemap_hugetlb_range NULL |
1612 | #endif /* HUGETLB_PAGE */ |
1613 | |
1614 | static const struct mm_walk_ops pagemap_ops = { |
1615 | .pmd_entry = pagemap_pmd_range, |
1616 | .pte_hole = pagemap_pte_hole, |
1617 | .hugetlb_entry = pagemap_hugetlb_range, |
1618 | .walk_lock = PGWALK_RDLOCK, |
1619 | }; |
1620 | |
1621 | /* |
1622 | * /proc/pid/pagemap - an array mapping virtual pages to pfns |
1623 | * |
1624 | * For each page in the address space, this file contains one 64-bit entry |
1625 | * consisting of the following: |
1626 | * |
1627 | * Bits 0-54 page frame number (PFN) if present |
1628 | * Bits 0-4 swap type if swapped |
1629 | * Bits 5-54 swap offset if swapped |
1630 | * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) |
1631 | * Bit 56 page exclusively mapped |
1632 | * Bit 57 pte is uffd-wp write-protected |
1633 | * Bits 58-60 zero |
1634 | * Bit 61 page is file-page or shared-anon |
1635 | * Bit 62 page swapped |
1636 | * Bit 63 page present |
1637 | * |
1638 | * If the page is not present but in swap, then the PFN contains an |
1639 | * encoding of the swap file number and the page's offset into the |
1640 | * swap. Unmapped pages return a null PFN. This allows determining |
1641 | * precisely which pages are mapped (or in swap) and comparing mapped |
1642 | * pages between processes. |
1643 | * |
1644 | * Efficient users of this interface will use /proc/pid/maps to |
1645 | * determine which areas of memory are actually mapped and llseek to |
1646 | * skip over unmapped regions. |
1647 | */ |
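/*
 * A minimal user-space lookup sketch (illustrative only, not part of
 * this file). Assumes a 4 KiB page size; without CAP_SYS_ADMIN the
 * PFN field reads back as zero (see pagemap_read() below):
 *
 *	int fd = open("/proc/self/pagemap", O_RDONLY);
 *	uint64_t ent;
 *	off_t off = (vaddr / 4096) * sizeof(ent);
 *
 *	if (pread(fd, &ent, sizeof(ent), off) == sizeof(ent) &&
 *	    (ent & (1ULL << 63)))			/* present */
 *		pfn = ent & ((1ULL << 55) - 1);		/* bits 0-54 */
 */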
1648 | static ssize_t pagemap_read(struct file *file, char __user *buf, |
1649 | size_t count, loff_t *ppos) |
1650 | { |
1651 | struct mm_struct *mm = file->private_data; |
1652 | struct pagemapread pm; |
1653 | unsigned long src; |
1654 | unsigned long svpfn; |
1655 | unsigned long start_vaddr; |
1656 | unsigned long end_vaddr; |
1657 | int ret = 0, copied = 0; |
1658 | |
1659 | if (!mm || !mmget_not_zero(mm)) |
1660 | goto out; |
1661 | |
1662 | ret = -EINVAL; |
1663 | /* file position must be aligned */ |
1664 | if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) |
1665 | goto out_mm; |
1666 | |
1667 | ret = 0; |
1668 | if (!count) |
1669 | goto out_mm; |
1670 | |
1671 | /* do not disclose physical addresses: attack vector */ |
	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
1673 | |
1674 | pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); |
	pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
1676 | ret = -ENOMEM; |
1677 | if (!pm.buffer) |
1678 | goto out_mm; |
1679 | |
1680 | src = *ppos; |
1681 | svpfn = src / PM_ENTRY_BYTES; |
1682 | end_vaddr = mm->task_size; |
1683 | |
1684 | /* watch out for wraparound */ |
1685 | start_vaddr = end_vaddr; |
1686 | if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { |
1687 | unsigned long end; |
1688 | |
1689 | ret = mmap_read_lock_killable(mm); |
1690 | if (ret) |
1691 | goto out_free; |
1692 | start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); |
1693 | mmap_read_unlock(mm); |
1694 | |
1695 | end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); |
1696 | if (end >= start_vaddr && end < mm->task_size) |
1697 | end_vaddr = end; |
1698 | } |
1699 | |
1700 | /* Ensure the address is inside the task */ |
1701 | if (start_vaddr > mm->task_size) |
1702 | start_vaddr = end_vaddr; |
1703 | |
1704 | ret = 0; |
1705 | while (count && (start_vaddr < end_vaddr)) { |
1706 | int len; |
1707 | unsigned long end; |
1708 | |
1709 | pm.pos = 0; |
1710 | end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; |
1711 | /* overflow ? */ |
1712 | if (end < start_vaddr || end > end_vaddr) |
1713 | end = end_vaddr; |
1714 | ret = mmap_read_lock_killable(mm); |
1715 | if (ret) |
1716 | goto out_free; |
		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
1718 | mmap_read_unlock(mm); |
1719 | start_vaddr = end; |
1720 | |
1721 | len = min(count, PM_ENTRY_BYTES * pm.pos); |
		if (copy_to_user(buf, pm.buffer, len)) {
1723 | ret = -EFAULT; |
1724 | goto out_free; |
1725 | } |
1726 | copied += len; |
1727 | buf += len; |
1728 | count -= len; |
1729 | } |
1730 | *ppos += copied; |
1731 | if (!ret || ret == PM_END_OF_BUFFER) |
1732 | ret = copied; |
1733 | |
1734 | out_free: |
	kfree(pm.buffer);
1736 | out_mm: |
1737 | mmput(mm); |
1738 | out: |
1739 | return ret; |
1740 | } |
1741 | |
1742 | static int pagemap_open(struct inode *inode, struct file *file) |
1743 | { |
1744 | struct mm_struct *mm; |
1745 | |
1746 | mm = proc_mem_open(inode, PTRACE_MODE_READ); |
	if (IS_ERR(mm))
		return PTR_ERR(mm);
1749 | file->private_data = mm; |
1750 | return 0; |
1751 | } |
1752 | |
1753 | static int pagemap_release(struct inode *inode, struct file *file) |
1754 | { |
1755 | struct mm_struct *mm = file->private_data; |
1756 | |
1757 | if (mm) |
1758 | mmdrop(mm); |
1759 | return 0; |
1760 | } |
1761 | |
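/* Categories PAGEMAP_SCAN can report/filter on, and the ioctl flags it accepts */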
1762 | #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ |
1763 | PAGE_IS_FILE | PAGE_IS_PRESENT | \ |
1764 | PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ |
1765 | PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) |
1766 | #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) |
1767 | |
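/*
 * Per-call state of one PAGEMAP_SCAN invocation: the validated user
 * arguments, the union of all category masks the walk needs to compute,
 * and the bounce buffer of ranges pending copy-out to @vec_out.
 */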
1768 | struct pagemap_scan_private { |
1769 | struct pm_scan_arg arg; |
1770 | unsigned long masks_of_interest, cur_vma_category; |
1771 | struct page_region *vec_buf; |
1772 | unsigned long vec_buf_len, vec_buf_index, found_pages; |
1773 | struct page_region __user *vec_out; |
1774 | }; |
1775 | |
1776 | static unsigned long pagemap_page_category(struct pagemap_scan_private *p, |
1777 | struct vm_area_struct *vma, |
1778 | unsigned long addr, pte_t pte) |
1779 | { |
1780 | unsigned long categories = 0; |
1781 | |
	if (pte_present(pte)) {
1783 | struct page *page; |
1784 | |
1785 | categories |= PAGE_IS_PRESENT; |
1786 | if (!pte_uffd_wp(pte)) |
1787 | categories |= PAGE_IS_WRITTEN; |
1788 | |
1789 | if (p->masks_of_interest & PAGE_IS_FILE) { |
1790 | page = vm_normal_page(vma, addr, pte); |
1791 | if (page && !PageAnon(page)) |
1792 | categories |= PAGE_IS_FILE; |
1793 | } |
1794 | |
		if (is_zero_pfn(pte_pfn(pte)))
1796 | categories |= PAGE_IS_PFNZERO; |
1797 | if (pte_soft_dirty(pte)) |
1798 | categories |= PAGE_IS_SOFT_DIRTY; |
1799 | } else if (is_swap_pte(pte)) { |
1800 | swp_entry_t swp; |
1801 | |
1802 | categories |= PAGE_IS_SWAPPED; |
1803 | if (!pte_swp_uffd_wp_any(pte)) |
1804 | categories |= PAGE_IS_WRITTEN; |
1805 | |
1806 | if (p->masks_of_interest & PAGE_IS_FILE) { |
1807 | swp = pte_to_swp_entry(pte); |
			if (is_pfn_swap_entry(swp) &&
			    !folio_test_anon(pfn_swap_entry_folio(swp)))
1810 | categories |= PAGE_IS_FILE; |
1811 | } |
1812 | if (pte_swp_soft_dirty(pte)) |
1813 | categories |= PAGE_IS_SOFT_DIRTY; |
1814 | } |
1815 | |
1816 | return categories; |
1817 | } |
1818 | |
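/*
 * Mark one PTE as uffd-wp: present entries get the uffd-wp bit via a
 * prot-modify sequence, swap entries get the swap-side bit, and none
 * entries get a PTE_MARKER_UFFD_WP marker so a later first write still
 * faults.
 */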
1819 | static void make_uffd_wp_pte(struct vm_area_struct *vma, |
1820 | unsigned long addr, pte_t *pte) |
1821 | { |
	pte_t ptent = ptep_get(pte);
1823 | |
	if (pte_present(ptent)) {
1825 | pte_t old_pte; |
1826 | |
		old_pte = ptep_modify_prot_start(vma, addr, pte);
		ptent = pte_mkuffd_wp(ptent);
		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_mkuffd_wp(ptent);
1832 | set_pte_at(vma->vm_mm, addr, pte, ptent); |
1833 | } else { |
1834 | set_pte_at(vma->vm_mm, addr, pte, |
1835 | make_pte_marker(PTE_MARKER_UFFD_WP)); |
1836 | } |
1837 | } |
1838 | |
1839 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1840 | static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, |
1841 | struct vm_area_struct *vma, |
1842 | unsigned long addr, pmd_t pmd) |
1843 | { |
1844 | unsigned long categories = PAGE_IS_HUGE; |
1845 | |
1846 | if (pmd_present(pmd)) { |
1847 | struct page *page; |
1848 | |
1849 | categories |= PAGE_IS_PRESENT; |
1850 | if (!pmd_uffd_wp(pmd)) |
1851 | categories |= PAGE_IS_WRITTEN; |
1852 | |
1853 | if (p->masks_of_interest & PAGE_IS_FILE) { |
1854 | page = vm_normal_page_pmd(vma, addr, pmd); |
1855 | if (page && !PageAnon(page)) |
1856 | categories |= PAGE_IS_FILE; |
1857 | } |
1858 | |
		if (is_zero_pfn(pmd_pfn(pmd)))
1860 | categories |= PAGE_IS_PFNZERO; |
1861 | if (pmd_soft_dirty(pmd)) |
1862 | categories |= PAGE_IS_SOFT_DIRTY; |
1863 | } else if (is_swap_pmd(pmd)) { |
1864 | swp_entry_t swp; |
1865 | |
1866 | categories |= PAGE_IS_SWAPPED; |
1867 | if (!pmd_swp_uffd_wp(pmd)) |
1868 | categories |= PAGE_IS_WRITTEN; |
1869 | if (pmd_swp_soft_dirty(pmd)) |
1870 | categories |= PAGE_IS_SOFT_DIRTY; |
1871 | |
1872 | if (p->masks_of_interest & PAGE_IS_FILE) { |
1873 | swp = pmd_to_swp_entry(pmd); |
			if (is_pfn_swap_entry(swp) &&
			    !folio_test_anon(pfn_swap_entry_folio(swp)))
1876 | categories |= PAGE_IS_FILE; |
1877 | } |
1878 | } |
1879 | |
1880 | return categories; |
1881 | } |
1882 | |
1883 | static void make_uffd_wp_pmd(struct vm_area_struct *vma, |
1884 | unsigned long addr, pmd_t *pmdp) |
1885 | { |
1886 | pmd_t old, pmd = *pmdp; |
1887 | |
1888 | if (pmd_present(pmd)) { |
		old = pmdp_invalidate_ad(vma, addr, pmdp);
		pmd = pmd_mkuffd_wp(old);
		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
		pmd = pmd_swp_mkuffd_wp(pmd);
		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
1895 | } |
1896 | } |
1897 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
1898 | |
1899 | #ifdef CONFIG_HUGETLB_PAGE |
1900 | static unsigned long pagemap_hugetlb_category(pte_t pte) |
1901 | { |
1902 | unsigned long categories = PAGE_IS_HUGE; |
1903 | |
1904 | /* |
1905 | * According to pagemap_hugetlb_range(), file-backed HugeTLB |
1906 | * page cannot be swapped. So PAGE_IS_FILE is not checked for |
1907 | * swapped pages. |
1908 | */ |
	if (pte_present(pte)) {
1910 | categories |= PAGE_IS_PRESENT; |
1911 | if (!huge_pte_uffd_wp(pte)) |
1912 | categories |= PAGE_IS_WRITTEN; |
1913 | if (!PageAnon(pte_page(pte))) |
1914 | categories |= PAGE_IS_FILE; |
		if (is_zero_pfn(pte_pfn(pte)))
1916 | categories |= PAGE_IS_PFNZERO; |
1917 | if (pte_soft_dirty(pte)) |
1918 | categories |= PAGE_IS_SOFT_DIRTY; |
1919 | } else if (is_swap_pte(pte)) { |
1920 | categories |= PAGE_IS_SWAPPED; |
1921 | if (!pte_swp_uffd_wp_any(pte)) |
1922 | categories |= PAGE_IS_WRITTEN; |
1923 | if (pte_swp_soft_dirty(pte)) |
1924 | categories |= PAGE_IS_SOFT_DIRTY; |
1925 | } |
1926 | |
1927 | return categories; |
1928 | } |
1929 | |
1930 | static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, |
1931 | unsigned long addr, pte_t *ptep, |
1932 | pte_t ptent) |
1933 | { |
1934 | unsigned long psize; |
1935 | |
	if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
1937 | return; |
1938 | |
	psize = huge_page_size(hstate_vma(vma));
1940 | |
	if (is_hugetlb_entry_migration(ptent))
		set_huge_pte_at(vma->vm_mm, addr, ptep,
				pte_swp_mkuffd_wp(ptent), psize);
	else if (!huge_pte_none(ptent))
		huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
					     huge_pte_mkuffd_wp(ptent));
	else
		set_huge_pte_at(vma->vm_mm, addr, ptep,
				make_pte_marker(PTE_MARKER_UFFD_WP), psize);
1950 | } |
1951 | #endif /* CONFIG_HUGETLB_PAGE */ |
1952 | |
1953 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) |
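/*
 * Undo pagemap_scan_output() for [addr, end): shrink or drop the current
 * output entry and return the pages to the found_pages budget.
 */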
1954 | static void pagemap_scan_backout_range(struct pagemap_scan_private *p, |
1955 | unsigned long addr, unsigned long end) |
1956 | { |
1957 | struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; |
1958 | |
1959 | if (cur_buf->start != addr) |
1960 | cur_buf->end = addr; |
1961 | else |
1962 | cur_buf->start = cur_buf->end = 0; |
1963 | |
1964 | p->found_pages -= (end - addr) / PAGE_SIZE; |
1965 | } |
1966 | #endif |
1967 | |
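/*
 * Category matching: the page's categories are first XORed with
 * category_inverted, then all bits in category_mask must be set and,
 * if category_anyof_mask is non-zero, at least one of its bits must be
 * set. E.g. (illustrative) category_mask = PAGE_IS_PRESENT |
 * PAGE_IS_WRITTEN with category_inverted = PAGE_IS_WRITTEN selects
 * pages that are present and NOT written.
 */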
1968 | static bool pagemap_scan_is_interesting_page(unsigned long categories, |
1969 | const struct pagemap_scan_private *p) |
1970 | { |
1971 | categories ^= p->arg.category_inverted; |
1972 | if ((categories & p->arg.category_mask) != p->arg.category_mask) |
1973 | return false; |
1974 | if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) |
1975 | return false; |
1976 | |
1977 | return true; |
1978 | } |
1979 | |
1980 | static bool pagemap_scan_is_interesting_vma(unsigned long categories, |
1981 | const struct pagemap_scan_private *p) |
1982 | { |
1983 | unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; |
1984 | |
1985 | categories ^= p->arg.category_inverted; |
1986 | if ((categories & required) != required) |
1987 | return false; |
1988 | |
1989 | return true; |
1990 | } |
1991 | |
1992 | static int pagemap_scan_test_walk(unsigned long start, unsigned long end, |
1993 | struct mm_walk *walk) |
1994 | { |
1995 | struct pagemap_scan_private *p = walk->private; |
1996 | struct vm_area_struct *vma = walk->vma; |
1997 | unsigned long vma_category = 0; |
1998 | bool wp_allowed = userfaultfd_wp_async(vma) && |
1999 | userfaultfd_wp_use_markers(vma); |
2000 | |
	if (!wp_allowed) {
		/* The user asked to fail explicitly when wp-async is unavailable */
		if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
			return -EPERM;
		/*
		 * The user wants wr-protection but allows VMAs that do not
		 * support it to be skipped silently.
		 */
		if (p->arg.flags & PM_SCAN_WP_MATCHING)
			return 1;
		/*
		 * Otherwise the request does not involve wr-protection at
		 * all; fall through to the remaining checks and allow the
		 * vma walk.
		 */
	}
2016 | |
2017 | if (vma->vm_flags & VM_PFNMAP) |
2018 | return 1; |
2019 | |
2020 | if (wp_allowed) |
2021 | vma_category |= PAGE_IS_WPALLOWED; |
2022 | |
2023 | if (vma->vm_flags & VM_SOFTDIRTY) |
2024 | vma_category |= PAGE_IS_SOFT_DIRTY; |
2025 | |
	if (!pagemap_scan_is_interesting_vma(vma_category, p))
2027 | return 1; |
2028 | |
2029 | p->cur_vma_category = vma_category; |
2030 | |
2031 | return 0; |
2032 | } |
2033 | |
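/*
 * Append [addr, end) to the output buffer, extending the current entry
 * when the range is contiguous with it and carries the same categories.
 * Returns false if a new entry is needed but the buffer is full.
 */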
2034 | static bool pagemap_scan_push_range(unsigned long categories, |
2035 | struct pagemap_scan_private *p, |
2036 | unsigned long addr, unsigned long end) |
2037 | { |
2038 | struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; |
2039 | |
2040 | /* |
2041 | * When there is no output buffer provided at all, the sentinel values |
2042 | * won't match here. There is no other way for `cur_buf->end` to be |
2043 | * non-zero other than it being non-empty. |
2044 | */ |
2045 | if (addr == cur_buf->end && categories == cur_buf->categories) { |
2046 | cur_buf->end = end; |
2047 | return true; |
2048 | } |
2049 | |
2050 | if (cur_buf->end) { |
2051 | if (p->vec_buf_index >= p->vec_buf_len - 1) |
2052 | return false; |
2053 | |
2054 | cur_buf = &p->vec_buf[++p->vec_buf_index]; |
2055 | } |
2056 | |
2057 | cur_buf->start = addr; |
2058 | cur_buf->end = end; |
2059 | cur_buf->categories = categories; |
2060 | |
2061 | return true; |
2062 | } |
2063 | |
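/*
 * Report a matching range, clamping it so that no more than
 * arg.max_pages pages are returned overall. On -ENOSPC (page quota or
 * output buffer exhausted) *end is trimmed to the last address actually
 * consumed and recorded as arg.walk_end.
 */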
2064 | static int pagemap_scan_output(unsigned long categories, |
2065 | struct pagemap_scan_private *p, |
2066 | unsigned long addr, unsigned long *end) |
2067 | { |
2068 | unsigned long n_pages, total_pages; |
2069 | int ret = 0; |
2070 | |
2071 | if (!p->vec_buf) |
2072 | return 0; |
2073 | |
2074 | categories &= p->arg.return_mask; |
2075 | |
2076 | n_pages = (*end - addr) / PAGE_SIZE; |
2077 | if (check_add_overflow(p->found_pages, n_pages, &total_pages) || |
2078 | total_pages > p->arg.max_pages) { |
2079 | size_t n_too_much = total_pages - p->arg.max_pages; |
2080 | *end -= n_too_much * PAGE_SIZE; |
2081 | n_pages -= n_too_much; |
2082 | ret = -ENOSPC; |
2083 | } |
2084 | |
	if (!pagemap_scan_push_range(categories, p, addr, *end)) {
2086 | *end = addr; |
2087 | n_pages = 0; |
2088 | ret = -ENOSPC; |
2089 | } |
2090 | |
2091 | p->found_pages += n_pages; |
2092 | if (ret) |
2093 | p->arg.walk_end = *end; |
2094 | |
2095 | return ret; |
2096 | } |
2097 | |
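/*
 * Returns -ENOENT when @pmd is not a mapped transparent huge page (or
 * THP support is compiled out), telling the caller to fall back to the
 * per-PTE scan.
 */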
2098 | static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, |
2099 | unsigned long end, struct mm_walk *walk) |
2100 | { |
2101 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2102 | struct pagemap_scan_private *p = walk->private; |
2103 | struct vm_area_struct *vma = walk->vma; |
2104 | unsigned long categories; |
2105 | spinlock_t *ptl; |
2106 | int ret = 0; |
2107 | |
2108 | ptl = pmd_trans_huge_lock(pmd, vma); |
2109 | if (!ptl) |
2110 | return -ENOENT; |
2111 | |
	categories = p->cur_vma_category |
		     pagemap_thp_category(p, vma, start, *pmd);
2114 | |
2115 | if (!pagemap_scan_is_interesting_page(categories, p)) |
2116 | goto out_unlock; |
2117 | |
	ret = pagemap_scan_output(categories, p, start, &end);
2119 | if (start == end) |
2120 | goto out_unlock; |
2121 | |
2122 | if (~p->arg.flags & PM_SCAN_WP_MATCHING) |
2123 | goto out_unlock; |
2124 | if (~categories & PAGE_IS_WRITTEN) |
2125 | goto out_unlock; |
2126 | |
2127 | /* |
2128 | * Break huge page into small pages if the WP operation |
2129 | * needs to be performed on a portion of the huge page. |
2130 | */ |
2131 | if (end != start + HPAGE_SIZE) { |
		spin_unlock(ptl);
		split_huge_pmd(vma, pmd, start);
		pagemap_scan_backout_range(p, start, end);
2135 | /* Report as if there was no THP */ |
2136 | return -ENOENT; |
2137 | } |
2138 | |
	make_uffd_wp_pmd(vma, start, pmd);
2140 | flush_tlb_range(vma, start, end); |
2141 | out_unlock: |
	spin_unlock(ptl);
2143 | return ret; |
2144 | #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ |
2145 | return -ENOENT; |
2146 | #endif |
2147 | } |
2148 | |
2149 | static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, |
2150 | unsigned long end, struct mm_walk *walk) |
2151 | { |
2152 | struct pagemap_scan_private *p = walk->private; |
2153 | struct vm_area_struct *vma = walk->vma; |
2154 | unsigned long addr, flush_end = 0; |
2155 | pte_t *pte, *start_pte; |
2156 | spinlock_t *ptl; |
2157 | int ret; |
2158 | |
2159 | arch_enter_lazy_mmu_mode(); |
2160 | |
2161 | ret = pagemap_scan_thp_entry(pmd, start, end, walk); |
2162 | if (ret != -ENOENT) { |
2163 | arch_leave_lazy_mmu_mode(); |
2164 | return ret; |
2165 | } |
2166 | |
2167 | ret = 0; |
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
2169 | if (!pte) { |
2170 | arch_leave_lazy_mmu_mode(); |
2171 | walk->action = ACTION_AGAIN; |
2172 | return 0; |
2173 | } |
2174 | |
2175 | if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { |
2176 | /* Fast path for performing exclusive WP */ |
2177 | for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { |
			if (pte_uffd_wp(ptep_get(pte)))
2179 | continue; |
2180 | make_uffd_wp_pte(vma, addr, pte); |
2181 | if (!flush_end) |
2182 | start = addr; |
2183 | flush_end = addr + PAGE_SIZE; |
2184 | } |
2185 | goto flush_and_return; |
2186 | } |
2187 | |
2188 | if (!p->arg.category_anyof_mask && !p->arg.category_inverted && |
2189 | p->arg.category_mask == PAGE_IS_WRITTEN && |
2190 | p->arg.return_mask == PAGE_IS_WRITTEN) { |
2191 | for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { |
2192 | unsigned long next = addr + PAGE_SIZE; |
2193 | |
			if (pte_uffd_wp(ptep_get(pte)))
				continue;
			ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
						  p, addr, &next);
2198 | if (next == addr) |
2199 | break; |
2200 | if (~p->arg.flags & PM_SCAN_WP_MATCHING) |
2201 | continue; |
2202 | make_uffd_wp_pte(vma, addr, pte); |
2203 | if (!flush_end) |
2204 | start = addr; |
2205 | flush_end = next; |
2206 | } |
2207 | goto flush_and_return; |
2208 | } |
2209 | |
2210 | for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { |
		unsigned long categories = p->cur_vma_category |
					   pagemap_page_category(p, vma, addr, ptep_get(pte));
2213 | unsigned long next = addr + PAGE_SIZE; |
2214 | |
2215 | if (!pagemap_scan_is_interesting_page(categories, p)) |
2216 | continue; |
2217 | |
		ret = pagemap_scan_output(categories, p, addr, &next);
2219 | if (next == addr) |
2220 | break; |
2221 | |
2222 | if (~p->arg.flags & PM_SCAN_WP_MATCHING) |
2223 | continue; |
2224 | if (~categories & PAGE_IS_WRITTEN) |
2225 | continue; |
2226 | |
2227 | make_uffd_wp_pte(vma, addr, pte); |
2228 | if (!flush_end) |
2229 | start = addr; |
2230 | flush_end = next; |
2231 | } |
2232 | |
2233 | flush_and_return: |
2234 | if (flush_end) |
2235 | flush_tlb_range(vma, start, addr); |
2236 | |
2237 | pte_unmap_unlock(start_pte, ptl); |
2238 | arch_leave_lazy_mmu_mode(); |
2239 | |
2240 | cond_resched(); |
2241 | return ret; |
2242 | } |
2243 | |
2244 | #ifdef CONFIG_HUGETLB_PAGE |
2245 | static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, |
2246 | unsigned long start, unsigned long end, |
2247 | struct mm_walk *walk) |
2248 | { |
2249 | struct pagemap_scan_private *p = walk->private; |
2250 | struct vm_area_struct *vma = walk->vma; |
2251 | unsigned long categories; |
2252 | spinlock_t *ptl; |
2253 | int ret = 0; |
2254 | pte_t pte; |
2255 | |
2256 | if (~p->arg.flags & PM_SCAN_WP_MATCHING) { |
2257 | /* Go the short route when not write-protecting pages. */ |
2258 | |
2259 | pte = huge_ptep_get(ptep); |
2260 | categories = p->cur_vma_category | pagemap_hugetlb_category(pte); |
2261 | |
2262 | if (!pagemap_scan_is_interesting_page(categories, p)) |
2263 | return 0; |
2264 | |
		return pagemap_scan_output(categories, p, start, &end);
2266 | } |
2267 | |
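	/*
	 * The i_mmap write lock keeps shared huge PMDs stable (no PMD
	 * sharing or unsharing) while the entry is write-protected below.
	 */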
	i_mmap_lock_write(vma->vm_file->f_mapping);
	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
2270 | |
2271 | pte = huge_ptep_get(ptep); |
2272 | categories = p->cur_vma_category | pagemap_hugetlb_category(pte); |
2273 | |
2274 | if (!pagemap_scan_is_interesting_page(categories, p)) |
2275 | goto out_unlock; |
2276 | |
	ret = pagemap_scan_output(categories, p, start, &end);
2278 | if (start == end) |
2279 | goto out_unlock; |
2280 | |
2281 | if (~categories & PAGE_IS_WRITTEN) |
2282 | goto out_unlock; |
2283 | |
2284 | if (end != start + HPAGE_SIZE) { |
2285 | /* Partial HugeTLB page WP isn't possible. */ |
		pagemap_scan_backout_range(p, start, end);
2287 | p->arg.walk_end = start; |
2288 | ret = 0; |
2289 | goto out_unlock; |
2290 | } |
2291 | |
	make_uffd_wp_huge_pte(vma, start, ptep, pte);
2293 | flush_hugetlb_tlb_range(vma, start, end); |
2294 | |
2295 | out_unlock: |
	spin_unlock(ptl);
	i_mmap_unlock_write(vma->vm_file->f_mapping);
2298 | |
2299 | return ret; |
2300 | } |
2301 | #else |
2302 | #define pagemap_scan_hugetlb_entry NULL |
2303 | #endif |
2304 | |
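/*
 * Holes (no page tables, or empty entries) carry only the per-VMA
 * categories. When wr-protecting, uffd_wp_range() installs uffd-wp PTE
 * markers across the hole so that later first writes still fault.
 */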
2305 | static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, |
2306 | int depth, struct mm_walk *walk) |
2307 | { |
2308 | struct pagemap_scan_private *p = walk->private; |
2309 | struct vm_area_struct *vma = walk->vma; |
2310 | int ret, err; |
2311 | |
	if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
2313 | return 0; |
2314 | |
	ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
2316 | if (addr == end) |
2317 | return ret; |
2318 | |
2319 | if (~p->arg.flags & PM_SCAN_WP_MATCHING) |
2320 | return ret; |
2321 | |
	err = uffd_wp_range(vma, addr, end - addr, true);
2323 | if (err < 0) |
2324 | ret = err; |
2325 | |
2326 | return ret; |
2327 | } |
2328 | |
2329 | static const struct mm_walk_ops pagemap_scan_ops = { |
2330 | .test_walk = pagemap_scan_test_walk, |
2331 | .pmd_entry = pagemap_scan_pmd_entry, |
2332 | .pte_hole = pagemap_scan_pte_hole, |
2333 | .hugetlb_entry = pagemap_scan_hugetlb_entry, |
2334 | }; |
2335 | |
2336 | static int pagemap_scan_get_args(struct pm_scan_arg *arg, |
2337 | unsigned long uarg) |
2338 | { |
	if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
2340 | return -EFAULT; |
2341 | |
2342 | if (arg->size != sizeof(struct pm_scan_arg)) |
2343 | return -EINVAL; |
2344 | |
2345 | /* Validate requested features */ |
2346 | if (arg->flags & ~PM_SCAN_FLAGS) |
2347 | return -EINVAL; |
2348 | if ((arg->category_inverted | arg->category_mask | |
2349 | arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) |
2350 | return -EINVAL; |
2351 | |
2352 | arg->start = untagged_addr((unsigned long)arg->start); |
2353 | arg->end = untagged_addr((unsigned long)arg->end); |
2354 | arg->vec = untagged_addr((unsigned long)arg->vec); |
2355 | |
2356 | /* Validate memory pointers */ |
2357 | if (!IS_ALIGNED(arg->start, PAGE_SIZE)) |
2358 | return -EINVAL; |
2359 | if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) |
2360 | return -EFAULT; |
2361 | if (!arg->vec && arg->vec_len) |
2362 | return -EINVAL; |
	if (arg->vec && !access_ok((void __user *)(long)arg->vec,
				   size_mul(arg->vec_len, sizeof(struct page_region))))
2365 | return -EFAULT; |
2366 | |
2367 | /* Fixup default values */ |
2368 | arg->end = ALIGN(arg->end, PAGE_SIZE); |
2369 | arg->walk_end = 0; |
2370 | if (!arg->max_pages) |
2371 | arg->max_pages = ULONG_MAX; |
2372 | |
2373 | return 0; |
2374 | } |
2375 | |
2376 | static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, |
2377 | unsigned long uargl) |
2378 | { |
2379 | struct pm_scan_arg __user *uarg = (void __user *)uargl; |
2380 | |
	if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
2382 | return -EFAULT; |
2383 | |
2384 | return 0; |
2385 | } |
2386 | |
2387 | static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) |
2388 | { |
2389 | if (!p->arg.vec_len) |
2390 | return 0; |
2391 | |
2392 | p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, |
2393 | p->arg.vec_len); |
	p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
				   GFP_KERNEL);
2396 | if (!p->vec_buf) |
2397 | return -ENOMEM; |
2398 | |
2399 | p->vec_buf->start = p->vec_buf->end = 0; |
2400 | p->vec_out = (struct page_region __user *)(long)p->arg.vec; |
2401 | |
2402 | return 0; |
2403 | } |
2404 | |
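/*
 * Copy the ranges collected so far to userspace and reset the bounce
 * buffer for the next batch. Returns the number of ranges copied out,
 * 0 when there is nothing to flush, or -EFAULT.
 */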
2405 | static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) |
2406 | { |
2407 | const struct page_region *buf = p->vec_buf; |
2408 | long n = p->vec_buf_index; |
2409 | |
2410 | if (!p->vec_buf) |
2411 | return 0; |
2412 | |
2413 | if (buf[n].end != buf[n].start) |
2414 | n++; |
2415 | |
2416 | if (!n) |
2417 | return 0; |
2418 | |
	if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
2420 | return -EFAULT; |
2421 | |
2422 | p->arg.vec_len -= n; |
2423 | p->vec_out += n; |
2424 | |
2425 | p->vec_buf_index = 0; |
2426 | p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); |
2427 | p->vec_buf->start = p->vec_buf->end = 0; |
2428 | |
2429 | return n; |
2430 | } |
2431 | |
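/*
 * Top-level PAGEMAP_SCAN handler: repeatedly walks from arg.start (then
 * from the last walk_end) towards arg.end, flushing the bounce buffer
 * to userspace between passes so that no copy_to_user() happens under
 * mmap_lock. Returns the number of ranges reported or a negative errno.
 */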
2432 | static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) |
2433 | { |
2434 | struct pagemap_scan_private p = {0}; |
2435 | unsigned long walk_start; |
2436 | size_t n_ranges_out = 0; |
2437 | int ret; |
2438 | |
	ret = pagemap_scan_get_args(&p.arg, uarg);
2440 | if (ret) |
2441 | return ret; |
2442 | |
2443 | p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | |
2444 | p.arg.return_mask; |
	ret = pagemap_scan_init_bounce_buffer(&p);
2446 | if (ret) |
2447 | return ret; |
2448 | |
2449 | for (walk_start = p.arg.start; walk_start < p.arg.end; |
2450 | walk_start = p.arg.walk_end) { |
2451 | struct mmu_notifier_range range; |
2452 | long n_out; |
2453 | |
2454 | if (fatal_signal_pending(current)) { |
2455 | ret = -EINTR; |
2456 | break; |
2457 | } |
2458 | |
2459 | ret = mmap_read_lock_killable(mm); |
2460 | if (ret) |
2461 | break; |
2462 | |
2463 | /* Protection change for the range is going to happen. */ |
2464 | if (p.arg.flags & PM_SCAN_WP_MATCHING) { |
			mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
						mm, walk_start, p.arg.end);
			mmu_notifier_invalidate_range_start(&range);
2468 | } |
2469 | |
		ret = walk_page_range(mm, walk_start, p.arg.end,
				      &pagemap_scan_ops, &p);
2472 | |
2473 | if (p.arg.flags & PM_SCAN_WP_MATCHING) |
			mmu_notifier_invalidate_range_end(&range);
2475 | |
2476 | mmap_read_unlock(mm); |
2477 | |
		n_out = pagemap_scan_flush_buffer(&p);
2479 | if (n_out < 0) |
2480 | ret = n_out; |
2481 | else |
2482 | n_ranges_out += n_out; |
2483 | |
2484 | if (ret != -ENOSPC) |
2485 | break; |
2486 | |
2487 | if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) |
2488 | break; |
2489 | } |
2490 | |
2491 | /* ENOSPC signifies early stop (buffer full) from the walk. */ |
2492 | if (!ret || ret == -ENOSPC) |
2493 | ret = n_ranges_out; |
2494 | |
2495 | /* The walk_end isn't set when ret is zero */ |
2496 | if (!p.arg.walk_end) |
2497 | p.arg.walk_end = p.arg.end; |
	if (pagemap_scan_writeback_args(&p.arg, uarg))
2499 | ret = -EFAULT; |
2500 | |
	kfree(p.vec_buf);
2502 | return ret; |
2503 | } |
2504 | |
2505 | static long do_pagemap_cmd(struct file *file, unsigned int cmd, |
2506 | unsigned long arg) |
2507 | { |
2508 | struct mm_struct *mm = file->private_data; |
2509 | |
2510 | switch (cmd) { |
2511 | case PAGEMAP_SCAN: |
		return do_pagemap_scan(mm, arg);
2513 | |
2514 | default: |
2515 | return -EINVAL; |
2516 | } |
2517 | } |
2518 | |
2519 | const struct file_operations proc_pagemap_operations = { |
2520 | .llseek = mem_lseek, /* borrow this */ |
2521 | .read = pagemap_read, |
2522 | .open = pagemap_open, |
2523 | .release = pagemap_release, |
2524 | .unlocked_ioctl = do_pagemap_cmd, |
2525 | .compat_ioctl = do_pagemap_cmd, |
2526 | }; |
2527 | #endif /* CONFIG_PROC_PAGE_MONITOR */ |
2528 | |
2529 | #ifdef CONFIG_NUMA |
2530 | |
2531 | struct numa_maps { |
2532 | unsigned long pages; |
2533 | unsigned long anon; |
2534 | unsigned long active; |
2535 | unsigned long writeback; |
2536 | unsigned long mapcount_max; |
2537 | unsigned long dirty; |
2538 | unsigned long swapcache; |
2539 | unsigned long node[MAX_NUMNODES]; |
2540 | }; |
2541 | |
2542 | struct numa_maps_private { |
2543 | struct proc_maps_private proc_maps; |
2544 | struct numa_maps md; |
2545 | }; |
2546 | |
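/*
 * Accumulate per-node NUMA statistics for @nr_pages pages backed by
 * @page; @pte_dirty carries the dirty bit of the mapping entry, checked
 * in addition to PageDirty().
 */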
2547 | static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, |
2548 | unsigned long nr_pages) |
2549 | { |
2550 | int count = page_mapcount(page); |
2551 | |
2552 | md->pages += nr_pages; |
2553 | if (pte_dirty || PageDirty(page)) |
2554 | md->dirty += nr_pages; |
2555 | |
2556 | if (PageSwapCache(page)) |
2557 | md->swapcache += nr_pages; |
2558 | |
2559 | if (PageActive(page) || PageUnevictable(page)) |
2560 | md->active += nr_pages; |
2561 | |
2562 | if (PageWriteback(page)) |
2563 | md->writeback += nr_pages; |
2564 | |
2565 | if (PageAnon(page)) |
2566 | md->anon += nr_pages; |
2567 | |
2568 | if (count > md->mapcount_max) |
2569 | md->mapcount_max = count; |
2570 | |
2571 | md->node[page_to_nid(page)] += nr_pages; |
2572 | } |
2573 | |
2574 | static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, |
2575 | unsigned long addr) |
2576 | { |
2577 | struct page *page; |
2578 | int nid; |
2579 | |
	if (!pte_present(pte))
2581 | return NULL; |
2582 | |
2583 | page = vm_normal_page(vma, addr, pte); |
2584 | if (!page || is_zone_device_page(page)) |
2585 | return NULL; |
2586 | |
2587 | if (PageReserved(page)) |
2588 | return NULL; |
2589 | |
2590 | nid = page_to_nid(page); |
2591 | if (!node_isset(nid, node_states[N_MEMORY])) |
2592 | return NULL; |
2593 | |
2594 | return page; |
2595 | } |
2596 | |
2597 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2598 | static struct page *can_gather_numa_stats_pmd(pmd_t pmd, |
2599 | struct vm_area_struct *vma, |
2600 | unsigned long addr) |
2601 | { |
2602 | struct page *page; |
2603 | int nid; |
2604 | |
2605 | if (!pmd_present(pmd)) |
2606 | return NULL; |
2607 | |
2608 | page = vm_normal_page_pmd(vma, addr, pmd); |
2609 | if (!page) |
2610 | return NULL; |
2611 | |
2612 | if (PageReserved(page)) |
2613 | return NULL; |
2614 | |
2615 | nid = page_to_nid(page); |
2616 | if (!node_isset(nid, node_states[N_MEMORY])) |
2617 | return NULL; |
2618 | |
2619 | return page; |
2620 | } |
2621 | #endif |
2622 | |
2623 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, |
2624 | unsigned long end, struct mm_walk *walk) |
2625 | { |
2626 | struct numa_maps *md = walk->private; |
2627 | struct vm_area_struct *vma = walk->vma; |
2628 | spinlock_t *ptl; |
2629 | pte_t *orig_pte; |
2630 | pte_t *pte; |
2631 | |
2632 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2633 | ptl = pmd_trans_huge_lock(pmd, vma); |
2634 | if (ptl) { |
2635 | struct page *page; |
2636 | |
		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
		if (page)
			gather_stats(page, md, pmd_dirty(*pmd),
				     HPAGE_PMD_SIZE/PAGE_SIZE);
		spin_unlock(ptl);
2642 | return 0; |
2643 | } |
2644 | #endif |
	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
2646 | if (!pte) { |
2647 | walk->action = ACTION_AGAIN; |
2648 | return 0; |
2649 | } |
2650 | do { |
		pte_t ptent = ptep_get(pte);
		struct page *page = can_gather_numa_stats(ptent, vma, addr);

		if (!page)
			continue;
		gather_stats(page, md, pte_dirty(ptent), 1);
2657 | } while (pte++, addr += PAGE_SIZE, addr != end); |
2658 | pte_unmap_unlock(orig_pte, ptl); |
2659 | cond_resched(); |
2660 | return 0; |
2661 | } |
2662 | #ifdef CONFIG_HUGETLB_PAGE |
2663 | static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, |
2664 | unsigned long addr, unsigned long end, struct mm_walk *walk) |
2665 | { |
	pte_t huge_pte = huge_ptep_get(pte);
2667 | struct numa_maps *md; |
2668 | struct page *page; |
2669 | |
	if (!pte_present(huge_pte))
2671 | return 0; |
2672 | |
2673 | page = pte_page(huge_pte); |
2674 | |
2675 | md = walk->private; |
	gather_stats(page, md, pte_dirty(huge_pte), 1);
2677 | return 0; |
2678 | } |
2679 | |
2680 | #else |
2681 | static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, |
2682 | unsigned long addr, unsigned long end, struct mm_walk *walk) |
2683 | { |
2684 | return 0; |
2685 | } |
2686 | #endif |
2687 | |
2688 | static const struct mm_walk_ops show_numa_ops = { |
2689 | .hugetlb_entry = gather_hugetlb_stats, |
2690 | .pmd_entry = gather_pte_stats, |
2691 | .walk_lock = PGWALK_RDLOCK, |
2692 | }; |
2693 | |
2694 | /* |
2695 | * Display pages allocated per node and memory policy via /proc. |
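 *
 * A line of output may look like (values illustrative):
 *
 *	7f58a8a00000 default file=/usr/lib/libc.so.6 mapped=42 N0=42 kernelpagesize_kB=4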
2696 | */ |
2697 | static int show_numa_map(struct seq_file *m, void *v) |
2698 | { |
2699 | struct numa_maps_private *numa_priv = m->private; |
2700 | struct proc_maps_private *proc_priv = &numa_priv->proc_maps; |
2701 | struct vm_area_struct *vma = v; |
2702 | struct numa_maps *md = &numa_priv->md; |
2703 | struct file *file = vma->vm_file; |
2704 | struct mm_struct *mm = vma->vm_mm; |
2705 | char buffer[64]; |
2706 | struct mempolicy *pol; |
2707 | pgoff_t ilx; |
2708 | int nid; |
2709 | |
2710 | if (!mm) |
2711 | return 0; |
2712 | |
2713 | /* Ensure we start with an empty set of numa_maps statistics. */ |
2714 | memset(md, 0, sizeof(*md)); |
2715 | |
	pol = __get_vma_policy(vma, vma->vm_start, &ilx);
	if (pol) {
		mpol_to_str(buffer, sizeof(buffer), pol);
		mpol_cond_put(pol);
	} else {
		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
	}
2722 | } |
2723 | |
	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

	if (file) {
		seq_puts(m, " file=");
		seq_path(m, file_user_path(file), "\n\t= ");
	} else if (vma_is_initial_heap(vma)) {
		seq_puts(m, " heap");
	} else if (vma_is_initial_stack(vma)) {
		seq_puts(m, " stack");
	}

	if (is_vm_hugetlb_page(vma))
		seq_puts(m, " huge");
2737 | |
2738 | /* mmap_lock is held by m_start */ |
	walk_page_vma(vma, &show_numa_ops, md);
2740 | |
2741 | if (!md->pages) |
2742 | goto out; |
2743 | |
	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);

	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);

	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);

	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);

	for_each_node_state(nid, N_MEMORY)
		if (md->node[nid])
			seq_printf(m, " N%d=%lu", nid, md->node[nid]);

	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
	seq_putc(m, '\n');
2772 | return 0; |
2773 | } |
2774 | |
2775 | static const struct seq_operations proc_pid_numa_maps_op = { |
2776 | .start = m_start, |
2777 | .next = m_next, |
2778 | .stop = m_stop, |
2779 | .show = show_numa_map, |
2780 | }; |
2781 | |
2782 | static int pid_numa_maps_open(struct inode *inode, struct file *file) |
2783 | { |
	return proc_maps_open(inode, file, &proc_pid_numa_maps_op,
			sizeof(struct numa_maps_private));
2786 | } |
2787 | |
2788 | const struct file_operations proc_pid_numa_maps_operations = { |
2789 | .open = pid_numa_maps_open, |
2790 | .read = seq_read, |
2791 | .llseek = seq_lseek, |
2792 | .release = proc_map_release, |
2793 | }; |
2794 | |
2795 | #endif /* CONFIG_NUMA */ |
2796 | |