1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright 2002 Andi Kleen, SuSE Labs. |
4 | * Thanks to Ben LaHaise for precious feedback. |
5 | */ |
6 | #include <linux/highmem.h> |
7 | #include <linux/memblock.h> |
8 | #include <linux/sched.h> |
9 | #include <linux/mm.h> |
10 | #include <linux/interrupt.h> |
11 | #include <linux/seq_file.h> |
12 | #include <linux/proc_fs.h> |
13 | #include <linux/debugfs.h> |
14 | #include <linux/pfn.h> |
15 | #include <linux/percpu.h> |
16 | #include <linux/gfp.h> |
17 | #include <linux/pci.h> |
18 | #include <linux/vmalloc.h> |
19 | #include <linux/libnvdimm.h> |
20 | #include <linux/vmstat.h> |
21 | #include <linux/kernel.h> |
22 | #include <linux/cc_platform.h> |
23 | #include <linux/set_memory.h> |
24 | #include <linux/memregion.h> |
25 | |
26 | #include <asm/e820/api.h> |
27 | #include <asm/processor.h> |
28 | #include <asm/tlbflush.h> |
29 | #include <asm/sections.h> |
30 | #include <asm/setup.h> |
31 | #include <linux/uaccess.h> |
32 | #include <asm/pgalloc.h> |
33 | #include <asm/proto.h> |
34 | #include <asm/memtype.h> |
35 | #include <asm/hyperv-tlfs.h> |
36 | #include <asm/mshyperv.h> |
37 | |
38 | #include "../mm_internal.h" |
39 | |
40 | /* |
41 | * The current flushing context - we pass it instead of 5 arguments: |
42 | */ |
43 | struct cpa_data { |
44 | unsigned long *vaddr; |
45 | pgd_t *pgd; |
46 | pgprot_t mask_set; |
47 | pgprot_t mask_clr; |
48 | unsigned long numpages; |
49 | unsigned long curpage; |
50 | unsigned long pfn; |
51 | unsigned int flags; |
52 | unsigned int force_split : 1, |
53 | force_static_prot : 1, |
54 | force_flush_all : 1; |
55 | struct page **pages; |
56 | }; |
57 | |
58 | enum cpa_warn { |
59 | CPA_CONFLICT, |
60 | CPA_PROTECT, |
61 | CPA_DETECT, |
62 | }; |
63 | |
64 | static const int cpa_warn_level = CPA_PROTECT; |
65 | |
/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
 * with cpa_lock, so that no other CPU with stale large-page TLB entries can
 * change page attributes in parallel while another CPU is splitting a large
 * page entry and changing its attributes.
 */
72 | static DEFINE_SPINLOCK(cpa_lock); |
73 | |
74 | #define CPA_FLUSHTLB 1 |
75 | #define CPA_ARRAY 2 |
76 | #define CPA_PAGES_ARRAY 4 |
77 | #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ |
78 | |
79 | static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) |
80 | { |
81 | return __pgprot(cachemode2protval(pcm)); |
82 | } |
83 | |
84 | #ifdef CONFIG_PROC_FS |
85 | static unsigned long direct_pages_count[PG_LEVEL_NUM]; |
86 | |
87 | void update_page_count(int level, unsigned long pages) |
88 | { |
89 | /* Protect against CPA */ |
	spin_lock(&pgd_lock);
	direct_pages_count[level] += pages;
	spin_unlock(&pgd_lock);
93 | } |
94 | |
95 | static void split_page_count(int level) |
96 | { |
97 | if (direct_pages_count[level] == 0) |
98 | return; |
99 | |
100 | direct_pages_count[level]--; |
	if (system_state == SYSTEM_RUNNING) {
		if (level == PG_LEVEL_2M)
			count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
		else if (level == PG_LEVEL_1G)
			count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
	}
107 | direct_pages_count[level - 1] += PTRS_PER_PTE; |
108 | } |
109 | |
110 | void arch_report_meminfo(struct seq_file *m) |
111 | { |
	seq_printf(m, "DirectMap4k: %8lu kB\n",
			direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	seq_printf(m, "DirectMap2M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 11);
#else
	seq_printf(m, "DirectMap4M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 12);
#endif
	if (direct_gbpages)
		seq_printf(m, "DirectMap1G: %8lu kB\n",
			direct_pages_count[PG_LEVEL_1G] << 20);
124 | } |
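
/*
 * The shifts above convert a page count to kilobytes: a 4K page is 4 kB
 * (count << 2), a 2M page is 2048 kB (count << 11), a 4M page is 4096 kB
 * (count << 12) and a 1G page is 1048576 kB (count << 20). E.g. three
 * 1G mappings are reported as "DirectMap1G: 3145728 kB".
 */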
125 | #else |
126 | static inline void split_page_count(int level) { } |
127 | #endif |
128 | |
129 | #ifdef CONFIG_X86_CPA_STATISTICS |
130 | |
131 | static unsigned long cpa_1g_checked; |
132 | static unsigned long cpa_1g_sameprot; |
133 | static unsigned long cpa_1g_preserved; |
134 | static unsigned long cpa_2m_checked; |
135 | static unsigned long cpa_2m_sameprot; |
136 | static unsigned long cpa_2m_preserved; |
137 | static unsigned long cpa_4k_install; |
138 | |
139 | static inline void cpa_inc_1g_checked(void) |
140 | { |
141 | cpa_1g_checked++; |
142 | } |
143 | |
144 | static inline void cpa_inc_2m_checked(void) |
145 | { |
146 | cpa_2m_checked++; |
147 | } |
148 | |
149 | static inline void cpa_inc_4k_install(void) |
150 | { |
151 | data_race(cpa_4k_install++); |
152 | } |
153 | |
154 | static inline void cpa_inc_lp_sameprot(int level) |
155 | { |
156 | if (level == PG_LEVEL_1G) |
157 | cpa_1g_sameprot++; |
158 | else |
159 | cpa_2m_sameprot++; |
160 | } |
161 | |
162 | static inline void cpa_inc_lp_preserved(int level) |
163 | { |
164 | if (level == PG_LEVEL_1G) |
165 | cpa_1g_preserved++; |
166 | else |
167 | cpa_2m_preserved++; |
168 | } |
169 | |
170 | static int cpastats_show(struct seq_file *m, void *p) |
171 | { |
	seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
	seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
	seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
	seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
	seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
	seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
	seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
179 | return 0; |
180 | } |
181 | |
182 | static int cpastats_open(struct inode *inode, struct file *file) |
183 | { |
184 | return single_open(file, cpastats_show, NULL); |
185 | } |
186 | |
187 | static const struct file_operations cpastats_fops = { |
188 | .open = cpastats_open, |
189 | .read = seq_read, |
190 | .llseek = seq_lseek, |
191 | .release = single_release, |
192 | }; |
193 | |
194 | static int __init cpa_stats_init(void) |
195 | { |
	debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
			    &cpastats_fops);
198 | return 0; |
199 | } |
200 | late_initcall(cpa_stats_init); |
201 | #else |
202 | static inline void cpa_inc_1g_checked(void) { } |
203 | static inline void cpa_inc_2m_checked(void) { } |
204 | static inline void cpa_inc_4k_install(void) { } |
205 | static inline void cpa_inc_lp_sameprot(int level) { } |
206 | static inline void cpa_inc_lp_preserved(int level) { } |
207 | #endif |
208 | |
209 | |
210 | static inline int |
211 | within(unsigned long addr, unsigned long start, unsigned long end) |
212 | { |
213 | return addr >= start && addr < end; |
214 | } |
215 | |
216 | static inline int |
217 | within_inclusive(unsigned long addr, unsigned long start, unsigned long end) |
218 | { |
219 | return addr >= start && addr <= end; |
220 | } |
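
/*
 * within() treats 'end' as exclusive, within_inclusive() as inclusive:
 * within(0x1000, 0x1000, 0x2000) and within_inclusive(0x2000, 0x1000, 0x2000)
 * are both true, while within(0x2000, 0x1000, 0x2000) is false.
 */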
221 | |
222 | #ifdef CONFIG_X86_64 |
223 | |
224 | /* |
225 | * The kernel image is mapped into two places in the virtual address space |
226 | * (addresses without KASLR, of course): |
227 | * |
228 | * 1. The kernel direct map (0xffff880000000000) |
229 | * 2. The "high kernel map" (0xffffffff81000000) |
230 | * |
231 | * We actually execute out of #2. If we get the address of a kernel symbol, it |
232 | * points to #2, but almost all physical-to-virtual translations point to #1. |
233 | * |
234 | * This is so that we can have both a directmap of all physical memory *and* |
235 | * take full advantage of the limited (s32) immediate addressing range (2G) |
236 | * of x86_64. |
237 | * |
238 | * See Documentation/arch/x86/x86_64/mm.rst for more detail. |
239 | */ |
240 | |
241 | static inline unsigned long highmap_start_pfn(void) |
242 | { |
243 | return __pa_symbol(_text) >> PAGE_SHIFT; |
244 | } |
245 | |
246 | static inline unsigned long highmap_end_pfn(void) |
247 | { |
248 | /* Do not reference physical address outside the kernel. */ |
249 | return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; |
250 | } |
251 | |
252 | static bool __cpa_pfn_in_highmap(unsigned long pfn) |
253 | { |
254 | /* |
255 | * Kernel text has an alias mapping at a high address, known |
256 | * here as "highmap". |
257 | */ |
	return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
259 | } |
260 | |
261 | #else |
262 | |
263 | static bool __cpa_pfn_in_highmap(unsigned long pfn) |
264 | { |
265 | /* There is no highmap on 32-bit */ |
266 | return false; |
267 | } |
268 | |
269 | #endif |
270 | |
271 | /* |
272 | * See set_mce_nospec(). |
273 | * |
274 | * Machine check recovery code needs to change cache mode of poisoned pages to |
275 | * UC to avoid speculative access logging another error. But passing the |
276 | * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a |
277 | * speculative access. So we cheat and flip the top bit of the address. This |
278 | * works fine for the code that updates the page tables. But at the end of the |
279 | * process we need to flush the TLB and cache and the non-canonical address |
280 | * causes a #GP fault when used by the INVLPG and CLFLUSH instructions. |
281 | * |
282 | * But in the common case we already have a canonical address. This code |
283 | * will fix the top bit if needed and is a no-op otherwise. |
284 | */ |
285 | static inline unsigned long fix_addr(unsigned long addr) |
286 | { |
287 | #ifdef CONFIG_X86_64 |
288 | return (long)(addr << 1) >> 1; |
289 | #else |
290 | return addr; |
291 | #endif |
292 | } |
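
/*
 * Worked example (x86-64): with the top bit flipped, a direct map address
 * such as 0xffff880012345000 arrives here as 0x7fff880012345000, which is
 * not canonical. Shifting left by one drops bit 63, and the arithmetic
 * shift right sign-extends bit 62 back into bit 63, yielding
 * 0xffff880012345000 again. A canonical address has bits 63 and 62 equal,
 * so it passes through unchanged.
 */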
293 | |
294 | static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) |
295 | { |
296 | if (cpa->flags & CPA_PAGES_ARRAY) { |
297 | struct page *page = cpa->pages[idx]; |
298 | |
299 | if (unlikely(PageHighMem(page))) |
300 | return 0; |
301 | |
302 | return (unsigned long)page_address(page); |
303 | } |
304 | |
305 | if (cpa->flags & CPA_ARRAY) |
306 | return cpa->vaddr[idx]; |
307 | |
308 | return *cpa->vaddr + idx * PAGE_SIZE; |
309 | } |
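
/*
 * I.e. the index is interpreted according to cpa->flags: with
 * CPA_PAGES_ARRAY it indexes cpa->pages[], with CPA_ARRAY it indexes
 * cpa->vaddr[], otherwise it is a page offset from the single base
 * address *cpa->vaddr.
 */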
310 | |
311 | /* |
312 | * Flushing functions |
313 | */ |
314 | |
315 | static void clflush_cache_range_opt(void *vaddr, unsigned int size) |
316 | { |
317 | const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; |
318 | void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); |
319 | void *vend = vaddr + size; |
320 | |
321 | if (p >= vend) |
322 | return; |
323 | |
324 | for (; p < vend; p += clflush_size) |
		clflushopt(p);
326 | } |
327 | |
328 | /** |
329 | * clflush_cache_range - flush a cache range with clflush |
330 | * @vaddr: virtual start address |
331 | * @size: number of bytes to flush |
332 | * |
333 | * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or |
334 | * SFENCE to avoid ordering issues. |
335 | */ |
336 | void clflush_cache_range(void *vaddr, unsigned int size) |
337 | { |
338 | mb(); |
339 | clflush_cache_range_opt(vaddr, size); |
340 | mb(); |
341 | } |
342 | EXPORT_SYMBOL_GPL(clflush_cache_range); |
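
/*
 * Illustrative use only ('buf' and 'len' are hypothetical): after changing
 * the memory type of a buffer, its dirty cache lines can be written back
 * and invalidated with
 *
 *	clflush_cache_range(buf, len);
 *
 * The mb() calls above order the CLFLUSHOPTs against the caller's loads
 * and stores.
 */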
343 | |
344 | #ifdef CONFIG_ARCH_HAS_PMEM_API |
345 | void arch_invalidate_pmem(void *addr, size_t size) |
346 | { |
347 | clflush_cache_range(addr, size); |
348 | } |
349 | EXPORT_SYMBOL_GPL(arch_invalidate_pmem); |
350 | #endif |
351 | |
352 | #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION |
353 | bool cpu_cache_has_invalidate_memregion(void) |
354 | { |
355 | return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); |
356 | } |
357 | EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM); |
358 | |
359 | int cpu_cache_invalidate_memregion(int res_desc) |
360 | { |
361 | if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) |
362 | return -ENXIO; |
363 | wbinvd_on_all_cpus(); |
364 | return 0; |
365 | } |
366 | EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM); |
367 | #endif |
368 | |
369 | static void __cpa_flush_all(void *arg) |
370 | { |
371 | unsigned long cache = (unsigned long)arg; |
372 | |
373 | /* |
374 | * Flush all to work around Errata in early athlons regarding |
375 | * large page flushing. |
376 | */ |
377 | __flush_tlb_all(); |
378 | |
379 | if (cache && boot_cpu_data.x86 >= 4) |
380 | wbinvd(); |
381 | } |
382 | |
383 | static void cpa_flush_all(unsigned long cache) |
384 | { |
385 | BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); |
386 | |
	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
388 | } |
389 | |
390 | static void __cpa_flush_tlb(void *data) |
391 | { |
392 | struct cpa_data *cpa = data; |
393 | unsigned int i; |
394 | |
395 | for (i = 0; i < cpa->numpages; i++) |
		flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
397 | } |
398 | |
399 | static void cpa_flush(struct cpa_data *data, int cache) |
400 | { |
401 | struct cpa_data *cpa = data; |
402 | unsigned int i; |
403 | |
404 | BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); |
405 | |
406 | if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { |
407 | cpa_flush_all(cache); |
408 | return; |
409 | } |
410 | |
411 | if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) |
412 | flush_tlb_all(); |
413 | else |
		on_each_cpu(__cpa_flush_tlb, cpa, 1);
415 | |
416 | if (!cache) |
417 | return; |
418 | |
419 | mb(); |
	for (i = 0; i < cpa->numpages; i++) {
		unsigned long addr = __cpa_addr(cpa, i);
		unsigned int level;

		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
	}
432 | mb(); |
433 | } |
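
/*
 * Summary of the policy above: if caches must be flushed but CLFLUSH is
 * not available, fall back to a global TLB flush plus WBINVD on all CPUs.
 * Otherwise flush the TLB (globally for large ranges or when an alias was
 * touched, per page otherwise) and then CLFLUSHOPT every present 4K page
 * in the range.
 */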
434 | |
435 | static bool overlaps(unsigned long r1_start, unsigned long r1_end, |
436 | unsigned long r2_start, unsigned long r2_end) |
437 | { |
438 | return (r1_start <= r2_end && r1_end >= r2_start) || |
439 | (r2_start <= r1_end && r2_end >= r1_start); |
440 | } |
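
/*
 * Both range ends are inclusive: overlaps(0x0, 0x3, 0x3, 0x7) is true
 * because unit 0x3 is shared, overlaps(0x0, 0x2, 0x3, 0x7) is false.
 */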
441 | |
442 | #ifdef CONFIG_PCI_BIOS |
443 | /* |
444 | * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS |
445 | * based config access (CONFIG_PCI_GOBIOS) support. |
446 | */ |
447 | #define BIOS_PFN PFN_DOWN(BIOS_BEGIN) |
448 | #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) |
449 | |
450 | static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) |
451 | { |
452 | if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) |
453 | return _PAGE_NX; |
454 | return 0; |
455 | } |
456 | #else |
457 | static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) |
458 | { |
459 | return 0; |
460 | } |
461 | #endif |
462 | |
463 | /* |
464 | * The .rodata section needs to be read-only. Using the pfn catches all |
465 | * aliases. This also includes __ro_after_init, so do not enforce until |
466 | * kernel_set_to_readonly is true. |
467 | */ |
468 | static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) |
469 | { |
470 | unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); |
471 | |
	/*
	 * Note: __end_rodata is page aligned and not inclusive, so
	 * subtract 1 to get the last enforced PFN in the rodata area.
	 */
476 | epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; |
477 | |
	if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
479 | return _PAGE_RW; |
480 | return 0; |
481 | } |
482 | |
483 | /* |
484 | * Protect kernel text against becoming non executable by forbidding |
485 | * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) |
486 | * out of which the kernel actually executes. Do not protect the low |
487 | * mapping. |
488 | * |
489 | * This does not cover __inittext since that is gone after boot. |
490 | */ |
491 | static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) |
492 | { |
493 | unsigned long t_end = (unsigned long)_etext - 1; |
494 | unsigned long t_start = (unsigned long)_text; |
495 | |
	if (overlaps(start, end, t_start, t_end))
497 | return _PAGE_NX; |
498 | return 0; |
499 | } |
500 | |
501 | #if defined(CONFIG_X86_64) |
/*
 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
 * the kernel text mappings for the large page aligned text and rodata
 * sections will always be read-only. The kernel identity mappings
 * covering the holes caused by this alignment can be anything the user
 * asks for.
 *
 * This preserves the large page mappings for kernel text/data at no
 * extra cost.
 */
511 | static pgprotval_t protect_kernel_text_ro(unsigned long start, |
512 | unsigned long end) |
513 | { |
514 | unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; |
515 | unsigned long t_start = (unsigned long)_text; |
516 | unsigned int level; |
517 | |
	if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
519 | return 0; |
520 | /* |
521 | * Don't enforce the !RW mapping for the kernel text mapping, if |
522 | * the current mapping is already using small page mapping. No |
523 | * need to work hard to preserve large page mappings in this case. |
524 | * |
525 | * This also fixes the Linux Xen paravirt guest boot failure caused |
526 | * by unexpected read-only mappings for kernel identity |
527 | * mappings. In this paravirt guest case, the kernel text mapping |
528 | * and the kernel identity mapping share the same page-table pages, |
529 | * so the protections for kernel text and identity mappings have to |
530 | * be the same. |
531 | */ |
	if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
533 | return _PAGE_RW; |
534 | return 0; |
535 | } |
536 | #else |
537 | static pgprotval_t protect_kernel_text_ro(unsigned long start, |
538 | unsigned long end) |
539 | { |
540 | return 0; |
541 | } |
542 | #endif |
543 | |
544 | static inline bool conflicts(pgprot_t prot, pgprotval_t val) |
545 | { |
546 | return (pgprot_val(prot) & ~val) != pgprot_val(prot); |
547 | } |
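
/*
 * Example: a request for _PAGE_PRESENT | _PAGE_RW conflicts with a
 * forbidden value containing _PAGE_RW, because masking the forbidden bits
 * out of the request would change it.
 */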
548 | |
549 | static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, |
550 | unsigned long start, unsigned long end, |
551 | unsigned long pfn, const char *txt) |
552 | { |
553 | static const char *lvltxt[] = { |
554 | [CPA_CONFLICT] = "conflict" , |
555 | [CPA_PROTECT] = "protect" , |
556 | [CPA_DETECT] = "detect" , |
557 | }; |
558 | |
559 | if (warnlvl > cpa_warn_level || !conflicts(prot, val)) |
560 | return; |
561 | |
562 | pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n" , |
563 | lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), |
564 | (unsigned long long)val); |
565 | } |
566 | |
567 | /* |
568 | * Certain areas of memory on x86 require very specific protection flags, |
569 | * for example the BIOS area or kernel text. Callers don't always get this |
570 | * right (again, ioremap() on BIOS memory is not uncommon) so this function |
571 | * checks and fixes these known static required protection bits. |
572 | */ |
573 | static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, |
574 | unsigned long pfn, unsigned long npg, |
575 | unsigned long lpsize, int warnlvl) |
576 | { |
577 | pgprotval_t forbidden, res; |
578 | unsigned long end; |
579 | |
580 | /* |
581 | * There is no point in checking RW/NX conflicts when the requested |
582 | * mapping is setting the page !PRESENT. |
583 | */ |
584 | if (!(pgprot_val(prot) & _PAGE_PRESENT)) |
585 | return prot; |
586 | |
587 | /* Operate on the virtual address */ |
588 | end = start + npg * PAGE_SIZE - 1; |
589 | |
	res = protect_kernel_text(start, end);
	check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
	forbidden = res;

	/*
	 * Special case to preserve a large page. If the change spans the
	 * full large page mapping then there is no point in splitting it
	 * up. Happens with ftrace and is going to be removed once ftrace
	 * has switched to text_poke().
	 */
	if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) {
		res = protect_kernel_text_ro(start, end);
		check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
		forbidden |= res;
	}

	/* Check the PFN directly */
	res = protect_pci_bios(pfn, pfn + npg - 1);
	check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
	forbidden |= res;

	res = protect_rodata(pfn, pfn + npg - 1);
	check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
	forbidden |= res;
614 | |
615 | return __pgprot(pgprot_val(prot) & ~forbidden); |
616 | } |
617 | |
618 | /* |
619 | * Validate strict W^X semantics. |
620 | */ |
621 | static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, |
622 | unsigned long pfn, unsigned long npg) |
623 | { |
624 | unsigned long end; |
625 | |
626 | /* |
627 | * 32-bit has some unfixable W+X issues, like EFI code |
628 | * and writeable data being in the same page. Disable |
629 | * detection and enforcement there. |
630 | */ |
631 | if (IS_ENABLED(CONFIG_X86_32)) |
632 | return new; |
633 | |
634 | /* Only verify when NX is supported: */ |
635 | if (!(__supported_pte_mask & _PAGE_NX)) |
636 | return new; |
637 | |
638 | if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) |
639 | return new; |
640 | |
641 | if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) |
642 | return new; |
643 | |
644 | end = start + npg * PAGE_SIZE - 1; |
645 | WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n" , |
646 | (unsigned long long)pgprot_val(old), |
647 | (unsigned long long)pgprot_val(new), |
648 | start, end, pfn); |
649 | |
650 | /* |
651 | * For now, allow all permission change attempts by returning the |
652 | * attempted permissions. This can 'return old' to actively |
653 | * refuse the permission change at a later time. |
654 | */ |
655 | return new; |
656 | } |
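
/*
 * Example of what the warning above catches: a request that leaves a page
 * both writable and executable, e.g. setting _PAGE_RW on a mapping whose
 * _PAGE_NX bit is clear. Making an executable page writable or a writable
 * page executable both trip the check.
 */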
657 | |
658 | /* |
659 | * Lookup the page table entry for a virtual address in a specific pgd. |
660 | * Return a pointer to the entry and the level of the mapping. |
661 | */ |
662 | pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, |
663 | unsigned int *level) |
664 | { |
665 | p4d_t *p4d; |
666 | pud_t *pud; |
667 | pmd_t *pmd; |
668 | |
669 | *level = PG_LEVEL_NONE; |
670 | |
	if (pgd_none(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return NULL;

	*level = PG_LEVEL_512G;
	if (p4d_leaf(*p4d) || !p4d_present(*p4d))
		return (pte_t *)p4d;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return NULL;

	*level = PG_LEVEL_1G;
	if (pud_leaf(*pud) || !pud_present(*pud))
		return (pte_t *)pud;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		return NULL;

	*level = PG_LEVEL_2M;
	if (pmd_leaf(*pmd) || !pmd_present(*pmd))
		return (pte_t *)pmd;

	*level = PG_LEVEL_4K;

	return pte_offset_kernel(pmd, address);
701 | } |
702 | |
703 | /* |
704 | * Lookup the page table entry for a virtual address. Return a pointer |
705 | * to the entry and the level of the mapping. |
706 | * |
707 | * Note: We return pud and pmd either when the entry is marked large |
708 | * or when the present bit is not set. Otherwise we would return a |
709 | * pointer to a nonexisting mapping. |
710 | */ |
711 | pte_t *lookup_address(unsigned long address, unsigned int *level) |
712 | { |
713 | return lookup_address_in_pgd(pgd_offset_k(address), address, level); |
714 | } |
715 | EXPORT_SYMBOL_GPL(lookup_address); |
716 | |
717 | static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, |
718 | unsigned int *level) |
719 | { |
720 | if (cpa->pgd) |
		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
					     address, level);
723 | |
724 | return lookup_address(address, level); |
725 | } |
726 | |
727 | /* |
728 | * Lookup the PMD entry for a virtual address. Return a pointer to the entry |
729 | * or NULL if not present. |
730 | */ |
731 | pmd_t *lookup_pmd_address(unsigned long address) |
732 | { |
733 | pgd_t *pgd; |
734 | p4d_t *p4d; |
735 | pud_t *pud; |
736 | |
737 | pgd = pgd_offset_k(address); |
	if (pgd_none(*pgd))
		return NULL;

	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d))
		return NULL;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
		return NULL;

	return pmd_offset(pud, address);
750 | } |
751 | |
752 | /* |
753 | * This is necessary because __pa() does not work on some |
754 | * kinds of memory, like vmalloc() or the alloc_remap() |
755 | * areas on 32-bit NUMA systems. The percpu areas can |
756 | * end up in this kind of memory, for instance. |
757 | * |
758 | * Note that as long as the PTEs are well-formed with correct PFNs, this |
759 | * works without checking the PRESENT bit in the leaf PTE. This is unlike |
760 | * the similar vmalloc_to_page() and derivatives. Callers may depend on |
761 | * this behavior. |
762 | * |
763 | * This could be optimized, but it is only used in paths that are not perf |
764 | * sensitive, and keeping it unoptimized should increase the testing coverage |
765 | * for the more obscure platforms. |
766 | */ |
767 | phys_addr_t slow_virt_to_phys(void *__virt_addr) |
768 | { |
769 | unsigned long virt_addr = (unsigned long)__virt_addr; |
770 | phys_addr_t phys_addr; |
771 | unsigned long offset; |
772 | enum pg_level level; |
773 | pte_t *pte; |
774 | |
775 | pte = lookup_address(virt_addr, &level); |
776 | BUG_ON(!pte); |
777 | |
778 | /* |
779 | * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t |
780 | * before being left-shifted PAGE_SHIFT bits -- this trick is to |
781 | * make 32-PAE kernel work correctly. |
782 | */ |
	switch (level) {
	case PG_LEVEL_1G:
		phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
		offset = virt_addr & ~PUD_MASK;
		break;
	case PG_LEVEL_2M:
		phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
		offset = virt_addr & ~PMD_MASK;
		break;
	default:
		phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
		offset = virt_addr & ~PAGE_MASK;
	}
796 | |
797 | return (phys_addr_t)(phys_addr | offset); |
798 | } |
799 | EXPORT_SYMBOL_GPL(slow_virt_to_phys); |
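
/*
 * Illustrative use only (the variable name is hypothetical): a per-CPU
 * object may live in memory where __pa() is not valid, but its physical
 * address can still be obtained with
 *
 *	phys_addr_t pa = slow_virt_to_phys(this_cpu_ptr(&some_pcpu_var));
 *
 * as long as the address is mapped in the kernel page tables.
 */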
800 | |
801 | /* |
802 | * Set the new pmd in all the pgds we know about: |
803 | */ |
804 | static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) |
805 | { |
806 | /* change init_mm */ |
	set_pte_atomic(kpte, pte);
808 | #ifdef CONFIG_X86_32 |
809 | if (!SHARED_KERNEL_PMD) { |
810 | struct page *page; |
811 | |
812 | list_for_each_entry(page, &pgd_list, lru) { |
813 | pgd_t *pgd; |
814 | p4d_t *p4d; |
815 | pud_t *pud; |
816 | pmd_t *pmd; |
817 | |
818 | pgd = (pgd_t *)page_address(page) + pgd_index(address); |
819 | p4d = p4d_offset(pgd, address); |
820 | pud = pud_offset(p4d, address); |
821 | pmd = pmd_offset(pud, address); |
822 | set_pte_atomic((pte_t *)pmd, pte); |
823 | } |
824 | } |
825 | #endif |
826 | } |
827 | |
828 | static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) |
829 | { |
830 | /* |
831 | * _PAGE_GLOBAL means "global page" for present PTEs. |
832 | * But, it is also used to indicate _PAGE_PROTNONE |
833 | * for non-present PTEs. |
834 | * |
835 | * This ensures that a _PAGE_GLOBAL PTE going from |
836 | * present to non-present is not confused as |
837 | * _PAGE_PROTNONE. |
838 | */ |
839 | if (!(pgprot_val(prot) & _PAGE_PRESENT)) |
840 | pgprot_val(prot) &= ~_PAGE_GLOBAL; |
841 | |
842 | return prot; |
843 | } |
844 | |
845 | static int __should_split_large_page(pte_t *kpte, unsigned long address, |
846 | struct cpa_data *cpa) |
847 | { |
848 | unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; |
849 | pgprot_t old_prot, new_prot, req_prot, chk_prot; |
850 | pte_t new_pte, *tmp; |
851 | enum pg_level level; |
852 | |
853 | /* |
854 | * Check for races, another CPU might have split this page |
855 | * up already: |
856 | */ |
	tmp = _lookup_address_cpa(cpa, address, &level);
	if (tmp != kpte)
		return 1;

	switch (level) {
	case PG_LEVEL_2M:
		old_prot = pmd_pgprot(*(pmd_t *)kpte);
		old_pfn = pmd_pfn(*(pmd_t *)kpte);
		cpa_inc_2m_checked();
		break;
	case PG_LEVEL_1G:
		old_prot = pud_pgprot(*(pud_t *)kpte);
		old_pfn = pud_pfn(*(pud_t *)kpte);
		cpa_inc_1g_checked();
871 | break; |
872 | default: |
873 | return -EINVAL; |
874 | } |
875 | |
876 | psize = page_level_size(level); |
877 | pmask = page_level_mask(level); |
878 | |
879 | /* |
880 | * Calculate the number of pages, which fit into this large |
881 | * page starting at address: |
882 | */ |
883 | lpaddr = (address + psize) & pmask; |
884 | numpages = (lpaddr - address) >> PAGE_SHIFT; |
885 | if (numpages < cpa->numpages) |
886 | cpa->numpages = numpages; |
887 | |
888 | /* |
889 | * We are safe now. Check whether the new pgprot is the same: |
890 | * Convert protection attributes to 4k-format, as cpa->mask* are set |
891 | * up accordingly. |
892 | */ |
893 | |
894 | /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ |
	req_prot = pgprot_large_2_4k(old_prot);

	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);

	/*
	 * req_prot is in the 4k-page format. It must be converted to the
	 * large page format: the caching mode includes the PAT bit located
	 * at different bit positions in the two formats.
	 */
	req_prot = pgprot_4k_2_large(req_prot);
	req_prot = pgprot_clear_protnone_bits(req_prot);
907 | if (pgprot_val(req_prot) & _PAGE_PRESENT) |
908 | pgprot_val(req_prot) |= _PAGE_PSE; |
909 | |
910 | /* |
911 | * old_pfn points to the large page base pfn. So we need to add the |
912 | * offset of the virtual address: |
913 | */ |
914 | pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); |
915 | cpa->pfn = pfn; |
916 | |
917 | /* |
918 | * Calculate the large page base address and the number of 4K pages |
919 | * in the large page |
920 | */ |
921 | lpaddr = address & pmask; |
922 | numpages = psize >> PAGE_SHIFT; |
923 | |
924 | /* |
925 | * Sanity check that the existing mapping is correct versus the static |
926 | * protections. static_protections() guards against !PRESENT, so no |
927 | * extra conditional required here. |
928 | */ |
	chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
				      psize, CPA_CONFLICT);
931 | |
932 | if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { |
933 | /* |
934 | * Split the large page and tell the split code to |
935 | * enforce static protections. |
936 | */ |
937 | cpa->force_static_prot = 1; |
938 | return 1; |
939 | } |
940 | |
941 | /* |
942 | * Optimization: If the requested pgprot is the same as the current |
943 | * pgprot, then the large page can be preserved and no updates are |
944 | * required independent of alignment and length of the requested |
945 | * range. The above already established that the current pgprot is |
946 | * correct, which in consequence makes the requested pgprot correct |
947 | * as well if it is the same. The static protection scan below will |
948 | * not come to a different conclusion. |
949 | */ |
950 | if (pgprot_val(req_prot) == pgprot_val(old_prot)) { |
951 | cpa_inc_lp_sameprot(level); |
952 | return 0; |
953 | } |
954 | |
955 | /* |
956 | * If the requested range does not cover the full page, split it up |
957 | */ |
958 | if (address != lpaddr || cpa->numpages != numpages) |
959 | return 1; |
960 | |
961 | /* |
962 | * Check whether the requested pgprot is conflicting with a static |
963 | * protection requirement in the large page. |
964 | */ |
	new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
				      psize, CPA_DETECT);

	new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
969 | |
970 | /* |
971 | * If there is a conflict, split the large page. |
972 | * |
973 | * There used to be a 4k wise evaluation trying really hard to |
974 | * preserve the large pages, but experimentation has shown, that this |
975 | * does not help at all. There might be corner cases which would |
976 | * preserve one large page occasionally, but it's really not worth the |
977 | * extra code and cycles for the common case. |
978 | */ |
979 | if (pgprot_val(req_prot) != pgprot_val(new_prot)) |
980 | return 1; |
981 | |
982 | /* All checks passed. Update the large page mapping. */ |
	new_pte = pfn_pte(old_pfn, new_prot);
	__set_pmd_pte(kpte, address, new_pte);
985 | cpa->flags |= CPA_FLUSHTLB; |
986 | cpa_inc_lp_preserved(level); |
987 | return 0; |
988 | } |
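
/*
 * Return value summary for the function above: 1 tells the caller to go
 * ahead and split (or retry after a race), 0 means the large page was
 * preserved (possibly updated in place), and a negative value is an error.
 */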
989 | |
990 | static int should_split_large_page(pte_t *kpte, unsigned long address, |
991 | struct cpa_data *cpa) |
992 | { |
993 | int do_split; |
994 | |
995 | if (cpa->force_split) |
996 | return 1; |
997 | |
	spin_lock(&pgd_lock);
	do_split = __should_split_large_page(kpte, address, cpa);
	spin_unlock(&pgd_lock);
1001 | |
1002 | return do_split; |
1003 | } |
1004 | |
1005 | static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, |
1006 | pgprot_t ref_prot, unsigned long address, |
1007 | unsigned long size) |
1008 | { |
1009 | unsigned int npg = PFN_DOWN(size); |
1010 | pgprot_t prot; |
1011 | |
1012 | /* |
1013 | * If should_split_large_page() discovered an inconsistent mapping, |
1014 | * remove the invalid protection in the split mapping. |
1015 | */ |
1016 | if (!cpa->force_static_prot) |
1017 | goto set; |
1018 | |
1019 | /* Hand in lpsize = 0 to enforce the protection mechanism */ |
	prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT);
1021 | |
1022 | if (pgprot_val(prot) == pgprot_val(ref_prot)) |
1023 | goto set; |
1024 | |
1025 | /* |
1026 | * If this is splitting a PMD, fix it up. PUD splits cannot be |
1027 | * fixed trivially as that would require to rescan the newly |
1028 | * installed PMD mappings after returning from split_large_page() |
1029 | * so an eventual further split can allocate the necessary PTE |
1030 | * pages. Warn for now and revisit it in case this actually |
1031 | * happens. |
1032 | */ |
1033 | if (size == PAGE_SIZE) |
1034 | ref_prot = prot; |
1035 | else |
1036 | pr_warn_once("CPA: Cannot fixup static protections for PUD split\n" ); |
1037 | set: |
	set_pte(pte, pfn_pte(pfn, ref_prot));
1039 | } |
1040 | |
1041 | static int |
1042 | __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, |
1043 | struct page *base) |
1044 | { |
1045 | unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; |
1046 | pte_t *pbase = (pte_t *)page_address(base); |
1047 | unsigned int i, level; |
1048 | pgprot_t ref_prot; |
1049 | pte_t *tmp; |
1050 | |
	spin_lock(&pgd_lock);
	/*
	 * Check for races, another CPU might have split this page
	 * up for us already:
	 */
	tmp = _lookup_address_cpa(cpa, address, &level);
	if (tmp != kpte) {
		spin_unlock(&pgd_lock);
		return 1;
	}

	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
1063 | |
	switch (level) {
	case PG_LEVEL_2M:
		ref_prot = pmd_pgprot(*(pmd_t *)kpte);
		/*
		 * Clear PSE (aka _PAGE_PAT) and move
		 * PAT bit to correct position.
		 */
		ref_prot = pgprot_large_2_4k(ref_prot);
		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
		lpaddr = address & PMD_MASK;
		lpinc = PAGE_SIZE;
		break;

	case PG_LEVEL_1G:
		ref_prot = pud_pgprot(*(pud_t *)kpte);
		ref_pfn = pud_pfn(*(pud_t *)kpte);
		pfninc = PMD_SIZE >> PAGE_SHIFT;
		lpaddr = address & PUD_MASK;
		lpinc = PMD_SIZE;
		/*
		 * Clear the PSE flag if the PRESENT flag is not set,
		 * otherwise pmd_present()/pmd_huge() would return true
		 * even for a non-present pmd.
		 */
		if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
			pgprot_val(ref_prot) &= ~_PAGE_PSE;
		break;

	default:
		spin_unlock(&pgd_lock);
		return 1;
	}
1096 | |
	ref_prot = pgprot_clear_protnone_bits(ref_prot);

	/*
	 * Get the target pfn from the original entry:
	 */
	pfn = ref_pfn;
	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
		split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);

	if (virt_addr_valid(address)) {
		unsigned long pfn = PFN_DOWN(__pa(address));

		if (pfn_range_is_mapped(pfn, pfn + 1))
			split_page_count(level);
	}
1111 | } |
1112 | |
1113 | /* |
1114 | * Install the new, split up pagetable. |
1115 | * |
1116 | * We use the standard kernel pagetable protections for the new |
1117 | * pagetable protections, the actual ptes set above control the |
1118 | * primary protection behavior: |
1119 | */ |
1120 | __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); |
1121 | |
1122 | /* |
1123 | * Do a global flush tlb after splitting the large page |
1124 | * and before we do the actual change page attribute in the PTE. |
1125 | * |
1126 | * Without this, we violate the TLB application note, that says: |
1127 | * "The TLBs may contain both ordinary and large-page |
1128 | * translations for a 4-KByte range of linear addresses. This |
1129 | * may occur if software modifies the paging structures so that |
1130 | * the page size used for the address range changes. If the two |
1131 | * translations differ with respect to page frame or attributes |
1132 | * (e.g., permissions), processor behavior is undefined and may |
1133 | * be implementation-specific." |
1134 | * |
1135 | * We do this global tlb flush inside the cpa_lock, so that we |
1136 | * don't allow any other cpu, with stale tlb entries change the |
1137 | * page attribute in parallel, that also falls into the |
1138 | * just split large page entry. |
1139 | */ |
1140 | flush_tlb_all(); |
	spin_unlock(&pgd_lock);
1142 | |
1143 | return 0; |
1144 | } |
1145 | |
1146 | static int split_large_page(struct cpa_data *cpa, pte_t *kpte, |
1147 | unsigned long address) |
1148 | { |
1149 | struct page *base; |
1150 | |
	if (!debug_pagealloc_enabled())
		spin_unlock(&cpa_lock);
	base = alloc_pages(GFP_KERNEL, 0);
	if (!debug_pagealloc_enabled())
		spin_lock(&cpa_lock);
1156 | if (!base) |
1157 | return -ENOMEM; |
1158 | |
1159 | if (__split_large_page(cpa, kpte, address, base)) |
1160 | __free_page(base); |
1161 | |
1162 | return 0; |
1163 | } |
1164 | |
1165 | static bool try_to_free_pte_page(pte_t *pte) |
1166 | { |
1167 | int i; |
1168 | |
1169 | for (i = 0; i < PTRS_PER_PTE; i++) |
		if (!pte_none(pte[i]))
1171 | return false; |
1172 | |
1173 | free_page((unsigned long)pte); |
1174 | return true; |
1175 | } |
1176 | |
1177 | static bool try_to_free_pmd_page(pmd_t *pmd) |
1178 | { |
1179 | int i; |
1180 | |
1181 | for (i = 0; i < PTRS_PER_PMD; i++) |
		if (!pmd_none(pmd[i]))
1183 | return false; |
1184 | |
1185 | free_page((unsigned long)pmd); |
1186 | return true; |
1187 | } |
1188 | |
1189 | static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) |
1190 | { |
	pte_t *pte = pte_offset_kernel(pmd, start);

	while (start < end) {
		set_pte(pte, __pte(0));

		start += PAGE_SIZE;
		pte++;
	}

	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
		pmd_clear(pmd);
		return true;
	}
	return false;
}

static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
			      unsigned long start, unsigned long end)
{
	if (unmap_pte_range(pmd, start, end))
		if (try_to_free_pmd_page(pud_pgtable(*pud)))
			pud_clear(pud);
1213 | } |
1214 | |
1215 | static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
1216 | { |
	pmd_t *pmd = pmd_offset(pud, start);
1218 | |
1219 | /* |
1220 | * Not on a 2MB page boundary? |
1221 | */ |
1222 | if (start & (PMD_SIZE - 1)) { |
1223 | unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; |
1224 | unsigned long pre_end = min_t(unsigned long, end, next_page); |
1225 | |
		__unmap_pmd_range(pud, pmd, start, pre_end);
1227 | |
1228 | start = pre_end; |
1229 | pmd++; |
1230 | } |
1231 | |
1232 | /* |
1233 | * Try to unmap in 2M chunks. |
1234 | */ |
	while (end - start >= PMD_SIZE) {
		if (pmd_leaf(*pmd))
			pmd_clear(pmd);
		else
			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);

		start += PMD_SIZE;
		pmd++;
	}

	/*
	 * 4K leftovers?
	 */
	if (start < end)
		return __unmap_pmd_range(pud, pmd, start, end);

	/*
	 * Try again to free the PMD page if we haven't succeeded above.
	 */
	if (!pud_none(*pud))
		if (try_to_free_pmd_page(pud_pgtable(*pud)))
			pud_clear(pud);
1257 | } |
1258 | |
1259 | static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) |
1260 | { |
	pud_t *pud = pud_offset(p4d, start);
1262 | |
1263 | /* |
1264 | * Not on a GB page boundary? |
1265 | */ |
1266 | if (start & (PUD_SIZE - 1)) { |
1267 | unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; |
1268 | unsigned long pre_end = min_t(unsigned long, end, next_page); |
1269 | |
		unmap_pmd_range(pud, start, pre_end);
1271 | |
1272 | start = pre_end; |
1273 | pud++; |
1274 | } |
1275 | |
	/*
	 * Try to unmap in 1G chunks.
	 */
	while (end - start >= PUD_SIZE) {

		if (pud_leaf(*pud))
			pud_clear(pud);
		else
			unmap_pmd_range(pud, start, start + PUD_SIZE);
1285 | |
1286 | start += PUD_SIZE; |
1287 | pud++; |
1288 | } |
1289 | |
1290 | /* |
1291 | * 2M leftovers? |
1292 | */ |
1293 | if (start < end) |
		unmap_pmd_range(pud, start, end);
1295 | |
1296 | /* |
1297 | * No need to try to free the PUD page because we'll free it in |
1298 | * populate_pgd's error path |
1299 | */ |
1300 | } |
1301 | |
1302 | static int alloc_pte_page(pmd_t *pmd) |
1303 | { |
1304 | pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); |
1305 | if (!pte) |
1306 | return -1; |
1307 | |
	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
1309 | return 0; |
1310 | } |
1311 | |
1312 | static int alloc_pmd_page(pud_t *pud) |
1313 | { |
1314 | pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
1315 | if (!pmd) |
1316 | return -1; |
1317 | |
	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
1319 | return 0; |
1320 | } |
1321 | |
1322 | static void populate_pte(struct cpa_data *cpa, |
1323 | unsigned long start, unsigned long end, |
1324 | unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) |
1325 | { |
1326 | pte_t *pte; |
1327 | |
	pte = pte_offset_kernel(pmd, start);

	pgprot = pgprot_clear_protnone_bits(pgprot);

	while (num_pages-- && start < end) {
		set_pte(pte, pfn_pte(cpa->pfn, pgprot));

		start += PAGE_SIZE;
		cpa->pfn++;
		pte++;
	}
1339 | } |
1340 | |
1341 | static long populate_pmd(struct cpa_data *cpa, |
1342 | unsigned long start, unsigned long end, |
1343 | unsigned num_pages, pud_t *pud, pgprot_t pgprot) |
1344 | { |
1345 | long cur_pages = 0; |
1346 | pmd_t *pmd; |
1347 | pgprot_t pmd_pgprot; |
1348 | |
1349 | /* |
1350 | * Not on a 2M boundary? |
1351 | */ |
1352 | if (start & (PMD_SIZE - 1)) { |
1353 | unsigned long pre_end = start + (num_pages << PAGE_SHIFT); |
1354 | unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; |
1355 | |
1356 | pre_end = min_t(unsigned long, pre_end, next_page); |
1357 | cur_pages = (pre_end - start) >> PAGE_SHIFT; |
1358 | cur_pages = min_t(unsigned int, num_pages, cur_pages); |
1359 | |
1360 | /* |
1361 | * Need a PTE page? |
1362 | */ |
		pmd = pmd_offset(pud, start);
		if (pmd_none(*pmd))
			if (alloc_pte_page(pmd))
				return -1;

		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
1369 | |
1370 | start = pre_end; |
1371 | } |
1372 | |
1373 | /* |
1374 | * We mapped them all? |
1375 | */ |
1376 | if (num_pages == cur_pages) |
1377 | return cur_pages; |
1378 | |
1379 | pmd_pgprot = pgprot_4k_2_large(pgprot); |
1380 | |
1381 | while (end - start >= PMD_SIZE) { |
1382 | |
1383 | /* |
1384 | * We cannot use a 1G page so allocate a PMD page if needed. |
1385 | */ |
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		pmd = pmd_offset(pud, start);

		set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
					canon_pgprot(pmd_pgprot))));
1394 | |
1395 | start += PMD_SIZE; |
1396 | cpa->pfn += PMD_SIZE >> PAGE_SHIFT; |
1397 | cur_pages += PMD_SIZE >> PAGE_SHIFT; |
1398 | } |
1399 | |
1400 | /* |
1401 | * Map trailing 4K pages. |
1402 | */ |
	if (start < end) {
		pmd = pmd_offset(pud, start);
		if (pmd_none(*pmd))
			if (alloc_pte_page(pmd))
				return -1;

		populate_pte(cpa, start, end, num_pages - cur_pages,
			     pmd, pgprot);
1411 | } |
1412 | return num_pages; |
1413 | } |
1414 | |
1415 | static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, |
1416 | pgprot_t pgprot) |
1417 | { |
1418 | pud_t *pud; |
1419 | unsigned long end; |
1420 | long cur_pages = 0; |
1421 | pgprot_t pud_pgprot; |
1422 | |
1423 | end = start + (cpa->numpages << PAGE_SHIFT); |
1424 | |
1425 | /* |
1426 | * Not on a Gb page boundary? => map everything up to it with |
1427 | * smaller pages. |
1428 | */ |
1429 | if (start & (PUD_SIZE - 1)) { |
1430 | unsigned long pre_end; |
1431 | unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; |
1432 | |
1433 | pre_end = min_t(unsigned long, end, next_page); |
1434 | cur_pages = (pre_end - start) >> PAGE_SHIFT; |
1435 | cur_pages = min_t(int, (int)cpa->numpages, cur_pages); |
1436 | |
		pud = pud_offset(p4d, start);

		/*
		 * Need a PMD page?
		 */
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
					 pud, pgprot);
1448 | if (cur_pages < 0) |
1449 | return cur_pages; |
1450 | |
1451 | start = pre_end; |
1452 | } |
1453 | |
1454 | /* We mapped them all? */ |
1455 | if (cpa->numpages == cur_pages) |
1456 | return cur_pages; |
1457 | |
	pud = pud_offset(p4d, start);
1459 | pud_pgprot = pgprot_4k_2_large(pgprot); |
1460 | |
1461 | /* |
1462 | * Map everything starting from the Gb boundary, possibly with 1G pages |
1463 | */ |
	while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
		set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
				   canon_pgprot(pud_pgprot))));
1467 | |
1468 | start += PUD_SIZE; |
1469 | cpa->pfn += PUD_SIZE >> PAGE_SHIFT; |
1470 | cur_pages += PUD_SIZE >> PAGE_SHIFT; |
1471 | pud++; |
1472 | } |
1473 | |
1474 | /* Map trailing leftover */ |
1475 | if (start < end) { |
1476 | long tmp; |
1477 | |
		pud = pud_offset(p4d, start);
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
				   pud, pgprot);
1485 | if (tmp < 0) |
1486 | return cur_pages; |
1487 | |
1488 | cur_pages += tmp; |
1489 | } |
1490 | return cur_pages; |
1491 | } |
1492 | |
1493 | /* |
1494 | * Restrictions for kernel page table do not necessarily apply when mapping in |
1495 | * an alternate PGD. |
1496 | */ |
1497 | static int populate_pgd(struct cpa_data *cpa, unsigned long addr) |
1498 | { |
1499 | pgprot_t pgprot = __pgprot(_KERNPG_TABLE); |
1500 | pud_t *pud = NULL; /* shut up gcc */ |
1501 | p4d_t *p4d; |
1502 | pgd_t *pgd_entry; |
1503 | long ret; |
1504 | |
1505 | pgd_entry = cpa->pgd + pgd_index(addr); |
1506 | |
	if (pgd_none(*pgd_entry)) {
1508 | p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); |
1509 | if (!p4d) |
1510 | return -1; |
1511 | |
1512 | set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); |
1513 | } |
1514 | |
1515 | /* |
1516 | * Allocate a PUD page and hand it down for mapping. |
1517 | */ |
	p4d = p4d_offset(pgd_entry, addr);
	if (p4d_none(*p4d)) {
		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
		if (!pud)
			return -1;

		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
1525 | } |
1526 | |
1527 | pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); |
1528 | pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); |
1529 | |
	ret = populate_pud(cpa, addr, p4d, pgprot);
1531 | if (ret < 0) { |
1532 | /* |
1533 | * Leave the PUD page in place in case some other CPU or thread |
1534 | * already found it, but remove any useless entries we just |
1535 | * added to it. |
1536 | */ |
		unmap_pud_range(p4d, addr,
				addr + (cpa->numpages << PAGE_SHIFT));
1539 | return ret; |
1540 | } |
1541 | |
1542 | cpa->numpages = ret; |
1543 | return 0; |
1544 | } |
1545 | |
1546 | static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, |
1547 | int primary) |
1548 | { |
1549 | if (cpa->pgd) { |
1550 | /* |
1551 | * Right now, we only execute this code path when mapping |
1552 | * the EFI virtual memory map regions, no other users |
1553 | * provide a ->pgd value. This may change in the future. |
1554 | */ |
		return populate_pgd(cpa, vaddr);
1556 | } |
1557 | |
1558 | /* |
1559 | * Ignore all non primary paths. |
1560 | */ |
1561 | if (!primary) { |
1562 | cpa->numpages = 1; |
1563 | return 0; |
1564 | } |
1565 | |
1566 | /* |
1567 | * Ignore the NULL PTE for kernel identity mapping, as it is expected |
1568 | * to have holes. |
1569 | * Also set numpages to '1' indicating that we processed cpa req for |
1570 | * one virtual address page and its pfn. TBD: numpages can be set based |
1571 | * on the initial value and the level returned by lookup_address(). |
1572 | */ |
	if (within(vaddr, PAGE_OFFSET,
		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
		cpa->numpages = 1;
		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
		return 0;

	} else if (__cpa_pfn_in_highmap(cpa->pfn)) {
		/* Faults in the highmap are OK, so do not warn: */
		return -EFAULT;
	} else {
		WARN(1, KERN_WARNING "CPA: called for zero pte. "
			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
			*cpa->vaddr);
1586 | |
1587 | return -EFAULT; |
1588 | } |
1589 | } |
1590 | |
1591 | static int __change_page_attr(struct cpa_data *cpa, int primary) |
1592 | { |
1593 | unsigned long address; |
1594 | int do_split, err; |
1595 | unsigned int level; |
1596 | pte_t *kpte, old_pte; |
1597 | |
	address = __cpa_addr(cpa, cpa->curpage);
repeat:
	kpte = _lookup_address_cpa(cpa, address, &level);
	if (!kpte)
		return __cpa_process_fault(cpa, address, primary);

	old_pte = *kpte;
	if (pte_none(old_pte))
		return __cpa_process_fault(cpa, address, primary);
1607 | |
1608 | if (level == PG_LEVEL_4K) { |
1609 | pte_t new_pte; |
1610 | pgprot_t old_prot = pte_pgprot(old_pte); |
1611 | pgprot_t new_prot = pte_pgprot(old_pte); |
		unsigned long pfn = pte_pfn(old_pte);
1613 | |
1614 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); |
1615 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); |
1616 | |
1617 | cpa_inc_4k_install(); |
1618 | /* Hand in lpsize = 0 to enforce the protection mechanism */ |
		new_prot = static_protections(new_prot, address, pfn, 1, 0,
					      CPA_PROTECT);

		new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);

		new_prot = pgprot_clear_protnone_bits(new_prot);
1625 | |
1626 | /* |
1627 | * We need to keep the pfn from the existing PTE, |
1628 | * after all we're only going to change its attributes |
1629 | * not the memory it points to |
1630 | */ |
		new_pte = pfn_pte(pfn, new_prot);
1632 | cpa->pfn = pfn; |
1633 | /* |
1634 | * Do we really change anything ? |
1635 | */ |
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
1638 | cpa->flags |= CPA_FLUSHTLB; |
1639 | } |
1640 | cpa->numpages = 1; |
1641 | return 0; |
1642 | } |
1643 | |
1644 | /* |
1645 | * Check, whether we can keep the large page intact |
1646 | * and just change the pte: |
1647 | */ |
1648 | do_split = should_split_large_page(kpte, address, cpa); |
	/*
	 * When the range fits into the existing large page,
	 * return. cpa->numpages and cpa->flags have been updated in
	 * __should_split_large_page():
	 */
1654 | if (do_split <= 0) |
1655 | return do_split; |
1656 | |
1657 | /* |
1658 | * We have to split the large page: |
1659 | */ |
1660 | err = split_large_page(cpa, kpte, address); |
1661 | if (!err) |
1662 | goto repeat; |
1663 | |
1664 | return err; |
1665 | } |
1666 | |
1667 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); |
1668 | |
1669 | /* |
1670 | * Check the directmap and "high kernel map" 'aliases'. |
1671 | */ |
1672 | static int cpa_process_alias(struct cpa_data *cpa) |
1673 | { |
1674 | struct cpa_data alias_cpa; |
1675 | unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); |
1676 | unsigned long vaddr; |
1677 | int ret; |
1678 | |
	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1680 | return 0; |
1681 | |
1682 | /* |
1683 | * No need to redo, when the primary call touched the direct |
1684 | * mapping already: |
1685 | */ |
	vaddr = __cpa_addr(cpa, cpa->curpage);
	if (!(within(vaddr, PAGE_OFFSET,
		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1689 | |
1690 | alias_cpa = *cpa; |
1691 | alias_cpa.vaddr = &laddr; |
1692 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); |
1693 | alias_cpa.curpage = 0; |
1694 | |
1695 | /* Directmap always has NX set, do not modify. */ |
1696 | if (__supported_pte_mask & _PAGE_NX) { |
1697 | alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; |
1698 | alias_cpa.mask_set.pgprot &= ~_PAGE_NX; |
1699 | } |
1700 | |
1701 | cpa->force_flush_all = 1; |
1702 | |
		ret = __change_page_attr_set_clr(&alias_cpa, 0);
1704 | if (ret) |
1705 | return ret; |
1706 | } |
1707 | |
1708 | #ifdef CONFIG_X86_64 |
1709 | /* |
1710 | * If the primary call didn't touch the high mapping already |
1711 | * and the physical address is inside the kernel map, we need |
1712 | * to touch the high mapped kernel as well: |
1713 | */ |
	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
	    __cpa_pfn_in_highmap(cpa->pfn)) {
1716 | unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + |
1717 | __START_KERNEL_map - phys_base; |
1718 | alias_cpa = *cpa; |
1719 | alias_cpa.vaddr = &temp_cpa_vaddr; |
1720 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); |
1721 | alias_cpa.curpage = 0; |
1722 | |
1723 | /* |
1724 | * [_text, _brk_end) also covers data, do not modify NX except |
1725 | * in cases where the highmap is the primary target. |
1726 | */ |
1727 | if (__supported_pte_mask & _PAGE_NX) { |
1728 | alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; |
1729 | alias_cpa.mask_set.pgprot &= ~_PAGE_NX; |
1730 | } |
1731 | |
1732 | cpa->force_flush_all = 1; |
1733 | /* |
1734 | * The high mapping range is imprecise, so ignore the |
1735 | * return value. |
1736 | */ |
1737 | __change_page_attr_set_clr(cpa: &alias_cpa, primary: 0); |
1738 | } |
1739 | #endif |
1740 | |
1741 | return 0; |
1742 | } |

static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
	unsigned long numpages = cpa->numpages;
	unsigned long rempages = numpages;
	int ret = 0;

	/*
	 * No changes, easy!
	 */
	if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
	    !cpa->force_split)
		return ret;

	while (rempages) {
		/*
		 * Store the remaining nr of pages for the large page
		 * preservation check.
		 */
		cpa->numpages = rempages;
		/* for array changes, we can't use large page */
		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
			cpa->numpages = 1;

		if (!debug_pagealloc_enabled())
			spin_lock(&cpa_lock);
		ret = __change_page_attr(cpa, primary);
		if (!debug_pagealloc_enabled())
			spin_unlock(&cpa_lock);
		if (ret)
			goto out;

		if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
			ret = cpa_process_alias(cpa);
			if (ret)
				goto out;
		}

		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
		 * preserved or a single page update happened.
		 */
		BUG_ON(cpa->numpages > rempages || !cpa->numpages);
		rempages -= cpa->numpages;
		cpa->curpage += cpa->numpages;
	}

out:
	/* Restore the original numpages */
	cpa->numpages = numpages;
	return ret;
}

static int change_page_attr_set_clr(unsigned long *addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr,
				    int force_split, int in_flag,
				    struct page **pages)
{
	struct cpa_data cpa;
	int ret, cache;

	memset(&cpa, 0, sizeof(cpa));

	/*
	 * Check whether we are asked to set a feature that is not
	 * supported. Clearing non-supported features is OK.
	 */
	mask_set = canon_pgprot(mask_set);

	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (in_flag & CPA_ARRAY) {
		int i;
		for (i = 0; i < numpages; i++) {
			if (addr[i] & ~PAGE_MASK) {
				addr[i] &= PAGE_MASK;
				WARN_ON_ONCE(1);
			}
		}
	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
		/*
		 * An in_flag of CPA_PAGES_ARRAY implies the addresses are
		 * aligned. No need to check in that case.
		 */
		if (*addr & ~PAGE_MASK) {
			*addr &= PAGE_MASK;
			/*
			 * People should not be passing in unaligned addresses:
			 */
			WARN_ON_ONCE(1);
		}
	}

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();

	vm_unmap_aliases();

	cpa.vaddr = addr;
	cpa.pages = pages;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flags = in_flag;
	cpa.curpage = 0;
	cpa.force_split = force_split;

	ret = __change_page_attr_set_clr(&cpa, 1);

	/*
	 * Check whether we really changed something:
	 */
	if (!(cpa.flags & CPA_FLUSHTLB))
		goto out;

	/*
	 * No need to flush when we did not set any of the caching
	 * attributes:
	 */
	cache = !!pgprot2cachemode(mask_set);

	/*
	 * On error, flush everything to be sure.
	 */
	if (ret) {
		cpa_flush_all(cache);
		goto out;
	}

	cpa_flush(&cpa, cache);
out:
	return ret;
}

static inline int change_page_attr_set(unsigned long *addr, int numpages,
				       pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int change_page_attr_clear(unsigned long *addr, int numpages,
					 pgprot_t mask, int array)
{
	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
		(array ? CPA_ARRAY : 0), NULL);
}

static inline int cpa_set_pages_array(struct page **pages, int numpages,
				      pgprot_t mask)
{
	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
		CPA_PAGES_ARRAY, pages);
}

static inline int cpa_clear_pages_array(struct page **pages, int numpages,
					pgprot_t mask)
{
	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
		CPA_PAGES_ARRAY, pages);
}

/*
 * __set_memory_prot is an internal helper for callers that have been passed
 * a pgprot_t value from upper layers and for which a memtype reservation has
 * already been taken. If you want to set a specific page protection, use the
 * set_memory_xx() functions instead.
 */
int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
{
	return change_page_attr_set_clr(&addr, numpages, prot,
					__pgprot(~pgprot_val(prot)), 0, 0,
					NULL);
}
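
/*
 * Illustrative sketch (not part of this file): how a caller that already
 * holds a memtype reservation might use __set_memory_prot(). Because the
 * helper sets 'prot' and clears its complement, the resulting PTE bits are
 * forced to exactly 'prot'. The wrapper name and its caller's reservation
 * are assumptions made up for the example.
 */
#if 0
static int example_apply_reserved_prot(unsigned long vaddr, int numpages,
					pgprot_t prot)
{
	/* Caller is assumed to have done memtype_reserve() on this range. */
	return __set_memory_prot(vaddr, numpages, prot);
}
#endif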

int _set_memory_uc(unsigned long addr, int numpages)
{
	/*
	 * For now this is UC MINUS; see the comments in ioremap().
	 * If you really need strong UC use ioremap_uc(), but note
	 * that you cannot override IO areas with set_memory_*() as
	 * these helpers cannot work with IO memory.
	 */
	return change_page_attr_set(&addr, numpages,
				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				    0);
}

int set_memory_uc(unsigned long addr, int numpages)
{
	int ret;

	/*
	 * For now UC MINUS; see the comments in ioremap().
	 */
	ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
	if (ret)
		goto out_err;

	ret = _set_memory_uc(addr, numpages);
	if (ret)
		goto out_free;

	return 0;

out_free:
	memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
	return ret;
}
EXPORT_SYMBOL(set_memory_uc);

int _set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	ret = change_page_attr_set(&addr, numpages,
				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				   0);
	if (!ret) {
		ret = change_page_attr_set_clr(&addr, numpages,
					       cachemode2pgprot(_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, 0, NULL);
	}
	return ret;
}

int set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_MODE_WC, NULL);
	if (ret)
		return ret;

	ret = _set_memory_wc(addr, numpages);
	if (ret)
		memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

	return ret;
}
EXPORT_SYMBOL(set_memory_wc);
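
/*
 * Illustrative sketch (not part of this file): typical lifetime of a
 * write-combining buffer in lowmem. set_memory_wc() reserves the memtype
 * and rewrites the caching attributes; the buffer must be switched back to
 * WB with set_memory_wb() before it is freed. The allocation size and order
 * below are assumptions made up for the example.
 */
#if 0
static void *example_alloc_wc_buffer(void)
{
	unsigned long addr = __get_free_pages(GFP_KERNEL, 2);	/* 4 pages */

	if (!addr)
		return NULL;

	if (set_memory_wc(addr, 4)) {
		free_pages(addr, 2);
		return NULL;
	}
	return (void *)addr;
}

static void example_free_wc_buffer(void *buf)
{
	set_memory_wb((unsigned long)buf, 4);	/* also frees the memtype */
	free_pages((unsigned long)buf, 2);
}
#endif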

int _set_memory_wt(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages,
				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
}

int _set_memory_wb(unsigned long addr, int numpages)
{
	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_CACHE_MASK), 0);
}

int set_memory_wb(unsigned long addr, int numpages)
{
	int ret;

	ret = _set_memory_wb(addr, numpages);
	if (ret)
		return ret;

	memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
	return 0;
}
EXPORT_SYMBOL(set_memory_wb);

/* Prevent speculative access to a page by marking it not-present */
#ifdef CONFIG_X86_64
int set_mce_nospec(unsigned long pfn)
{
	unsigned long decoy_addr;
	int rc;

	/* SGX pages are not in the 1:1 map */
	if (arch_is_platform_page(pfn << PAGE_SHIFT))
		return 0;
	/*
	 * We would like to just call:
	 *	set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
	 * but doing that would radically increase the odds of a
	 * speculative access to the poison page because we'd have
	 * the virtual address of the kernel 1:1 mapping sitting
	 * around in registers.
	 * Instead we get tricky. We create a non-canonical address
	 * that looks just like the one we want, but has bit 63 flipped.
	 * This relies on set_memory_XX() properly sanitizing any __pa()
	 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
	 */
	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));

	rc = set_memory_np(decoy_addr, 1);
	if (rc)
		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
	return rc;
}
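
/*
 * Worked example of the decoy address above (illustrative only; the
 * PAGE_OFFSET value is just an assumed 4-level direct-map base):
 * with PAGE_OFFSET = 0xffff888000000000 and pfn = 0x1234, the real 1:1
 * address would be 0xffff888001234000, while the decoy is
 * 0x7fff888001234000 - same low bits, bit 63 flipped, non-canonical -
 * so no canonical kernel pointer to the poisoned page is ever
 * materialized in registers.
 */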

/* Restore full speculative operation to the pfn. */
int clear_mce_nospec(unsigned long pfn)
{
	unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);

	return set_memory_p(addr, 1);
}
EXPORT_SYMBOL_GPL(clear_mce_nospec);
#endif /* CONFIG_X86_64 */

int set_memory_x(unsigned long addr, int numpages)
{
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;

	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}

int set_memory_nx(unsigned long addr, int numpages)
{
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;

	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}

int set_memory_ro(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
}

int set_memory_rox(unsigned long addr, int numpages)
{
	pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY);

	if (__supported_pte_mask & _PAGE_NX)
		clr.pgprot |= _PAGE_NX;

	return change_page_attr_clear(&addr, numpages, clr, 0);
}
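
/*
 * Illustrative sketch (not part of this file): sealing a buffer of generated
 * code. The buffer is filled while still writable, then made read-only and
 * executable in one set_memory_rox() call. The vmalloc-based allocation is an
 * assumption for the example; real users (module or BPF loaders) have their
 * own text allocators.
 */
#if 0
static void *example_seal_text(const void *image, size_t len)
{
	int npages = DIV_ROUND_UP(len, PAGE_SIZE);
	void *buf = vmalloc(npages * PAGE_SIZE);

	if (!buf)
		return NULL;

	memcpy(buf, image, len);			/* write while still RW */
	set_memory_rox((unsigned long)buf, npages);	/* now RO + executable */
	return buf;
}
#endif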

int set_memory_rw(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}

int set_memory_np(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}

int set_memory_np_noalias(unsigned long addr, int numpages)
{
	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
					__pgprot(_PAGE_PRESENT), 0,
					CPA_NO_CHECK_ALIAS, NULL);
}

int set_memory_p(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}

int set_memory_4k(unsigned long addr, int numpages)
{
	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
					__pgprot(0), 1, 0, NULL);
}

int set_memory_nonglobal(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_GLOBAL), 0);
}

int set_memory_global(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages,
				    __pgprot(_PAGE_GLOBAL), 0);
}

/*
 * __set_memory_enc_pgtable() is used for the hypervisors that get
 * informed about "encryption" status via page tables.
 */
static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
{
	pgprot_t empty = __pgprot(0);
	struct cpa_data cpa;
	int ret;

	/* Should not be working on unaligned addresses */
	if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
		addr &= PAGE_MASK;

	memset(&cpa, 0, sizeof(cpa));
	cpa.vaddr = &addr;
	cpa.numpages = numpages;
	cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
	cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
	cpa.pgd = init_mm.pgd;

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();
	vm_unmap_aliases();

	/* Flush the caches as needed before changing the encryption attribute. */
	if (x86_platform.guest.enc_tlb_flush_required(enc))
		cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());

	/* Notify hypervisor that we are about to set/clr encryption attribute. */
	if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
		goto vmm_fail;

	ret = __change_page_attr_set_clr(&cpa, 1);

	/*
	 * After changing the encryption attribute, we need to flush TLBs again
	 * in case any speculative TLB caching occurred (but no need to flush
	 * caches again). We could just use cpa_flush_all(), but in case TLB
	 * flushing gets optimized in the cpa_flush() path use the same logic
	 * as above.
	 */
	cpa_flush(&cpa, 0);

	if (ret)
		return ret;

	/* Notify hypervisor that we have successfully set/clr encryption attribute. */
	if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
		goto vmm_fail;

	return 0;

vmm_fail:
	WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
		  (void *)addr, numpages, enc ? "private" : "shared");

	return -EIO;
}

static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
		return __set_memory_enc_pgtable(addr, numpages, enc);

	return 0;
}

int set_memory_encrypted(unsigned long addr, int numpages)
{
	return __set_memory_enc_dec(addr, numpages, true);
}
EXPORT_SYMBOL_GPL(set_memory_encrypted);

int set_memory_decrypted(unsigned long addr, int numpages)
{
	return __set_memory_enc_dec(addr, numpages, false);
}
EXPORT_SYMBOL_GPL(set_memory_decrypted);
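
/*
 * Illustrative sketch (not part of this file): a driver in a memory-encrypted
 * guest (SEV/TDX) sharing a page-aligned buffer with the hypervisor. The
 * buffer must be converted back to private before it is freed. The allocation
 * details are assumptions for the example; real drivers normally go through
 * the DMA API, which performs this conversion when required.
 */
#if 0
static void *example_alloc_shared_page(void)
{
	unsigned long addr = get_zeroed_page(GFP_KERNEL);

	if (!addr)
		return NULL;

	if (set_memory_decrypted(addr, 1)) {
		free_page(addr);
		return NULL;
	}
	return (void *)addr;
}

static void example_free_shared_page(void *page)
{
	/* Convert back to private before returning the page to the allocator. */
	if (!set_memory_encrypted((unsigned long)page, 1))
		free_page((unsigned long)page);
	/* On failure the page is leaked rather than freed while still shared. */
}
#endif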

int set_pages_uc(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);

static int _set_pages_array(struct page **pages, int numpages,
			    enum page_cache_mode new_type)
{
	unsigned long start;
	unsigned long end;
	enum page_cache_mode set_type;
	int i;
	int free_idx;
	int ret;

	for (i = 0; i < numpages; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		if (memtype_reserve(start, end, new_type, NULL))
			goto err_out;
	}

	/* If WC, set to UC- first and then WC */
	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
				_PAGE_CACHE_MODE_UC_MINUS : new_type;

	ret = cpa_set_pages_array(pages, numpages,
				  cachemode2pgprot(set_type));
	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
		ret = change_page_attr_set_clr(NULL, numpages,
					       cachemode2pgprot(_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, CPA_PAGES_ARRAY, pages);
	if (ret)
		goto err_out;
	return 0; /* Success */
err_out:
	free_idx = i;
	for (i = 0; i < free_idx; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		memtype_free(start, end);
	}
	return -EINVAL;
}

int set_pages_array_uc(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);

int set_pages_array_wc(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);
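
/*
 * Illustrative sketch (not part of this file): batching a cache-attribute
 * change over an array of already-allocated pages, the pattern used by
 * GPU/DRM drivers for write-combined page pools. The pool array, count and
 * helper names are assumptions made up for the example.
 */
#if 0
static int example_make_pool_wc(struct page **pool, int count)
{
	int ret = set_pages_array_wc(pool, count);

	if (ret)
		pr_warn("example: could not set %d pages to WC (%d)\n", count, ret);
	return ret;
}

static void example_release_pool(struct page **pool, int count)
{
	/* Restore WB before the pages go back to the page allocator. */
	set_pages_array_wb(pool, count);
}
#endif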

int set_pages_wb(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

int set_pages_array_wb(struct page **pages, int numpages)
{
	int retval;
	unsigned long start;
	unsigned long end;
	int i;

	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	retval = cpa_clear_pages_array(pages, numpages,
				       __pgprot(_PAGE_CACHE_MASK));
	if (retval)
		return retval;

	for (i = 0; i < numpages; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		memtype_free(start, end);
	}

	return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);

int set_pages_ro(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_ro(addr, numpages);
}

int set_pages_rw(struct page *page, int numpages)
{
	unsigned long addr = (unsigned long)page_address(page);

	return set_memory_rw(addr, numpages);
}

static int __set_pages_p(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.pgd = NULL,
				.numpages = numpages,
				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.mask_clr = __pgprot(0),
				.flags = CPA_NO_CHECK_ALIAS };

	/*
	 * No alias checking needed for setting the present flag; otherwise
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 1);
}

static int __set_pages_np(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.pgd = NULL,
				.numpages = numpages,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.flags = CPA_NO_CHECK_ALIAS };

	/*
	 * No alias checking needed for clearing the present flag; otherwise
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 1);
}

int set_direct_map_invalid_noflush(struct page *page)
{
	return __set_pages_np(page, 1);
}

int set_direct_map_default_noflush(struct page *page)
{
	return __set_pages_p(page, 1);
}
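
/*
 * Illustrative sketch (not part of this file): the "_noflush" variants leave
 * TLB invalidation to the caller. A user such as secretmem removes a page
 * from the direct map and then flushes the direct-map range for that page
 * itself. The helper below is an assumption made up for the example.
 */
#if 0
static int example_hide_from_direct_map(struct page *page)
{
	unsigned long kaddr = (unsigned long)page_address(page);
	int ret = set_direct_map_invalid_noflush(page);

	if (!ret)
		flush_tlb_kernel_range(kaddr, kaddr + PAGE_SIZE);
	return ret;
}
#endif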

#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (PageHighMem(page))
		return;
	if (!enable) {
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
	}

	/*
	 * The return value is ignored as the calls cannot fail.
	 * Large pages for identity mappings are not used at boot time,
	 * hence no memory allocations happen during a large page split.
	 */
	if (enable)
		__set_pages_p(page, numpages);
	else
		__set_pages_np(page, numpages);

	/*
	 * We should perform an IPI and flush all TLBs, but that can
	 * deadlock, so flush only the current CPU.
	 * Preemption needs to be disabled around __flush_tlb_all() due to
	 * CR3 reload in __native_flush_tlb().
	 */
	preempt_disable();
	__flush_tlb_all();
	preempt_enable();

	arch_flush_lazy_mmu_mode();
}
#endif /* CONFIG_DEBUG_PAGEALLOC */

bool kernel_page_present(struct page *page)
{
	unsigned int level;
	pte_t *pte;

	if (PageHighMem(page))
		return false;

	pte = lookup_address((unsigned long)page_address(page), &level);
	return (pte_val(*pte) & _PAGE_PRESENT);
}
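
/*
 * Illustrative sketch (not part of this file): a typical consumer of
 * kernel_page_present() is snapshot/dump code that must not read through the
 * direct map when a page is currently unmapped (e.g. by DEBUG_PAGEALLOC).
 * The copy helper and its zero-fill fallback are assumptions made up for the
 * example.
 */
#if 0
static void example_copy_if_mapped(struct page *page, void *dst)
{
	if (kernel_page_present(page))
		memcpy(dst, page_address(page), PAGE_SIZE);
	else
		memset(dst, 0, PAGE_SIZE);	/* unmapped: record as zeros */
}
#endif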

int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
				   unsigned numpages, unsigned long page_flags)
{
	int retval = -EINVAL;

	struct cpa_data cpa = {
		.vaddr = &address,
		.pfn = pfn,
		.pgd = pgd,
		.numpages = numpages,
		.mask_set = __pgprot(0),
		.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
		.flags = CPA_NO_CHECK_ALIAS,
	};

	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

	if (!(__supported_pte_mask & _PAGE_NX))
		goto out;

	if (!(page_flags & _PAGE_ENC))
		cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);

	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);

	retval = __change_page_attr_set_clr(&cpa, 1);
	__flush_tlb_all();

out:
	return retval;
}

/*
 * __flush_tlb_all() flushes mappings only on current CPU and hence this
 * function shouldn't be used in an SMP environment. Presently, it's used only
 * during boot (way before smp_init()) by EFI subsystem and hence is ok.
 */
int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
				     unsigned long numpages)
{
	int retval;

	/*
	 * The typical sequence for unmapping is to find a pte through
	 * lookup_address_in_pgd() (ideally, it should never return NULL because
	 * the address is already mapped) and change its protections. As pfn is
	 * the *target* of a mapping, it's not useful while unmapping.
	 */
	struct cpa_data cpa = {
		.vaddr = &address,
		.pfn = 0,
		.pgd = pgd,
		.numpages = numpages,
		.mask_set = __pgprot(0),
		.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
		.flags = CPA_NO_CHECK_ALIAS,
	};

	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

	retval = __change_page_attr_set_clr(&cpa, 1);
	__flush_tlb_all();

	return retval;
}

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "cpa-test.c"
#endif
