| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include <linux/mm.h> |
| 3 | #include <linux/gfp.h> |
| 4 | #include <linux/hugetlb.h> |
| 5 | #include <asm/pgalloc.h> |
| 6 | #include <asm/tlb.h> |
| 7 | #include <asm/fixmap.h> |
| 8 | #include <asm/mtrr.h> |
| 9 | |
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
/*
 * Physical address mask, only built when CONFIG_DYNAMIC_PHYSICAL_MASK is
 * set.  It starts out with the low __PHYSICAL_MASK_SHIFT bits set and is
 * placed in the __ro_after_init section.
 */
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
SYM_PIC_ALIAS(physical_mask);
#endif
| 15 | |
| 16 | pgtable_t pte_alloc_one(struct mm_struct *mm) |
| 17 | { |
| 18 | return __pte_alloc_one(mm, GFP_PGTABLE_USER); |
| 19 | } |
| 20 | |
/*
 * Release a PTE page: tell paravirt about the page frame first, then
 * hand the page-table descriptor over to the mmu_gather @tlb.
 */
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	unsigned long pfn = page_to_pfn(pte);
	struct ptdesc *ptdesc;

	paravirt_release_pte(pfn);

	ptdesc = page_ptdesc(pte);
	tlb_remove_ptdesc(tlb, ptdesc);
}
| 26 | |
| 27 | #if CONFIG_PGTABLE_LEVELS > 2 |
| 28 | void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) |
| 29 | { |
| 30 | paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); |
| 31 | /* |
| 32 | * NOTE! For PAE, any changes to the top page-directory-pointer-table |
| 33 | * entries need a full cr3 reload to flush. |
| 34 | */ |
| 35 | #ifdef CONFIG_X86_PAE |
| 36 | tlb->need_flush_all = 1; |
| 37 | #endif |
| 38 | tlb_remove_ptdesc(tlb, pt: virt_to_ptdesc(x: pmd)); |
| 39 | } |
| 40 | |
| 41 | #if CONFIG_PGTABLE_LEVELS > 3 |
| 42 | void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) |
| 43 | { |
| 44 | paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); |
| 45 | tlb_remove_ptdesc(tlb, pt: virt_to_ptdesc(x: pud)); |
| 46 | } |
| 47 | |
| 48 | #if CONFIG_PGTABLE_LEVELS > 4 |
| 49 | void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) |
| 50 | { |
| 51 | paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); |
| 52 | tlb_remove_ptdesc(tlb, pt: virt_to_ptdesc(x: p4d)); |
| 53 | } |
| 54 | #endif /* CONFIG_PGTABLE_LEVELS > 4 */ |
| 55 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ |
| 56 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ |
| 57 | |
| 58 | static inline void pgd_list_add(pgd_t *pgd) |
| 59 | { |
| 60 | struct ptdesc *ptdesc = virt_to_ptdesc(x: pgd); |
| 61 | |
| 62 | list_add(new: &ptdesc->pt_list, head: &pgd_list); |
| 63 | } |
| 64 | |
| 65 | static inline void pgd_list_del(pgd_t *pgd) |
| 66 | { |
| 67 | struct ptdesc *ptdesc = virt_to_ptdesc(x: pgd); |
| 68 | |
| 69 | list_del(entry: &ptdesc->pt_list); |
| 70 | } |
| 71 | |
| 72 | static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) |
| 73 | { |
| 74 | virt_to_ptdesc(x: pgd)->pt_mm = mm; |
| 75 | } |
| 76 | |
| 77 | struct mm_struct *pgd_page_get_mm(struct page *page) |
| 78 | { |
| 79 | return page_ptdesc(page)->pt_mm; |
| 80 | } |
| 81 | |
| 82 | static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) |
| 83 | { |
| 84 | /* PAE preallocates all its PMDs. No cloning needed. */ |
| 85 | if (!IS_ENABLED(CONFIG_X86_PAE)) |
| 86 | clone_pgd_range(dst: pgd + KERNEL_PGD_BOUNDARY, |
| 87 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
| 88 | KERNEL_PGD_PTRS); |
| 89 | |
| 90 | /* List used to sync kernel mapping updates */ |
| 91 | pgd_set_mm(pgd, mm); |
| 92 | pgd_list_add(pgd); |
| 93 | } |
| 94 | |
| 95 | static void pgd_dtor(pgd_t *pgd) |
| 96 | { |
| 97 | spin_lock(lock: &pgd_lock); |
| 98 | pgd_list_del(pgd); |
| 99 | spin_unlock(lock: &pgd_lock); |
| 100 | } |
| 101 | |
| 102 | /* |
| 103 | * List of all pgd's needed for non-PAE so it can invalidate entries |
| 104 | * in both cached and uncached pgd's; not needed for PAE since the |
| 105 | * kernel pmd is shared. If PAE were not to share the pmd a similar |
| 106 | * tactic would be needed. This is essentially codepath-based locking |
| 107 | * against pageattr.c; it is the unique case in which a valid change |
| 108 | * of kernel pagetables can't be lazily synchronized by vmalloc faults. |
| 109 | * vmalloc faults work because attached pagetables are never freed. |
| 110 | * -- nyc |
| 111 | */ |
| 112 | |
| 113 | #ifdef CONFIG_X86_PAE |
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * PREALLOCATED_PMDS is the number of PMD pages pgd_alloc() preallocates
 * for the kernel copy of the page tables: one per pgd entry.
 */
#define PREALLOCATED_PMDS PTRS_PER_PGD

/*
 * "USER_PMDS" are the PMDs for the user copy of the page tables when
 * PTI is enabled. They do not exist when PTI is disabled.  Note that
 * this is distinct from the user _portion_ of the kernel page tables
 * which always exists.
 *
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 *
 * PREALLOCATED_USER_PMDS is evaluated at run time (it depends on
 * X86_FEATURE_PTI), so MAX_PREALLOCATED_USER_PMDS provides the
 * compile-time upper bound used to size the array in pgd_alloc().
 */
#define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \
 KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
| 136 | |
/*
 * PAE variant of pud_populate(): install the PMD page @pmd into the
 * top-level entry @pudp of @mm and flush the mm's TLB afterwards.
 */
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
	/* Let paravirt know about the new PMD page before it is linked in. */
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

	/*
	 * Note: almost everything apart from _PAGE_PRESENT is
	 * reserved at the pmd (PDPT) level.
	 */
	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 */
	flush_tlb_mm(mm);
}
| 153 | #else /* !CONFIG_X86_PAE */ |
| 154 | |
/*
 * No need to prepopulate any pagetable entries in non-PAE modes.
 *
 * With all three counts at zero the pmds[]/u_pmds[] arrays in pgd_alloc()
 * have size zero, so its "sizeof(...) != 0" guards skip the
 * preallocation, prepopulation and freeing steps.
 */
#define PREALLOCATED_PMDS 0
#define PREALLOCATED_USER_PMDS 0
#define MAX_PREALLOCATED_USER_PMDS 0
| 159 | #endif /* CONFIG_X86_PAE */ |
| 160 | |
| 161 | static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) |
| 162 | { |
| 163 | int i; |
| 164 | struct ptdesc *ptdesc; |
| 165 | |
| 166 | for (i = 0; i < count; i++) |
| 167 | if (pmds[i]) { |
| 168 | ptdesc = virt_to_ptdesc(x: pmds[i]); |
| 169 | |
| 170 | pagetable_dtor(ptdesc); |
| 171 | pagetable_free(pt: ptdesc); |
| 172 | mm_dec_nr_pmds(mm); |
| 173 | } |
| 174 | } |
| 175 | |
| 176 | static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) |
| 177 | { |
| 178 | int i; |
| 179 | bool failed = false; |
| 180 | gfp_t gfp = GFP_PGTABLE_USER; |
| 181 | |
| 182 | if (mm == &init_mm) |
| 183 | gfp &= ~__GFP_ACCOUNT; |
| 184 | gfp &= ~__GFP_HIGHMEM; |
| 185 | |
| 186 | for (i = 0; i < count; i++) { |
| 187 | pmd_t *pmd = NULL; |
| 188 | struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); |
| 189 | |
| 190 | if (!ptdesc) |
| 191 | failed = true; |
| 192 | if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { |
| 193 | pagetable_free(pt: ptdesc); |
| 194 | ptdesc = NULL; |
| 195 | failed = true; |
| 196 | } |
| 197 | if (ptdesc) { |
| 198 | mm_inc_nr_pmds(mm); |
| 199 | pmd = ptdesc_address(pt: ptdesc); |
| 200 | } |
| 201 | |
| 202 | pmds[i] = pmd; |
| 203 | } |
| 204 | |
| 205 | if (failed) { |
| 206 | free_pmds(mm, pmds, count); |
| 207 | return -ENOMEM; |
| 208 | } |
| 209 | |
| 210 | return 0; |
| 211 | } |
| 212 | |
| 213 | /* |
| 214 | * Mop up any pmd pages which may still be attached to the pgd. |
| 215 | * Normally they will be freed by munmap/exit_mmap, but any pmd we |
| 216 | * preallocate which never got a corresponding vma will need to be |
| 217 | * freed manually. |
| 218 | */ |
| 219 | static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) |
| 220 | { |
| 221 | pgd_t pgd = *pgdp; |
| 222 | |
| 223 | if (pgd_val(pgd) != 0) { |
| 224 | pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); |
| 225 | |
| 226 | pgd_clear(pgdp); |
| 227 | |
| 228 | paravirt_release_pmd(pfn: pgd_val(pgd) >> PAGE_SHIFT); |
| 229 | pmd_free(mm, pmd); |
| 230 | mm_dec_nr_pmds(mm); |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) |
| 235 | { |
| 236 | int i; |
| 237 | |
| 238 | for (i = 0; i < PREALLOCATED_PMDS; i++) |
| 239 | mop_up_one_pmd(mm, pgdp: &pgdp[i]); |
| 240 | |
| 241 | #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION |
| 242 | |
| 243 | if (!boot_cpu_has(X86_FEATURE_PTI)) |
| 244 | return; |
| 245 | |
| 246 | pgdp = kernel_to_user_pgdp(pgdp); |
| 247 | |
| 248 | for (i = 0; i < PREALLOCATED_USER_PMDS; i++) |
| 249 | mop_up_one_pmd(mm, pgdp: &pgdp[i + KERNEL_PGD_BOUNDARY]); |
| 250 | #endif |
| 251 | } |
| 252 | |
| 253 | static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) |
| 254 | { |
| 255 | p4d_t *p4d; |
| 256 | pud_t *pud; |
| 257 | int i; |
| 258 | |
| 259 | p4d = p4d_offset(pgd, address: 0); |
| 260 | pud = pud_offset(p4d, address: 0); |
| 261 | |
| 262 | for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { |
| 263 | pmd_t *pmd = pmds[i]; |
| 264 | |
| 265 | if (i >= KERNEL_PGD_BOUNDARY) |
| 266 | memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), |
| 267 | sizeof(pmd_t) * PTRS_PER_PMD); |
| 268 | |
| 269 | pud_populate(mm, pud, pmd); |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION |
| 274 | static void pgd_prepopulate_user_pmd(struct mm_struct *mm, |
| 275 | pgd_t *k_pgd, pmd_t *pmds[]) |
| 276 | { |
| 277 | pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); |
| 278 | pgd_t *u_pgd = kernel_to_user_pgdp(pgdp: k_pgd); |
| 279 | p4d_t *u_p4d; |
| 280 | pud_t *u_pud; |
| 281 | int i; |
| 282 | |
| 283 | u_p4d = p4d_offset(pgd: u_pgd, address: 0); |
| 284 | u_pud = pud_offset(p4d: u_p4d, address: 0); |
| 285 | |
| 286 | s_pgd += KERNEL_PGD_BOUNDARY; |
| 287 | u_pud += KERNEL_PGD_BOUNDARY; |
| 288 | |
| 289 | for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { |
| 290 | pmd_t *pmd = pmds[i]; |
| 291 | |
| 292 | memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), |
| 293 | sizeof(pmd_t) * PTRS_PER_PMD); |
| 294 | |
| 295 | pud_populate(mm, pud: u_pud, pmd); |
| 296 | } |
| 297 | |
| 298 | } |
| 299 | #else |
/*
 * Stub used when CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is not set:
 * there is no user copy of the page tables, so nothing is populated.
 */
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
 pgd_t *k_pgd, pmd_t *pmds[])
{
}
| 304 | #endif |
| 305 | |
| 306 | static inline pgd_t *_pgd_alloc(struct mm_struct *mm) |
| 307 | { |
| 308 | /* |
| 309 | * PTI and Xen need a whole page for the PAE PGD |
| 310 | * even though the hardware only needs 32 bytes. |
| 311 | * |
| 312 | * For simplicity, allocate a page for all users. |
| 313 | */ |
| 314 | return __pgd_alloc(mm, pgd_allocation_order()); |
| 315 | } |
| 316 | |
/* Counterpart of _pgd_alloc(): hand @pgd of @mm back via __pgd_free(). */
static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
 __pgd_free(mm, pgd);
}
| 321 | |
| 322 | pgd_t *pgd_alloc(struct mm_struct *mm) |
| 323 | { |
| 324 | pgd_t *pgd; |
| 325 | pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; |
| 326 | pmd_t *pmds[PREALLOCATED_PMDS]; |
| 327 | |
| 328 | pgd = _pgd_alloc(mm); |
| 329 | |
| 330 | if (pgd == NULL) |
| 331 | goto out; |
| 332 | |
| 333 | mm->pgd = pgd; |
| 334 | |
| 335 | if (sizeof(pmds) != 0 && |
| 336 | preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) |
| 337 | goto out_free_pgd; |
| 338 | |
| 339 | if (sizeof(u_pmds) != 0 && |
| 340 | preallocate_pmds(mm, pmds: u_pmds, PREALLOCATED_USER_PMDS) != 0) |
| 341 | goto out_free_pmds; |
| 342 | |
| 343 | if (paravirt_pgd_alloc(mm) != 0) |
| 344 | goto out_free_user_pmds; |
| 345 | |
| 346 | /* |
| 347 | * Make sure that pre-populating the pmds is atomic with |
| 348 | * respect to anything walking the pgd_list, so that they |
| 349 | * never see a partially populated pgd. |
| 350 | */ |
| 351 | spin_lock(lock: &pgd_lock); |
| 352 | |
| 353 | pgd_ctor(mm, pgd); |
| 354 | if (sizeof(pmds) != 0) |
| 355 | pgd_prepopulate_pmd(mm, pgd, pmds); |
| 356 | |
| 357 | if (sizeof(u_pmds) != 0) |
| 358 | pgd_prepopulate_user_pmd(mm, k_pgd: pgd, pmds: u_pmds); |
| 359 | |
| 360 | spin_unlock(lock: &pgd_lock); |
| 361 | |
| 362 | return pgd; |
| 363 | |
| 364 | out_free_user_pmds: |
| 365 | if (sizeof(u_pmds) != 0) |
| 366 | free_pmds(mm, pmds: u_pmds, PREALLOCATED_USER_PMDS); |
| 367 | out_free_pmds: |
| 368 | if (sizeof(pmds) != 0) |
| 369 | free_pmds(mm, pmds, PREALLOCATED_PMDS); |
| 370 | out_free_pgd: |
| 371 | _pgd_free(mm, pgd); |
| 372 | out: |
| 373 | return NULL; |
| 374 | } |
| 375 | |
| 376 | void pgd_free(struct mm_struct *mm, pgd_t *pgd) |
| 377 | { |
| 378 | pgd_mop_up_pmds(mm, pgdp: pgd); |
| 379 | pgd_dtor(pgd); |
| 380 | paravirt_pgd_free(mm, pgd); |
| 381 | _pgd_free(mm, pgd); |
| 382 | } |
| 383 | |
| 384 | /* |
| 385 | * Used to set accessed or dirty bits in the page table entries |
| 386 | * on other architectures. On x86, the accessed and dirty bits |
| 387 | * are tracked by hardware. However, do_wp_page calls this function |
| 388 | * to also make the pte writeable at the same time the dirty bit is |
| 389 | * set. In that case we do actually need to write the PTE. |
| 390 | */ |
| 391 | int ptep_set_access_flags(struct vm_area_struct *vma, |
| 392 | unsigned long address, pte_t *ptep, |
| 393 | pte_t entry, int dirty) |
| 394 | { |
| 395 | int changed = !pte_same(a: *ptep, b: entry); |
| 396 | |
| 397 | if (changed && dirty) |
| 398 | set_pte(ptep, pte: entry); |
| 399 | |
| 400 | return changed; |
| 401 | } |
| 402 | |
| 403 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 404 | int pmdp_set_access_flags(struct vm_area_struct *vma, |
| 405 | unsigned long address, pmd_t *pmdp, |
| 406 | pmd_t entry, int dirty) |
| 407 | { |
| 408 | int changed = !pmd_same(pmd_a: *pmdp, pmd_b: entry); |
| 409 | |
| 410 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
| 411 | |
| 412 | if (changed && dirty) { |
| 413 | set_pmd(pmdp, pmd: entry); |
| 414 | /* |
| 415 | * We had a write-protection fault here and changed the pmd |
| 416 | * to to more permissive. No need to flush the TLB for that, |
| 417 | * #PF is architecturally guaranteed to do that and in the |
| 418 | * worst-case we'll generate a spurious fault. |
| 419 | */ |
| 420 | } |
| 421 | |
| 422 | return changed; |
| 423 | } |
| 424 | |
| 425 | int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, |
| 426 | pud_t *pudp, pud_t entry, int dirty) |
| 427 | { |
| 428 | int changed = !pud_same(pud_a: *pudp, pud_b: entry); |
| 429 | |
| 430 | VM_BUG_ON(address & ~HPAGE_PUD_MASK); |
| 431 | |
| 432 | if (changed && dirty) { |
| 433 | set_pud(pudp, pud: entry); |
| 434 | /* |
| 435 | * We had a write-protection fault here and changed the pud |
| 436 | * to to more permissive. No need to flush the TLB for that, |
| 437 | * #PF is architecturally guaranteed to do that and in the |
| 438 | * worst-case we'll generate a spurious fault. |
| 439 | */ |
| 440 | } |
| 441 | |
| 442 | return changed; |
| 443 | } |
| 444 | #endif |
| 445 | |
| 446 | int ptep_test_and_clear_young(struct vm_area_struct *vma, |
| 447 | unsigned long addr, pte_t *ptep) |
| 448 | { |
| 449 | int ret = 0; |
| 450 | |
| 451 | if (pte_young(pte: *ptep)) |
| 452 | ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, |
| 453 | addr: (unsigned long *) &ptep->pte); |
| 454 | |
| 455 | return ret; |
| 456 | } |
| 457 | |
| 458 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) |
| 459 | int pmdp_test_and_clear_young(struct vm_area_struct *vma, |
| 460 | unsigned long addr, pmd_t *pmdp) |
| 461 | { |
| 462 | int ret = 0; |
| 463 | |
| 464 | if (pmd_young(pmd: *pmdp)) |
| 465 | ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, |
| 466 | addr: (unsigned long *)pmdp); |
| 467 | |
| 468 | return ret; |
| 469 | } |
| 470 | #endif |
| 471 | |
| 472 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 473 | int pudp_test_and_clear_young(struct vm_area_struct *vma, |
| 474 | unsigned long addr, pud_t *pudp) |
| 475 | { |
| 476 | int ret = 0; |
| 477 | |
| 478 | if (pud_young(pud: *pudp)) |
| 479 | ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, |
| 480 | addr: (unsigned long *)pudp); |
| 481 | |
| 482 | return ret; |
| 483 | } |
| 484 | #endif |
| 485 | |
| 486 | int ptep_clear_flush_young(struct vm_area_struct *vma, |
| 487 | unsigned long address, pte_t *ptep) |
| 488 | { |
| 489 | /* |
| 490 | * On x86 CPUs, clearing the accessed bit without a TLB flush |
| 491 | * doesn't cause data corruption. [ It could cause incorrect |
| 492 | * page aging and the (mistaken) reclaim of hot pages, but the |
| 493 | * chance of that should be relatively low. ] |
| 494 | * |
| 495 | * So as a performance optimization don't flush the TLB when |
| 496 | * clearing the accessed bit, it will eventually be flushed by |
| 497 | * a context switch or a VM operation anyway. [ In the rare |
| 498 | * event of it not getting flushed for a long time the delay |
| 499 | * shouldn't really matter because there's no real memory |
| 500 | * pressure for swapout to react to. ] |
| 501 | */ |
| 502 | return ptep_test_and_clear_young(vma, addr: address, ptep); |
| 503 | } |
| 504 | |
| 505 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 506 | int pmdp_clear_flush_young(struct vm_area_struct *vma, |
| 507 | unsigned long address, pmd_t *pmdp) |
| 508 | { |
| 509 | int young; |
| 510 | |
| 511 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
| 512 | |
| 513 | young = pmdp_test_and_clear_young(vma, addr: address, pmdp); |
| 514 | if (young) |
| 515 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
| 516 | |
| 517 | return young; |
| 518 | } |
| 519 | |
| 520 | pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, |
| 521 | pmd_t *pmdp) |
| 522 | { |
| 523 | VM_WARN_ON_ONCE(!pmd_present(*pmdp)); |
| 524 | |
| 525 | /* |
| 526 | * No flush is necessary. Once an invalid PTE is established, the PTE's |
| 527 | * access and dirty bits cannot be updated. |
| 528 | */ |
| 529 | return pmdp_establish(vma, address, pmdp, pmd: pmd_mkinvalid(pmd: *pmdp)); |
| 530 | } |
| 531 | #endif |
| 532 | |
| 533 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ |
| 534 | defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) |
| 535 | pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, |
| 536 | pud_t *pudp) |
| 537 | { |
| 538 | VM_WARN_ON_ONCE(!pud_present(*pudp)); |
| 539 | pud_t old = pudp_establish(vma, address, pudp, pud: pud_mkinvalid(pud: *pudp)); |
| 540 | flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); |
| 541 | return old; |
| 542 | } |
| 543 | #endif |
| 544 | |
| 545 | /** |
| 546 | * reserve_top_address - Reserve a hole in the top of the kernel address space |
| 547 | * @reserve: Size of hole to reserve |
| 548 | * |
| 549 | * Can be used to relocate the fixmap area and poke a hole in the top |
| 550 | * of the kernel address space to make room for a hypervisor. |
| 551 | */ |
| 552 | void __init reserve_top_address(unsigned long reserve) |
| 553 | { |
| 554 | #ifdef CONFIG_X86_32 |
| 555 | BUG_ON(fixmaps_set > 0); |
| 556 | __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; |
| 557 | printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n" , |
| 558 | -reserve, __FIXADDR_TOP + PAGE_SIZE); |
| 559 | #endif |
| 560 | } |
| 561 | |
/*
 * Number of fixmap entries installed so far; incremented by
 * __native_set_fixmap() and checked by reserve_top_address().
 */
int fixmaps_set;
| 563 | |
| 564 | void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) |
| 565 | { |
| 566 | unsigned long address = __fix_to_virt(idx); |
| 567 | |
| 568 | #ifdef CONFIG_X86_64 |
| 569 | /* |
| 570 | * Ensure that the static initial page tables are covering the |
| 571 | * fixmap completely. |
| 572 | */ |
| 573 | BUILD_BUG_ON(__end_of_permanent_fixed_addresses > |
| 574 | (FIXMAP_PMD_NUM * PTRS_PER_PTE)); |
| 575 | #endif |
| 576 | |
| 577 | if (idx >= __end_of_fixed_addresses) { |
| 578 | BUG(); |
| 579 | return; |
| 580 | } |
| 581 | set_pte_vaddr(vaddr: address, pte); |
| 582 | fixmaps_set++; |
| 583 | } |
| 584 | |
| 585 | void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, |
| 586 | phys_addr_t phys, pgprot_t flags) |
| 587 | { |
| 588 | /* Sanitize 'prot' against any unsupported bits: */ |
| 589 | pgprot_val(flags) &= __default_kernel_pte_mask; |
| 590 | |
| 591 | __native_set_fixmap(idx, pte: pfn_pte(page_nr: phys >> PAGE_SHIFT, pgprot: flags)); |
| 592 | } |
| 593 | |
| 594 | #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP |
| 595 | #if CONFIG_PGTABLE_LEVELS > 4 |
/**
 * p4d_set_huge - Set up kernel P4D mapping
 * @p4d: Pointer to the P4D entry
 * @addr: Physical address to map (unused here)
 * @prot: Protection bits to use
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
 return 0;
}
| 608 | |
/**
 * p4d_clear_huge - Clear kernel P4D mapping when it is set
 * @p4d: Pointer to the P4D entry to clear
 *
 * No 512GB pages yet -- do nothing
 */
void p4d_clear_huge(p4d_t *p4d)
{
 /* Intentionally empty: p4d_set_huge() never installs a mapping. */
}
| 618 | #endif |
| 619 | |
| 620 | /** |
| 621 | * pud_set_huge - Set up kernel PUD mapping |
| 622 | * @pud: Pointer to the PUD entry |
| 623 | * @addr: Virtual address associated with the PUD entry |
| 624 | * @prot: Protection bits to use |
| 625 | * |
| 626 | * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this |
| 627 | * function sets up a huge page only if the complete range has the same MTRR |
| 628 | * caching mode. |
| 629 | * |
| 630 | * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger |
| 631 | * page mapping attempt fails. |
| 632 | * |
| 633 | * Returns 1 on success and 0 on failure. |
| 634 | */ |
| 635 | int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) |
| 636 | { |
| 637 | u8 uniform; |
| 638 | |
| 639 | mtrr_type_lookup(addr, end: addr + PUD_SIZE, uniform: &uniform); |
| 640 | if (!uniform) |
| 641 | return 0; |
| 642 | |
| 643 | /* Bail out if we are we on a populated non-leaf entry: */ |
| 644 | if (pud_present(pud: *pud) && !pud_leaf(pud: *pud)) |
| 645 | return 0; |
| 646 | |
| 647 | set_pte(ptep: (pte_t *)pud, pte: pfn_pte( |
| 648 | page_nr: (u64)addr >> PAGE_SHIFT, |
| 649 | __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); |
| 650 | |
| 651 | return 1; |
| 652 | } |
| 653 | |
| 654 | /** |
| 655 | * pmd_set_huge - Set up kernel PMD mapping |
| 656 | * @pmd: Pointer to the PMD entry |
| 657 | * @addr: Virtual address associated with the PMD entry |
| 658 | * @prot: Protection bits to use |
| 659 | * |
| 660 | * See text over pud_set_huge() above. |
| 661 | * |
| 662 | * Returns 1 on success and 0 on failure. |
| 663 | */ |
| 664 | int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) |
| 665 | { |
| 666 | u8 uniform; |
| 667 | |
| 668 | mtrr_type_lookup(addr, end: addr + PMD_SIZE, uniform: &uniform); |
| 669 | if (!uniform) { |
| 670 | pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n" , |
| 671 | __func__, addr, addr + PMD_SIZE); |
| 672 | return 0; |
| 673 | } |
| 674 | |
| 675 | /* Bail out if we are we on a populated non-leaf entry: */ |
| 676 | if (pmd_present(pmd: *pmd) && !pmd_leaf(pte: *pmd)) |
| 677 | return 0; |
| 678 | |
| 679 | set_pte(ptep: (pte_t *)pmd, pte: pfn_pte( |
| 680 | page_nr: (u64)addr >> PAGE_SHIFT, |
| 681 | __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); |
| 682 | |
| 683 | return 1; |
| 684 | } |
| 685 | |
| 686 | /** |
| 687 | * pud_clear_huge - Clear kernel PUD mapping when it is set |
| 688 | * @pud: Pointer to the PUD entry to clear. |
| 689 | * |
| 690 | * Returns 1 on success and 0 on failure (no PUD map is found). |
| 691 | */ |
| 692 | int pud_clear_huge(pud_t *pud) |
| 693 | { |
| 694 | if (pud_leaf(pud: *pud)) { |
| 695 | pud_clear(pudp: pud); |
| 696 | return 1; |
| 697 | } |
| 698 | |
| 699 | return 0; |
| 700 | } |
| 701 | |
| 702 | /** |
| 703 | * pmd_clear_huge - Clear kernel PMD mapping when it is set |
| 704 | * @pmd: Pointer to the PMD entry to clear. |
| 705 | * |
| 706 | * Returns 1 on success and 0 on failure (no PMD map is found). |
| 707 | */ |
| 708 | int pmd_clear_huge(pmd_t *pmd) |
| 709 | { |
| 710 | if (pmd_leaf(pte: *pmd)) { |
| 711 | pmd_clear(pmdp: pmd); |
| 712 | return 1; |
| 713 | } |
| 714 | |
| 715 | return 0; |
| 716 | } |
| 717 | |
| 718 | #ifdef CONFIG_X86_64 |
| 719 | /** |
| 720 | * pud_free_pmd_page - Clear PUD entry and free PMD page |
| 721 | * @pud: Pointer to a PUD |
| 722 | * @addr: Virtual address associated with PUD |
| 723 | * |
| 724 | * Context: The PUD range has been unmapped and TLB purged. |
| 725 | * Return: 1 if clearing the entry succeeded. 0 otherwise. |
| 726 | * |
| 727 | * NOTE: Callers must allow a single page allocation. |
| 728 | */ |
| 729 | int pud_free_pmd_page(pud_t *pud, unsigned long addr) |
| 730 | { |
| 731 | pmd_t *pmd, *pmd_sv; |
| 732 | struct ptdesc *pt; |
| 733 | int i; |
| 734 | |
| 735 | pmd = pud_pgtable(pud: *pud); |
| 736 | pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); |
| 737 | if (!pmd_sv) |
| 738 | return 0; |
| 739 | |
| 740 | for (i = 0; i < PTRS_PER_PMD; i++) { |
| 741 | pmd_sv[i] = pmd[i]; |
| 742 | if (!pmd_none(pmd: pmd[i])) |
| 743 | pmd_clear(pmdp: &pmd[i]); |
| 744 | } |
| 745 | |
| 746 | pud_clear(pudp: pud); |
| 747 | |
| 748 | /* INVLPG to clear all paging-structure caches */ |
| 749 | flush_tlb_kernel_range(start: addr, end: addr + PAGE_SIZE-1); |
| 750 | |
| 751 | for (i = 0; i < PTRS_PER_PMD; i++) { |
| 752 | if (!pmd_none(pmd: pmd_sv[i])) { |
| 753 | pt = page_ptdesc(pmd_page(pmd_sv[i])); |
| 754 | pagetable_dtor_free(ptdesc: pt); |
| 755 | } |
| 756 | } |
| 757 | |
| 758 | free_page((unsigned long)pmd_sv); |
| 759 | |
| 760 | pmd_free(mm: &init_mm, pmd); |
| 761 | |
| 762 | return 1; |
| 763 | } |
| 764 | |
| 765 | /** |
| 766 | * pmd_free_pte_page - Clear PMD entry and free PTE page. |
| 767 | * @pmd: Pointer to the PMD |
| 768 | * @addr: Virtual address associated with PMD |
| 769 | * |
| 770 | * Context: The PMD range has been unmapped and TLB purged. |
| 771 | * Return: 1 if clearing the entry succeeded. 0 otherwise. |
| 772 | */ |
| 773 | int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) |
| 774 | { |
| 775 | struct ptdesc *pt; |
| 776 | |
| 777 | pt = page_ptdesc(pmd_page(*pmd)); |
| 778 | pmd_clear(pmdp: pmd); |
| 779 | |
| 780 | /* INVLPG to clear all paging-structure caches */ |
| 781 | flush_tlb_kernel_range(start: addr, end: addr + PAGE_SIZE-1); |
| 782 | |
| 783 | pagetable_dtor_free(ptdesc: pt); |
| 784 | |
| 785 | return 1; |
| 786 | } |
| 787 | |
| 788 | #else /* !CONFIG_X86_64 */ |
| 789 | |
| 790 | /* |
| 791 | * Disable free page handling on x86-PAE. This assures that ioremap() |
| 792 | * does not update sync'd PMD entries. See vmalloc_sync_one(). |
| 793 | */ |
| 794 | int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) |
| 795 | { |
| 796 | return pmd_none(*pmd); |
| 797 | } |
| 798 | |
| 799 | #endif /* CONFIG_X86_64 */ |
| 800 | #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ |
| 801 | |
| 802 | pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) |
| 803 | { |
| 804 | if (vma->vm_flags & VM_SHADOW_STACK) |
| 805 | return pte_mkwrite_shstk(pte); |
| 806 | |
| 807 | pte = pte_mkwrite_novma(pte); |
| 808 | |
| 809 | return pte_clear_saveddirty(pte); |
| 810 | } |
| 811 | |
| 812 | pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
| 813 | { |
| 814 | if (vma->vm_flags & VM_SHADOW_STACK) |
| 815 | return pmd_mkwrite_shstk(pmd); |
| 816 | |
| 817 | pmd = pmd_mkwrite_novma(pmd); |
| 818 | |
| 819 | return pmd_clear_saveddirty(pmd); |
| 820 | } |
| 821 | |
/*
 * Sanity check for a PTE being zapped: warn once if @pte looks like a
 * shadow stack PTE although @vma is not a VM_SHADOW_STACK mapping.
 */
void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
 /*
 * Hardware before shadow stack can (rarely) set Dirty=1
 * on a Write=0 PTE. So the below condition
 * only indicates a software bug when shadow stack is
 * supported by the HW. This checking is covered in
 * pte_shstk().
 */
 VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
 pte_shstk(pte));
}
| 834 | |
/*
 * Sanity check for a PMD being zapped: warn once if @pmd looks like a
 * shadow stack PMD although @vma is not a VM_SHADOW_STACK mapping.
 */
void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
 /* See note in arch_check_zapped_pte() */
 VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
 pmd_shstk(pmd));
}
| 841 | |
/*
 * Sanity check for a PUD being zapped: warn once if @pud looks like a
 * shadow stack PUD although @vma is not a VM_SHADOW_STACK mapping.
 */
void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
 /* See note in arch_check_zapped_pte() */
 VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}
| 847 | |