| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * Kernel-based Virtual Machine driver for Linux |
| 4 | * |
| 5 | * Macros and functions to access KVM PTEs (also known as SPTEs) |
| 6 | * |
| 7 | * Copyright (C) 2006 Qumranet, Inc. |
| 8 | * Copyright 2020 Red Hat, Inc. and/or its affiliates. |
| 9 | */ |
| 10 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 11 | |
| 12 | #include <linux/kvm_host.h> |
| 13 | #include "mmu.h" |
| 14 | #include "mmu_internal.h" |
| 15 | #include "x86.h" |
| 16 | #include "spte.h" |
| 17 | |
| 18 | #include <asm/e820/api.h> |
| 19 | #include <asm/memtype.h> |
| 20 | #include <asm/vmx.h> |
| 21 | |
| 22 | bool __read_mostly enable_mmio_caching = true; |
| 23 | static bool __ro_after_init allow_mmio_caching; |
| 24 | module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); |
| 25 | EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mmio_caching); |
| 26 | |
| 27 | bool __read_mostly kvm_ad_enabled; |
| 28 | |
| 29 | u64 __read_mostly shadow_host_writable_mask; |
| 30 | u64 __read_mostly shadow_mmu_writable_mask; |
| 31 | u64 __read_mostly shadow_nx_mask; |
| 32 | u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ |
| 33 | u64 __read_mostly shadow_user_mask; |
| 34 | u64 __read_mostly shadow_accessed_mask; |
| 35 | u64 __read_mostly shadow_dirty_mask; |
| 36 | u64 __read_mostly shadow_mmio_value; |
| 37 | u64 __read_mostly shadow_mmio_mask; |
| 38 | u64 __read_mostly shadow_mmio_access_mask; |
| 39 | u64 __read_mostly shadow_present_mask; |
| 40 | u64 __read_mostly shadow_me_value; |
| 41 | u64 __read_mostly shadow_me_mask; |
| 42 | u64 __read_mostly shadow_acc_track_mask; |
| 43 | |
| 44 | u64 __read_mostly shadow_nonpresent_or_rsvd_mask; |
| 45 | u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; |
| 46 | |
| 47 | static u8 __init kvm_get_host_maxphyaddr(void) |
| 48 | { |
| 49 | /* |
| 50 | * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected |
| 51 | * in CPU detection code, but the processor treats those reduced bits as |
| 52 | * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at |
| 53 | * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR, |
| 54 | * when reasoning about CPU behavior with respect to MAXPHYADDR. |
| 55 | */ |
| 56 | if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) |
| 57 | return cpuid_eax(op: 0x80000008) & 0xff; |
| 58 | |
| 59 | /* |
| 60 | * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with |
| 61 | * custom CPUID. Proceed with whatever the kernel found since these features |
| 62 | * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). |
| 63 | */ |
| 64 | return boot_cpu_data.x86_phys_bits; |
| 65 | } |
| 66 | |
| 67 | void __init kvm_mmu_spte_module_init(void) |
| 68 | { |
| 69 | /* |
| 70 | * Snapshot userspace's desire to allow MMIO caching. Whether or not |
| 71 | * KVM can actually enable MMIO caching depends on vendor-specific |
| 72 | * hardware capabilities and other module params that can't be resolved |
| 73 | * until the vendor module is loaded, i.e. enable_mmio_caching can and |
| 74 | * will change when the vendor module is (re)loaded. |
| 75 | */ |
| 76 | allow_mmio_caching = enable_mmio_caching; |
| 77 | |
| 78 | kvm_host.maxphyaddr = kvm_get_host_maxphyaddr(); |
| 79 | } |
| 80 | |
| 81 | static u64 generation_mmio_spte_mask(u64 gen) |
| 82 | { |
| 83 | u64 mask; |
| 84 | |
| 85 | WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK); |
| 86 | |
| 87 | mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; |
| 88 | mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; |
| 89 | return mask; |
| 90 | } |
| 91 | |
| 92 | u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) |
| 93 | { |
| 94 | u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; |
| 95 | u64 spte = generation_mmio_spte_mask(gen); |
| 96 | u64 gpa = gfn << PAGE_SHIFT; |
| 97 | |
| 98 | access &= shadow_mmio_access_mask; |
| 99 | spte |= vcpu->kvm->arch.shadow_mmio_value | access; |
| 100 | spte |= gpa | shadow_nonpresent_or_rsvd_mask; |
| 101 | spte |= (gpa & shadow_nonpresent_or_rsvd_mask) |
| 102 | << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; |
| 103 | |
| 104 | return spte; |
| 105 | } |
| 106 | |
| 107 | static bool __kvm_is_mmio_pfn(kvm_pfn_t pfn) |
| 108 | { |
| 109 | if (pfn_valid(pfn)) |
| 110 | return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && |
| 111 | /* |
| 112 | * Some reserved pages, such as those from NVDIMM |
| 113 | * DAX devices, are not for MMIO, and can be mapped |
| 114 | * with cached memory type for better performance. |
| 115 | * However, the above check misconceives those pages |
| 116 | * as MMIO, and results in KVM mapping them with UC |
| 117 | * memory type, which would hurt the performance. |
| 118 | * Therefore, we check the host memory type in addition |
| 119 | * and only treat UC/UC-/WC pages as MMIO. |
| 120 | */ |
| 121 | (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); |
| 122 | |
| 123 | return !e820__mapped_raw_any(start: pfn_to_hpa(pfn), |
| 124 | end: pfn_to_hpa(pfn: pfn + 1) - 1, |
| 125 | type: E820_TYPE_RAM); |
| 126 | } |
| 127 | |
| 128 | static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, int *is_host_mmio) |
| 129 | { |
| 130 | /* |
| 131 | * Determining if a PFN is host MMIO is relative expensive. Cache the |
| 132 | * result locally (in the sole caller) to avoid doing the full query |
| 133 | * multiple times when creating a single SPTE. |
| 134 | */ |
| 135 | if (*is_host_mmio < 0) |
| 136 | *is_host_mmio = __kvm_is_mmio_pfn(pfn); |
| 137 | |
| 138 | return *is_host_mmio; |
| 139 | } |
| 140 | |
| 141 | static void kvm_track_host_mmio_mapping(struct kvm_vcpu *vcpu) |
| 142 | { |
| 143 | struct kvm_mmu_page *root = root_to_sp(root: vcpu->arch.mmu->root.hpa); |
| 144 | |
| 145 | if (root) |
| 146 | WRITE_ONCE(root->has_mapped_host_mmio, true); |
| 147 | else |
| 148 | WRITE_ONCE(vcpu->kvm->arch.has_mapped_host_mmio, true); |
| 149 | |
| 150 | /* |
| 151 | * Force vCPUs to exit and flush CPU buffers if the vCPU is using the |
| 152 | * affected root(s). |
| 153 | */ |
| 154 | kvm_make_all_cpus_request(kvm: vcpu->kvm, KVM_REQ_OUTSIDE_GUEST_MODE); |
| 155 | } |
| 156 | |
| 157 | /* |
| 158 | * Returns true if the SPTE needs to be updated atomically due to having bits |
| 159 | * that may be changed without holding mmu_lock, and for which KVM must not |
| 160 | * lose information. E.g. KVM must not drop Dirty bit information. The caller |
| 161 | * is responsible for checking if the SPTE is shadow-present, and for |
| 162 | * determining whether or not the caller cares about non-leaf SPTEs. |
| 163 | */ |
| 164 | bool spte_needs_atomic_update(u64 spte) |
| 165 | { |
| 166 | /* SPTEs can be made Writable bit by KVM's fast page fault handler. */ |
| 167 | if (!is_writable_pte(pte: spte) && is_mmu_writable_spte(spte)) |
| 168 | return true; |
| 169 | |
| 170 | /* |
| 171 | * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked |
| 172 | * SPTEs can be restored by KVM's fast page fault handler. |
| 173 | */ |
| 174 | if (!spte_ad_enabled(spte)) |
| 175 | return true; |
| 176 | |
| 177 | /* |
| 178 | * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed |
| 179 | * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't |
| 180 | * invalidate TLBs when aging SPTEs, and so it's safe to clobber the |
| 181 | * Accessed bit (and rare in practice). |
| 182 | */ |
| 183 | return is_writable_pte(pte: spte) && !(spte & shadow_dirty_mask); |
| 184 | } |
| 185 | |
| 186 | bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
| 187 | const struct kvm_memory_slot *slot, |
| 188 | unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, |
| 189 | u64 old_spte, bool prefetch, bool synchronizing, |
| 190 | bool host_writable, u64 *new_spte) |
| 191 | { |
| 192 | int level = sp->role.level; |
| 193 | u64 spte = SPTE_MMU_PRESENT_MASK; |
| 194 | int is_host_mmio = -1; |
| 195 | bool wrprot = false; |
| 196 | |
| 197 | /* |
| 198 | * For the EPT case, shadow_present_mask has no RWX bits set if |
| 199 | * exec-only page table entries are supported. In that case, |
| 200 | * ACC_USER_MASK and shadow_user_mask are used to represent |
| 201 | * read access. See FNAME(gpte_access) in paging_tmpl.h. |
| 202 | */ |
| 203 | WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE); |
| 204 | |
| 205 | if (sp->role.ad_disabled) |
| 206 | spte |= SPTE_TDP_AD_DISABLED; |
| 207 | else if (kvm_mmu_page_ad_need_write_protect(kvm: vcpu->kvm, sp)) |
| 208 | spte |= SPTE_TDP_AD_WRPROT_ONLY; |
| 209 | |
| 210 | spte |= shadow_present_mask; |
| 211 | if (!prefetch || synchronizing) |
| 212 | spte |= shadow_accessed_mask; |
| 213 | |
| 214 | /* |
| 215 | * For simplicity, enforce the NX huge page mitigation even if not |
| 216 | * strictly necessary. KVM could ignore the mitigation if paging is |
| 217 | * disabled in the guest, as the guest doesn't have any page tables to |
| 218 | * abuse. But to safely ignore the mitigation, KVM would have to |
| 219 | * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG |
| 220 | * is toggled on, and that's a net negative for performance when TDP is |
| 221 | * enabled. When TDP is disabled, KVM will always switch to a new MMU |
| 222 | * when CR0.PG is toggled, but leveraging that to ignore the mitigation |
| 223 | * would tie make_spte() further to vCPU/MMU state, and add complexity |
| 224 | * just to optimize a mode that is anything but performance critical. |
| 225 | */ |
| 226 | if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && |
| 227 | is_nx_huge_page_enabled(kvm: vcpu->kvm)) { |
| 228 | pte_access &= ~ACC_EXEC_MASK; |
| 229 | } |
| 230 | |
| 231 | if (pte_access & ACC_EXEC_MASK) |
| 232 | spte |= shadow_x_mask; |
| 233 | else |
| 234 | spte |= shadow_nx_mask; |
| 235 | |
| 236 | if (pte_access & ACC_USER_MASK) |
| 237 | spte |= shadow_user_mask; |
| 238 | |
| 239 | if (level > PG_LEVEL_4K) |
| 240 | spte |= PT_PAGE_SIZE_MASK; |
| 241 | |
| 242 | if (kvm_x86_ops.get_mt_mask) |
| 243 | spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, |
| 244 | kvm_is_mmio_pfn(pfn, is_host_mmio: &is_host_mmio)); |
| 245 | if (host_writable) |
| 246 | spte |= shadow_host_writable_mask; |
| 247 | else |
| 248 | pte_access &= ~ACC_WRITE_MASK; |
| 249 | |
| 250 | if (shadow_me_value && !kvm_is_mmio_pfn(pfn, is_host_mmio: &is_host_mmio)) |
| 251 | spte |= shadow_me_value; |
| 252 | |
| 253 | spte |= (u64)pfn << PAGE_SHIFT; |
| 254 | |
| 255 | if (pte_access & ACC_WRITE_MASK) { |
| 256 | /* |
| 257 | * Unsync shadow pages that are reachable by the new, writable |
| 258 | * SPTE. Write-protect the SPTE if the page can't be unsync'd, |
| 259 | * e.g. it's write-tracked (upper-level SPs) or has one or more |
| 260 | * shadow pages and unsync'ing pages is not allowed. |
| 261 | * |
| 262 | * When overwriting an existing leaf SPTE, and the old SPTE was |
| 263 | * writable, skip trying to unsync shadow pages as any relevant |
| 264 | * shadow pages must already be unsync, i.e. the hash lookup is |
| 265 | * unnecessary (and expensive). Note, this relies on KVM not |
| 266 | * changing PFNs without first zapping the old SPTE, which is |
| 267 | * guaranteed by both the shadow MMU and the TDP MMU. |
| 268 | */ |
| 269 | if ((!is_last_spte(pte: old_spte, level) || !is_writable_pte(pte: old_spte)) && |
| 270 | mmu_try_to_unsync_pages(kvm: vcpu->kvm, slot, gfn, synchronizing, prefetch)) |
| 271 | wrprot = true; |
| 272 | else |
| 273 | spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask | |
| 274 | shadow_dirty_mask; |
| 275 | } |
| 276 | |
| 277 | if (prefetch && !synchronizing) |
| 278 | spte = mark_spte_for_access_track(spte); |
| 279 | |
| 280 | WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), |
| 281 | "spte = 0x%llx, level = %d, rsvd bits = 0x%llx" , spte, level, |
| 282 | get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); |
| 283 | |
| 284 | /* |
| 285 | * Mark the memslot dirty *after* modifying it for access tracking. |
| 286 | * Unlike folios, memslots can be safely marked dirty out of mmu_lock, |
| 287 | * i.e. in the fast page fault handler. |
| 288 | */ |
| 289 | if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { |
| 290 | /* Enforced by kvm_mmu_hugepage_adjust. */ |
| 291 | WARN_ON_ONCE(level > PG_LEVEL_4K); |
| 292 | mark_page_dirty_in_slot(kvm: vcpu->kvm, memslot: slot, gfn); |
| 293 | } |
| 294 | |
| 295 | if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && |
| 296 | !kvm_vcpu_can_access_host_mmio(vcpu) && |
| 297 | kvm_is_mmio_pfn(pfn, is_host_mmio: &is_host_mmio)) |
| 298 | kvm_track_host_mmio_mapping(vcpu); |
| 299 | |
| 300 | *new_spte = spte; |
| 301 | return wrprot; |
| 302 | } |
| 303 | |
| 304 | static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) |
| 305 | { |
| 306 | bool is_access_track = is_access_track_spte(spte); |
| 307 | |
| 308 | if (is_access_track) |
| 309 | spte = restore_acc_track_spte(spte); |
| 310 | |
| 311 | KVM_MMU_WARN_ON(set & clear); |
| 312 | spte = (spte | set) & ~clear; |
| 313 | |
| 314 | if (is_access_track) |
| 315 | spte = mark_spte_for_access_track(spte); |
| 316 | |
| 317 | return spte; |
| 318 | } |
| 319 | |
| 320 | static u64 make_spte_executable(u64 spte) |
| 321 | { |
| 322 | return modify_spte_protections(spte, set: shadow_x_mask, clear: shadow_nx_mask); |
| 323 | } |
| 324 | |
| 325 | static u64 make_spte_nonexecutable(u64 spte) |
| 326 | { |
| 327 | return modify_spte_protections(spte, set: shadow_nx_mask, clear: shadow_x_mask); |
| 328 | } |
| 329 | |
| 330 | /* |
| 331 | * Construct an SPTE that maps a sub-page of the given huge page SPTE where |
| 332 | * `index` identifies which sub-page. |
| 333 | * |
| 334 | * This is used during huge page splitting to build the SPTEs that make up the |
| 335 | * new page table. |
| 336 | */ |
| 337 | u64 make_small_spte(struct kvm *kvm, u64 huge_spte, |
| 338 | union kvm_mmu_page_role role, int index) |
| 339 | { |
| 340 | u64 child_spte = huge_spte; |
| 341 | |
| 342 | KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm); |
| 343 | |
| 344 | /* |
| 345 | * The child_spte already has the base address of the huge page being |
| 346 | * split. So we just have to OR in the offset to the page at the next |
| 347 | * lower level for the given index. |
| 348 | */ |
| 349 | child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; |
| 350 | |
| 351 | if (role.level == PG_LEVEL_4K) { |
| 352 | child_spte &= ~PT_PAGE_SIZE_MASK; |
| 353 | |
| 354 | /* |
| 355 | * When splitting to a 4K page where execution is allowed, mark |
| 356 | * the page executable as the NX hugepage mitigation no longer |
| 357 | * applies. |
| 358 | */ |
| 359 | if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) |
| 360 | child_spte = make_spte_executable(spte: child_spte); |
| 361 | } |
| 362 | |
| 363 | return child_spte; |
| 364 | } |
| 365 | |
| 366 | u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) |
| 367 | { |
| 368 | u64 huge_spte; |
| 369 | |
| 370 | KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm); |
| 371 | |
| 372 | huge_spte = small_spte | PT_PAGE_SIZE_MASK; |
| 373 | |
| 374 | /* |
| 375 | * huge_spte already has the address of the sub-page being collapsed |
| 376 | * from small_spte, so just clear the lower address bits to create the |
| 377 | * huge page address. |
| 378 | */ |
| 379 | huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; |
| 380 | |
| 381 | if (is_nx_huge_page_enabled(kvm)) |
| 382 | huge_spte = make_spte_nonexecutable(spte: huge_spte); |
| 383 | |
| 384 | return huge_spte; |
| 385 | } |
| 386 | |
| 387 | u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) |
| 388 | { |
| 389 | u64 spte = SPTE_MMU_PRESENT_MASK; |
| 390 | |
| 391 | spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | |
| 392 | shadow_user_mask | shadow_x_mask | shadow_me_value; |
| 393 | |
| 394 | if (ad_disabled) |
| 395 | spte |= SPTE_TDP_AD_DISABLED; |
| 396 | else |
| 397 | spte |= shadow_accessed_mask; |
| 398 | |
| 399 | return spte; |
| 400 | } |
| 401 | |
| 402 | u64 mark_spte_for_access_track(u64 spte) |
| 403 | { |
| 404 | if (spte_ad_enabled(spte)) |
| 405 | return spte & ~shadow_accessed_mask; |
| 406 | |
| 407 | if (is_access_track_spte(spte)) |
| 408 | return spte; |
| 409 | |
| 410 | check_spte_writable_invariants(spte); |
| 411 | |
| 412 | WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << |
| 413 | SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), |
| 414 | "Access Tracking saved bit locations are not zero\n" ); |
| 415 | |
| 416 | spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << |
| 417 | SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; |
| 418 | spte &= ~(shadow_acc_track_mask | shadow_accessed_mask); |
| 419 | |
| 420 | return spte; |
| 421 | } |
| 422 | |
| 423 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) |
| 424 | { |
| 425 | BUG_ON((u64)(unsigned)access_mask != access_mask); |
| 426 | WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); |
| 427 | |
| 428 | /* |
| 429 | * Reset to the original module param value to honor userspace's desire |
| 430 | * to (dis)allow MMIO caching. Update the param itself so that |
| 431 | * userspace can see whether or not KVM is actually using MMIO caching. |
| 432 | */ |
| 433 | enable_mmio_caching = allow_mmio_caching; |
| 434 | if (!enable_mmio_caching) |
| 435 | mmio_value = 0; |
| 436 | |
| 437 | /* |
| 438 | * The mask must contain only bits that are carved out specifically for |
| 439 | * the MMIO SPTE mask, e.g. to ensure there's no overlap with the MMIO |
| 440 | * generation. |
| 441 | */ |
| 442 | if (WARN_ON(mmio_mask & ~SPTE_MMIO_ALLOWED_MASK)) |
| 443 | mmio_value = 0; |
| 444 | |
| 445 | /* |
| 446 | * Disable MMIO caching if the MMIO value collides with the bits that |
| 447 | * are used to hold the relocated GFN when the L1TF mitigation is |
| 448 | * enabled. This should never fire as there is no known hardware that |
| 449 | * can trigger this condition, e.g. SME/SEV CPUs that require a custom |
| 450 | * MMIO value are not susceptible to L1TF. |
| 451 | */ |
| 452 | if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << |
| 453 | SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) |
| 454 | mmio_value = 0; |
| 455 | |
| 456 | /* |
| 457 | * The masked MMIO value must obviously match itself and a frozen SPTE |
| 458 | * must not get a false positive. Frozen SPTEs and MMIO SPTEs should |
| 459 | * never collide as MMIO must set some RWX bits, and frozen SPTEs must |
| 460 | * not set any RWX bits. |
| 461 | */ |
| 462 | if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || |
| 463 | WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value)) |
| 464 | mmio_value = 0; |
| 465 | |
| 466 | if (!mmio_value) |
| 467 | enable_mmio_caching = false; |
| 468 | |
| 469 | shadow_mmio_value = mmio_value; |
| 470 | shadow_mmio_mask = mmio_mask; |
| 471 | shadow_mmio_access_mask = access_mask; |
| 472 | } |
| 473 | EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_mask); |
| 474 | |
| 475 | void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) |
| 476 | { |
| 477 | kvm->arch.shadow_mmio_value = mmio_value; |
| 478 | } |
| 479 | EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_value); |
| 480 | |
| 481 | void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) |
| 482 | { |
| 483 | /* shadow_me_value must be a subset of shadow_me_mask */ |
| 484 | if (WARN_ON(me_value & ~me_mask)) |
| 485 | me_value = me_mask = 0; |
| 486 | |
| 487 | shadow_me_value = me_value; |
| 488 | shadow_me_mask = me_mask; |
| 489 | } |
| 490 | EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask); |
| 491 | |
| 492 | void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) |
| 493 | { |
| 494 | kvm_ad_enabled = has_ad_bits; |
| 495 | |
| 496 | shadow_user_mask = VMX_EPT_READABLE_MASK; |
| 497 | shadow_accessed_mask = VMX_EPT_ACCESS_BIT; |
| 498 | shadow_dirty_mask = VMX_EPT_DIRTY_BIT; |
| 499 | shadow_nx_mask = 0ull; |
| 500 | shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; |
| 501 | /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ |
| 502 | shadow_present_mask = |
| 503 | (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; |
| 504 | |
| 505 | shadow_acc_track_mask = VMX_EPT_RWX_MASK; |
| 506 | shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; |
| 507 | shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; |
| 508 | |
| 509 | /* |
| 510 | * EPT Misconfigurations are generated if the value of bits 2:0 |
| 511 | * of an EPT paging-structure entry is 110b (write/execute). |
| 512 | */ |
| 513 | kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, |
| 514 | VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, access_mask: 0); |
| 515 | } |
| 516 | EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_ept_masks); |
| 517 | |
| 518 | void kvm_mmu_reset_all_pte_masks(void) |
| 519 | { |
| 520 | u8 low_phys_bits; |
| 521 | u64 mask; |
| 522 | |
| 523 | kvm_ad_enabled = true; |
| 524 | |
| 525 | /* |
| 526 | * If the CPU has 46 or less physical address bits, then set an |
| 527 | * appropriate mask to guard against L1TF attacks. Otherwise, it is |
| 528 | * assumed that the CPU is not vulnerable to L1TF. |
| 529 | * |
| 530 | * Some Intel CPUs address the L1 cache using more PA bits than are |
| 531 | * reported by CPUID. Use the PA width of the L1 cache when possible |
| 532 | * to achieve more effective mitigation, e.g. if system RAM overlaps |
| 533 | * the most significant bits of legal physical address space. |
| 534 | */ |
| 535 | shadow_nonpresent_or_rsvd_mask = 0; |
| 536 | low_phys_bits = boot_cpu_data.x86_phys_bits; |
| 537 | if (boot_cpu_has_bug(X86_BUG_L1TF) && |
| 538 | !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= |
| 539 | 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) { |
| 540 | low_phys_bits = boot_cpu_data.x86_cache_bits |
| 541 | - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; |
| 542 | shadow_nonpresent_or_rsvd_mask = |
| 543 | rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); |
| 544 | } |
| 545 | |
| 546 | shadow_nonpresent_or_rsvd_lower_gfn_mask = |
| 547 | GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); |
| 548 | |
| 549 | shadow_user_mask = PT_USER_MASK; |
| 550 | shadow_accessed_mask = PT_ACCESSED_MASK; |
| 551 | shadow_dirty_mask = PT_DIRTY_MASK; |
| 552 | shadow_nx_mask = PT64_NX_MASK; |
| 553 | shadow_x_mask = 0; |
| 554 | shadow_present_mask = PT_PRESENT_MASK; |
| 555 | |
| 556 | shadow_acc_track_mask = 0; |
| 557 | shadow_me_mask = 0; |
| 558 | shadow_me_value = 0; |
| 559 | |
| 560 | shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE; |
| 561 | shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE; |
| 562 | |
| 563 | /* |
| 564 | * Set a reserved PA bit in MMIO SPTEs to generate page faults with |
| 565 | * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT |
| 566 | * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports |
| 567 | * 52-bit physical addresses then there are no reserved PA bits in the |
| 568 | * PTEs and so the reserved PA approach must be disabled. |
| 569 | */ |
| 570 | if (kvm_host.maxphyaddr < 52) |
| 571 | mask = BIT_ULL(51) | PT_PRESENT_MASK; |
| 572 | else |
| 573 | mask = 0; |
| 574 | |
| 575 | kvm_mmu_set_mmio_spte_mask(mmio_value: mask, mmio_mask: mask, ACC_WRITE_MASK | ACC_USER_MASK); |
| 576 | } |
| 577 | |