| 1 | // SPDX-License-Identifier: GPL-2.0-only |
|---|---|
| 2 | |
| 3 | #ifndef KVM_X86_MMU_SPTE_H |
| 4 | #define KVM_X86_MMU_SPTE_H |
| 5 | |
| 6 | #include <asm/vmx.h> |
| 7 | |
| 8 | #include "mmu.h" |
| 9 | #include "mmu_internal.h" |
| 10 | |
| 11 | /* |
| 12 | * A MMU present SPTE is backed by actual memory and may or may not be present |
| 13 | * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it |
| 14 | * is ignored by all flavors of SPTEs and checking a low bit often generates |
| 15 | * better code than for a high bit, e.g. 56+. MMU present checks are pervasive |
| 16 | * enough that the improved code generation is noticeable in KVM's footprint. |
| 17 | */ |
| 18 | #define SPTE_MMU_PRESENT_MASK BIT_ULL(11) |
| 19 | |
| 20 | /* |
| 21 | * TDP SPTEs (more specifically, EPT SPTEs) may not have A/D bits, and may also |
| 22 | * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. |
| 23 | * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that |
| 24 | * must be employed for a given TDP SPTE. |
| 25 | * |
| 26 | * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE |
| 27 | * paging, including NPT PAE. This scheme works because legacy shadow paging |
| 28 | * is guaranteed to have A/D bits and write-protection is forced only for |
| 29 | * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it |
| 30 | * must be restricted to 64-bit KVM. |
| 31 | */ |
| 32 | #define SPTE_TDP_AD_SHIFT 52 |
| 33 | #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) |
| 34 | #define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) |
| 35 | #define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) |
| 36 | #define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT) |
| 37 | static_assert(SPTE_TDP_AD_ENABLED == 0); |
| 38 | |
| 39 | #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK |
| 40 | #define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) |
| 41 | #else |
| 42 | #define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) |
| 43 | #endif |
| 44 | |
| 45 | #define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ |
| 46 | | shadow_x_mask | shadow_nx_mask | shadow_me_mask) |
| 47 | |
| 48 | #define ACC_EXEC_MASK 1 |
| 49 | #define ACC_WRITE_MASK PT_WRITABLE_MASK |
| 50 | #define ACC_USER_MASK PT_USER_MASK |
| 51 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) |
| 52 | |
| 53 | /* The mask for the R/X bits in EPT PTEs */ |
| 54 | #define SPTE_EPT_READABLE_MASK 0x1ull |
| 55 | #define SPTE_EPT_EXECUTABLE_MASK 0x4ull |
| 56 | |
| 57 | #define SPTE_LEVEL_BITS 9 |
| 58 | #define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) |
| 59 | #define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) |
| 60 | #define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) |
| 61 | |
| 62 | /* |
| 63 | * The mask/shift to use for saving the original R/X bits when marking the PTE |
| 64 | * as not-present for access tracking purposes. We do not save the W bit as the |
| 65 | * PTEs being access tracked also need to be dirty tracked, so the W bit will be |
| 66 | * restored only when a write is attempted to the page. This mask obviously |
| 67 | * must not overlap the A/D type mask. |
| 68 | */ |
| 69 | #define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ |
| 70 | SPTE_EPT_EXECUTABLE_MASK) |
| 71 | #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 |
| 72 | #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ |
| 73 | SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) |
| 74 | static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); |
| 75 | |
| 76 | /* |
| 77 | * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given |
| 78 | * SPTE is write-protected. See is_writable_pte() for details. |
| 79 | */ |
| 80 | |
| 81 | /* Bits 9 and 10 are ignored by all non-EPT PTEs. */ |
| 82 | #define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9) |
| 83 | #define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10) |
| 84 | |
| 85 | /* |
| 86 | * Low ignored bits are at a premium for EPT, use high ignored bits, taking care |
| 87 | * to not overlap the A/D type mask or the saved access bits of access-tracked |
| 88 | * SPTEs when A/D bits are disabled. |
| 89 | */ |
| 90 | #define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) |
| 91 | #define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) |
| 92 | |
| 93 | static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); |
| 94 | static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); |
| 95 | static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); |
| 96 | static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); |
| 97 | |
| 98 | /* Defined only to keep the above static asserts readable. */ |
| 99 | #undef SHADOW_ACC_TRACK_SAVED_MASK |
| 100 | |
| 101 | /* |
| 102 | * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of |
| 103 | * the memslots generation and is derived as follows: |
| 104 | * |
| 105 | * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 |
| 106 | * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 |
| 107 | * |
| 108 | * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in |
| 109 | * the MMIO generation number, as doing so would require stealing a bit from |
| 110 | * the "real" generation number and thus effectively halve the maximum number |
| 111 | * of MMIO generations that can be handled before encountering a wrap (which |
| 112 | * requires a full MMU zap). The flag is instead explicitly queried when |
| 113 | * checking for MMIO spte cache hits. |
| 114 | */ |
| 115 | |
| 116 | #define MMIO_SPTE_GEN_LOW_START 3 |
| 117 | #define MMIO_SPTE_GEN_LOW_END 10 |
| 118 | |
| 119 | #define MMIO_SPTE_GEN_HIGH_START 52 |
| 120 | #define MMIO_SPTE_GEN_HIGH_END 62 |
| 121 | |
| 122 | #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ |
| 123 | MMIO_SPTE_GEN_LOW_START) |
| 124 | #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ |
| 125 | MMIO_SPTE_GEN_HIGH_START) |
| 126 | static_assert(!(SPTE_MMU_PRESENT_MASK & |
| 127 | (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); |
| 128 | |
| 129 | /* |
| 130 | * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the |
| 131 | * MMU-present bit. The generation obviously co-exists with the magic MMIO |
| 132 | * mask/value, and MMIO SPTEs are considered !MMU-present. |
| 133 | * |
| 134 | * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT |
| 135 | * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO |
| 136 | * and so they're off-limits for generation; additional checks ensure the mask |
| 137 | * doesn't overlap legal PA bits), and bit 63 (carved out for future usage). |
| 138 | */ |
| 139 | #define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0)) |
| 140 | static_assert(!(SPTE_MMIO_ALLOWED_MASK & |
| 141 | (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); |
| 142 | |
| 143 | #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) |
| 144 | #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) |
| 145 | |
| 146 | /* remember to adjust the comment above as well if you change these */ |
| 147 | static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); |
| 148 | |
| 149 | #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) |
| 150 | #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) |
| 151 | |
| 152 | #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) |
| 153 | |
| 154 | /* |
| 155 | * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress |
| 156 | * #VE and get EPT violations on non-present PTEs. We can use the |
| 157 | * same value also without TDX for both VMX and SVM: |
| 158 | * |
| 159 | * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored. |
| 160 | * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0) |
| 161 | * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1) |
| 162 | */ |
| 163 | #ifdef CONFIG_X86_64 |
| 164 | #define SHADOW_NONPRESENT_VALUE BIT_ULL(63) |
| 165 | static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); |
| 166 | #else |
| 167 | #define SHADOW_NONPRESENT_VALUE 0ULL |
| 168 | #endif |
| 169 | |
| 170 | |
| 171 | /* |
| 172 | * True if A/D bits are supported in hardware and are enabled by KVM. When |
| 173 | * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable |
| 174 | * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario |
| 175 | * where KVM is using A/D bits for L1, but not L2. |
| 176 | */ |
| 177 | extern bool __read_mostly kvm_ad_enabled; |
| 178 | |
| 179 | extern u64 __read_mostly shadow_host_writable_mask; |
| 180 | extern u64 __read_mostly shadow_mmu_writable_mask; |
| 181 | extern u64 __read_mostly shadow_nx_mask; |
| 182 | extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ |
| 183 | extern u64 __read_mostly shadow_user_mask; |
| 184 | extern u64 __read_mostly shadow_accessed_mask; |
| 185 | extern u64 __read_mostly shadow_dirty_mask; |
| 186 | extern u64 __read_mostly shadow_mmio_value; |
| 187 | extern u64 __read_mostly shadow_mmio_mask; |
| 188 | extern u64 __read_mostly shadow_mmio_access_mask; |
| 189 | extern u64 __read_mostly shadow_present_mask; |
| 190 | extern u64 __read_mostly shadow_me_value; |
| 191 | extern u64 __read_mostly shadow_me_mask; |
| 192 | |
| 193 | /* |
| 194 | * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED; |
| 195 | * shadow_acc_track_mask is the set of bits to be cleared in non-accessed |
| 196 | * pages. |
| 197 | */ |
| 198 | extern u64 __read_mostly shadow_acc_track_mask; |
| 199 | |
| 200 | /* |
| 201 | * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order |
| 202 | * to guard against L1TF attacks. |
| 203 | */ |
| 204 | extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; |
| 205 | |
| 206 | /* |
| 207 | * The number of high-order 1 bits to use in the mask above. |
| 208 | */ |
| 209 | #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 |
| 210 | |
| 211 | /* |
| 212 | * If a thread running without exclusive control of the MMU lock must perform a |
| 213 | * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a |
| 214 | * non-present intermediate value. Other threads which encounter this value |
| 215 | * should not modify the SPTE. |
| 216 | * |
| 217 | * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on |
| 218 | * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF |
| 219 | * vulnerability. |
| 220 | * |
| 221 | * Only used by the TDP MMU. |
| 222 | */ |
| 223 | #define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) |
| 224 | |
| 225 | /* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ |
| 226 | static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); |
| 227 | |
| 228 | static inline bool is_frozen_spte(u64 spte) |
| 229 | { |
| 230 | return spte == FROZEN_SPTE; |
| 231 | } |
| 232 | |
| 233 | /* Get an SPTE's index into its parent's page table (and the spt array). */ |
| 234 | static inline int spte_index(u64 *sptep) |
| 235 | { |
| 236 | return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); |
| 237 | } |
| 238 | |
| 239 | /* |
| 240 | * In some cases, we need to preserve the GFN of a non-present or reserved |
| 241 | * SPTE when we usurp the upper five bits of the physical address space to |
| 242 | * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll |
| 243 | * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask |
| 244 | * left into the reserved bits, i.e. the GFN in the SPTE will be split into |
| 245 | * high and low parts. This mask covers the lower bits of the GFN. |
| 246 | */ |
| 247 | extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; |
| 248 | |
| 249 | static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) |
| 250 | { |
| 251 | struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); |
| 252 | |
| 253 | return (struct kvm_mmu_page *)page_private(page); |
| 254 | } |
| 255 | |
| 256 | static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte) |
| 257 | { |
| 258 | return to_shadow_page(shadow_page: spte & SPTE_BASE_ADDR_MASK); |
| 259 | } |
| 260 | |
| 261 | static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) |
| 262 | { |
| 263 | return to_shadow_page(__pa(sptep)); |
| 264 | } |
| 265 | |
| 266 | static inline struct kvm_mmu_page *root_to_sp(hpa_t root) |
| 267 | { |
| 268 | if (kvm_mmu_is_dummy_root(shadow_page: root)) |
| 269 | return NULL; |
| 270 | |
| 271 | /* |
| 272 | * The "root" may be a special root, e.g. a PAE entry, treat it as a |
| 273 | * SPTE to ensure any non-PA bits are dropped. |
| 274 | */ |
| 275 | return spte_to_child_sp(spte: root); |
| 276 | } |
| 277 | |
| 278 | static inline bool is_mirror_sptep(tdp_ptep_t sptep) |
| 279 | { |
| 280 | return is_mirror_sp(sp: sptep_to_sp(rcu_dereference(sptep))); |
| 281 | } |
| 282 | |
| 283 | static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) |
| 284 | { |
| 285 | return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && |
| 286 | likely(enable_mmio_caching); |
| 287 | } |
| 288 | |
| 289 | static inline bool is_shadow_present_pte(u64 pte) |
| 290 | { |
| 291 | return !!(pte & SPTE_MMU_PRESENT_MASK); |
| 292 | } |
| 293 | |
| 294 | static inline bool is_ept_ve_possible(u64 spte) |
| 295 | { |
| 296 | return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) && |
| 297 | !(spte & VMX_EPT_SUPPRESS_VE_BIT) && |
| 298 | (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; |
| 299 | } |
| 300 | |
| 301 | static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) |
| 302 | { |
| 303 | return sp->role.ad_disabled; |
| 304 | } |
| 305 | |
| 306 | static inline bool spte_ad_enabled(u64 spte) |
| 307 | { |
| 308 | KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); |
| 309 | return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; |
| 310 | } |
| 311 | |
| 312 | static inline bool spte_ad_need_write_protect(u64 spte) |
| 313 | { |
| 314 | KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); |
| 315 | /* |
| 316 | * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', |
| 317 | * and non-TDP SPTEs will never set these bits. Optimize for 64-bit |
| 318 | * TDP and do the A/D type check unconditionally. |
| 319 | */ |
| 320 | return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; |
| 321 | } |
| 322 | |
| 323 | static inline bool is_access_track_spte(u64 spte) |
| 324 | { |
| 325 | return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; |
| 326 | } |
| 327 | |
| 328 | static inline bool is_large_pte(u64 pte) |
| 329 | { |
| 330 | return pte & PT_PAGE_SIZE_MASK; |
| 331 | } |
| 332 | |
| 333 | static inline bool is_last_spte(u64 pte, int level) |
| 334 | { |
| 335 | return (level == PG_LEVEL_4K) || is_large_pte(pte); |
| 336 | } |
| 337 | |
| 338 | static inline bool is_executable_pte(u64 spte) |
| 339 | { |
| 340 | return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; |
| 341 | } |
| 342 | |
| 343 | static inline kvm_pfn_t spte_to_pfn(u64 pte) |
| 344 | { |
| 345 | return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; |
| 346 | } |
| 347 | |
| 348 | static inline bool is_accessed_spte(u64 spte) |
| 349 | { |
| 350 | return spte & shadow_accessed_mask; |
| 351 | } |
| 352 | |
| 353 | static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, |
| 354 | int level) |
| 355 | { |
| 356 | int bit7 = (pte >> 7) & 1; |
| 357 | |
| 358 | return rsvd_check->rsvd_bits_mask[bit7][level-1]; |
| 359 | } |
| 360 | |
| 361 | static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, |
| 362 | u64 pte, int level) |
| 363 | { |
| 364 | return pte & get_rsvd_bits(rsvd_check, pte, level); |
| 365 | } |
| 366 | |
| 367 | static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, |
| 368 | u64 pte) |
| 369 | { |
| 370 | return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); |
| 371 | } |
| 372 | |
| 373 | static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, |
| 374 | u64 spte, int level) |
| 375 | { |
| 376 | return __is_bad_mt_xwr(rsvd_check, pte: spte) || |
| 377 | __is_rsvd_bits_set(rsvd_check, pte: spte, level); |
| 378 | } |
| 379 | |
| 380 | /* |
| 381 | * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: |
| 382 | * |
| 383 | * 1. To intercept writes for dirty logging. KVM write-protects huge pages |
| 384 | * so that they can be split down into the dirty logging |
| 385 | * granularity (4KiB) whenever the guest writes to them. KVM also |
| 386 | * write-protects 4KiB pages so that writes can be recorded in the dirty log |
| 387 | * (e.g. if not using PML). SPTEs are write-protected for dirty logging |
| 388 | * during the VM-ioctls that enable dirty logging. |
| 389 | * |
| 390 | * 2. To intercept writes to guest page tables that KVM is shadowing. When a |
| 391 | * guest writes to its page table the corresponding shadow page table will |
| 392 | * be marked "unsync". That way KVM knows which shadow page tables need to |
| 393 | * be updated on the next TLB flush, INVLPG, etc. and which do not. |
| 394 | * |
| 395 | * 3. To prevent guest writes to read-only memory, such as for memory in a |
| 396 | * read-only memslot or guest memory backed by a read-only VMA. Writes to |
| 397 | * such pages are disallowed entirely. |
| 398 | * |
| 399 | * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this |
| 400 | * case, the SPTE is access-protected, not just write-protected! |
| 401 | * |
| 402 | * For cases #1 and #4, KVM can safely make such SPTEs writable without taking |
| 403 | * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. |
| 404 | * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits |
| 405 | * in the SPTE: |
| 406 | * |
| 407 | * shadow_mmu_writable_mask, aka MMU-writable - |
| 408 | * Cleared on SPTEs that KVM is currently write-protecting for shadow paging |
| 409 | * purposes (case 2 above). |
| 410 | * |
| 411 | * shadow_host_writable_mask, aka Host-writable - |
| 412 | * Cleared on SPTEs that are not host-writable (case 3 above) |
| 413 | * |
| 414 | * Note, not all possible combinations of PT_WRITABLE_MASK, |
| 415 | * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given |
| 416 | * SPTE can be in only one of the following states, which map to the |
| 417 | * aforementioned 3 cases: |
| 418 | * |
| 419 | * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK |
| 420 | * ------------------------- | ------------------------ | ---------------- |
| 421 | * 1 | 1 | 1 (writable) |
| 422 | * 1 | 1 | 0 (case 1) |
| 423 | * 1 | 0 | 0 (case 2) |
| 424 | * 0 | 0 | 0 (case 3) |
| 425 | * |
| 426 | * The valid combinations of these bits are checked by |
| 427 | * check_spte_writable_invariants() whenever an SPTE is modified. |
| 428 | * |
| 429 | * Clearing the MMU-writable bit is always done under the MMU lock and always |
| 430 | * accompanied by a TLB flush before dropping the lock to avoid corrupting the |
| 431 | * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging |
| 432 | * (which does not clear the MMU-writable bit), does not flush TLBs before |
| 433 | * dropping the lock, as it only needs to synchronize guest writes with the |
| 434 | * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for |
| 435 | * access-tracking via the clear_young() MMU notifier also does not flush TLBs. |
| 436 | * |
| 437 | * So, there is the problem: clearing the MMU-writable bit can encounter a |
| 438 | * write-protected SPTE while CPUs still have writable mappings for that SPTE |
| 439 | * cached in their TLB. To address this, KVM always flushes TLBs when |
| 440 | * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE. |
| 441 | * |
| 442 | * The Host-writable bit is not modified on present SPTEs, it is only set or |
| 443 | * cleared when an SPTE is first faulted in from non-present and then remains |
| 444 | * immutable. |
| 445 | */ |
| 446 | static inline bool is_writable_pte(unsigned long pte) |
| 447 | { |
| 448 | return pte & PT_WRITABLE_MASK; |
| 449 | } |
| 450 | |
| 451 | /* Note: spte must be a shadow-present leaf SPTE. */ |
| 452 | static inline void check_spte_writable_invariants(u64 spte) |
| 453 | { |
| 454 | if (spte & shadow_mmu_writable_mask) |
| 455 | WARN_ONCE(!(spte & shadow_host_writable_mask), |
| 456 | KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", |
| 457 | spte); |
| 458 | else |
| 459 | WARN_ONCE(is_writable_pte(spte), |
| 460 | KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); |
| 461 | } |
| 462 | |
| 463 | static inline bool is_mmu_writable_spte(u64 spte) |
| 464 | { |
| 465 | return spte & shadow_mmu_writable_mask; |
| 466 | } |
| 467 | |
| 468 | /* |
| 469 | * Returns true if the access indicated by @fault is allowed by the existing |
| 470 | * SPTE protections. Note, the caller is responsible for checking that the |
| 471 | * SPTE is a shadow-present, leaf SPTE (either before or after). |
| 472 | */ |
| 473 | static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) |
| 474 | { |
| 475 | if (fault->exec) |
| 476 | return is_executable_pte(spte); |
| 477 | |
| 478 | if (fault->write) |
| 479 | return is_writable_pte(pte: spte); |
| 480 | |
| 481 | /* Fault was on Read access */ |
| 482 | return spte & PT_PRESENT_MASK; |
| 483 | } |
| 484 | |
| 485 | /* |
| 486 | * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for |
| 487 | * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, |
| 488 | * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM |
| 489 | * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, |
| 490 | * not whether or not SPTEs were modified, i.e. only the write-tracking case |
| 491 | * needs to flush at the time the SPTE is modified, before dropping mmu_lock. |
| 492 | * |
| 493 | * Don't flush if the Accessed bit is cleared, as access tracking tolerates |
| 494 | * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a |
| 495 | * mmu_notifier.clear_flush_young() event. |
| 496 | * |
| 497 | * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally |
| 498 | * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and |
| 499 | * when clearing dirty logs, KVM flushes based on whether or not dirty entries |
| 500 | * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. |
| 501 | * |
| 502 | * Note, this logic only applies to shadow-present leaf SPTEs. The caller is |
| 503 | * responsible for checking that the old SPTE is shadow-present, and is also |
| 504 | * responsible for determining whether or not a TLB flush is required when |
| 505 | * modifying a shadow-present non-leaf SPTE. |
| 506 | */ |
| 507 | static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) |
| 508 | { |
| 509 | return is_mmu_writable_spte(spte: old_spte) && !is_mmu_writable_spte(spte: new_spte); |
| 510 | } |
| 511 | |
| 512 | static inline u64 get_mmio_spte_generation(u64 spte) |
| 513 | { |
| 514 | u64 gen; |
| 515 | |
| 516 | gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT; |
| 517 | gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT; |
| 518 | return gen; |
| 519 | } |
| 520 | |
| 521 | bool spte_needs_atomic_update(u64 spte); |
| 522 | |
| 523 | bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
| 524 | const struct kvm_memory_slot *slot, |
| 525 | unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, |
| 526 | u64 old_spte, bool prefetch, bool synchronizing, |
| 527 | bool host_writable, u64 *new_spte); |
| 528 | u64 make_small_spte(struct kvm *kvm, u64 huge_spte, |
| 529 | union kvm_mmu_page_role role, int index); |
| 530 | u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); |
| 531 | u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); |
| 532 | u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); |
| 533 | u64 mark_spte_for_access_track(u64 spte); |
| 534 | |
| 535 | /* Restore an acc-track PTE back to a regular PTE */ |
| 536 | static inline u64 restore_acc_track_spte(u64 spte) |
| 537 | { |
| 538 | u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) |
| 539 | & SHADOW_ACC_TRACK_SAVED_BITS_MASK; |
| 540 | |
| 541 | spte &= ~shadow_acc_track_mask; |
| 542 | spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << |
| 543 | SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); |
| 544 | spte |= saved_bits; |
| 545 | |
| 546 | return spte; |
| 547 | } |
| 548 | |
| 549 | void __init kvm_mmu_spte_module_init(void); |
| 550 | void kvm_mmu_reset_all_pte_masks(void); |
| 551 | |
| 552 | #endif |
| 553 |
Definitions
- is_frozen_spte
- spte_index
- to_shadow_page
- spte_to_child_sp
- sptep_to_sp
- root_to_sp
- is_mirror_sptep
- is_mmio_spte
- is_shadow_present_pte
- is_ept_ve_possible
- sp_ad_disabled
- spte_ad_enabled
- spte_ad_need_write_protect
- is_access_track_spte
- is_large_pte
- is_last_spte
- is_executable_pte
- spte_to_pfn
- is_accessed_spte
- get_rsvd_bits
- __is_rsvd_bits_set
- __is_bad_mt_xwr
- is_rsvd_spte
- is_writable_pte
- check_spte_writable_invariants
- is_mmu_writable_spte
- is_access_allowed
- leaf_spte_change_needs_tlb_flush
- get_mmio_spte_generation
Improve your Profiling and Debugging skills
Find out more
