1 | // SPDX-License-Identifier: GPL-2.0-only |
---|---|
2 | |
3 | #ifndef KVM_X86_MMU_SPTE_H |
4 | #define KVM_X86_MMU_SPTE_H |
5 | |
6 | #include <asm/vmx.h> |
7 | |
8 | #include "mmu.h" |
9 | #include "mmu_internal.h" |
10 | |
11 | /* |
12 | * A MMU present SPTE is backed by actual memory and may or may not be present |
13 | * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it |
14 | * is ignored by all flavors of SPTEs and checking a low bit often generates |
15 | * better code than for a high bit, e.g. 56+. MMU present checks are pervasive |
16 | * enough that the improved code generation is noticeable in KVM's footprint. |
17 | */ |
18 | #define SPTE_MMU_PRESENT_MASK BIT_ULL(11) |
19 | |
20 | /* |
21 | * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also |
22 | * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. |
23 | * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that |
24 | * must be employed for a given TDP SPTE. |
25 | * |
26 | * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE |
27 | * paging, including NPT PAE. This scheme works because legacy shadow paging |
28 | * is guaranteed to have A/D bits and write-protection is forced only for |
29 | * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it |
30 | * must be restricted to 64-bit KVM. |
31 | */ |
32 | #define SPTE_TDP_AD_SHIFT 52 |
33 | #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) |
34 | #define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) |
35 | #define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) |
36 | #define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT) |
37 | static_assert(SPTE_TDP_AD_ENABLED == 0); |
38 | |
39 | #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK |
40 | #define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) |
41 | #else |
42 | #define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) |
43 | #endif |
44 | |
45 | #define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ |
46 | | shadow_x_mask | shadow_nx_mask | shadow_me_mask) |
47 | |
48 | #define ACC_EXEC_MASK 1 |
49 | #define ACC_WRITE_MASK PT_WRITABLE_MASK |
50 | #define ACC_USER_MASK PT_USER_MASK |
51 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) |
52 | |
53 | /* The mask for the R/X bits in EPT PTEs */ |
54 | #define SPTE_EPT_READABLE_MASK 0x1ull |
55 | #define SPTE_EPT_EXECUTABLE_MASK 0x4ull |
56 | |
57 | #define SPTE_LEVEL_BITS 9 |
58 | #define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) |
59 | #define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) |
60 | #define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) |
61 | |
62 | /* |
63 | * The mask/shift to use for saving the original R/X bits when marking the PTE |
64 | * as not-present for access tracking purposes. We do not save the W bit as the |
65 | * PTEs being access tracked also need to be dirty tracked, so the W bit will be |
66 | * restored only when a write is attempted to the page. This mask obviously |
67 | * must not overlap the A/D type mask. |
68 | */ |
69 | #define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ |
70 | SPTE_EPT_EXECUTABLE_MASK) |
71 | #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 |
72 | #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ |
73 | SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) |
74 | static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); |
75 | |
76 | /* |
77 | * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given |
78 | * SPTE is write-protected. See is_writable_pte() for details. |
79 | */ |
80 | |
81 | /* Bits 9 and 10 are ignored by all non-EPT PTEs. */ |
82 | #define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9) |
83 | #define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10) |
84 | |
85 | /* |
86 | * Low ignored bits are at a premium for EPT, use high ignored bits, taking care |
87 | * to not overlap the A/D type mask or the saved access bits of access-tracked |
88 | * SPTEs when A/D bits are disabled. |
89 | */ |
90 | #define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) |
91 | #define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) |
92 | |
93 | static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); |
94 | static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); |
95 | static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); |
96 | static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); |
97 | |
98 | /* Defined only to keep the above static asserts readable. */ |
99 | #undef SHADOW_ACC_TRACK_SAVED_MASK |
100 | |
101 | /* |
102 | * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of |
103 | * the memslots generation and is derived as follows: |
104 | * |
105 | * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 |
106 | * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 |
107 | * |
108 | * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in |
109 | * the MMIO generation number, as doing so would require stealing a bit from |
110 | * the "real" generation number and thus effectively halve the maximum number |
111 | * of MMIO generations that can be handled before encountering a wrap (which |
112 | * requires a full MMU zap). The flag is instead explicitly queried when |
113 | * checking for MMIO spte cache hits. |
114 | */ |
115 | |
116 | #define MMIO_SPTE_GEN_LOW_START 3 |
117 | #define MMIO_SPTE_GEN_LOW_END 10 |
118 | |
119 | #define MMIO_SPTE_GEN_HIGH_START 52 |
120 | #define MMIO_SPTE_GEN_HIGH_END 62 |
121 | |
122 | #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ |
123 | MMIO_SPTE_GEN_LOW_START) |
124 | #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ |
125 | MMIO_SPTE_GEN_HIGH_START) |
126 | static_assert(!(SPTE_MMU_PRESENT_MASK & |
127 | (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); |
128 | |
129 | /* |
130 | * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the |
131 | * MMU-present bit. The generation obviously co-exists with the magic MMIO |
132 | * mask/value, and MMIO SPTEs are considered !MMU-present. |
133 | * |
134 | * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT |
135 | * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO |
136 | * and so they're off-limits for generation; additional checks ensure the mask |
137 | * doesn't overlap legal PA bits), and bit 63 (carved out for future usage). |
138 | */ |
139 | #define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0)) |
140 | static_assert(!(SPTE_MMIO_ALLOWED_MASK & |
141 | (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); |
142 | |
143 | #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) |
144 | #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) |
145 | |
146 | /* remember to adjust the comment above as well if you change these */ |
147 | static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); |
148 | |
149 | #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) |
150 | #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) |
151 | |
152 | #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) |
153 | |
154 | /* |
155 | * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress |
156 | * #VE and get EPT violations on non-present PTEs. We can use the |
157 | * same value also without TDX for both VMX and SVM: |
158 | * |
159 | * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored. |
160 | * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0) |
161 | * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1) |
162 | */ |
163 | #ifdef CONFIG_X86_64 |
164 | #define SHADOW_NONPRESENT_VALUE BIT_ULL(63) |
165 | static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); |
166 | #else |
167 | #define SHADOW_NONPRESENT_VALUE 0ULL |
168 | #endif |
169 | |
170 | |
171 | /* |
172 | * True if A/D bits are supported in hardware and are enabled by KVM. When |
173 | * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable |
174 | * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario |
175 | * where KVM is using A/D bits for L1, but not L2. |
176 | */ |
177 | extern bool __read_mostly kvm_ad_enabled; |
178 | |
179 | extern u64 __read_mostly shadow_host_writable_mask; |
180 | extern u64 __read_mostly shadow_mmu_writable_mask; |
181 | extern u64 __read_mostly shadow_nx_mask; |
182 | extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ |
183 | extern u64 __read_mostly shadow_user_mask; |
184 | extern u64 __read_mostly shadow_accessed_mask; |
185 | extern u64 __read_mostly shadow_dirty_mask; |
186 | extern u64 __read_mostly shadow_mmio_value; |
187 | extern u64 __read_mostly shadow_mmio_mask; |
188 | extern u64 __read_mostly shadow_mmio_access_mask; |
189 | extern u64 __read_mostly shadow_present_mask; |
190 | extern u64 __read_mostly shadow_me_value; |
191 | extern u64 __read_mostly shadow_me_mask; |
192 | |
193 | /* |
194 | * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED; |
195 | * shadow_acc_track_mask is the set of bits to be cleared in non-accessed |
196 | * pages. |
197 | */ |
198 | extern u64 __read_mostly shadow_acc_track_mask; |
199 | |
200 | /* |
201 | * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order |
202 | * to guard against L1TF attacks. |
203 | */ |
204 | extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; |
205 | |
206 | /* |
207 | * The number of high-order 1 bits to use in the mask above. |
208 | */ |
209 | #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 |
210 | |
211 | /* |
212 | * If a thread running without exclusive control of the MMU lock must perform a |
213 | * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a |
214 | * non-present intermediate value. Other threads which encounter this value |
215 | * should not modify the SPTE. |
216 | * |
217 | * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on |
218 | * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF |
219 | * vulnerability. |
220 | * |
221 | * Only used by the TDP MMU. |
222 | */ |
223 | #define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) |
224 | |
225 | /* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ |
226 | static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); |
227 | |
228 | static inline bool is_frozen_spte(u64 spte) |
229 | { |
230 | return spte == FROZEN_SPTE; |
231 | } |
232 | |
233 | /* Get an SPTE's index into its parent's page table (and the spt array). */ |
234 | static inline int spte_index(u64 *sptep) |
235 | { |
236 | return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); |
237 | } |
238 | |
239 | /* |
240 | * In some cases, we need to preserve the GFN of a non-present or reserved |
241 | * SPTE when we usurp the upper five bits of the physical address space to |
242 | * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll |
243 | * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask |
244 | * left into the reserved bits, i.e. the GFN in the SPTE will be split into |
245 | * high and low parts. This mask covers the lower bits of the GFN. |
246 | */ |
247 | extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; |
248 | |
249 | static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) |
250 | { |
251 | struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); |
252 | |
253 | return (struct kvm_mmu_page *)page_private(page); |
254 | } |
255 | |
256 | static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte) |
257 | { |
258 | return to_shadow_page(shadow_page: spte & SPTE_BASE_ADDR_MASK); |
259 | } |
260 | |
261 | static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) |
262 | { |
263 | return to_shadow_page(__pa(sptep)); |
264 | } |
265 | |
266 | static inline struct kvm_mmu_page *root_to_sp(hpa_t root) |
267 | { |
268 | if (kvm_mmu_is_dummy_root(shadow_page: root)) |
269 | return NULL; |
270 | |
271 | /* |
272 | * The "root" may be a special root, e.g. a PAE entry, treat it as a |
273 | * SPTE to ensure any non-PA bits are dropped. |
274 | */ |
275 | return spte_to_child_sp(spte: root); |
276 | } |
277 | |
278 | static inline bool is_mirror_sptep(tdp_ptep_t sptep) |
279 | { |
280 | return is_mirror_sp(sp: sptep_to_sp(rcu_dereference(sptep))); |
281 | } |
282 | |
283 | static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) |
284 | { |
285 | return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && |
286 | likely(enable_mmio_caching); |
287 | } |
288 | |
289 | static inline bool is_shadow_present_pte(u64 pte) |
290 | { |
291 | return !!(pte & SPTE_MMU_PRESENT_MASK); |
292 | } |
293 | |
294 | static inline bool is_ept_ve_possible(u64 spte) |
295 | { |
296 | return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) && |
297 | !(spte & VMX_EPT_SUPPRESS_VE_BIT) && |
298 | (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; |
299 | } |
300 | |
301 | static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) |
302 | { |
303 | return sp->role.ad_disabled; |
304 | } |
305 | |
306 | static inline bool spte_ad_enabled(u64 spte) |
307 | { |
308 | KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); |
309 | return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; |
310 | } |
311 | |
312 | static inline bool spte_ad_need_write_protect(u64 spte) |
313 | { |
314 | KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); |
315 | /* |
316 | * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', |
317 | * and non-TDP SPTEs will never set these bits. Optimize for 64-bit |
318 | * TDP and do the A/D type check unconditionally. |
319 | */ |
320 | return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; |
321 | } |
322 | |
323 | static inline bool is_access_track_spte(u64 spte) |
324 | { |
325 | return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; |
326 | } |
327 | |
328 | static inline bool is_large_pte(u64 pte) |
329 | { |
330 | return pte & PT_PAGE_SIZE_MASK; |
331 | } |
332 | |
333 | static inline bool is_last_spte(u64 pte, int level) |
334 | { |
335 | return (level == PG_LEVEL_4K) || is_large_pte(pte); |
336 | } |
337 | |
338 | static inline bool is_executable_pte(u64 spte) |
339 | { |
340 | return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; |
341 | } |
342 | |
343 | static inline kvm_pfn_t spte_to_pfn(u64 pte) |
344 | { |
345 | return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; |
346 | } |
347 | |
348 | static inline bool is_accessed_spte(u64 spte) |
349 | { |
350 | return spte & shadow_accessed_mask; |
351 | } |
352 | |
353 | static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, |
354 | int level) |
355 | { |
356 | int bit7 = (pte >> 7) & 1; |
357 | |
358 | return rsvd_check->rsvd_bits_mask[bit7][level-1]; |
359 | } |
360 | |
361 | static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, |
362 | u64 pte, int level) |
363 | { |
364 | return pte & get_rsvd_bits(rsvd_check, pte, level); |
365 | } |
366 | |
367 | static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, |
368 | u64 pte) |
369 | { |
370 | return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); |
371 | } |
372 | |
373 | static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, |
374 | u64 spte, int level) |
375 | { |
376 | return __is_bad_mt_xwr(rsvd_check, pte: spte) || |
377 | __is_rsvd_bits_set(rsvd_check, pte: spte, level); |
378 | } |
379 | |
380 | /* |
381 | * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: |
382 | * |
383 | * 1. To intercept writes for dirty logging. KVM write-protects huge pages |
384 | * so that they can be split down into the dirty logging |
385 | * granularity (4KiB) whenever the guest writes to them. KVM also |
386 | * write-protects 4KiB pages so that writes can be recorded in the dirty log |
387 | * (e.g. if not using PML). SPTEs are write-protected for dirty logging |
388 | * during the VM-ioctls that enable dirty logging. |
389 | * |
390 | * 2. To intercept writes to guest page tables that KVM is shadowing. When a |
391 | * guest writes to its page table the corresponding shadow page table will |
392 | * be marked "unsync". That way KVM knows which shadow page tables need to |
393 | * be updated on the next TLB flush, INVLPG, etc. and which do not. |
394 | * |
395 | * 3. To prevent guest writes to read-only memory, such as for memory in a |
396 | * read-only memslot or guest memory backed by a read-only VMA. Writes to |
397 | * such pages are disallowed entirely. |
398 | * |
399 | * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this |
400 | * case, the SPTE is access-protected, not just write-protected! |
401 | * |
402 | * For cases #1 and #4, KVM can safely make such SPTEs writable without taking |
403 | * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. |
404 | * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits |
405 | * in the SPTE: |
406 | * |
407 | * shadow_mmu_writable_mask, aka MMU-writable - |
408 | * Cleared on SPTEs that KVM is currently write-protecting for shadow paging |
409 | * purposes (case 2 above). |
410 | * |
411 | * shadow_host_writable_mask, aka Host-writable - |
412 | * Cleared on SPTEs that are not host-writable (case 3 above) |
413 | * |
414 | * Note, not all possible combinations of PT_WRITABLE_MASK, |
415 | * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given |
416 | * SPTE can be in only one of the following states, which map to the |
417 | * aforementioned 3 cases: |
418 | * |
419 | * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK |
420 | * ------------------------- | ------------------------ | ---------------- |
421 | * 1 | 1 | 1 (writable) |
422 | * 1 | 1 | 0 (case 1) |
423 | * 1 | 0 | 0 (case 2) |
424 | * 0 | 0 | 0 (case 3) |
425 | * |
426 | * The valid combinations of these bits are checked by |
427 | * check_spte_writable_invariants() whenever an SPTE is modified. |
428 | * |
429 | * Clearing the MMU-writable bit is always done under the MMU lock and always |
430 | * accompanied by a TLB flush before dropping the lock to avoid corrupting the |
431 | * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging |
432 | * (which does not clear the MMU-writable bit), does not flush TLBs before |
433 | * dropping the lock, as it only needs to synchronize guest writes with the |
434 | * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for |
435 | * access-tracking via the clear_young() MMU notifier also does not flush TLBs. |
436 | * |
437 | * So, there is the problem: clearing the MMU-writable bit can encounter a |
438 | * write-protected SPTE while CPUs still have writable mappings for that SPTE |
439 | * cached in their TLB. To address this, KVM always flushes TLBs when |
440 | * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE. |
441 | * |
442 | * The Host-writable bit is not modified on present SPTEs, it is only set or |
443 | * cleared when an SPTE is first faulted in from non-present and then remains |
444 | * immutable. |
445 | */ |
446 | static inline bool is_writable_pte(unsigned long pte) |
447 | { |
448 | return pte & PT_WRITABLE_MASK; |
449 | } |
450 | |
451 | /* Note: spte must be a shadow-present leaf SPTE. */ |
452 | static inline void check_spte_writable_invariants(u64 spte) |
453 | { |
454 | if (spte & shadow_mmu_writable_mask) |
455 | WARN_ONCE(!(spte & shadow_host_writable_mask), |
456 | KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", |
457 | spte); |
458 | else |
459 | WARN_ONCE(is_writable_pte(spte), |
460 | KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); |
461 | } |
462 | |
463 | static inline bool is_mmu_writable_spte(u64 spte) |
464 | { |
465 | return spte & shadow_mmu_writable_mask; |
466 | } |
467 | |
468 | /* |
469 | * Returns true if the access indicated by @fault is allowed by the existing |
470 | * SPTE protections. Note, the caller is responsible for checking that the |
471 | * SPTE is a shadow-present, leaf SPTE (either before or after). |
472 | */ |
473 | static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) |
474 | { |
475 | if (fault->exec) |
476 | return is_executable_pte(spte); |
477 | |
478 | if (fault->write) |
479 | return is_writable_pte(pte: spte); |
480 | |
481 | /* Fault was on Read access */ |
482 | return spte & PT_PRESENT_MASK; |
483 | } |
484 | |
485 | /* |
486 | * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for |
487 | * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, |
488 | * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM |
489 | * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, |
490 | * not whether or not SPTEs were modified, i.e. only the write-tracking case |
491 | * needs to flush at the time the SPTE is modified, before dropping mmu_lock. |
492 | * |
493 | * Don't flush if the Accessed bit is cleared, as access tracking tolerates |
494 | * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a |
495 | * mmu_notifier.clear_flush_young() event. |
496 | * |
497 | * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally |
498 | * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and |
499 | * when clearing dirty logs, KVM flushes based on whether or not dirty entries |
500 | * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. |
501 | * |
502 | * Note, this logic only applies to shadow-present leaf SPTEs. The caller is |
503 | * responsible for checking that the old SPTE is shadow-present, and is also |
504 | * responsible for determining whether or not a TLB flush is required when |
505 | * modifying a shadow-present non-leaf SPTE. |
506 | */ |
507 | static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) |
508 | { |
509 | return is_mmu_writable_spte(spte: old_spte) && !is_mmu_writable_spte(spte: new_spte); |
510 | } |
511 | |
512 | static inline u64 get_mmio_spte_generation(u64 spte) |
513 | { |
514 | u64 gen; |
515 | |
516 | gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT; |
517 | gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT; |
518 | return gen; |
519 | } |
520 | |
521 | bool spte_needs_atomic_update(u64 spte); |
522 | |
523 | bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
524 | const struct kvm_memory_slot *slot, |
525 | unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, |
526 | u64 old_spte, bool prefetch, bool synchronizing, |
527 | bool host_writable, u64 *new_spte); |
528 | u64 make_small_spte(struct kvm *kvm, u64 huge_spte, |
529 | union kvm_mmu_page_role role, int index); |
530 | u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); |
531 | u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); |
532 | u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); |
533 | u64 mark_spte_for_access_track(u64 spte); |
534 | |
535 | /* Restore an acc-track PTE back to a regular PTE */ |
536 | static inline u64 restore_acc_track_spte(u64 spte) |
537 | { |
538 | u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) |
539 | & SHADOW_ACC_TRACK_SAVED_BITS_MASK; |
540 | |
541 | spte &= ~shadow_acc_track_mask; |
542 | spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << |
543 | SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); |
544 | spte |= saved_bits; |
545 | |
546 | return spte; |
547 | } |
548 | |
549 | void __init kvm_mmu_spte_module_init(void); |
550 | void kvm_mmu_reset_all_pte_masks(void); |
551 | |
552 | #endif |
553 |
Definitions
- is_frozen_spte
- spte_index
- to_shadow_page
- spte_to_child_sp
- sptep_to_sp
- root_to_sp
- is_mirror_sptep
- is_mmio_spte
- is_shadow_present_pte
- is_ept_ve_possible
- sp_ad_disabled
- spte_ad_enabled
- spte_ad_need_write_protect
- is_access_track_spte
- is_large_pte
- is_last_spte
- is_executable_pte
- spte_to_pfn
- is_accessed_spte
- get_rsvd_bits
- __is_rsvd_bits_set
- __is_bad_mt_xwr
- is_rsvd_spte
- is_writable_pte
- check_spte_writable_invariants
- is_mmu_writable_spte
- is_access_allowed
- leaf_spte_change_needs_tlb_flush
- get_mmio_spte_generation
Improve your Profiling and Debugging skills
Find out more