/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

/*
 * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
 * as well as guest EPT tables, so the code in this file is compiled thrice,
 * once per guest PTE type.  The per-type defines are #undef'd at the end.
 */
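/*
 * For reference, mmu.c instantiates this template along the following lines
 * (the exact PTTYPE_EPT value is an internal detail of mmu.c):
 *
 *	#define PTTYPE_EPT 18
 *	#define PTTYPE PTTYPE_EPT
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 */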

#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
#define FNAME(name) paging##64_##name
#define PT_LEVEL_BITS 9
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
#define PT_MAX_FULL_LEVELS 2
#endif
#elif PTTYPE == 32
#define pt_element_t u32
#define guest_walker guest_walker32
#define FNAME(name) paging##32_##name
#define PT_LEVEL_BITS 10
#define PT_MAX_FULL_LEVELS 2
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#elif PTTYPE == PTTYPE_EPT
#define pt_element_t u64
#define guest_walker guest_walkerEPT
#define FNAME(name) ept_##name
#define PT_LEVEL_BITS 9
#define PT_GUEST_DIRTY_SHIFT 9
#define PT_GUEST_ACCESSED_SHIFT 8
#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
#error Invalid PTTYPE value
#endif
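
/*
 * Illustration: with the defines above, FNAME(walk_addr) expands to
 * paging64_walk_addr, paging32_walk_addr or ept_walk_addr depending on
 * PTTYPE, so each instantiation of this file gets uniquely named symbols.
 */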

/* Common logic, but per-type values.  These also need to be undefined. */
#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK)
#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS)

#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
	int level;
	unsigned max_level;
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
	pt_element_t ptes[PT_MAX_FULL_LEVELS];
	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
	bool pte_writable[PT_MAX_FULL_LEVELS];
	unsigned int pt_access[PT_MAX_FULL_LEVELS];
	unsigned int pte_access;
	gfn_t gfn;
	struct x86_exception fault;
};
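/*
 * Note on indexing: the per-level arrays above are indexed by (level - 1),
 * e.g. ptes[walker->level - 1] holds the gpte that maps the final gfn.
 */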

#if PTTYPE == 32
static inline gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
#endif
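/*
 * Worked example for pse36_gfn_delta() above: with PT32_DIR_PSE36_SHIFT == 13
 * and PAGE_SHIFT == 12, shift == 7, so PDE bits 16:13 land at gfn bits 23:20,
 * i.e. they supply physical address bits 35:32 of the 4MB page frame.
 */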

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
					     unsigned gpte)
{
	unsigned mask;

	/* dirty bit is not supported, so no need to track it */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return;

	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	mask = (unsigned)~ACC_WRITE_MASK;
	/* Allow write access to dirty gptes */
	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
		PT_WRITABLE_MASK;
	*access &= mask;
}

static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
	return pte & PT_PRESENT_MASK;
#else
	return pte & 7;
#endif
}
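/*
 * For EPT, bits 2:0 of an entry are the R/W/X permissions; the entry is
 * usable (i.e. "present") if any of the three is set, hence the "& 7" above.
 */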

static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
{
#if PTTYPE != PTTYPE_EPT
	return false;
#else
	return __is_bad_mt_xwr(rsvd_check, gpte);
#endif
}

static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
{
	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
	       FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
}

static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
					 struct kvm_mmu_page *sp, u64 *spte,
					 u64 gpte)
{
	if (!FNAME(is_present_gpte)(gpte))
		goto no_present;

	/* Prefetch only accessed entries (unless A/D bits are disabled). */
	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
	    !(gpte & PT_GUEST_ACCESSED_MASK))
		goto no_present;

	if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
		goto no_present;

	return false;

no_present:
	drop_spte(vcpu->kvm, spte);
	return true;
}

/*
 * For PTTYPE_EPT, a page table can be executable but not readable
 * on supported processors.  Therefore, set_spte does not automatically
 * set bit 0 if execute only is supported.  Here, we repurpose ACC_USER_MASK
 * to signify readability since it isn't used in the EPT case.
 */
static inline unsigned FNAME(gpte_access)(u64 gpte)
{
	unsigned access;
#if PTTYPE == PTTYPE_EPT
	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
		 ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
	access ^= (gpte >> PT64_NX_SHIFT);
#endif

	return access;
}
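/*
 * Example of the NX trick above: for a present gpte with NX clear, bit 0 of
 * "access" is PT_PRESENT_MASK (1) ^ 0 == 1 == ACC_EXEC_MASK, i.e. executable;
 * with NX set it is 1 ^ 1 == 0, i.e. not executable.
 */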

static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     gpa_t addr, int write_fault)
{
	unsigned level, index;
	pt_element_t pte, orig_pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	int ret;

	/* dirty/accessed bits are not supported, so no need to update them */
	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
		return 0;

	for (level = walker->max_level; level >= walker->level; --level) {
		pte = orig_pte = walker->ptes[level - 1];
		table_gfn = walker->table_gfn[level - 1];
		ptep_user = walker->ptep_user[level - 1];
		index = offset_in_page(ptep_user) / sizeof(pt_element_t);
		if (!(pte & PT_GUEST_ACCESSED_MASK)) {
			trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
			pte |= PT_GUEST_ACCESSED_MASK;
		}
		if (level == walker->level && write_fault &&
		    !(pte & PT_GUEST_DIRTY_MASK)) {
			trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
			if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
				return -EINVAL;
#endif
			pte |= PT_GUEST_DIRTY_MASK;
		}
		if (pte == orig_pte)
			continue;

		/*
		 * If the slot is read-only, simply do not process the accessed
		 * and dirty bits.  This is the correct thing to do if the slot
		 * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
		 * are only supported if the accessed and dirty bits are already
		 * set in the ROM (so that MMIO writes are never needed).
		 *
		 * Note that NPT does not allow this at all and faults, since
		 * it always wants nested page table entries for the guest
		 * page tables to be writable.  And EPT works but will simply
		 * overwrite the read-only memory to set the accessed and dirty
		 * bits.
		 */
		if (unlikely(!walker->pte_writable[level - 1]))
			continue;

		ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
		if (ret)
			return ret;

		kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
		walker->ptes[level - 1] = pte;
	}
	return 0;
}

static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned pkeys = 0;
#if PTTYPE == 64
	pte_t pte = {.pte = gpte};

	pkeys = pte_flags_pkey(pte_flags(pte));
#endif
	return pkeys;
}

static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
				       unsigned int level, unsigned int gpte)
{
	/*
	 * For EPT and PAE paging (both variants), bit 7 is either reserved at
	 * all levels or indicates a huge page (ignoring CR3/EPTP).  In either
	 * case, bit 7 being set terminates the walk.
	 */
#if PTTYPE == 32
	/*
	 * 32-bit paging requires special handling because bit 7 is ignored if
	 * CR4.PSE=0, not reserved.  Clear bit 7 in the gpte if the level is
	 * greater than the last level for which bit 7 is the PAGE_SIZE bit.
	 *
	 * The RHS has bit 7 set iff level < (2 + PSE).  If it is clear, bit 7
	 * is not reserved and does not indicate a large page at this level,
	 * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
	 */
	gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
#endif
	/*
	 * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
	 * iff level <= PG_LEVEL_4K, which for our purpose means
	 * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
	 */
	gpte |= level - PG_LEVEL_4K - 1;

	return gpte & PT_PAGE_SIZE_MASK;
}
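/*
 * Numeric sketch of the trick above: "level - PG_LEVEL_4K - 1" is an unsigned
 * all-ones value only when level == PG_LEVEL_4K (1 - 1 - 1 wraps around), so
 * bit 7 is forced on for 4K entries; for level >= 2 the term is a small value
 * with bit 7 clear and the gpte's own PAGE_SIZE bit decides.
 */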

/*
 * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
 */
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
				    gpa_t addr, u64 access)
{
	int ret;
	pt_element_t pte;
	pt_element_t __user *ptep_user;
	gfn_t table_gfn;
	u64 pt_access, pte_access;
	unsigned index, accessed_dirty, pte_pkey;
	u64 nested_access;
	gpa_t pte_gpa;
	bool have_ad;
	int offset;
	u64 walk_nx_mask = 0;
	const int write_fault = access & PFERR_WRITE_MASK;
	const int user_fault = access & PFERR_USER_MASK;
	const int fetch_fault = access & PFERR_FETCH_MASK;
	u16 errcode = 0;
	gpa_t real_gpa;
	gfn_t gfn;

	trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
	walker->level = mmu->cpu_role.base.level;
	pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
	have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);

#if PTTYPE == 64
	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
	if (walker->level == PT32E_ROOT_LEVEL) {
		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
		trace_kvm_mmu_paging_element(pte, walker->level);
		if (!FNAME(is_present_gpte)(pte))
			goto error;
		--walker->level;
	}
#endif
	walker->max_level = walker->level;

	/*
	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
	 * by the MOV to CR instruction are treated as reads and do not cause the
	 * processor to set the dirty flag in any EPT paging-structure entry.
	 */
	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;

	pte_access = ~0;

	/*
	 * Queue a page fault for injection if this assertion fails, as callers
	 * assume that walker.fault contains sane info on a walk failure.  I.e.
	 * avoid making the situation worse by inducing even worse badness
	 * between when the assertion fails and when KVM kicks the vCPU out to
	 * userspace (because the VM is bugged).
	 */
	if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
		goto error;

	++walker->level;

	do {
		struct kvm_memory_slot *slot;
		unsigned long host_addr;

		pt_access = pte_access;
		--walker->level;

		index = PT_INDEX(addr, walker->level);
		table_gfn = gpte_to_gfn(pte);
		offset = index * sizeof(pt_element_t);
		pte_gpa = gfn_to_gpa(table_gfn) + offset;

		BUG_ON(walker->level < 1);
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;

		real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
					     nested_access, &walker->fault);

		/*
		 * FIXME: This can happen if emulation (for an INS/OUTS
		 * instruction) triggers a nested page fault.  The exit
		 * qualification / exit info field will incorrectly have
		 * "guest page access" as the nested page fault's cause,
		 * instead of "guest page structure access".  To fix this,
		 * the x86_exception struct should be augmented with enough
		 * information to fix the exit_qualification or exit_info_1
		 * fields.
		 */
		if (unlikely(real_gpa == INVALID_GPA))
			return 0;

		slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
		if (!kvm_is_visible_memslot(slot))
			goto error;

		host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
						    &walker->pte_writable[walker->level - 1]);
		if (unlikely(kvm_is_error_hva(host_addr)))
			goto error;

		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
		if (unlikely(__get_user(pte, ptep_user)))
			goto error;
		walker->ptep_user[walker->level - 1] = ptep_user;

		trace_kvm_mmu_paging_element(pte, walker->level);

		/*
		 * Inverting NX lets us AND it like the other
		 * permission bits.
		 */
		pte_access = pt_access & (pte ^ walk_nx_mask);

		if (unlikely(!FNAME(is_present_gpte)(pte)))
			goto error;

		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
			goto error;
		}

		walker->ptes[walker->level - 1] = pte;

		/* Convert to ACC_*_MASK flags for struct guest_walker. */
		walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
	} while (!FNAME(is_last_gpte)(mmu, walker->level, pte));

	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;

	/* Convert to ACC_*_MASK flags for struct guest_walker. */
	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
	if (unlikely(errcode))
		goto error;

	gfn = gpte_to_gfn_lvl(pte, walker->level);
	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;

#if PTTYPE == 32
	if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);
#endif

	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
	if (real_gpa == INVALID_GPA)
		return 0;

	walker->gfn = real_gpa >> PAGE_SHIFT;

	if (!write_fault)
		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
	else
		/*
		 * On a write fault, fold the dirty bit into accessed_dirty.
		 * For modes without A/D bit support, accessed_dirty will
		 * always be clear.
		 */
		accessed_dirty &= pte >>
			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

	if (unlikely(!accessed_dirty)) {
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
							addr, write_fault);
		if (unlikely(ret < 0))
			goto error;
		else if (ret)
			goto retry_walk;
	}

	return 1;

error:
	errcode |= write_fault | user_fault;
	if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
		errcode |= PFERR_FETCH_MASK;

	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;

#if PTTYPE == PTTYPE_EPT
	/*
	 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
	 * misconfiguration needs to be injected.  The detection is
	 * done by is_rsvd_bits_set() above.
	 *
	 * We set up the value of exit_qualification to inject:
	 * [2:0] - Derive from the access bits.  The exit_qualification might be
	 *         out of date if it is serving an EPT misconfiguration.
	 * [5:3] - Calculated by the page walk of the guest EPT page tables
	 * [7:8] - Derived from [7:8] of real exit_qualification
	 *
	 * The other bits are set to 0.
	 */
	if (!(errcode & PFERR_RSVD_MASK)) {
		vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
						  EPT_VIOLATION_GVA_TRANSLATED);
		if (write_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
		if (user_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
		if (fetch_fault)
			vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;

		/*
		 * Note, pte_access holds the raw RWX bits from the EPTE, not
		 * ACC_*_MASK flags!
		 */
		vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
						 EPT_VIOLATION_RWX_SHIFT;
	}
#endif
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
	walker->fault.async_page_fault = false;

	trace_kvm_mmu_walker_error(walker->fault.error_code);
	return 0;
}

static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
{
	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
					access);
}

static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
		     u64 *spte, pt_element_t gpte)
{
	struct kvm_memory_slot *slot;
	unsigned pte_access;
	gfn_t gfn;
	kvm_pfn_t pfn;

	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
		return false;

	gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);

	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
	if (!slot)
		return false;

	pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
	if (is_error_pfn(pfn))
		return false;

	mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
	kvm_release_pfn_clean(pfn);
	return true;
}

static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				struct guest_walker *gw, int level)
{
	pt_element_t curr_pte;
	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
	u64 mask;
	int r, index;

	if (level == PG_LEVEL_4K) {
		mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
		base_gpa = pte_gpa & ~mask;
		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

		r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
					       gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
		curr_pte = gw->prefetch_ptes[index];
	} else
		r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
					       &curr_pte, sizeof(curr_pte));

	return r || curr_pte != gw->ptes[level - 1];
}

static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
				u64 *sptep)
{
	struct kvm_mmu_page *sp;
	pt_element_t *gptep = gw->prefetch_ptes;
	u64 *spte;
	int i;

	sp = sptep_to_sp(sptep);

	if (sp->role.level > PG_LEVEL_4K)
		return;

	/*
	 * If addresses are being invalidated, skip prefetching to avoid
	 * accidentally prefetching those addresses.
	 */
	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
		return;

	if (sp->role.direct)
		return __direct_pte_prefetch(vcpu, sp, sptep);

	i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
	spte = sp->spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (spte == sptep)
			continue;

		if (is_shadow_present_pte(*spte))
			continue;

		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
			break;
	}
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 * If the guest tries to write a write-protected page, we need to
 * emulate this operation and return 1 to indicate this case.
 */
static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
			struct guest_walker *gw)
{
	struct kvm_mmu_page *sp = NULL;
	struct kvm_shadow_walk_iterator it;
	unsigned int direct_access, access;
	int top_level, ret;
	gfn_t base_gfn = fault->gfn;

	WARN_ON_ONCE(gw->gfn != base_gfn);
	direct_access = gw->pte_access;

	top_level = vcpu->arch.mmu->cpu_role.base.level;
	if (top_level == PT32E_ROOT_LEVEL)
		top_level = PT32_ROOT_LEVEL;
	/*
	 * Verify that the top-level gpte is still there.  Since the page
	 * is a root page, it is either write protected (and cannot be
	 * changed from now on) or it is invalid (in which case, we don't
	 * really care if it changes underneath us after this point).
	 */
	if (FNAME(gpte_changed)(vcpu, gw, top_level))
		goto out_gpte_changed;

	if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
		goto out_gpte_changed;

	/*
	 * Load a new root and retry the faulting instruction in the extremely
	 * unlikely scenario that the guest root gfn became visible between
	 * loading a dummy root and handling the resulting page fault, e.g. if
	 * userspace creates a memslot in the interim.
	 */
	if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
		kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
		goto out_gpte_changed;
	}

	for_each_shadow_entry(vcpu, fault->addr, it) {
		gfn_t table_gfn;

		clear_sp_write_flooding_count(it.sptep);
		if (it.level == gw->level)
			break;

		table_gfn = gw->table_gfn[it.level - 2];
		access = gw->pt_access[it.level - 2];
		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
					  false, access);

		if (sp != ERR_PTR(-EEXIST)) {
			/*
			 * We must synchronize the pagetable before linking it
			 * because the guest doesn't need to flush tlb when
			 * the gpte is changed from non-present to present.
			 * Otherwise, the guest may use the wrong mapping.
			 *
			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
			 * synchronized it transiently via kvm_sync_page().
			 *
			 * For higher level pagetable, we synchronize it via
			 * the slower mmu_sync_children().  If it needs to
			 * break, some progress has been made; return
			 * RET_PF_RETRY and retry on the next #PF.
			 * KVM_REQ_MMU_SYNC is not necessary but it
			 * expedites the process.
			 */
			if (sp->unsync_children &&
			    mmu_sync_children(vcpu, sp, false))
				return RET_PF_RETRY;
		}

		/*
		 * Verify that the gpte in the page we've just write
		 * protected is still there.
		 */
		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
			goto out_gpte_changed;

		if (sp != ERR_PTR(-EEXIST))
			link_shadow_page(vcpu, it.sptep, sp);

		if (fault->write && table_gfn == fault->gfn)
			fault->write_fault_to_shadow_pgtable = true;
	}

	/*
	 * Adjust the hugepage size _after_ resolving indirect shadow pages.
	 * KVM doesn't support mapping hugepages into the guest for gfns that
	 * are being shadowed by KVM, i.e. allocating a new shadow page may
	 * affect the allowed hugepage size.
	 */
	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
		/*
		 * We cannot overwrite existing page tables with an NX
		 * large page, as the leaf could be executable.
		 */
		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, *it.sptep, it.level);

		base_gfn = gfn_round_for_level(fault->gfn, it.level);
		if (it.level == fault->goal_level)
			break;

		validate_direct_spte(vcpu, it.sptep, direct_access);

		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
					  true, direct_access);
		if (sp == ERR_PTR(-EEXIST))
			continue;

		link_shadow_page(vcpu, it.sptep, sp);
		if (fault->huge_page_disallowed)
			account_nx_huge_page(vcpu->kvm, sp,
					     fault->req_level >= it.level);
	}

	if (WARN_ON_ONCE(it.level != fault->goal_level))
		return -EFAULT;

	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
			   base_gfn, fault->pfn, fault);
	if (ret == RET_PF_SPURIOUS)
		return ret;

	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
	return ret;

out_gpte_changed:
	return RET_PF_RETRY;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 * - there is no shadow pte for the guest pte
 * - write access through a shadow pte marked read only so that we can set
 *   the dirty bit
 * - write access to a shadow pte marked read only so we can update the page
 *   dirty bitmap, when userspace requests it
 * - mmio access; in this case we will never install a present shadow pte
 * - normal guest page fault due to the guest pte marked not present, not
 *   writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *          a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct guest_walker walker;
	int r;

	WARN_ON_ONCE(fault->is_tdp);

	/*
	 * Look up the guest pte for the faulting address.
	 * If PFEC.RSVD is set, this is a shadow page fault.
	 * The bit needs to be cleared before walking guest page tables.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
			     fault->error_code & ~PFERR_RSVD_MASK);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		if (!fault->prefetch)
			kvm_inject_emulated_page_fault(vcpu, &walker.fault);

		return RET_PF_RETRY;
	}

	fault->gfn = walker.gfn;
	fault->max_level = walker.level;
	fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);

	if (page_fault_handle_page_track(vcpu, fault)) {
		shadow_page_table_clear_flood(vcpu, fault->addr);
		return RET_PF_EMULATE;
	}

	r = mmu_topup_memory_caches(vcpu, true);
	if (r)
		return r;

	r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
	if (r != RET_PF_CONTINUE)
		return r;

	/*
	 * Do not change pte_access if the pfn is an mmio page, otherwise
	 * we will cache the incorrect access into the mmio spte.
	 */
	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
	    !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
		walker.pte_access |= ACC_WRITE_MASK;
		walker.pte_access &= ~ACC_USER_MASK;

		/*
		 * If we converted a user page to a kernel page (so that the
		 * kernel can write to it when cr0.wp=0), then we should
		 * prevent the kernel from executing it if SMEP is enabled.
		 */
		if (is_cr4_smep(vcpu->arch.mmu))
			walker.pte_access &= ~ACC_EXEC_MASK;
	}

	r = RET_PF_RETRY;
	write_lock(&vcpu->kvm->mmu_lock);

	if (is_page_fault_stale(vcpu, fault))
		goto out_unlock;

	r = make_mmu_pages_available(vcpu);
	if (r)
		goto out_unlock;
	r = FNAME(fetch)(vcpu, fault, &walker);

out_unlock:
	write_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(fault->pfn);
	return r;
}

static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
	int offset = 0;

	WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);

	if (PTTYPE == 32)
		offset = sp->role.quadrant << SPTE_LEVEL_BITS;

	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}

/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			       gpa_t addr, u64 access,
			       struct x86_exception *exception)
{
	struct guest_walker walker;
	gpa_t gpa = INVALID_GPA;
	int r;

#ifndef CONFIG_X86_64
	/* A 64-bit GVA should be impossible on 32-bit KVM. */
	WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
#endif

	r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);

	if (r) {
		gpa = gfn_to_gpa(walker.gfn);
		gpa |= addr & ~PAGE_MASK;
	} else if (exception)
		*exception = walker.fault;

	return gpa;
}

/*
 * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
 * safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Returns
 * < 0: failed to sync spte
 *   0: the spte is synced and no tlb flushing is required
 * > 0: the spte is synced and tlb flushing is required
 */
static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
	bool host_writable;
	gpa_t first_pte_gpa;
	u64 *sptep, spte;
	struct kvm_memory_slot *slot;
	unsigned pte_access;
	pt_element_t gpte;
	gpa_t pte_gpa;
	gfn_t gfn;

	if (WARN_ON_ONCE(!sp->spt[i]))
		return 0;

	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
	pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

	if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
				       sizeof(pt_element_t)))
		return -1;

	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
		return 1;

	gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access;
	pte_access &= FNAME(gpte_access)(gpte);
	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);

	if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
		return 0;

	/*
	 * Drop the SPTE if the new protections would result in a RWX=0
	 * SPTE or if the gfn is changing.  The RWX=0 case only affects
	 * EPT with execute-only support, i.e. EPT without an effective
	 * "present" bit, as all other paging modes will create a
	 * read-only SPTE if pte_access is zero.
	 */
	if ((!pte_access && !shadow_present_mask) ||
	    gfn != kvm_mmu_page_get_gfn(sp, i)) {
		drop_spte(vcpu->kvm, &sp->spt[i]);
		return 1;
	}
	/*
	 * Do nothing if the permissions are unchanged.  The existing SPTE is
	 * still valid, and prefetch_invalid_gpte() has verified that the A/D
	 * bits are set in the "new" gPTE, i.e. there is no danger of missing
	 * an A/D update due to A/D bits being set in the SPTE but not the
	 * gPTE.
	 */
	if (kvm_mmu_page_get_access(sp, i) == pte_access)
		return 0;

	/* Update the shadowed access bits in case they changed. */
	kvm_mmu_page_set_access(sp, i, pte_access);

	sptep = &sp->spt[i];
	spte = *sptep;
	host_writable = spte & shadow_host_writable_mask;
	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	make_spte(vcpu, sp, slot, pte_access, gfn,
		  spte_to_pfn(spte), spte, true, false,
		  host_writable, &spte);

	return mmu_spte_update(sptep, spte);
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef PT_GUEST_ACCESSED_MASK
#undef PT_GUEST_DIRTY_MASK
#undef PT_GUEST_DIRTY_SHIFT
#undef PT_GUEST_ACCESSED_SHIFT
#undef PT_HAVE_ACCESSED_DIRTY