1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
3 | |
4 | #include <linux/objtool.h> |
5 | #include <linux/percpu.h> |
6 | |
7 | #include <asm/debugreg.h> |
8 | #include <asm/mmu_context.h> |
9 | |
10 | #include "cpuid.h" |
11 | #include "hyperv.h" |
12 | #include "mmu.h" |
13 | #include "nested.h" |
14 | #include "pmu.h" |
15 | #include "sgx.h" |
16 | #include "trace.h" |
17 | #include "vmx.h" |
18 | #include "x86.h" |
19 | #include "smm.h" |
20 | |
21 | static bool __read_mostly enable_shadow_vmcs = 1; |
22 | module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); |
23 | |
24 | static bool __read_mostly nested_early_check = 0; |
25 | module_param(nested_early_check, bool, S_IRUGO); |
26 | |
27 | #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK |
28 | |
29 | /* |
30 | * Hyper-V requires all of these, so mark them as supported even though |
31 | * they are just treated the same as all-context. |
32 | */ |
33 | #define VMX_VPID_EXTENT_SUPPORTED_MASK \ |
34 | (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ |
35 | VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ |
36 | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ |
37 | VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) |
38 | |
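/*
 * Value advertised to L1 in IA32_VMX_MISC[4:0] for the emulated VMX
 * preemption timer: the timer is modeled as counting down once every
 * 2^5 = 32 TSC cycles.
 */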
39 | #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 |
40 | |
41 | enum { |
42 | VMX_VMREAD_BITMAP, |
43 | VMX_VMWRITE_BITMAP, |
44 | VMX_BITMAP_NR |
45 | }; |
46 | static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; |
47 | |
48 | #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) |
49 | #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) |
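/*
 * One bit per VMCS field encoding: a set bit makes L1's VMREAD/VMWRITE of
 * that field cause a VM-exit, a clear bit lets the CPU access the shadow
 * VMCS directly (see init_vmcs_shadow_fields()).
 */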
50 | |
51 | struct shadow_vmcs_field { |
52 | u16 encoding; |
53 | u16 offset; |
54 | }; |
55 | static struct shadow_vmcs_field shadow_read_only_fields[] = { |
56 | #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, |
57 | #include "vmcs_shadow_fields.h" |
58 | }; |
59 | static int max_shadow_read_only_fields = |
60 | ARRAY_SIZE(shadow_read_only_fields); |
61 | |
62 | static struct shadow_vmcs_field shadow_read_write_fields[] = { |
63 | #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, |
64 | #include "vmcs_shadow_fields.h" |
65 | }; |
66 | static int max_shadow_read_write_fields = |
67 | ARRAY_SIZE(shadow_read_write_fields); |
68 | |
69 | static void init_vmcs_shadow_fields(void) |
70 | { |
71 | int i, j; |
72 | |
73 | memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); |
74 | memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); |
75 | |
76 | for (i = j = 0; i < max_shadow_read_only_fields; i++) { |
77 | struct shadow_vmcs_field entry = shadow_read_only_fields[i]; |
78 | u16 field = entry.encoding; |
79 | |
80 | if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && |
81 | (i + 1 == max_shadow_read_only_fields || |
82 | shadow_read_only_fields[i + 1].encoding != field + 1)) |
83 | pr_err("Missing field from shadow_read_only_field %x\n" , |
84 | field + 1); |
85 | |
		clear_bit(field, vmx_vmread_bitmap);
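		/*
		 * Odd encodings are the "high" halves of 64-bit fields.  On
		 * 64-bit hosts the full field is accessed via the even
		 * encoding, so the odd entry is dropped; on 32-bit hosts the
		 * high half is kept as a separate entry whose vmcs12 offset
		 * is bumped by 4 bytes.
		 */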
87 | if (field & 1) |
88 | #ifdef CONFIG_X86_64 |
89 | continue; |
90 | #else |
91 | entry.offset += sizeof(u32); |
92 | #endif |
93 | shadow_read_only_fields[j++] = entry; |
94 | } |
95 | max_shadow_read_only_fields = j; |
96 | |
97 | for (i = j = 0; i < max_shadow_read_write_fields; i++) { |
98 | struct shadow_vmcs_field entry = shadow_read_write_fields[i]; |
99 | u16 field = entry.encoding; |
100 | |
101 | if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && |
102 | (i + 1 == max_shadow_read_write_fields || |
103 | shadow_read_write_fields[i + 1].encoding != field + 1)) |
104 | pr_err("Missing field from shadow_read_write_field %x\n" , |
105 | field + 1); |
106 | |
		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
110 | |
111 | /* |
112 | * PML and the preemption timer can be emulated, but the |
113 | * processor cannot vmwrite to fields that don't exist |
114 | * on bare metal. |
115 | */ |
116 | switch (field) { |
117 | case GUEST_PML_INDEX: |
118 | if (!cpu_has_vmx_pml()) |
119 | continue; |
120 | break; |
121 | case VMX_PREEMPTION_TIMER_VALUE: |
122 | if (!cpu_has_vmx_preemption_timer()) |
123 | continue; |
124 | break; |
125 | case GUEST_INTR_STATUS: |
126 | if (!cpu_has_vmx_apicv()) |
127 | continue; |
128 | break; |
129 | default: |
130 | break; |
131 | } |
132 | |
		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
135 | if (field & 1) |
136 | #ifdef CONFIG_X86_64 |
137 | continue; |
138 | #else |
139 | entry.offset += sizeof(u32); |
140 | #endif |
141 | shadow_read_write_fields[j++] = entry; |
142 | } |
143 | max_shadow_read_write_fields = j; |
144 | } |
145 | |
146 | /* |
147 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), |
148 | * set the success or error code of an emulated VMX instruction (as specified |
149 | * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated |
150 | * instruction. |
151 | */ |
152 | static int nested_vmx_succeed(struct kvm_vcpu *vcpu) |
153 | { |
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
155 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | |
156 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); |
157 | return kvm_skip_emulated_instruction(vcpu); |
158 | } |
159 | |
160 | static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) |
161 | { |
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
163 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | |
164 | X86_EFLAGS_SF | X86_EFLAGS_OF)) |
165 | | X86_EFLAGS_CF); |
166 | return kvm_skip_emulated_instruction(vcpu); |
167 | } |
168 | |
169 | static int nested_vmx_failValid(struct kvm_vcpu *vcpu, |
170 | u32 vm_instruction_error) |
171 | { |
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
173 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | |
174 | X86_EFLAGS_SF | X86_EFLAGS_OF)) |
175 | | X86_EFLAGS_ZF); |
176 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; |
177 | /* |
178 | * We don't need to force sync to shadow VMCS because |
179 | * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all |
180 | * fields and thus must be synced. |
181 | */ |
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
183 | to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; |
184 | |
185 | return kvm_skip_emulated_instruction(vcpu); |
186 | } |
187 | |
188 | static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) |
189 | { |
190 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
191 | |
192 | /* |
193 | * failValid writes the error number to the current VMCS, which |
194 | * can't be done if there isn't a current VMCS. |
195 | */ |
196 | if (vmx->nested.current_vmptr == INVALID_GPA && |
197 | !nested_vmx_is_evmptr12_valid(vmx)) |
198 | return nested_vmx_failInvalid(vcpu); |
199 | |
200 | return nested_vmx_failValid(vcpu, vm_instruction_error); |
201 | } |
202 | |
203 | static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) |
204 | { |
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
208 | } |
209 | |
210 | static inline bool vmx_control_verify(u32 control, u32 low, u32 high) |
211 | { |
	return fixed_bits_valid(control, low, high);
213 | } |
214 | |
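/*
 * VMX control capability MSRs report the allowed 0-settings in bits 31:0
 * (a set bit there means the control must be 1) and the allowed 1-settings
 * in bits 63:32 (a clear bit there means the control must be 0); the helpers
 * above and below pack and verify controls against that layout.
 */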
215 | static inline u64 vmx_control_msr(u32 low, u32 high) |
216 | { |
217 | return low | ((u64)high << 32); |
218 | } |
219 | |
220 | static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) |
221 | { |
222 | secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); |
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
224 | vmx->nested.need_vmcs12_to_shadow_sync = false; |
225 | } |
226 | |
227 | static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) |
228 | { |
229 | #ifdef CONFIG_KVM_HYPERV |
230 | struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); |
231 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
232 | |
233 | if (nested_vmx_is_evmptr12_valid(vmx)) { |
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
235 | vmx->nested.hv_evmcs = NULL; |
236 | } |
237 | |
238 | vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; |
239 | |
240 | if (hv_vcpu) { |
241 | hv_vcpu->nested.pa_page_gpa = INVALID_GPA; |
242 | hv_vcpu->nested.vm_id = 0; |
243 | hv_vcpu->nested.vp_id = 0; |
244 | } |
245 | #endif |
246 | } |
247 | |
248 | static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) |
249 | { |
250 | #ifdef CONFIG_KVM_HYPERV |
251 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
252 | /* |
253 | * When Enlightened VMEntry is enabled on the calling CPU we treat |
	 * memory area pointed to by vmptr as Enlightened VMCS (as there's no good
255 | * way to distinguish it from VMCS12) and we must not corrupt it by |
256 | * writing to the non-existent 'launch_state' field. The area doesn't |
257 | * have to be the currently active EVMCS on the calling CPU and there's |
258 | * nothing KVM has to do to transition it from 'active' to 'non-active' |
259 | * state. It is possible that the area will stay mapped as |
260 | * vmx->nested.hv_evmcs but this shouldn't be a problem. |
261 | */ |
262 | if (!guest_cpuid_has_evmcs(vcpu) || |
	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
264 | return false; |
265 | |
266 | if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) |
267 | nested_release_evmcs(vcpu); |
268 | |
269 | return true; |
270 | #else |
271 | return false; |
272 | #endif |
273 | } |
274 | |
275 | static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, |
276 | struct loaded_vmcs *prev) |
277 | { |
278 | struct vmcs_host_state *dest, *src; |
279 | |
280 | if (unlikely(!vmx->guest_state_loaded)) |
281 | return; |
282 | |
283 | src = &prev->host_state; |
284 | dest = &vmx->loaded_vmcs->host_state; |
285 | |
	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
287 | dest->ldt_sel = src->ldt_sel; |
288 | #ifdef CONFIG_X86_64 |
289 | dest->ds_sel = src->ds_sel; |
290 | dest->es_sel = src->es_sel; |
291 | #endif |
292 | } |
293 | |
294 | static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) |
295 | { |
296 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
297 | struct loaded_vmcs *prev; |
298 | int cpu; |
299 | |
300 | if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) |
301 | return; |
302 | |
303 | cpu = get_cpu(); |
304 | prev = vmx->loaded_vmcs; |
305 | vmx->loaded_vmcs = vmcs; |
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
307 | vmx_sync_vmcs_host_state(vmx, prev); |
308 | put_cpu(); |
309 | |
310 | vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; |
311 | |
312 | /* |
313 | * All lazily updated registers will be reloaded from VMCS12 on both |
314 | * vmentry and vmexit. |
315 | */ |
316 | vcpu->arch.regs_dirty = 0; |
317 | } |
318 | |
319 | /* |
320 | * Free whatever needs to be freed from vmx->nested when L1 goes down, or |
321 | * just stops using VMX. |
322 | */ |
323 | static void free_nested(struct kvm_vcpu *vcpu) |
324 | { |
325 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
326 | |
327 | if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) |
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
329 | |
330 | if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) |
331 | return; |
332 | |
333 | kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); |
334 | |
335 | vmx->nested.vmxon = false; |
336 | vmx->nested.smm.vmxon = false; |
337 | vmx->nested.vmxon_ptr = INVALID_GPA; |
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
350 | vmx->nested.cached_shadow_vmcs12 = NULL; |
351 | /* |
352 | * Unpin physical memory we referred to in the vmcs02. The APIC access |
353 | * page's backing page (yeah, confusing) shouldn't actually be accessed, |
354 | * and if it is written, the contents are irrelevant. |
355 | */ |
	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
366 | } |
367 | |
368 | /* |
369 | * Ensure that the current vmcs of the logical processor is the |
370 | * vmcs01 of the vcpu before calling free_nested(). |
371 | */ |
372 | void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) |
373 | { |
374 | vcpu_load(vcpu); |
375 | vmx_leave_nested(vcpu); |
376 | vcpu_put(vcpu); |
377 | } |
378 | |
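/*
 * Bits 51:12 of an EPTP hold the physical address of the root EPT table; the
 * low bits are attribute bits (memory type, walk length, A/D enable) and are
 * ignored when matching cached roots.
 */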
379 | #define EPTP_PA_MASK GENMASK_ULL(51, 12) |
380 | |
381 | static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) |
382 | { |
383 | return VALID_PAGE(root_hpa) && |
384 | ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); |
385 | } |
386 | |
387 | static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, |
388 | gpa_t addr) |
389 | { |
390 | unsigned long roots = 0; |
391 | uint i; |
392 | struct kvm_mmu_root_info *cached_root; |
393 | |
394 | WARN_ON_ONCE(!mmu_is_nested(vcpu)); |
395 | |
396 | for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { |
397 | cached_root = &vcpu->arch.mmu->prev_roots[i]; |
398 | |
		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
405 | } |
406 | |
407 | static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, |
408 | struct x86_exception *fault) |
409 | { |
410 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
411 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
412 | u32 vm_exit_reason; |
413 | unsigned long exit_qualification = vcpu->arch.exit_qualification; |
414 | |
415 | if (vmx->nested.pml_full) { |
416 | vm_exit_reason = EXIT_REASON_PML_FULL; |
417 | vmx->nested.pml_full = false; |
418 | exit_qualification &= INTR_INFO_UNBLOCK_NMI; |
419 | } else { |
420 | if (fault->error_code & PFERR_RSVD_MASK) |
421 | vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; |
422 | else |
423 | vm_exit_reason = EXIT_REASON_EPT_VIOLATION; |
424 | |
425 | /* |
426 | * Although the caller (kvm_inject_emulated_page_fault) would |
427 | * have already synced the faulting address in the shadow EPT |
428 | * tables for the current EPTP12, we also need to sync it for |
429 | * any other cached EPTP02s based on the same EP4TA, since the |
430 | * TLB associates mappings to the EP4TA rather than the full EPTP. |
431 | */ |
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
437 | vmcs12->guest_physical_address = fault->address; |
438 | } |
439 | |
440 | static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) |
441 | { |
442 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
443 | bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; |
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
449 | } |
450 | |
451 | static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) |
452 | { |
453 | WARN_ON(mmu_is_nested(vcpu)); |
454 | |
455 | vcpu->arch.mmu = &vcpu->arch.guest_mmu; |
456 | nested_ept_new_eptp(vcpu); |
457 | vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; |
458 | vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; |
459 | vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; |
460 | |
461 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; |
462 | } |
463 | |
464 | static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) |
465 | { |
466 | vcpu->arch.mmu = &vcpu->arch.root_mmu; |
467 | vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; |
468 | } |
469 | |
470 | static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, |
471 | u16 error_code) |
472 | { |
473 | bool inequality, bit; |
474 | |
475 | bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; |
476 | inequality = |
477 | (error_code & vmcs12->page_fault_error_code_mask) != |
478 | vmcs12->page_fault_error_code_match; |
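	/*
	 * Per the SDM, a #PF is reflected to L1 when PF_VECTOR is set in the
	 * exception bitmap and the masked error code equals the match value,
	 * or when the bit is clear and the masked error code differs;
	 * "inequality ^ bit" encodes exactly that.
	 */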
479 | return inequality ^ bit; |
480 | } |
481 | |
482 | static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, |
483 | u32 error_code) |
484 | { |
485 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
486 | |
487 | /* |
488 | * Drop bits 31:16 of the error code when performing the #PF mask+match |
489 | * check. All VMCS fields involved are 32 bits, but Intel CPUs never |
490 | * set bits 31:16 and VMX disallows setting bits 31:16 in the injected |
491 | * error code. Including the to-be-dropped bits in the check might |
492 | * result in an "impossible" or missed exit from L1's perspective. |
493 | */ |
494 | if (vector == PF_VECTOR) |
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
496 | |
497 | return (vmcs12->exception_bitmap & (1u << vector)); |
498 | } |
499 | |
500 | static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, |
501 | struct vmcs12 *vmcs12) |
502 | { |
503 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) |
504 | return 0; |
505 | |
506 | if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || |
507 | CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) |
508 | return -EINVAL; |
509 | |
510 | return 0; |
511 | } |
512 | |
513 | static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, |
514 | struct vmcs12 *vmcs12) |
515 | { |
516 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) |
517 | return 0; |
518 | |
519 | if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) |
520 | return -EINVAL; |
521 | |
522 | return 0; |
523 | } |
524 | |
525 | static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, |
526 | struct vmcs12 *vmcs12) |
527 | { |
528 | if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) |
529 | return 0; |
530 | |
531 | if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) |
532 | return -EINVAL; |
533 | |
534 | return 0; |
535 | } |
536 | |
537 | /* |
538 | * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 |
539 | * itself utilizing x2APIC. All MSRs were previously set to be intercepted, |
540 | * only the "disable intercept" case needs to be handled. |
541 | */ |
542 | static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, |
543 | unsigned long *msr_bitmap_l0, |
544 | u32 msr, int type) |
545 | { |
546 | if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(bitmap: msr_bitmap_l1, msr)) |
547 | vmx_clear_msr_bitmap_read(bitmap: msr_bitmap_l0, msr); |
548 | |
549 | if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(bitmap: msr_bitmap_l1, msr)) |
550 | vmx_clear_msr_bitmap_write(bitmap: msr_bitmap_l0, msr); |
551 | } |
552 | |
553 | static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) |
554 | { |
555 | int msr; |
556 | |
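	/*
	 * The 4K MSR bitmap has separate read and write halves: the read
	 * bitmap for MSRs 0x0-0x1fff occupies the first 1K of the page and
	 * the corresponding write bitmap starts 0x800 bytes in, hence the
	 * second store below.
	 */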
557 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { |
558 | unsigned word = msr / BITS_PER_LONG; |
559 | |
560 | msr_bitmap[word] = ~0; |
561 | msr_bitmap[word + (0x800 / sizeof(long))] = ~0; |
562 | } |
563 | } |
564 | |
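/*
 * Generate helpers that merge one MSR intercept into the vmcs02 bitmap:
 * intercept the MSR if either L0 (vmcs01) or L1 wants it intercepted,
 * otherwise leave it pass-through for L2.
 */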
565 | #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ |
566 | static inline \ |
567 | void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ |
568 | unsigned long *msr_bitmap_l1, \ |
569 | unsigned long *msr_bitmap_l0, u32 msr) \ |
570 | { \ |
571 | if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ |
572 | vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ |
573 | vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ |
574 | else \ |
575 | vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ |
576 | } |
577 | BUILD_NVMX_MSR_INTERCEPT_HELPER(read) |
578 | BUILD_NVMX_MSR_INTERCEPT_HELPER(write) |
579 | |
580 | static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, |
581 | unsigned long *msr_bitmap_l1, |
582 | unsigned long *msr_bitmap_l0, |
583 | u32 msr, int types) |
584 | { |
585 | if (types & MSR_TYPE_R) |
586 | nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, |
587 | msr_bitmap_l0, msr); |
588 | if (types & MSR_TYPE_W) |
589 | nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, |
590 | msr_bitmap_l0, msr); |
591 | } |
592 | |
593 | /* |
594 | * Merge L0's and L1's MSR bitmap, return false to indicate that |
595 | * we do not use the hardware. |
596 | */ |
597 | static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, |
598 | struct vmcs12 *vmcs12) |
599 | { |
600 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
601 | int msr; |
602 | unsigned long *msr_bitmap_l1; |
603 | unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; |
604 | struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; |
605 | |
606 | /* Nothing to do if the MSR bitmap is not in use. */ |
607 | if (!cpu_has_vmx_msr_bitmap() || |
608 | !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) |
609 | return false; |
610 | |
611 | /* |
612 | * MSR bitmap update can be skipped when: |
613 | * - MSR bitmap for L1 hasn't changed. |
614 | * - Nested hypervisor (L1) is attempting to launch the same L2 as |
615 | * before. |
616 | * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature |
617 | * and tells KVM (L0) there were no changes in MSR bitmap for L2. |
618 | */ |
619 | if (!vmx->nested.force_msr_bitmap_recalc) { |
620 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
621 | |
622 | if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && |
623 | evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) |
624 | return true; |
625 | } |
626 | |
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
628 | return false; |
629 | |
630 | msr_bitmap_l1 = (unsigned long *)map->hva; |
631 | |
632 | /* |
633 | * To keep the control flow simple, pay eight 8-byte writes (sixteen |
634 | * 4-byte writes on 32-bit systems) up front to enable intercepts for |
635 | * the x2APIC MSR range and selectively toggle those relevant to L2. |
636 | */ |
	enable_x2apic_msr_intercepts(msr_bitmap_l0);
638 | |
639 | if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { |
640 | if (nested_cpu_has_apic_reg_virt(vmcs12)) { |
641 | /* |
642 | * L0 need not intercept reads for MSRs between 0x800 |
643 | * and 0x8ff, it just lets the processor take the value |
644 | * from the virtual-APIC page; take those 256 bits |
645 | * directly from the L1 bitmap. |
646 | */ |
647 | for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { |
648 | unsigned word = msr / BITS_PER_LONG; |
649 | |
650 | msr_bitmap_l0[word] = msr_bitmap_l1[word]; |
651 | } |
652 | } |
653 | |
654 | nested_vmx_disable_intercept_for_x2apic_msr( |
655 | msr_bitmap_l1, msr_bitmap_l0, |
656 | X2APIC_MSR(APIC_TASKPRI), |
657 | MSR_TYPE_R | MSR_TYPE_W); |
658 | |
659 | if (nested_cpu_has_vid(vmcs12)) { |
660 | nested_vmx_disable_intercept_for_x2apic_msr( |
661 | msr_bitmap_l1, msr_bitmap_l0, |
662 | X2APIC_MSR(APIC_EOI), |
663 | MSR_TYPE_W); |
664 | nested_vmx_disable_intercept_for_x2apic_msr( |
665 | msr_bitmap_l1, msr_bitmap_l0, |
666 | X2APIC_MSR(APIC_SELF_IPI), |
667 | MSR_TYPE_W); |
668 | } |
669 | } |
670 | |
671 | /* |
672 | * Always check vmcs01's bitmap to honor userspace MSR filters and any |
673 | * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. |
674 | */ |
675 | #ifdef CONFIG_X86_64 |
676 | nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, |
677 | MSR_FS_BASE, MSR_TYPE_RW); |
678 | |
679 | nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, |
680 | MSR_GS_BASE, MSR_TYPE_RW); |
681 | |
682 | nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, |
683 | MSR_KERNEL_GS_BASE, MSR_TYPE_RW); |
684 | #endif |
685 | nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, |
686 | MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); |
687 | |
688 | nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, |
689 | MSR_IA32_PRED_CMD, MSR_TYPE_W); |
690 | |
691 | nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, |
692 | MSR_IA32_FLUSH_CMD, MSR_TYPE_W); |
693 | |
	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
695 | |
696 | vmx->nested.force_msr_bitmap_recalc = false; |
697 | |
698 | return true; |
699 | } |
700 | |
701 | static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, |
702 | struct vmcs12 *vmcs12) |
703 | { |
704 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
705 | struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; |
706 | |
707 | if (!nested_cpu_has_shadow_vmcs(vmcs12) || |
708 | vmcs12->vmcs_link_pointer == INVALID_GPA) |
709 | return; |
710 | |
	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
718 | } |
719 | |
720 | static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, |
721 | struct vmcs12 *vmcs12) |
722 | { |
723 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
724 | struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; |
725 | |
726 | if (!nested_cpu_has_shadow_vmcs(vmcs12) || |
727 | vmcs12->vmcs_link_pointer == INVALID_GPA) |
728 | return; |
729 | |
	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
737 | } |
738 | |
739 | /* |
740 | * In nested virtualization, check if L1 has set |
741 | * VM_EXIT_ACK_INTR_ON_EXIT |
742 | */ |
743 | static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) |
744 | { |
745 | return get_vmcs12(vcpu)->vm_exit_controls & |
746 | VM_EXIT_ACK_INTR_ON_EXIT; |
747 | } |
748 | |
749 | static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, |
750 | struct vmcs12 *vmcs12) |
751 | { |
752 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && |
753 | CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) |
754 | return -EINVAL; |
755 | else |
756 | return 0; |
757 | } |
758 | |
759 | static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, |
760 | struct vmcs12 *vmcs12) |
761 | { |
762 | if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && |
763 | !nested_cpu_has_apic_reg_virt(vmcs12) && |
764 | !nested_cpu_has_vid(vmcs12) && |
765 | !nested_cpu_has_posted_intr(vmcs12)) |
766 | return 0; |
767 | |
768 | /* |
769 | * If virtualize x2apic mode is enabled, |
770 | * virtualize apic access must be disabled. |
771 | */ |
772 | if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && |
773 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) |
774 | return -EINVAL; |
775 | |
776 | /* |
777 | * If virtual interrupt delivery is enabled, |
778 | * we must exit on external interrupts. |
779 | */ |
780 | if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) |
781 | return -EINVAL; |
782 | |
783 | /* |
	 * Bits 15:8 should be zero in posted_intr_nv; the descriptor address
	 * has already been checked in nested_get_vmcs12_pages.
787 | * |
788 | * bits 5:0 of posted_intr_desc_addr should be zero. |
789 | */ |
790 | if (nested_cpu_has_posted_intr(vmcs12) && |
791 | (CC(!nested_cpu_has_vid(vmcs12)) || |
792 | CC(!nested_exit_intr_ack_set(vcpu)) || |
793 | CC((vmcs12->posted_intr_nv & 0xff00)) || |
794 | CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) |
795 | return -EINVAL; |
796 | |
797 | /* tpr shadow is needed by all apicv features. */ |
798 | if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) |
799 | return -EINVAL; |
800 | |
801 | return 0; |
802 | } |
803 | |
804 | static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, |
805 | u32 count, u64 addr) |
806 | { |
807 | if (count == 0) |
808 | return 0; |
809 | |
	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
812 | return -EINVAL; |
813 | |
814 | return 0; |
815 | } |
816 | |
817 | static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, |
818 | struct vmcs12 *vmcs12) |
819 | { |
820 | if (CC(nested_vmx_check_msr_switch(vcpu, |
821 | vmcs12->vm_exit_msr_load_count, |
822 | vmcs12->vm_exit_msr_load_addr)) || |
823 | CC(nested_vmx_check_msr_switch(vcpu, |
824 | vmcs12->vm_exit_msr_store_count, |
825 | vmcs12->vm_exit_msr_store_addr))) |
826 | return -EINVAL; |
827 | |
828 | return 0; |
829 | } |
830 | |
831 | static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, |
832 | struct vmcs12 *vmcs12) |
833 | { |
834 | if (CC(nested_vmx_check_msr_switch(vcpu, |
835 | vmcs12->vm_entry_msr_load_count, |
836 | vmcs12->vm_entry_msr_load_addr))) |
837 | return -EINVAL; |
838 | |
839 | return 0; |
840 | } |
841 | |
842 | static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, |
843 | struct vmcs12 *vmcs12) |
844 | { |
845 | if (!nested_cpu_has_pml(vmcs12)) |
846 | return 0; |
847 | |
848 | if (CC(!nested_cpu_has_ept(vmcs12)) || |
849 | CC(!page_address_valid(vcpu, vmcs12->pml_address))) |
850 | return -EINVAL; |
851 | |
852 | return 0; |
853 | } |
854 | |
855 | static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, |
856 | struct vmcs12 *vmcs12) |
857 | { |
858 | if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && |
859 | !nested_cpu_has_ept(vmcs12))) |
860 | return -EINVAL; |
861 | return 0; |
862 | } |
863 | |
864 | static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, |
865 | struct vmcs12 *vmcs12) |
866 | { |
867 | if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && |
868 | !nested_cpu_has_ept(vmcs12))) |
869 | return -EINVAL; |
870 | return 0; |
871 | } |
872 | |
873 | static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, |
874 | struct vmcs12 *vmcs12) |
875 | { |
876 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) |
877 | return 0; |
878 | |
879 | if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || |
880 | CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) |
881 | return -EINVAL; |
882 | |
883 | return 0; |
884 | } |
885 | |
886 | static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, |
887 | struct vmx_msr_entry *e) |
888 | { |
889 | /* x2APIC MSR accesses are not allowed */ |
890 | if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) |
891 | return -EINVAL; |
892 | if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ |
893 | CC(e->index == MSR_IA32_UCODE_REV)) |
894 | return -EINVAL; |
895 | if (CC(e->reserved != 0)) |
896 | return -EINVAL; |
897 | return 0; |
898 | } |
899 | |
900 | static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, |
901 | struct vmx_msr_entry *e) |
902 | { |
903 | if (CC(e->index == MSR_FS_BASE) || |
904 | CC(e->index == MSR_GS_BASE) || |
905 | CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ |
906 | nested_vmx_msr_check_common(vcpu, e)) |
907 | return -EINVAL; |
908 | return 0; |
909 | } |
910 | |
911 | static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, |
912 | struct vmx_msr_entry *e) |
913 | { |
914 | if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ |
915 | nested_vmx_msr_check_common(vcpu, e)) |
916 | return -EINVAL; |
917 | return 0; |
918 | } |
919 | |
920 | static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) |
921 | { |
922 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);
925 | |
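	/*
	 * IA32_VMX_MISC[27:25] advertises the recommended maximum number of
	 * MSRs in a single load/store list as 512 * (N + 1).
	 */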
926 | return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; |
927 | } |
928 | |
929 | /* |
 * Load the guest's/host's MSRs at nested entry/exit.
 * Returns 0 on success, or the (1-based) index of the failing entry on failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity.  To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
937 | */ |
938 | static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) |
939 | { |
940 | u32 i; |
941 | struct vmx_msr_entry e; |
942 | u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); |
943 | |
944 | for (i = 0; i < count; i++) { |
945 | if (unlikely(i >= max_msr_list_size)) |
946 | goto fail; |
947 | |
		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
965 | goto fail; |
966 | } |
967 | } |
968 | return 0; |
969 | fail: |
970 | /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ |
971 | return i + 1; |
972 | } |
973 | |
974 | static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, |
975 | u32 msr_index, |
976 | u64 *data) |
977 | { |
978 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
979 | |
980 | /* |
981 | * If the L0 hypervisor stored a more accurate value for the TSC that |
982 | * does not include the time taken for emulation of the L2->L1 |
983 | * VM-exit in L0, use the more accurate value. |
984 | */ |
985 | if (msr_index == MSR_IA32_TSC) { |
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
999 | msr_index); |
1000 | return false; |
1001 | } |
1002 | return true; |
1003 | } |
1004 | |
1005 | static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, |
1006 | struct vmx_msr_entry *e) |
1007 | { |
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
1019 | __func__, i, e->index, e->reserved); |
1020 | return false; |
1021 | } |
1022 | return true; |
1023 | } |
1024 | |
1025 | static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) |
1026 | { |
1027 | u64 data; |
1028 | u32 i; |
1029 | struct vmx_msr_entry e; |
1030 | u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); |
1031 | |
1032 | for (i = 0; i < count; i++) { |
1033 | if (unlikely(i >= max_msr_list_size)) |
1034 | return -EINVAL; |
1035 | |
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
1049 | return -EINVAL; |
1050 | } |
1051 | } |
1052 | return 0; |
1053 | } |
1054 | |
1055 | static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) |
1056 | { |
1057 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
1058 | u32 count = vmcs12->vm_exit_msr_store_count; |
1059 | u64 gpa = vmcs12->vm_exit_msr_store_addr; |
1060 | struct vmx_msr_entry e; |
1061 | u32 i; |
1062 | |
1063 | for (i = 0; i < count; i++) { |
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1065 | return false; |
1066 | |
1067 | if (e.index == msr_index) |
1068 | return true; |
1069 | } |
1070 | return false; |
1071 | } |
1072 | |
1073 | static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, |
1074 | u32 msr_index) |
1075 | { |
1076 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1077 | struct vmx_msrs *autostore = &vmx->msr_autostore.guest; |
1078 | bool in_vmcs12_store_list; |
1079 | int msr_autostore_slot; |
1080 | bool in_autostore_list; |
1081 | int last; |
1082 | |
	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1084 | in_autostore_list = msr_autostore_slot >= 0; |
1085 | in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); |
1086 | |
1087 | if (in_vmcs12_store_list && !in_autostore_list) { |
1088 | if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { |
1089 | /* |
1090 | * Emulated VMEntry does not fail here. Instead a less |
1091 | * accurate value will be returned by |
1092 | * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() |
1093 | * instead of reading the value from the vmcs02 VMExit |
1094 | * MSR-store area. |
1095 | */ |
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
1099 | return; |
1100 | } |
1101 | last = autostore->nr++; |
1102 | autostore->val[last].index = msr_index; |
1103 | } else if (!in_vmcs12_store_list && in_autostore_list) { |
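		/*
		 * Remove the stale entry by overwriting its slot with the
		 * last entry in the list; the order of the autostore list
		 * doesn't matter.
		 */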
1104 | last = --autostore->nr; |
1105 | autostore->val[msr_autostore_slot] = autostore->val[last]; |
1106 | } |
1107 | } |
1108 | |
1109 | /* |
1110 | * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are |
1111 | * emulating VM-Entry into a guest with EPT enabled. On failure, the expected |
1112 | * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to |
1113 | * @entry_failure_code. |
1114 | */ |
1115 | static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, |
1116 | bool nested_ept, bool reload_pdptrs, |
1117 | enum vm_entry_failure_code *entry_failure_code) |
1118 | { |
1119 | if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { |
1120 | *entry_failure_code = ENTRY_FAIL_DEFAULT; |
1121 | return -EINVAL; |
1122 | } |
1123 | |
1124 | /* |
1125 | * If PAE paging and EPT are both on, CR3 is not used by the CPU and |
1126 | * must not be dereferenced. |
1127 | */ |
1128 | if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && |
1129 | CC(!load_pdptrs(vcpu, cr3))) { |
1130 | *entry_failure_code = ENTRY_FAIL_PDPTE; |
1131 | return -EINVAL; |
1132 | } |
1133 | |
1134 | vcpu->arch.cr3 = cr3; |
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);
1142 | |
1143 | return 0; |
1144 | } |
1145 | |
1146 | /* |
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated by L1.
1150 | * |
1151 | * If L0 uses EPT, L1 and L2 run with different EPTP because |
1152 | * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries |
1153 | * are tagged with different EPTP. |
1154 | * |
1155 | * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged |
1156 | * with different VPID (L1 entries are tagged with vmx->vpid |
1157 | * while L2 entries are tagged with vmx->nested.vpid02). |
1158 | */ |
1159 | static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) |
1160 | { |
1161 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
1162 | |
1163 | return enable_ept || |
1164 | (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); |
1165 | } |
1166 | |
1167 | static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, |
1168 | struct vmcs12 *vmcs12, |
1169 | bool is_vmenter) |
1170 | { |
1171 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1172 | |
1173 | /* Handle pending Hyper-V TLB flush requests */ |
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
1175 | |
1176 | /* |
1177 | * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings |
1178 | * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a |
1179 | * full TLB flush from the guest's perspective. This is required even |
1180 | * if VPID is disabled in the host as KVM may need to synchronize the |
1181 | * MMU in response to the guest TLB flush. |
1182 | * |
1183 | * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. |
1184 | * EPT is a special snowflake, as guest-physical mappings aren't |
1185 | * flushed on VPID invalidations, including VM-Enter or VM-Exit with |
1186 | * VPID disabled. As a result, KVM _never_ needs to sync nEPT |
1187 | * entries on VM-Enter because L1 can't rely on VM-Enter to flush |
1188 | * those mappings. |
1189 | */ |
1190 | if (!nested_cpu_has_vpid(vmcs12)) { |
1191 | kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
1192 | return; |
1193 | } |
1194 | |
1195 | /* L2 should never have a VPID if VPID is disabled. */ |
1196 | WARN_ON(!enable_vpid); |
1197 | |
1198 | /* |
1199 | * VPID is enabled and in use by vmcs12. If vpid12 is changing, then |
1200 | * emulate a guest TLB flush as KVM does not track vpid12 history nor |
1201 | * is the VPID incorporated into the MMU context. I.e. KVM must assume |
1202 | * that the new vpid12 has never been used and thus represents a new |
1203 | * guest ASID that cannot have entries in the TLB. |
1204 | */ |
1205 | if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { |
1206 | vmx->nested.last_vpid = vmcs12->virtual_processor_id; |
1207 | kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
1208 | return; |
1209 | } |
1210 | |
1211 | /* |
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
1213 | * does not have a unique TLB tag (ASID), i.e. EPT is disabled and |
1214 | * KVM was unable to allocate a VPID for L2, flush the current context |
1215 | * as the effective ASID is common to both L1 and L2. |
1216 | */ |
1217 | if (!nested_has_guest_tlb_tag(vcpu)) |
1218 | kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); |
1219 | } |
1220 | |
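/*
 * Returns true if every bit set in @subset (within @mask) is also set in
 * @superset, i.e. @subset requests nothing that @superset doesn't allow.
 */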
1221 | static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) |
1222 | { |
1223 | superset &= mask; |
1224 | subset &= mask; |
1225 | |
1226 | return (superset | subset) == superset; |
1227 | } |
1228 | |
1229 | static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) |
1230 | { |
1231 | const u64 feature_and_reserved = |
1232 | /* feature (except bit 48; see below) */ |
1233 | BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | |
1234 | /* reserved */ |
1235 | BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); |
1236 | u64 vmx_basic = vmcs_config.nested.basic; |
1237 | |
	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1239 | return -EINVAL; |
1240 | |
1241 | /* |
1242 | * KVM does not emulate a version of VMX that constrains physical |
1243 | * addresses of VMX structures (e.g. VMCS) to 32-bits. |
1244 | */ |
1245 | if (data & BIT_ULL(48)) |
1246 | return -EINVAL; |
1247 | |
	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1253 | return -EINVAL; |
1254 | |
1255 | vmx->nested.msrs.basic = data; |
1256 | return 0; |
1257 | } |
1258 | |
1259 | static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, |
1260 | u32 **low, u32 **high) |
1261 | { |
1262 | switch (msr_index) { |
1263 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: |
1264 | *low = &msrs->pinbased_ctls_low; |
1265 | *high = &msrs->pinbased_ctls_high; |
1266 | break; |
1267 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: |
1268 | *low = &msrs->procbased_ctls_low; |
1269 | *high = &msrs->procbased_ctls_high; |
1270 | break; |
1271 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: |
1272 | *low = &msrs->exit_ctls_low; |
1273 | *high = &msrs->exit_ctls_high; |
1274 | break; |
1275 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: |
1276 | *low = &msrs->entry_ctls_low; |
1277 | *high = &msrs->entry_ctls_high; |
1278 | break; |
1279 | case MSR_IA32_VMX_PROCBASED_CTLS2: |
1280 | *low = &msrs->secondary_ctls_low; |
1281 | *high = &msrs->secondary_ctls_high; |
1282 | break; |
1283 | default: |
1284 | BUG(); |
1285 | } |
1286 | } |
1287 | |
1288 | static int |
1289 | vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) |
1290 | { |
1291 | u32 *lowp, *highp; |
1292 | u64 supported; |
1293 | |
	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
1307 | *lowp = data; |
1308 | *highp = data >> 32; |
1309 | return 0; |
1310 | } |
1311 | |
1312 | static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) |
1313 | { |
1314 | const u64 feature_and_reserved_bits = |
1315 | /* feature */ |
1316 | BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | |
1317 | BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | |
1318 | /* reserved */ |
1319 | GENMASK_ULL(13, 9) | BIT_ULL(31); |
	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1339 | return -EINVAL; |
1340 | |
1341 | vmx->nested.msrs.misc_low = data; |
1342 | vmx->nested.msrs.misc_high = data >> 32; |
1343 | |
1344 | return 0; |
1345 | } |
1346 | |
1347 | static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) |
1348 | { |
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1354 | return -EINVAL; |
1355 | |
1356 | vmx->nested.msrs.ept_caps = data; |
1357 | vmx->nested.msrs.vpid_caps = data >> 32; |
1358 | return 0; |
1359 | } |
1360 | |
1361 | static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) |
1362 | { |
1363 | switch (msr_index) { |
1364 | case MSR_IA32_VMX_CR0_FIXED0: |
1365 | return &msrs->cr0_fixed0; |
1366 | case MSR_IA32_VMX_CR4_FIXED0: |
1367 | return &msrs->cr4_fixed0; |
1368 | default: |
1369 | BUG(); |
1370 | } |
1371 | } |
1372 | |
1373 | static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) |
1374 | { |
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicate bits that "must be 1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
1385 | return 0; |
1386 | } |
1387 | |
1388 | /* |
1389 | * Called when userspace is restoring VMX MSRs. |
1390 | * |
1391 | * Returns 0 on success, non-0 otherwise. |
1392 | */ |
1393 | int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) |
1394 | { |
1395 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1396 | |
1397 | /* |
1398 | * Don't allow changes to the VMX capability MSRs while the vCPU |
1399 | * is in VMX operation. |
1400 | */ |
1401 | if (vmx->nested.vmxon) |
1402 | return -EBUSY; |
1403 | |
1404 | switch (msr_index) { |
1405 | case MSR_IA32_VMX_BASIC: |
1406 | return vmx_restore_vmx_basic(vmx, data); |
1407 | case MSR_IA32_VMX_PINBASED_CTLS: |
1408 | case MSR_IA32_VMX_PROCBASED_CTLS: |
1409 | case MSR_IA32_VMX_EXIT_CTLS: |
1410 | case MSR_IA32_VMX_ENTRY_CTLS: |
1411 | /* |
1412 | * The "non-true" VMX capability MSRs are generated from the |
1413 | * "true" MSRs, so we do not support restoring them directly. |
1414 | * |
1415 | * If userspace wants to emulate VMX_BASIC[55]=0, userspace |
1416 | * should restore the "true" MSRs with the must-be-1 bits |
1417 | * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND |
1418 | * DEFAULT SETTINGS". |
1419 | */ |
1420 | return -EINVAL; |
1421 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: |
1422 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: |
1423 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: |
1424 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: |
1425 | case MSR_IA32_VMX_PROCBASED_CTLS2: |
1426 | return vmx_restore_control_msr(vmx, msr_index, data); |
1427 | case MSR_IA32_VMX_MISC: |
1428 | return vmx_restore_vmx_misc(vmx, data); |
1429 | case MSR_IA32_VMX_CR0_FIXED0: |
1430 | case MSR_IA32_VMX_CR4_FIXED0: |
1431 | return vmx_restore_fixed0_msr(vmx, msr_index, data); |
1432 | case MSR_IA32_VMX_CR0_FIXED1: |
1433 | case MSR_IA32_VMX_CR4_FIXED1: |
1434 | /* |
1435 | * These MSRs are generated based on the vCPU's CPUID, so we |
1436 | * do not support restoring them directly. |
1437 | */ |
1438 | return -EINVAL; |
1439 | case MSR_IA32_VMX_EPT_VPID_CAP: |
1440 | return vmx_restore_vmx_ept_vpid_cap(vmx, data); |
1441 | case MSR_IA32_VMX_VMCS_ENUM: |
1442 | vmx->nested.msrs.vmcs_enum = data; |
1443 | return 0; |
1444 | case MSR_IA32_VMX_VMFUNC: |
1445 | if (data & ~vmcs_config.nested.vmfunc_controls) |
1446 | return -EINVAL; |
1447 | vmx->nested.msrs.vmfunc_controls = data; |
1448 | return 0; |
1449 | default: |
1450 | /* |
1451 | * The rest of the VMX capability MSRs do not support restore. |
1452 | */ |
1453 | return -EINVAL; |
1454 | } |
1455 | } |
1456 | |
1457 | /* Returns 0 on success, non-0 otherwise. */ |
1458 | int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) |
1459 | { |
1460 | switch (msr_index) { |
1461 | case MSR_IA32_VMX_BASIC: |
1462 | *pdata = msrs->basic; |
1463 | break; |
1464 | case MSR_IA32_VMX_TRUE_PINBASED_CTLS: |
1465 | case MSR_IA32_VMX_PINBASED_CTLS: |
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
1469 | if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) |
1470 | *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; |
1471 | break; |
1472 | case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: |
1473 | case MSR_IA32_VMX_PROCBASED_CTLS: |
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
1477 | if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) |
1478 | *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; |
1479 | break; |
1480 | case MSR_IA32_VMX_TRUE_EXIT_CTLS: |
1481 | case MSR_IA32_VMX_EXIT_CTLS: |
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
1485 | if (msr_index == MSR_IA32_VMX_EXIT_CTLS) |
1486 | *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; |
1487 | break; |
1488 | case MSR_IA32_VMX_TRUE_ENTRY_CTLS: |
1489 | case MSR_IA32_VMX_ENTRY_CTLS: |
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
1493 | if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) |
1494 | *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; |
1495 | break; |
1496 | case MSR_IA32_VMX_MISC: |
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
1500 | break; |
1501 | case MSR_IA32_VMX_CR0_FIXED0: |
1502 | *pdata = msrs->cr0_fixed0; |
1503 | break; |
1504 | case MSR_IA32_VMX_CR0_FIXED1: |
1505 | *pdata = msrs->cr0_fixed1; |
1506 | break; |
1507 | case MSR_IA32_VMX_CR4_FIXED0: |
1508 | *pdata = msrs->cr4_fixed0; |
1509 | break; |
1510 | case MSR_IA32_VMX_CR4_FIXED1: |
1511 | *pdata = msrs->cr4_fixed1; |
1512 | break; |
1513 | case MSR_IA32_VMX_VMCS_ENUM: |
1514 | *pdata = msrs->vmcs_enum; |
1515 | break; |
1516 | case MSR_IA32_VMX_PROCBASED_CTLS2: |
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
1520 | break; |
1521 | case MSR_IA32_VMX_EPT_VPID_CAP: |
1522 | *pdata = msrs->ept_caps | |
1523 | ((u64)msrs->vpid_caps << 32); |
1524 | break; |
1525 | case MSR_IA32_VMX_VMFUNC: |
1526 | *pdata = msrs->vmfunc_controls; |
1527 | break; |
1528 | default: |
1529 | return 1; |
1530 | } |
1531 | |
1532 | return 0; |
1533 | } |
1534 | |
1535 | /* |
1536 | * Copy the writable VMCS shadow fields back to the VMCS12, in case they have |
1537 | * been modified by the L1 guest. Note, "writable" in this context means |
1538 | * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of |
1539 | * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" |
1540 | * VM-exit information fields (which are actually writable if the vCPU is |
1541 | * configured to support "VMWRITE to any supported field in the VMCS"). |
1542 | */ |
1543 | static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) |
1544 | { |
1545 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; |
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1547 | struct shadow_vmcs_field field; |
1548 | unsigned long val; |
1549 | int i; |
1550 | |
1551 | if (WARN_ON(!shadow_vmcs)) |
1552 | return; |
1553 | |
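	/*
	 * Preemption stays disabled while the shadow VMCS is temporarily the
	 * current VMCS on this CPU; a reschedule in between would reload the
	 * vCPU's VMCS and the __vmcs_readl() calls below would then hit the
	 * wrong VMCS.
	 */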
1554 | preempt_disable(); |
1555 | |
	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
1566 | |
1567 | preempt_enable(); |
1568 | } |
1569 | |
1570 | static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) |
1571 | { |
1572 | const struct shadow_vmcs_field *fields[] = { |
1573 | shadow_read_write_fields, |
1574 | shadow_read_only_fields |
1575 | }; |
1576 | const int max_fields[] = { |
1577 | max_shadow_read_write_fields, |
1578 | max_shadow_read_only_fields |
1579 | }; |
1580 | struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; |
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1582 | struct shadow_vmcs_field field; |
1583 | unsigned long val; |
1584 | int i, q; |
1585 | |
1586 | if (WARN_ON(!shadow_vmcs)) |
1587 | return; |
1588 | |
	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
1602 | } |
1603 | |
1604 | static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) |
1605 | { |
1606 | #ifdef CONFIG_KVM_HYPERV |
1607 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; |
1608 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
1609 | struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu: &vmx->vcpu); |
1610 | |
1611 | /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ |
1612 | vmcs12->tpr_threshold = evmcs->tpr_threshold; |
1613 | vmcs12->guest_rip = evmcs->guest_rip; |
1614 | |
1615 | if (unlikely(!(hv_clean_fields & |
1616 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { |
1617 | hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; |
1618 | hv_vcpu->nested.vm_id = evmcs->hv_vm_id; |
1619 | hv_vcpu->nested.vp_id = evmcs->hv_vp_id; |
1620 | } |
1621 | |
1622 | if (unlikely(!(hv_clean_fields & |
1623 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { |
1624 | vmcs12->guest_rsp = evmcs->guest_rsp; |
1625 | vmcs12->guest_rflags = evmcs->guest_rflags; |
1626 | vmcs12->guest_interruptibility_info = |
1627 | evmcs->guest_interruptibility_info; |
1628 | /* |
1629 | * Not present in struct vmcs12: |
1630 | * vmcs12->guest_ssp = evmcs->guest_ssp; |
1631 | */ |
1632 | } |
1633 | |
1634 | if (unlikely(!(hv_clean_fields & |
1635 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { |
1636 | vmcs12->cpu_based_vm_exec_control = |
1637 | evmcs->cpu_based_vm_exec_control; |
1638 | } |
1639 | |
1640 | if (unlikely(!(hv_clean_fields & |
1641 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { |
1642 | vmcs12->exception_bitmap = evmcs->exception_bitmap; |
1643 | } |
1644 | |
1645 | if (unlikely(!(hv_clean_fields & |
1646 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { |
1647 | vmcs12->vm_entry_controls = evmcs->vm_entry_controls; |
1648 | } |
1649 | |
1650 | if (unlikely(!(hv_clean_fields & |
1651 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { |
1652 | vmcs12->vm_entry_intr_info_field = |
1653 | evmcs->vm_entry_intr_info_field; |
1654 | vmcs12->vm_entry_exception_error_code = |
1655 | evmcs->vm_entry_exception_error_code; |
1656 | vmcs12->vm_entry_instruction_len = |
1657 | evmcs->vm_entry_instruction_len; |
1658 | } |
1659 | |
1660 | if (unlikely(!(hv_clean_fields & |
1661 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { |
1662 | vmcs12->host_ia32_pat = evmcs->host_ia32_pat; |
1663 | vmcs12->host_ia32_efer = evmcs->host_ia32_efer; |
1664 | vmcs12->host_cr0 = evmcs->host_cr0; |
1665 | vmcs12->host_cr3 = evmcs->host_cr3; |
1666 | vmcs12->host_cr4 = evmcs->host_cr4; |
1667 | vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; |
1668 | vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; |
1669 | vmcs12->host_rip = evmcs->host_rip; |
1670 | vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; |
1671 | vmcs12->host_es_selector = evmcs->host_es_selector; |
1672 | vmcs12->host_cs_selector = evmcs->host_cs_selector; |
1673 | vmcs12->host_ss_selector = evmcs->host_ss_selector; |
1674 | vmcs12->host_ds_selector = evmcs->host_ds_selector; |
1675 | vmcs12->host_fs_selector = evmcs->host_fs_selector; |
1676 | vmcs12->host_gs_selector = evmcs->host_gs_selector; |
1677 | vmcs12->host_tr_selector = evmcs->host_tr_selector; |
1678 | vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; |
1679 | /* |
1680 | * Not present in struct vmcs12: |
1681 | * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; |
1682 | * vmcs12->host_ssp = evmcs->host_ssp; |
1683 | * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; |
1684 | */ |
1685 | } |
1686 | |
1687 | if (unlikely(!(hv_clean_fields & |
1688 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { |
1689 | vmcs12->pin_based_vm_exec_control = |
1690 | evmcs->pin_based_vm_exec_control; |
1691 | vmcs12->vm_exit_controls = evmcs->vm_exit_controls; |
1692 | vmcs12->secondary_vm_exec_control = |
1693 | evmcs->secondary_vm_exec_control; |
1694 | } |
1695 | |
1696 | if (unlikely(!(hv_clean_fields & |
1697 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { |
1698 | vmcs12->io_bitmap_a = evmcs->io_bitmap_a; |
1699 | vmcs12->io_bitmap_b = evmcs->io_bitmap_b; |
1700 | } |
1701 | |
1702 | if (unlikely(!(hv_clean_fields & |
1703 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { |
1704 | vmcs12->msr_bitmap = evmcs->msr_bitmap; |
1705 | } |
1706 | |
1707 | if (unlikely(!(hv_clean_fields & |
1708 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { |
1709 | vmcs12->guest_es_base = evmcs->guest_es_base; |
1710 | vmcs12->guest_cs_base = evmcs->guest_cs_base; |
1711 | vmcs12->guest_ss_base = evmcs->guest_ss_base; |
1712 | vmcs12->guest_ds_base = evmcs->guest_ds_base; |
1713 | vmcs12->guest_fs_base = evmcs->guest_fs_base; |
1714 | vmcs12->guest_gs_base = evmcs->guest_gs_base; |
1715 | vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; |
1716 | vmcs12->guest_tr_base = evmcs->guest_tr_base; |
1717 | vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; |
1718 | vmcs12->guest_idtr_base = evmcs->guest_idtr_base; |
1719 | vmcs12->guest_es_limit = evmcs->guest_es_limit; |
1720 | vmcs12->guest_cs_limit = evmcs->guest_cs_limit; |
1721 | vmcs12->guest_ss_limit = evmcs->guest_ss_limit; |
1722 | vmcs12->guest_ds_limit = evmcs->guest_ds_limit; |
1723 | vmcs12->guest_fs_limit = evmcs->guest_fs_limit; |
1724 | vmcs12->guest_gs_limit = evmcs->guest_gs_limit; |
1725 | vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; |
1726 | vmcs12->guest_tr_limit = evmcs->guest_tr_limit; |
1727 | vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; |
1728 | vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; |
1729 | vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; |
1730 | vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; |
1731 | vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; |
1732 | vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; |
1733 | vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; |
1734 | vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; |
1735 | vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; |
1736 | vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; |
1737 | vmcs12->guest_es_selector = evmcs->guest_es_selector; |
1738 | vmcs12->guest_cs_selector = evmcs->guest_cs_selector; |
1739 | vmcs12->guest_ss_selector = evmcs->guest_ss_selector; |
1740 | vmcs12->guest_ds_selector = evmcs->guest_ds_selector; |
1741 | vmcs12->guest_fs_selector = evmcs->guest_fs_selector; |
1742 | vmcs12->guest_gs_selector = evmcs->guest_gs_selector; |
1743 | vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; |
1744 | vmcs12->guest_tr_selector = evmcs->guest_tr_selector; |
1745 | } |
1746 | |
1747 | if (unlikely(!(hv_clean_fields & |
1748 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { |
1749 | vmcs12->tsc_offset = evmcs->tsc_offset; |
1750 | vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; |
1751 | vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; |
1752 | vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; |
1753 | vmcs12->tsc_multiplier = evmcs->tsc_multiplier; |
1754 | } |
1755 | |
1756 | if (unlikely(!(hv_clean_fields & |
1757 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { |
1758 | vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; |
1759 | vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; |
1760 | vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; |
1761 | vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; |
1762 | vmcs12->guest_cr0 = evmcs->guest_cr0; |
1763 | vmcs12->guest_cr3 = evmcs->guest_cr3; |
1764 | vmcs12->guest_cr4 = evmcs->guest_cr4; |
1765 | vmcs12->guest_dr7 = evmcs->guest_dr7; |
1766 | } |
1767 | |
1768 | if (unlikely(!(hv_clean_fields & |
1769 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { |
1770 | vmcs12->host_fs_base = evmcs->host_fs_base; |
1771 | vmcs12->host_gs_base = evmcs->host_gs_base; |
1772 | vmcs12->host_tr_base = evmcs->host_tr_base; |
1773 | vmcs12->host_gdtr_base = evmcs->host_gdtr_base; |
1774 | vmcs12->host_idtr_base = evmcs->host_idtr_base; |
1775 | vmcs12->host_rsp = evmcs->host_rsp; |
1776 | } |
1777 | |
1778 | if (unlikely(!(hv_clean_fields & |
1779 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { |
1780 | vmcs12->ept_pointer = evmcs->ept_pointer; |
1781 | vmcs12->virtual_processor_id = evmcs->virtual_processor_id; |
1782 | } |
1783 | |
1784 | if (unlikely(!(hv_clean_fields & |
1785 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { |
1786 | vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; |
1787 | vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; |
1788 | vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; |
1789 | vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; |
1790 | vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; |
1791 | vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; |
1792 | vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; |
1793 | vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; |
1794 | vmcs12->guest_pending_dbg_exceptions = |
1795 | evmcs->guest_pending_dbg_exceptions; |
1796 | vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; |
1797 | vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; |
1798 | vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; |
1799 | vmcs12->guest_activity_state = evmcs->guest_activity_state; |
1800 | vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; |
1801 | vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; |
1802 | /* |
1803 | * Not present in struct vmcs12: |
1804 | * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; |
1805 | * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; |
1806 | * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; |
1807 | */ |
1808 | } |
1809 | |
1810 | /* |
1811 | * Not used? |
1812 | * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; |
1813 | * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; |
1814 | * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; |
1815 | * vmcs12->page_fault_error_code_mask = |
1816 | * evmcs->page_fault_error_code_mask; |
1817 | * vmcs12->page_fault_error_code_match = |
1818 | * evmcs->page_fault_error_code_match; |
1819 | * vmcs12->cr3_target_count = evmcs->cr3_target_count; |
1820 | * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; |
1821 | * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; |
1822 | * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; |
1823 | */ |
1824 | |
1825 | /* |
1826 | * Read only fields: |
1827 | * vmcs12->guest_physical_address = evmcs->guest_physical_address; |
1828 | * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; |
1829 | * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; |
1830 | * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; |
1831 | * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; |
1832 | * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; |
1833 | * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; |
1834 | * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; |
1835 | * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; |
1836 | * vmcs12->exit_qualification = evmcs->exit_qualification; |
1837 | * vmcs12->guest_linear_address = evmcs->guest_linear_address; |
1838 | * |
1839 | * Not present in struct vmcs12: |
1840 | * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; |
1841 | * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; |
1842 | * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; |
1843 | * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; |
1844 | */ |
1845 | |
1846 | return; |
1847 | #else /* CONFIG_KVM_HYPERV */ |
1848 | KVM_BUG_ON(1, vmx->vcpu.kvm); |
1849 | #endif /* CONFIG_KVM_HYPERV */ |
1850 | } |
1851 | |
1852 | static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) |
1853 | { |
1854 | #ifdef CONFIG_KVM_HYPERV |
1855 | struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; |
1856 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
1857 | |
1858 | /* |
1859 | * Should not be changed by KVM: |
1860 | * |
1861 | * evmcs->host_es_selector = vmcs12->host_es_selector; |
1862 | * evmcs->host_cs_selector = vmcs12->host_cs_selector; |
1863 | * evmcs->host_ss_selector = vmcs12->host_ss_selector; |
1864 | * evmcs->host_ds_selector = vmcs12->host_ds_selector; |
1865 | * evmcs->host_fs_selector = vmcs12->host_fs_selector; |
1866 | * evmcs->host_gs_selector = vmcs12->host_gs_selector; |
1867 | * evmcs->host_tr_selector = vmcs12->host_tr_selector; |
1868 | * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; |
1869 | * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; |
1870 | * evmcs->host_cr0 = vmcs12->host_cr0; |
1871 | * evmcs->host_cr3 = vmcs12->host_cr3; |
1872 | * evmcs->host_cr4 = vmcs12->host_cr4; |
1873 | * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; |
1874 | * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; |
1875 | * evmcs->host_rip = vmcs12->host_rip; |
1876 | * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; |
1877 | * evmcs->host_fs_base = vmcs12->host_fs_base; |
1878 | * evmcs->host_gs_base = vmcs12->host_gs_base; |
1879 | * evmcs->host_tr_base = vmcs12->host_tr_base; |
1880 | * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; |
1881 | * evmcs->host_idtr_base = vmcs12->host_idtr_base; |
1882 | * evmcs->host_rsp = vmcs12->host_rsp; |
1883 | * sync_vmcs02_to_vmcs12() doesn't read these: |
1884 | * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; |
1885 | * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; |
1886 | * evmcs->msr_bitmap = vmcs12->msr_bitmap; |
1887 | * evmcs->ept_pointer = vmcs12->ept_pointer; |
1888 | * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; |
1889 | * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; |
1890 | * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; |
1891 | * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; |
1892 | * evmcs->tpr_threshold = vmcs12->tpr_threshold; |
1893 | * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; |
1894 | * evmcs->exception_bitmap = vmcs12->exception_bitmap; |
1895 | * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; |
1896 | * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; |
1897 | * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; |
1898 | * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; |
1899 | * evmcs->page_fault_error_code_mask = |
1900 | * vmcs12->page_fault_error_code_mask; |
1901 | * evmcs->page_fault_error_code_match = |
1902 | * vmcs12->page_fault_error_code_match; |
1903 | * evmcs->cr3_target_count = vmcs12->cr3_target_count; |
1904 | * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; |
1905 | * evmcs->tsc_offset = vmcs12->tsc_offset; |
1906 | * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; |
1907 | * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; |
1908 | * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; |
1909 | * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; |
1910 | * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; |
1911 | * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; |
1912 | * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; |
1913 | * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; |
1914 | * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; |
1915 | * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; |
1916 | * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; |
1917 | * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; |
1918 | * |
1919 | * Not present in struct vmcs12: |
1920 | * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; |
1921 | * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; |
1922 | * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; |
1923 | * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; |
1924 | * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; |
1925 | * evmcs->host_ssp = vmcs12->host_ssp; |
1926 | * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; |
1927 | * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; |
1928 | * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; |
1929 | * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; |
1930 | * evmcs->guest_ssp = vmcs12->guest_ssp; |
1931 | */ |
1932 | |
1933 | evmcs->guest_es_selector = vmcs12->guest_es_selector; |
1934 | evmcs->guest_cs_selector = vmcs12->guest_cs_selector; |
1935 | evmcs->guest_ss_selector = vmcs12->guest_ss_selector; |
1936 | evmcs->guest_ds_selector = vmcs12->guest_ds_selector; |
1937 | evmcs->guest_fs_selector = vmcs12->guest_fs_selector; |
1938 | evmcs->guest_gs_selector = vmcs12->guest_gs_selector; |
1939 | evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; |
1940 | evmcs->guest_tr_selector = vmcs12->guest_tr_selector; |
1941 | |
1942 | evmcs->guest_es_limit = vmcs12->guest_es_limit; |
1943 | evmcs->guest_cs_limit = vmcs12->guest_cs_limit; |
1944 | evmcs->guest_ss_limit = vmcs12->guest_ss_limit; |
1945 | evmcs->guest_ds_limit = vmcs12->guest_ds_limit; |
1946 | evmcs->guest_fs_limit = vmcs12->guest_fs_limit; |
1947 | evmcs->guest_gs_limit = vmcs12->guest_gs_limit; |
1948 | evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; |
1949 | evmcs->guest_tr_limit = vmcs12->guest_tr_limit; |
1950 | evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; |
1951 | evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; |
1952 | |
1953 | evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; |
1954 | evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; |
1955 | evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; |
1956 | evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; |
1957 | evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; |
1958 | evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; |
1959 | evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; |
1960 | evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; |
1961 | |
1962 | evmcs->guest_es_base = vmcs12->guest_es_base; |
1963 | evmcs->guest_cs_base = vmcs12->guest_cs_base; |
1964 | evmcs->guest_ss_base = vmcs12->guest_ss_base; |
1965 | evmcs->guest_ds_base = vmcs12->guest_ds_base; |
1966 | evmcs->guest_fs_base = vmcs12->guest_fs_base; |
1967 | evmcs->guest_gs_base = vmcs12->guest_gs_base; |
1968 | evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; |
1969 | evmcs->guest_tr_base = vmcs12->guest_tr_base; |
1970 | evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; |
1971 | evmcs->guest_idtr_base = vmcs12->guest_idtr_base; |
1972 | |
1973 | evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; |
1974 | evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; |
1975 | |
1976 | evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; |
1977 | evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; |
1978 | evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; |
1979 | evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; |
1980 | |
1981 | evmcs->guest_pending_dbg_exceptions = |
1982 | vmcs12->guest_pending_dbg_exceptions; |
1983 | evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; |
1984 | evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; |
1985 | |
1986 | evmcs->guest_activity_state = vmcs12->guest_activity_state; |
1987 | evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; |
1988 | |
1989 | evmcs->guest_cr0 = vmcs12->guest_cr0; |
1990 | evmcs->guest_cr3 = vmcs12->guest_cr3; |
1991 | evmcs->guest_cr4 = vmcs12->guest_cr4; |
1992 | evmcs->guest_dr7 = vmcs12->guest_dr7; |
1993 | |
1994 | evmcs->guest_physical_address = vmcs12->guest_physical_address; |
1995 | |
1996 | evmcs->vm_instruction_error = vmcs12->vm_instruction_error; |
1997 | evmcs->vm_exit_reason = vmcs12->vm_exit_reason; |
1998 | evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; |
1999 | evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; |
2000 | evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; |
2001 | evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; |
2002 | evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; |
2003 | evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; |
2004 | |
2005 | evmcs->exit_qualification = vmcs12->exit_qualification; |
2006 | |
2007 | evmcs->guest_linear_address = vmcs12->guest_linear_address; |
2008 | evmcs->guest_rsp = vmcs12->guest_rsp; |
2009 | evmcs->guest_rflags = vmcs12->guest_rflags; |
2010 | |
2011 | evmcs->guest_interruptibility_info = |
2012 | vmcs12->guest_interruptibility_info; |
2013 | evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; |
2014 | evmcs->vm_entry_controls = vmcs12->vm_entry_controls; |
2015 | evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; |
2016 | evmcs->vm_entry_exception_error_code = |
2017 | vmcs12->vm_entry_exception_error_code; |
2018 | evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; |
2019 | |
2020 | evmcs->guest_rip = vmcs12->guest_rip; |
2021 | |
2022 | evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; |
2023 | |
2024 | return; |
2025 | #else /* CONFIG_KVM_HYPERV */ |
2026 | KVM_BUG_ON(1, vmx->vcpu.kvm); |
2027 | #endif /* CONFIG_KVM_HYPERV */ |
2028 | } |
2029 | |
2030 | /* |
2031 | * This is an equivalent of the nested hypervisor executing the vmptrld |
2032 | * instruction. |
2033 | */ |
2034 | static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( |
2035 | struct kvm_vcpu *vcpu, bool from_launch) |
2036 | { |
2037 | #ifdef CONFIG_KVM_HYPERV |
2038 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2039 | bool evmcs_gpa_changed = false; |
2040 | u64 evmcs_gpa; |
2041 | |
2042 | if (likely(!guest_cpuid_has_evmcs(vcpu))) |
2043 | return EVMPTRLD_DISABLED; |
2044 | |
2045 | evmcs_gpa = nested_get_evmptr(vcpu); |
2046 | if (!evmptr_is_valid(evmptr: evmcs_gpa)) { |
2047 | nested_release_evmcs(vcpu); |
2048 | return EVMPTRLD_DISABLED; |
2049 | } |
2050 | |
2051 | if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { |
2052 | vmx->nested.current_vmptr = INVALID_GPA; |
2053 | |
2054 | nested_release_evmcs(vcpu); |
2055 | |
2056 | if (kvm_vcpu_map(vcpu, gpa: gpa_to_gfn(gpa: evmcs_gpa), |
2057 | map: &vmx->nested.hv_evmcs_map)) |
2058 | return EVMPTRLD_ERROR; |
2059 | |
2060 | vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; |
2061 | |
2062 | /* |
2063 | * Currently, KVM only supports eVMCS version 1 |
2064 | * (== KVM_EVMCS_VERSION) and thus we expect guest to set this |
2065 | * value to first u32 field of eVMCS which should specify eVMCS |
2066 | * VersionNumber. |
2067 | * |
2068 | * Guest should be aware of supported eVMCS versions by host by |
2069 | * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is |
2070 | * expected to set this CPUID leaf according to the value |
2071 | * returned in vmcs_version from nested_enable_evmcs(). |
2072 | * |
2073 | * However, it turns out that Microsoft Hyper-V fails to comply |
2074 | * to their own invented interface: When Hyper-V use eVMCS, it |
2075 | * just sets first u32 field of eVMCS to revision_id specified |
2076 | * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number |
2077 | * which is one of the supported versions specified in |
2078 | * CPUID.0x4000000A.EAX[0:15]. |
2079 | * |
2080 | * To overcome Hyper-V bug, we accept here either a supported |
2081 | * eVMCS version or VMCS12 revision_id as valid values for first |
2082 | * u32 field of eVMCS. |
2083 | */ |
2084 | if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && |
2085 | (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { |
2086 | nested_release_evmcs(vcpu); |
2087 | return EVMPTRLD_VMFAIL; |
2088 | } |
2089 | |
2090 | vmx->nested.hv_evmcs_vmptr = evmcs_gpa; |
2091 | |
2092 | evmcs_gpa_changed = true; |
2093 | /* |
2094 | * Unlike normal vmcs12, enlightened vmcs12 is not fully |
		 * reloaded from the guest's memory (read-only fields, fields not
2096 | * present in struct hv_enlightened_vmcs, ...). Make sure there |
2097 | * are no leftovers. |
2098 | */ |
2099 | if (from_launch) { |
2100 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
2101 | memset(vmcs12, 0, sizeof(*vmcs12)); |
2102 | vmcs12->hdr.revision_id = VMCS12_REVISION; |
2103 | } |
2104 | |
2105 | } |
2106 | |
2107 | /* |
	 * Clean fields data can't be used on VMLAUNCH or when switching
	 * between different L2 guests, as KVM keeps a single vmcs12 per L1.
2110 | */ |
2111 | if (from_launch || evmcs_gpa_changed) { |
2112 | vmx->nested.hv_evmcs->hv_clean_fields &= |
2113 | ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; |
2114 | |
2115 | vmx->nested.force_msr_bitmap_recalc = true; |
2116 | } |
2117 | |
2118 | return EVMPTRLD_SUCCEEDED; |
2119 | #else |
2120 | return EVMPTRLD_DISABLED; |
2121 | #endif |
2122 | } |
2123 | |
2124 | void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) |
2125 | { |
2126 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2127 | |
2128 | if (nested_vmx_is_evmptr12_valid(vmx)) |
2129 | copy_vmcs12_to_enlightened(vmx); |
2130 | else |
2131 | copy_vmcs12_to_shadow(vmx); |
2132 | |
2133 | vmx->nested.need_vmcs12_to_shadow_sync = false; |
2134 | } |
2135 | |
2136 | static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) |
2137 | { |
2138 | struct vcpu_vmx *vmx = |
2139 | container_of(timer, struct vcpu_vmx, nested.preemption_timer); |
2140 | |
2141 | vmx->nested.preemption_timer_expired = true; |
2142 | kvm_make_request(KVM_REQ_EVENT, vcpu: &vmx->vcpu); |
2143 | kvm_vcpu_kick(vcpu: &vmx->vcpu); |
2144 | |
2145 | return HRTIMER_NORESTART; |
2146 | } |
2147 | |
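/*
 * Compute the remaining value of the emulated VMX preemption timer, in timer
 * ticks.  The timer counts down at the L1 TSC rate shifted right by the
 * emulated rate (5), i.e. one tick per 2^5 = 32 L1 TSC cycles.
 */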
2148 | static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) |
2149 | { |
2150 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2151 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
2152 | |
2153 | u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, host_tsc: rdtsc()) >> |
2154 | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; |
2155 | |
2156 | if (!vmx->nested.has_preemption_timer_deadline) { |
2157 | vmx->nested.preemption_timer_deadline = |
2158 | vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; |
2159 | vmx->nested.has_preemption_timer_deadline = true; |
2160 | } |
2161 | return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; |
2162 | } |
2163 | |
2164 | static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, |
2165 | u64 preemption_timeout) |
2166 | { |
2167 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2168 | |
2169 | /* |
2170 | * A timer value of zero is architecturally guaranteed to cause |
2171 | * a VMExit prior to executing any instructions in the guest. |
2172 | */ |
2173 | if (preemption_timeout == 0) { |
2174 | vmx_preemption_timer_fn(timer: &vmx->nested.preemption_timer); |
2175 | return; |
2176 | } |
2177 | |
2178 | if (vcpu->arch.virtual_tsc_khz == 0) |
2179 | return; |
2180 | |
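	/*
	 * Convert timer ticks to nanoseconds for the hrtimer: shift by the
	 * emulated rate to get TSC cycles, then scale by 10^6 / tsc_khz.
	 * E.g. with a 2,000,000 kHz (2 GHz) virtual TSC, 64 cycles ~= 32 ns.
	 */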
2181 | preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; |
2182 | preemption_timeout *= 1000000; |
2183 | do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); |
2184 | hrtimer_start(timer: &vmx->nested.preemption_timer, |
2185 | ktime_add_ns(ktime_get(), preemption_timeout), |
2186 | mode: HRTIMER_MODE_ABS_PINNED); |
2187 | } |
2188 | |
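/*
 * Compute L2's effective EFER: use vmcs12's GUEST_IA32_EFER if VM-entry is
 * pending and loads EFER, otherwise derive LMA/LME from the "IA-32e mode
 * guest" VM-entry control on top of the current EFER.
 */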
2189 | static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) |
2190 | { |
2191 | if (vmx->nested.nested_run_pending && |
2192 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) |
2193 | return vmcs12->guest_ia32_efer; |
2194 | else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) |
2195 | return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); |
2196 | else |
2197 | return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); |
2198 | } |
2199 | |
2200 | static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) |
2201 | { |
2202 | struct kvm *kvm = vmx->vcpu.kvm; |
2203 | |
2204 | /* |
2205 | * If vmcs02 hasn't been initialized, set the constant vmcs02 state |
2206 | * according to L0's settings (vmcs12 is irrelevant here). Host |
2207 | * fields that come from L0 and are not constant, e.g. HOST_CR3, |
2208 | * will be set as needed prior to VMLAUNCH/VMRESUME. |
2209 | */ |
2210 | if (vmx->nested.vmcs02_initialized) |
2211 | return; |
2212 | vmx->nested.vmcs02_initialized = true; |
2213 | |
2214 | /* |
	 * We don't care what the EPTP value is; we just need to guarantee
2216 | * it's valid so we don't get a false positive when doing early |
2217 | * consistency checks. |
2218 | */ |
2219 | if (enable_ept && nested_early_check) |
2220 | vmcs_write64(field: EPT_POINTER, |
2221 | value: construct_eptp(vcpu: &vmx->vcpu, root_hpa: 0, PT64_ROOT_4LEVEL)); |
2222 | |
2223 | /* All VMFUNCs are currently emulated through L0 vmexits. */ |
2224 | if (cpu_has_vmx_vmfunc()) |
2225 | vmcs_write64(field: VM_FUNCTION_CONTROL, value: 0); |
2226 | |
2227 | if (cpu_has_vmx_posted_intr()) |
2228 | vmcs_write16(field: POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); |
2229 | |
2230 | if (cpu_has_vmx_msr_bitmap()) |
2231 | vmcs_write64(field: MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); |
2232 | |
2233 | /* |
2234 | * PML is emulated for L2, but never enabled in hardware as the MMU |
2235 | * handles A/D emulation. Disabling PML for L2 also avoids having to |
2236 | * deal with filtering out L2 GPAs from the buffer. |
2237 | */ |
2238 | if (enable_pml) { |
2239 | vmcs_write64(field: PML_ADDRESS, value: 0); |
2240 | vmcs_write16(field: GUEST_PML_INDEX, value: -1); |
2241 | } |
2242 | |
2243 | if (cpu_has_vmx_encls_vmexit()) |
2244 | vmcs_write64(field: ENCLS_EXITING_BITMAP, INVALID_GPA); |
2245 | |
2246 | if (kvm_notify_vmexit_enabled(kvm)) |
2247 | vmcs_write32(field: NOTIFY_WINDOW, value: kvm->arch.notify_window); |
2248 | |
2249 | /* |
2250 | * Set the MSR load/store lists to match L0's settings. Only the |
	 * addresses are constant (for vmcs02); the counts can change based
2252 | * on L2's behavior, e.g. switching to/from long mode. |
2253 | */ |
2254 | vmcs_write64(field: VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); |
2255 | vmcs_write64(field: VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); |
2256 | vmcs_write64(field: VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); |
2257 | |
2258 | vmx_set_constant_host_state(vmx); |
2259 | } |
2260 | |
2261 | static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, |
2262 | struct vmcs12 *vmcs12) |
2263 | { |
2264 | prepare_vmcs02_constant_state(vmx); |
2265 | |
2266 | vmcs_write64(field: VMCS_LINK_POINTER, INVALID_GPA); |
2267 | |
2268 | if (enable_vpid) { |
2269 | if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) |
2270 | vmcs_write16(field: VIRTUAL_PROCESSOR_ID, value: vmx->nested.vpid02); |
2271 | else |
2272 | vmcs_write16(field: VIRTUAL_PROCESSOR_ID, value: vmx->vpid); |
2273 | } |
2274 | } |
2275 | |
2276 | static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, |
2277 | struct vmcs12 *vmcs12) |
2278 | { |
2279 | u32 exec_control; |
2280 | u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); |
2281 | |
2282 | if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) |
2283 | prepare_vmcs02_early_rare(vmx, vmcs12); |
2284 | |
2285 | /* |
2286 | * PIN CONTROLS |
2287 | */ |
2288 | exec_control = __pin_controls_get(vmcs: vmcs01); |
2289 | exec_control |= (vmcs12->pin_based_vm_exec_control & |
2290 | ~PIN_BASED_VMX_PREEMPTION_TIMER); |
2291 | |
2292 | /* Posted interrupts setting is only taken from vmcs12. */ |
2293 | vmx->nested.pi_pending = false; |
2294 | if (nested_cpu_has_posted_intr(vmcs12)) |
2295 | vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; |
2296 | else |
2297 | exec_control &= ~PIN_BASED_POSTED_INTR; |
2298 | pin_controls_set(vmx, val: exec_control); |
2299 | |
2300 | /* |
2301 | * EXEC CONTROLS |
2302 | */ |
2303 | exec_control = __exec_controls_get(vmcs: vmcs01); /* L0's desires */ |
2304 | exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; |
2305 | exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; |
2306 | exec_control &= ~CPU_BASED_TPR_SHADOW; |
2307 | exec_control |= vmcs12->cpu_based_vm_exec_control; |
2308 | |
2309 | vmx->nested.l1_tpr_threshold = -1; |
2310 | if (exec_control & CPU_BASED_TPR_SHADOW) |
2311 | vmcs_write32(field: TPR_THRESHOLD, value: vmcs12->tpr_threshold); |
2312 | #ifdef CONFIG_X86_64 |
2313 | else |
2314 | exec_control |= CPU_BASED_CR8_LOAD_EXITING | |
2315 | CPU_BASED_CR8_STORE_EXITING; |
2316 | #endif |
2317 | |
2318 | /* |
2319 | * A vmexit (to either L1 hypervisor or L0 userspace) is always needed |
2320 | * for I/O port accesses. |
2321 | */ |
2322 | exec_control |= CPU_BASED_UNCOND_IO_EXITING; |
2323 | exec_control &= ~CPU_BASED_USE_IO_BITMAPS; |
2324 | |
2325 | /* |
2326 | * This bit will be computed in nested_get_vmcs12_pages, because |
2327 | * we do not have access to L1's MSR bitmap yet. For now, keep |
2328 | * the same bit as before, hoping to avoid multiple VMWRITEs that |
2329 | * only set/clear this bit. |
2330 | */ |
2331 | exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; |
2332 | exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; |
2333 | |
2334 | exec_controls_set(vmx, val: exec_control); |
2335 | |
2336 | /* |
2337 | * SECONDARY EXEC CONTROLS |
2338 | */ |
2339 | if (cpu_has_secondary_exec_ctrls()) { |
2340 | exec_control = __secondary_exec_controls_get(vmcs: vmcs01); |
2341 | |
2342 | /* Take the following fields only from vmcs12 */ |
2343 | exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
2344 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | |
2345 | SECONDARY_EXEC_ENABLE_INVPCID | |
2346 | SECONDARY_EXEC_ENABLE_RDTSCP | |
2347 | SECONDARY_EXEC_ENABLE_XSAVES | |
2348 | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | |
2349 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | |
2350 | SECONDARY_EXEC_APIC_REGISTER_VIRT | |
2351 | SECONDARY_EXEC_ENABLE_VMFUNC | |
2352 | SECONDARY_EXEC_DESC); |
2353 | |
2354 | if (nested_cpu_has(vmcs12, |
2355 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) |
2356 | exec_control |= vmcs12->secondary_vm_exec_control; |
2357 | |
2358 | /* PML is emulated and never enabled in hardware for L2. */ |
2359 | exec_control &= ~SECONDARY_EXEC_ENABLE_PML; |
2360 | |
2361 | /* VMCS shadowing for L2 is emulated for now */ |
2362 | exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; |
2363 | |
2364 | /* |
2365 | * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() |
2366 | * will not have to rewrite the controls just for this bit. |
2367 | */ |
2368 | if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) |
2369 | exec_control |= SECONDARY_EXEC_DESC; |
2370 | |
2371 | if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) |
2372 | vmcs_write16(field: GUEST_INTR_STATUS, |
2373 | value: vmcs12->guest_intr_status); |
2374 | |
2375 | if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) |
2376 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; |
2377 | |
2378 | if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) |
2379 | vmx_write_encls_bitmap(vcpu: &vmx->vcpu, vmcs12); |
2380 | |
2381 | secondary_exec_controls_set(vmx, val: exec_control); |
2382 | } |
2383 | |
2384 | /* |
2385 | * ENTRY CONTROLS |
2386 | * |
2387 | * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE |
2388 | * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate |
2389 | * on the related bits (if supported by the CPU) in the hope that |
2390 | * we can avoid VMWrites during vmx_set_efer(). |
2391 | * |
2392 | * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is |
2393 | * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to |
2394 | * do the same for L2. |
2395 | */ |
2396 | exec_control = __vm_entry_controls_get(vmcs: vmcs01); |
2397 | exec_control |= (vmcs12->vm_entry_controls & |
2398 | ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); |
2399 | exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); |
2400 | if (cpu_has_load_ia32_efer()) { |
2401 | if (guest_efer & EFER_LMA) |
2402 | exec_control |= VM_ENTRY_IA32E_MODE; |
2403 | if (guest_efer != host_efer) |
2404 | exec_control |= VM_ENTRY_LOAD_IA32_EFER; |
2405 | } |
2406 | vm_entry_controls_set(vmx, val: exec_control); |
2407 | |
2408 | /* |
2409 | * EXIT CONTROLS |
2410 | * |
2411 | * L2->L1 exit controls are emulated - the hardware exit is to L0 so |
2412 | * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER |
2413 | * bits may be modified by vmx_set_efer() in prepare_vmcs02(). |
2414 | */ |
2415 | exec_control = __vm_exit_controls_get(vmcs: vmcs01); |
2416 | if (cpu_has_load_ia32_efer() && guest_efer != host_efer) |
2417 | exec_control |= VM_EXIT_LOAD_IA32_EFER; |
2418 | else |
2419 | exec_control &= ~VM_EXIT_LOAD_IA32_EFER; |
2420 | vm_exit_controls_set(vmx, val: exec_control); |
2421 | |
2422 | /* |
2423 | * Interrupt/Exception Fields |
2424 | */ |
2425 | if (vmx->nested.nested_run_pending) { |
2426 | vmcs_write32(field: VM_ENTRY_INTR_INFO_FIELD, |
2427 | value: vmcs12->vm_entry_intr_info_field); |
2428 | vmcs_write32(field: VM_ENTRY_EXCEPTION_ERROR_CODE, |
2429 | value: vmcs12->vm_entry_exception_error_code); |
2430 | vmcs_write32(field: VM_ENTRY_INSTRUCTION_LEN, |
2431 | value: vmcs12->vm_entry_instruction_len); |
2432 | vmcs_write32(field: GUEST_INTERRUPTIBILITY_INFO, |
2433 | value: vmcs12->guest_interruptibility_info); |
2434 | vmx->loaded_vmcs->nmi_known_unmasked = |
2435 | !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); |
2436 | } else { |
2437 | vmcs_write32(field: VM_ENTRY_INTR_INFO_FIELD, value: 0); |
2438 | } |
2439 | } |
2440 | |
2441 | static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) |
2442 | { |
2443 | struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); |
2444 | |
2445 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & |
2446 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { |
2447 | vmcs_write16(field: GUEST_ES_SELECTOR, value: vmcs12->guest_es_selector); |
2448 | vmcs_write16(field: GUEST_CS_SELECTOR, value: vmcs12->guest_cs_selector); |
2449 | vmcs_write16(field: GUEST_SS_SELECTOR, value: vmcs12->guest_ss_selector); |
2450 | vmcs_write16(field: GUEST_DS_SELECTOR, value: vmcs12->guest_ds_selector); |
2451 | vmcs_write16(field: GUEST_FS_SELECTOR, value: vmcs12->guest_fs_selector); |
2452 | vmcs_write16(field: GUEST_GS_SELECTOR, value: vmcs12->guest_gs_selector); |
2453 | vmcs_write16(field: GUEST_LDTR_SELECTOR, value: vmcs12->guest_ldtr_selector); |
2454 | vmcs_write16(field: GUEST_TR_SELECTOR, value: vmcs12->guest_tr_selector); |
2455 | vmcs_write32(field: GUEST_ES_LIMIT, value: vmcs12->guest_es_limit); |
2456 | vmcs_write32(field: GUEST_CS_LIMIT, value: vmcs12->guest_cs_limit); |
2457 | vmcs_write32(field: GUEST_SS_LIMIT, value: vmcs12->guest_ss_limit); |
2458 | vmcs_write32(field: GUEST_DS_LIMIT, value: vmcs12->guest_ds_limit); |
2459 | vmcs_write32(field: GUEST_FS_LIMIT, value: vmcs12->guest_fs_limit); |
2460 | vmcs_write32(field: GUEST_GS_LIMIT, value: vmcs12->guest_gs_limit); |
2461 | vmcs_write32(field: GUEST_LDTR_LIMIT, value: vmcs12->guest_ldtr_limit); |
2462 | vmcs_write32(field: GUEST_TR_LIMIT, value: vmcs12->guest_tr_limit); |
2463 | vmcs_write32(field: GUEST_GDTR_LIMIT, value: vmcs12->guest_gdtr_limit); |
2464 | vmcs_write32(field: GUEST_IDTR_LIMIT, value: vmcs12->guest_idtr_limit); |
2465 | vmcs_write32(field: GUEST_CS_AR_BYTES, value: vmcs12->guest_cs_ar_bytes); |
2466 | vmcs_write32(field: GUEST_SS_AR_BYTES, value: vmcs12->guest_ss_ar_bytes); |
2467 | vmcs_write32(field: GUEST_ES_AR_BYTES, value: vmcs12->guest_es_ar_bytes); |
2468 | vmcs_write32(field: GUEST_DS_AR_BYTES, value: vmcs12->guest_ds_ar_bytes); |
2469 | vmcs_write32(field: GUEST_FS_AR_BYTES, value: vmcs12->guest_fs_ar_bytes); |
2470 | vmcs_write32(field: GUEST_GS_AR_BYTES, value: vmcs12->guest_gs_ar_bytes); |
2471 | vmcs_write32(field: GUEST_LDTR_AR_BYTES, value: vmcs12->guest_ldtr_ar_bytes); |
2472 | vmcs_write32(field: GUEST_TR_AR_BYTES, value: vmcs12->guest_tr_ar_bytes); |
2473 | vmcs_writel(field: GUEST_ES_BASE, value: vmcs12->guest_es_base); |
2474 | vmcs_writel(field: GUEST_CS_BASE, value: vmcs12->guest_cs_base); |
2475 | vmcs_writel(field: GUEST_SS_BASE, value: vmcs12->guest_ss_base); |
2476 | vmcs_writel(field: GUEST_DS_BASE, value: vmcs12->guest_ds_base); |
2477 | vmcs_writel(field: GUEST_FS_BASE, value: vmcs12->guest_fs_base); |
2478 | vmcs_writel(field: GUEST_GS_BASE, value: vmcs12->guest_gs_base); |
2479 | vmcs_writel(field: GUEST_LDTR_BASE, value: vmcs12->guest_ldtr_base); |
2480 | vmcs_writel(field: GUEST_TR_BASE, value: vmcs12->guest_tr_base); |
2481 | vmcs_writel(field: GUEST_GDTR_BASE, value: vmcs12->guest_gdtr_base); |
2482 | vmcs_writel(field: GUEST_IDTR_BASE, value: vmcs12->guest_idtr_base); |
2483 | |
2484 | vmx->segment_cache.bitmask = 0; |
2485 | } |
2486 | |
2487 | if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & |
2488 | HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { |
2489 | vmcs_write32(field: GUEST_SYSENTER_CS, value: vmcs12->guest_sysenter_cs); |
2490 | vmcs_writel(field: GUEST_PENDING_DBG_EXCEPTIONS, |
2491 | value: vmcs12->guest_pending_dbg_exceptions); |
2492 | vmcs_writel(field: GUEST_SYSENTER_ESP, value: vmcs12->guest_sysenter_esp); |
2493 | vmcs_writel(field: GUEST_SYSENTER_EIP, value: vmcs12->guest_sysenter_eip); |
2494 | |
2495 | /* |
		 * L1 may access L2's PDPTRs, so save them in order to construct
		 * vmcs12.
2498 | */ |
2499 | if (enable_ept) { |
2500 | vmcs_write64(field: GUEST_PDPTR0, value: vmcs12->guest_pdptr0); |
2501 | vmcs_write64(field: GUEST_PDPTR1, value: vmcs12->guest_pdptr1); |
2502 | vmcs_write64(field: GUEST_PDPTR2, value: vmcs12->guest_pdptr2); |
2503 | vmcs_write64(field: GUEST_PDPTR3, value: vmcs12->guest_pdptr3); |
2504 | } |
2505 | |
2506 | if (kvm_mpx_supported() && vmx->nested.nested_run_pending && |
2507 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) |
2508 | vmcs_write64(field: GUEST_BNDCFGS, value: vmcs12->guest_bndcfgs); |
2509 | } |
2510 | |
2511 | if (nested_cpu_has_xsaves(vmcs12)) |
2512 | vmcs_write64(field: XSS_EXIT_BITMAP, value: vmcs12->xss_exit_bitmap); |
2513 | |
2514 | /* |
	 * Whether page faults are trapped is determined by a combination of
	 * three settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If
	 * L0 doesn't care about page faults, set all of these to L1's
	 * desires.  However, if L0 does care about (some) page faults, it is
	 * not easy (if at all possible?) to merge L0's and L1's desires, so
	 * simply ask to exit on each and every L2 page fault.  This is done
	 * by setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that no special code is needed below to set EB.PF beyond the
	 * "or"ing of the EBs of vmcs01 and vmcs12: with enable_ept, vmcs01's
	 * EB.PF is 0 so the "or" takes vmcs12's value, and with !enable_ept,
	 * vmcs01's EB.PF is 1, so the "or" is always 1.
2526 | */ |
2527 | if (vmx_need_pf_intercept(vcpu: &vmx->vcpu)) { |
2528 | /* |
2529 | * TODO: if both L0 and L1 need the same MASK and MATCH, |
2530 | * go ahead and use it? |
2531 | */ |
2532 | vmcs_write32(field: PAGE_FAULT_ERROR_CODE_MASK, value: 0); |
2533 | vmcs_write32(field: PAGE_FAULT_ERROR_CODE_MATCH, value: 0); |
2534 | } else { |
2535 | vmcs_write32(field: PAGE_FAULT_ERROR_CODE_MASK, value: vmcs12->page_fault_error_code_mask); |
2536 | vmcs_write32(field: PAGE_FAULT_ERROR_CODE_MATCH, value: vmcs12->page_fault_error_code_match); |
2537 | } |
2538 | |
2539 | if (cpu_has_vmx_apicv()) { |
2540 | vmcs_write64(field: EOI_EXIT_BITMAP0, value: vmcs12->eoi_exit_bitmap0); |
2541 | vmcs_write64(field: EOI_EXIT_BITMAP1, value: vmcs12->eoi_exit_bitmap1); |
2542 | vmcs_write64(field: EOI_EXIT_BITMAP2, value: vmcs12->eoi_exit_bitmap2); |
2543 | vmcs_write64(field: EOI_EXIT_BITMAP3, value: vmcs12->eoi_exit_bitmap3); |
2544 | } |
2545 | |
2546 | /* |
2547 | * Make sure the msr_autostore list is up to date before we set the |
2548 | * count in the vmcs02. |
2549 | */ |
2550 | prepare_vmx_msr_autostore_list(vcpu: &vmx->vcpu, MSR_IA32_TSC); |
2551 | |
2552 | vmcs_write32(field: VM_EXIT_MSR_STORE_COUNT, value: vmx->msr_autostore.guest.nr); |
2553 | vmcs_write32(field: VM_EXIT_MSR_LOAD_COUNT, value: vmx->msr_autoload.host.nr); |
2554 | vmcs_write32(field: VM_ENTRY_MSR_LOAD_COUNT, value: vmx->msr_autoload.guest.nr); |
2555 | |
2556 | set_cr4_guest_host_mask(vmx); |
2557 | } |
2558 | |
2559 | /* |
2560 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested |
2561 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it |
2562 | * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 |
2563 | * guest in a way that will both be appropriate to L1's requests, and our |
2564 | * needs. In addition to modifying the active vmcs (which is vmcs02), this |
2565 | * function also has additional necessary side-effects, like setting various |
2566 | * vcpu->arch fields. |
2567 | * Returns 0 on success, 1 on failure. Invalid state exit qualification code |
2568 | * is assigned to entry_failure_code on failure. |
2569 | */ |
2570 | static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, |
2571 | bool from_vmentry, |
2572 | enum vm_entry_failure_code *entry_failure_code) |
2573 | { |
2574 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2575 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
2576 | bool load_guest_pdptrs_vmcs12 = false; |
2577 | |
2578 | if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { |
2579 | prepare_vmcs02_rare(vmx, vmcs12); |
2580 | vmx->nested.dirty_vmcs12 = false; |
2581 | |
2582 | load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || |
2583 | !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); |
2584 | } |
2585 | |
2586 | if (vmx->nested.nested_run_pending && |
2587 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { |
2588 | kvm_set_dr(vcpu, dr: 7, val: vmcs12->guest_dr7); |
2589 | vmcs_write64(field: GUEST_IA32_DEBUGCTL, value: vmcs12->guest_ia32_debugctl); |
2590 | } else { |
2591 | kvm_set_dr(vcpu, dr: 7, val: vcpu->arch.dr7); |
2592 | vmcs_write64(field: GUEST_IA32_DEBUGCTL, value: vmx->nested.pre_vmenter_debugctl); |
2593 | } |
2594 | if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || |
2595 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) |
2596 | vmcs_write64(field: GUEST_BNDCFGS, value: vmx->nested.pre_vmenter_bndcfgs); |
2597 | vmx_set_rflags(vcpu, rflags: vmcs12->guest_rflags); |
2598 | |
2599 | /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the |
2600 | * bitwise-or of what L1 wants to trap for L2, and what we want to |
2601 | * trap. Note that CR0.TS also needs updating - we do this later. |
2602 | */ |
2603 | vmx_update_exception_bitmap(vcpu); |
2604 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; |
2605 | vmcs_writel(field: CR0_GUEST_HOST_MASK, value: ~vcpu->arch.cr0_guest_owned_bits); |
2606 | |
2607 | if (vmx->nested.nested_run_pending && |
2608 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { |
2609 | vmcs_write64(field: GUEST_IA32_PAT, value: vmcs12->guest_ia32_pat); |
2610 | vcpu->arch.pat = vmcs12->guest_ia32_pat; |
2611 | } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
2612 | vmcs_write64(field: GUEST_IA32_PAT, value: vmx->vcpu.arch.pat); |
2613 | } |
2614 | |
2615 | vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( |
2616 | l1_offset: vcpu->arch.l1_tsc_offset, |
2617 | l2_offset: vmx_get_l2_tsc_offset(vcpu), |
2618 | l2_multiplier: vmx_get_l2_tsc_multiplier(vcpu)); |
2619 | |
2620 | vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( |
2621 | l1_multiplier: vcpu->arch.l1_tsc_scaling_ratio, |
2622 | l2_multiplier: vmx_get_l2_tsc_multiplier(vcpu)); |
2623 | |
2624 | vmcs_write64(field: TSC_OFFSET, value: vcpu->arch.tsc_offset); |
2625 | if (kvm_caps.has_tsc_control) |
2626 | vmcs_write64(field: TSC_MULTIPLIER, value: vcpu->arch.tsc_scaling_ratio); |
2627 | |
2628 | nested_vmx_transition_tlb_flush(vcpu, vmcs12, is_vmenter: true); |
2629 | |
2630 | if (nested_cpu_has_ept(vmcs12)) |
2631 | nested_ept_init_mmu_context(vcpu); |
2632 | |
2633 | /* |
2634 | * Override the CR0/CR4 read shadows after setting the effective guest |
2635 | * CR0/CR4. The common helpers also set the shadows, but they don't |
2636 | * account for vmcs12's cr0/4_guest_host_mask. |
2637 | */ |
2638 | vmx_set_cr0(vcpu, cr0: vmcs12->guest_cr0); |
2639 | vmcs_writel(field: CR0_READ_SHADOW, value: nested_read_cr0(fields: vmcs12)); |
2640 | |
2641 | vmx_set_cr4(vcpu, cr4: vmcs12->guest_cr4); |
2642 | vmcs_writel(field: CR4_READ_SHADOW, value: nested_read_cr4(fields: vmcs12)); |
2643 | |
2644 | vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); |
2645 | /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ |
2646 | vmx_set_efer(vcpu, efer: vcpu->arch.efer); |
2647 | |
2648 | /* |
2649 | * Guest state is invalid and unrestricted guest is disabled, |
2650 | * which means L1 attempted VMEntry to L2 with invalid state. |
2651 | * Fail the VMEntry. |
2652 | * |
	 * However, when force loading the guest state (SMM exit or
	 * loading nested state after migration), it is possible to
	 * have invalid guest state now, which will be fixed later by
	 * restoring the L2 register state.
2657 | */ |
2658 | if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { |
2659 | *entry_failure_code = ENTRY_FAIL_DEFAULT; |
2660 | return -EINVAL; |
2661 | } |
2662 | |
	/* Load vmcs12's CR3; L2 is paged via either nested EPT or shadow page tables. */
2664 | if (nested_vmx_load_cr3(vcpu, cr3: vmcs12->guest_cr3, nested_ept: nested_cpu_has_ept(vmcs12), |
2665 | reload_pdptrs: from_vmentry, entry_failure_code)) |
2666 | return -EINVAL; |
2667 | |
2668 | /* |
2669 | * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 |
2670 | * on nested VM-Exit, which can occur without actually running L2 and |
2671 | * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with |
2672 | * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the |
2673 | * transition to HLT instead of running L2. |
2674 | */ |
2675 | if (enable_ept) |
2676 | vmcs_writel(field: GUEST_CR3, value: vmcs12->guest_cr3); |
2677 | |
2678 | /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ |
2679 | if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && |
2680 | is_pae_paging(vcpu)) { |
2681 | vmcs_write64(field: GUEST_PDPTR0, value: vmcs12->guest_pdptr0); |
2682 | vmcs_write64(field: GUEST_PDPTR1, value: vmcs12->guest_pdptr1); |
2683 | vmcs_write64(field: GUEST_PDPTR2, value: vmcs12->guest_pdptr2); |
2684 | vmcs_write64(field: GUEST_PDPTR3, value: vmcs12->guest_pdptr3); |
2685 | } |
2686 | |
2687 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && |
2688 | kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && |
2689 | WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, |
2690 | vmcs12->guest_ia32_perf_global_ctrl))) { |
2691 | *entry_failure_code = ENTRY_FAIL_DEFAULT; |
2692 | return -EINVAL; |
2693 | } |
2694 | |
2695 | kvm_rsp_write(vcpu, val: vmcs12->guest_rsp); |
2696 | kvm_rip_write(vcpu, val: vmcs12->guest_rip); |
2697 | |
2698 | /* |
2699 | * It was observed that genuine Hyper-V running in L1 doesn't reset |
	 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
2701 | * bits when it changes a field in eVMCS. Mark all fields as clean |
2702 | * here. |
2703 | */ |
2704 | if (nested_vmx_is_evmptr12_valid(vmx)) |
2705 | evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; |
2706 | |
2707 | return 0; |
2708 | } |
2709 | |
2710 | static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) |
2711 | { |
2712 | if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && |
2713 | nested_cpu_has_virtual_nmis(vmcs12))) |
2714 | return -EINVAL; |
2715 | |
2716 | if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && |
2717 | nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) |
2718 | return -EINVAL; |
2719 | |
2720 | return 0; |
2721 | } |
2722 | |
2723 | static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) |
2724 | { |
2725 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2726 | |
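	/*
	 * EPTP layout (as checked below): bits 2:0 memory type, bits 5:3
	 * page-walk length minus 1, bit 6 A/D enable, bits 11:7 must be zero
	 * here, and the upper bits hold the root table address (validated
	 * via kvm_vcpu_is_legal_gpa()).
	 */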
2727 | /* Check for memory type validity */ |
2728 | switch (new_eptp & VMX_EPTP_MT_MASK) { |
2729 | case VMX_EPTP_MT_UC: |
2730 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) |
2731 | return false; |
2732 | break; |
2733 | case VMX_EPTP_MT_WB: |
2734 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) |
2735 | return false; |
2736 | break; |
2737 | default: |
2738 | return false; |
2739 | } |
2740 | |
2741 | /* Page-walk levels validity. */ |
2742 | switch (new_eptp & VMX_EPTP_PWL_MASK) { |
2743 | case VMX_EPTP_PWL_5: |
2744 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) |
2745 | return false; |
2746 | break; |
2747 | case VMX_EPTP_PWL_4: |
2748 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) |
2749 | return false; |
2750 | break; |
2751 | default: |
2752 | return false; |
2753 | } |
2754 | |
2755 | /* Reserved bits should not be set */ |
2756 | if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) |
2757 | return false; |
2758 | |
2759 | /* AD, if set, should be supported */ |
2760 | if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { |
2761 | if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) |
2762 | return false; |
2763 | } |
2764 | |
2765 | return true; |
2766 | } |
2767 | |
2768 | /* |
2769 | * Checks related to VM-Execution Control Fields |
2770 | */ |
2771 | static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, |
2772 | struct vmcs12 *vmcs12) |
2773 | { |
2774 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2775 | |
2776 | if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, |
2777 | vmx->nested.msrs.pinbased_ctls_low, |
2778 | vmx->nested.msrs.pinbased_ctls_high)) || |
2779 | CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, |
2780 | vmx->nested.msrs.procbased_ctls_low, |
2781 | vmx->nested.msrs.procbased_ctls_high))) |
2782 | return -EINVAL; |
2783 | |
2784 | if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && |
2785 | CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, |
2786 | vmx->nested.msrs.secondary_ctls_low, |
2787 | vmx->nested.msrs.secondary_ctls_high))) |
2788 | return -EINVAL; |
2789 | |
2790 | if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || |
2791 | nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || |
2792 | nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || |
2793 | nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || |
2794 | nested_vmx_check_apic_access_controls(vcpu, vmcs12) || |
2795 | nested_vmx_check_apicv_controls(vcpu, vmcs12) || |
2796 | nested_vmx_check_nmi_controls(vmcs12) || |
2797 | nested_vmx_check_pml_controls(vcpu, vmcs12) || |
2798 | nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || |
2799 | nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || |
2800 | nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || |
2801 | CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) |
2802 | return -EINVAL; |
2803 | |
2804 | if (!nested_cpu_has_preemption_timer(vmcs12) && |
2805 | nested_cpu_has_save_preemption_timer(vmcs12)) |
2806 | return -EINVAL; |
2807 | |
2808 | if (nested_cpu_has_ept(vmcs12) && |
2809 | CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) |
2810 | return -EINVAL; |
2811 | |
2812 | if (nested_cpu_has_vmfunc(vmcs12)) { |
2813 | if (CC(vmcs12->vm_function_control & |
2814 | ~vmx->nested.msrs.vmfunc_controls)) |
2815 | return -EINVAL; |
2816 | |
2817 | if (nested_cpu_has_eptp_switching(vmcs12)) { |
2818 | if (CC(!nested_cpu_has_ept(vmcs12)) || |
2819 | CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) |
2820 | return -EINVAL; |
2821 | } |
2822 | } |
2823 | |
2824 | return 0; |
2825 | } |
2826 | |
2827 | /* |
2828 | * Checks related to VM-Exit Control Fields |
2829 | */ |
2830 | static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, |
2831 | struct vmcs12 *vmcs12) |
2832 | { |
2833 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2834 | |
2835 | if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, |
2836 | vmx->nested.msrs.exit_ctls_low, |
2837 | vmx->nested.msrs.exit_ctls_high)) || |
2838 | CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) |
2839 | return -EINVAL; |
2840 | |
2841 | return 0; |
2842 | } |
2843 | |
2844 | /* |
2845 | * Checks related to VM-Entry Control Fields |
2846 | */ |
2847 | static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, |
2848 | struct vmcs12 *vmcs12) |
2849 | { |
2850 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2851 | |
2852 | if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, |
2853 | vmx->nested.msrs.entry_ctls_low, |
2854 | vmx->nested.msrs.entry_ctls_high))) |
2855 | return -EINVAL; |
2856 | |
2857 | /* |
2858 | * From the Intel SDM, volume 3: |
2859 | * Fields relevant to VM-entry event injection must be set properly. |
2860 | * These fields are the VM-entry interruption-information field, the |
2861 | * VM-entry exception error code, and the VM-entry instruction length. |
2862 | */ |
2863 | if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { |
2864 | u32 intr_info = vmcs12->vm_entry_intr_info_field; |
2865 | u8 vector = intr_info & INTR_INFO_VECTOR_MASK; |
2866 | u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; |
2867 | bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; |
2868 | bool should_have_error_code; |
2869 | bool urg = nested_cpu_has2(vmcs12, |
2870 | SECONDARY_EXEC_UNRESTRICTED_GUEST); |
2871 | bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; |
2872 | |
2873 | /* VM-entry interruption-info field: interruption type */ |
2874 | if (CC(intr_type == INTR_TYPE_RESERVED) || |
2875 | CC(intr_type == INTR_TYPE_OTHER_EVENT && |
2876 | !nested_cpu_supports_monitor_trap_flag(vcpu))) |
2877 | return -EINVAL; |
2878 | |
2879 | /* VM-entry interruption-info field: vector */ |
2880 | if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || |
2881 | CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || |
2882 | CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) |
2883 | return -EINVAL; |
2884 | |
2885 | /* VM-entry interruption-info field: deliver error code */ |
2886 | should_have_error_code = |
2887 | intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && |
2888 | x86_exception_has_error_code(vector); |
2889 | if (CC(has_error_code != should_have_error_code)) |
2890 | return -EINVAL; |
2891 | |
2892 | /* VM-entry exception error code */ |
2893 | if (CC(has_error_code && |
2894 | vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) |
2895 | return -EINVAL; |
2896 | |
2897 | /* VM-entry interruption-info field: reserved bits */ |
2898 | if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) |
2899 | return -EINVAL; |
2900 | |
2901 | /* VM-entry instruction length */ |
2902 | switch (intr_type) { |
2903 | case INTR_TYPE_SOFT_EXCEPTION: |
2904 | case INTR_TYPE_SOFT_INTR: |
2905 | case INTR_TYPE_PRIV_SW_EXCEPTION: |
2906 | if (CC(vmcs12->vm_entry_instruction_len > 15) || |
2907 | CC(vmcs12->vm_entry_instruction_len == 0 && |
2908 | CC(!nested_cpu_has_zero_length_injection(vcpu)))) |
2909 | return -EINVAL; |
2910 | } |
2911 | } |
2912 | |
2913 | if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) |
2914 | return -EINVAL; |
2915 | |
2916 | return 0; |
2917 | } |
2918 | |
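/*
 * Consolidated consistency checks on the vmcs12 control fields (VM-execution,
 * VM-exit and VM-entry controls).  A failure here is reported to L1 as
 * VMfail(VMXERR_ENTRY_INVALID_CONTROL_FIELD) by the caller.
 */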
2919 | static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, |
2920 | struct vmcs12 *vmcs12) |
2921 | { |
2922 | if (nested_check_vm_execution_controls(vcpu, vmcs12) || |
2923 | nested_check_vm_exit_controls(vcpu, vmcs12) || |
2924 | nested_check_vm_entry_controls(vcpu, vmcs12)) |
2925 | return -EINVAL; |
2926 | |
2927 | #ifdef CONFIG_KVM_HYPERV |
2928 | if (guest_cpuid_has_evmcs(vcpu)) |
2929 | return nested_evmcs_check_controls(vmcs12); |
2930 | #endif |
2931 | |
2932 | return 0; |
2933 | } |
2934 | |
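/*
 * The "host address-space size" VM-exit control in vmcs12 must be consistent
 * with L1's current EFER.LMA; the check is meaningful only on 64-bit hosts.
 */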
2935 | static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, |
2936 | struct vmcs12 *vmcs12) |
2937 | { |
2938 | #ifdef CONFIG_X86_64 |
2939 | if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != |
2940 | !!(vcpu->arch.efer & EFER_LMA))) |
2941 | return -EINVAL; |
2942 | #endif |
2943 | return 0; |
2944 | } |
2945 | |
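/*
 * Checks on the vmcs12 host-state area, i.e. the L1 state to be loaded on a
 * VM-exit from L2.  A failure is reported to L1 as
 * VMfail(VMXERR_ENTRY_INVALID_HOST_STATE_FIELD) by the caller.
 */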
2946 | static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, |
2947 | struct vmcs12 *vmcs12) |
2948 | { |
2949 | bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); |
2950 | |
2951 | if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || |
2952 | CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || |
2953 | CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) |
2954 | return -EINVAL; |
2955 | |
2956 | if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || |
2957 | CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) |
2958 | return -EINVAL; |
2959 | |
2960 | if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && |
2961 | CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) |
2962 | return -EINVAL; |
2963 | |
2964 | if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && |
2965 | CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), |
2966 | vmcs12->host_ia32_perf_global_ctrl))) |
2967 | return -EINVAL; |
2968 | |
2969 | if (ia32e) { |
2970 | if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) |
2971 | return -EINVAL; |
2972 | } else { |
2973 | if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || |
2974 | CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || |
2975 | CC((vmcs12->host_rip) >> 32)) |
2976 | return -EINVAL; |
2977 | } |
2978 | |
2979 | if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || |
2980 | CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || |
2981 | CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || |
2982 | CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || |
2983 | CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || |
2984 | CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || |
2985 | CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || |
2986 | CC(vmcs12->host_cs_selector == 0) || |
2987 | CC(vmcs12->host_tr_selector == 0) || |
2988 | CC(vmcs12->host_ss_selector == 0 && !ia32e)) |
2989 | return -EINVAL; |
2990 | |
2991 | if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || |
2992 | CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || |
2993 | CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || |
2994 | CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || |
2995 | CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || |
2996 | CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) |
2997 | return -EINVAL; |
2998 | |
2999 | /* |
3000 | * If the load IA32_EFER VM-exit control is 1, bits reserved in the |
3001 | * IA32_EFER MSR must be 0 in the field for that register. In addition, |
3002 | * the values of the LMA and LME bits in the field must each be that of |
3003 | * the host address-space size VM-exit control. |
3004 | */ |
3005 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { |
3006 | if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || |
3007 | CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || |
3008 | CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) |
3009 | return -EINVAL; |
3010 | } |
3011 | |
3012 | return 0; |
3013 | } |
3014 | |
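/*
 * Validate the VMCS link pointer: unless it is INVALID_GPA, it must be a
 * legal page-aligned GPA, and the referenced VMCS must have the expected
 * revision id and a shadow-VMCS indicator matching vmcs12's "VMCS shadowing"
 * control.
 */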
3015 | static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, |
3016 | struct vmcs12 *vmcs12) |
3017 | { |
3018 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3019 | struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; |
3020 | struct vmcs_hdr hdr; |
3021 | |
3022 | if (vmcs12->vmcs_link_pointer == INVALID_GPA) |
3023 | return 0; |
3024 | |
3025 | if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) |
3026 | return -EINVAL; |
3027 | |
3028 | if (ghc->gpa != vmcs12->vmcs_link_pointer && |
3029 | CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, |
3030 | vmcs12->vmcs_link_pointer, VMCS12_SIZE))) |
3031 | return -EINVAL; |
3032 | |
3033 | if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, |
3034 | offsetof(struct vmcs12, hdr), |
3035 | sizeof(hdr)))) |
3036 | return -EINVAL; |
3037 | |
3038 | if (CC(hdr.revision_id != VMCS12_REVISION) || |
3039 | CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) |
3040 | return -EINVAL; |
3041 | |
3042 | return 0; |
3043 | } |
3044 | |
3045 | /* |
3046 | * Checks related to Guest Non-register State |
3047 | */ |
3048 | static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) |
3049 | { |
3050 | if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && |
3051 | vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && |
3052 | vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) |
3053 | return -EINVAL; |
3054 | |
3055 | return 0; |
3056 | } |
3057 | |
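/*
 * Checks on the vmcs12 guest-state area.  On failure, *entry_failure_code
 * identifies the reason, which the caller reports to L1 in the exit
 * qualification of the resulting VM-entry failure VM-exit.
 */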
3058 | static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, |
3059 | struct vmcs12 *vmcs12, |
3060 | enum vm_entry_failure_code *entry_failure_code) |
3061 | { |
3062 | bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); |
3063 | |
3064 | *entry_failure_code = ENTRY_FAIL_DEFAULT; |
3065 | |
3066 | if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || |
3067 | CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) |
3068 | return -EINVAL; |
3069 | |
3070 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && |
3071 | CC(!kvm_dr7_valid(vmcs12->guest_dr7))) |
3072 | return -EINVAL; |
3073 | |
3074 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && |
3075 | CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) |
3076 | return -EINVAL; |
3077 | |
3078 | if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { |
3079 | *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; |
3080 | return -EINVAL; |
3081 | } |
3082 | |
3083 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && |
3084 | CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), |
3085 | vmcs12->guest_ia32_perf_global_ctrl))) |
3086 | return -EINVAL; |
3087 | |
3088 | if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) |
3089 | return -EINVAL; |
3090 | |
3091 | if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || |
3092 | CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) |
3093 | return -EINVAL; |
3094 | |
3095 | /* |
3096 | * If the load IA32_EFER VM-entry control is 1, the following checks |
3097 | * are performed on the field for the IA32_EFER MSR: |
3098 | * - Bits reserved in the IA32_EFER MSR must be 0. |
3099 | * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of |
3100 | * the IA-32e mode guest VM-exit control. It must also be identical |
3101 | * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to |
3102 | * CR0.PG) is 1. |
3103 | */ |
3104 | if (to_vmx(vcpu)->nested.nested_run_pending && |
3105 | (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { |
3106 | if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || |
3107 | CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || |
3108 | CC(((vmcs12->guest_cr0 & X86_CR0_PG) && |
3109 | ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) |
3110 | return -EINVAL; |
3111 | } |
3112 | |
3113 | if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && |
3114 | (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || |
3115 | CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) |
3116 | return -EINVAL; |
3117 | |
3118 | if (nested_check_guest_non_reg_state(vmcs12)) |
3119 | return -EINVAL; |
3120 | |
3121 | return 0; |
3122 | } |
3123 | |
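/*
 * Optional "early" consistency check: perform a throwaway VMEnter of vmcs02
 * with GUEST_RFLAGS deliberately invalid so that hardware either VMFails on a
 * bad control/host-state field or takes a failed-VMEntry VM-exit; a clean
 * entry is impossible.  Returns 1 if hardware detected a VMFail, 0 otherwise.
 */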
3124 | static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) |
3125 | { |
3126 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3127 | unsigned long cr3, cr4; |
3128 | bool vm_fail; |
3129 | |
3130 | if (!nested_early_check) |
3131 | return 0; |
3132 | |
3133 | if (vmx->msr_autoload.host.nr) |
3134 | vmcs_write32(field: VM_EXIT_MSR_LOAD_COUNT, value: 0); |
3135 | if (vmx->msr_autoload.guest.nr) |
3136 | vmcs_write32(field: VM_ENTRY_MSR_LOAD_COUNT, value: 0); |
3137 | |
3138 | preempt_disable(); |
3139 | |
3140 | vmx_prepare_switch_to_guest(vcpu); |
3141 | |
3142 | /* |
3143 | * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, |
3144 | * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to |
3145 | * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. |
3146 | * there is no need to preserve other bits or save/restore the field. |
3147 | */ |
3148 | vmcs_writel(field: GUEST_RFLAGS, value: 0); |
3149 | |
3150 | cr3 = __get_current_cr3_fast(); |
3151 | if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { |
3152 | vmcs_writel(field: HOST_CR3, value: cr3); |
3153 | vmx->loaded_vmcs->host_state.cr3 = cr3; |
3154 | } |
3155 | |
3156 | cr4 = cr4_read_shadow(); |
3157 | if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { |
3158 | vmcs_writel(field: HOST_CR4, value: cr4); |
3159 | vmx->loaded_vmcs->host_state.cr4 = cr4; |
3160 | } |
3161 | |
3162 | vm_fail = __vmx_vcpu_run(vmx, regs: (unsigned long *)&vcpu->arch.regs, |
3163 | flags: __vmx_vcpu_run_flags(vmx)); |
3164 | |
3165 | if (vmx->msr_autoload.host.nr) |
3166 | vmcs_write32(field: VM_EXIT_MSR_LOAD_COUNT, value: vmx->msr_autoload.host.nr); |
3167 | if (vmx->msr_autoload.guest.nr) |
3168 | vmcs_write32(field: VM_ENTRY_MSR_LOAD_COUNT, value: vmx->msr_autoload.guest.nr); |
3169 | |
3170 | if (vm_fail) { |
3171 | u32 error = vmcs_read32(field: VM_INSTRUCTION_ERROR); |
3172 | |
3173 | preempt_enable(); |
3174 | |
3175 | trace_kvm_nested_vmenter_failed( |
3176 | "early hardware check VM-instruction error: " , error); |
3177 | WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
3178 | return 1; |
3179 | } |
3180 | |
3181 | /* |
3182 | * VMExit clears RFLAGS.IF and DR7, even on a consistency check. |
3183 | */ |
3184 | if (hw_breakpoint_active()) |
3185 | set_debugreg(__this_cpu_read(cpu_dr7), reg: 7); |
3186 | local_irq_enable(); |
3187 | preempt_enable(); |
3188 | |
3189 | /* |
3190 | * A non-failing VMEntry means we somehow entered guest mode with |
3191 | * an illegal RIP, and that's just the tip of the iceberg. There |
3192 | * is no telling what memory has been modified or what state has |
3193 | * been exposed to unknown code. Hitting this all but guarantees |
3194 | * a (very critical) hardware issue. |
3195 | */ |
3196 | WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & |
3197 | VMX_EXIT_REASONS_FAILED_VMENTRY)); |
3198 | |
3199 | return 0; |
3200 | } |
3201 | |
3202 | #ifdef CONFIG_KVM_HYPERV |
3203 | static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) |
3204 | { |
3205 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3206 | |
3207 | /* |
3208 | * hv_evmcs may end up being not mapped after migration (when |
3209 | * L2 was running), map it here to make sure vmcs12 changes are |
3210 | * properly reflected. |
3211 | */ |
3212 | if (guest_cpuid_has_evmcs(vcpu) && |
3213 | vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { |
3214 | enum nested_evmptrld_status evmptrld_status = |
3215 | nested_vmx_handle_enlightened_vmptrld(vcpu, from_launch: false); |
3216 | |
3217 | if (evmptrld_status == EVMPTRLD_VMFAIL || |
3218 | evmptrld_status == EVMPTRLD_ERROR) |
3219 | return false; |
3220 | |
3221 | /* |
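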
3222 | * Post migration, vmcs12 always holds the most up-to-date |
3223 | * information; copy it to the eVMCS upon entry. |
3224 | */ |
3225 | vmx->nested.need_vmcs12_to_shadow_sync = true; |
3226 | } |
3227 | |
3228 | return true; |
3229 | } |
3230 | #endif |
3231 | |
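/*
 * Map the guest pages referenced by vmcs12 (APIC-access page, virtual-APIC
 * page, posted-interrupt descriptor) into the host and point vmcs02 at them,
 * and recompute whether vmcs02 can use the MSR bitmap.  Returns false if the
 * PDPTRs cannot be reloaded or a required page cannot be mapped.
 */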
3232 | static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) |
3233 | { |
3234 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
3235 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3236 | struct kvm_host_map *map; |
3237 | |
3238 | if (!vcpu->arch.pdptrs_from_userspace && |
3239 | !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { |
3240 | /* |
3241 | * Reload the guest's PDPTRs since after a migration |
3242 | * the guest CR3 might be restored prior to setting the nested |
3243 | * state which can lead to a load of wrong PDPTRs. |
3244 | */ |
3245 | if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) |
3246 | return false; |
3247 | } |
3248 | |
3249 | |
3250 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { |
3251 | map = &vmx->nested.apic_access_page_map; |
3252 | |
3253 | if (!kvm_vcpu_map(vcpu, gpa: gpa_to_gfn(gpa: vmcs12->apic_access_addr), map)) { |
3254 | vmcs_write64(field: APIC_ACCESS_ADDR, value: pfn_to_hpa(pfn: map->pfn)); |
3255 | } else { |
3256 | pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n" , |
3257 | __func__); |
3258 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
3259 | vcpu->run->internal.suberror = |
3260 | KVM_INTERNAL_ERROR_EMULATION; |
3261 | vcpu->run->internal.ndata = 0; |
3262 | return false; |
3263 | } |
3264 | } |
3265 | |
3266 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { |
3267 | map = &vmx->nested.virtual_apic_map; |
3268 | |
3269 | if (!kvm_vcpu_map(vcpu, gpa: gpa_to_gfn(gpa: vmcs12->virtual_apic_page_addr), map)) { |
3270 | vmcs_write64(field: VIRTUAL_APIC_PAGE_ADDR, value: pfn_to_hpa(pfn: map->pfn)); |
3271 | } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && |
3272 | nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && |
3273 | !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { |
3274 | /* |
3275 | * The processor will never use the TPR shadow, simply |
3276 | * clear the bit from the execution control. Such a |
3277 | * configuration is useless, but it happens in tests. |
3278 | * For any other configuration, failing the vm entry is |
3279 | * _not_ what the processor does but it's basically the |
3280 | * only possibility we have. |
3281 | */ |
3282 | exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); |
3283 | } else { |
3284 | /* |
3285 | * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to |
3286 | * force VM-Entry to fail. |
3287 | */ |
3288 | vmcs_write64(field: VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); |
3289 | } |
3290 | } |
3291 | |
3292 | if (nested_cpu_has_posted_intr(vmcs12)) { |
3293 | map = &vmx->nested.pi_desc_map; |
3294 | |
3295 | if (!kvm_vcpu_map(vcpu, gpa: gpa_to_gfn(gpa: vmcs12->posted_intr_desc_addr), map)) { |
3296 | vmx->nested.pi_desc = |
3297 | (struct pi_desc *)(((void *)map->hva) + |
3298 | offset_in_page(vmcs12->posted_intr_desc_addr)); |
3299 | vmcs_write64(field: POSTED_INTR_DESC_ADDR, |
3300 | value: pfn_to_hpa(pfn: map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); |
3301 | } else { |
3302 | /* |
3303 | * Defer the KVM_INTERNAL_EXIT until KVM tries to |
3304 | * access the contents of the VMCS12 posted interrupt |
3305 | * descriptor. (Note that KVM may do this when it |
3306 | * should not, per the architectural specification.) |
3307 | */ |
3308 | vmx->nested.pi_desc = NULL; |
3309 | pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); |
3310 | } |
3311 | } |
3312 | if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) |
3313 | exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); |
3314 | else |
3315 | exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); |
3316 | |
3317 | return true; |
3318 | } |
3319 | |
3320 | static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) |
3321 | { |
3322 | #ifdef CONFIG_KVM_HYPERV |
3323 | /* |
3324 | * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy |
3325 | * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory |
3326 | * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post |
3327 | * migration. |
3328 | */ |
3329 | if (!nested_get_evmcs_page(vcpu)) { |
3330 | pr_debug_ratelimited("%s: enlightened vmptrld failed\n" , |
3331 | __func__); |
3332 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
3333 | vcpu->run->internal.suberror = |
3334 | KVM_INTERNAL_ERROR_EMULATION; |
3335 | vcpu->run->internal.ndata = 0; |
3336 | |
3337 | return false; |
3338 | } |
3339 | #endif |
3340 | |
3341 | if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) |
3342 | return false; |
3343 | |
3344 | return true; |
3345 | } |
3346 | |
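/*
 * Emulate a PML (Page Modification Logging) write on behalf of L2: if the
 * vmcs12 PML index is in range, log the dirty GPA in L1's PML buffer and
 * decrement the index; otherwise record that a PML-full VM-exit is due.
 */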
3347 | static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) |
3348 | { |
3349 | struct vmcs12 *vmcs12; |
3350 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3351 | gpa_t dst; |
3352 | |
3353 | if (WARN_ON_ONCE(!is_guest_mode(vcpu))) |
3354 | return 0; |
3355 | |
3356 | if (WARN_ON_ONCE(vmx->nested.pml_full)) |
3357 | return 1; |
3358 | |
3359 | /* |
3360 | * Check if PML is enabled for the nested guest. Whether eptp bit 6 is |
3361 | * set is already checked as part of A/D emulation. |
3362 | */ |
3363 | vmcs12 = get_vmcs12(vcpu); |
3364 | if (!nested_cpu_has_pml(vmcs12)) |
3365 | return 0; |
3366 | |
3367 | if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { |
3368 | vmx->nested.pml_full = true; |
3369 | return 1; |
3370 | } |
3371 | |
3372 | gpa &= ~0xFFFull; |
3373 | dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; |
3374 | |
3375 | if (kvm_write_guest_page(kvm: vcpu->kvm, gfn: gpa_to_gfn(gpa: dst), data: &gpa, |
3376 | offset_in_page(dst), len: sizeof(gpa))) |
3377 | return 0; |
3378 | |
3379 | vmcs12->guest_pml_index--; |
3380 | |
3381 | return 0; |
3382 | } |
3383 | |
3384 | /* |
3385 | * Intel's VMX Instruction Reference specifies a common set of prerequisites |
3386 | * for running VMX instructions (except VMXON, whose prerequisites are |
3387 | * slightly different). It also specifies what exception to inject otherwise. |
3388 | * Note that many of these exceptions have priority over VM exits, so they |
3389 | * don't have to be checked again here. |
3390 | */ |
3391 | static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) |
3392 | { |
3393 | if (!to_vmx(vcpu)->nested.vmxon) { |
3394 | kvm_queue_exception(vcpu, UD_VECTOR); |
3395 | return 0; |
3396 | } |
3397 | |
3398 | if (vmx_get_cpl(vcpu)) { |
3399 | kvm_inject_gp(vcpu, error_code: 0); |
3400 | return 0; |
3401 | } |
3402 | |
3403 | return 1; |
3404 | } |
3405 | |
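/*
 * Returns true if the highest-priority pending virtual interrupt (RVI)
 * outranks the priority class of the vCPU's current PPR, i.e. if APICv has a
 * deliverable virtual interrupt.
 */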
3406 | static bool vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) |
3407 | { |
3408 | u8 rvi = vmx_get_rvi(); |
3409 | u8 vppr = kvm_lapic_get_reg(apic: vcpu->arch.apic, APIC_PROCPRI); |
3410 | |
3411 | return ((rvi & 0xf0) > (vppr & 0xf0)); |
3412 | } |
3413 | |
3414 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, |
3415 | struct vmcs12 *vmcs12); |
3416 | |
3417 | /* |
3418 | * If from_vmentry is false, this is being called from state restore (either RSM |
3419 | * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. |
3420 | * |
3421 | * Returns: |
3422 | * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode |
3423 | * NVMX_VMENTRY_VMFAIL: Consistency check VMFail |
3424 | * NVMX_VMENTRY_VMEXIT: Consistency check VMExit |
3425 | * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error |
3426 | */ |
3427 | enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, |
3428 | bool from_vmentry) |
3429 | { |
3430 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3431 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
3432 | enum vm_entry_failure_code entry_failure_code; |
3433 | bool evaluate_pending_interrupts; |
3434 | union vmx_exit_reason exit_reason = { |
3435 | .basic = EXIT_REASON_INVALID_STATE, |
3436 | .failed_vmentry = 1, |
3437 | }; |
3438 | u32 failed_index; |
3439 | |
3440 | trace_kvm_nested_vmenter(kvm_rip_read(vcpu), |
3441 | vmx->nested.current_vmptr, |
3442 | vmcs12->guest_rip, |
3443 | vmcs12->guest_intr_status, |
3444 | vmcs12->vm_entry_intr_info_field, |
3445 | vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, |
3446 | vmcs12->ept_pointer, |
3447 | vmcs12->guest_cr3, |
3448 | KVM_ISA_VMX); |
3449 | |
3450 | kvm_service_local_tlb_flush_requests(vcpu); |
3451 | |
3452 | evaluate_pending_interrupts = exec_controls_get(vmx) & |
3453 | (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); |
3454 | if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) |
3455 | evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); |
3456 | if (!evaluate_pending_interrupts) |
3457 | evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); |
3458 | |
3459 | if (!vmx->nested.nested_run_pending || |
3460 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) |
3461 | vmx->nested.pre_vmenter_debugctl = vmcs_read64(field: GUEST_IA32_DEBUGCTL); |
3462 | if (kvm_mpx_supported() && |
3463 | (!vmx->nested.nested_run_pending || |
3464 | !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) |
3465 | vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(field: GUEST_BNDCFGS); |
3466 | |
3467 | /* |
3468 | * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* |
3469 | * nested early checks are disabled. In the event of a "late" VM-Fail, |
3470 | * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its |
3471 | * software model to the pre-VMEntry host state. When EPT is disabled, |
3472 | * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes |
3473 | * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing |
3474 | * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to |
3475 | * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested |
3476 | * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is |
3477 | * guaranteed to be overwritten with a shadow CR3 prior to re-entering |
3478 | * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as |
3479 | * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks |
3480 | * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail |
3481 | * path would need to manually save/restore vmcs01.GUEST_CR3. |
3482 | */ |
3483 | if (!enable_ept && !nested_early_check) |
3484 | vmcs_writel(field: GUEST_CR3, value: vcpu->arch.cr3); |
3485 | |
3486 | vmx_switch_vmcs(vcpu, vmcs: &vmx->nested.vmcs02); |
3487 | |
3488 | prepare_vmcs02_early(vmx, vmcs01: &vmx->vmcs01, vmcs12); |
3489 | |
3490 | if (from_vmentry) { |
3491 | if (unlikely(!nested_get_vmcs12_pages(vcpu))) { |
3492 | vmx_switch_vmcs(vcpu, vmcs: &vmx->vmcs01); |
3493 | return NVMX_VMENTRY_KVM_INTERNAL_ERROR; |
3494 | } |
3495 | |
3496 | if (nested_vmx_check_vmentry_hw(vcpu)) { |
3497 | vmx_switch_vmcs(vcpu, vmcs: &vmx->vmcs01); |
3498 | return NVMX_VMENTRY_VMFAIL; |
3499 | } |
3500 | |
3501 | if (nested_vmx_check_guest_state(vcpu, vmcs12, |
3502 | entry_failure_code: &entry_failure_code)) { |
3503 | exit_reason.basic = EXIT_REASON_INVALID_STATE; |
3504 | vmcs12->exit_qualification = entry_failure_code; |
3505 | goto vmentry_fail_vmexit; |
3506 | } |
3507 | } |
3508 | |
3509 | enter_guest_mode(vcpu); |
3510 | |
3511 | if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, entry_failure_code: &entry_failure_code)) { |
3512 | exit_reason.basic = EXIT_REASON_INVALID_STATE; |
3513 | vmcs12->exit_qualification = entry_failure_code; |
3514 | goto vmentry_fail_vmexit_guest_mode; |
3515 | } |
3516 | |
3517 | if (from_vmentry) { |
3518 | failed_index = nested_vmx_load_msr(vcpu, |
3519 | gpa: vmcs12->vm_entry_msr_load_addr, |
3520 | count: vmcs12->vm_entry_msr_load_count); |
3521 | if (failed_index) { |
3522 | exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; |
3523 | vmcs12->exit_qualification = failed_index; |
3524 | goto vmentry_fail_vmexit_guest_mode; |
3525 | } |
3526 | } else { |
3527 | /* |
3528 | * The MMU is not initialized to point at the right entities yet and |
3529 | * "get pages" would need to read data from the guest (i.e. we will |
3530 | * need to perform gpa to hpa translation). Request a call |
3531 | * to nested_get_vmcs12_pages before the next VM-entry. The MSRs |
3532 | * have already been set at vmentry time and should not be reset. |
3533 | */ |
3534 | kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); |
3535 | } |
3536 | |
3537 | /* |
3538 | * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI |
3539 | * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can |
3540 | * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit |
3541 | * unconditionally. |
3542 | */ |
3543 | if (unlikely(evaluate_pending_interrupts)) |
3544 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
3545 | |
3546 | /* |
3547 | * Do not start the preemption timer hrtimer until after we know |
3548 | * we are successful, so that only nested_vmx_vmexit needs to cancel |
3549 | * the timer. |
3550 | */ |
3551 | vmx->nested.preemption_timer_expired = false; |
3552 | if (nested_cpu_has_preemption_timer(vmcs12)) { |
3553 | u64 timer_value = vmx_calc_preemption_timer_value(vcpu); |
3554 | vmx_start_preemption_timer(vcpu, preemption_timeout: timer_value); |
3555 | } |
3556 | |
3557 | /* |
3558 | * Note no nested_vmx_succeed or nested_vmx_fail here. At this point |
3559 | * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet |
3560 | * returned as far as L1 is concerned. It will only return (and set |
3561 | * the success flag) when L2 exits (see nested_vmx_vmexit()). |
3562 | */ |
3563 | return NVMX_VMENTRY_SUCCESS; |
3564 | |
3565 | /* |
3566 | * A failed consistency check that leads to a VMExit during L1's |
3567 | * VMEnter to L2 is a variation of a normal VMexit, as explained in |
3568 | * 26.7 "VM-entry failures during or after loading guest state". |
3569 | */ |
3570 | vmentry_fail_vmexit_guest_mode: |
3571 | if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) |
3572 | vcpu->arch.tsc_offset -= vmcs12->tsc_offset; |
3573 | leave_guest_mode(vcpu); |
3574 | |
3575 | vmentry_fail_vmexit: |
3576 | vmx_switch_vmcs(vcpu, vmcs: &vmx->vmcs01); |
3577 | |
3578 | if (!from_vmentry) |
3579 | return NVMX_VMENTRY_VMEXIT; |
3580 | |
3581 | load_vmcs12_host_state(vcpu, vmcs12); |
3582 | vmcs12->vm_exit_reason = exit_reason.full; |
3583 | if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) |
3584 | vmx->nested.need_vmcs12_to_shadow_sync = true; |
3585 | return NVMX_VMENTRY_VMEXIT; |
3586 | } |
3587 | |
3588 | /* |
3589 | * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 |
3590 | * for running an L2 nested guest. |
3591 | */ |
3592 | static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) |
3593 | { |
3594 | struct vmcs12 *vmcs12; |
3595 | enum nvmx_vmentry_status status; |
3596 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3597 | u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); |
3598 | enum nested_evmptrld_status evmptrld_status; |
3599 | |
3600 | if (!nested_vmx_check_permission(vcpu)) |
3601 | return 1; |
3602 | |
3603 | evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, from_launch: launch); |
3604 | if (evmptrld_status == EVMPTRLD_ERROR) { |
3605 | kvm_queue_exception(vcpu, UD_VECTOR); |
3606 | return 1; |
3607 | } |
3608 | |
3609 | kvm_pmu_trigger_event(vcpu, eventsel: kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); |
3610 | |
3611 | if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) |
3612 | return nested_vmx_failInvalid(vcpu); |
3613 | |
3614 | if (CC(!nested_vmx_is_evmptr12_valid(vmx) && |
3615 | vmx->nested.current_vmptr == INVALID_GPA)) |
3616 | return nested_vmx_failInvalid(vcpu); |
3617 | |
3618 | vmcs12 = get_vmcs12(vcpu); |
3619 | |
3620 | /* |
3621 | * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact |
3622 | * that there *is* a valid VMCS pointer, RFLAGS.CF is set |
3623 | * rather than RFLAGS.ZF, and no error number is stored to the |
3624 | * VM-instruction error field. |
3625 | */ |
3626 | if (CC(vmcs12->hdr.shadow_vmcs)) |
3627 | return nested_vmx_failInvalid(vcpu); |
3628 | |
3629 | if (nested_vmx_is_evmptr12_valid(vmx)) { |
3630 | struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); |
3631 | |
3632 | copy_enlightened_to_vmcs12(vmx, hv_clean_fields: evmcs->hv_clean_fields); |
3633 | /* Enlightened VMCS doesn't have launch state */ |
3634 | vmcs12->launch_state = !launch; |
3635 | } else if (enable_shadow_vmcs) { |
3636 | copy_shadow_to_vmcs12(vmx); |
3637 | } |
3638 | |
3639 | /* |
3640 | * The nested entry process starts with enforcing various prerequisites |
3641 | * on vmcs12 as required by the Intel SDM, acting appropriately when |
3642 | * they fail: as the SDM explains, some conditions should cause the |
3643 | * instruction to fail, while others will cause the instruction to seem |
3644 | * to succeed, but return an EXIT_REASON_INVALID_STATE. |
3645 | * To speed up the normal (success) code path, we should avoid checking |
3646 | * for misconfigurations which will anyway be caught by the processor |
3647 | * when using the merged vmcs02. |
3648 | */ |
3649 | if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) |
3650 | return nested_vmx_fail(vcpu, vm_instruction_error: VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); |
3651 | |
3652 | if (CC(vmcs12->launch_state == launch)) |
3653 | return nested_vmx_fail(vcpu, |
3654 | vm_instruction_error: launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS |
3655 | : VMXERR_VMRESUME_NONLAUNCHED_VMCS); |
3656 | |
3657 | if (nested_vmx_check_controls(vcpu, vmcs12)) |
3658 | return nested_vmx_fail(vcpu, vm_instruction_error: VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
3659 | |
3660 | if (nested_vmx_check_address_space_size(vcpu, vmcs12)) |
3661 | return nested_vmx_fail(vcpu, vm_instruction_error: VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); |
3662 | |
3663 | if (nested_vmx_check_host_state(vcpu, vmcs12)) |
3664 | return nested_vmx_fail(vcpu, vm_instruction_error: VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); |
3665 | |
3666 | /* |
3667 | * We're finally done with prerequisite checking, and can start with |
3668 | * the nested entry. |
3669 | */ |
3670 | vmx->nested.nested_run_pending = 1; |
3671 | vmx->nested.has_preemption_timer_deadline = false; |
3672 | status = nested_vmx_enter_non_root_mode(vcpu, from_vmentry: true); |
3673 | if (unlikely(status != NVMX_VMENTRY_SUCCESS)) |
3674 | goto vmentry_failed; |
3675 | |
3676 | /* Emulate processing of posted interrupts on VM-Enter. */ |
3677 | if (nested_cpu_has_posted_intr(vmcs12) && |
3678 | kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { |
3679 | vmx->nested.pi_pending = true; |
3680 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
3681 | kvm_apic_clear_irr(vcpu, vec: vmx->nested.posted_intr_nv); |
3682 | } |
3683 | |
3684 | /* Hide L1D cache contents from the nested guest. */ |
3685 | vmx->vcpu.arch.l1tf_flush_l1d = true; |
3686 | |
3687 | /* |
3688 | * Must happen outside of nested_vmx_enter_non_root_mode() as it will |
3689 | * also be used as part of restoring nVMX state for |
3690 | * snapshot restore (migration). |
3691 | * |
3692 | * In this flow, it is assumed that vmcs12 cache was |
3693 | * transferred as part of captured nVMX state and should |
3694 | * therefore not be read from guest memory (which may not |
3695 | * exist on destination host yet). |
3696 | */ |
3697 | nested_cache_shadow_vmcs12(vcpu, vmcs12); |
3698 | |
3699 | switch (vmcs12->guest_activity_state) { |
3700 | case GUEST_ACTIVITY_HLT: |
3701 | /* |
3702 | * If we're entering a halted L2 vcpu and the L2 vcpu won't be |
3703 | * awakened by event injection or by an NMI-window VM-exit or |
3704 | * by an interrupt-window VM-exit, halt the vcpu. |
3705 | */ |
3706 | if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && |
3707 | !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && |
3708 | !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && |
3709 | (vmcs12->guest_rflags & X86_EFLAGS_IF))) { |
3710 | vmx->nested.nested_run_pending = 0; |
3711 | return kvm_emulate_halt_noskip(vcpu); |
3712 | } |
3713 | break; |
3714 | case GUEST_ACTIVITY_WAIT_SIPI: |
3715 | vmx->nested.nested_run_pending = 0; |
3716 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
3717 | break; |
3718 | default: |
3719 | break; |
3720 | } |
3721 | |
3722 | return 1; |
3723 | |
3724 | vmentry_failed: |
3725 | vmx->nested.nested_run_pending = 0; |
3726 | if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) |
3727 | return 0; |
3728 | if (status == NVMX_VMENTRY_VMEXIT) |
3729 | return 1; |
3730 | WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); |
3731 | return nested_vmx_fail(vcpu, vm_instruction_error: VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
3732 | } |
3733 | |
3734 | /* |
3735 | * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date |
3736 | * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). |
3737 | * This function returns the new value we should put in vmcs12.guest_cr0. |
3738 | * It's not enough to just return the vmcs02 GUEST_CR0. Rather, |
3739 | * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now |
3740 | * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 |
3741 | * didn't trap the bit, because if L1 did, so would L0). |
3742 | * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have |
3743 | * been modified by L2, and L1 knows it. So just leave the old value of |
3744 | * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 |
3745 | * isn't relevant, because if L0 traps this bit it can set it to anything. |
3746 | * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have |
3747 | * changed these bits, and therefore they need to be updated, but L0 |
3748 | * didn't necessarily allow them to be changed in GUEST_CR0 - and rather |
3749 | * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. |
3750 | */ |
3751 | static inline unsigned long |
3752 | vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) |
3753 | { |
3754 | return |
3755 | /*1*/ (vmcs_readl(field: GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | |
3756 | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | |
3757 | /*3*/ (vmcs_readl(field: CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | |
3758 | vcpu->arch.cr0_guest_owned_bits)); |
3759 | } |
3760 | |
3761 | static inline unsigned long |
3762 | vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) |
3763 | { |
3764 | return |
3765 | /*1*/ (vmcs_readl(field: GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | |
3766 | /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | |
3767 | /*3*/ (vmcs_readl(field: CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | |
3768 | vcpu->arch.cr4_guest_owned_bits)); |
3769 | } |
3770 | |
3771 | static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, |
3772 | struct vmcs12 *vmcs12, |
3773 | u32 vm_exit_reason, u32 exit_intr_info) |
3774 | { |
3775 | u32 idt_vectoring; |
3776 | unsigned int nr; |
3777 | |
3778 | /* |
3779 | * Per the SDM, VM-Exits due to double and triple faults are never |
3780 | * considered to occur during event delivery, even if the double/triple |
3781 | * fault is the result of an escalating vectoring issue. |
3782 | * |
3783 | * Note, the SDM qualifies the double fault behavior with "The original |
3784 | * event results in a double-fault exception". It's unclear why the |
3785 | * qualification exists since exits due to double fault can occur only |
3786 | * while vectoring a different exception (injected events are never |
3787 | * subject to interception), i.e. there's _always_ an original event. |
3788 | * |
3789 | * The SDM also uses NMI as a confusing example for the "original event |
3790 | * causes the VM exit directly" clause. NMI isn't special in any way, |
3791 | * the same rule applies to all events that cause an exit directly. |
3792 | * NMI is an odd choice for the example because NMIs can only occur on |
3793 | * instruction boundaries, i.e. they _can't_ occur during vectoring. |
3794 | */ |
3795 | if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || |
3796 | ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && |
3797 | is_double_fault(intr_info: exit_intr_info))) { |
3798 | vmcs12->idt_vectoring_info_field = 0; |
3799 | } else if (vcpu->arch.exception.injected) { |
3800 | nr = vcpu->arch.exception.vector; |
3801 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; |
3802 | |
3803 | if (kvm_exception_is_soft(nr)) { |
3804 | vmcs12->vm_exit_instruction_len = |
3805 | vcpu->arch.event_exit_inst_len; |
3806 | idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; |
3807 | } else |
3808 | idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; |
3809 | |
3810 | if (vcpu->arch.exception.has_error_code) { |
3811 | idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; |
3812 | vmcs12->idt_vectoring_error_code = |
3813 | vcpu->arch.exception.error_code; |
3814 | } |
3815 | |
3816 | vmcs12->idt_vectoring_info_field = idt_vectoring; |
3817 | } else if (vcpu->arch.nmi_injected) { |
3818 | vmcs12->idt_vectoring_info_field = |
3819 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; |
3820 | } else if (vcpu->arch.interrupt.injected) { |
3821 | nr = vcpu->arch.interrupt.nr; |
3822 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; |
3823 | |
3824 | if (vcpu->arch.interrupt.soft) { |
3825 | idt_vectoring |= INTR_TYPE_SOFT_INTR; |
3826 | vmcs12->vm_entry_instruction_len = |
3827 | vcpu->arch.event_exit_inst_len; |
3828 | } else |
3829 | idt_vectoring |= INTR_TYPE_EXT_INTR; |
3830 | |
3831 | vmcs12->idt_vectoring_info_field = idt_vectoring; |
3832 | } else { |
3833 | vmcs12->idt_vectoring_info_field = 0; |
3834 | } |
3835 | } |
3836 | |
3837 | |
3838 | void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) |
3839 | { |
3840 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
3841 | gfn_t gfn; |
3842 | |
3843 | /* |
3844 | * Don't need to mark the APIC access page dirty; it is never |
3845 | * written to by the CPU during APIC virtualization. |
3846 | */ |
3847 | |
3848 | if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { |
3849 | gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; |
3850 | kvm_vcpu_mark_page_dirty(vcpu, gfn); |
3851 | } |
3852 | |
3853 | if (nested_cpu_has_posted_intr(vmcs12)) { |
3854 | gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; |
3855 | kvm_vcpu_mark_page_dirty(vcpu, gfn); |
3856 | } |
3857 | } |
3858 | |
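/*
 * Emulate the posted-interrupt processing that hardware performs on VM-entry:
 * if the descriptor's ON bit is set, merge its PIR into L2's virtual-APIC
 * page and raise RVI in GUEST_INTR_STATUS accordingly.
 */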
3859 | static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) |
3860 | { |
3861 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3862 | int max_irr; |
3863 | void *vapic_page; |
3864 | u16 status; |
3865 | |
3866 | if (!vmx->nested.pi_pending) |
3867 | return 0; |
3868 | |
3869 | if (!vmx->nested.pi_desc) |
3870 | goto mmio_needed; |
3871 | |
3872 | vmx->nested.pi_pending = false; |
3873 | |
3874 | if (!pi_test_and_clear_on(pi_desc: vmx->nested.pi_desc)) |
3875 | return 0; |
3876 | |
3877 | max_irr = find_last_bit(addr: (unsigned long *)vmx->nested.pi_desc->pir, size: 256); |
3878 | if (max_irr != 256) { |
3879 | vapic_page = vmx->nested.virtual_apic_map.hva; |
3880 | if (!vapic_page) |
3881 | goto mmio_needed; |
3882 | |
3883 | __kvm_apic_update_irr(pir: vmx->nested.pi_desc->pir, |
3884 | regs: vapic_page, max_irr: &max_irr); |
3885 | status = vmcs_read16(field: GUEST_INTR_STATUS); |
3886 | if ((u8)max_irr > ((u8)status & 0xff)) { |
3887 | status &= ~0xff; |
3888 | status |= (u8)max_irr; |
3889 | vmcs_write16(field: GUEST_INTR_STATUS, value: status); |
3890 | } |
3891 | } |
3892 | |
3893 | nested_mark_vmcs12_pages_dirty(vcpu); |
3894 | return 0; |
3895 | |
3896 | mmio_needed: |
3897 | kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); |
3898 | return -ENXIO; |
3899 | } |
3900 | |
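/*
 * Reflect a pending exception to L1: build the VM-exit interruption
 * information and exit qualification (CR2 for #PF, a DR6-style payload for
 * #DB) and synthesize an EXIT_REASON_EXCEPTION_NMI VM-exit.
 */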
3901 | static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) |
3902 | { |
3903 | struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; |
3904 | u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; |
3905 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
3906 | unsigned long exit_qual; |
3907 | |
3908 | if (ex->has_payload) { |
3909 | exit_qual = ex->payload; |
3910 | } else if (ex->vector == PF_VECTOR) { |
3911 | exit_qual = vcpu->arch.cr2; |
3912 | } else if (ex->vector == DB_VECTOR) { |
3913 | exit_qual = vcpu->arch.dr6; |
3914 | exit_qual &= ~DR6_BT; |
3915 | exit_qual ^= DR6_ACTIVE_LOW; |
3916 | } else { |
3917 | exit_qual = 0; |
3918 | } |
3919 | |
3920 | /* |
3921 | * Unlike AMD's Paged Real Mode, which reports an error code on #PF |
3922 | * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the |
3923 | * "has error code" flags on VM-Exit if the CPU is in Real Mode. |
3924 | */ |
3925 | if (ex->has_error_code && is_protmode(vcpu)) { |
3926 | /* |
3927 | * Intel CPUs do not generate error codes with bits 31:16 set, |
3928 | * and more importantly VMX disallows setting bits 31:16 in the |
3929 | * injected error code for VM-Entry. Drop the bits to mimic |
3930 | * hardware and avoid inducing failure on nested VM-Entry if L1 |
3931 | * chooses to inject the exception back to L2. AMD CPUs _do_ |
3932 | * generate "full" 32-bit error codes, so KVM allows userspace |
3933 | * to inject exception error codes with bits 31:16 set. |
3934 | */ |
3935 | vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; |
3936 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; |
3937 | } |
3938 | |
3939 | if (kvm_exception_is_soft(nr: ex->vector)) |
3940 | intr_info |= INTR_TYPE_SOFT_EXCEPTION; |
3941 | else |
3942 | intr_info |= INTR_TYPE_HARD_EXCEPTION; |
3943 | |
3944 | if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && |
3945 | vmx_get_nmi_mask(vcpu)) |
3946 | intr_info |= INTR_INFO_UNBLOCK_NMI; |
3947 | |
3948 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, exit_intr_info: intr_info, exit_qualification: exit_qual); |
3949 | } |
3950 | |
3951 | /* |
3952 | * Returns true if a debug trap is (likely) pending delivery. Infer the class |
3953 | * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). |
3954 | * Using the payload is flawed because code breakpoints (fault-like) and data |
3955 | * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. |
3956 | * this will return false positives if a to-be-injected code breakpoint #DB is |
3957 | * pending (from KVM's perspective, but not "pending" across an instruction |
3958 | * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it |
3959 | * too is trap-like. |
3960 | * |
3961 | * KVM "works" despite these flaws as ICEBP isn't currently supported by the |
3962 | * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the |
3963 | * #DB has already happened), and MTF isn't marked pending on code breakpoints |
3964 | * from the emulator (because such #DBs are fault-like and thus don't trigger |
3965 | * actions that fire on instruction retire). |
3966 | */ |
3967 | static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) |
3968 | { |
3969 | if (!ex->pending || ex->vector != DB_VECTOR) |
3970 | return 0; |
3971 | |
3972 | /* General Detect #DBs are always fault-like. */ |
3973 | return ex->payload & ~DR6_BD; |
3974 | } |
3975 | |
3976 | /* |
3977 | * Returns true if there's a pending #DB exception that is lower priority than |
3978 | * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by |
3979 | * KVM, but could theoretically be injected by userspace. Note, this code is |
3980 | * imperfect, see above. |
3981 | */ |
3982 | static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) |
3983 | { |
3984 | return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; |
3985 | } |
3986 | |
3987 | /* |
3988 | * Certain VM-exits set the 'pending debug exceptions' field to indicate a |
3989 | * recognized #DB (data or single-step) that has yet to be delivered. Since KVM |
3990 | * represents these debug traps with a payload that is said to be compatible |
3991 | * with the 'pending debug exceptions' field, write the payload to the VMCS |
3992 | * field if a VM-exit is delivered before the debug trap. |
3993 | */ |
3994 | static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) |
3995 | { |
3996 | unsigned long pending_dbg; |
3997 | |
3998 | pending_dbg = vmx_get_pending_dbg_trap(ex: &vcpu->arch.exception); |
3999 | if (pending_dbg) |
4000 | vmcs_writel(field: GUEST_PENDING_DBG_EXCEPTIONS, value: pending_dbg); |
4001 | } |
4002 | |
4003 | static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) |
4004 | { |
4005 | return nested_cpu_has_preemption_timer(vmcs12: get_vmcs12(vcpu)) && |
4006 | to_vmx(vcpu)->nested.preemption_timer_expired; |
4007 | } |
4008 | |
4009 | static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) |
4010 | { |
4011 | return nested_vmx_preemption_timer_pending(vcpu) || |
4012 | to_vmx(vcpu)->nested.mtf_pending; |
4013 | } |
4014 | |
4015 | /* |
4016 | * Per the Intel SDM's table "Priority Among Concurrent Events", with minor |
4017 | * edits to fill in missing examples, e.g. #DB due to split-lock accesses, |
4018 | * and less minor edits to splice in the priority of VMX Non-Root specific |
4019 | * events, e.g. MTF and NMI/INTR-window exiting. |
4020 | * |
4021 | * 1 Hardware Reset and Machine Checks |
4022 | * - RESET |
4023 | * - Machine Check |
4024 | * |
4025 | * 2 Trap on Task Switch |
4026 | * - T flag in TSS is set (on task switch) |
4027 | * |
4028 | * 3 External Hardware Interventions |
4029 | * - FLUSH |
4030 | * - STOPCLK |
4031 | * - SMI |
4032 | * - INIT |
4033 | * |
4034 | * 3.5 Monitor Trap Flag (MTF) VM-exit[1] |
4035 | * |
4036 | * 4 Traps on Previous Instruction |
4037 | * - Breakpoints |
4038 | * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O |
4039 | * breakpoint, or #DB due to a split-lock access) |
4040 | * |
4041 | * 4.3 VMX-preemption timer expired VM-exit[2] |
4042 | * |
4043 | * 4.6 NMI-window exiting VM-exit[3] |
4044 | * |
4045 | * 5 Nonmaskable Interrupts (NMI) |
4046 | * |
4047 | * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4] |
4048 | * |
4049 | * 6 Maskable Hardware Interrupts |
4050 | * |
4051 | * 7 Code Breakpoint Fault |
4052 | * |
4053 | * 8 Faults from Fetching Next Instruction |
4054 | * - Code-Segment Limit Violation |
4055 | * - Code Page Fault |
4056 | * - Control protection exception (missing ENDBRANCH at target of indirect |
4057 | * call or jump) |
4058 | * |
4059 | * 9 Faults from Decoding Next Instruction |
4060 | * - Instruction length > 15 bytes |
4061 | * - Invalid Opcode |
4062 | * - Coprocessor Not Available |
4063 | * |
4064 | *10 Faults on Executing Instruction |
4065 | * - Overflow |
4066 | * - Bound error |
4067 | * - Invalid TSS |
4068 | * - Segment Not Present |
4069 | * - Stack fault |
4070 | * - General Protection |
4071 | * - Data Page Fault |
4072 | * - Alignment Check |
4073 | * - x86 FPU Floating-point exception |
4074 | * - SIMD floating-point exception |
4075 | * - Virtualization exception |
4076 | * - Control protection exception |
4077 | * |
4078 | * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), |
4079 | * INIT signals, and higher priority events take priority over MTF VM exits. |
4080 | * MTF VM exits take priority over debug-trap exceptions and lower priority |
4081 | * events. |
4082 | * |
4083 | * [2] Debug-trap exceptions and higher priority events take priority over VM exits |
4084 | * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption |
4085 | * timer take priority over VM exits caused by the "NMI-window exiting" |
4086 | * VM-execution control and lower priority events. |
4087 | * |
4088 | * [3] Debug-trap exceptions and higher priority events take priority over VM exits |
4089 | * caused by "NMI-window exiting". VM exits caused by this control take |
4090 | * priority over non-maskable interrupts (NMIs) and lower priority events. |
4091 | * |
4092 | * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to |
4093 | * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, |
4094 | * non-maskable interrupts (NMIs) and higher priority events take priority over |
4095 | * delivery of a virtual interrupt; delivery of a virtual interrupt takes |
4096 | * priority over external interrupts and lower priority events. |
4097 | */ |
4098 | static int vmx_check_nested_events(struct kvm_vcpu *vcpu) |
4099 | { |
4100 | struct kvm_lapic *apic = vcpu->arch.apic; |
4101 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4102 | /* |
4103 | * Only a pending nested run blocks a pending exception. If there is a |
4104 | * previously injected event, the pending exception occurred while said |
4105 | * event was being delivered and thus needs to be handled. |
4106 | */ |
4107 | bool block_nested_exceptions = vmx->nested.nested_run_pending; |
4108 | /* |
4109 | * New events (not exceptions) are only recognized at instruction |
4110 | * boundaries. If an event needs reinjection, then KVM is handling a |
4111 | * VM-Exit that occurred _during_ instruction execution; new events are |
4112 | * blocked until the instruction completes. |
4113 | */ |
4114 | bool block_nested_events = block_nested_exceptions || |
4115 | kvm_event_needs_reinjection(vcpu); |
4116 | |
4117 | if (lapic_in_kernel(vcpu) && |
4118 | test_bit(KVM_APIC_INIT, &apic->pending_events)) { |
4119 | if (block_nested_events) |
4120 | return -EBUSY; |
4121 | nested_vmx_update_pending_dbg(vcpu); |
4122 | clear_bit(KVM_APIC_INIT, addr: &apic->pending_events); |
4123 | if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) |
4124 | nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, exit_intr_info: 0, exit_qualification: 0); |
4125 | |
4126 | /* MTF is discarded if the vCPU is in WFS. */ |
4127 | vmx->nested.mtf_pending = false; |
4128 | return 0; |
4129 | } |
4130 | |
4131 | if (lapic_in_kernel(vcpu) && |
4132 | test_bit(KVM_APIC_SIPI, &apic->pending_events)) { |
4133 | if (block_nested_events) |
4134 | return -EBUSY; |
4135 | |
4136 | clear_bit(KVM_APIC_SIPI, addr: &apic->pending_events); |
4137 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { |
4138 | nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, exit_intr_info: 0, |
4139 | exit_qualification: apic->sipi_vector & 0xFFUL); |
4140 | return 0; |
4141 | } |
4142 | /* Fallthrough, the SIPI is completely ignored. */ |
4143 | } |
4144 | |
4145 | /* |
4146 | * Process exceptions that are higher priority than Monitor Trap Flag: |
4147 | * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but |
4148 | * could theoretically come in from userspace), and ICEBP (INT1). |
4149 | * |
4150 | * TODO: SMIs have higher priority than MTF and trap-like #DBs (except |
4151 | * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF |
4152 | * across SMI/RSM as it should; that needs to be addressed in order to |
4153 | * prioritize SMI over MTF and trap-like #DBs. |
4154 | */ |
4155 | if (vcpu->arch.exception_vmexit.pending && |
4156 | !vmx_is_low_priority_db_trap(ex: &vcpu->arch.exception_vmexit)) { |
4157 | if (block_nested_exceptions) |
4158 | return -EBUSY; |
4159 | |
4160 | nested_vmx_inject_exception_vmexit(vcpu); |
4161 | return 0; |
4162 | } |
4163 | |
4164 | if (vcpu->arch.exception.pending && |
4165 | !vmx_is_low_priority_db_trap(ex: &vcpu->arch.exception)) { |
4166 | if (block_nested_exceptions) |
4167 | return -EBUSY; |
4168 | goto no_vmexit; |
4169 | } |
4170 | |
4171 | if (vmx->nested.mtf_pending) { |
4172 | if (block_nested_events) |
4173 | return -EBUSY; |
4174 | nested_vmx_update_pending_dbg(vcpu); |
4175 | nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, exit_intr_info: 0, exit_qualification: 0); |
4176 | return 0; |
4177 | } |
4178 | |
4179 | if (vcpu->arch.exception_vmexit.pending) { |
4180 | if (block_nested_exceptions) |
4181 | return -EBUSY; |
4182 | |
4183 | nested_vmx_inject_exception_vmexit(vcpu); |
4184 | return 0; |
4185 | } |
4186 | |
4187 | if (vcpu->arch.exception.pending) { |
4188 | if (block_nested_exceptions) |
4189 | return -EBUSY; |
4190 | goto no_vmexit; |
4191 | } |
4192 | |
4193 | if (nested_vmx_preemption_timer_pending(vcpu)) { |
4194 | if (block_nested_events) |
4195 | return -EBUSY; |
4196 | nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, exit_intr_info: 0, exit_qualification: 0); |
4197 | return 0; |
4198 | } |
4199 | |
4200 | if (vcpu->arch.smi_pending && !is_smm(vcpu)) { |
4201 | if (block_nested_events) |
4202 | return -EBUSY; |
4203 | goto no_vmexit; |
4204 | } |
4205 | |
4206 | if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { |
4207 | if (block_nested_events) |
4208 | return -EBUSY; |
4209 | if (!nested_exit_on_nmi(vcpu)) |
4210 | goto no_vmexit; |
4211 | |
4212 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, |
4213 | NMI_VECTOR | INTR_TYPE_NMI_INTR | |
4214 | INTR_INFO_VALID_MASK, exit_qualification: 0); |
4215 | /* |
4216 | * The NMI-triggered VM exit counts as injection: |
4217 | * clear this one and block further NMIs. |
4218 | */ |
4219 | vcpu->arch.nmi_pending = 0; |
4220 | vmx_set_nmi_mask(vcpu, masked: true); |
4221 | return 0; |
4222 | } |
4223 | |
4224 | if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { |
4225 | if (block_nested_events) |
4226 | return -EBUSY; |
4227 | if (!nested_exit_on_intr(vcpu)) |
4228 | goto no_vmexit; |
4229 | nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, exit_intr_info: 0, exit_qualification: 0); |
4230 | return 0; |
4231 | } |
4232 | |
4233 | no_vmexit: |
4234 | return vmx_complete_nested_posted_interrupt(vcpu); |
4235 | } |
4236 | |
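/*
 * Convert the time remaining on the emulated preemption timer's hrtimer back
 * into VMX-preemption timer units: guest TSC ticks shifted right by the
 * emulated VMX_MISC preemption timer rate.
 */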
4237 | static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) |
4238 | { |
4239 | ktime_t remaining = |
4240 | hrtimer_get_remaining(timer: &to_vmx(vcpu)->nested.preemption_timer); |
4241 | u64 value; |
4242 | |
4243 | if (ktime_to_ns(kt: remaining) <= 0) |
4244 | return 0; |
4245 | |
4246 | value = ktime_to_ns(kt: remaining) * vcpu->arch.virtual_tsc_khz; |
4247 | do_div(value, 1000000); |
4248 | return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; |
4249 | } |
4250 | |
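/*
 * VMCS12 fields that are copied back from vmcs02 only by the "rare" sync
 * path below, rather than on every nested VM-exit.
 */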
4251 | static bool is_vmcs12_ext_field(unsigned long field) |
4252 | { |
4253 | switch (field) { |
4254 | case GUEST_ES_SELECTOR: |
4255 | case GUEST_CS_SELECTOR: |
4256 | case GUEST_SS_SELECTOR: |
4257 | case GUEST_DS_SELECTOR: |
4258 | case GUEST_FS_SELECTOR: |
4259 | case GUEST_GS_SELECTOR: |
4260 | case GUEST_LDTR_SELECTOR: |
4261 | case GUEST_TR_SELECTOR: |
4262 | case GUEST_ES_LIMIT: |
4263 | case GUEST_CS_LIMIT: |
4264 | case GUEST_SS_LIMIT: |
4265 | case GUEST_DS_LIMIT: |
4266 | case GUEST_FS_LIMIT: |
4267 | case GUEST_GS_LIMIT: |
4268 | case GUEST_LDTR_LIMIT: |
4269 | case GUEST_TR_LIMIT: |
4270 | case GUEST_GDTR_LIMIT: |
4271 | case GUEST_IDTR_LIMIT: |
4272 | case GUEST_ES_AR_BYTES: |
4273 | case GUEST_DS_AR_BYTES: |
4274 | case GUEST_FS_AR_BYTES: |
4275 | case GUEST_GS_AR_BYTES: |
4276 | case GUEST_LDTR_AR_BYTES: |
4277 | case GUEST_TR_AR_BYTES: |
4278 | case GUEST_ES_BASE: |
4279 | case GUEST_CS_BASE: |
4280 | case GUEST_SS_BASE: |
4281 | case GUEST_DS_BASE: |
4282 | case GUEST_FS_BASE: |
4283 | case GUEST_GS_BASE: |
4284 | case GUEST_LDTR_BASE: |
4285 | case GUEST_TR_BASE: |
4286 | case GUEST_GDTR_BASE: |
4287 | case GUEST_IDTR_BASE: |
4288 | case GUEST_PENDING_DBG_EXCEPTIONS: |
4289 | case GUEST_BNDCFGS: |
4290 | return true; |
4291 | default: |
4292 | break; |
4293 | } |
4294 | |
4295 | return false; |
4296 | } |
4297 | |
4298 | static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, |
4299 | struct vmcs12 *vmcs12) |
4300 | { |
4301 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4302 | |
4303 | vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); |
4304 | vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); |
4305 | vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); |
4306 | vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); |
4307 | vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); |
4308 | vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); |
4309 | vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); |
4310 | vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); |
4311 | vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); |
4312 | vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); |
4313 | vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); |
4314 | vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); |
4315 | vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); |
4316 | vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); |
4317 | vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); |
4318 | vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); |
4319 | vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); |
4320 | vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); |
4321 | vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); |
4322 | vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); |
4323 | vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); |
4324 | vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); |
4325 | vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); |
4326 | vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); |
4327 | vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); |
4328 | vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); |
4329 | vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); |
4330 | vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); |
4331 | vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); |
4332 | vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); |
4333 | vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); |
4334 | vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); |
4335 | vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); |
4336 | vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); |
4337 | vmcs12->guest_pending_dbg_exceptions = |
4338 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); |
4339 | |
4340 | vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; |
4341 | } |
4342 | |
4343 | static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, |
4344 | struct vmcs12 *vmcs12) |
4345 | { |
4346 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4347 | int cpu; |
4348 | |
4349 | if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) |
4350 | return; |
4351 | |
4353 | WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); |
4354 | |
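	/*
	 * The rare fields live in vmcs02 and can only be read with VMREAD, so
	 * temporarily make vmcs02 the loaded VMCS, sync the fields into
	 * vmcs12, and then switch back to vmcs01.
	 */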
4355 | cpu = get_cpu(); |
4356 | vmx->loaded_vmcs = &vmx->nested.vmcs02; |
4357 | vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); |
4358 | |
4359 | sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); |
4360 | |
4361 | vmx->loaded_vmcs = &vmx->vmcs01; |
4362 | vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); |
4363 | put_cpu(); |
4364 | } |
4365 | |
4366 | /* |
4367 | * Update the guest state fields of vmcs12 to reflect changes that |
4368 | * occurred while L2 was running. (The "IA-32e mode guest" bit of the |
4369 | * VM-entry controls is also updated, since this is really a guest |
4370 | * state bit.) |
4371 | */ |
4372 | static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) |
4373 | { |
4374 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4375 | |
4376 | if (nested_vmx_is_evmptr12_valid(vmx)) |
4377 | sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); |
4378 | |
4379 | vmx->nested.need_sync_vmcs02_to_vmcs12_rare = |
4380 | !nested_vmx_is_evmptr12_valid(vmx); |
4381 | |
4382 | vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); |
4383 | vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); |
4384 | |
4385 | vmcs12->guest_rsp = kvm_rsp_read(vcpu); |
4386 | vmcs12->guest_rip = kvm_rip_read(vcpu); |
4387 | vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); |
4388 | |
4389 | vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); |
4390 | vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); |
4391 | |
4392 | vmcs12->guest_interruptibility_info = |
4393 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); |
4394 | |
4395 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) |
4396 | vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; |
4397 | else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) |
4398 | vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; |
4399 | else |
4400 | vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; |
4401 | |
4402 | if (nested_cpu_has_preemption_timer(vmcs12) && |
4403 | vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && |
4404 | !vmx->nested.nested_run_pending) |
4405 | vmcs12->vmx_preemption_timer_value = |
4406 | vmx_get_preemption_timer_value(vcpu); |
4407 | |
4408 | /* |
4409 | * In some cases (usually, nested EPT), L2 is allowed to change its |
4410 | * own CR3 without exiting. If it has changed it, we must keep it. |
4411 | * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined |
4412 | * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. |
4413 | * |
4414 | * Additionally, restore L2's PDPTR to vmcs12. |
4415 | */ |
4416 | if (enable_ept) { |
4417 | vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); |
4418 | if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { |
4419 | vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); |
4420 | vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); |
4421 | vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); |
4422 | vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); |
4423 | } |
4424 | } |
4425 | |
4426 | vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); |
4427 | |
4428 | if (nested_cpu_has_vid(vmcs12)) |
4429 | vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); |
4430 | |
4431 | vmcs12->vm_entry_controls = |
4432 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | |
4433 | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); |
4434 | |
4435 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) |
4436 | vmcs12->guest_dr7 = vcpu->arch.dr7; |
4437 | |
4438 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) |
4439 | vmcs12->guest_ia32_efer = vcpu->arch.efer; |
4440 | } |
4441 | |
4442 | /* |
4443 | * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits |
4444 | * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), |
4445 | * and this function updates it to reflect the changes to the guest state while |
4446 | * L2 was running (and perhaps made some exits which were handled directly by L0 |
4447 | * without going back to L1), and to reflect the exit reason. |
4448 | * Note that we do not have to copy here all VMCS fields, just those that |
4449 | * could have changed by the L2 guest or the exit - i.e., the guest-state and |
4450 | * exit-information fields only. Other fields are modified by L1 with VMWRITE, |
4451 | * which already writes to vmcs12 directly. |
4452 | */ |
4453 | static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, |
4454 | u32 vm_exit_reason, u32 exit_intr_info, |
4455 | unsigned long exit_qualification) |
4456 | { |
4457 | /* update exit information fields: */ |
4458 | vmcs12->vm_exit_reason = vm_exit_reason; |
4459 | if (to_vmx(vcpu)->exit_reason.enclave_mode) |
4460 | vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; |
4461 | vmcs12->exit_qualification = exit_qualification; |
4462 | |
4463 | /* |
4464 | * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched |
4465 | * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other |
4466 | * exit info fields are unmodified. |
4467 | */ |
4468 | if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { |
4469 | vmcs12->launch_state = 1; |
4470 | |
4471 | /* vm_entry_intr_info_field is cleared on exit. Emulate this |
4472 | * instead of reading the real value. */ |
4473 | vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; |
4474 | |
4475 | /* |
4476 | * Transfer the event that L0 or L1 may have wanted to inject into |
4477 | * L2 to IDT_VECTORING_INFO_FIELD. |
4478 | */ |
4479 | vmcs12_save_pending_event(vcpu, vmcs12, |
4480 | vm_exit_reason, exit_intr_info); |
4481 | |
4482 | vmcs12->vm_exit_intr_info = exit_intr_info; |
4483 | vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
4484 | vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); |
4485 | |
4486 | /* |
4487 | * According to spec, there's no need to store the guest's |
4488 | * MSRs if the exit is due to a VM-entry failure that occurs |
4489 | * during or after loading the guest state. Since this exit |
4490 | * does not fall in that category, we need to save the MSRs. |
4491 | */ |
4492 | if (nested_vmx_store_msr(vcpu, |
4493 | vmcs12->vm_exit_msr_store_addr, |
4494 | vmcs12->vm_exit_msr_store_count)) |
4495 | nested_vmx_abort(vcpu, |
4496 | VMX_ABORT_SAVE_GUEST_MSR_FAIL); |
4497 | } |
4498 | } |
4499 | |
4500 | /* |
4501 | * A part of what we need to do when the nested L2 guest exits and we want to |
4502 | * run its L1 parent, is to reset L1's guest state to the host state specified |
4503 | * in vmcs12. |
4504 | * This function is to be called not only on normal nested exit, but also on |
4505 | * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry |
4506 | * Failures During or After Loading Guest State"). |
4507 | * This function should be called when the active VMCS is L1's (vmcs01). |
4508 | */ |
4509 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, |
4510 | struct vmcs12 *vmcs12) |
4511 | { |
4512 | enum vm_entry_failure_code ignored; |
4513 | struct kvm_segment seg; |
4514 | |
4515 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) |
4516 | vcpu->arch.efer = vmcs12->host_ia32_efer; |
4517 | else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) |
4518 | vcpu->arch.efer |= (EFER_LMA | EFER_LME); |
4519 | else |
4520 | vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); |
4521 | vmx_set_efer(vcpu, vcpu->arch.efer); |
4522 | |
4523 | kvm_rsp_write(vcpu, vmcs12->host_rsp); |
4524 | kvm_rip_write(vcpu, vmcs12->host_rip); |
4525 | vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); |
4526 | vmx_set_interrupt_shadow(vcpu, 0); |
4527 | |
4528 | /* |
4529 | * Note that calling vmx_set_cr0 is important, even if cr0 hasn't |
4530 | * actually changed, because vmx_set_cr0 refers to efer set above. |
4531 | * |
4532 | * CR0_GUEST_HOST_MASK is already set in the original vmcs01 |
4533 | * (KVM doesn't change it); |
4534 | */ |
4535 | vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); |
4536 | vmx_set_cr0(vcpu, vmcs12->host_cr0); |
4537 | |
4538 | /* Same as above - no reason to call set_cr4_guest_host_mask(). */ |
4539 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); |
4540 | vmx_set_cr4(vcpu, vmcs12->host_cr4); |
4541 | |
4542 | nested_ept_uninit_mmu_context(vcpu); |
4543 | |
4544 | /* |
4545 | * Only PDPTE load can fail as the value of cr3 was checked on entry and |
4546 | * couldn't have changed. |
4547 | */ |
4548 | if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) |
4549 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); |
4550 | |
4551 | nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); |
4552 | |
4553 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); |
4554 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); |
4555 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); |
4556 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); |
4557 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); |
4558 | vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); |
4559 | vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); |
4560 | |
4561 | /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ |
4562 | if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) |
4563 | vmcs_write64(GUEST_BNDCFGS, 0); |
4564 | |
4565 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { |
4566 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); |
4567 | vcpu->arch.pat = vmcs12->host_ia32_pat; |
4568 | } |
4569 | if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && |
4570 | kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) |
4571 | WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, |
4572 | vmcs12->host_ia32_perf_global_ctrl)); |
4573 | |
4574 | /* Set L1 segment info according to Intel SDM |
4575 | 27.5.2 Loading Host Segment and Descriptor-Table Registers */ |
4576 | seg = (struct kvm_segment) { |
4577 | .base = 0, |
4578 | .limit = 0xFFFFFFFF, |
4579 | .selector = vmcs12->host_cs_selector, |
4580 | .type = 11, |
4581 | .present = 1, |
4582 | .s = 1, |
4583 | .g = 1 |
4584 | }; |
4585 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) |
4586 | seg.l = 1; |
4587 | else |
4588 | seg.db = 1; |
4589 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); |
4590 | seg = (struct kvm_segment) { |
4591 | .base = 0, |
4592 | .limit = 0xFFFFFFFF, |
4593 | .type = 3, |
4594 | .present = 1, |
4595 | .s = 1, |
4596 | .db = 1, |
4597 | .g = 1 |
4598 | }; |
4599 | seg.selector = vmcs12->host_ds_selector; |
4600 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); |
4601 | seg.selector = vmcs12->host_es_selector; |
4602 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); |
4603 | seg.selector = vmcs12->host_ss_selector; |
4604 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); |
4605 | seg.selector = vmcs12->host_fs_selector; |
4606 | seg.base = vmcs12->host_fs_base; |
4607 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); |
4608 | seg.selector = vmcs12->host_gs_selector; |
4609 | seg.base = vmcs12->host_gs_base; |
4610 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); |
4611 | seg = (struct kvm_segment) { |
4612 | .base = vmcs12->host_tr_base, |
4613 | .limit = 0x67, |
4614 | .selector = vmcs12->host_tr_selector, |
4615 | .type = 11, |
4616 | .present = 1 |
4617 | }; |
4618 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); |
4619 | |
4620 | memset(&seg, 0, sizeof(seg)); |
4621 | seg.unusable = 1; |
4622 | __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); |
4623 | |
4624 | kvm_set_dr(vcpu, 7, 0x400); |
4625 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); |
4626 | |
4627 | if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, |
4628 | vmcs12->vm_exit_msr_load_count)) |
4629 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); |
4630 | |
4631 | to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); |
4632 | } |
4633 | |
4634 | static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) |
4635 | { |
4636 | struct vmx_uret_msr *efer_msr; |
4637 | unsigned int i; |
4638 | |
4639 | if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) |
4640 | return vmcs_read64(GUEST_IA32_EFER); |
4641 | |
4642 | if (cpu_has_load_ia32_efer()) |
4643 | return host_efer; |
4644 | |
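	/*
	 * EFER wasn't loaded directly from vmcs01; look for it in the MSR
	 * autoload list and the user-return MSR slots, and fall back to the
	 * host value if it isn't found in either.
	 */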
4645 | for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { |
4646 | if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) |
4647 | return vmx->msr_autoload.guest.val[i].value; |
4648 | } |
4649 | |
4650 | efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); |
4651 | if (efer_msr) |
4652 | return efer_msr->data; |
4653 | |
4654 | return host_efer; |
4655 | } |
4656 | |
4657 | static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) |
4658 | { |
4659 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
4660 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4661 | struct vmx_msr_entry g, h; |
4662 | gpa_t gpa; |
4663 | u32 i, j; |
4664 | |
4665 | vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); |
4666 | |
4667 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { |
4668 | /* |
4669 | * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set |
4670 | * as vmcs01.GUEST_DR7 contains a userspace defined value |
4671 | * and vcpu->arch.dr7 is not squirreled away before the |
4672 | * nested VMENTER (not worth adding a variable in nested_vmx). |
4673 | */ |
4674 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) |
4675 | kvm_set_dr(vcpu, 7, DR7_FIXED_1); |
4676 | else |
4677 | WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); |
4678 | } |
4679 | |
4680 | /* |
4681 | * Note that calling vmx_set_{efer,cr0,cr4} is important as they |
4682 | * handle a variety of side effects to KVM's software model. |
4683 | */ |
4684 | vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); |
4685 | |
4686 | vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); |
4687 | vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); |
4688 | |
4689 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); |
4690 | vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); |
4691 | |
4692 | nested_ept_uninit_mmu_context(vcpu); |
4693 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); |
4694 | kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); |
4695 | |
4696 | /* |
4697 | * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs |
4698 | * from vmcs01 (if necessary). The PDPTRs are not loaded on |
4699 | * VMFail, like everything else we just need to ensure our |
4700 | * software model is up-to-date. |
4701 | */ |
4702 | if (enable_ept && is_pae_paging(vcpu)) |
4703 | ept_save_pdptrs(vcpu); |
4704 | |
4705 | kvm_mmu_reset_context(vcpu); |
4706 | |
4707 | /* |
4708 | * This nasty bit of open coding is a compromise between blindly |
4709 | * loading L1's MSRs using the exit load lists (incorrect emulation |
4710 | * of VMFail), leaving the nested VM's MSRs in the software model |
4711 | * (incorrect behavior) and snapshotting the modified MSRs (too |
4712 | * expensive since the lists are unbound by hardware). For each |
4713 | * MSR that was (prematurely) loaded from the nested VMEntry load |
4714 | * list, reload it from the exit load list if it exists and differs |
4715 | * from the guest value. The intent is to stuff host state as |
4716 | * silently as possible, not to fully process the exit load list. |
4717 | */ |
4718 | for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { |
4719 | gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); |
4720 | if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { |
4721 | pr_debug_ratelimited( |
4722 | "%s read MSR index failed (%u, 0x%08llx)\n", |
4723 | __func__, i, gpa); |
4724 | goto vmabort; |
4725 | } |
4726 | |
4727 | for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { |
4728 | gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); |
4729 | if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { |
4730 | pr_debug_ratelimited( |
4731 | "%s read MSR failed (%u, 0x%08llx)\n", |
4732 | __func__, j, gpa); |
4733 | goto vmabort; |
4734 | } |
4735 | if (h.index != g.index) |
4736 | continue; |
4737 | if (h.value == g.value) |
4738 | break; |
4739 | |
4740 | if (nested_vmx_load_msr_check(vcpu, &h)) { |
4741 | pr_debug_ratelimited( |
4742 | "%s check failed (%u, 0x%x, 0x%x)\n", |
4743 | __func__, j, h.index, h.reserved); |
4744 | goto vmabort; |
4745 | } |
4746 | |
4747 | if (kvm_set_msr(vcpu, h.index, h.value)) { |
4748 | pr_debug_ratelimited( |
4749 | "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", |
4750 | __func__, j, h.index, h.value); |
4751 | goto vmabort; |
4752 | } |
4753 | } |
4754 | } |
4755 | |
4756 | return; |
4757 | |
4758 | vmabort: |
4759 | nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); |
4760 | } |
4761 | |
4762 | /* |
4763 | * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 |
4764 | * and modify vmcs12 to make it see what it would expect to see there if |
4765 | * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) |
4766 | */ |
4767 | void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, |
4768 | u32 exit_intr_info, unsigned long exit_qualification) |
4769 | { |
4770 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
4771 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
4772 | |
4773 | /* Pending MTF traps are discarded on VM-Exit. */ |
4774 | vmx->nested.mtf_pending = false; |
4775 | |
4776 | /* trying to cancel vmlaunch/vmresume is a bug */ |
4777 | WARN_ON_ONCE(vmx->nested.nested_run_pending); |
4778 | |
4779 | #ifdef CONFIG_KVM_HYPERV |
4780 | if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { |
4781 | /* |
4782 | * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map |
4783 | * Enlightened VMCS after migration and we still need to |
4784 | * do that when something is forcing L2->L1 exit prior to |
4785 | * the first L2 run. |
4786 | */ |
4787 | (void)nested_get_evmcs_page(vcpu); |
4788 | } |
4789 | #endif |
4790 | |
4791 | /* Service pending TLB flush requests for L2 before switching to L1. */ |
4792 | kvm_service_local_tlb_flush_requests(vcpu); |
4793 | |
4794 | /* |
4795 | * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between |
4796 | * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are |
4797 | * up-to-date before switching to L1. |
4798 | */ |
4799 | if (enable_ept && is_pae_paging(vcpu)) |
4800 | vmx_ept_load_pdptrs(vcpu); |
4801 | |
4802 | leave_guest_mode(vcpu); |
4803 | |
4804 | if (nested_cpu_has_preemption_timer(vmcs12)) |
4805 | hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); |
4806 | |
4807 | if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { |
4808 | vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; |
4809 | if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) |
4810 | vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; |
4811 | } |
4812 | |
4813 | if (likely(!vmx->fail)) { |
4814 | sync_vmcs02_to_vmcs12(vcpu, vmcs12); |
4815 | |
4816 | if (vm_exit_reason != -1) |
4817 | prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, |
4818 | exit_intr_info, exit_qualification); |
4819 | |
4820 | /* |
4821 | * Must happen outside of sync_vmcs02_to_vmcs12() as it will |
4822 | * also be used to capture vmcs12 cache as part of |
4823 | * capturing nVMX state for snapshot (migration). |
4824 | * |
4825 | * Otherwise, this flush will dirty guest memory at a |
4826 | * point it is already assumed by user-space to be |
4827 | * immutable. |
4828 | */ |
4829 | nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); |
4830 | } else { |
4831 | /* |
4832 | * The only expected VM-instruction error is "VM entry with |
4833 | * invalid control field(s)." Anything else indicates a |
4834 | * problem with L0. And we should never get here with a |
4835 | * VMFail of any type if early consistency checks are enabled. |
4836 | */ |
4837 | WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != |
4838 | VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
4839 | WARN_ON_ONCE(nested_early_check); |
4840 | } |
4841 | |
4842 | /* |
4843 | * Drop events/exceptions that were queued for re-injection to L2 |
4844 | * (picked up via vmx_complete_interrupts()), as well as exceptions |
4845 | * that were pending for L2. Note, this must NOT be hoisted above |
4846 | * prepare_vmcs12(), events/exceptions queued for re-injection need to |
4847 | * be captured in vmcs12 (see vmcs12_save_pending_event()). |
4848 | */ |
4849 | vcpu->arch.nmi_injected = false; |
4850 | kvm_clear_exception_queue(vcpu); |
4851 | kvm_clear_interrupt_queue(vcpu); |
4852 | |
4853 | vmx_switch_vmcs(vcpu, &vmx->vmcs01); |
4854 | |
4855 | /* |
4856 | * If IBRS is advertised to the vCPU, KVM must flush the indirect |
4857 | * branch predictors when transitioning from L2 to L1, as L1 expects |
4858 | * hardware (KVM in this case) to provide separate predictor modes. |
4859 | * Bare metal isolates VMX root (host) from VMX non-root (guest), but |
4860 | * doesn't isolate different VMCSs, i.e. in this case, doesn't provide |
4861 | * separate modes for L2 vs L1. |
4862 | */ |
4863 | if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) |
4864 | indirect_branch_prediction_barrier(); |
4865 | |
4866 | /* Update any VMCS fields that might have changed while L2 ran */ |
4867 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); |
4868 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); |
4869 | vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); |
4870 | if (kvm_caps.has_tsc_control) |
4871 | vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); |
4872 | |
4873 | if (vmx->nested.l1_tpr_threshold != -1) |
4874 | vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); |
4875 | |
4876 | if (vmx->nested.change_vmcs01_virtual_apic_mode) { |
4877 | vmx->nested.change_vmcs01_virtual_apic_mode = false; |
4878 | vmx_set_virtual_apic_mode(vcpu); |
4879 | } |
4880 | |
4881 | if (vmx->nested.update_vmcs01_cpu_dirty_logging) { |
4882 | vmx->nested.update_vmcs01_cpu_dirty_logging = false; |
4883 | vmx_update_cpu_dirty_logging(vcpu); |
4884 | } |
4885 | |
4886 | /* Unpin physical memory we referred to in vmcs02 */ |
4887 | kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); |
4888 | kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); |
4889 | kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); |
4890 | vmx->nested.pi_desc = NULL; |
4891 | |
4892 | if (vmx->nested.reload_vmcs01_apic_access_page) { |
4893 | vmx->nested.reload_vmcs01_apic_access_page = false; |
4894 | kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); |
4895 | } |
4896 | |
4897 | if (vmx->nested.update_vmcs01_apicv_status) { |
4898 | vmx->nested.update_vmcs01_apicv_status = false; |
4899 | kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); |
4900 | } |
4901 | |
4902 | if ((vm_exit_reason != -1) && |
4903 | (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) |
4904 | vmx->nested.need_vmcs12_to_shadow_sync = true; |
4905 | |
4906 | /* in case we halted in L2 */ |
4907 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
4908 | |
4909 | if (likely(!vmx->fail)) { |
4910 | if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && |
4911 | nested_exit_intr_ack_set(vcpu)) { |
4912 | int irq = kvm_cpu_get_interrupt(vcpu); |
4913 | WARN_ON(irq < 0); |
4914 | vmcs12->vm_exit_intr_info = irq | |
4915 | INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; |
4916 | } |
4917 | |
4918 | if (vm_exit_reason != -1) |
4919 | trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, |
4920 | vmcs12->exit_qualification, |
4921 | vmcs12->idt_vectoring_info_field, |
4922 | vmcs12->vm_exit_intr_info, |
4923 | vmcs12->vm_exit_intr_error_code, |
4924 | KVM_ISA_VMX); |
4925 | |
4926 | load_vmcs12_host_state(vcpu, vmcs12); |
4927 | |
4928 | return; |
4929 | } |
4930 | |
4931 | /* |
4932 | * After an early L2 VM-entry failure, we're now back |
4933 | * in L1 which thinks it just finished a VMLAUNCH or |
4934 | * VMRESUME instruction, so we need to set the failure |
4935 | * flag and the VM-instruction error field of the VMCS |
4936 | * accordingly, and skip the emulated instruction. |
4937 | */ |
4938 | (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); |
4939 | |
4940 | /* |
4941 | * Restore L1's host state to KVM's software model. We're here |
4942 | * because a consistency check was caught by hardware, which |
4943 | * means some amount of guest state has been propagated to KVM's |
4944 | * model and needs to be unwound to the host's state. |
4945 | */ |
4946 | nested_vmx_restore_host_state(vcpu); |
4947 | |
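	/*
	 * The VM-Fail has now been fully reported to L1 via the
	 * VM-instruction error field, so clear KVM's internal failure flag.
	 */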
4948 | vmx->fail = 0; |
4949 | } |
4950 | |
4951 | static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) |
4952 | { |
4953 | kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
4954 | nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); |
4955 | } |
4956 | |
4957 | /* |
4958 | * Decode the memory-address operand of a vmx instruction, as recorded on an |
4959 | * exit caused by such an instruction (run by a guest hypervisor). |
4960 | * On success, returns 0. When the operand is invalid, returns 1 and throws |
4961 | * #UD, #GP, or #SS. |
4962 | */ |
4963 | int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, |
4964 | u32 vmx_instruction_info, bool wr, int len, gva_t *ret) |
4965 | { |
4966 | gva_t off; |
4967 | bool exn; |
4968 | struct kvm_segment s; |
4969 | |
4970 | /* |
4971 | * According to Vol. 3B, "Information for VM Exits Due to Instruction |
4972 | * Execution", on an exit, vmx_instruction_info holds most of the |
4973 | * addressing components of the operand. Only the displacement part |
4974 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). |
4975 | * For how an actual address is calculated from all these components, |
4976 | * refer to Vol. 1, "Operand Addressing". |
4977 | */ |
4978 | int scaling = vmx_instruction_info & 3; |
4979 | int addr_size = (vmx_instruction_info >> 7) & 7; |
4980 | bool is_reg = vmx_instruction_info & (1u << 10); |
4981 | int seg_reg = (vmx_instruction_info >> 15) & 7; |
4982 | int index_reg = (vmx_instruction_info >> 18) & 0xf; |
4983 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); |
4984 | int base_reg = (vmx_instruction_info >> 23) & 0xf; |
4985 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); |
4986 | |
4987 | if (is_reg) { |
4988 | kvm_queue_exception(vcpu, UD_VECTOR); |
4989 | return 1; |
4990 | } |
4991 | |
4992 | /* Addr = segment_base + offset */ |
4993 | /* offset = base + [index * scale] + displacement */ |
4994 | off = exit_qualification; /* holds the displacement */ |
4995 | if (addr_size == 1) |
4996 | off = (gva_t)sign_extend64(off, 31); |
4997 | else if (addr_size == 0) |
4998 | off = (gva_t)sign_extend64(off, 15); |
4999 | if (base_is_valid) |
5000 | off += kvm_register_read(vcpu, base_reg); |
5001 | if (index_is_valid) |
5002 | off += kvm_register_read(vcpu, index_reg) << scaling; |
5003 | vmx_get_segment(vcpu, &s, seg_reg); |
5004 | |
5005 | /* |
5006 | * The effective address, i.e. @off, of a memory operand is truncated |
5007 | * based on the address size of the instruction. Note that this is |
5008 | * the *effective address*, i.e. the address prior to accounting for |
5009 | * the segment's base. |
5010 | */ |
5011 | if (addr_size == 1) /* 32 bit */ |
5012 | off &= 0xffffffff; |
5013 | else if (addr_size == 0) /* 16 bit */ |
5014 | off &= 0xffff; |
5015 | |
5016 | /* Checks for #GP/#SS exceptions. */ |
5017 | exn = false; |
5018 | if (is_long_mode(vcpu)) { |
5019 | /* |
5020 | * The virtual/linear address is never truncated in 64-bit |
5021 | * mode, e.g. a 32-bit address size can yield a 64-bit virtual |
5022 | * address when using FS/GS with a non-zero base. |
5023 | */ |
5024 | if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) |
5025 | *ret = s.base + off; |
5026 | else |
5027 | *ret = off; |
5028 | |
5029 | *ret = vmx_get_untagged_addr(vcpu, *ret, 0); |
5030 | /* Long mode: #GP(0)/#SS(0) if the memory address is in a |
5031 | * non-canonical form. This is the only check on the memory |
5032 | * destination for long mode! |
5033 | */ |
5034 | exn = is_noncanonical_address(*ret, vcpu); |
5035 | } else { |
5036 | /* |
5037 | * When not in long mode, the virtual/linear address is |
5038 | * unconditionally truncated to 32 bits regardless of the |
5039 | * address size. |
5040 | */ |
5041 | *ret = (s.base + off) & 0xffffffff; |
5042 | |
5043 | /* Protected mode: apply checks for segment validity in the |
5044 | * following order: |
5045 | * - segment type check (#GP(0) may be thrown) |
5046 | * - usability check (#GP(0)/#SS(0)) |
5047 | * - limit check (#GP(0)/#SS(0)) |
5048 | */ |
5049 | if (wr) |
5050 | /* #GP(0) if the destination operand is located in a |
5051 | * read-only data segment or any code segment. |
5052 | */ |
5053 | exn = ((s.type & 0xa) == 0 || (s.type & 8)); |
5054 | else |
5055 | /* #GP(0) if the source operand is located in an |
5056 | * execute-only code segment |
5057 | */ |
5058 | exn = ((s.type & 0xa) == 8); |
5059 | if (exn) { |
5060 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); |
5061 | return 1; |
5062 | } |
5063 | /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. |
5064 | */ |
5065 | exn = (s.unusable != 0); |
5066 | |
5067 | /* |
5068 | * Protected mode: #GP(0)/#SS(0) if the memory operand is |
5069 | * outside the segment limit. All CPUs that support VMX ignore |
5070 | * limit checks for flat segments, i.e. segments with base==0, |
5071 | * limit==0xffffffff and of type expand-up data or code. |
5072 | */ |
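		/*
		 * Segment type bit 3 (0x8) distinguishes code from data; for
		 * data segments, a clear bit 2 (0x4) means expand-up.
		 */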
5073 | if (!(s.base == 0 && s.limit == 0xffffffff && |
5074 | ((s.type & 8) || !(s.type & 4)))) |
5075 | exn = exn || ((u64)off + len - 1 > s.limit); |
5076 | } |
5077 | if (exn) { |
5078 | kvm_queue_exception_e(vcpu, |
5079 | seg_reg == VCPU_SREG_SS ? |
5080 | SS_VECTOR : GP_VECTOR, |
5081 | 0); |
5082 | return 1; |
5083 | } |
5084 | |
5085 | return 0; |
5086 | } |
5087 | |
5088 | static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, |
5089 | int *ret) |
5090 | { |
5091 | gva_t gva; |
5092 | struct x86_exception e; |
5093 | int r; |
5094 | |
5095 | if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), |
5096 | vmcs_read32(VMX_INSTRUCTION_INFO), false, |
5097 | sizeof(*vmpointer), &gva)) { |
5098 | *ret = 1; |
5099 | return -EINVAL; |
5100 | } |
5101 | |
5102 | r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); |
5103 | if (r != X86EMUL_CONTINUE) { |
5104 | *ret = kvm_handle_memory_failure(vcpu, r, &e); |
5105 | return -EINVAL; |
5106 | } |
5107 | |
5108 | return 0; |
5109 | } |
5110 | |
5111 | /* |
5112 | * Allocate a shadow VMCS and associate it with the currently loaded |
5113 | * VMCS, unless such a shadow VMCS already exists. The newly allocated |
5114 | * VMCS is also VMCLEARed, so that it is ready for use. |
5115 | */ |
5116 | static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) |
5117 | { |
5118 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5119 | struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; |
5120 | |
5121 | /* |
5122 | * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it |
5123 | * when L1 executes VMXOFF or the vCPU is forced out of nested |
5124 | * operation. VMXON faults if the CPU is already post-VMXON, so it |
5125 | * should be impossible to already have an allocated shadow VMCS. KVM |
5126 | * doesn't support virtualization of VMCS shadowing, so vmcs01 should |
5127 | * always be the loaded VMCS. |
5128 | */ |
5129 | if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) |
5130 | return loaded_vmcs->shadow_vmcs; |
5131 | |
5132 | loaded_vmcs->shadow_vmcs = alloc_vmcs(true); |
5133 | if (loaded_vmcs->shadow_vmcs) |
5134 | vmcs_clear(loaded_vmcs->shadow_vmcs); |
5135 | |
5136 | return loaded_vmcs->shadow_vmcs; |
5137 | } |
5138 | |
5139 | static int enter_vmx_operation(struct kvm_vcpu *vcpu) |
5140 | { |
5141 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5142 | int r; |
5143 | |
5144 | r = alloc_loaded_vmcs(&vmx->nested.vmcs02); |
5145 | if (r < 0) |
5146 | goto out_vmcs02; |
5147 | |
5148 | vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); |
5149 | if (!vmx->nested.cached_vmcs12) |
5150 | goto out_cached_vmcs12; |
5151 | |
5152 | vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; |
5153 | vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); |
5154 | if (!vmx->nested.cached_shadow_vmcs12) |
5155 | goto out_cached_shadow_vmcs12; |
5156 | |
5157 | if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) |
5158 | goto out_shadow_vmcs; |
5159 | |
5160 | hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, |
5161 | HRTIMER_MODE_ABS_PINNED); |
5162 | vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; |
5163 | |
5164 | vmx->nested.vpid02 = allocate_vpid(); |
5165 | |
5166 | vmx->nested.vmcs02_initialized = false; |
5167 | vmx->nested.vmxon = true; |
5168 | |
5169 | if (vmx_pt_mode_is_host_guest()) { |
5170 | vmx->pt_desc.guest.ctl = 0; |
5171 | pt_update_intercept_for_msr(vcpu); |
5172 | } |
5173 | |
5174 | return 0; |
5175 | |
5176 | out_shadow_vmcs: |
5177 | kfree(vmx->nested.cached_shadow_vmcs12); |
5178 | |
5179 | out_cached_shadow_vmcs12: |
5180 | kfree(vmx->nested.cached_vmcs12); |
5181 | |
5182 | out_cached_vmcs12: |
5183 | free_loaded_vmcs(&vmx->nested.vmcs02); |
5184 | |
5185 | out_vmcs02: |
5186 | return -ENOMEM; |
5187 | } |
5188 | |
5189 | /* Emulate the VMXON instruction. */ |
5190 | static int handle_vmxon(struct kvm_vcpu *vcpu) |
5191 | { |
5192 | int ret; |
5193 | gpa_t vmptr; |
5194 | uint32_t revision; |
5195 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
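	/*
	 * VMXON is allowed only if IA32_FEATURE_CONTROL is locked with the
	 * "VMX outside SMX" enable bit set; both bits are checked below.
	 */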
5196 | const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED |
5197 | | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; |
5198 | |
5199 | /* |
5200 | * Manually check CR4.VMXE; KVM must force CR4.VMXE=1 to enter |
5201 | * the guest and so cannot rely on hardware to perform the check, |
5202 | * which has higher priority than VM-Exit (see Intel SDM's pseudocode |
5203 | * for VMXON). |
5204 | * |
5205 | * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 |
5206 | * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't |
5207 | * force any of the relevant guest state. For a restricted guest, KVM |
5208 | * does force CR0.PE=1, but only to also force VM86 in order to emulate |
5209 | * Real Mode, and so there's no need to check CR0.PE manually. |
5210 | */ |
5211 | if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { |
5212 | kvm_queue_exception(vcpu, UD_VECTOR); |
5213 | return 1; |
5214 | } |
5215 | |
5216 | /* |
5217 | * The CPL is checked for "not in VMX operation" and for "in VMX root", |
5218 | * and has higher priority than the VM-Fail due to being post-VMXON, |
5219 | * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, |
5220 | * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits |
5221 | * from L2 to L1, i.e. there's no need to check for the vCPU being in |
5222 | * VMX non-root. |
5223 | * |
5224 | * Forwarding the VM-Exit unconditionally, i.e. without performing the |
5225 | * #UD checks (see above), is functionally ok because KVM doesn't allow |
5226 | * L1 to run L2 with CR4.VMXE=0, and because KVM never modifies L2's |
5227 | * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are |
5228 | * missed by hardware due to shadowing CR0 and/or CR4. |
5229 | */ |
5230 | if (vmx_get_cpl(vcpu)) { |
5231 | kvm_inject_gp(vcpu, 0); |
5232 | return 1; |
5233 | } |
5234 | |
5235 | if (vmx->nested.vmxon) |
5236 | return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); |
5237 | |
5238 | /* |
5239 | * Invalid CR0/CR4 generates #GP. These checks are performed if and |
5240 | * only if the vCPU isn't already in VMX operation, i.e. effectively |
5241 | * have lower priority than the VM-Fail above. |
5242 | */ |
5243 | if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || |
5244 | !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { |
5245 | kvm_inject_gp(vcpu, 0); |
5246 | return 1; |
5247 | } |
5248 | |
5249 | if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) |
5250 | != VMXON_NEEDED_FEATURES) { |
5251 | kvm_inject_gp(vcpu, 0); |
5252 | return 1; |
5253 | } |
5254 | |
5255 | if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) |
5256 | return ret; |
5257 | |
5258 | /* |
5259 | * SDM 3: 24.11.5 |
5260 | * The first 4 bytes of the VMXON region contain the supported |
5261 | * VMCS revision identifier. |
5262 | * |
5263 | * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case, |
5264 | * which replaces physical address width with 32. |
5265 | */ |
5266 | if (!page_address_valid(vcpu, vmptr)) |
5267 | return nested_vmx_failInvalid(vcpu); |
5268 | |
5269 | if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || |
5270 | revision != VMCS12_REVISION) |
5271 | return nested_vmx_failInvalid(vcpu); |
5272 | |
5273 | vmx->nested.vmxon_ptr = vmptr; |
5274 | ret = enter_vmx_operation(vcpu); |
5275 | if (ret) |
5276 | return ret; |
5277 | |
5278 | return nested_vmx_succeed(vcpu); |
5279 | } |
5280 | |
5281 | static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) |
5282 | { |
5283 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5284 | |
5285 | if (vmx->nested.current_vmptr == INVALID_GPA) |
5286 | return; |
5287 | |
5288 | copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); |
5289 | |
5290 | if (enable_shadow_vmcs) { |
5291 | /* copy to memory all shadowed fields in case |
5292 | they were modified */ |
5293 | copy_shadow_to_vmcs12(vmx); |
5294 | vmx_disable_shadow_vmcs(vmx); |
5295 | } |
5296 | vmx->nested.posted_intr_nv = -1; |
5297 | |
5298 | /* Flush VMCS12 to guest memory */ |
5299 | kvm_vcpu_write_guest_page(vcpu, |
5300 | vmx->nested.current_vmptr >> PAGE_SHIFT, |
5301 | vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); |
5302 | |
5303 | kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); |
5304 | |
5305 | vmx->nested.current_vmptr = INVALID_GPA; |
5306 | } |
5307 | |
5308 | /* Emulate the VMXOFF instruction */ |
5309 | static int handle_vmxoff(struct kvm_vcpu *vcpu) |
5310 | { |
5311 | if (!nested_vmx_check_permission(vcpu)) |
5312 | return 1; |
5313 | |
5314 | free_nested(vcpu); |
5315 | |
5316 | if (kvm_apic_has_pending_init_or_sipi(vcpu)) |
5317 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5318 | |
5319 | return nested_vmx_succeed(vcpu); |
5320 | } |
5321 | |
5322 | /* Emulate the VMCLEAR instruction */ |
5323 | static int handle_vmclear(struct kvm_vcpu *vcpu) |
5324 | { |
5325 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5326 | u32 zero = 0; |
5327 | gpa_t vmptr; |
5328 | int r; |
5329 | |
5330 | if (!nested_vmx_check_permission(vcpu)) |
5331 | return 1; |
5332 | |
5333 | if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) |
5334 | return r; |
5335 | |
5336 | if (!page_address_valid(vcpu, vmptr)) |
5337 | return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); |
5338 | |
5339 | if (vmptr == vmx->nested.vmxon_ptr) |
5340 | return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); |
5341 | |
5342 | if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { |
5343 | if (vmptr == vmx->nested.current_vmptr) |
5344 | nested_release_vmcs12(vcpu); |
5345 | |
5346 | /* |
5347 | * Silently ignore memory errors on VMCLEAR, Intel's pseudocode |
5348 | * for VMCLEAR includes an "ensure that data for VMCS referenced |
5349 | * by the operand is in memory" clause that guards writes to |
5350 | * memory, i.e. doing nothing for I/O is architecturally valid. |
5351 | * |
5352 | * FIXME: Suppress failures if and only if no memslot is found, |
5353 | * i.e. exit to userspace if __copy_to_user() fails. |
5354 | */ |
5355 | (void)kvm_vcpu_write_guest(vcpu, |
5356 | vmptr + offsetof(struct vmcs12, |
5357 | launch_state), |
5358 | &zero, sizeof(zero)); |
5359 | } |
5360 | |
5361 | return nested_vmx_succeed(vcpu); |
5362 | } |
5363 | |
5364 | /* Emulate the VMLAUNCH instruction */ |
5365 | static int handle_vmlaunch(struct kvm_vcpu *vcpu) |
5366 | { |
5367 | return nested_vmx_run(vcpu, true); |
5368 | } |
5369 | |
5370 | /* Emulate the VMRESUME instruction */ |
5371 | static int handle_vmresume(struct kvm_vcpu *vcpu) |
5372 | { |
5373 | |
5374 | return nested_vmx_run(vcpu, false); |
5375 | } |
5376 | |
5377 | static int handle_vmread(struct kvm_vcpu *vcpu) |
5378 | { |
5379 | struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) |
5380 | : get_vmcs12(vcpu); |
5381 | unsigned long exit_qualification = vmx_get_exit_qual(vcpu); |
5382 | u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); |
5383 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5384 | struct x86_exception e; |
5385 | unsigned long field; |
5386 | u64 value; |
5387 | gva_t gva = 0; |
5388 | short offset; |
5389 | int len, r; |
5390 | |
5391 | if (!nested_vmx_check_permission(vcpu)) |
5392 | return 1; |
5393 | |
5394 | /* Decode instruction info and find the field to read */ |
5395 | field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); |
5396 | |
5397 | if (!nested_vmx_is_evmptr12_valid(vmx)) { |
5398 | /* |
5399 | * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, |
5400 | * any VMREAD sets the ALU flags for VMfailInvalid. |
5401 | */ |
5402 | if (vmx->nested.current_vmptr == INVALID_GPA || |
5403 | (is_guest_mode(vcpu) && |
5404 | get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) |
5405 | return nested_vmx_failInvalid(vcpu); |
5406 | |
5407 | offset = get_vmcs12_field_offset(field); |
5408 | if (offset < 0) |
5409 | return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); |
5410 | |
5411 | if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) |
5412 | copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); |
5413 | |
5414 | /* Read the field, zero-extended to a u64 value */ |
5415 | value = vmcs12_read_any(vmcs12, field, offset); |
5416 | } else { |
5417 | /* |
5418 | * Hyper-V TLFS (as of 6.0b) explicitly states, that while an |
5419 | * enlightened VMCS is active VMREAD/VMWRITE instructions are |
5420 | * unsupported. Unfortunately, certain versions of Windows 11 |
5421 | * don't comply with this requirement which is not enforced in |
5422 | * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a |
5423 | * workaround, as misbehaving guests will panic on VM-Fail. |
5424 | * Note, enlightened VMCS is incompatible with shadow VMCS so |
5425 | * all VMREADs from L2 should go to L1. |
5426 | */ |
5427 | if (WARN_ON_ONCE(is_guest_mode(vcpu))) |
5428 | return nested_vmx_failInvalid(vcpu); |
5429 | |
5430 | offset = evmcs_field_offset(field, NULL); |
5431 | if (offset < 0) |
5432 | return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); |
5433 | |
5434 | /* Read the field, zero-extended to a u64 value */ |
5435 | value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); |
5436 | } |
5437 | |
5438 | /* |
5439 | * Now copy part of this value to register or memory, as requested. |
5440 | * Note that the number of bits actually copied is 32 or 64 depending |
5441 | * on the guest's mode (32 or 64 bit), not on the given field's length. |
5442 | */ |
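	/*
	 * Bit 10 of the VMX instruction info field is set when the
	 * destination operand is a register rather than memory.
	 */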
5443 | if (instr_info & BIT(10)) { |
5444 | kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); |
5445 | } else { |
5446 | len = is_64_bit_mode(vcpu) ? 8 : 4; |
5447 | if (get_vmx_mem_address(vcpu, exit_qualification, |
5448 | instr_info, true, len, &gva)) |
5449 | return 1; |
5450 | /* _system ok, nested_vmx_check_permission has verified cpl=0 */ |
5451 | r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); |
5452 | if (r != X86EMUL_CONTINUE) |
5453 | return kvm_handle_memory_failure(vcpu, r, &e); |
5454 | } |
5455 | |
5456 | return nested_vmx_succeed(vcpu); |
5457 | } |
5458 | |
5459 | static bool is_shadow_field_rw(unsigned long field) |
5460 | { |
5461 | switch (field) { |
5462 | #define SHADOW_FIELD_RW(x, y) case x: |
5463 | #include "vmcs_shadow_fields.h" |
5464 | return true; |
5465 | default: |
5466 | break; |
5467 | } |
5468 | return false; |
5469 | } |
5470 | |
5471 | static bool is_shadow_field_ro(unsigned long field) |
5472 | { |
5473 | switch (field) { |
5474 | #define SHADOW_FIELD_RO(x, y) case x: |
5475 | #include "vmcs_shadow_fields.h" |
5476 | return true; |
5477 | default: |
5478 | break; |
5479 | } |
5480 | return false; |
5481 | } |
5482 | |
5483 | static int handle_vmwrite(struct kvm_vcpu *vcpu) |
5484 | { |
5485 | struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) |
5486 | : get_vmcs12(vcpu); |
5487 | unsigned long exit_qualification = vmx_get_exit_qual(vcpu); |
5488 | u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); |
5489 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5490 | struct x86_exception e; |
5491 | unsigned long field; |
5492 | short offset; |
5493 | gva_t gva; |
5494 | int len, r; |
5495 | |
5496 | /* |
5497 | * The value to write might be 32 or 64 bits, depending on L1's long |
5498 | * mode, and eventually we need to write that into a field of several |
5499 | * possible lengths. The code below first zero-extends the value to 64 |
5500 | * bit (value), and then copies only the appropriate number of |
5501 | * bits into the vmcs12 field. |
5502 | */ |
5503 | u64 value = 0; |
5504 | |
5505 | if (!nested_vmx_check_permission(vcpu)) |
5506 | return 1; |
5507 | |
5508 | /* |
5509 | * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, |
5510 | * any VMWRITE sets the ALU flags for VMfailInvalid. |
5511 | */ |
5512 | if (vmx->nested.current_vmptr == INVALID_GPA || |
5513 | (is_guest_mode(vcpu) && |
5514 | get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) |
5515 | return nested_vmx_failInvalid(vcpu); |
5516 | |
5517 | if (instr_info & BIT(10)) |
5518 | value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); |
5519 | else { |
5520 | len = is_64_bit_mode(vcpu) ? 8 : 4; |
5521 | if (get_vmx_mem_address(vcpu, exit_qualification, |
5522 | instr_info, false, len, &gva)) |
5523 | return 1; |
5524 | r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); |
5525 | if (r != X86EMUL_CONTINUE) |
5526 | return kvm_handle_memory_failure(vcpu, r, &e); |
5527 | } |
5528 | |
5529 | field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); |
5530 | |
5531 | offset = get_vmcs12_field_offset(field); |
5532 | if (offset < 0) |
5533 | return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); |
5534 | |
5535 | /* |
5536 | * If the vCPU supports "VMWRITE to any supported field in the |
5537 | * VMCS," then the "read-only" fields are actually read/write. |
5538 | */ |
5539 | if (vmcs_field_readonly(field) && |
5540 | !nested_cpu_has_vmwrite_any_field(vcpu)) |
5541 | return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); |
5542 | |
5543 | /* |
5544 | * Ensure vmcs12 is up-to-date before any VMWRITE that dirties |
5545 | * vmcs12, else we may crush a field or consume a stale value. |
5546 | */ |
5547 | if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) |
5548 | copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); |
5549 | |
5550 | /* |
5551 | * Some Intel CPUs intentionally drop the reserved bits of the AR byte |
5552 | * fields on VMWRITE. Emulate this behavior to ensure consistent KVM |
5553 | * behavior regardless of the underlying hardware, e.g. if an AR_BYTE |
5554 | * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD |
5555 | * from L1 will return a different value than VMREAD from L2 (L1 sees |
5556 | * the stripped down value, L2 sees the full value as stored by KVM). |
5557 | */ |
5558 | if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) |
5559 | value &= 0x1f0ff; |
5560 | |
5561 | vmcs12_write_any(vmcs12, field, offset, value); |
5562 | |
5563 | /* |
5564 | * Do not track vmcs12 dirty-state if in guest-mode as we actually |
5565 | * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated |
5566 | * by L1 without a vmexit are always updated in the vmcs02, i.e. don't |
5567 | * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. |
5568 | */ |
5569 | if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { |
5570 | /* |
5571 | * L1 can read these fields without exiting, ensure the |
5572 | * shadow VMCS is up-to-date. |
5573 | */ |
5574 | if (enable_shadow_vmcs && is_shadow_field_ro(field)) { |
5575 | preempt_disable(); |
5576 | vmcs_load(vmx->vmcs01.shadow_vmcs); |
5577 | |
5578 | __vmcs_writel(field, value); |
5579 | |
5580 | vmcs_clear(vmx->vmcs01.shadow_vmcs); |
5581 | vmcs_load(vmx->loaded_vmcs->vmcs); |
5582 | preempt_enable(); |
5583 | } |
5584 | vmx->nested.dirty_vmcs12 = true; |
5585 | } |
5586 | |
5587 | return nested_vmx_succeed(vcpu); |
5588 | } |
5589 | |
5590 | static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) |
5591 | { |
5592 | vmx->nested.current_vmptr = vmptr; |
5593 | if (enable_shadow_vmcs) { |
5594 | secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); |
5595 | vmcs_write64(VMCS_LINK_POINTER, |
5596 | __pa(vmx->vmcs01.shadow_vmcs)); |
5597 | vmx->nested.need_vmcs12_to_shadow_sync = true; |
5598 | } |
5599 | vmx->nested.dirty_vmcs12 = true; |
5600 | vmx->nested.force_msr_bitmap_recalc = true; |
5601 | } |
5602 | |
5603 | /* Emulate the VMPTRLD instruction */ |
5604 | static int handle_vmptrld(struct kvm_vcpu *vcpu) |
5605 | { |
5606 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5607 | gpa_t vmptr; |
5608 | int r; |
5609 | |
5610 | if (!nested_vmx_check_permission(vcpu)) |
5611 | return 1; |
5612 | |
5613 | if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) |
5614 | return r; |
5615 | |
5616 | if (!page_address_valid(vcpu, vmptr)) |
5617 | return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); |
5618 | |
5619 | if (vmptr == vmx->nested.vmxon_ptr) |
5620 | return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); |
5621 | |
5622 | /* Forbid normal VMPTRLD if Enlightened version was used */ |
5623 | if (nested_vmx_is_evmptr12_valid(vmx)) |
5624 | return 1; |
5625 | |
5626 | if (vmx->nested.current_vmptr != vmptr) { |
5627 | struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; |
5628 | struct vmcs_hdr hdr; |
5629 | |
5630 | if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { |
5631 | /* |
5632 | * Reads from an unbacked page return all 1s, |
5633 | * which means that the 32 bits located at the |
5634 | * given physical address won't match the required |
5635 | * VMCS12_REVISION identifier. |
5636 | */ |
5637 | return nested_vmx_fail(vcpu, |
5638 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); |
5639 | } |
5640 | |
5641 | if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, |
5642 | offsetof(struct vmcs12, hdr), |
5643 | sizeof(hdr))) { |
5644 | return nested_vmx_fail(vcpu, |
5645 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); |
5646 | } |
5647 | |
5648 | if (hdr.revision_id != VMCS12_REVISION || |
5649 | (hdr.shadow_vmcs && |
5650 | !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { |
5651 | return nested_vmx_fail(vcpu, |
5652 | VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); |
5653 | } |
5654 | |
5655 | nested_release_vmcs12(vcpu); |
5656 | |
5657 | /* |
5658 | * Load VMCS12 from guest memory since it is not already |
5659 | * cached. |
5660 | */ |
		if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
					  VMCS12_SIZE)) {
			return nested_vmx_fail(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5665 | } |
5666 | |
5667 | set_current_vmptr(vmx, vmptr); |
5668 | } |
5669 | |
5670 | return nested_vmx_succeed(vcpu); |
5671 | } |
5672 | |
5673 | /* Emulate the VMPTRST instruction */ |
5674 | static int handle_vmptrst(struct kvm_vcpu *vcpu) |
5675 | { |
5676 | unsigned long exit_qual = vmx_get_exit_qual(vcpu); |
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5678 | gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; |
5679 | struct x86_exception e; |
5680 | gva_t gva; |
5681 | int r; |
5682 | |
5683 | if (!nested_vmx_check_permission(vcpu)) |
5684 | return 1; |
5685 | |
5686 | if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) |
5687 | return 1; |
5688 | |
	if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
				true, sizeof(gpa_t), &gva))
		return 1;
	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
	r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
					sizeof(gpa_t), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);
5697 | |
5698 | return nested_vmx_succeed(vcpu); |
5699 | } |
5700 | |
5701 | /* Emulate the INVEPT instruction */ |
5702 | static int handle_invept(struct kvm_vcpu *vcpu) |
5703 | { |
5704 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5705 | u32 vmx_instruction_info, types; |
5706 | unsigned long type, roots_to_free; |
5707 | struct kvm_mmu *mmu; |
5708 | gva_t gva; |
5709 | struct x86_exception e; |
5710 | struct { |
5711 | u64 eptp, gpa; |
5712 | } operand; |
5713 | int i, r, gpr_index; |
5714 | |
5715 | if (!(vmx->nested.msrs.secondary_ctls_high & |
5716 | SECONDARY_EXEC_ENABLE_EPT) || |
5717 | !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { |
5718 | kvm_queue_exception(vcpu, UD_VECTOR); |
5719 | return 1; |
5720 | } |
5721 | |
5722 | if (!nested_vmx_check_permission(vcpu)) |
5723 | return 1; |
5724 | |
	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
	type = kvm_register_read(vcpu, gpr_index);

	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5733 | |
5734 | /* According to the Intel VMX instruction reference, the memory |
5735 | * operand is read even if it isn't needed (e.g., for type==global) |
5736 | */ |
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);
5743 | |
5744 | /* |
5745 | * Nested EPT roots are always held through guest_mmu, |
5746 | * not root_mmu. |
5747 | */ |
5748 | mmu = &vcpu->arch.guest_mmu; |
5749 | |
5750 | switch (type) { |
5751 | case VMX_EPT_EXTENT_CONTEXT: |
		if (!nested_vmx_check_eptp(vcpu, operand.eptp))
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

		roots_to_free = 0;
		if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
					    operand.eptp))
			roots_to_free |= KVM_MMU_ROOT_CURRENT;

		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
			if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
						    mmu->prev_roots[i].pgd,
						    operand.eptp))
5765 | roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); |
5766 | } |
5767 | break; |
5768 | case VMX_EPT_EXTENT_GLOBAL: |
5769 | roots_to_free = KVM_MMU_ROOTS_ALL; |
5770 | break; |
5771 | default: |
5772 | BUG(); |
5773 | break; |
5774 | } |
5775 | |
5776 | if (roots_to_free) |
		kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
5778 | |
5779 | return nested_vmx_succeed(vcpu); |
5780 | } |
5781 | |
5782 | static int handle_invvpid(struct kvm_vcpu *vcpu) |
5783 | { |
5784 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5785 | u32 vmx_instruction_info; |
5786 | unsigned long type, types; |
5787 | gva_t gva; |
5788 | struct x86_exception e; |
5789 | struct { |
5790 | u64 vpid; |
5791 | u64 gla; |
5792 | } operand; |
5793 | u16 vpid02; |
5794 | int r, gpr_index; |
5795 | |
5796 | if (!(vmx->nested.msrs.secondary_ctls_high & |
5797 | SECONDARY_EXEC_ENABLE_VPID) || |
5798 | !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { |
5799 | kvm_queue_exception(vcpu, UD_VECTOR); |
5800 | return 1; |
5801 | } |
5802 | |
5803 | if (!nested_vmx_check_permission(vcpu)) |
5804 | return 1; |
5805 | |
	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
	type = kvm_register_read(vcpu, gpr_index);

	types = (vmx->nested.msrs.vpid_caps &
			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_fail(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5816 | |
	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);
5826 | |
5827 | if (operand.vpid >> 16) |
		return nested_vmx_fail(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5830 | |
5831 | vpid02 = nested_get_vpid02(vcpu); |
5832 | switch (type) { |
5833 | case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: |
5834 | /* |
5835 | * LAM doesn't apply to addresses that are inputs to TLB |
5836 | * invalidation. |
5837 | */ |
5838 | if (!operand.vpid || |
		    is_noncanonical_address(operand.gla, vcpu))
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		vpid_sync_vcpu_addr(vpid02, operand.gla);
		break;
	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
		if (!operand.vpid)
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		vpid_sync_context(vpid02);
		break;
	case VMX_VPID_EXTENT_ALL_CONTEXT:
		vpid_sync_context(vpid02);
5853 | break; |
5854 | default: |
5855 | WARN_ON_ONCE(1); |
5856 | return kvm_skip_emulated_instruction(vcpu); |
5857 | } |
5858 | |
5859 | /* |
5860 | * Sync the shadow page tables if EPT is disabled, L1 is invalidating |
5861 | * linear mappings for L2 (tagged with L2's VPID). Free all guest |
5862 | * roots as VPIDs are not tracked in the MMU role. |
5863 | * |
5864 | * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share |
5865 | * an MMU when EPT is disabled. |
5866 | * |
	 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
5868 | */ |
5869 | if (!enable_ept) |
		kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5871 | |
5872 | return nested_vmx_succeed(vcpu); |
5873 | } |
5874 | |
5875 | static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, |
5876 | struct vmcs12 *vmcs12) |
5877 | { |
5878 | u32 index = kvm_rcx_read(vcpu); |
5879 | u64 new_eptp; |
5880 | |
5881 | if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) |
5882 | return 1; |
5883 | if (index >= VMFUNC_EPTP_ENTRIES) |
5884 | return 1; |
5885 | |
	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
				     &new_eptp, index * 8, 8))
5888 | return 1; |
5889 | |
5890 | /* |
5891 | * If the (L2) guest does a vmfunc to the currently |
5892 | * active ept pointer, we don't have to do anything else |
5893 | */ |
5894 | if (vmcs12->ept_pointer != new_eptp) { |
5895 | if (!nested_vmx_check_eptp(vcpu, new_eptp)) |
5896 | return 1; |
5897 | |
5898 | vmcs12->ept_pointer = new_eptp; |
5899 | nested_ept_new_eptp(vcpu); |
5900 | |
5901 | if (!nested_cpu_has_vpid(vmcs12)) |
5902 | kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
5903 | } |
5904 | |
5905 | return 0; |
5906 | } |
5907 | |
5908 | static int handle_vmfunc(struct kvm_vcpu *vcpu) |
5909 | { |
5910 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5911 | struct vmcs12 *vmcs12; |
5912 | u32 function = kvm_rax_read(vcpu); |
5913 | |
5914 | /* |
5915 | * VMFUNC should never execute cleanly while L1 is active; KVM supports |
5916 | * VMFUNC for nested VMs, but not for L1. |
5917 | */ |
5918 | if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { |
5919 | kvm_queue_exception(vcpu, UD_VECTOR); |
5920 | return 1; |
5921 | } |
5922 | |
5923 | vmcs12 = get_vmcs12(vcpu); |
5924 | |
5925 | /* |
5926 | * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC |
5927 | * is enabled in vmcs02 if and only if it's enabled in vmcs12. |
5928 | */ |
5929 | if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { |
5930 | kvm_queue_exception(vcpu, UD_VECTOR); |
5931 | return 1; |
5932 | } |
5933 | |
5934 | if (!(vmcs12->vm_function_control & BIT_ULL(function))) |
5935 | goto fail; |
5936 | |
5937 | switch (function) { |
5938 | case 0: |
5939 | if (nested_vmx_eptp_switching(vcpu, vmcs12)) |
5940 | goto fail; |
5941 | break; |
5942 | default: |
5943 | goto fail; |
5944 | } |
5945 | return kvm_skip_emulated_instruction(vcpu); |
5946 | |
5947 | fail: |
5948 | /* |
5949 | * This is effectively a reflected VM-Exit, as opposed to a synthesized |
5950 | * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode |
5951 | * EXIT_REASON_VMFUNC as the exit reason. |
5952 | */ |
	nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
			  vmx_get_intr_info(vcpu),
			  vmx_get_exit_qual(vcpu));
5956 | return 1; |
5957 | } |
5958 | |
5959 | /* |
5960 | * Return true if an IO instruction with the specified port and size should cause |
5961 | * a VM-exit into L1. |
5962 | */ |
5963 | bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, |
5964 | int size) |
5965 | { |
5966 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
5967 | gpa_t bitmap, last_bitmap; |
5968 | u8 b; |
5969 | |
5970 | last_bitmap = INVALID_GPA; |
5971 | b = -1; |
5972 | |
5973 | while (size > 0) { |
5974 | if (port < 0x8000) |
5975 | bitmap = vmcs12->io_bitmap_a; |
5976 | else if (port < 0x10000) |
5977 | bitmap = vmcs12->io_bitmap_b; |
5978 | else |
5979 | return true; |
5980 | bitmap += (port & 0x7fff) / 8; |
5981 | |
5982 | if (last_bitmap != bitmap) |
			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5984 | return true; |
5985 | if (b & (1 << (port & 7))) |
5986 | return true; |
5987 | |
5988 | port++; |
5989 | size--; |
5990 | last_bitmap = bitmap; |
5991 | } |
5992 | |
5993 | return false; |
5994 | } |
5995 | |
5996 | static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, |
5997 | struct vmcs12 *vmcs12) |
5998 | { |
5999 | unsigned long exit_qualification; |
6000 | unsigned short port; |
6001 | int size; |
6002 | |
6003 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) |
6004 | return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); |
6005 | |
6006 | exit_qualification = vmx_get_exit_qual(vcpu); |
6007 | |
6008 | port = exit_qualification >> 16; |
6009 | size = (exit_qualification & 7) + 1; |
6010 | |
6011 | return nested_vmx_check_io_bitmaps(vcpu, port, size); |
6012 | } |
6013 | |
6014 | /* |
 * Return true if we should exit from L2 to L1 to handle an MSR access,
6016 | * rather than handle it ourselves in L0. I.e., check whether L1 expressed |
6017 | * disinterest in the current event (read or write a specific MSR) by using an |
6018 | * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. |
6019 | */ |
6020 | static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, |
6021 | struct vmcs12 *vmcs12, |
6022 | union vmx_exit_reason exit_reason) |
6023 | { |
6024 | u32 msr_index = kvm_rcx_read(vcpu); |
6025 | gpa_t bitmap; |
6026 | |
6027 | if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) |
6028 | return true; |
6029 | |
6030 | /* |
6031 | * The MSR_BITMAP page is divided into four 1024-byte bitmaps, |
6032 | * for the four combinations of read/write and low/high MSR numbers. |
6033 | * First we need to figure out which of the four to use: |
6034 | */ |
6035 | bitmap = vmcs12->msr_bitmap; |
6036 | if (exit_reason.basic == EXIT_REASON_MSR_WRITE) |
6037 | bitmap += 2048; |
6038 | if (msr_index >= 0xc0000000) { |
6039 | msr_index -= 0xc0000000; |
6040 | bitmap += 1024; |
6041 | } |
6042 | |
6043 | /* Then read the msr_index'th bit from this bitmap: */ |
6044 | if (msr_index < 1024*8) { |
6045 | unsigned char b; |
		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
6047 | return true; |
6048 | return 1 & (b >> (msr_index & 7)); |
6049 | } else |
6050 | return true; /* let L1 handle the wrong parameter */ |
6051 | } |
6052 | |
6053 | /* |
 * Return true if we should exit from L2 to L1 to handle a CR access exit,
6055 | * rather than handle it ourselves in L0. I.e., check if L1 wanted to |
6056 | * intercept (via guest_host_mask etc.) the current event. |
6057 | */ |
6058 | static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, |
6059 | struct vmcs12 *vmcs12) |
6060 | { |
6061 | unsigned long exit_qualification = vmx_get_exit_qual(vcpu); |
6062 | int cr = exit_qualification & 15; |
6063 | int reg; |
6064 | unsigned long val; |
6065 | |
6066 | switch ((exit_qualification >> 4) & 3) { |
6067 | case 0: /* mov to cr */ |
6068 | reg = (exit_qualification >> 8) & 15; |
6069 | val = kvm_register_read(vcpu, reg); |
6070 | switch (cr) { |
6071 | case 0: |
6072 | if (vmcs12->cr0_guest_host_mask & |
6073 | (val ^ vmcs12->cr0_read_shadow)) |
6074 | return true; |
6075 | break; |
6076 | case 3: |
6077 | if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) |
6078 | return true; |
6079 | break; |
6080 | case 4: |
6081 | if (vmcs12->cr4_guest_host_mask & |
6082 | (vmcs12->cr4_read_shadow ^ val)) |
6083 | return true; |
6084 | break; |
6085 | case 8: |
6086 | if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) |
6087 | return true; |
6088 | break; |
6089 | } |
6090 | break; |
6091 | case 2: /* clts */ |
6092 | if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && |
6093 | (vmcs12->cr0_read_shadow & X86_CR0_TS)) |
6094 | return true; |
6095 | break; |
6096 | case 1: /* mov from cr */ |
6097 | switch (cr) { |
6098 | case 3: |
6099 | if (vmcs12->cpu_based_vm_exec_control & |
6100 | CPU_BASED_CR3_STORE_EXITING) |
6101 | return true; |
6102 | break; |
6103 | case 8: |
6104 | if (vmcs12->cpu_based_vm_exec_control & |
6105 | CPU_BASED_CR8_STORE_EXITING) |
6106 | return true; |
6107 | break; |
6108 | } |
6109 | break; |
6110 | case 3: /* lmsw */ |
6111 | /* |
6112 | * lmsw can change bits 1..3 of cr0, and only set bit 0 of |
6113 | * cr0. Other attempted changes are ignored, with no exit. |
6114 | */ |
6115 | val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; |
6116 | if (vmcs12->cr0_guest_host_mask & 0xe & |
6117 | (val ^ vmcs12->cr0_read_shadow)) |
6118 | return true; |
6119 | if ((vmcs12->cr0_guest_host_mask & 0x1) && |
6120 | !(vmcs12->cr0_read_shadow & 0x1) && |
6121 | (val & 0x1)) |
6122 | return true; |
6123 | break; |
6124 | } |
6125 | return false; |
6126 | } |
6127 | |
6128 | static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, |
6129 | struct vmcs12 *vmcs12) |
6130 | { |
6131 | u32 encls_leaf; |
6132 | |
6133 | if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || |
6134 | !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) |
6135 | return false; |
6136 | |
6137 | encls_leaf = kvm_rax_read(vcpu); |
6138 | if (encls_leaf > 62) |
6139 | encls_leaf = 63; |
6140 | return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); |
6141 | } |
6142 | |
6143 | static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, |
6144 | struct vmcs12 *vmcs12, gpa_t bitmap) |
6145 | { |
6146 | u32 vmx_instruction_info; |
6147 | unsigned long field; |
6148 | u8 b; |
6149 | |
6150 | if (!nested_cpu_has_shadow_vmcs(vmcs12)) |
6151 | return true; |
6152 | |
6153 | /* Decode instruction info and find the field to access */ |
	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
6156 | |
6157 | /* Out-of-range fields always cause a VM exit from L2 to L1 */ |
6158 | if (field >> 15) |
6159 | return true; |
6160 | |
	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
6162 | return true; |
6163 | |
6164 | return 1 & (b >> (field & 7)); |
6165 | } |
6166 | |
6167 | static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) |
6168 | { |
6169 | u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; |
6170 | |
6171 | if (nested_cpu_has_mtf(vmcs12)) |
6172 | return true; |
6173 | |
6174 | /* |
6175 | * An MTF VM-exit may be injected into the guest by setting the |
6176 | * interruption-type to 7 (other event) and the vector field to 0. Such |
6177 | * is the case regardless of the 'monitor trap flag' VM-execution |
6178 | * control. |
6179 | */ |
6180 | return entry_intr_info == (INTR_INFO_VALID_MASK |
6181 | | INTR_TYPE_OTHER_EVENT); |
6182 | } |
6183 | |
6184 | /* |
6185 | * Return true if L0 wants to handle an exit from L2 regardless of whether or not |
6186 | * L1 wants the exit. Only call this when in is_guest_mode (L2). |
6187 | */ |
6188 | static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, |
6189 | union vmx_exit_reason exit_reason) |
6190 | { |
6191 | u32 intr_info; |
6192 | |
6193 | switch ((u16)exit_reason.basic) { |
6194 | case EXIT_REASON_EXCEPTION_NMI: |
6195 | intr_info = vmx_get_intr_info(vcpu); |
6196 | if (is_nmi(intr_info)) |
6197 | return true; |
6198 | else if (is_page_fault(intr_info)) |
6199 | return vcpu->arch.apf.host_apf_flags || |
6200 | vmx_need_pf_intercept(vcpu); |
6201 | else if (is_debug(intr_info) && |
6202 | vcpu->guest_debug & |
6203 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) |
6204 | return true; |
6205 | else if (is_breakpoint(intr_info) && |
6206 | vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
6207 | return true; |
6208 | else if (is_alignment_check(intr_info) && |
6209 | !vmx_guest_inject_ac(vcpu)) |
6210 | return true; |
6211 | return false; |
6212 | case EXIT_REASON_EXTERNAL_INTERRUPT: |
6213 | return true; |
6214 | case EXIT_REASON_MCE_DURING_VMENTRY: |
6215 | return true; |
6216 | case EXIT_REASON_EPT_VIOLATION: |
6217 | /* |
6218 | * L0 always deals with the EPT violation. If nested EPT is |
6219 | * used, and the nested mmu code discovers that the address is |
6220 | * missing in the guest EPT table (EPT12), the EPT violation |
6221 | * will be injected with nested_ept_inject_page_fault() |
6222 | */ |
6223 | return true; |
6224 | case EXIT_REASON_EPT_MISCONFIG: |
6225 | /* |
6226 | * L2 never uses directly L1's EPT, but rather L0's own EPT |
6227 | * table (shadow on EPT) or a merged EPT table that L0 built |
6228 | * (EPT on EPT). So any problems with the structure of the |
6229 | * table is L0's fault. |
6230 | */ |
6231 | return true; |
6232 | case EXIT_REASON_PREEMPTION_TIMER: |
6233 | return true; |
6234 | case EXIT_REASON_PML_FULL: |
6235 | /* |
6236 | * PML is emulated for an L1 VMM and should never be enabled in |
6237 | * vmcs02, always "handle" PML_FULL by exiting to userspace. |
6238 | */ |
6239 | return true; |
6240 | case EXIT_REASON_VMFUNC: |
6241 | /* VM functions are emulated through L2->L0 vmexits. */ |
6242 | return true; |
6243 | case EXIT_REASON_BUS_LOCK: |
6244 | /* |
6245 | * At present, bus lock VM exit is never exposed to L1. |
6246 | * Handle L2's bus locks in L0 directly. |
6247 | */ |
6248 | return true; |
6249 | #ifdef CONFIG_KVM_HYPERV |
6250 | case EXIT_REASON_VMCALL: |
6251 | /* Hyper-V L2 TLB flush hypercall is handled by L0 */ |
6252 | return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && |
6253 | nested_evmcs_l2_tlb_flush_enabled(vcpu) && |
6254 | kvm_hv_is_tlb_flush_hcall(vcpu); |
6255 | #endif |
6256 | default: |
6257 | break; |
6258 | } |
6259 | return false; |
6260 | } |
6261 | |
6262 | /* |
 * Return true if L1 wants to intercept an exit from L2. Only call this when in
6264 | * is_guest_mode (L2). |
6265 | */ |
6266 | static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, |
6267 | union vmx_exit_reason exit_reason) |
6268 | { |
6269 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
6270 | u32 intr_info; |
6271 | |
6272 | switch ((u16)exit_reason.basic) { |
6273 | case EXIT_REASON_EXCEPTION_NMI: |
6274 | intr_info = vmx_get_intr_info(vcpu); |
6275 | if (is_nmi(intr_info)) |
6276 | return true; |
6277 | else if (is_page_fault(intr_info)) |
6278 | return true; |
6279 | return vmcs12->exception_bitmap & |
6280 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); |
6281 | case EXIT_REASON_EXTERNAL_INTERRUPT: |
6282 | return nested_exit_on_intr(vcpu); |
6283 | case EXIT_REASON_TRIPLE_FAULT: |
6284 | return true; |
6285 | case EXIT_REASON_INTERRUPT_WINDOW: |
6286 | return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); |
6287 | case EXIT_REASON_NMI_WINDOW: |
6288 | return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); |
6289 | case EXIT_REASON_TASK_SWITCH: |
6290 | return true; |
6291 | case EXIT_REASON_CPUID: |
6292 | return true; |
6293 | case EXIT_REASON_HLT: |
6294 | return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); |
6295 | case EXIT_REASON_INVD: |
6296 | return true; |
6297 | case EXIT_REASON_INVLPG: |
6298 | return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); |
6299 | case EXIT_REASON_RDPMC: |
6300 | return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); |
6301 | case EXIT_REASON_RDRAND: |
6302 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); |
6303 | case EXIT_REASON_RDSEED: |
6304 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); |
6305 | case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: |
6306 | return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); |
6307 | case EXIT_REASON_VMREAD: |
6308 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, |
			vmcs12->vmread_bitmap);
6310 | case EXIT_REASON_VMWRITE: |
6311 | return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, |
			vmcs12->vmwrite_bitmap);
6313 | case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: |
6314 | case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: |
6315 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: |
6316 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: |
6317 | case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: |
6318 | /* |
6319 | * VMX instructions trap unconditionally. This allows L1 to |
6320 | * emulate them for its L2 guest, i.e., allows 3-level nesting! |
6321 | */ |
6322 | return true; |
6323 | case EXIT_REASON_CR_ACCESS: |
6324 | return nested_vmx_exit_handled_cr(vcpu, vmcs12); |
6325 | case EXIT_REASON_DR_ACCESS: |
6326 | return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); |
6327 | case EXIT_REASON_IO_INSTRUCTION: |
6328 | return nested_vmx_exit_handled_io(vcpu, vmcs12); |
6329 | case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: |
6330 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); |
6331 | case EXIT_REASON_MSR_READ: |
6332 | case EXIT_REASON_MSR_WRITE: |
6333 | return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); |
6334 | case EXIT_REASON_INVALID_STATE: |
6335 | return true; |
6336 | case EXIT_REASON_MWAIT_INSTRUCTION: |
6337 | return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); |
6338 | case EXIT_REASON_MONITOR_TRAP_FLAG: |
6339 | return nested_vmx_exit_handled_mtf(vmcs12); |
6340 | case EXIT_REASON_MONITOR_INSTRUCTION: |
6341 | return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); |
6342 | case EXIT_REASON_PAUSE_INSTRUCTION: |
6343 | return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || |
6344 | nested_cpu_has2(vmcs12, |
6345 | SECONDARY_EXEC_PAUSE_LOOP_EXITING); |
6346 | case EXIT_REASON_MCE_DURING_VMENTRY: |
6347 | return true; |
6348 | case EXIT_REASON_TPR_BELOW_THRESHOLD: |
6349 | return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); |
6350 | case EXIT_REASON_APIC_ACCESS: |
6351 | case EXIT_REASON_APIC_WRITE: |
6352 | case EXIT_REASON_EOI_INDUCED: |
6353 | /* |
6354 | * The controls for "virtualize APIC accesses," "APIC- |
6355 | * register virtualization," and "virtual-interrupt |
6356 | * delivery" only come from vmcs12. |
6357 | */ |
6358 | return true; |
6359 | case EXIT_REASON_INVPCID: |
6360 | return |
6361 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && |
6362 | nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); |
6363 | case EXIT_REASON_WBINVD: |
6364 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); |
6365 | case EXIT_REASON_XSETBV: |
6366 | return true; |
6367 | case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: |
6368 | /* |
6369 | * This should never happen, since it is not possible to |
6370 | * set XSS to a non-zero value---neither in L1 nor in L2. |
	 * If it were, XSS would have to be checked against
6372 | * the XSS exit bitmap in vmcs12. |
6373 | */ |
6374 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); |
6375 | case EXIT_REASON_UMWAIT: |
6376 | case EXIT_REASON_TPAUSE: |
6377 | return nested_cpu_has2(vmcs12, |
6378 | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); |
6379 | case EXIT_REASON_ENCLS: |
6380 | return nested_vmx_exit_handled_encls(vcpu, vmcs12); |
6381 | case EXIT_REASON_NOTIFY: |
6382 | /* Notify VM exit is not exposed to L1 */ |
6383 | return false; |
6384 | default: |
6385 | return true; |
6386 | } |
6387 | } |
6388 | |
6389 | /* |
6390 | * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was |
6391 | * reflected into L1. |
6392 | */ |
6393 | bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) |
6394 | { |
6395 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
6396 | union vmx_exit_reason exit_reason = vmx->exit_reason; |
6397 | unsigned long exit_qual; |
6398 | u32 exit_intr_info; |
6399 | |
6400 | WARN_ON_ONCE(vmx->nested.nested_run_pending); |
6401 | |
6402 | /* |
6403 | * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM |
6404 | * has already loaded L2's state. |
6405 | */ |
6406 | if (unlikely(vmx->fail)) { |
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
6410 | exit_intr_info = 0; |
6411 | exit_qual = 0; |
6412 | goto reflect_vmexit; |
6413 | } |
6414 | |
6415 | trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); |
6416 | |
6417 | /* If L0 (KVM) wants the exit, it trumps L1's desires. */ |
6418 | if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) |
6419 | return false; |
6420 | |
6421 | /* If L1 doesn't want the exit, handle it in L0. */ |
6422 | if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) |
6423 | return false; |
6424 | |
6425 | /* |
6426 | * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For |
6427 | * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would |
6428 | * need to be synthesized by querying the in-kernel LAPIC, but external |
6429 | * interrupts are never reflected to L1 so it's a non-issue. |
6430 | */ |
6431 | exit_intr_info = vmx_get_intr_info(vcpu); |
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6437 | } |
6438 | exit_qual = vmx_get_exit_qual(vcpu); |
6439 | |
6440 | reflect_vmexit: |
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
6442 | return true; |
6443 | } |
6444 | |
6445 | static int vmx_get_nested_state(struct kvm_vcpu *vcpu, |
6446 | struct kvm_nested_state __user *user_kvm_nested_state, |
6447 | u32 user_data_size) |
6448 | { |
6449 | struct vcpu_vmx *vmx; |
6450 | struct vmcs12 *vmcs12; |
6451 | struct kvm_nested_state kvm_state = { |
6452 | .flags = 0, |
6453 | .format = KVM_STATE_NESTED_FORMAT_VMX, |
6454 | .size = sizeof(kvm_state), |
6455 | .hdr.vmx.flags = 0, |
6456 | .hdr.vmx.vmxon_pa = INVALID_GPA, |
6457 | .hdr.vmx.vmcs12_pa = INVALID_GPA, |
6458 | .hdr.vmx.preemption_timer_deadline = 0, |
6459 | }; |
6460 | struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = |
6461 | &user_kvm_nested_state->data.vmx[0]; |
6462 | |
6463 | if (!vcpu) |
6464 | return kvm_state.size + sizeof(*user_vmx_nested_state); |
6465 | |
6466 | vmx = to_vmx(vcpu); |
6467 | vmcs12 = get_vmcs12(vcpu); |
6468 | |
6469 | if (guest_can_use(vcpu, X86_FEATURE_VMX) && |
6470 | (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { |
6471 | kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; |
6472 | kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; |
6473 | |
6474 | if (vmx_has_valid_vmcs12(vcpu)) { |
6475 | kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); |
6476 | |
6477 | /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ |
6478 | if (nested_vmx_is_evmptr12_set(vmx)) |
6479 | kvm_state.flags |= KVM_STATE_NESTED_EVMCS; |
6480 | |
6481 | if (is_guest_mode(vcpu) && |
6482 | nested_cpu_has_shadow_vmcs(vmcs12) && |
6483 | vmcs12->vmcs_link_pointer != INVALID_GPA) |
6484 | kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); |
6485 | } |
6486 | |
6487 | if (vmx->nested.smm.vmxon) |
6488 | kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; |
6489 | |
6490 | if (vmx->nested.smm.guest_mode) |
6491 | kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; |
6492 | |
6493 | if (is_guest_mode(vcpu)) { |
6494 | kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; |
6495 | |
6496 | if (vmx->nested.nested_run_pending) |
6497 | kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; |
6498 | |
6499 | if (vmx->nested.mtf_pending) |
6500 | kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; |
6501 | |
6502 | if (nested_cpu_has_preemption_timer(vmcs12) && |
6503 | vmx->nested.has_preemption_timer_deadline) { |
6504 | kvm_state.hdr.vmx.flags |= |
6505 | KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; |
6506 | kvm_state.hdr.vmx.preemption_timer_deadline = |
6507 | vmx->nested.preemption_timer_deadline; |
6508 | } |
6509 | } |
6510 | } |
6511 | |
6512 | if (user_data_size < kvm_state.size) |
6513 | goto out; |
6514 | |
	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
6516 | return -EFAULT; |
6517 | |
6518 | if (!vmx_has_valid_vmcs12(vcpu)) |
6519 | goto out; |
6520 | |
6521 | /* |
6522 | * When running L2, the authoritative vmcs12 state is in the |
6523 | * vmcs02. When running L1, the authoritative vmcs12 state is |
6524 | * in the shadow or enlightened vmcs linked to vmcs01, unless |
6525 | * need_vmcs12_to_shadow_sync is set, in which case, the authoritative |
6526 | * vmcs12 state is in the vmcs12 already. |
6527 | */ |
6528 | if (is_guest_mode(vcpu)) { |
6529 | sync_vmcs02_to_vmcs12(vcpu, vmcs12); |
6530 | sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); |
6531 | } else { |
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
6533 | if (!vmx->nested.need_vmcs12_to_shadow_sync) { |
6534 | if (nested_vmx_is_evmptr12_valid(vmx)) |
6535 | /* |
6536 | * L1 hypervisor is not obliged to keep eVMCS |
6537 | * clean fields data always up-to-date while |
6538 | * not in guest mode, 'hv_clean_fields' is only |
6539 | * supposed to be actual upon vmentry so we need |
6540 | * to ignore it here and do full copy. |
6541 | */ |
				copy_enlightened_to_vmcs12(vmx, 0);
6543 | else if (enable_shadow_vmcs) |
6544 | copy_shadow_to_vmcs12(vmx); |
6545 | } |
6546 | } |
6547 | |
6548 | BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); |
6549 | BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); |
6550 | |
6551 | /* |
6552 | * Copy over the full allocated size of vmcs12 rather than just the size |
6553 | * of the struct. |
6554 | */ |
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
6556 | return -EFAULT; |
6557 | |
6558 | if (nested_cpu_has_shadow_vmcs(vmcs12) && |
6559 | vmcs12->vmcs_link_pointer != INVALID_GPA) { |
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
6562 | return -EFAULT; |
6563 | } |
6564 | out: |
6565 | return kvm_state.size; |
6566 | } |
6567 | |
6568 | void vmx_leave_nested(struct kvm_vcpu *vcpu) |
6569 | { |
6570 | if (is_guest_mode(vcpu)) { |
6571 | to_vmx(vcpu)->nested.nested_run_pending = 0; |
		nested_vmx_vmexit(vcpu, -1, 0, 0);
6573 | } |
6574 | free_nested(vcpu); |
6575 | } |
6576 | |
6577 | static int vmx_set_nested_state(struct kvm_vcpu *vcpu, |
6578 | struct kvm_nested_state __user *user_kvm_nested_state, |
6579 | struct kvm_nested_state *kvm_state) |
6580 | { |
6581 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
6582 | struct vmcs12 *vmcs12; |
6583 | enum vm_entry_failure_code ignored; |
6584 | struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = |
6585 | &user_kvm_nested_state->data.vmx[0]; |
6586 | int ret; |
6587 | |
6588 | if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) |
6589 | return -EINVAL; |
6590 | |
6591 | if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { |
6592 | if (kvm_state->hdr.vmx.smm.flags) |
6593 | return -EINVAL; |
6594 | |
6595 | if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) |
6596 | return -EINVAL; |
6597 | |
6598 | /* |
6599 | * KVM_STATE_NESTED_EVMCS used to signal that KVM should |
6600 | * enable eVMCS capability on vCPU. However, since then |
6601 | * code was changed such that flag signals vmcs12 should |
6602 | * be copied into eVMCS in guest memory. |
6603 | * |
6604 | * To preserve backwards compatibility, allow user |
6605 | * to set this flag even when there is no VMXON region. |
6606 | */ |
6607 | if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) |
6608 | return -EINVAL; |
6609 | } else { |
6610 | if (!guest_can_use(vcpu, X86_FEATURE_VMX)) |
6611 | return -EINVAL; |
6612 | |
		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
6614 | return -EINVAL; |
6615 | } |
6616 | |
6617 | if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && |
6618 | (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) |
6619 | return -EINVAL; |
6620 | |
6621 | if (kvm_state->hdr.vmx.smm.flags & |
6622 | ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) |
6623 | return -EINVAL; |
6624 | |
6625 | if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) |
6626 | return -EINVAL; |
6627 | |
6628 | /* |
6629 | * SMM temporarily disables VMX, so we cannot be in guest mode, |
6630 | * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags |
6631 | * must be zero. |
6632 | */ |
6633 | if (is_smm(vcpu) ? |
6634 | (kvm_state->flags & |
6635 | (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) |
6636 | : kvm_state->hdr.vmx.smm.flags) |
6637 | return -EINVAL; |
6638 | |
6639 | if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && |
6640 | !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) |
6641 | return -EINVAL; |
6642 | |
6643 | if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && |
6644 | (!guest_can_use(vcpu, X86_FEATURE_VMX) || |
6645 | !vmx->nested.enlightened_vmcs_enabled)) |
6646 | return -EINVAL; |
6647 | |
6648 | vmx_leave_nested(vcpu); |
6649 | |
6650 | if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) |
6651 | return 0; |
6652 | |
6653 | vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; |
6654 | ret = enter_vmx_operation(vcpu); |
6655 | if (ret) |
6656 | return ret; |
6657 | |
6658 | /* Empty 'VMXON' state is permitted if no VMCS loaded */ |
6659 | if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { |
6660 | /* See vmx_has_valid_vmcs12. */ |
6661 | if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || |
6662 | (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || |
6663 | (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) |
6664 | return -EINVAL; |
6665 | else |
6666 | return 0; |
6667 | } |
6668 | |
6669 | if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { |
6670 | if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || |
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
6672 | return -EINVAL; |
6673 | |
		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
6675 | #ifdef CONFIG_KVM_HYPERV |
6676 | } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { |
6677 | /* |
6678 | * nested_vmx_handle_enlightened_vmptrld() cannot be called |
6679 | * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be |
6680 | * restored yet. EVMCS will be mapped from |
6681 | * nested_get_vmcs12_pages(). |
6682 | */ |
6683 | vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; |
6684 | kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); |
6685 | #endif |
6686 | } else { |
6687 | return -EINVAL; |
6688 | } |
6689 | |
6690 | if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { |
6691 | vmx->nested.smm.vmxon = true; |
6692 | vmx->nested.vmxon = false; |
6693 | |
6694 | if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) |
6695 | vmx->nested.smm.guest_mode = true; |
6696 | } |
6697 | |
6698 | vmcs12 = get_vmcs12(vcpu); |
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
6700 | return -EFAULT; |
6701 | |
6702 | if (vmcs12->hdr.revision_id != VMCS12_REVISION) |
6703 | return -EINVAL; |
6704 | |
6705 | if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) |
6706 | return 0; |
6707 | |
6708 | vmx->nested.nested_run_pending = |
6709 | !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); |
6710 | |
6711 | vmx->nested.mtf_pending = |
6712 | !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); |
6713 | |
6714 | ret = -EINVAL; |
6715 | if (nested_cpu_has_shadow_vmcs(vmcs12) && |
6716 | vmcs12->vmcs_link_pointer != INVALID_GPA) { |
6717 | struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); |
6718 | |
6719 | if (kvm_state->size < |
6720 | sizeof(*kvm_state) + |
6721 | sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) |
6722 | goto error_guest_mode; |
6723 | |
		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
6727 | ret = -EFAULT; |
6728 | goto error_guest_mode; |
6729 | } |
6730 | |
6731 | if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || |
6732 | !shadow_vmcs12->hdr.shadow_vmcs) |
6733 | goto error_guest_mode; |
6734 | } |
6735 | |
6736 | vmx->nested.has_preemption_timer_deadline = false; |
6737 | if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { |
6738 | vmx->nested.has_preemption_timer_deadline = true; |
6739 | vmx->nested.preemption_timer_deadline = |
6740 | kvm_state->hdr.vmx.preemption_timer_deadline; |
6741 | } |
6742 | |
6743 | if (nested_vmx_check_controls(vcpu, vmcs12) || |
6744 | nested_vmx_check_host_state(vcpu, vmcs12) || |
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
6746 | goto error_guest_mode; |
6747 | |
6748 | vmx->nested.dirty_vmcs12 = true; |
6749 | vmx->nested.force_msr_bitmap_recalc = true; |
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
6751 | if (ret) |
6752 | goto error_guest_mode; |
6753 | |
6754 | if (vmx->nested.mtf_pending) |
6755 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
6756 | |
6757 | return 0; |
6758 | |
6759 | error_guest_mode: |
6760 | vmx->nested.nested_run_pending = 0; |
6761 | return ret; |
6762 | } |
6763 | |
6764 | void nested_vmx_set_vmcs_shadowing_bitmap(void) |
6765 | { |
6766 | if (enable_shadow_vmcs) { |
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
6769 | } |
6770 | } |
6771 | |
6772 | /* |
6773 | * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo |
6774 | * that madness to get the encoding for comparison. |
6775 | */ |
6776 | #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) |
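
/*
 * Worked example (illustrative only, assumes <linux/build_bug.h> is pulled in
 * by the existing includes): the vmcs12 offset table is indexed by the 16-bit
 * field encoding rotated left by 6, so GUEST_ES_SELECTOR (encoding 0x0800)
 * lives at index 0x0002, and VMCS12_IDX_TO_ENC(0x0002) recovers 0x0800.
 */
static_assert(VMCS12_IDX_TO_ENC(0x0002) == 0x0800);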
6777 | |
6778 | static u64 nested_vmx_calc_vmcs_enum_msr(void) |
6779 | { |
6780 | /* |
6781 | * Note these are the so called "index" of the VMCS field encoding, not |
6782 | * the index into vmcs12. |
6783 | */ |
6784 | unsigned int max_idx, idx; |
6785 | int i; |
6786 | |
6787 | /* |
6788 | * For better or worse, KVM allows VMREAD/VMWRITE to all fields in |
6789 | * vmcs12, regardless of whether or not the associated feature is |
6790 | * exposed to L1. Simply find the field with the highest index. |
6791 | */ |
6792 | max_idx = 0; |
6793 | for (i = 0; i < nr_vmcs12_fields; i++) { |
6794 | /* The vmcs12 table is very, very sparsely populated. */ |
6795 | if (!vmcs12_field_offsets[i]) |
6796 | continue; |
6797 | |
6798 | idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); |
6799 | if (idx > max_idx) |
6800 | max_idx = idx; |
6801 | } |
6802 | |
6803 | return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; |
6804 | } |
6805 | |
6806 | static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, |
6807 | struct nested_vmx_msrs *msrs) |
6808 | { |
6809 | msrs->pinbased_ctls_low = |
6810 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; |
6811 | |
6812 | msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; |
6813 | msrs->pinbased_ctls_high &= |
6814 | PIN_BASED_EXT_INTR_MASK | |
6815 | PIN_BASED_NMI_EXITING | |
6816 | PIN_BASED_VIRTUAL_NMIS | |
6817 | (enable_apicv ? PIN_BASED_POSTED_INTR : 0); |
6818 | msrs->pinbased_ctls_high |= |
6819 | PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | |
6820 | PIN_BASED_VMX_PREEMPTION_TIMER; |
6821 | } |
6822 | |
6823 | static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, |
6824 | struct nested_vmx_msrs *msrs) |
6825 | { |
6826 | msrs->exit_ctls_low = |
6827 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; |
6828 | |
6829 | msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; |
6830 | msrs->exit_ctls_high &= |
6831 | #ifdef CONFIG_X86_64 |
6832 | VM_EXIT_HOST_ADDR_SPACE_SIZE | |
6833 | #endif |
6834 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | |
6835 | VM_EXIT_CLEAR_BNDCFGS; |
6836 | msrs->exit_ctls_high |= |
6837 | VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | |
6838 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | |
6839 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | |
6840 | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; |
6841 | |
6842 | /* We support free control of debug control saving. */ |
6843 | msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; |
6844 | } |
6845 | |
6846 | static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, |
6847 | struct nested_vmx_msrs *msrs) |
6848 | { |
6849 | msrs->entry_ctls_low = |
6850 | VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; |
6851 | |
6852 | msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; |
6853 | msrs->entry_ctls_high &= |
6854 | #ifdef CONFIG_X86_64 |
6855 | VM_ENTRY_IA32E_MODE | |
6856 | #endif |
6857 | VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; |
6858 | msrs->entry_ctls_high |= |
6859 | (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | |
6860 | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); |
6861 | |
6862 | /* We support free control of debug control loading. */ |
6863 | msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; |
6864 | } |
6865 | |
6866 | static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, |
6867 | struct nested_vmx_msrs *msrs) |
6868 | { |
6869 | msrs->procbased_ctls_low = |
6870 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; |
6871 | |
6872 | msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; |
6873 | msrs->procbased_ctls_high &= |
6874 | CPU_BASED_INTR_WINDOW_EXITING | |
6875 | CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | |
6876 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | |
6877 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | |
6878 | CPU_BASED_CR3_STORE_EXITING | |
6879 | #ifdef CONFIG_X86_64 |
6880 | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | |
6881 | #endif |
6882 | CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | |
6883 | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | |
6884 | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | |
6885 | CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | |
6886 | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
6887 | /* |
6888 | * We can allow some features even when not supported by the |
6889 | * hardware. For example, L1 can specify an MSR bitmap - and we |
6890 | * can use it to avoid exits to L1 - even when L0 runs L2 |
6891 | * without MSR bitmaps. |
6892 | */ |
6893 | msrs->procbased_ctls_high |= |
6894 | CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | |
6895 | CPU_BASED_USE_MSR_BITMAPS; |
6896 | |
6897 | /* We support free control of CR3 access interception. */ |
6898 | msrs->procbased_ctls_low &= |
6899 | ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); |
6900 | } |
6901 | |
6902 | static void nested_vmx_setup_secondary_ctls(u32 ept_caps, |
6903 | struct vmcs_config *vmcs_conf, |
6904 | struct nested_vmx_msrs *msrs) |
6905 | { |
6906 | msrs->secondary_ctls_low = 0; |
6907 | |
6908 | msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; |
6909 | msrs->secondary_ctls_high &= |
6910 | SECONDARY_EXEC_DESC | |
6911 | SECONDARY_EXEC_ENABLE_RDTSCP | |
6912 | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | |
6913 | SECONDARY_EXEC_WBINVD_EXITING | |
6914 | SECONDARY_EXEC_APIC_REGISTER_VIRT | |
6915 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | |
6916 | SECONDARY_EXEC_RDRAND_EXITING | |
6917 | SECONDARY_EXEC_ENABLE_INVPCID | |
6918 | SECONDARY_EXEC_ENABLE_VMFUNC | |
6919 | SECONDARY_EXEC_RDSEED_EXITING | |
6920 | SECONDARY_EXEC_ENABLE_XSAVES | |
6921 | SECONDARY_EXEC_TSC_SCALING | |
6922 | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; |
6923 | |
6924 | /* |
6925 | * We can emulate "VMCS shadowing," even if the hardware |
6926 | * doesn't support it. |
6927 | */ |
6928 | msrs->secondary_ctls_high |= |
6929 | SECONDARY_EXEC_SHADOW_VMCS; |
6930 | |
6931 | if (enable_ept) { |
6932 | /* nested EPT: emulate EPT also to L1 */ |
6933 | msrs->secondary_ctls_high |= |
6934 | SECONDARY_EXEC_ENABLE_EPT; |
6935 | msrs->ept_caps = |
6936 | VMX_EPT_PAGE_WALK_4_BIT | |
6937 | VMX_EPT_PAGE_WALK_5_BIT | |
6938 | VMX_EPTP_WB_BIT | |
6939 | VMX_EPT_INVEPT_BIT | |
6940 | VMX_EPT_EXECUTE_ONLY_BIT; |
6941 | |
6942 | msrs->ept_caps &= ept_caps; |
6943 | msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | |
6944 | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | |
6945 | VMX_EPT_1GB_PAGE_BIT; |
6946 | if (enable_ept_ad_bits) { |
6947 | msrs->secondary_ctls_high |= |
6948 | SECONDARY_EXEC_ENABLE_PML; |
6949 | msrs->ept_caps |= VMX_EPT_AD_BIT; |
6950 | } |
6951 | |
6952 | /* |
6953 | * Advertise EPTP switching irrespective of hardware support, |
6954 | * KVM emulates it in software so long as VMFUNC is supported. |
6955 | */ |
6956 | if (cpu_has_vmx_vmfunc()) |
6957 | msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; |
6958 | } |
6959 | |
6960 | /* |
6961 | * Old versions of KVM use the single-context version without |
6962 | * checking for support, so declare that it is supported even |
6963 | * though it is treated as global context. The alternative is |
6964 | * not failing the single-context invvpid, and it is worse. |
6965 | */ |
6966 | if (enable_vpid) { |
6967 | msrs->secondary_ctls_high |= |
6968 | SECONDARY_EXEC_ENABLE_VPID; |
6969 | msrs->vpid_caps = VMX_VPID_INVVPID_BIT | |
6970 | VMX_VPID_EXTENT_SUPPORTED_MASK; |
6971 | } |
6972 | |
6973 | if (enable_unrestricted_guest) |
6974 | msrs->secondary_ctls_high |= |
6975 | SECONDARY_EXEC_UNRESTRICTED_GUEST; |
6976 | |
6977 | if (flexpriority_enabled) |
6978 | msrs->secondary_ctls_high |= |
6979 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; |
6980 | |
6981 | if (enable_sgx) |
6982 | msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; |
6983 | } |
6984 | |
6985 | static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, |
6986 | struct nested_vmx_msrs *msrs) |
6987 | { |
6988 | msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; |
6989 | msrs->misc_low |= |
6990 | MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | |
6991 | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | |
6992 | VMX_MISC_ACTIVITY_HLT | |
6993 | VMX_MISC_ACTIVITY_WAIT_SIPI; |
6994 | msrs->misc_high = 0; |
6995 | } |
6996 | |
6997 | static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) |
6998 | { |
6999 | /* |
7000 | * This MSR reports some information about VMX support. We |
7001 | * should return information about the VMX we emulate for the |
7002 | * guest, and the VMCS structure we give it - not about the |
7003 | * VMX support of the underlying hardware. |
7004 | */ |
7005 | msrs->basic = |
7006 | VMCS12_REVISION | |
7007 | VMX_BASIC_TRUE_CTLS | |
7008 | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | |
7009 | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); |
7010 | |
7011 | if (cpu_has_vmx_basic_inout()) |
7012 | msrs->basic |= VMX_BASIC_INOUT; |
7013 | } |
7014 | |
7015 | static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) |
7016 | { |
7017 | /* |
7018 | * These MSRs specify bits which the guest must keep fixed on |
7019 | * while L1 is in VMXON mode (in L1's root mode, or running an L2). |
7020 | * We picked the standard core2 setting. |
7021 | */ |
7022 | #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) |
7023 | #define VMXON_CR4_ALWAYSON X86_CR4_VMXE |
7024 | msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; |
7025 | msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; |
7026 | |
7027 | /* These MSRs specify bits which the guest must keep fixed off. */ |
7028 | rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); |
7029 | rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); |
7030 | |
7031 | if (vmx_umip_emulated()) |
7032 | msrs->cr4_fixed1 |= X86_CR4_UMIP; |
7033 | } |
7034 | |
7035 | /* |
7036 | * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be |
7037 | * returned for the various VMX controls MSRs when nested VMX is enabled. |
7038 | * The same values should also be used to verify that vmcs12 control fields are |
7039 | * valid during nested entry from L1 to L2. |
7040 | * Each of these control msrs has a low and high 32-bit half: A low bit is on |
7041 | * if the corresponding bit in the (32-bit) control field *must* be on, and a |
7042 | * bit in the high half is on if the corresponding bit in the control field |
7043 | * may be on. See also vmx_control_verify(). |
7044 | */ |
7045 | void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) |
7046 | { |
7047 | struct nested_vmx_msrs *msrs = &vmcs_conf->nested; |
7048 | |
7049 | /* |
7050 | * Note that as a general rule, the high half of the MSRs (bits in |
7051 | * the control fields which may be 1) should be initialized by the |
7052 | * intersection of the underlying hardware's MSR (i.e., features which |
7053 | * can be supported) and the list of features we want to expose - |
7054 | * because they are known to be properly supported in our code. |
7055 | * Also, usually, the low half of the MSRs (bits which must be 1) can |
7056 | * be set to 0, meaning that L1 may turn off any of these bits. The |
7057 | * reason is that if one of these bits is necessary, it will appear |
7058 | * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control |
7059 | * fields of vmcs01 and vmcs02, will turn these bits off - and |
7060 | * nested_vmx_l1_wants_exit() will not pass related exits to L1. |
7061 | * These rules have exceptions below. |
7062 | */ |
7063 | nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); |
7064 | |
7065 | nested_vmx_setup_exit_ctls(vmcs_conf, msrs); |
7066 | |
7067 | nested_vmx_setup_entry_ctls(vmcs_conf, msrs); |
7068 | |
7069 | nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); |
7070 | |
7071 | nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); |
7072 | |
7073 | nested_vmx_setup_misc_data(vmcs_conf, msrs); |
7074 | |
7075 | nested_vmx_setup_basic(msrs); |
7076 | |
7077 | nested_vmx_setup_cr_fixed(msrs); |
7078 | |
7079 | msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); |
7080 | } |
7081 | |
7082 | void nested_vmx_hardware_unsetup(void) |
7083 | { |
7084 | int i; |
7085 | |
7086 | if (enable_shadow_vmcs) { |
7087 | for (i = 0; i < VMX_BITMAP_NR; i++) |
7088 | free_page((unsigned long)vmx_bitmap[i]); |
7089 | } |
7090 | } |
7091 | |
7092 | __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) |
7093 | { |
7094 | int i; |
7095 | |
7096 | if (!cpu_has_vmx_shadow_vmcs()) |
7097 | enable_shadow_vmcs = 0; |
7098 | if (enable_shadow_vmcs) { |
7099 | for (i = 0; i < VMX_BITMAP_NR; i++) { |
7100 | /* |
7101 | * The vmx_bitmap is not tied to a VM and so should |
7102 | * not be charged to a memcg. |
7103 | */ |
7104 | vmx_bitmap[i] = (unsigned long *) |
7105 | __get_free_page(GFP_KERNEL); |
7106 | if (!vmx_bitmap[i]) { |
7107 | nested_vmx_hardware_unsetup(); |
7108 | return -ENOMEM; |
7109 | } |
7110 | } |
7111 | |
7112 | init_vmcs_shadow_fields(); |
7113 | } |
7114 | |
7115 | exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; |
7116 | exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; |
7117 | exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; |
7118 | exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; |
7119 | exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; |
7120 | exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; |
7121 | exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; |
7122 | exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; |
7123 | exit_handlers[EXIT_REASON_VMON] = handle_vmxon; |
7124 | exit_handlers[EXIT_REASON_INVEPT] = handle_invept; |
7125 | exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; |
7126 | exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; |
7127 | |
7128 | return 0; |
7129 | } |
7130 | |
7131 | struct kvm_x86_nested_ops vmx_nested_ops = { |
7132 | .leave_nested = vmx_leave_nested, |
7133 | .is_exception_vmexit = nested_vmx_is_exception_vmexit, |
7134 | .check_events = vmx_check_nested_events, |
7135 | .has_events = vmx_has_nested_events, |
7136 | .triple_fault = nested_vmx_triple_fault, |
7137 | .get_state = vmx_get_nested_state, |
7138 | .set_state = vmx_set_nested_state, |
7139 | .get_nested_state_pages = vmx_get_nested_state_pages, |
7140 | .write_log_dirty = nested_vmx_write_pml_buffer, |
7141 | #ifdef CONFIG_KVM_HYPERV |
7142 | .enable_evmcs = nested_enable_evmcs, |
7143 | .get_evmcs_version = nested_get_evmcs_version, |
7144 | .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, |
7145 | #endif |
7146 | }; |
7147 | |