1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * X86 specific Hyper-V initialization code. |
4 | * |
5 | * Copyright (C) 2016, Microsoft, Inc. |
6 | * |
7 | * Author : K. Y. Srinivasan <kys@microsoft.com> |
8 | */ |
9 | |
10 | #define pr_fmt(fmt) "Hyper-V: " fmt |
11 | |
12 | #include <linux/efi.h> |
13 | #include <linux/types.h> |
14 | #include <linux/bitfield.h> |
15 | #include <linux/io.h> |
16 | #include <asm/apic.h> |
17 | #include <asm/desc.h> |
18 | #include <asm/sev.h> |
19 | #include <asm/ibt.h> |
20 | #include <asm/hypervisor.h> |
21 | #include <asm/hyperv-tlfs.h> |
22 | #include <asm/mshyperv.h> |
23 | #include <asm/idtentry.h> |
24 | #include <asm/set_memory.h> |
25 | #include <linux/kexec.h> |
26 | #include <linux/version.h> |
27 | #include <linux/vmalloc.h> |
28 | #include <linux/mm.h> |
29 | #include <linux/hyperv.h> |
30 | #include <linux/slab.h> |
31 | #include <linux/kernel.h> |
32 | #include <linux/cpuhotplug.h> |
33 | #include <linux/syscore_ops.h> |
34 | #include <clocksource/hyperv_timer.h> |
35 | #include <linux/highmem.h> |
36 | |
37 | int hyperv_init_cpuhp; |
38 | u64 hv_current_partition_id = ~0ull; |
39 | EXPORT_SYMBOL_GPL(hv_current_partition_id); |
40 | |
41 | void *hv_hypercall_pg; |
42 | EXPORT_SYMBOL_GPL(hv_hypercall_pg); |
43 | |
44 | union hv_ghcb * __percpu *hv_ghcb_pg; |
45 | |
46 | /* Storage to save the hypercall page temporarily for hibernation */ |
47 | static void *hv_hypercall_pg_saved; |
48 | |
49 | struct hv_vp_assist_page **hv_vp_assist_page; |
50 | EXPORT_SYMBOL_GPL(hv_vp_assist_page); |
51 | |
52 | static int hyperv_init_ghcb(void) |
53 | { |
54 | u64 ghcb_gpa; |
55 | void *ghcb_va; |
56 | void **ghcb_base; |
57 | |
58 | if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp()) |
59 | return 0; |
60 | |
61 | if (!hv_ghcb_pg) |
62 | return -EINVAL; |
63 | |
64 | /* |
65 | * GHCB page is allocated by paravisor. The address |
66 | * returned by MSR_AMD64_SEV_ES_GHCB is above shared |
67 | * memory boundary and map it here. |
68 | */ |
69 | rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); |
70 | |
71 | /* Mask out vTOM bit. ioremap_cache() maps decrypted */ |
72 | ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary; |
73 | ghcb_va = (void *)ioremap_cache(offset: ghcb_gpa, HV_HYP_PAGE_SIZE); |
74 | if (!ghcb_va) |
75 | return -ENOMEM; |
76 | |
77 | ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); |
78 | *ghcb_base = ghcb_va; |
79 | |
80 | return 0; |
81 | } |
82 | |
83 | static int hv_cpu_init(unsigned int cpu) |
84 | { |
85 | union hv_vp_assist_msr_contents msr = { 0 }; |
86 | struct hv_vp_assist_page **hvp; |
87 | int ret; |
88 | |
89 | ret = hv_common_cpu_init(cpu); |
90 | if (ret) |
91 | return ret; |
92 | |
93 | if (!hv_vp_assist_page) |
94 | return 0; |
95 | |
96 | hvp = &hv_vp_assist_page[cpu]; |
97 | if (hv_root_partition) { |
98 | /* |
99 | * For root partition we get the hypervisor provided VP assist |
100 | * page, instead of allocating a new page. |
101 | */ |
102 | rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); |
103 | *hvp = memremap(offset: msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, |
104 | PAGE_SIZE, flags: MEMREMAP_WB); |
105 | } else { |
106 | /* |
107 | * The VP assist page is an "overlay" page (see Hyper-V TLFS's |
108 | * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed |
109 | * out to make sure we always write the EOI MSR in |
110 | * hv_apic_eoi_write() *after* the EOI optimization is disabled |
111 | * in hv_cpu_die(), otherwise a CPU may not be stopped in the |
112 | * case of CPU offlining and the VM will hang. |
113 | */ |
114 | if (!*hvp) { |
115 | *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); |
116 | |
117 | /* |
118 | * Hyper-V should never specify a VM that is a Confidential |
119 | * VM and also running in the root partition. Root partition |
120 | * is blocked to run in Confidential VM. So only decrypt assist |
121 | * page in non-root partition here. |
122 | */ |
123 | if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) { |
124 | WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1)); |
125 | memset(*hvp, 0, PAGE_SIZE); |
126 | } |
127 | } |
128 | |
129 | if (*hvp) |
130 | msr.pfn = vmalloc_to_pfn(addr: *hvp); |
131 | |
132 | } |
133 | if (!WARN_ON(!(*hvp))) { |
134 | msr.enable = 1; |
135 | wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val: msr.as_uint64); |
136 | } |
137 | |
138 | return hyperv_init_ghcb(); |
139 | } |
140 | |
141 | static void (*hv_reenlightenment_cb)(void); |
142 | |
143 | static void hv_reenlightenment_notify(struct work_struct *dummy) |
144 | { |
145 | struct hv_tsc_emulation_status emu_status; |
146 | |
147 | rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); |
148 | |
149 | /* Don't issue the callback if TSC accesses are not emulated */ |
150 | if (hv_reenlightenment_cb && emu_status.inprogress) |
151 | hv_reenlightenment_cb(); |
152 | } |
153 | static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); |
154 | |
155 | void hyperv_stop_tsc_emulation(void) |
156 | { |
157 | u64 freq; |
158 | struct hv_tsc_emulation_status emu_status; |
159 | |
160 | rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); |
161 | emu_status.inprogress = 0; |
162 | wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, val: *(u64 *)&emu_status); |
163 | |
164 | rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); |
165 | tsc_khz = div64_u64(dividend: freq, divisor: 1000); |
166 | } |
167 | EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); |
168 | |
169 | static inline bool hv_reenlightenment_available(void) |
170 | { |
171 | /* |
172 | * Check for required features and privileges to make TSC frequency |
173 | * change notifications work. |
174 | */ |
175 | return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && |
176 | ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && |
177 | ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT; |
178 | } |
179 | |
180 | DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) |
181 | { |
182 | apic_eoi(); |
183 | inc_irq_stat(irq_hv_reenlightenment_count); |
184 | schedule_delayed_work(dwork: &hv_reenlightenment_work, HZ/10); |
185 | } |
186 | |
187 | void set_hv_tscchange_cb(void (*cb)(void)) |
188 | { |
189 | struct hv_reenlightenment_control re_ctrl = { |
190 | .vector = HYPERV_REENLIGHTENMENT_VECTOR, |
191 | .enabled = 1, |
192 | }; |
193 | struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; |
194 | |
195 | if (!hv_reenlightenment_available()) { |
196 | pr_warn("reenlightenment support is unavailable\n" ); |
197 | return; |
198 | } |
199 | |
200 | if (!hv_vp_index) |
201 | return; |
202 | |
203 | hv_reenlightenment_cb = cb; |
204 | |
205 | /* Make sure callback is registered before we write to MSRs */ |
206 | wmb(); |
207 | |
208 | re_ctrl.target_vp = hv_vp_index[get_cpu()]; |
209 | |
210 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, val: *((u64 *)&re_ctrl)); |
211 | wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, val: *((u64 *)&emu_ctrl)); |
212 | |
213 | put_cpu(); |
214 | } |
215 | EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); |
216 | |
217 | void clear_hv_tscchange_cb(void) |
218 | { |
219 | struct hv_reenlightenment_control re_ctrl; |
220 | |
221 | if (!hv_reenlightenment_available()) |
222 | return; |
223 | |
224 | rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); |
225 | re_ctrl.enabled = 0; |
226 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, val: *(u64 *)&re_ctrl); |
227 | |
228 | hv_reenlightenment_cb = NULL; |
229 | } |
230 | EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); |
231 | |
232 | static int hv_cpu_die(unsigned int cpu) |
233 | { |
234 | struct hv_reenlightenment_control re_ctrl; |
235 | unsigned int new_cpu; |
236 | void **ghcb_va; |
237 | |
238 | if (hv_ghcb_pg) { |
239 | ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg); |
240 | if (*ghcb_va) |
241 | iounmap(addr: *ghcb_va); |
242 | *ghcb_va = NULL; |
243 | } |
244 | |
245 | hv_common_cpu_die(cpu); |
246 | |
247 | if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { |
248 | union hv_vp_assist_msr_contents msr = { 0 }; |
249 | if (hv_root_partition) { |
250 | /* |
251 | * For root partition the VP assist page is mapped to |
252 | * hypervisor provided page, and thus we unmap the |
253 | * page here and nullify it, so that in future we have |
254 | * correct page address mapped in hv_cpu_init. |
255 | */ |
256 | memunmap(addr: hv_vp_assist_page[cpu]); |
257 | hv_vp_assist_page[cpu] = NULL; |
258 | rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); |
259 | msr.enable = 0; |
260 | } |
261 | wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val: msr.as_uint64); |
262 | } |
263 | |
264 | if (hv_reenlightenment_cb == NULL) |
265 | return 0; |
266 | |
267 | rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); |
268 | if (re_ctrl.target_vp == hv_vp_index[cpu]) { |
269 | /* |
270 | * Reassign reenlightenment notifications to some other online |
271 | * CPU or just disable the feature if there are no online CPUs |
272 | * left (happens on hibernation). |
273 | */ |
274 | new_cpu = cpumask_any_but(cpu_online_mask, cpu); |
275 | |
276 | if (new_cpu < nr_cpu_ids) |
277 | re_ctrl.target_vp = hv_vp_index[new_cpu]; |
278 | else |
279 | re_ctrl.enabled = 0; |
280 | |
281 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, val: *((u64 *)&re_ctrl)); |
282 | } |
283 | |
284 | return 0; |
285 | } |
286 | |
287 | static int __init hv_pci_init(void) |
288 | { |
289 | int gen2vm = efi_enabled(EFI_BOOT); |
290 | |
291 | /* |
292 | * For Generation-2 VM, we exit from pci_arch_init() by returning 0. |
293 | * The purpose is to suppress the harmless warning: |
294 | * "PCI: Fatal: No config space access function found" |
295 | */ |
296 | if (gen2vm) |
297 | return 0; |
298 | |
299 | /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ |
300 | return 1; |
301 | } |
302 | |
303 | static int hv_suspend(void) |
304 | { |
305 | union hv_x64_msr_hypercall_contents hypercall_msr; |
306 | int ret; |
307 | |
308 | if (hv_root_partition) |
309 | return -EPERM; |
310 | |
311 | /* |
312 | * Reset the hypercall page as it is going to be invalidated |
313 | * across hibernation. Setting hv_hypercall_pg to NULL ensures |
314 | * that any subsequent hypercall operation fails safely instead of |
315 | * crashing due to an access of an invalid page. The hypercall page |
316 | * pointer is restored on resume. |
317 | */ |
318 | hv_hypercall_pg_saved = hv_hypercall_pg; |
319 | hv_hypercall_pg = NULL; |
320 | |
321 | /* Disable the hypercall page in the hypervisor */ |
322 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); |
323 | hypercall_msr.enable = 0; |
324 | wrmsrl(HV_X64_MSR_HYPERCALL, val: hypercall_msr.as_uint64); |
325 | |
326 | ret = hv_cpu_die(cpu: 0); |
327 | return ret; |
328 | } |
329 | |
330 | static void hv_resume(void) |
331 | { |
332 | union hv_x64_msr_hypercall_contents hypercall_msr; |
333 | int ret; |
334 | |
335 | ret = hv_cpu_init(cpu: 0); |
336 | WARN_ON(ret); |
337 | |
338 | /* Re-enable the hypercall page */ |
339 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); |
340 | hypercall_msr.enable = 1; |
341 | hypercall_msr.guest_physical_address = |
342 | vmalloc_to_pfn(addr: hv_hypercall_pg_saved); |
343 | wrmsrl(HV_X64_MSR_HYPERCALL, val: hypercall_msr.as_uint64); |
344 | |
345 | hv_hypercall_pg = hv_hypercall_pg_saved; |
346 | hv_hypercall_pg_saved = NULL; |
347 | |
348 | /* |
349 | * Reenlightenment notifications are disabled by hv_cpu_die(0), |
350 | * reenable them here if hv_reenlightenment_cb was previously set. |
351 | */ |
352 | if (hv_reenlightenment_cb) |
353 | set_hv_tscchange_cb(hv_reenlightenment_cb); |
354 | } |
355 | |
356 | /* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ |
357 | static struct syscore_ops hv_syscore_ops = { |
358 | .suspend = hv_suspend, |
359 | .resume = hv_resume, |
360 | }; |
361 | |
362 | static void (* __initdata old_setup_percpu_clockev)(void); |
363 | |
364 | static void __init hv_stimer_setup_percpu_clockev(void) |
365 | { |
366 | /* |
367 | * Ignore any errors in setting up stimer clockevents |
368 | * as we can run with the LAPIC timer as a fallback. |
369 | */ |
370 | (void)hv_stimer_alloc(have_percpu_irqs: false); |
371 | |
372 | /* |
373 | * Still register the LAPIC timer, because the direct-mode STIMER is |
374 | * not supported by old versions of Hyper-V. This also allows users |
375 | * to switch to LAPIC timer via /sys, if they want to. |
376 | */ |
377 | if (old_setup_percpu_clockev) |
378 | old_setup_percpu_clockev(); |
379 | } |
380 | |
381 | static void __init hv_get_partition_id(void) |
382 | { |
383 | struct hv_get_partition_id *output_page; |
384 | u64 status; |
385 | unsigned long flags; |
386 | |
387 | local_irq_save(flags); |
388 | output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); |
389 | status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, outputaddr: output_page); |
390 | if (!hv_result_success(status)) { |
391 | /* No point in proceeding if this failed */ |
392 | pr_err("Failed to get partition ID: %lld\n" , status); |
393 | BUG(); |
394 | } |
395 | hv_current_partition_id = output_page->partition_id; |
396 | local_irq_restore(flags); |
397 | } |
398 | |
399 | #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) |
400 | static u8 __init get_vtl(void) |
401 | { |
402 | u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; |
403 | struct hv_get_vp_registers_input *input; |
404 | struct hv_get_vp_registers_output *output; |
405 | unsigned long flags; |
406 | u64 ret; |
407 | |
408 | local_irq_save(flags); |
409 | input = *this_cpu_ptr(hyperv_pcpu_input_arg); |
410 | output = (struct hv_get_vp_registers_output *)input; |
411 | |
412 | memset(input, 0, struct_size(input, element, 1)); |
413 | input->header.partitionid = HV_PARTITION_ID_SELF; |
414 | input->header.vpindex = HV_VP_INDEX_SELF; |
415 | input->header.inputvtl = 0; |
416 | input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS; |
417 | |
418 | ret = hv_do_hypercall(control, inputaddr: input, outputaddr: output); |
419 | if (hv_result_success(status: ret)) { |
420 | ret = output->as64.low & HV_X64_VTL_MASK; |
421 | } else { |
422 | pr_err("Failed to get VTL(error: %lld) exiting...\n" , ret); |
423 | BUG(); |
424 | } |
425 | |
426 | local_irq_restore(flags); |
427 | return ret; |
428 | } |
429 | #else |
430 | static inline u8 get_vtl(void) { return 0; } |
431 | #endif |
432 | |
433 | /* |
434 | * This function is to be invoked early in the boot sequence after the |
435 | * hypervisor has been detected. |
436 | * |
437 | * 1. Setup the hypercall page. |
438 | * 2. Register Hyper-V specific clocksource. |
439 | * 3. Setup Hyper-V specific APIC entry points. |
440 | */ |
441 | void __init hyperv_init(void) |
442 | { |
443 | u64 guest_id; |
444 | union hv_x64_msr_hypercall_contents hypercall_msr; |
445 | int cpuhp; |
446 | |
447 | if (x86_hyper_type != X86_HYPER_MS_HYPERV) |
448 | return; |
449 | |
450 | if (hv_common_init()) |
451 | return; |
452 | |
453 | /* |
454 | * The VP assist page is useless to a TDX guest: the only use we |
455 | * would have for it is lazy EOI, which can not be used with TDX. |
456 | */ |
457 | if (hv_isolation_type_tdx()) |
458 | hv_vp_assist_page = NULL; |
459 | else |
460 | hv_vp_assist_page = kcalloc(num_possible_cpus(), |
461 | size: sizeof(*hv_vp_assist_page), |
462 | GFP_KERNEL); |
463 | if (!hv_vp_assist_page) { |
464 | ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; |
465 | |
466 | if (!hv_isolation_type_tdx()) |
467 | goto common_free; |
468 | } |
469 | |
470 | if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) { |
471 | /* Negotiate GHCB Version. */ |
472 | if (!hv_ghcb_negotiate_protocol()) |
473 | hv_ghcb_terminate(SEV_TERM_SET_GEN, |
474 | GHCB_SEV_ES_PROT_UNSUPPORTED); |
475 | |
476 | hv_ghcb_pg = alloc_percpu(union hv_ghcb *); |
477 | if (!hv_ghcb_pg) |
478 | goto free_vp_assist_page; |
479 | } |
480 | |
481 | cpuhp = cpuhp_setup_state(state: CPUHP_AP_HYPERV_ONLINE, name: "x86/hyperv_init:online" , |
482 | startup: hv_cpu_init, teardown: hv_cpu_die); |
483 | if (cpuhp < 0) |
484 | goto free_ghcb_page; |
485 | |
486 | /* |
487 | * Setup the hypercall page and enable hypercalls. |
488 | * 1. Register the guest ID |
489 | * 2. Enable the hypercall and register the hypercall page |
490 | * |
491 | * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg: |
492 | * when the hypercall input is a page, such a VM must pass a decrypted |
493 | * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page |
494 | * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present. |
495 | * |
496 | * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls, |
497 | * which are handled by the paravisor and the VM must use an encrypted |
498 | * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and |
499 | * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and |
500 | * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls: |
501 | * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8(). |
502 | * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e. |
503 | * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg; |
504 | * instead, hv_post_message() uses the post_msg_page, which is decrypted |
505 | * in such a VM and is only used in such a VM. |
506 | */ |
507 | guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); |
508 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, val: guest_id); |
509 | |
510 | /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ |
511 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, value: guest_id); |
512 | |
513 | /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */ |
514 | if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) |
515 | goto skip_hypercall_pg_init; |
516 | |
517 | hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, align: 1, VMALLOC_START, |
518 | VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, |
519 | VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, |
520 | caller: __builtin_return_address(0)); |
521 | if (hv_hypercall_pg == NULL) |
522 | goto clean_guest_os_id; |
523 | |
524 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); |
525 | hypercall_msr.enable = 1; |
526 | |
527 | if (hv_root_partition) { |
528 | struct page *pg; |
529 | void *src; |
530 | |
531 | /* |
532 | * For the root partition, the hypervisor will set up its |
533 | * hypercall page. The hypervisor guarantees it will not show |
534 | * up in the root's address space. The root can't change the |
535 | * location of the hypercall page. |
536 | * |
537 | * Order is important here. We must enable the hypercall page |
538 | * so it is populated with code, then copy the code to an |
539 | * executable page. |
540 | */ |
541 | wrmsrl(HV_X64_MSR_HYPERCALL, val: hypercall_msr.as_uint64); |
542 | |
543 | pg = vmalloc_to_page(addr: hv_hypercall_pg); |
544 | src = memremap(offset: hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, |
545 | flags: MEMREMAP_WB); |
546 | BUG_ON(!src); |
547 | memcpy_to_page(page: pg, offset: 0, from: src, HV_HYP_PAGE_SIZE); |
548 | memunmap(addr: src); |
549 | |
550 | hv_remap_tsc_clocksource(); |
551 | } else { |
552 | hypercall_msr.guest_physical_address = vmalloc_to_pfn(addr: hv_hypercall_pg); |
553 | wrmsrl(HV_X64_MSR_HYPERCALL, val: hypercall_msr.as_uint64); |
554 | } |
555 | |
556 | skip_hypercall_pg_init: |
557 | /* |
558 | * Some versions of Hyper-V that provide IBT in guest VMs have a bug |
559 | * in that there's no ENDBR64 instruction at the entry to the |
560 | * hypercall page. Because hypercalls are invoked via an indirect call |
561 | * to the hypercall page, all hypercall attempts fail when IBT is |
562 | * enabled, and Linux panics. For such buggy versions, disable IBT. |
563 | * |
564 | * Fixed versions of Hyper-V always provide ENDBR64 on the hypercall |
565 | * page, so if future Linux kernel versions enable IBT for 32-bit |
566 | * builds, additional hypercall page hackery will be required here |
567 | * to provide an ENDBR32. |
568 | */ |
569 | #ifdef CONFIG_X86_KERNEL_IBT |
570 | if (cpu_feature_enabled(X86_FEATURE_IBT) && |
571 | *(u32 *)hv_hypercall_pg != gen_endbr()) { |
572 | setup_clear_cpu_cap(X86_FEATURE_IBT); |
573 | pr_warn("Disabling IBT because of Hyper-V bug\n" ); |
574 | } |
575 | #endif |
576 | |
577 | /* |
578 | * hyperv_init() is called before LAPIC is initialized: see |
579 | * apic_intr_mode_init() -> x86_platform.apic_post_init() and |
580 | * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER |
581 | * depends on LAPIC, so hv_stimer_alloc() should be called from |
582 | * x86_init.timers.setup_percpu_clockev. |
583 | */ |
584 | old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; |
585 | x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; |
586 | |
587 | hv_apic_init(); |
588 | |
589 | x86_init.pci.arch_init = hv_pci_init; |
590 | |
591 | register_syscore_ops(ops: &hv_syscore_ops); |
592 | |
593 | hyperv_init_cpuhp = cpuhp; |
594 | |
595 | if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) |
596 | hv_get_partition_id(); |
597 | |
598 | BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); |
599 | |
600 | #ifdef CONFIG_PCI_MSI |
601 | /* |
602 | * If we're running as root, we want to create our own PCI MSI domain. |
603 | * We can't set this in hv_pci_init because that would be too late. |
604 | */ |
605 | if (hv_root_partition) |
606 | x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; |
607 | #endif |
608 | |
609 | /* Query the VMs extended capability once, so that it can be cached. */ |
610 | hv_query_ext_cap(cap_query: 0); |
611 | |
612 | /* Find the VTL */ |
613 | ms_hyperv.vtl = get_vtl(); |
614 | |
615 | if (ms_hyperv.vtl > 0) /* non default VTL */ |
616 | hv_vtl_early_init(); |
617 | |
618 | return; |
619 | |
620 | clean_guest_os_id: |
621 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, val: 0); |
622 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, value: 0); |
623 | cpuhp_remove_state(state: cpuhp); |
624 | free_ghcb_page: |
625 | free_percpu(pdata: hv_ghcb_pg); |
626 | free_vp_assist_page: |
627 | kfree(objp: hv_vp_assist_page); |
628 | hv_vp_assist_page = NULL; |
629 | common_free: |
630 | hv_common_free(); |
631 | } |
632 | |
633 | /* |
634 | * This routine is called before kexec/kdump, it does the required cleanup. |
635 | */ |
636 | void hyperv_cleanup(void) |
637 | { |
638 | union hv_x64_msr_hypercall_contents hypercall_msr; |
639 | union hv_reference_tsc_msr tsc_msr; |
640 | |
641 | /* Reset our OS id */ |
642 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, val: 0); |
643 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, value: 0); |
644 | |
645 | /* |
646 | * Reset hypercall page reference before reset the page, |
647 | * let hypercall operations fail safely rather than |
648 | * panic the kernel for using invalid hypercall page |
649 | */ |
650 | hv_hypercall_pg = NULL; |
651 | |
652 | /* Reset the hypercall page */ |
653 | hypercall_msr.as_uint64 = hv_get_register(HV_X64_MSR_HYPERCALL); |
654 | hypercall_msr.enable = 0; |
655 | hv_set_register(HV_X64_MSR_HYPERCALL, value: hypercall_msr.as_uint64); |
656 | |
657 | /* Reset the TSC page */ |
658 | tsc_msr.as_uint64 = hv_get_register(HV_X64_MSR_REFERENCE_TSC); |
659 | tsc_msr.enable = 0; |
660 | hv_set_register(HV_X64_MSR_REFERENCE_TSC, value: tsc_msr.as_uint64); |
661 | } |
662 | |
663 | void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) |
664 | { |
665 | static bool panic_reported; |
666 | u64 guest_id; |
667 | |
668 | if (in_die && !panic_on_oops) |
669 | return; |
670 | |
671 | /* |
672 | * We prefer to report panic on 'die' chain as we have proper |
673 | * registers to report, but if we miss it (e.g. on BUG()) we need |
674 | * to report it on 'panic'. |
675 | */ |
676 | if (panic_reported) |
677 | return; |
678 | panic_reported = true; |
679 | |
680 | rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); |
681 | |
682 | wrmsrl(HV_X64_MSR_CRASH_P0, val: err); |
683 | wrmsrl(HV_X64_MSR_CRASH_P1, val: guest_id); |
684 | wrmsrl(HV_X64_MSR_CRASH_P2, val: regs->ip); |
685 | wrmsrl(HV_X64_MSR_CRASH_P3, val: regs->ax); |
686 | wrmsrl(HV_X64_MSR_CRASH_P4, val: regs->sp); |
687 | |
688 | /* |
689 | * Let Hyper-V know there is crash data available |
690 | */ |
691 | wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); |
692 | } |
693 | EXPORT_SYMBOL_GPL(hyperv_report_panic); |
694 | |
695 | bool hv_is_hyperv_initialized(void) |
696 | { |
697 | union hv_x64_msr_hypercall_contents hypercall_msr; |
698 | |
699 | /* |
700 | * Ensure that we're really on Hyper-V, and not a KVM or Xen |
701 | * emulation of Hyper-V |
702 | */ |
703 | if (x86_hyper_type != X86_HYPER_MS_HYPERV) |
704 | return false; |
705 | |
706 | /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */ |
707 | if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) |
708 | return true; |
709 | /* |
710 | * Verify that earlier initialization succeeded by checking |
711 | * that the hypercall page is setup |
712 | */ |
713 | hypercall_msr.as_uint64 = 0; |
714 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); |
715 | |
716 | return hypercall_msr.enable; |
717 | } |
718 | EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); |
719 | |