| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * HyperV Detection code. |
| 4 | * |
| 5 | * Copyright (C) 2010, Novell, Inc. |
| 6 | * Author : K. Y. Srinivasan <ksrinivasan@novell.com> |
| 7 | */ |
| 8 | |
| 9 | #include <linux/types.h> |
| 10 | #include <linux/time.h> |
| 11 | #include <linux/clocksource.h> |
| 12 | #include <linux/init.h> |
| 13 | #include <linux/export.h> |
| 14 | #include <linux/hardirq.h> |
| 15 | #include <linux/efi.h> |
| 16 | #include <linux/interrupt.h> |
| 17 | #include <linux/irq.h> |
| 18 | #include <linux/kexec.h> |
| 19 | #include <linux/random.h> |
| 20 | #include <asm/processor.h> |
| 21 | #include <asm/hypervisor.h> |
| 22 | #include <hyperv/hvhdk.h> |
| 23 | #include <asm/mshyperv.h> |
| 24 | #include <asm/desc.h> |
| 25 | #include <asm/idtentry.h> |
| 26 | #include <asm/irq_regs.h> |
| 27 | #include <asm/i8259.h> |
| 28 | #include <asm/apic.h> |
| 29 | #include <asm/timer.h> |
| 30 | #include <asm/reboot.h> |
| 31 | #include <asm/msr.h> |
| 32 | #include <asm/nmi.h> |
| 33 | #include <clocksource/hyperv_timer.h> |
| 34 | #include <asm/numa.h> |
| 35 | #include <asm/svm.h> |
| 36 | |
| 37 | /* Is Linux running on nested Microsoft Hypervisor */ |
| 38 | bool hv_nested; |
| 39 | struct ms_hyperv_info ms_hyperv; |
| 40 | |
| 41 | #if IS_ENABLED(CONFIG_HYPERV) |
/*
 * With a paravisor present, this flag decides whether synthetic interrupts
 * coming from the host are proxied. It is copied into the "proxy" bit of
 * every SINT value written by hv_set_non_nested_msr().
 */
static bool hv_para_sint_proxy;
| 47 | |
| 48 | static inline unsigned int hv_get_nested_msr(unsigned int reg) |
| 49 | { |
| 50 | if (hv_is_sint_msr(reg)) |
| 51 | return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0; |
| 52 | |
| 53 | switch (reg) { |
| 54 | case HV_X64_MSR_SIMP: |
| 55 | return HV_X64_MSR_NESTED_SIMP; |
| 56 | case HV_X64_MSR_SIEFP: |
| 57 | return HV_X64_MSR_NESTED_SIEFP; |
| 58 | case HV_X64_MSR_SVERSION: |
| 59 | return HV_X64_MSR_NESTED_SVERSION; |
| 60 | case HV_X64_MSR_SCONTROL: |
| 61 | return HV_X64_MSR_NESTED_SCONTROL; |
| 62 | case HV_X64_MSR_EOM: |
| 63 | return HV_X64_MSR_NESTED_EOM; |
| 64 | default: |
| 65 | return reg; |
| 66 | } |
| 67 | } |
| 68 | |
| 69 | u64 hv_get_non_nested_msr(unsigned int reg) |
| 70 | { |
| 71 | u64 value; |
| 72 | |
| 73 | if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) |
| 74 | hv_ivm_msr_read(msr: reg, value: &value); |
| 75 | else |
| 76 | rdmsrq(reg, value); |
| 77 | return value; |
| 78 | } |
| 79 | EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); |
| 80 | |
| 81 | void hv_set_non_nested_msr(unsigned int reg, u64 value) |
| 82 | { |
| 83 | if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) { |
| 84 | /* The hypervisor will get the intercept. */ |
| 85 | hv_ivm_msr_write(msr: reg, value); |
| 86 | |
| 87 | /* Using wrmsrq so the following goes to the paravisor. */ |
| 88 | if (hv_is_sint_msr(reg)) { |
| 89 | union hv_synic_sint sint = { .as_uint64 = value }; |
| 90 | |
| 91 | sint.proxy = hv_para_sint_proxy; |
| 92 | native_wrmsrq(reg, sint.as_uint64); |
| 93 | } |
| 94 | } else { |
| 95 | native_wrmsrq(reg, value); |
| 96 | } |
| 97 | } |
| 98 | EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); |
| 99 | |
| 100 | /* |
| 101 | * Enable or disable proxying synthetic interrupts |
| 102 | * to the paravisor. |
| 103 | */ |
| 104 | void hv_para_set_sint_proxy(bool enable) |
| 105 | { |
| 106 | hv_para_sint_proxy = enable; |
| 107 | } |
| 108 | |
| 109 | /* |
| 110 | * Get the SynIC register value from the paravisor. |
| 111 | */ |
| 112 | u64 hv_para_get_synic_register(unsigned int reg) |
| 113 | { |
| 114 | if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg))) |
| 115 | return ~0ULL; |
| 116 | return native_read_msr(msr: reg); |
| 117 | } |
| 118 | |
| 119 | /* |
| 120 | * Set the SynIC register value with the paravisor. |
| 121 | */ |
| 122 | void hv_para_set_synic_register(unsigned int reg, u64 val) |
| 123 | { |
| 124 | if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg))) |
| 125 | return; |
| 126 | native_write_msr(msr: reg, val); |
| 127 | } |
| 128 | |
| 129 | u64 hv_get_msr(unsigned int reg) |
| 130 | { |
| 131 | if (hv_nested) |
| 132 | reg = hv_get_nested_msr(reg); |
| 133 | |
| 134 | return hv_get_non_nested_msr(reg); |
| 135 | } |
| 136 | EXPORT_SYMBOL_GPL(hv_get_msr); |
| 137 | |
| 138 | void hv_set_msr(unsigned int reg, u64 value) |
| 139 | { |
| 140 | if (hv_nested) |
| 141 | reg = hv_get_nested_msr(reg); |
| 142 | |
| 143 | hv_set_non_nested_msr(reg, value); |
| 144 | } |
| 145 | EXPORT_SYMBOL_GPL(hv_set_msr); |
| 146 | |
/*
 * Optional callbacks registered through the hv_setup_*() helpers and
 * cleared by the matching hv_remove_*() helpers. Each one is checked for
 * NULL before it is invoked.
 */
static void (*mshv_handler)(void);		/* hypervisor callback vector */
static void (*vmbus_handler)(void);		/* hypervisor callback vector */
static void (*hv_stimer0_handler)(void);	/* stimer0 vector */
static void (*hv_kexec_handler)(void);		/* kexec shutdown path */
static void (*hv_crash_handler)(struct pt_regs *regs);	/* crash shutdown path */
| 152 | |
| 153 | DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) |
| 154 | { |
| 155 | struct pt_regs *old_regs = set_irq_regs(regs); |
| 156 | |
| 157 | inc_irq_stat(irq_hv_callback_count); |
| 158 | if (mshv_handler) |
| 159 | mshv_handler(); |
| 160 | |
| 161 | if (vmbus_handler) |
| 162 | vmbus_handler(); |
| 163 | |
| 164 | if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) |
| 165 | apic_eoi(); |
| 166 | |
| 167 | set_irq_regs(old_regs); |
| 168 | } |
| 169 | |
| 170 | void hv_setup_mshv_handler(void (*handler)(void)) |
| 171 | { |
| 172 | mshv_handler = handler; |
| 173 | } |
| 174 | |
| 175 | void hv_setup_vmbus_handler(void (*handler)(void)) |
| 176 | { |
| 177 | vmbus_handler = handler; |
| 178 | } |
| 179 | |
| 180 | void hv_remove_vmbus_handler(void) |
| 181 | { |
| 182 | /* We have no way to deallocate the interrupt gate */ |
| 183 | vmbus_handler = NULL; |
| 184 | } |
| 185 | |
| 186 | /* |
| 187 | * Routines to do per-architecture handling of stimer0 |
| 188 | * interrupts when in Direct Mode |
| 189 | */ |
| 190 | DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) |
| 191 | { |
| 192 | struct pt_regs *old_regs = set_irq_regs(regs); |
| 193 | |
| 194 | inc_irq_stat(hyperv_stimer0_count); |
| 195 | if (hv_stimer0_handler) |
| 196 | hv_stimer0_handler(); |
| 197 | add_interrupt_randomness(HYPERV_STIMER0_VECTOR); |
| 198 | apic_eoi(); |
| 199 | |
| 200 | set_irq_regs(old_regs); |
| 201 | } |
| 202 | |
| 203 | /* For x86/x64, override weak placeholders in hyperv_timer.c */ |
| 204 | void hv_setup_stimer0_handler(void (*handler)(void)) |
| 205 | { |
| 206 | hv_stimer0_handler = handler; |
| 207 | } |
| 208 | |
| 209 | void hv_remove_stimer0_handler(void) |
| 210 | { |
| 211 | /* We have no way to deallocate the interrupt gate */ |
| 212 | hv_stimer0_handler = NULL; |
| 213 | } |
| 214 | |
| 215 | void hv_setup_kexec_handler(void (*handler)(void)) |
| 216 | { |
| 217 | hv_kexec_handler = handler; |
| 218 | } |
| 219 | |
| 220 | void hv_remove_kexec_handler(void) |
| 221 | { |
| 222 | hv_kexec_handler = NULL; |
| 223 | } |
| 224 | |
| 225 | void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) |
| 226 | { |
| 227 | hv_crash_handler = handler; |
| 228 | } |
| 229 | |
| 230 | void hv_remove_crash_handler(void) |
| 231 | { |
| 232 | hv_crash_handler = NULL; |
| 233 | } |
| 234 | |
| 235 | #ifdef CONFIG_KEXEC_CORE |
| 236 | static void hv_machine_shutdown(void) |
| 237 | { |
| 238 | if (kexec_in_progress && hv_kexec_handler) |
| 239 | hv_kexec_handler(); |
| 240 | |
| 241 | /* |
| 242 | * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor |
| 243 | * corrupts the old VP Assist Pages and can crash the kexec kernel. |
| 244 | */ |
| 245 | if (kexec_in_progress) |
| 246 | cpuhp_remove_state(state: CPUHP_AP_HYPERV_ONLINE); |
| 247 | |
| 248 | /* The function calls stop_other_cpus(). */ |
| 249 | native_machine_shutdown(); |
| 250 | |
| 251 | /* Disable the hypercall page when there is only 1 active CPU. */ |
| 252 | if (kexec_in_progress) |
| 253 | hyperv_cleanup(); |
| 254 | } |
| 255 | #endif /* CONFIG_KEXEC_CORE */ |
| 256 | |
| 257 | #ifdef CONFIG_CRASH_DUMP |
| 258 | static void hv_guest_crash_shutdown(struct pt_regs *regs) |
| 259 | { |
| 260 | if (hv_crash_handler) |
| 261 | hv_crash_handler(regs); |
| 262 | |
| 263 | /* The function calls crash_smp_send_stop(). */ |
| 264 | native_machine_crash_shutdown(regs); |
| 265 | |
| 266 | /* Disable the hypercall page when there is only 1 active CPU. */ |
| 267 | hyperv_cleanup(); |
| 268 | } |
| 269 | #endif /* CONFIG_CRASH_DUMP */ |
| 270 | |
| 271 | static u64 hv_ref_counter_at_suspend; |
| 272 | static void (*old_save_sched_clock_state)(void); |
| 273 | static void (*old_restore_sched_clock_state)(void); |
| 274 | |
| 275 | /* |
| 276 | * Hyper-V clock counter resets during hibernation. Save and restore clock |
| 277 | * offset during suspend/resume, while also considering the time passed |
| 278 | * before suspend. This is to make sure that sched_clock using hv tsc page |
| 279 | * based clocksource, proceeds from where it left off during suspend and |
| 280 | * it shows correct time for the timestamps of kernel messages after resume. |
| 281 | */ |
| 282 | static void save_hv_clock_tsc_state(void) |
| 283 | { |
| 284 | hv_ref_counter_at_suspend = hv_read_reference_counter(); |
| 285 | } |
| 286 | |
| 287 | static void restore_hv_clock_tsc_state(void) |
| 288 | { |
| 289 | /* |
| 290 | * Adjust the offsets used by hv tsc clocksource to |
| 291 | * account for the time spent before hibernation. |
| 292 | * adjusted value = reference counter (time) at suspend |
| 293 | * - reference counter (time) now. |
| 294 | */ |
| 295 | hv_adj_sched_clock_offset(offset: hv_ref_counter_at_suspend - hv_read_reference_counter()); |
| 296 | } |
| 297 | |
| 298 | /* |
| 299 | * Functions to override save_sched_clock_state and restore_sched_clock_state |
| 300 | * functions of x86_platform. The Hyper-V clock counter is reset during |
| 301 | * suspend-resume and the offset used to measure time needs to be |
| 302 | * corrected, post resume. |
| 303 | */ |
| 304 | static void hv_save_sched_clock_state(void) |
| 305 | { |
| 306 | old_save_sched_clock_state(); |
| 307 | save_hv_clock_tsc_state(); |
| 308 | } |
| 309 | |
/*
 * Restore side of the Hyper-V sched-clock hooks: correct the hv tsc clock
 * offset first, then run the previously installed restore hook.
 */
static void hv_restore_sched_clock_state(void)
{
	restore_hv_clock_tsc_state();

	old_restore_sched_clock_state();
}
| 315 | |
| 316 | static void __init x86_setup_ops_for_tsc_pg_clock(void) |
| 317 | { |
| 318 | if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) |
| 319 | return; |
| 320 | |
| 321 | old_save_sched_clock_state = x86_platform.save_sched_clock_state; |
| 322 | x86_platform.save_sched_clock_state = hv_save_sched_clock_state; |
| 323 | |
| 324 | old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; |
| 325 | x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; |
| 326 | } |
| 327 | |
#ifdef CONFIG_X86_64
/*
 * Static call used to issue hypercalls; it starts out pointing at
 * hv_std_hypercall and can be retargeted with hypercall_update().
 */
DEFINE_STATIC_CALL(hv_hypercall, hv_std_hypercall);
EXPORT_STATIC_CALL_TRAMP_GPL(hv_hypercall);

/* Point the hv_hypercall static call at a different implementation. */
#define hypercall_update(hc) static_call_update(hv_hypercall, hc)
#endif
| 333 | #endif /* CONFIG_HYPERV */ |
| 334 | |
#ifndef hypercall_update
/*
 * Fallback used when no static-call based hypercall_update() was defined
 * earlier in this file: evaluate the argument and discard it.
 * Both the argument and the whole expansion are parenthesized so the
 * macro is safe with any expression and in any expression context.
 */
#define hypercall_update(hc) ((void)(hc))
#endif
| 338 | |
| 339 | static uint32_t __init ms_hyperv_platform(void) |
| 340 | { |
| 341 | u32 eax; |
| 342 | u32 hyp_signature[3]; |
| 343 | |
| 344 | if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) |
| 345 | return 0; |
| 346 | |
| 347 | cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, |
| 348 | eax: &eax, ebx: &hyp_signature[0], ecx: &hyp_signature[1], edx: &hyp_signature[2]); |
| 349 | |
| 350 | if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX || |
| 351 | memcmp(p: "Microsoft Hv" , q: hyp_signature, size: 12)) |
| 352 | return 0; |
| 353 | |
| 354 | /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */ |
| 355 | eax = cpuid_eax(HYPERV_CPUID_FEATURES); |
| 356 | if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) { |
| 357 | pr_warn("x86/hyperv: HYPERCALL MSR not available.\n" ); |
| 358 | return 0; |
| 359 | } |
| 360 | if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) { |
| 361 | pr_warn("x86/hyperv: VP_INDEX MSR not available.\n" ); |
| 362 | return 0; |
| 363 | } |
| 364 | |
| 365 | return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; |
| 366 | } |
| 367 | |
| 368 | #ifdef CONFIG_X86_LOCAL_APIC |
| 369 | /* |
| 370 | * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes |
| 371 | * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle |
| 372 | * unknown NMI on the first CPU which gets it. |
| 373 | */ |
| 374 | static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) |
| 375 | { |
| 376 | static atomic_t nmi_cpu = ATOMIC_INIT(-1); |
| 377 | unsigned int old_cpu, this_cpu; |
| 378 | |
| 379 | if (!unknown_nmi_panic) |
| 380 | return NMI_DONE; |
| 381 | |
| 382 | old_cpu = -1; |
| 383 | this_cpu = raw_smp_processor_id(); |
| 384 | if (!atomic_try_cmpxchg(v: &nmi_cpu, old: &old_cpu, new: this_cpu)) |
| 385 | return NMI_HANDLED; |
| 386 | |
| 387 | return NMI_DONE; |
| 388 | } |
| 389 | #endif |
| 390 | |
| 391 | static unsigned long hv_get_tsc_khz(void) |
| 392 | { |
| 393 | unsigned long freq; |
| 394 | |
| 395 | rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); |
| 396 | |
| 397 | return freq / 1000; |
| 398 | } |
| 399 | |
| 400 | #if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV) |
| 401 | static void __init hv_smp_prepare_boot_cpu(void) |
| 402 | { |
| 403 | native_smp_prepare_boot_cpu(); |
| 404 | #if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS) |
| 405 | hv_init_spinlocks(); |
| 406 | #endif |
| 407 | } |
| 408 | |
| 409 | static void __init hv_smp_prepare_cpus(unsigned int max_cpus) |
| 410 | { |
| 411 | #ifdef CONFIG_X86_64 |
| 412 | int i; |
| 413 | int ret; |
| 414 | #endif |
| 415 | |
| 416 | native_smp_prepare_cpus(max_cpus); |
| 417 | |
| 418 | /* |
| 419 | * Override wakeup_secondary_cpu_64 callback for SEV-SNP |
| 420 | * enlightened guest. |
| 421 | */ |
| 422 | if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) { |
| 423 | apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap; |
| 424 | return; |
| 425 | } |
| 426 | |
| 427 | #ifdef CONFIG_X86_64 |
| 428 | for_each_present_cpu(i) { |
| 429 | if (i == 0) |
| 430 | continue; |
| 431 | ret = hv_call_add_logical_proc(node: numa_cpu_node(cpu: i), lp_index: i, cpu_physical_id(i)); |
| 432 | BUG_ON(ret); |
| 433 | } |
| 434 | |
| 435 | for_each_present_cpu(i) { |
| 436 | if (i == 0) |
| 437 | continue; |
| 438 | ret = hv_call_create_vp(node: numa_cpu_node(cpu: i), partition_id: hv_current_partition_id, vp_index: i, flags: i); |
| 439 | BUG_ON(ret); |
| 440 | } |
| 441 | #endif |
| 442 | } |
| 443 | #endif |
| 444 | |
| 445 | /* |
| 446 | * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the |
| 447 | * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0 |
| 448 | * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns |
| 449 | * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a |
| 450 | * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and |
| 451 | * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs |
| 452 | * from the array and hence doesn't create the necessary irq description info. |
| 453 | * |
| 454 | * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here, |
| 455 | * except don't change 'legacy_pic', which keeps its default value |
| 456 | * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero |
| 457 | * nr_legacy_irqs() and eventually serial console interrupts works properly. |
| 458 | */ |
| 459 | static void __init reduced_hw_init(void) |
| 460 | { |
| 461 | x86_init.timers.timer_init = x86_init_noop; |
| 462 | x86_init.irqs.pre_vector_init = x86_init_noop; |
| 463 | } |
| 464 | |
| 465 | int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) |
| 466 | { |
| 467 | unsigned int hv_max_functions; |
| 468 | |
| 469 | hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); |
| 470 | if (hv_max_functions < HYPERV_CPUID_VERSION) { |
| 471 | pr_err("%s: Could not detect Hyper-V version\n" , __func__); |
| 472 | return -ENODEV; |
| 473 | } |
| 474 | |
| 475 | cpuid(HYPERV_CPUID_VERSION, eax: &info->eax, ebx: &info->ebx, ecx: &info->ecx, edx: &info->edx); |
| 476 | |
| 477 | return 0; |
| 478 | } |
| 479 | EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); |
| 480 | |
| 481 | static void __init ms_hyperv_init_platform(void) |
| 482 | { |
| 483 | int hv_max_functions_eax, eax; |
| 484 | |
| 485 | #ifdef CONFIG_PARAVIRT |
| 486 | pv_info.name = "Hyper-V" ; |
| 487 | #endif |
| 488 | |
| 489 | /* |
| 490 | * Extract the features and hints |
| 491 | */ |
| 492 | ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); |
| 493 | ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); |
| 494 | ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES); |
| 495 | ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); |
| 496 | ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); |
| 497 | |
| 498 | hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); |
| 499 | |
| 500 | pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n" , |
| 501 | ms_hyperv.features, ms_hyperv.priv_high, |
| 502 | ms_hyperv.ext_features, ms_hyperv.hints, |
| 503 | ms_hyperv.misc_features); |
| 504 | |
| 505 | ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); |
| 506 | ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); |
| 507 | |
| 508 | pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n" , |
| 509 | ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); |
| 510 | |
| 511 | hv_identify_partition_type(); |
| 512 | |
| 513 | if (cc_platform_has(attr: CC_ATTR_SNP_SECURE_AVIC)) |
| 514 | ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED; |
| 515 | |
| 516 | if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { |
| 517 | hv_nested = true; |
| 518 | pr_info("Hyper-V: running on a nested hypervisor\n" ); |
| 519 | } |
| 520 | |
| 521 | /* |
| 522 | * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID |
| 523 | * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2 |
| 524 | * root) will not get them because the nested (L1) hypervisor filters them out. |
| 525 | * These are handled through intercept processing by the Windows Hyper-V stack |
| 526 | * or the paravisor. |
| 527 | */ |
| 528 | eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); |
| 529 | ms_hyperv.confidential_vmbus_available = |
| 530 | eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE; |
| 531 | ms_hyperv.msi_ext_dest_id = |
| 532 | eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; |
| 533 | |
| 534 | if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && |
| 535 | ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { |
| 536 | x86_platform.calibrate_tsc = hv_get_tsc_khz; |
| 537 | x86_platform.calibrate_cpu = hv_get_tsc_khz; |
| 538 | setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); |
| 539 | } |
| 540 | |
| 541 | if (ms_hyperv.priv_high & HV_ISOLATION) { |
| 542 | ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); |
| 543 | ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); |
| 544 | |
| 545 | if (ms_hyperv.shared_gpa_boundary_active) |
| 546 | ms_hyperv.shared_gpa_boundary = |
| 547 | BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); |
| 548 | |
| 549 | pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n" , |
| 550 | ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); |
| 551 | |
| 552 | |
| 553 | if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { |
| 554 | static_branch_enable(&isolation_type_snp); |
| 555 | if (!ms_hyperv.paravisor_present) |
| 556 | hypercall_update(hv_snp_hypercall); |
| 557 | } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) { |
| 558 | static_branch_enable(&isolation_type_tdx); |
| 559 | |
| 560 | /* A TDX VM must use x2APIC and doesn't use lazy EOI. */ |
| 561 | ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; |
| 562 | |
| 563 | if (!ms_hyperv.paravisor_present) { |
| 564 | hypercall_update(hv_tdx_hypercall); |
| 565 | /* |
| 566 | * Mark the Hyper-V TSC page feature as disabled |
| 567 | * in a TDX VM without paravisor so that the |
| 568 | * Invariant TSC, which is a better clocksource |
| 569 | * anyway, is used instead. |
| 570 | */ |
| 571 | ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; |
| 572 | |
| 573 | /* |
| 574 | * The Invariant TSC is expected to be available |
| 575 | * in a TDX VM without paravisor, but if not, |
| 576 | * print a warning message. The slower Hyper-V MSR-based |
| 577 | * Ref Counter should end up being the clocksource. |
| 578 | */ |
| 579 | if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) |
| 580 | pr_warn("Hyper-V: Invariant TSC is unavailable\n" ); |
| 581 | |
| 582 | /* HV_MSR_CRASH_CTL is unsupported. */ |
| 583 | ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; |
| 584 | |
| 585 | /* Don't trust Hyper-V's TLB-flushing hypercalls. */ |
| 586 | ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; |
| 587 | |
| 588 | x86_init.acpi.reduced_hw_early_init = reduced_hw_init; |
| 589 | } |
| 590 | } |
| 591 | } |
| 592 | |
| 593 | if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { |
| 594 | ms_hyperv.nested_features = |
| 595 | cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); |
| 596 | pr_info("Hyper-V: Nested features: 0x%x\n" , |
| 597 | ms_hyperv.nested_features); |
| 598 | } |
| 599 | |
| 600 | #ifdef CONFIG_X86_LOCAL_APIC |
| 601 | if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && |
| 602 | ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { |
| 603 | /* |
| 604 | * Get the APIC frequency. |
| 605 | */ |
| 606 | u64 hv_lapic_frequency; |
| 607 | |
| 608 | rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); |
| 609 | hv_lapic_frequency = div_u64(dividend: hv_lapic_frequency, HZ); |
| 610 | lapic_timer_period = hv_lapic_frequency; |
| 611 | pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n" , |
| 612 | lapic_timer_period); |
| 613 | } |
| 614 | |
| 615 | register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST, |
| 616 | "hv_nmi_unknown" ); |
| 617 | #endif |
| 618 | |
| 619 | #ifdef CONFIG_X86_IO_APIC |
| 620 | no_timer_check = 1; |
| 621 | #endif |
| 622 | |
| 623 | #if IS_ENABLED(CONFIG_HYPERV) |
| 624 | if (hv_root_partition()) |
| 625 | machine_ops.power_off = hv_machine_power_off; |
| 626 | #if defined(CONFIG_KEXEC_CORE) |
| 627 | machine_ops.shutdown = hv_machine_shutdown; |
| 628 | #endif |
| 629 | #if defined(CONFIG_CRASH_DUMP) |
| 630 | if (!hv_root_partition()) |
| 631 | machine_ops.crash_shutdown = hv_guest_crash_shutdown; |
| 632 | #endif |
| 633 | #endif |
| 634 | /* |
| 635 | * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. Root |
| 636 | * partition doesn't need to write to synthetic MSR to enable invariant |
| 637 | * TSC feature. It sees what the hardware provides. |
| 638 | */ |
| 639 | if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { |
| 640 | /* |
| 641 | * Writing to synthetic MSR 0x40000118 updates/changes the |
| 642 | * guest visible CPUIDs. Setting bit 0 of this MSR enables |
| 643 | * guests to report invariant TSC feature through CPUID |
| 644 | * instruction, CPUID 0x800000007/EDX, bit 8. See code in |
| 645 | * early_init_intel() where this bit is examined. The |
| 646 | * setting of this MSR bit should happen before init_intel() |
| 647 | * is called. |
| 648 | */ |
| 649 | wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); |
| 650 | setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); |
| 651 | } |
| 652 | |
| 653 | /* |
| 654 | * Generation 2 instances don't support reading the NMI status from |
| 655 | * 0x61 port. |
| 656 | */ |
| 657 | if (efi_enabled(EFI_BOOT)) |
| 658 | x86_platform.get_nmi_reason = hv_get_nmi_reason; |
| 659 | |
| 660 | #if IS_ENABLED(CONFIG_HYPERV) |
| 661 | if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || |
| 662 | ms_hyperv.paravisor_present) |
| 663 | hv_vtom_init(); |
| 664 | /* |
| 665 | * Setup the hook to get control post apic initialization. |
| 666 | */ |
| 667 | x86_platform.apic_post_init = hyperv_init; |
| 668 | hyperv_setup_mmu_ops(); |
| 669 | |
| 670 | /* Install system interrupt handler for hypervisor callback */ |
| 671 | sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); |
| 672 | |
| 673 | /* Install system interrupt handler for reenlightenment notifications */ |
| 674 | if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { |
| 675 | sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); |
| 676 | } |
| 677 | |
| 678 | /* Install system interrupt handler for stimer0 */ |
| 679 | if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { |
| 680 | sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); |
| 681 | } |
| 682 | |
| 683 | # ifdef CONFIG_SMP |
| 684 | smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; |
| 685 | if (hv_root_partition() || |
| 686 | (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) |
| 687 | smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; |
| 688 | # endif |
| 689 | |
| 690 | /* |
| 691 | * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic, |
| 692 | * set x2apic destination mode to physical mode when x2apic is available |
| 693 | * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs |
| 694 | * have 8-bit APIC id. |
| 695 | */ |
| 696 | # ifdef CONFIG_X86_X2APIC |
| 697 | if (x2apic_supported()) |
| 698 | x2apic_phys = 1; |
| 699 | # endif |
| 700 | |
| 701 | /* Register Hyper-V specific clocksource */ |
| 702 | hv_init_clocksource(); |
| 703 | x86_setup_ops_for_tsc_pg_clock(); |
| 704 | hv_vtl_init_platform(); |
| 705 | #endif |
| 706 | /* |
| 707 | * TSC should be marked as unstable only after Hyper-V |
| 708 | * clocksource has been initialized. This ensures that the |
| 709 | * stability of the sched_clock is not altered. |
| 710 | * |
| 711 | * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. No |
| 712 | * need to check for it. |
| 713 | */ |
| 714 | if (!hv_root_partition() && |
| 715 | !(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) |
| 716 | mark_tsc_unstable(reason: "running on Hyper-V" ); |
| 717 | |
| 718 | hardlockup_detector_disable(); |
| 719 | } |
| 720 | |
| 721 | static bool __init ms_hyperv_x2apic_available(void) |
| 722 | { |
| 723 | return x2apic_supported(); |
| 724 | } |
| 725 | |
| 726 | /* |
| 727 | * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping() |
| 728 | * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the |
| 729 | * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg(). |
| 730 | * |
| 731 | * Note: for a VM on Hyper-V, the I/O-APIC is the only device which |
| 732 | * (logically) generates MSIs directly to the system APIC irq domain. |
| 733 | * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the |
| 734 | * pci-hyperv host bridge. |
| 735 | * |
| 736 | * Note: for a Hyper-V root partition, this will always return false. |
| 737 | */ |
| 738 | static bool __init ms_hyperv_msi_ext_dest_id(void) |
| 739 | { |
| 740 | return ms_hyperv.msi_ext_dest_id; |
| 741 | } |
| 742 | |
| 743 | #ifdef CONFIG_AMD_MEM_ENCRYPT |
| 744 | static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) |
| 745 | { |
| 746 | /* RAX and CPL are already in the GHCB */ |
| 747 | ghcb_set_rcx(ghcb, value: regs->cx); |
| 748 | ghcb_set_rdx(ghcb, value: regs->dx); |
| 749 | ghcb_set_r8(ghcb, value: regs->r8); |
| 750 | } |
| 751 | |
/*
 * SEV-ES hypercall completion hook. The return state needs no checking,
 * so this always reports success; both arguments are unused.
 */
static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
	return true;
}
| 757 | #endif |
| 758 | |
| 759 | const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { |
| 760 | .name = "Microsoft Hyper-V" , |
| 761 | .detect = ms_hyperv_platform, |
| 762 | .type = X86_HYPER_MS_HYPERV, |
| 763 | .init.x2apic_available = ms_hyperv_x2apic_available, |
| 764 | .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, |
| 765 | .init.init_platform = ms_hyperv_init_platform, |
| 766 | .init.guest_late_init = ms_hyperv_late_init, |
| 767 | #ifdef CONFIG_AMD_MEM_ENCRYPT |
| 768 | .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, |
| 769 | .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, |
| 770 | #endif |
| 771 | }; |
| 772 | |