| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Copyright (c) 2023, Microsoft Corporation. |
| 4 | * |
| 5 | * Author: |
| 6 | * Saurabh Sengar <ssengar@microsoft.com> |
| 7 | */ |
| 8 | |
| 9 | #include <asm/apic.h> |
| 10 | #include <asm/boot.h> |
| 11 | #include <asm/desc.h> |
| 12 | #include <asm/fpu/api.h> |
| 13 | #include <asm/fpu/types.h> |
| 14 | #include <asm/i8259.h> |
| 15 | #include <asm/mshyperv.h> |
| 16 | #include <asm/msr.h> |
| 17 | #include <asm/realmode.h> |
| 18 | #include <asm/reboot.h> |
| 19 | #include <asm/smap.h> |
| 20 | #include <linux/export.h> |
| 21 | #include <../kernel/smpboot.h> |
| 22 | #include "../../kernel/fpu/legacy.h" |
| 23 | |
| 24 | extern struct boot_params boot_params; |
| 25 | static struct real_mode_header ; |
| 26 | |
| 27 | static bool __init hv_vtl_msi_ext_dest_id(void) |
| 28 | { |
| 29 | return true; |
| 30 | } |
| 31 | |
| 32 | /* |
| 33 | * The `native_machine_emergency_restart` function from `reboot.c` writes |
| 34 | * to the physical address 0x472 to indicate the type of reboot for the |
| 35 | * firmware. We cannot have that in VSM as the memory composition might |
| 36 | * be more generic, and such write effectively corrupts the memory thus |
| 37 | * making diagnostics harder at the very least. |
| 38 | */ |
| 39 | static void __noreturn hv_vtl_emergency_restart(void) |
| 40 | { |
| 41 | /* |
| 42 | * Cause a triple fault and the immediate reset. Here the code does not run |
| 43 | * on the top of any firmware, whereby cannot reach out to its services. |
| 44 | * The inifinite loop is for the improbable case that the triple fault does |
| 45 | * not work and have to preserve the state intact for debugging. |
| 46 | */ |
| 47 | for (;;) { |
| 48 | idt_invalidate(); |
| 49 | __asm__ __volatile__("int3" ); |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | /* |
| 54 | * The only way to restart in the VTL mode is to triple fault as the kernel runs |
| 55 | * as firmware. |
| 56 | */ |
| 57 | static void __noreturn hv_vtl_restart(char __maybe_unused *cmd) |
| 58 | { |
| 59 | hv_vtl_emergency_restart(); |
| 60 | } |
| 61 | |
| 62 | void __init hv_vtl_init_platform(void) |
| 63 | { |
| 64 | /* |
| 65 | * This function is a no-op if the VTL mode is not enabled. |
| 66 | * If it is, this function runs if and only the kernel boots in |
| 67 | * VTL2 which the x86 hv initialization path makes sure of. |
| 68 | */ |
| 69 | pr_info("Linux runs in Hyper-V Virtual Trust Level %d\n" , ms_hyperv.vtl); |
| 70 | |
| 71 | x86_platform.realmode_reserve = x86_init_noop; |
| 72 | x86_platform.realmode_init = x86_init_noop; |
| 73 | x86_init.irqs.pre_vector_init = x86_init_noop; |
| 74 | x86_init.timers.timer_init = x86_init_noop; |
| 75 | x86_init.resources.probe_roms = x86_init_noop; |
| 76 | |
| 77 | /* Avoid searching for BIOS MP tables */ |
| 78 | x86_init.mpparse.find_mptable = x86_init_noop; |
| 79 | x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; |
| 80 | |
| 81 | x86_platform.get_wallclock = get_rtc_noop; |
| 82 | x86_platform.set_wallclock = set_rtc_noop; |
| 83 | x86_platform.get_nmi_reason = hv_get_nmi_reason; |
| 84 | |
| 85 | x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; |
| 86 | x86_platform.legacy.rtc = 0; |
| 87 | x86_platform.legacy.warm_reset = 0; |
| 88 | x86_platform.legacy.reserve_bios_regions = 0; |
| 89 | x86_platform.legacy.devices.pnpbios = 0; |
| 90 | |
| 91 | x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id; |
| 92 | } |
| 93 | |
| 94 | static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc) |
| 95 | { |
| 96 | return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) | |
| 97 | (desc->base1 << 16) | desc->base0; |
| 98 | } |
| 99 | |
| 100 | static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc) |
| 101 | { |
| 102 | return ((u32)desc->limit1 << 16) | (u32)desc->limit0; |
| 103 | } |
| 104 | |
| 105 | typedef void (*secondary_startup_64_fn)(void*, void*); |
| 106 | static void hv_vtl_ap_entry(void) |
| 107 | { |
| 108 | ((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params); |
| 109 | } |
| 110 | |
| 111 | static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored) |
| 112 | { |
| 113 | u64 status; |
| 114 | int ret = 0; |
| 115 | struct hv_enable_vp_vtl *input; |
| 116 | unsigned long irq_flags; |
| 117 | |
| 118 | struct desc_ptr gdt_ptr; |
| 119 | struct desc_ptr idt_ptr; |
| 120 | |
| 121 | struct ldttss_desc *tss; |
| 122 | struct ldttss_desc *ldt; |
| 123 | struct desc_struct *gdt; |
| 124 | |
| 125 | struct task_struct *idle = idle_thread_get(cpu); |
| 126 | u64 rsp = (unsigned long)idle->thread.sp; |
| 127 | |
| 128 | u64 rip = (u64)&hv_vtl_ap_entry; |
| 129 | |
| 130 | native_store_gdt(dtr: &gdt_ptr); |
| 131 | store_idt(dtr: &idt_ptr); |
| 132 | |
| 133 | gdt = (struct desc_struct *)((void *)(gdt_ptr.address)); |
| 134 | tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS); |
| 135 | ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT); |
| 136 | |
| 137 | local_irq_save(irq_flags); |
| 138 | |
| 139 | input = *this_cpu_ptr(hyperv_pcpu_input_arg); |
| 140 | memset(input, 0, sizeof(*input)); |
| 141 | |
| 142 | input->partition_id = HV_PARTITION_ID_SELF; |
| 143 | input->vp_index = target_vp_index; |
| 144 | input->target_vtl.target_vtl = HV_VTL_MGMT; |
| 145 | |
| 146 | /* |
| 147 | * The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit |
| 148 | * mode transition sequence after waking up an AP with SIPI whose |
| 149 | * vector points to the 16-bit AP startup trampoline code. Here in |
| 150 | * VTL2, we can't perform that sequence as the AP has to start in |
| 151 | * the 64-bit mode. |
| 152 | * |
| 153 | * To make this happen, we tell the hypervisor to load a valid 64-bit |
| 154 | * context (most of which is just magic numbers from the CPU manual) |
| 155 | * so that AP jumps right to the 64-bit entry of the kernel, and the |
| 156 | * control registers are loaded with values that let the AP fetch the |
| 157 | * code and data and carry on with work it gets assigned. |
| 158 | */ |
| 159 | |
| 160 | input->vp_context.rip = rip; |
| 161 | input->vp_context.rsp = rsp; |
| 162 | input->vp_context.rflags = 0x0000000000000002; |
| 163 | input->vp_context.efer = native_rdmsrq(MSR_EFER); |
| 164 | input->vp_context.cr0 = native_read_cr0(); |
| 165 | input->vp_context.cr3 = __native_read_cr3(); |
| 166 | input->vp_context.cr4 = native_read_cr4(); |
| 167 | input->vp_context.msr_cr_pat = native_rdmsrq(MSR_IA32_CR_PAT); |
| 168 | input->vp_context.idtr.limit = idt_ptr.size; |
| 169 | input->vp_context.idtr.base = idt_ptr.address; |
| 170 | input->vp_context.gdtr.limit = gdt_ptr.size; |
| 171 | input->vp_context.gdtr.base = gdt_ptr.address; |
| 172 | |
| 173 | /* Non-system desc (64bit), long, code, present */ |
| 174 | input->vp_context.cs.selector = __KERNEL_CS; |
| 175 | input->vp_context.cs.base = 0; |
| 176 | input->vp_context.cs.limit = 0xffffffff; |
| 177 | input->vp_context.cs.attributes = 0xa09b; |
| 178 | /* Non-system desc (64bit), data, present, granularity, default */ |
| 179 | input->vp_context.ss.selector = __KERNEL_DS; |
| 180 | input->vp_context.ss.base = 0; |
| 181 | input->vp_context.ss.limit = 0xffffffff; |
| 182 | input->vp_context.ss.attributes = 0xc093; |
| 183 | |
| 184 | /* System desc (128bit), present, LDT */ |
| 185 | input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8; |
| 186 | input->vp_context.ldtr.base = hv_vtl_system_desc_base(desc: ldt); |
| 187 | input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(desc: ldt); |
| 188 | input->vp_context.ldtr.attributes = 0x82; |
| 189 | |
| 190 | /* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */ |
| 191 | input->vp_context.tr.selector = GDT_ENTRY_TSS * 8; |
| 192 | input->vp_context.tr.base = hv_vtl_system_desc_base(desc: tss); |
| 193 | input->vp_context.tr.limit = hv_vtl_system_desc_limit(desc: tss); |
| 194 | input->vp_context.tr.attributes = 0x8b; |
| 195 | |
| 196 | status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, inputaddr: input, NULL); |
| 197 | |
| 198 | if (!hv_result_success(status) && |
| 199 | hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) { |
| 200 | pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx\n]" , |
| 201 | target_vp_index, status); |
| 202 | ret = -EINVAL; |
| 203 | goto free_lock; |
| 204 | } |
| 205 | |
| 206 | status = hv_do_hypercall(HVCALL_START_VP, inputaddr: input, NULL); |
| 207 | |
| 208 | if (!hv_result_success(status)) { |
| 209 | pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n" , |
| 210 | target_vp_index, status); |
| 211 | ret = -EINVAL; |
| 212 | } |
| 213 | |
| 214 | free_lock: |
| 215 | local_irq_restore(irq_flags); |
| 216 | |
| 217 | return ret; |
| 218 | } |
| 219 | |
| 220 | static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, unsigned int cpu) |
| 221 | { |
| 222 | int vp_index; |
| 223 | |
| 224 | pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n" , apicid); |
| 225 | vp_index = hv_apicid_to_vp_index(apic_id: apicid); |
| 226 | |
| 227 | if (vp_index < 0) { |
| 228 | pr_err("Couldn't find CPU with APIC ID %d\n" , apicid); |
| 229 | return -EINVAL; |
| 230 | } |
| 231 | if (vp_index > ms_hyperv.max_vp_index) { |
| 232 | pr_err("Invalid CPU id %d for APIC ID %d\n" , vp_index, apicid); |
| 233 | return -EINVAL; |
| 234 | } |
| 235 | |
| 236 | return hv_vtl_bringup_vcpu(target_vp_index: vp_index, cpu, eip_ignored: start_eip); |
| 237 | } |
| 238 | |
| 239 | int __init hv_vtl_early_init(void) |
| 240 | { |
| 241 | machine_ops.emergency_restart = hv_vtl_emergency_restart; |
| 242 | machine_ops.restart = hv_vtl_restart; |
| 243 | |
| 244 | /* |
| 245 | * `boot_cpu_has` returns the runtime feature support, |
| 246 | * and here is the earliest it can be used. |
| 247 | */ |
| 248 | if (cpu_feature_enabled(X86_FEATURE_XSAVE)) |
| 249 | panic(fmt: "XSAVE has to be disabled as it is not supported by this module.\n" |
| 250 | "Please add 'noxsave' to the kernel command line.\n" ); |
| 251 | |
| 252 | real_mode_header = &hv_vtl_real_mode_header; |
| 253 | apic_update_callback(wakeup_secondary_cpu_64, hv_vtl_wakeup_secondary_cpu); |
| 254 | |
| 255 | return 0; |
| 256 | } |
| 257 | |
| 258 | DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void)); |
| 259 | |
| 260 | void mshv_vtl_return_call_init(u64 vtl_return_offset) |
| 261 | { |
| 262 | static_call_update(__mshv_vtl_return_hypercall, |
| 263 | (void *)((u8 *)hv_hypercall_pg + vtl_return_offset)); |
| 264 | } |
| 265 | EXPORT_SYMBOL(mshv_vtl_return_call_init); |
| 266 | |
| 267 | void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) |
| 268 | { |
| 269 | struct hv_vp_assist_page *hvp; |
| 270 | |
| 271 | hvp = hv_vp_assist_page[smp_processor_id()]; |
| 272 | hvp->vtl_ret_x64rax = vtl0->rax; |
| 273 | hvp->vtl_ret_x64rcx = vtl0->rcx; |
| 274 | |
| 275 | kernel_fpu_begin_mask(kfpu_mask: 0); |
| 276 | fxrstor(fx: &vtl0->fx_state); |
| 277 | __mshv_vtl_return_call(vtl0); |
| 278 | fxsave(fx: &vtl0->fx_state); |
| 279 | kernel_fpu_end(); |
| 280 | } |
| 281 | EXPORT_SYMBOL(mshv_vtl_return_call); |
| 282 | |