1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * AMD Memory Encryption Support |
4 | * |
5 | * Copyright (C) 2019 SUSE |
6 | * |
7 | * Author: Joerg Roedel <jroedel@suse.de> |
8 | */ |
9 | |
10 | #define pr_fmt(fmt) "SEV: " fmt |
11 | |
12 | #include <linux/sched/debug.h> /* For show_regs() */ |
13 | #include <linux/percpu-defs.h> |
14 | #include <linux/cc_platform.h> |
15 | #include <linux/printk.h> |
16 | #include <linux/mm_types.h> |
17 | #include <linux/set_memory.h> |
18 | #include <linux/memblock.h> |
19 | #include <linux/kernel.h> |
20 | #include <linux/mm.h> |
21 | #include <linux/cpumask.h> |
22 | #include <linux/efi.h> |
23 | #include <linux/platform_device.h> |
24 | #include <linux/io.h> |
25 | #include <linux/psp-sev.h> |
26 | #include <uapi/linux/sev-guest.h> |
27 | |
28 | #include <asm/cpu_entry_area.h> |
29 | #include <asm/stacktrace.h> |
30 | #include <asm/sev.h> |
31 | #include <asm/insn-eval.h> |
32 | #include <asm/fpu/xcr.h> |
33 | #include <asm/processor.h> |
34 | #include <asm/realmode.h> |
35 | #include <asm/setup.h> |
36 | #include <asm/traps.h> |
37 | #include <asm/svm.h> |
38 | #include <asm/smp.h> |
39 | #include <asm/cpu.h> |
40 | #include <asm/apic.h> |
41 | #include <asm/cpuid.h> |
42 | #include <asm/cmdline.h> |
43 | |
44 | #define DR7_RESET_VALUE 0x400 |
45 | |
46 | /* AP INIT values as documented in the APM2 section "Processor Initialization State" */ |
47 | #define AP_INIT_CS_LIMIT 0xffff |
48 | #define AP_INIT_DS_LIMIT 0xffff |
49 | #define AP_INIT_LDTR_LIMIT 0xffff |
50 | #define AP_INIT_GDTR_LIMIT 0xffff |
51 | #define AP_INIT_IDTR_LIMIT 0xffff |
52 | #define AP_INIT_TR_LIMIT 0xffff |
53 | #define AP_INIT_RFLAGS_DEFAULT 0x2 |
54 | #define AP_INIT_DR6_DEFAULT 0xffff0ff0 |
55 | #define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL |
56 | #define AP_INIT_XCR0_DEFAULT 0x1 |
57 | #define AP_INIT_X87_FTW_DEFAULT 0x5555 |
58 | #define AP_INIT_X87_FCW_DEFAULT 0x0040 |
59 | #define AP_INIT_CR0_DEFAULT 0x60000010 |
60 | #define AP_INIT_MXCSR_DEFAULT 0x1f80 |
61 | |
62 | /* For early boot hypervisor communication in SEV-ES enabled guests */ |
63 | static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); |
64 | |
65 | /* |
66 | * Needs to be in the .data section because we need it NULL before bss is |
67 | * cleared |
68 | */ |
static struct ghcb *boot_ghcb __section(".data");
70 | |
71 | /* Bitmap of SEV features supported by the hypervisor */ |
72 | static u64 sev_hv_features __ro_after_init; |
73 | |
74 | /* #VC handler runtime per-CPU data */ |
75 | struct sev_es_runtime_data { |
76 | struct ghcb ghcb_page; |
77 | |
78 | /* |
79 | * Reserve one page per CPU as backup storage for the unencrypted GHCB. |
80 | * It is needed when an NMI happens while the #VC handler uses the real |
81 | * GHCB, and the NMI handler itself is causing another #VC exception. In |
82 | * that case the GHCB content of the first handler needs to be backed up |
83 | * and restored. |
84 | */ |
85 | struct ghcb backup_ghcb; |
86 | |
87 | /* |
88 | * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. |
89 | * There is no need for it to be atomic, because nothing is written to |
90 | * the GHCB between the read and the write of ghcb_active. So it is safe |
91 | * to use it when a nested #VC exception happens before the write. |
92 | * |
93 | * This is necessary for example in the #VC->NMI->#VC case when the NMI |
94 | * happens while the first #VC handler uses the GHCB. When the NMI code |
95 | * raises a second #VC handler it might overwrite the contents of the |
96 | * GHCB written by the first handler. To avoid this the content of the |
97 | * GHCB is saved and restored when the GHCB is detected to be in use |
98 | * already. |
99 | */ |
100 | bool ghcb_active; |
101 | bool backup_ghcb_active; |
102 | |
103 | /* |
104 | * Cached DR7 value - write it on DR7 writes and return it on reads. |
105 | * That value will never make it to the real hardware DR7 as debugging |
106 | * is currently unsupported in SEV-ES guests. |
107 | */ |
108 | unsigned long dr7; |
109 | }; |
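
/*
 * Illustrative nesting sequence for the backup GHCB described above
 * (documentation only, not code):
 *
 *   #VC handler:   ghcb_active = true, fills ghcb_page
 *   -> NMI:        NMI handling raises another #VC
 *   -> nested #VC: sees ghcb_active, sets backup_ghcb_active = true,
 *                  copies ghcb_page into backup_ghcb and reuses ghcb_page
 *   <- nested #VC: restores ghcb_page from backup_ghcb
 *   <- NMI:        first #VC handler continues with an intact GHCB
 *
 * A third level of nesting (backup GHCB already active) cannot be handled
 * and ends in panic(), see __sev_get_ghcb().
 */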
110 | |
111 | struct ghcb_state { |
112 | struct ghcb *ghcb; |
113 | }; |
114 | |
115 | static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); |
116 | static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); |
117 | |
118 | struct sev_config { |
119 | __u64 debug : 1, |
120 | |
121 | /* |
122 | * A flag used by __set_pages_state() that indicates when the |
123 | * per-CPU GHCB has been created and registered and thus can be |
124 | * used by the BSP instead of the early boot GHCB. |
125 | * |
126 | * For APs, the per-CPU GHCB is created before they are started |
127 | * and registered upon startup, so this flag can be used globally |
128 | * for the BSP and APs. |
129 | */ |
130 | ghcbs_initialized : 1, |
131 | |
132 | __reserved : 62; |
133 | }; |
134 | |
135 | static struct sev_config sev_cfg __read_mostly; |
136 | |
137 | static __always_inline bool on_vc_stack(struct pt_regs *regs) |
138 | { |
139 | unsigned long sp = regs->sp; |
140 | |
141 | /* User-mode RSP is not trusted */ |
142 | if (user_mode(regs)) |
143 | return false; |
144 | |
145 | /* SYSCALL gap still has user-mode RSP */ |
146 | if (ip_within_syscall_gap(regs)) |
147 | return false; |
148 | |
149 | return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); |
150 | } |
151 | |
152 | /* |
153 | * This function handles the case when an NMI is raised in the #VC |
154 | * exception handler entry code, before the #VC handler has switched off |
155 | * its IST stack. In this case, the IST entry for #VC must be adjusted, |
156 | * so that any nested #VC exception will not overwrite the stack |
157 | * contents of the interrupted #VC handler. |
158 | * |
 * The IST entry is adjusted unconditionally so that it can also be
 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
 * nested sev_es_ist_exit() call may adjust back the IST entry too
 * early.
163 | * |
164 | * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run |
165 | * on the NMI IST stack, as they are only called from NMI handling code |
166 | * right now. |
167 | */ |
168 | void noinstr __sev_es_ist_enter(struct pt_regs *regs) |
169 | { |
170 | unsigned long old_ist, new_ist; |
171 | |
172 | /* Read old IST entry */ |
173 | new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); |
174 | |
175 | /* |
176 | * If NMI happened while on the #VC IST stack, set the new IST |
177 | * value below regs->sp, so that the interrupted stack frame is |
178 | * not overwritten by subsequent #VC exceptions. |
179 | */ |
180 | if (on_vc_stack(regs)) |
181 | new_ist = regs->sp; |
182 | |
183 | /* |
184 | * Reserve additional 8 bytes and store old IST value so this |
185 | * adjustment can be unrolled in __sev_es_ist_exit(). |
186 | */ |
187 | new_ist -= sizeof(old_ist); |
188 | *(unsigned long *)new_ist = old_ist; |
189 | |
190 | /* Set new IST entry */ |
191 | this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); |
192 | } |
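
/*
 * Worked example with hypothetical addresses: if the #VC IST entry points
 * at stack top 0xffff890000004000 and an NMI interrupts the #VC handler
 * at regs->sp == 0xffff890000003f20, __sev_es_ist_enter() stores the old
 * IST value at 0xffff890000003f18 and writes that address into the TSS.
 * A nested #VC then starts its stack below the interrupted frame, and
 * __sev_es_ist_exit() restores the TSS entry from *0xffff890000003f18.
 */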
193 | |
194 | void noinstr __sev_es_ist_exit(void) |
195 | { |
196 | unsigned long ist; |
197 | |
198 | /* Read IST entry */ |
199 | ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); |
200 | |
201 | if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) |
202 | return; |
203 | |
204 | /* Read back old IST entry and write it to the TSS */ |
205 | this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); |
206 | } |
207 | |
208 | /* |
209 | * Nothing shall interrupt this code path while holding the per-CPU |
210 | * GHCB. The backup GHCB is only for NMIs interrupting this path. |
211 | * |
212 | * Callers must disable local interrupts around it. |
213 | */ |
214 | static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) |
215 | { |
216 | struct sev_es_runtime_data *data; |
217 | struct ghcb *ghcb; |
218 | |
219 | WARN_ON(!irqs_disabled()); |
220 | |
221 | data = this_cpu_read(runtime_data); |
222 | ghcb = &data->ghcb_page; |
223 | |
224 | if (unlikely(data->ghcb_active)) { |
225 | /* GHCB is already in use - save its contents */ |
226 | |
227 | if (unlikely(data->backup_ghcb_active)) { |
228 | /* |
229 | * Backup-GHCB is also already in use. There is no way |
230 | * to continue here so just kill the machine. To make |
231 | * panic() work, mark GHCBs inactive so that messages |
232 | * can be printed out. |
233 | */ |
234 | data->ghcb_active = false; |
235 | data->backup_ghcb_active = false; |
236 | |
237 | instrumentation_begin(); |
			panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
239 | instrumentation_end(); |
240 | } |
241 | |
242 | /* Mark backup_ghcb active before writing to it */ |
243 | data->backup_ghcb_active = true; |
244 | |
245 | state->ghcb = &data->backup_ghcb; |
246 | |
247 | /* Backup GHCB content */ |
248 | *state->ghcb = *ghcb; |
249 | } else { |
250 | state->ghcb = NULL; |
251 | data->ghcb_active = true; |
252 | } |
253 | |
254 | return ghcb; |
255 | } |
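
/*
 * Typical usage sketch - mirrors get_jump_table_addr() below; interrupts
 * must stay disabled for the whole GHCB transaction:
 *
 *	local_irq_save(flags);
 *	ghcb = __sev_get_ghcb(&state);
 *	... fill GHCB fields, issue VMGEXIT(), read results ...
 *	__sev_put_ghcb(&state);
 *	local_irq_restore(flags);
 */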
256 | |
257 | static inline u64 sev_es_rd_ghcb_msr(void) |
258 | { |
259 | return __rdmsr(MSR_AMD64_SEV_ES_GHCB); |
260 | } |
261 | |
262 | static __always_inline void sev_es_wr_ghcb_msr(u64 val) |
263 | { |
264 | u32 low, high; |
265 | |
266 | low = (u32)(val); |
267 | high = (u32)(val >> 32); |
268 | |
269 | native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); |
270 | } |
271 | |
272 | static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, |
273 | unsigned char *buffer) |
274 | { |
	return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
276 | } |
277 | |
278 | static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) |
279 | { |
280 | char buffer[MAX_INSN_SIZE]; |
281 | int insn_bytes; |
282 | |
	insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
284 | if (insn_bytes == 0) { |
285 | /* Nothing could be copied */ |
286 | ctxt->fi.vector = X86_TRAP_PF; |
287 | ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; |
288 | ctxt->fi.cr2 = ctxt->regs->ip; |
289 | return ES_EXCEPTION; |
290 | } else if (insn_bytes == -EINVAL) { |
291 | /* Effective RIP could not be calculated */ |
292 | ctxt->fi.vector = X86_TRAP_GP; |
293 | ctxt->fi.error_code = 0; |
294 | ctxt->fi.cr2 = 0; |
295 | return ES_EXCEPTION; |
296 | } |
297 | |
	if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
299 | return ES_DECODE_FAILED; |
300 | |
301 | if (ctxt->insn.immediate.got) |
302 | return ES_OK; |
303 | else |
304 | return ES_DECODE_FAILED; |
305 | } |
306 | |
307 | static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) |
308 | { |
309 | char buffer[MAX_INSN_SIZE]; |
310 | int res, ret; |
311 | |
312 | res = vc_fetch_insn_kernel(ctxt, buffer); |
313 | if (res) { |
314 | ctxt->fi.vector = X86_TRAP_PF; |
315 | ctxt->fi.error_code = X86_PF_INSTR; |
316 | ctxt->fi.cr2 = ctxt->regs->ip; |
317 | return ES_EXCEPTION; |
318 | } |
319 | |
	ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
321 | if (ret < 0) |
322 | return ES_DECODE_FAILED; |
323 | else |
324 | return ES_OK; |
325 | } |
326 | |
327 | static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) |
328 | { |
	if (user_mode(ctxt->regs))
330 | return __vc_decode_user_insn(ctxt); |
331 | else |
332 | return __vc_decode_kern_insn(ctxt); |
333 | } |
334 | |
335 | static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, |
336 | char *dst, char *buf, size_t size) |
337 | { |
338 | unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; |
339 | |
340 | /* |
341 | * This function uses __put_user() independent of whether kernel or user |
342 | * memory is accessed. This works fine because __put_user() does no |
343 | * sanity checks of the pointer being accessed. All that it does is |
344 | * to report when the access failed. |
345 | * |
346 | * Also, this function runs in atomic context, so __put_user() is not |
347 | * allowed to sleep. The page-fault handler detects that it is running |
348 | * in atomic context and will not try to take mmap_sem and handle the |
349 | * fault, so additional pagefault_enable()/disable() calls are not |
350 | * needed. |
351 | * |
352 | * The access can't be done via copy_to_user() here because |
353 | * vc_write_mem() must not use string instructions to access unsafe |
354 | * memory. The reason is that MOVS is emulated by the #VC handler by |
355 | * splitting the move up into a read and a write and taking a nested #VC |
356 | * exception on whatever of them is the MMIO access. Using string |
357 | * instructions here would cause infinite nesting. |
358 | */ |
359 | switch (size) { |
360 | case 1: { |
361 | u8 d1; |
362 | u8 __user *target = (u8 __user *)dst; |
363 | |
364 | memcpy(&d1, buf, 1); |
365 | if (__put_user(d1, target)) |
366 | goto fault; |
367 | break; |
368 | } |
369 | case 2: { |
370 | u16 d2; |
371 | u16 __user *target = (u16 __user *)dst; |
372 | |
373 | memcpy(&d2, buf, 2); |
374 | if (__put_user(d2, target)) |
375 | goto fault; |
376 | break; |
377 | } |
378 | case 4: { |
379 | u32 d4; |
380 | u32 __user *target = (u32 __user *)dst; |
381 | |
382 | memcpy(&d4, buf, 4); |
383 | if (__put_user(d4, target)) |
384 | goto fault; |
385 | break; |
386 | } |
387 | case 8: { |
388 | u64 d8; |
389 | u64 __user *target = (u64 __user *)dst; |
390 | |
391 | memcpy(&d8, buf, 8); |
392 | if (__put_user(d8, target)) |
393 | goto fault; |
394 | break; |
395 | } |
396 | default: |
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
398 | return ES_UNSUPPORTED; |
399 | } |
400 | |
401 | return ES_OK; |
402 | |
403 | fault: |
	if (user_mode(ctxt->regs))
405 | error_code |= X86_PF_USER; |
406 | |
407 | ctxt->fi.vector = X86_TRAP_PF; |
408 | ctxt->fi.error_code = error_code; |
409 | ctxt->fi.cr2 = (unsigned long)dst; |
410 | |
411 | return ES_EXCEPTION; |
412 | } |
413 | |
414 | static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, |
415 | char *src, char *buf, size_t size) |
416 | { |
417 | unsigned long error_code = X86_PF_PROT; |
418 | |
419 | /* |
420 | * This function uses __get_user() independent of whether kernel or user |
421 | * memory is accessed. This works fine because __get_user() does no |
422 | * sanity checks of the pointer being accessed. All that it does is |
423 | * to report when the access failed. |
424 | * |
425 | * Also, this function runs in atomic context, so __get_user() is not |
426 | * allowed to sleep. The page-fault handler detects that it is running |
427 | * in atomic context and will not try to take mmap_sem and handle the |
428 | * fault, so additional pagefault_enable()/disable() calls are not |
429 | * needed. |
430 | * |
431 | * The access can't be done via copy_from_user() here because |
432 | * vc_read_mem() must not use string instructions to access unsafe |
433 | * memory. The reason is that MOVS is emulated by the #VC handler by |
434 | * splitting the move up into a read and a write and taking a nested #VC |
435 | * exception on whatever of them is the MMIO access. Using string |
436 | * instructions here would cause infinite nesting. |
437 | */ |
438 | switch (size) { |
439 | case 1: { |
440 | u8 d1; |
441 | u8 __user *s = (u8 __user *)src; |
442 | |
443 | if (__get_user(d1, s)) |
444 | goto fault; |
445 | memcpy(buf, &d1, 1); |
446 | break; |
447 | } |
448 | case 2: { |
449 | u16 d2; |
450 | u16 __user *s = (u16 __user *)src; |
451 | |
452 | if (__get_user(d2, s)) |
453 | goto fault; |
454 | memcpy(buf, &d2, 2); |
455 | break; |
456 | } |
457 | case 4: { |
458 | u32 d4; |
459 | u32 __user *s = (u32 __user *)src; |
460 | |
461 | if (__get_user(d4, s)) |
462 | goto fault; |
463 | memcpy(buf, &d4, 4); |
464 | break; |
465 | } |
466 | case 8: { |
		u64 d8;
		u64 __user *s = (u64 __user *)src;

		if (__get_user(d8, s))
470 | goto fault; |
471 | memcpy(buf, &d8, 8); |
472 | break; |
473 | } |
474 | default: |
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
476 | return ES_UNSUPPORTED; |
477 | } |
478 | |
479 | return ES_OK; |
480 | |
481 | fault: |
	if (user_mode(ctxt->regs))
483 | error_code |= X86_PF_USER; |
484 | |
485 | ctxt->fi.vector = X86_TRAP_PF; |
486 | ctxt->fi.error_code = error_code; |
487 | ctxt->fi.cr2 = (unsigned long)src; |
488 | |
489 | return ES_EXCEPTION; |
490 | } |
491 | |
492 | static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, |
493 | unsigned long vaddr, phys_addr_t *paddr) |
494 | { |
495 | unsigned long va = (unsigned long)vaddr; |
496 | unsigned int level; |
497 | phys_addr_t pa; |
498 | pgd_t *pgd; |
499 | pte_t *pte; |
500 | |
501 | pgd = __va(read_cr3_pa()); |
502 | pgd = &pgd[pgd_index(va)]; |
	pte = lookup_address_in_pgd(pgd, va, &level);
504 | if (!pte) { |
505 | ctxt->fi.vector = X86_TRAP_PF; |
506 | ctxt->fi.cr2 = vaddr; |
507 | ctxt->fi.error_code = 0; |
508 | |
		if (user_mode(ctxt->regs))
510 | ctxt->fi.error_code |= X86_PF_USER; |
511 | |
512 | return ES_EXCEPTION; |
513 | } |
514 | |
515 | if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) |
516 | /* Emulated MMIO to/from encrypted memory not supported */ |
517 | return ES_UNSUPPORTED; |
518 | |
	pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
520 | pa |= va & ~page_level_mask(level); |
521 | |
522 | *paddr = pa; |
523 | |
524 | return ES_OK; |
525 | } |
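
/*
 * Example of the address math above, assuming a hypothetical 2M mapping
 * (level == PG_LEVEL_2M): for va == 0xffff888000234567 with
 * pte_pfn(*pte) == 0xfea00, the offset kept by ~page_level_mask() is
 * va & 0x1fffff == 0x34567, giving
 *
 *	pa = (0xfea00 << PAGE_SHIFT) | 0x34567 = 0xfea34567
 */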
526 | |
527 | static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) |
528 | { |
529 | BUG_ON(size > 4); |
530 | |
	if (user_mode(ctxt->regs)) {
532 | struct thread_struct *t = ¤t->thread; |
533 | struct io_bitmap *iobm = t->io_bitmap; |
534 | size_t idx; |
535 | |
536 | if (!iobm) |
537 | goto fault; |
538 | |
539 | for (idx = port; idx < port + size; ++idx) { |
540 | if (test_bit(idx, iobm->bitmap)) |
541 | goto fault; |
542 | } |
543 | } |
544 | |
545 | return ES_OK; |
546 | |
547 | fault: |
548 | ctxt->fi.vector = X86_TRAP_GP; |
549 | ctxt->fi.error_code = 0; |
550 | |
551 | return ES_EXCEPTION; |
552 | } |
553 | |
554 | /* Include code shared with pre-decompression boot stage */ |
555 | #include "sev-shared.c" |
556 | |
557 | static noinstr void __sev_put_ghcb(struct ghcb_state *state) |
558 | { |
559 | struct sev_es_runtime_data *data; |
560 | struct ghcb *ghcb; |
561 | |
562 | WARN_ON(!irqs_disabled()); |
563 | |
564 | data = this_cpu_read(runtime_data); |
565 | ghcb = &data->ghcb_page; |
566 | |
567 | if (state->ghcb) { |
568 | /* Restore GHCB from Backup */ |
569 | *ghcb = *state->ghcb; |
570 | data->backup_ghcb_active = false; |
571 | state->ghcb = NULL; |
572 | } else { |
573 | /* |
574 | * Invalidate the GHCB so a VMGEXIT instruction issued |
575 | * from userspace won't appear to be valid. |
576 | */ |
577 | vc_ghcb_invalidate(ghcb); |
578 | data->ghcb_active = false; |
579 | } |
580 | } |
581 | |
582 | void noinstr __sev_es_nmi_complete(void) |
583 | { |
584 | struct ghcb_state state; |
585 | struct ghcb *ghcb; |
586 | |
	ghcb = __sev_get_ghcb(&state);

	vc_ghcb_invalidate(ghcb);
	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
	ghcb_set_sw_exit_info_1(ghcb, 0);
	ghcb_set_sw_exit_info_2(ghcb, 0);

	sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
	VMGEXIT();

	__sev_put_ghcb(&state);
598 | } |
599 | |
600 | static u64 __init get_secrets_page(void) |
601 | { |
602 | u64 pa_data = boot_params.cc_blob_address; |
603 | struct cc_blob_sev_info info; |
604 | void *map; |
605 | |
606 | /* |
607 | * The CC blob contains the address of the secrets page, check if the |
608 | * blob is present. |
609 | */ |
610 | if (!pa_data) |
611 | return 0; |
612 | |
	map = early_memremap(pa_data, sizeof(info));
	if (!map) {
		pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n");
		return 0;
	}
	memcpy(&info, map, sizeof(info));
	early_memunmap(map, sizeof(info));
620 | |
621 | /* smoke-test the secrets page passed */ |
622 | if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) |
623 | return 0; |
624 | |
625 | return info.secrets_phys; |
626 | } |
627 | |
628 | static u64 __init get_snp_jump_table_addr(void) |
629 | { |
630 | struct snp_secrets_page_layout *layout; |
631 | void __iomem *mem; |
632 | u64 pa, addr; |
633 | |
634 | pa = get_secrets_page(); |
635 | if (!pa) |
636 | return 0; |
637 | |
	mem = ioremap_encrypted(pa, PAGE_SIZE);
	if (!mem) {
		pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
		return 0;
	}

	layout = (__force struct snp_secrets_page_layout *)mem;

	addr = layout->os_area.ap_jump_table_pa;
	iounmap(mem);
648 | |
649 | return addr; |
650 | } |
651 | |
652 | static u64 __init get_jump_table_addr(void) |
653 | { |
654 | struct ghcb_state state; |
655 | unsigned long flags; |
656 | struct ghcb *ghcb; |
657 | u64 ret = 0; |
658 | |
	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
660 | return get_snp_jump_table_addr(); |
661 | |
662 | local_irq_save(flags); |
663 | |
	ghcb = __sev_get_ghcb(&state);

	vc_ghcb_invalidate(ghcb);
	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
	ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
	ghcb_set_sw_exit_info_2(ghcb, 0);

	sev_es_wr_ghcb_msr(__pa(ghcb));
	VMGEXIT();

	if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
	    ghcb_sw_exit_info_2_is_valid(ghcb))
		ret = ghcb->save.sw_exit_info_2;

	__sev_put_ghcb(&state);
679 | |
680 | local_irq_restore(flags); |
681 | |
682 | return ret; |
683 | } |
684 | |
685 | static void early_set_pages_state(unsigned long vaddr, unsigned long paddr, |
686 | unsigned long npages, enum psc_op op) |
687 | { |
688 | unsigned long paddr_end; |
689 | u64 val; |
690 | int ret; |
691 | |
692 | vaddr = vaddr & PAGE_MASK; |
693 | |
694 | paddr = paddr & PAGE_MASK; |
695 | paddr_end = paddr + (npages << PAGE_SHIFT); |
696 | |
697 | while (paddr < paddr_end) { |
698 | if (op == SNP_PAGE_STATE_SHARED) { |
699 | /* Page validation must be rescinded before changing to shared */ |
			ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false);
			if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
702 | goto e_term; |
703 | } |
704 | |
705 | /* |
706 | * Use the MSR protocol because this function can be called before |
707 | * the GHCB is established. |
708 | */ |
709 | sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); |
710 | VMGEXIT(); |
711 | |
712 | val = sev_es_rd_ghcb_msr(); |
713 | |
		if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
			 "Wrong PSC response code: 0x%x\n",
			 (unsigned int)GHCB_RESP_CODE(val)))
			goto e_term;

		if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
			 "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
			 op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
			 paddr, GHCB_MSR_PSC_RESP_VAL(val)))
			goto e_term;

		if (op == SNP_PAGE_STATE_PRIVATE) {
			/* Page validation must be performed after changing to private */
			ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true);
			if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
729 | goto e_term; |
730 | } |
731 | |
732 | vaddr += PAGE_SIZE; |
733 | paddr += PAGE_SIZE; |
734 | } |
735 | |
736 | return; |
737 | |
738 | e_term: |
739 | sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); |
740 | } |
741 | |
742 | void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, |
743 | unsigned long npages) |
744 | { |
745 | /* |
746 | * This can be invoked in early boot while running identity mapped, so |
747 | * use an open coded check for SNP instead of using cc_platform_has(). |
748 | * This eliminates worries about jump tables or checking boot_cpu_data |
749 | * in the cc_platform_has() function. |
750 | */ |
751 | if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) |
752 | return; |
753 | |
754 | /* |
755 | * Ask the hypervisor to mark the memory pages as private in the RMP |
756 | * table. |
757 | */ |
	early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
759 | } |
760 | |
761 | void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, |
762 | unsigned long npages) |
763 | { |
764 | /* |
765 | * This can be invoked in early boot while running identity mapped, so |
766 | * use an open coded check for SNP instead of using cc_platform_has(). |
767 | * This eliminates worries about jump tables or checking boot_cpu_data |
768 | * in the cc_platform_has() function. |
769 | */ |
770 | if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) |
771 | return; |
772 | |
773 | /* Ask hypervisor to mark the memory pages shared in the RMP table. */ |
	early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
775 | } |
776 | |
777 | void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) |
778 | { |
779 | unsigned long vaddr, npages; |
780 | |
781 | vaddr = (unsigned long)__va(paddr); |
782 | npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; |
783 | |
784 | if (op == SNP_PAGE_STATE_PRIVATE) |
785 | early_snp_set_memory_private(vaddr, paddr, npages); |
786 | else if (op == SNP_PAGE_STATE_SHARED) |
787 | early_snp_set_memory_shared(vaddr, paddr, npages); |
788 | else |
		WARN(1, "invalid memory op %d\n", op);
790 | } |
791 | |
792 | static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, |
793 | unsigned long vaddr_end, int op) |
794 | { |
795 | struct ghcb_state state; |
796 | bool use_large_entry; |
797 | struct psc_hdr *hdr; |
798 | struct psc_entry *e; |
799 | unsigned long flags; |
800 | unsigned long pfn; |
801 | struct ghcb *ghcb; |
802 | int i; |
803 | |
804 | hdr = &data->hdr; |
805 | e = data->entries; |
806 | |
807 | memset(data, 0, sizeof(*data)); |
808 | i = 0; |
809 | |
810 | while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { |
811 | hdr->end_entry = i; |
812 | |
		if (is_vmalloc_addr((void *)vaddr)) {
			pfn = vmalloc_to_pfn((void *)vaddr);
815 | use_large_entry = false; |
816 | } else { |
817 | pfn = __pa(vaddr) >> PAGE_SHIFT; |
818 | use_large_entry = true; |
819 | } |
820 | |
821 | e->gfn = pfn; |
822 | e->operation = op; |
823 | |
824 | if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && |
825 | (vaddr_end - vaddr) >= PMD_SIZE) { |
826 | e->pagesize = RMP_PG_SIZE_2M; |
827 | vaddr += PMD_SIZE; |
828 | } else { |
829 | e->pagesize = RMP_PG_SIZE_4K; |
830 | vaddr += PAGE_SIZE; |
831 | } |
832 | |
833 | e++; |
834 | i++; |
835 | } |
836 | |
	/* Page validation must be rescinded before changing to shared */
	if (op == SNP_PAGE_STATE_SHARED)
		pvalidate_pages(data);

	local_irq_save(flags);

	if (sev_cfg.ghcbs_initialized)
		ghcb = __sev_get_ghcb(&state);
	else
		ghcb = boot_ghcb;

	/* Invoke the hypervisor to perform the page state changes */
	if (!ghcb || vmgexit_psc(ghcb, data))
		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);

	if (sev_cfg.ghcbs_initialized)
		__sev_put_ghcb(&state);

	local_irq_restore(flags);

	/* Page validation must be performed after changing to private */
	if (op == SNP_PAGE_STATE_PRIVATE)
		pvalidate_pages(data);
860 | |
861 | return vaddr; |
862 | } |
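
/*
 * Batching example (hypothetical range): a 2M-aligned 4M chunk of the
 * direct map needs only two PSC entries with pagesize == RMP_PG_SIZE_2M,
 * while the same amount of vmalloc space (physically discontiguous) needs
 * 1024 RMP_PG_SIZE_4K entries and therefore several calls from the loop
 * in set_pages_state() once ARRAY_SIZE(data->entries) is exhausted.
 */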
863 | |
864 | static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) |
865 | { |
866 | struct snp_psc_desc desc; |
867 | unsigned long vaddr_end; |
868 | |
869 | /* Use the MSR protocol when a GHCB is not available. */ |
870 | if (!boot_ghcb) |
871 | return early_set_pages_state(vaddr, __pa(vaddr), npages, op); |
872 | |
873 | vaddr = vaddr & PAGE_MASK; |
874 | vaddr_end = vaddr + (npages << PAGE_SHIFT); |
875 | |
876 | while (vaddr < vaddr_end) |
		vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op);
878 | } |
879 | |
880 | void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) |
881 | { |
	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
		return;

	set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
886 | } |
887 | |
888 | void snp_set_memory_private(unsigned long vaddr, unsigned long npages) |
889 | { |
	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
		return;

	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
894 | } |
895 | |
896 | void snp_accept_memory(phys_addr_t start, phys_addr_t end) |
897 | { |
898 | unsigned long vaddr, npages; |
899 | |
	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
		return;

	vaddr = (unsigned long)__va(start);
	npages = (end - start) >> PAGE_SHIFT;

	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
907 | } |
908 | |
909 | static int snp_set_vmsa(void *va, bool vmsa) |
910 | { |
911 | u64 attrs; |
912 | |
913 | /* |
914 | * Running at VMPL0 allows the kernel to change the VMSA bit for a page |
915 | * using the RMPADJUST instruction. However, for the instruction to |
916 | * succeed it must target the permissions of a lesser privileged |
917 | * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST |
918 | * instruction in the AMD64 APM Volume 3). |
919 | */ |
920 | attrs = 1; |
921 | if (vmsa) |
922 | attrs |= RMPADJUST_VMSA_PAGE_BIT; |
923 | |
	return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
925 | } |
926 | |
927 | #define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) |
928 | #define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) |
929 | #define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) |
930 | |
931 | #define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) |
932 | #define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) |
933 | |
934 | static void *snp_alloc_vmsa_page(void) |
935 | { |
936 | struct page *p; |
937 | |
938 | /* |
939 | * Allocate VMSA page to work around the SNP erratum where the CPU will |
940 | * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) |
	 * collides with the RMP entry of a VMSA page. The recommended workaround
942 | * is to not use a large page. |
943 | * |
944 | * Allocate an 8k page which is also 8k-aligned. |
945 | */ |
	p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
	if (!p)
		return NULL;

	split_page(p, 1);
951 | |
952 | /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ |
953 | __free_page(p); |
954 | |
955 | return page_address(p + 1); |
956 | } |
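
/*
 * Why handing out the second page works: an order-1 allocation is
 * naturally 8k-aligned, so base + 4k is 4k- but never 8k-aligned and
 * hence can never sit on a 2M or 1G boundary - exactly what the erratum
 * workaround above requires.
 */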
957 | |
958 | static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) |
959 | { |
960 | int err; |
961 | |
	err = snp_set_vmsa(vmsa, false);
	if (err)
		pr_err("clear VMSA page failed (%u), leaking page\n", err);
965 | else |
966 | free_page((unsigned long)vmsa); |
967 | } |
968 | |
969 | static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) |
970 | { |
971 | struct sev_es_save_area *cur_vmsa, *vmsa; |
972 | struct ghcb_state state; |
973 | unsigned long flags; |
974 | struct ghcb *ghcb; |
975 | u8 sipi_vector; |
976 | int cpu, ret; |
977 | u64 cr4; |
978 | |
979 | /* |
980 | * The hypervisor SNP feature support check has happened earlier, just check |
981 | * the AP_CREATION one here. |
982 | */ |
983 | if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) |
984 | return -EOPNOTSUPP; |
985 | |
986 | /* |
987 | * Verify the desired start IP against the known trampoline start IP |
988 | * to catch any future new trampolines that may be introduced that |
989 | * would require a new protected guest entry point. |
990 | */ |
991 | if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, |
992 | "Unsupported SNP start_ip: %lx\n" , start_ip)) |
993 | return -EINVAL; |
994 | |
995 | /* Override start_ip with known protected guest start IP */ |
996 | start_ip = real_mode_header->sev_es_trampoline_start; |
997 | |
998 | /* Find the logical CPU for the APIC ID */ |
999 | for_each_present_cpu(cpu) { |
		if (arch_match_cpu_phys_id(cpu, apic_id))
1001 | break; |
1002 | } |
1003 | if (cpu >= nr_cpu_ids) |
1004 | return -EINVAL; |
1005 | |
1006 | cur_vmsa = per_cpu(sev_vmsa, cpu); |
1007 | |
1008 | /* |
1009 | * A new VMSA is created each time because there is no guarantee that |
	 * the current VMSA is the kernel's or that the vCPU is not running. If
1011 | * an attempt was done to use the current VMSA with a running vCPU, a |
1012 | * #VMEXIT of that vCPU would wipe out all of the settings being done |
1013 | * here. |
1014 | */ |
1015 | vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); |
1016 | if (!vmsa) |
1017 | return -ENOMEM; |
1018 | |
1019 | /* CR4 should maintain the MCE value */ |
1020 | cr4 = native_read_cr4() & X86_CR4_MCE; |
1021 | |
1022 | /* Set the CS value based on the start_ip converted to a SIPI vector */ |
1023 | sipi_vector = (start_ip >> 12); |
1024 | vmsa->cs.base = sipi_vector << 12; |
1025 | vmsa->cs.limit = AP_INIT_CS_LIMIT; |
1026 | vmsa->cs.attrib = INIT_CS_ATTRIBS; |
1027 | vmsa->cs.selector = sipi_vector << 8; |
1028 | |
1029 | /* Set the RIP value based on start_ip */ |
1030 | vmsa->rip = start_ip & 0xfff; |
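
	/*
	 * Worked example with a hypothetical start_ip of 0x9d06c:
	 * sipi_vector = 0x9d, cs.base = 0x9d000, cs.selector = 0x9d00 and
	 * rip = 0x06c, i.e. the real-mode far pointer 0x9d00:0x006c that a
	 * SIPI-started AP would begin executing at.
	 */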
1031 | |
1032 | /* Set AP INIT defaults as documented in the APM */ |
1033 | vmsa->ds.limit = AP_INIT_DS_LIMIT; |
1034 | vmsa->ds.attrib = INIT_DS_ATTRIBS; |
1035 | vmsa->es = vmsa->ds; |
1036 | vmsa->fs = vmsa->ds; |
1037 | vmsa->gs = vmsa->ds; |
1038 | vmsa->ss = vmsa->ds; |
1039 | |
1040 | vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; |
1041 | vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; |
1042 | vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; |
1043 | vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; |
1044 | vmsa->tr.limit = AP_INIT_TR_LIMIT; |
1045 | vmsa->tr.attrib = INIT_TR_ATTRIBS; |
1046 | |
1047 | vmsa->cr4 = cr4; |
1048 | vmsa->cr0 = AP_INIT_CR0_DEFAULT; |
1049 | vmsa->dr7 = DR7_RESET_VALUE; |
1050 | vmsa->dr6 = AP_INIT_DR6_DEFAULT; |
1051 | vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; |
1052 | vmsa->g_pat = AP_INIT_GPAT_DEFAULT; |
1053 | vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; |
1054 | vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; |
1055 | vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; |
1056 | vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; |
1057 | |
1058 | /* SVME must be set. */ |
1059 | vmsa->efer = EFER_SVME; |
1060 | |
1061 | /* |
1062 | * Set the SNP-specific fields for this VMSA: |
1063 | * VMPL level |
1064 | * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) |
1065 | */ |
1066 | vmsa->vmpl = 0; |
1067 | vmsa->sev_features = sev_status >> 2; |
1068 | |
1069 | /* Switch the page over to a VMSA page now that it is initialized */ |
	ret = snp_set_vmsa(vmsa, true);
	if (ret) {
		pr_err("set VMSA page failed (%u)\n", ret);
1073 | free_page((unsigned long)vmsa); |
1074 | |
1075 | return -EINVAL; |
1076 | } |
1077 | |
1078 | /* Issue VMGEXIT AP Creation NAE event */ |
1079 | local_irq_save(flags); |
1080 | |
	ghcb = __sev_get_ghcb(&state);

	vc_ghcb_invalidate(ghcb);
	ghcb_set_rax(ghcb, vmsa->sev_features);
	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
	ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE);
	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));

	sev_es_wr_ghcb_msr(__pa(ghcb));
	VMGEXIT();

	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
		pr_err("SNP AP Creation error\n");
		ret = -EINVAL;
	}

	__sev_put_ghcb(&state);
1099 | |
1100 | local_irq_restore(flags); |
1101 | |
1102 | /* Perform cleanup if there was an error */ |
1103 | if (ret) { |
1104 | snp_cleanup_vmsa(vmsa); |
1105 | vmsa = NULL; |
1106 | } |
1107 | |
1108 | /* Free up any previous VMSA page */ |
1109 | if (cur_vmsa) |
		snp_cleanup_vmsa(cur_vmsa);
1111 | |
1112 | /* Record the current VMSA page */ |
1113 | per_cpu(sev_vmsa, cpu) = vmsa; |
1114 | |
1115 | return ret; |
1116 | } |
1117 | |
1118 | void __init snp_set_wakeup_secondary_cpu(void) |
1119 | { |
	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1121 | return; |
1122 | |
1123 | /* |
1124 | * Always set this override if SNP is enabled. This makes it the |
1125 | * required method to start APs under SNP. If the hypervisor does |
1126 | * not support AP creation, then no APs will be started. |
1127 | */ |
1128 | apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit); |
1129 | } |
1130 | |
1131 | int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) |
1132 | { |
1133 | u16 startup_cs, startup_ip; |
1134 | phys_addr_t jump_table_pa; |
1135 | u64 jump_table_addr; |
1136 | u16 __iomem *jump_table; |
1137 | |
1138 | jump_table_addr = get_jump_table_addr(); |
1139 | |
1140 | /* On UP guests there is no jump table so this is not a failure */ |
1141 | if (!jump_table_addr) |
1142 | return 0; |
1143 | |
1144 | /* Check if AP Jump Table is page-aligned */ |
1145 | if (jump_table_addr & ~PAGE_MASK) |
1146 | return -EINVAL; |
1147 | |
1148 | jump_table_pa = jump_table_addr & PAGE_MASK; |
1149 | |
1150 | startup_cs = (u16)(rmh->trampoline_start >> 4); |
1151 | startup_ip = (u16)(rmh->sev_es_trampoline_start - |
1152 | rmh->trampoline_start); |
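
	/*
	 * The AP jump table is a pair of 16-bit words: word 0 holds the
	 * real-mode IP and word 1 the CS segment the AP jumps to on wakeup,
	 * written below in exactly that order. E.g. (hypothetical layout) a
	 * trampoline at 0x9d000 gives startup_cs = 0x9d00.
	 */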
1153 | |
	jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
1155 | if (!jump_table) |
1156 | return -EIO; |
1157 | |
	writew(startup_ip, &jump_table[0]);
	writew(startup_cs, &jump_table[1]);

	iounmap(jump_table);
1162 | |
1163 | return 0; |
1164 | } |
1165 | |
1166 | /* |
1167 | * This is needed by the OVMF UEFI firmware which will use whatever it finds in |
1168 | * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu |
1169 | * runtime GHCBs used by the kernel are also mapped in the EFI page-table. |
1170 | */ |
1171 | int __init sev_es_efi_map_ghcbs(pgd_t *pgd) |
1172 | { |
1173 | struct sev_es_runtime_data *data; |
1174 | unsigned long address, pflags; |
1175 | int cpu; |
1176 | u64 pfn; |
1177 | |
	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
1179 | return 0; |
1180 | |
1181 | pflags = _PAGE_NX | _PAGE_RW; |
1182 | |
1183 | for_each_possible_cpu(cpu) { |
1184 | data = per_cpu(runtime_data, cpu); |
1185 | |
1186 | address = __pa(&data->ghcb_page); |
1187 | pfn = address >> PAGE_SHIFT; |
1188 | |
		if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
1190 | return 1; |
1191 | } |
1192 | |
1193 | return 0; |
1194 | } |
1195 | |
1196 | static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) |
1197 | { |
1198 | struct pt_regs *regs = ctxt->regs; |
1199 | enum es_result ret; |
1200 | u64 exit_info_1; |
1201 | |
	/* Is it a WRMSR? Opcode byte 1 is 0x30 for WRMSR, 0x32 for RDMSR. */
	exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;

	ghcb_set_rcx(ghcb, regs->cx);
	if (exit_info_1) {
		ghcb_set_rax(ghcb, regs->ax);
		ghcb_set_rdx(ghcb, regs->dx);
	}
1210 | |
	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
1212 | |
1213 | if ((ret == ES_OK) && (!exit_info_1)) { |
1214 | regs->ax = ghcb->save.rax; |
1215 | regs->dx = ghcb->save.rdx; |
1216 | } |
1217 | |
1218 | return ret; |
1219 | } |
1220 | |
1221 | static void snp_register_per_cpu_ghcb(void) |
1222 | { |
1223 | struct sev_es_runtime_data *data; |
1224 | struct ghcb *ghcb; |
1225 | |
1226 | data = this_cpu_read(runtime_data); |
1227 | ghcb = &data->ghcb_page; |
1228 | |
1229 | snp_register_ghcb_early(__pa(ghcb)); |
1230 | } |
1231 | |
1232 | void setup_ghcb(void) |
1233 | { |
	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
1235 | return; |
1236 | |
1237 | /* First make sure the hypervisor talks a supported protocol. */ |
1238 | if (!sev_es_negotiate_protocol()) |
1239 | sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); |
1240 | |
1241 | /* |
1242 | * Check whether the runtime #VC exception handler is active. It uses |
1243 | * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). |
1244 | * |
1245 | * If SNP is active, register the per-CPU GHCB page so that the runtime |
1246 | * exception handler can use it. |
1247 | */ |
1248 | if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { |
		if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1250 | snp_register_per_cpu_ghcb(); |
1251 | |
1252 | sev_cfg.ghcbs_initialized = true; |
1253 | |
1254 | return; |
1255 | } |
1256 | |
1257 | /* |
1258 | * Clear the boot_ghcb. The first exception comes in before the bss |
1259 | * section is cleared. |
1260 | */ |
1261 | memset(&boot_ghcb_page, 0, PAGE_SIZE); |
1262 | |
1263 | /* Alright - Make the boot-ghcb public */ |
1264 | boot_ghcb = &boot_ghcb_page; |
1265 | |
1266 | /* SNP guest requires that GHCB GPA must be registered. */ |
	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1268 | snp_register_ghcb_early(__pa(&boot_ghcb_page)); |
1269 | } |
1270 | |
1271 | #ifdef CONFIG_HOTPLUG_CPU |
1272 | static void sev_es_ap_hlt_loop(void) |
1273 | { |
1274 | struct ghcb_state state; |
1275 | struct ghcb *ghcb; |
1276 | |
	ghcb = __sev_get_ghcb(&state);

	while (true) {
		vc_ghcb_invalidate(ghcb);
		ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
		ghcb_set_sw_exit_info_1(ghcb, 0);
		ghcb_set_sw_exit_info_2(ghcb, 0);
1284 | |
1285 | sev_es_wr_ghcb_msr(__pa(ghcb)); |
1286 | VMGEXIT(); |
1287 | |
1288 | /* Wakeup signal? */ |
1289 | if (ghcb_sw_exit_info_2_is_valid(ghcb) && |
1290 | ghcb->save.sw_exit_info_2) |
1291 | break; |
1292 | } |
1293 | |
	__sev_put_ghcb(&state);
1295 | } |
1296 | |
1297 | /* |
1298 | * Play_dead handler when running under SEV-ES. This is needed because |
1299 | * the hypervisor can't deliver an SIPI request to restart the AP. |
1300 | * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the |
1301 | * hypervisor wakes it up again. |
1302 | */ |
1303 | static void sev_es_play_dead(void) |
1304 | { |
1305 | play_dead_common(); |
1306 | |
1307 | /* IRQs now disabled */ |
1308 | |
1309 | sev_es_ap_hlt_loop(); |
1310 | |
1311 | /* |
1312 | * If we get here, the VCPU was woken up again. Jump to CPU |
1313 | * startup code to get it back online. |
1314 | */ |
1315 | soft_restart_cpu(); |
1316 | } |
1317 | #else /* CONFIG_HOTPLUG_CPU */ |
1318 | #define sev_es_play_dead native_play_dead |
1319 | #endif /* CONFIG_HOTPLUG_CPU */ |
1320 | |
1321 | #ifdef CONFIG_SMP |
1322 | static void __init sev_es_setup_play_dead(void) |
1323 | { |
1324 | smp_ops.play_dead = sev_es_play_dead; |
1325 | } |
1326 | #else |
1327 | static inline void sev_es_setup_play_dead(void) { } |
1328 | #endif |
1329 | |
1330 | static void __init alloc_runtime_data(int cpu) |
1331 | { |
1332 | struct sev_es_runtime_data *data; |
1333 | |
	data = memblock_alloc(sizeof(*data), PAGE_SIZE);
	if (!data)
		panic("Can't allocate SEV-ES runtime data");
1337 | |
1338 | per_cpu(runtime_data, cpu) = data; |
1339 | } |
1340 | |
1341 | static void __init init_ghcb(int cpu) |
1342 | { |
1343 | struct sev_es_runtime_data *data; |
1344 | int err; |
1345 | |
1346 | data = per_cpu(runtime_data, cpu); |
1347 | |
	err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
					 sizeof(data->ghcb_page));
	if (err)
		panic("Can't map GHCBs unencrypted");
1352 | |
1353 | memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); |
1354 | |
1355 | data->ghcb_active = false; |
1356 | data->backup_ghcb_active = false; |
1357 | } |
1358 | |
1359 | void __init sev_es_init_vc_handling(void) |
1360 | { |
1361 | int cpu; |
1362 | |
1363 | BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); |
1364 | |
	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
		return;

	if (!sev_es_check_cpu_features())
		panic("SEV-ES CPU Features missing");
1370 | |
1371 | /* |
1372 | * SNP is supported in v2 of the GHCB spec which mandates support for HV |
1373 | * features. |
1374 | */ |
	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
1376 | sev_hv_features = get_hv_features(); |
1377 | |
1378 | if (!(sev_hv_features & GHCB_HV_FT_SNP)) |
1379 | sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); |
1380 | } |
1381 | |
1382 | /* Initialize per-cpu GHCB pages */ |
1383 | for_each_possible_cpu(cpu) { |
1384 | alloc_runtime_data(cpu); |
1385 | init_ghcb(cpu); |
1386 | } |
1387 | |
1388 | sev_es_setup_play_dead(); |
1389 | |
1390 | /* Secondary CPUs use the runtime #VC handler */ |
1391 | initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; |
1392 | } |
1393 | |
1394 | static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) |
1395 | { |
1396 | int trapnr = ctxt->fi.vector; |
1397 | |
	if (trapnr == X86_TRAP_PF)
		native_write_cr2(ctxt->fi.cr2);

	ctxt->regs->orig_ax = ctxt->fi.error_code;
	do_early_exception(ctxt->regs, trapnr);
1403 | } |
1404 | |
1405 | static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) |
1406 | { |
1407 | long *reg_array; |
1408 | int offset; |
1409 | |
1410 | reg_array = (long *)ctxt->regs; |
	offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
1412 | |
1413 | if (offset < 0) |
1414 | return NULL; |
1415 | |
1416 | offset /= sizeof(long); |
1417 | |
1418 | return reg_array + offset; |
1419 | } |
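
/*
 * Note on the offset math above: insn_get_modrm_rm_off() returns a byte
 * offset into struct pt_regs (e.g. offsetof(struct pt_regs, si) for an
 * rm operand of RSI), so after dividing by sizeof(long) the result
 * indexes the register array and *vc_insn_get_rm(ctxt) accesses
 * ctxt->regs->si.
 */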
1420 | static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, |
1421 | unsigned int bytes, bool read) |
1422 | { |
1423 | u64 exit_code, exit_info_1, exit_info_2; |
1424 | unsigned long ghcb_pa = __pa(ghcb); |
1425 | enum es_result res; |
1426 | phys_addr_t paddr; |
1427 | void __user *ref; |
1428 | |
	ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
1430 | if (ref == (void __user *)-1L) |
1431 | return ES_UNSUPPORTED; |
1432 | |
1433 | exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; |
1434 | |
	res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
1436 | if (res != ES_OK) { |
1437 | if (res == ES_EXCEPTION && !read) |
1438 | ctxt->fi.error_code |= X86_PF_WRITE; |
1439 | |
1440 | return res; |
1441 | } |
1442 | |
1443 | exit_info_1 = paddr; |
1444 | /* Can never be greater than 8 */ |
1445 | exit_info_2 = bytes; |
1446 | |
	ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
1448 | |
1449 | return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); |
1450 | } |
1451 | |
1452 | /* |
1453 | * The MOVS instruction has two memory operands, which raises the |
1454 | * problem that it is not known whether the access to the source or the |
1455 | * destination caused the #VC exception (and hence whether an MMIO read |
1456 | * or write operation needs to be emulated). |
1457 | * |
1458 | * Instead of playing games with walking page-tables and trying to guess |
1459 | * whether the source or destination is an MMIO range, split the move |
1460 | * into two operations, a read and a write with only one memory operand. |
1461 | * This will cause a nested #VC exception on the MMIO address which can |
1462 | * then be handled. |
1463 | * |
1464 | * This implementation has the benefit that it also supports MOVS where |
1465 | * source _and_ destination are MMIO regions. |
1466 | * |
1467 | * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a |
1468 | * rare operation. If it turns out to be a performance problem the split |
1469 | * operations can be moved to memcpy_fromio() and memcpy_toio(). |
1470 | */ |
1471 | static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, |
1472 | unsigned int bytes) |
1473 | { |
1474 | unsigned long ds_base, es_base; |
1475 | unsigned char *src, *dst; |
1476 | unsigned char buffer[8]; |
1477 | enum es_result ret; |
1478 | bool rep; |
1479 | int off; |
1480 | |
	ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
	es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
1483 | |
1484 | if (ds_base == -1L || es_base == -1L) { |
1485 | ctxt->fi.vector = X86_TRAP_GP; |
1486 | ctxt->fi.error_code = 0; |
1487 | return ES_EXCEPTION; |
1488 | } |
1489 | |
1490 | src = ds_base + (unsigned char *)ctxt->regs->si; |
1491 | dst = es_base + (unsigned char *)ctxt->regs->di; |
1492 | |
	ret = vc_read_mem(ctxt, src, buffer, bytes);
1494 | if (ret != ES_OK) |
1495 | return ret; |
1496 | |
	ret = vc_write_mem(ctxt, dst, buffer, bytes);
1498 | if (ret != ES_OK) |
1499 | return ret; |
1500 | |
1501 | if (ctxt->regs->flags & X86_EFLAGS_DF) |
1502 | off = -bytes; |
1503 | else |
1504 | off = bytes; |
1505 | |
1506 | ctxt->regs->si += off; |
1507 | ctxt->regs->di += off; |
1508 | |
	rep = insn_has_rep_prefix(&ctxt->insn);
1510 | if (rep) |
1511 | ctxt->regs->cx -= 1; |
1512 | |
1513 | if (!rep || ctxt->regs->cx == 0) |
1514 | return ES_OK; |
1515 | else |
1516 | return ES_RETRY; |
1517 | } |
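
/*
 * Retry semantics, sketched for a hypothetical "rep movsb" with rcx == 3:
 * the function returns ES_RETRY twice (rcx 3 -> 2 -> 1), signalling the
 * #VC handler to retry the instruction without advancing rip, so MOVS
 * re-executes with the already-updated rsi/rdi/rcx, and returns ES_OK on
 * the final iteration (rcx reaches 0), after which rip is advanced past
 * the instruction.
 */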
1518 | |
1519 | static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) |
1520 | { |
1521 | struct insn *insn = &ctxt->insn; |
1522 | enum insn_mmio_type mmio; |
1523 | unsigned int bytes = 0; |
1524 | enum es_result ret; |
1525 | u8 sign_byte; |
1526 | long *reg_data; |
1527 | |
	mmio = insn_decode_mmio(insn, &bytes);
1529 | if (mmio == INSN_MMIO_DECODE_FAILED) |
1530 | return ES_DECODE_FAILED; |
1531 | |
1532 | if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { |
		reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
1534 | if (!reg_data) |
1535 | return ES_DECODE_FAILED; |
1536 | } |
1537 | |
	if (user_mode(ctxt->regs))
1539 | return ES_UNSUPPORTED; |
1540 | |
1541 | switch (mmio) { |
1542 | case INSN_MMIO_WRITE: |
1543 | memcpy(ghcb->shared_buffer, reg_data, bytes); |
		ret = vc_do_mmio(ghcb, ctxt, bytes, false);
1545 | break; |
1546 | case INSN_MMIO_WRITE_IMM: |
1547 | memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); |
		ret = vc_do_mmio(ghcb, ctxt, bytes, false);
1549 | break; |
1550 | case INSN_MMIO_READ: |
		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1552 | if (ret) |
1553 | break; |
1554 | |
1555 | /* Zero-extend for 32-bit operation */ |
1556 | if (bytes == 4) |
1557 | *reg_data = 0; |
1558 | |
1559 | memcpy(reg_data, ghcb->shared_buffer, bytes); |
1560 | break; |
1561 | case INSN_MMIO_READ_ZERO_EXTEND: |
		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1563 | if (ret) |
1564 | break; |
1565 | |
1566 | /* Zero extend based on operand size */ |
1567 | memset(reg_data, 0, insn->opnd_bytes); |
1568 | memcpy(reg_data, ghcb->shared_buffer, bytes); |
1569 | break; |
1570 | case INSN_MMIO_READ_SIGN_EXTEND: |
		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1572 | if (ret) |
1573 | break; |
1574 | |
1575 | if (bytes == 1) { |
1576 | u8 *val = (u8 *)ghcb->shared_buffer; |
1577 | |
1578 | sign_byte = (*val & 0x80) ? 0xff : 0x00; |
1579 | } else { |
1580 | u16 *val = (u16 *)ghcb->shared_buffer; |
1581 | |
1582 | sign_byte = (*val & 0x8000) ? 0xff : 0x00; |
1583 | } |
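
		/*
		 * E.g. a 1-byte sign-extending read that fetched 0x80 sets
		 * sign_byte to 0xff, so a 4-byte destination operand becomes
		 * 0xffffff80 after the memset()/memcpy() pair below.
		 */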
1584 | |
1585 | /* Sign extend based on operand size */ |
1586 | memset(reg_data, sign_byte, insn->opnd_bytes); |
1587 | memcpy(reg_data, ghcb->shared_buffer, bytes); |
1588 | break; |
1589 | case INSN_MMIO_MOVS: |
1590 | ret = vc_handle_mmio_movs(ctxt, bytes); |
1591 | break; |
1592 | default: |
1593 | ret = ES_UNSUPPORTED; |
1594 | break; |
1595 | } |
1596 | |
1597 | return ret; |
1598 | } |
1599 | |
1600 | static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, |
1601 | struct es_em_ctxt *ctxt) |
1602 | { |
1603 | struct sev_es_runtime_data *data = this_cpu_read(runtime_data); |
1604 | long val, *reg = vc_insn_get_rm(ctxt); |
1605 | enum es_result ret; |
1606 | |
1607 | if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) |
1608 | return ES_VMM_ERROR; |
1609 | |
1610 | if (!reg) |
1611 | return ES_DECODE_FAILED; |
1612 | |
1613 | val = *reg; |
1614 | |
1615 | /* Upper 32 bits must be written as zeroes */ |
1616 | if (val >> 32) { |
1617 | ctxt->fi.vector = X86_TRAP_GP; |
1618 | ctxt->fi.error_code = 0; |
1619 | return ES_EXCEPTION; |
1620 | } |
1621 | |
1622 | /* Clear out other reserved bits and set bit 10 */ |
1623 | val = (val & 0xffff23ffL) | BIT(10); |
1624 | |
1625 | /* Early non-zero writes to DR7 are not supported */ |
1626 | if (!data && (val & ~DR7_RESET_VALUE)) |
1627 | return ES_UNSUPPORTED; |
1628 | |
1629 | /* Using a value of 0 for ExitInfo1 means RAX holds the value */ |
	ghcb_set_rax(ghcb, val);
	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
1632 | if (ret != ES_OK) |
1633 | return ret; |
1634 | |
1635 | if (data) |
1636 | data->dr7 = val; |
1637 | |
1638 | return ES_OK; |
1639 | } |
1640 | |
1641 | static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, |
1642 | struct es_em_ctxt *ctxt) |
1643 | { |
1644 | struct sev_es_runtime_data *data = this_cpu_read(runtime_data); |
1645 | long *reg = vc_insn_get_rm(ctxt); |
1646 | |
1647 | if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) |
1648 | return ES_VMM_ERROR; |
1649 | |
1650 | if (!reg) |
1651 | return ES_DECODE_FAILED; |
1652 | |
1653 | if (data) |
1654 | *reg = data->dr7; |
1655 | else |
1656 | *reg = DR7_RESET_VALUE; |
1657 | |
1658 | return ES_OK; |
1659 | } |
1660 | |
1661 | static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, |
1662 | struct es_em_ctxt *ctxt) |
1663 | { |
	return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
1665 | } |
1666 | |
1667 | static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) |
1668 | { |
1669 | enum es_result ret; |
1670 | |
	ghcb_set_rcx(ghcb, ctxt->regs->cx);

	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
1674 | if (ret != ES_OK) |
1675 | return ret; |
1676 | |
1677 | if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) |
1678 | return ES_VMM_ERROR; |
1679 | |
1680 | ctxt->regs->ax = ghcb->save.rax; |
1681 | ctxt->regs->dx = ghcb->save.rdx; |
1682 | |
1683 | return ES_OK; |
1684 | } |
1685 | |
1686 | static enum es_result vc_handle_monitor(struct ghcb *ghcb, |
1687 | struct es_em_ctxt *ctxt) |
1688 | { |
1689 | /* |
1690 | * Treat it as a NOP and do not leak a physical address to the |
1691 | * hypervisor. |
1692 | */ |
1693 | return ES_OK; |
1694 | } |
1695 | |
1696 | static enum es_result vc_handle_mwait(struct ghcb *ghcb, |
1697 | struct es_em_ctxt *ctxt) |
1698 | { |
1699 | /* Treat the same as MONITOR/MONITORX */ |
1700 | return ES_OK; |
1701 | } |
1702 | |
1703 | static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, |
1704 | struct es_em_ctxt *ctxt) |
1705 | { |
1706 | enum es_result ret; |
1707 | |
	ghcb_set_rax(ghcb, ctxt->regs->ax);
	ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);

	if (x86_platform.hyper.sev_es_hcall_prepare)
		x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);

	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
1715 | if (ret != ES_OK) |
1716 | return ret; |
1717 | |
1718 | if (!ghcb_rax_is_valid(ghcb)) |
1719 | return ES_VMM_ERROR; |
1720 | |
1721 | ctxt->regs->ax = ghcb->save.rax; |
1722 | |
1723 | /* |
1724 | * Call sev_es_hcall_finish() after regs->ax is already set. |
1725 | * This allows the hypervisor handler to overwrite it again if |
1726 | * necessary. |
1727 | */ |
1728 | if (x86_platform.hyper.sev_es_hcall_finish && |
1729 | !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) |
1730 | return ES_VMM_ERROR; |
1731 | |
1732 | return ES_OK; |
1733 | } |
1734 | |
1735 | static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, |
1736 | struct es_em_ctxt *ctxt) |
1737 | { |
1738 | /* |
	 * Calling exc_alignment_check() directly does not work, because it
1740 | * enables IRQs and the GHCB is active. Forward the exception and call |
1741 | * it later from vc_forward_exception(). |
1742 | */ |
1743 | ctxt->fi.vector = X86_TRAP_AC; |
1744 | ctxt->fi.error_code = 0; |
1745 | return ES_EXCEPTION; |
1746 | } |
1747 | |
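/*
 * Central dispatcher mapping a #VC exit code to its emulation handler. Shared
 * by the early boot handler and the runtime kernel/user handlers.
 */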
1748 | static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, |
1749 | struct ghcb *ghcb, |
1750 | unsigned long exit_code) |
1751 | { |
1752 | enum es_result result; |
1753 | |
1754 | switch (exit_code) { |
1755 | case SVM_EXIT_READ_DR7: |
1756 | result = vc_handle_dr7_read(ghcb, ctxt); |
1757 | break; |
1758 | case SVM_EXIT_WRITE_DR7: |
1759 | result = vc_handle_dr7_write(ghcb, ctxt); |
1760 | break; |
1761 | case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: |
1762 | result = vc_handle_trap_ac(ghcb, ctxt); |
1763 | break; |
1764 | case SVM_EXIT_RDTSC: |
1765 | case SVM_EXIT_RDTSCP: |
1766 | result = vc_handle_rdtsc(ghcb, ctxt, exit_code); |
1767 | break; |
1768 | case SVM_EXIT_RDPMC: |
1769 | result = vc_handle_rdpmc(ghcb, ctxt); |
1770 | break; |
1771 | case SVM_EXIT_INVD: |
1772 | pr_err_ratelimited("#VC exception for INVD??? Seriously???\n" ); |
1773 | result = ES_UNSUPPORTED; |
1774 | break; |
1775 | case SVM_EXIT_CPUID: |
1776 | result = vc_handle_cpuid(ghcb, ctxt); |
1777 | break; |
1778 | case SVM_EXIT_IOIO: |
1779 | result = vc_handle_ioio(ghcb, ctxt); |
1780 | break; |
1781 | case SVM_EXIT_MSR: |
1782 | result = vc_handle_msr(ghcb, ctxt); |
1783 | break; |
1784 | case SVM_EXIT_VMMCALL: |
1785 | result = vc_handle_vmmcall(ghcb, ctxt); |
1786 | break; |
1787 | case SVM_EXIT_WBINVD: |
1788 | result = vc_handle_wbinvd(ghcb, ctxt); |
1789 | break; |
1790 | case SVM_EXIT_MONITOR: |
1791 | result = vc_handle_monitor(ghcb, ctxt); |
1792 | break; |
1793 | case SVM_EXIT_MWAIT: |
1794 | result = vc_handle_mwait(ghcb, ctxt); |
1795 | break; |
1796 | case SVM_EXIT_NPF: |
1797 | result = vc_handle_mmio(ghcb, ctxt); |
1798 | break; |
1799 | default: |
1800 | /* |
1801 | * Unexpected #VC exception |
1802 | */ |
1803 | result = ES_UNSUPPORTED; |
1804 | } |
1805 | |
1806 | return result; |
1807 | } |
1808 | |
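/*
 * Forward an exception queued during instruction emulation by calling the
 * corresponding exception handler directly, as if the emulated instruction
 * had raised it itself.
 */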
1809 | static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) |
1810 | { |
1811 | long error_code = ctxt->fi.error_code; |
1812 | int trapnr = ctxt->fi.vector; |
1813 | |
1814 | ctxt->regs->orig_ax = ctxt->fi.error_code; |
1815 | |
1816 | switch (trapnr) { |
1817 | case X86_TRAP_GP: |
1818 | exc_general_protection(regs: ctxt->regs, error_code); |
1819 | break; |
1820 | case X86_TRAP_UD: |
1821 | exc_invalid_op(regs: ctxt->regs); |
1822 | break; |
1823 | case X86_TRAP_PF: |
1824 | write_cr2(x: ctxt->fi.cr2); |
1825 | exc_page_fault(regs: ctxt->regs, error_code); |
1826 | break; |
1827 | case X86_TRAP_AC: |
1828 | exc_alignment_check(regs: ctxt->regs, error_code); |
1829 | break; |
1830 | default: |
1831 | pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n" ); |
1832 | BUG(); |
1833 | } |
1834 | } |
1835 | |
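/*
 * The VC2 stack is the fallback IST stack which the #VC entry code switches
 * to when the interrupted stack is unusable.
 */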
1836 | static __always_inline bool is_vc2_stack(unsigned long sp) |
1837 | { |
1838 | return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); |
1839 | } |
1840 | |
1841 | static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) |
1842 | { |
1843 | unsigned long sp, prev_sp; |
1844 | |
1845 | sp = (unsigned long)regs; |
1846 | prev_sp = regs->sp; |
1847 | |
1848 | /* |
1849 | * If the code was already executing on the VC2 stack when the #VC |
1850 | * happened, let it proceed to the normal handling routine. This way the |
1851 | * code executing on the VC2 stack can cause #VC exceptions to get handled. |
1852 | */ |
	return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
1854 | } |
1855 | |
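/*
 * Common runtime #VC handling: set up the per-CPU GHCB and the emulation
 * context, dispatch on the exit code and act on the result. Returns false if
 * the exception could not be handled.
 */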
1856 | static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) |
1857 | { |
1858 | struct ghcb_state state; |
1859 | struct es_em_ctxt ctxt; |
1860 | enum es_result result; |
1861 | struct ghcb *ghcb; |
1862 | bool ret = true; |
1863 | |
	ghcb = __sev_get_ghcb(&state);

	vc_ghcb_invalidate(ghcb);
	result = vc_init_em_ctxt(&ctxt, regs, error_code);

	if (result == ES_OK)
		result = vc_handle_exitcode(&ctxt, ghcb, error_code);

	__sev_put_ghcb(&state);
1873 | |
1874 | /* Done - now check the result */ |
1875 | switch (result) { |
1876 | case ES_OK: |
		vc_finish_insn(&ctxt);
1878 | break; |
1879 | case ES_UNSUPPORTED: |
1880 | pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n" , |
1881 | error_code, regs->ip); |
1882 | ret = false; |
1883 | break; |
1884 | case ES_VMM_ERROR: |
1885 | pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n" , |
1886 | error_code, regs->ip); |
1887 | ret = false; |
1888 | break; |
1889 | case ES_DECODE_FAILED: |
1890 | pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n" , |
1891 | error_code, regs->ip); |
1892 | ret = false; |
1893 | break; |
1894 | case ES_EXCEPTION: |
		vc_forward_exception(&ctxt);
1896 | break; |
1897 | case ES_RETRY: |
1898 | /* Nothing to do */ |
1899 | break; |
1900 | default: |
1901 | pr_emerg("Unknown result in %s():%d\n" , __func__, result); |
1902 | /* |
1903 | * Emulating the instruction which caused the #VC exception |
1904 | * failed - can't continue so print debug information |
1905 | */ |
1906 | BUG(); |
1907 | } |
1908 | |
1909 | return ret; |
1910 | } |
1911 | |
1912 | static __always_inline bool vc_is_db(unsigned long error_code) |
1913 | { |
1914 | return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; |
1915 | } |
1916 | |
1917 | /* |
1918 | * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode |
1919 | * and will panic when an error happens. |
1920 | */ |
1921 | DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) |
1922 | { |
1923 | irqentry_state_t irq_state; |
1924 | |
1925 | /* |
1926 | * With the current implementation it is always possible to switch to a |
1927 | * safe stack because #VC exceptions only happen at known places, like |
1928 | * intercepted instructions or accesses to MMIO areas/IO ports. They can |
1929 | * also happen with code instrumentation when the hypervisor intercepts |
1930 | * #DB, but the critical paths are forbidden to be instrumented, so #DB |
1931 | * exceptions currently also only happen in safe places. |
1932 | * |
	 * But keep this here in case the noinstr annotations are violated due
	 * to a bug elsewhere.
1935 | */ |
1936 | if (unlikely(vc_from_invalid_context(regs))) { |
1937 | instrumentation_begin(); |
		panic("Can't handle #VC exception from unsupported context\n");
1939 | instrumentation_end(); |
1940 | } |
1941 | |
1942 | /* |
1943 | * Handle #DB before calling into !noinstr code to avoid recursive #DB. |
1944 | */ |
1945 | if (vc_is_db(error_code)) { |
1946 | exc_debug(regs); |
1947 | return; |
1948 | } |
1949 | |
1950 | irq_state = irqentry_nmi_enter(regs); |
1951 | |
1952 | instrumentation_begin(); |
1953 | |
1954 | if (!vc_raw_handle_exception(regs, error_code)) { |
1955 | /* Show some debug info */ |
1956 | show_regs(regs); |
1957 | |
1958 | /* Ask hypervisor to sev_es_terminate */ |
1959 | sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); |
1960 | |
1961 | /* If that fails and we get here - just panic */ |
		panic("Returned from Terminate-Request to Hypervisor\n");
1963 | } |
1964 | |
1965 | instrumentation_end(); |
1966 | irqentry_nmi_exit(regs, irq_state); |
1967 | } |
1968 | |
1969 | /* |
1970 | * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode |
1971 | * and will kill the current task with SIGBUS when an error happens. |
1972 | */ |
1973 | DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) |
1974 | { |
1975 | /* |
1976 | * Handle #DB before calling into !noinstr code to avoid recursive #DB. |
1977 | */ |
1978 | if (vc_is_db(error_code)) { |
1979 | noist_exc_debug(regs); |
1980 | return; |
1981 | } |
1982 | |
1983 | irqentry_enter_from_user_mode(regs); |
1984 | instrumentation_begin(); |
1985 | |
1986 | if (!vc_raw_handle_exception(regs, error_code)) { |
1987 | /* |
1988 | * Do not kill the machine if user-space triggered the |
1989 | * exception. Send SIGBUS instead and let user-space deal with |
1990 | * it. |
1991 | */ |
		force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
1993 | } |
1994 | |
1995 | instrumentation_end(); |
1996 | irqentry_exit_to_user_mode(regs); |
1997 | } |
1998 | |
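/*
 * #VC handler used during early boot, operating on the statically allocated
 * boot GHCB. Failures are fatal and end in a termination request to the
 * hypervisor.
 */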
1999 | bool __init handle_vc_boot_ghcb(struct pt_regs *regs) |
2000 | { |
2001 | unsigned long exit_code = regs->orig_ax; |
2002 | struct es_em_ctxt ctxt; |
2003 | enum es_result result; |
2004 | |
	vc_ghcb_invalidate(boot_ghcb);

	result = vc_init_em_ctxt(&ctxt, regs, exit_code);
	if (result == ES_OK)
		result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
2010 | |
2011 | /* Done - now check the result */ |
2012 | switch (result) { |
2013 | case ES_OK: |
2014 | vc_finish_insn(ctxt: &ctxt); |
2015 | break; |
2016 | case ES_UNSUPPORTED: |
2017 | early_printk(fmt: "PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n" , |
2018 | exit_code, regs->ip); |
2019 | goto fail; |
2020 | case ES_VMM_ERROR: |
2021 | early_printk(fmt: "PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n" , |
2022 | exit_code, regs->ip); |
2023 | goto fail; |
2024 | case ES_DECODE_FAILED: |
2025 | early_printk(fmt: "PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n" , |
2026 | exit_code, regs->ip); |
2027 | goto fail; |
2028 | case ES_EXCEPTION: |
		vc_early_forward_exception(&ctxt);
2030 | break; |
2031 | case ES_RETRY: |
2032 | /* Nothing to do */ |
2033 | break; |
2034 | default: |
2035 | BUG(); |
2036 | } |
2037 | |
2038 | return true; |
2039 | |
2040 | fail: |
2041 | show_regs(regs); |
2042 | |
2043 | sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); |
2044 | } |
2045 | |
2046 | /* |
2047 | * Initial set up of SNP relies on information provided by the |
2048 | * Confidential Computing blob, which can be passed to the kernel |
2049 | * in the following ways, depending on how it is booted: |
2050 | * |
2051 | * - when booted via the boot/decompress kernel: |
2052 | * - via boot_params |
2053 | * |
2054 | * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): |
2055 | * - via a setup_data entry, as defined by the Linux Boot Protocol |
2056 | * |
2057 | * Scan for the blob in that order. |
2058 | */ |
2059 | static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) |
2060 | { |
2061 | struct cc_blob_sev_info *cc_info; |
2062 | |
2063 | /* Boot kernel would have passed the CC blob via boot_params. */ |
2064 | if (bp->cc_blob_address) { |
2065 | cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; |
2066 | goto found_cc_info; |
2067 | } |
2068 | |
2069 | /* |
2070 | * If kernel was booted directly, without the use of the |
2071 | * boot/decompression kernel, the CC blob may have been passed via |
2072 | * setup_data instead. |
2073 | */ |
2074 | cc_info = find_cc_blob_setup_data(bp); |
2075 | if (!cc_info) |
2076 | return NULL; |
2077 | |
2078 | found_cc_info: |
2079 | if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) |
2080 | snp_abort(); |
2081 | |
2082 | return cc_info; |
2083 | } |
2084 | |
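/*
 * Look for the SNP Confidential Computing blob and set up the SNP CPUID
 * table from it. Returns true if the blob was found, i.e. when running as an
 * SEV-SNP guest.
 */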
2085 | bool __init snp_init(struct boot_params *bp) |
2086 | { |
2087 | struct cc_blob_sev_info *cc_info; |
2088 | |
2089 | if (!bp) |
2090 | return false; |
2091 | |
2092 | cc_info = find_cc_blob(bp); |
2093 | if (!cc_info) |
2094 | return false; |
2095 | |
2096 | setup_cpuid_table(cc_info); |
2097 | |
2098 | /* |
2099 | * The CC blob will be used later to access the secrets page. Cache |
2100 | * it here like the boot kernel does. |
2101 | */ |
2102 | bp->cc_blob_address = (u32)(unsigned long)cc_info; |
2103 | |
2104 | return true; |
2105 | } |
2106 | |
2107 | void __init __noreturn snp_abort(void) |
2108 | { |
2109 | sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); |
2110 | } |
2111 | |
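/* Dump the SNP CPUID table contents. Only used when booted with sev=debug. */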
2112 | static void dump_cpuid_table(void) |
2113 | { |
2114 | const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); |
2115 | int i = 0; |
2116 | |
2117 | pr_info("count=%d reserved=0x%x reserved2=0x%llx\n" , |
2118 | cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); |
2119 | |
2120 | for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { |
2121 | const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; |
2122 | |
2123 | pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n" , |
2124 | i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, |
2125 | fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); |
2126 | } |
2127 | } |
2128 | |
2129 | /* |
2130 | * It is useful from an auditing/testing perspective to provide an easy way |
2131 | * for the guest owner to know that the CPUID table has been initialized as |
2132 | * expected, but that initialization happens too early in boot to print any |
2133 | * sort of indicator, and there's not really any other good place to do it, |
2134 | * so do it here. |
2135 | */ |
2136 | static int __init report_cpuid_table(void) |
2137 | { |
2138 | const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); |
2139 | |
2140 | if (!cpuid_table->count) |
2141 | return 0; |
2142 | |
2143 | pr_info("Using SNP CPUID table, %d entries present.\n" , |
2144 | cpuid_table->count); |
2145 | |
2146 | if (sev_cfg.debug) |
2147 | dump_cpuid_table(); |
2148 | |
2149 | return 0; |
2150 | } |
2151 | arch_initcall(report_cpuid_table); |
2152 | |
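/*
 * Parse the sev= kernel command-line option. "debug" is currently the only
 * recognized value; booting with sev=debug makes report_cpuid_table() dump
 * the full SNP CPUID table.
 */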
2153 | static int __init init_sev_config(char *str) |
2154 | { |
2155 | char *s; |
2156 | |
2157 | while ((s = strsep(&str, "," ))) { |
2158 | if (!strcmp(s, "debug" )) { |
2159 | sev_cfg.debug = true; |
2160 | continue; |
2161 | } |
2162 | |
2163 | pr_info("SEV command-line option '%s' was not recognized\n" , s); |
2164 | } |
2165 | |
2166 | return 1; |
2167 | } |
2168 | __setup("sev=" , init_sev_config); |
2169 | |
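/*
 * Issue a guest request to the SNP firmware through the hypervisor.
 * @exit_code selects the VMGEXIT (SVM_VMGEXIT_GUEST_REQUEST or
 * SVM_VMGEXIT_EXT_GUEST_REQUEST), @input carries the GPAs of the request and
 * response pages, and the firmware/VMM return code is reported back through
 * @rio->exitinfo2.
 */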
2170 | int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) |
2171 | { |
2172 | struct ghcb_state state; |
2173 | struct es_em_ctxt ctxt; |
2174 | unsigned long flags; |
2175 | struct ghcb *ghcb; |
2176 | int ret; |
2177 | |
2178 | rio->exitinfo2 = SEV_RET_NO_FW_CALL; |
2179 | |
2180 | /* |
2181 | * __sev_get_ghcb() needs to run with IRQs disabled because it is using |
2182 | * a per-CPU GHCB. |
2183 | */ |
2184 | local_irq_save(flags); |
2185 | |
	ghcb = __sev_get_ghcb(&state);
2187 | if (!ghcb) { |
2188 | ret = -EIO; |
2189 | goto e_restore_irq; |
2190 | } |
2191 | |
2192 | vc_ghcb_invalidate(ghcb); |
2193 | |
2194 | if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { |
		ghcb_set_rax(ghcb, input->data_gpa);
		ghcb_set_rbx(ghcb, input->data_npages);
2197 | } |
2198 | |
	ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
2200 | if (ret) |
2201 | goto e_put; |
2202 | |
2203 | rio->exitinfo2 = ghcb->save.sw_exit_info_2; |
2204 | switch (rio->exitinfo2) { |
2205 | case 0: |
2206 | break; |
2207 | |
2208 | case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY): |
2209 | ret = -EAGAIN; |
2210 | break; |
2211 | |
2212 | case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN): |
		/* The number of expected pages is returned in RBX */
2214 | if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { |
2215 | input->data_npages = ghcb_get_rbx(ghcb); |
2216 | ret = -ENOSPC; |
2217 | break; |
2218 | } |
2219 | fallthrough; |
2220 | default: |
2221 | ret = -EIO; |
2222 | break; |
2223 | } |
2224 | |
2225 | e_put: |
	__sev_put_ghcb(&state);
2227 | e_restore_irq: |
2228 | local_irq_restore(flags); |
2229 | |
2230 | return ret; |
2231 | } |
2232 | EXPORT_SYMBOL_GPL(snp_issue_guest_request); |
2233 | |
2234 | static struct platform_device sev_guest_device = { |
2235 | .name = "sev-guest" , |
2236 | .id = -1, |
2237 | }; |
2238 | |
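/*
 * Register the "sev-guest" platform device on SEV-SNP guests and pass it the
 * GPA of the secrets page, so that the sev-guest driver can bind against it.
 */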
2239 | static int __init snp_init_platform_device(void) |
2240 | { |
2241 | struct sev_guest_platform_data data; |
2242 | u64 gpa; |
2243 | |
	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
2245 | return -ENODEV; |
2246 | |
2247 | gpa = get_secrets_page(); |
2248 | if (!gpa) |
2249 | return -ENODEV; |
2250 | |
2251 | data.secrets_gpa = gpa; |
	if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
2253 | return -ENODEV; |
2254 | |
2255 | if (platform_device_register(&sev_guest_device)) |
2256 | return -ENODEV; |
2257 | |
2258 | pr_info("SNP guest platform device initialized.\n" ); |
2259 | return 0; |
2260 | } |
2261 | device_initcall(snp_init_platform_device); |
2262 | |