sev.c source code [linux/arch/x86/kernel/sev.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* AMD Memory Encryption Support
4	*
5	* Copyright (C) 2019 SUSE
6	*
7	* Author: Joerg Roedel <jroedel@suse.de>
8	*/
9
10	#define pr_fmt(fmt) "SEV: " fmt
11
12	#include <linux/sched/debug.h> /* For show_regs() */
13	#include <linux/percpu-defs.h>
14	#include <linux/cc_platform.h>
15	#include <linux/printk.h>
16	#include <linux/mm_types.h>
17	#include <linux/set_memory.h>
18	#include <linux/memblock.h>
19	#include <linux/kernel.h>
20	#include <linux/mm.h>
21	#include <linux/cpumask.h>
22	#include <linux/efi.h>
23	#include <linux/platform_device.h>
24	#include <linux/io.h>
25	#include <linux/psp-sev.h>
26	#include <uapi/linux/sev-guest.h>
27
28	#include <asm/cpu_entry_area.h>
29	#include <asm/stacktrace.h>
30	#include <asm/sev.h>
31	#include <asm/insn-eval.h>
32	#include <asm/fpu/xcr.h>
33	#include <asm/processor.h>
34	#include <asm/realmode.h>
35	#include <asm/setup.h>
36	#include <asm/traps.h>
37	#include <asm/svm.h>
38	#include <asm/smp.h>
39	#include <asm/cpu.h>
40	#include <asm/apic.h>
41	#include <asm/cpuid.h>
42	#include <asm/cmdline.h>
43
/* Value stored in the VMSA DR7 field when an AP is created */
#define DR7_RESET_VALUE			0x400

/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
#define AP_INIT_CS_LIMIT		0xffff
#define AP_INIT_DS_LIMIT		0xffff
#define AP_INIT_LDTR_LIMIT		0xffff
#define AP_INIT_GDTR_LIMIT		0xffff
#define AP_INIT_IDTR_LIMIT		0xffff
#define AP_INIT_TR_LIMIT		0xffff
#define AP_INIT_RFLAGS_DEFAULT		0x2
#define AP_INIT_DR6_DEFAULT		0xffff0ff0
#define AP_INIT_GPAT_DEFAULT		0x0007040600070406ULL
#define AP_INIT_XCR0_DEFAULT		0x1
#define AP_INIT_X87_FTW_DEFAULT		0x5555
#define AP_INIT_X87_FCW_DEFAULT		0x0040
#define AP_INIT_CR0_DEFAULT		0x60000010
#define AP_INIT_MXCSR_DEFAULT		0x1f80
61
62	/ For early boot hypervisor communication in SEV-ES enabled guests /
63	static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
64
65	/*
66	* Needs to be in the .data section because we need it NULL before bss is
67	* cleared
68	*/
69	static struct ghcb *boot_ghcb __section(".data");
70
71	/ Bitmap of SEV features supported by the hypervisor /
72	static u64 sev_hv_features __ro_after_init;
73
74	/ #VC handler runtime per-CPU data /
75	struct sev_es_runtime_data {
76	struct ghcb ghcb_page;
77
78	/*
79	* Reserve one page per CPU as backup storage for the unencrypted GHCB.
80	* It is needed when an NMI happens while the #VC handler uses the real
81	* GHCB, and the NMI handler itself is causing another #VC exception. In
82	* that case the GHCB content of the first handler needs to be backed up
83	* and restored.
84	*/
85	struct ghcb backup_ghcb;
86
87	/*
88	* Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
89	* There is no need for it to be atomic, because nothing is written to
90	* the GHCB between the read and the write of ghcb_active. So it is safe
91	* to use it when a nested #VC exception happens before the write.
92	*
93	* This is necessary for example in the #VC->NMI->#VC case when the NMI
94	* happens while the first #VC handler uses the GHCB. When the NMI code
95	* raises a second #VC handler it might overwrite the contents of the
96	* GHCB written by the first handler. To avoid this the content of the
97	* GHCB is saved and restored when the GHCB is detected to be in use
98	* already.
99	*/
100	bool ghcb_active;
101	bool backup_ghcb_active;
102
103	/*
104	* Cached DR7 value - write it on DR7 writes and return it on reads.
105	* That value will never make it to the real hardware DR7 as debugging
106	* is currently unsupported in SEV-ES guests.
107	*/
108	unsigned long dr7;
109	};
110
111	struct ghcb_state {
112	struct ghcb *ghcb;
113	};
114
115	static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
116	static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
117
118	struct sev_config {
119	__u64 debug : `1`,
120
121	/*
122	* A flag used by __set_pages_state() that indicates when the
123	* per-CPU GHCB has been created and registered and thus can be
124	* used by the BSP instead of the early boot GHCB.
125	*
126	* For APs, the per-CPU GHCB is created before they are started
127	* and registered upon startup, so this flag can be used globally
128	* for the BSP and APs.
129	*/
130	ghcbs_initialized : `1`,
131
132	__reserved : `62`;
133	};
134
135	static struct sev_config sev_cfg __read_mostly;
136
137	static __always_inline bool on_vc_stack(struct pt_regs *regs)
138	{
139	unsigned long sp = regs->sp;
140
141	/ User-mode RSP is not trusted /
142	if (user_mode(regs))
143	return false;
144
145	/ SYSCALL gap still has user-mode RSP /
146	if (ip_within_syscall_gap(regs))
147	return false;
148
149	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
150	}
151
152	/*
153	* This function handles the case when an NMI is raised in the #VC
154	* exception handler entry code, before the #VC handler has switched off
155	* its IST stack. In this case, the IST entry for #VC must be adjusted,
156	* so that any nested #VC exception will not overwrite the stack
157	* contents of the interrupted #VC handler.
158	*
159	* The IST entry is adjusted unconditionally so that it can be also be
160	* unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
161	* nested sev_es_ist_exit() call may adjust back the IST entry too
162	* early.
163	*
164	* The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
165	* on the NMI IST stack, as they are only called from NMI handling code
166	* right now.
167	*/
168	void noinstr __sev_es_ist_enter(struct pt_regs *regs)
169	{
170	unsigned long old_ist, new_ist;
171
172	/ Read old IST entry /
173	new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
174
175	/*
176	* If NMI happened while on the #VC IST stack, set the new IST
177	* value below regs->sp, so that the interrupted stack frame is
178	* not overwritten by subsequent #VC exceptions.
179	*/
180	if (on_vc_stack(regs))
181	new_ist = regs->sp;
182
183	/*
184	* Reserve additional 8 bytes and store old IST value so this
185	* adjustment can be unrolled in __sev_es_ist_exit().
186	*/
187	new_ist -= sizeof(old_ist);
188	(unsigned* long *)new_ist = old_ist;
189
190	/ Set new IST entry /
191	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
192	}
193
194	void noinstr __sev_es_ist_exit(void)
195	{
196	unsigned long ist;
197
198	/ Read IST entry /
199	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
200
201	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
202	return;
203
204	/ Read back old IST entry and write it to the TSS /
205	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], (unsigned* long *)ist);
206	}
207
208	/*
209	* Nothing shall interrupt this code path while holding the per-CPU
210	* GHCB. The backup GHCB is only for NMIs interrupting this path.
211	*
212	* Callers must disable local interrupts around it.
213	*/
214	static noinstr struct ghcb __sev_get_ghcb(struct* ghcb_state *state)
215	{
216	struct sev_es_runtime_data *data;
217	struct ghcb *ghcb;
218
219	WARN_ON(!irqs_disabled());
220
221	data = this_cpu_read(runtime_data);
222	ghcb = &data->ghcb_page;
223
224	if (unlikely(data->ghcb_active)) {
225	/ GHCB is already in use - save its contents /
226
227	if (unlikely(data->backup_ghcb_active)) {
228	/*
229	* Backup-GHCB is also already in use. There is no way
230	* to continue here so just kill the machine. To make
231	* panic() work, mark GHCBs inactive so that messages
232	* can be printed out.
233	*/
234	data->ghcb_active = false;
235	data->backup_ghcb_active = false;
236
237	instrumentation_begin();
238	panic(fmt: "Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
239	instrumentation_end();
240	}
241
242	/ Mark backup_ghcb active before writing to it /
243	data->backup_ghcb_active = true;
244
245	state->ghcb = &data->backup_ghcb;
246
247	/ Backup GHCB content /
248	state->ghcb = ghcb;
249	} else {
250	state->ghcb = NULL;
251	data->ghcb_active = true;
252	}
253
254	return ghcb;
255	}
256
257	static inline u64 sev_es_rd_ghcb_msr(void)
258	{
259	return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
260	}
261
262	static __always_inline void sev_es_wr_ghcb_msr(u64 val)
263	{
264	u32 low, high;
265
266	low = (u32)(val);
267	high = (u32)(val >> `32`);
268
269	native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
270	}
271
272	static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
273	unsigned char *buffer)
274	{
275	return copy_from_kernel_nofault(dst: buffer, src: (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
276	}
277
278	static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
279	{
280	char buffer[MAX_INSN_SIZE];
281	int insn_bytes;
282
283	insn_bytes = insn_fetch_from_user_inatomic(regs: ctxt->regs, buf: buffer);
284	if (insn_bytes == `0`) {
285	/ Nothing could be copied /
286	ctxt->fi.vector = X86_TRAP_PF;
287	ctxt->fi.error_code = X86_PF_INSTR \| X86_PF_USER;
288	ctxt->fi.cr2 = ctxt->regs->ip;
289	return ES_EXCEPTION;
290	} else if (insn_bytes == -EINVAL) {
291	/ Effective RIP could not be calculated /
292	ctxt->fi.vector = X86_TRAP_GP;
293	ctxt->fi.error_code = `0`;
294	ctxt->fi.cr2 = `0`;
295	return ES_EXCEPTION;
296	}
297
298	if (!insn_decode_from_regs(insn: &ctxt->insn, regs: ctxt->regs, buf: buffer, buf_size: insn_bytes))
299	return ES_DECODE_FAILED;
300
301	if (ctxt->insn.immediate.got)
302	return ES_OK;
303	else
304	return ES_DECODE_FAILED;
305	}
306
307	static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
308	{
309	char buffer[MAX_INSN_SIZE];
310	int res, ret;
311
312	res = vc_fetch_insn_kernel(ctxt, buffer);
313	if (res) {
314	ctxt->fi.vector = X86_TRAP_PF;
315	ctxt->fi.error_code = X86_PF_INSTR;
316	ctxt->fi.cr2 = ctxt->regs->ip;
317	return ES_EXCEPTION;
318	}
319
320	ret = insn_decode(insn: &ctxt->insn, kaddr: buffer, MAX_INSN_SIZE, m: INSN_MODE_64);
321	if (ret < `0`)
322	return ES_DECODE_FAILED;
323	else
324	return ES_OK;
325	}
326
327	static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
328	{
329	if (user_mode(regs: ctxt->regs))
330	return __vc_decode_user_insn(ctxt);
331	else
332	return __vc_decode_kern_insn(ctxt);
333	}
334
335	static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
336	char dst, char* *buf, size_t size)
337	{
338	unsigned long error_code = X86_PF_PROT \| X86_PF_WRITE;
339
340	/*
341	* This function uses __put_user() independent of whether kernel or user
342	* memory is accessed. This works fine because __put_user() does no
343	* sanity checks of the pointer being accessed. All that it does is
344	* to report when the access failed.
345	*
346	* Also, this function runs in atomic context, so __put_user() is not
347	* allowed to sleep. The page-fault handler detects that it is running
348	* in atomic context and will not try to take mmap_sem and handle the
349	* fault, so additional pagefault_enable()/disable() calls are not
350	* needed.
351	*
352	* The access can't be done via copy_to_user() here because
353	* vc_write_mem() must not use string instructions to access unsafe
354	* memory. The reason is that MOVS is emulated by the #VC handler by
355	* splitting the move up into a read and a write and taking a nested #VC
356	* exception on whatever of them is the MMIO access. Using string
357	* instructions here would cause infinite nesting.
358	*/
359	switch (size) {
360	case `1`: {
361	u8 d1;
362	u8 __user target = (u8 __user )dst;
363
364	memcpy(&d1, buf, `1`);
365	if (__put_user(d1, target))
366	goto fault;
367	break;
368	}
369	case `2`: {
370	u16 d2;
371	u16 __user target = (u16 __user )dst;
372
373	memcpy(&d2, buf, `2`);
374	if (__put_user(d2, target))
375	goto fault;
376	break;
377	}
378	case `4`: {
379	u32 d4;
380	u32 __user target = (u32 __user )dst;
381
382	memcpy(&d4, buf, `4`);
383	if (__put_user(d4, target))
384	goto fault;
385	break;
386	}
387	case `8`: {
388	u64 d8;
389	u64 __user target = (u64 __user )dst;
390
391	memcpy(&d8, buf, `8`);
392	if (__put_user(d8, target))
393	goto fault;
394	break;
395	}
396	default:
397	WARN_ONCE(`1`, "%s: Invalid size: %zu\n", __func__, size);
398	return ES_UNSUPPORTED;
399	}
400
401	return ES_OK;
402
403	fault:
404	if (user_mode(regs: ctxt->regs))
405	error_code \|= X86_PF_USER;
406
407	ctxt->fi.vector = X86_TRAP_PF;
408	ctxt->fi.error_code = error_code;
409	ctxt->fi.cr2 = (unsigned long)dst;
410
411	return ES_EXCEPTION;
412	}
413
414	static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
415	char src, char* *buf, size_t size)
416	{
417	unsigned long error_code = X86_PF_PROT;
418
419	/*
420	* This function uses __get_user() independent of whether kernel or user
421	* memory is accessed. This works fine because __get_user() does no
422	* sanity checks of the pointer being accessed. All that it does is
423	* to report when the access failed.
424	*
425	* Also, this function runs in atomic context, so __get_user() is not
426	* allowed to sleep. The page-fault handler detects that it is running
427	* in atomic context and will not try to take mmap_sem and handle the
428	* fault, so additional pagefault_enable()/disable() calls are not
429	* needed.
430	*
431	* The access can't be done via copy_from_user() here because
432	* vc_read_mem() must not use string instructions to access unsafe
433	* memory. The reason is that MOVS is emulated by the #VC handler by
434	* splitting the move up into a read and a write and taking a nested #VC
435	* exception on whatever of them is the MMIO access. Using string
436	* instructions here would cause infinite nesting.
437	*/
438	switch (size) {
439	case `1`: {
440	u8 d1;
441	u8 __user s = (u8 __user )src;
442
443	if (__get_user(d1, s))
444	goto fault;
445	memcpy(buf, &d1, `1`);
446	break;
447	}
448	case `2`: {
449	u16 d2;
450	u16 __user s = (u16 __user )src;
451
452	if (__get_user(d2, s))
453	goto fault;
454	memcpy(buf, &d2, `2`);
455	break;
456	}
457	case `4`: {
458	u32 d4;
459	u32 __user s = (u32 __user )src;
460
461	if (__get_user(d4, s))
462	goto fault;
463	memcpy(buf, &d4, `4`);
464	break;
465	}
466	case `8`: {
467	u64 d8;
468	u64 __user s = (u64 __user )src;
469	if (__get_user(d8, s))
470	goto fault;
471	memcpy(buf, &d8, `8`);
472	break;
473	}
474	default:
475	WARN_ONCE(`1`, "%s: Invalid size: %zu\n", __func__, size);
476	return ES_UNSUPPORTED;
477	}
478
479	return ES_OK;
480
481	fault:
482	if (user_mode(regs: ctxt->regs))
483	error_code \|= X86_PF_USER;
484
485	ctxt->fi.vector = X86_TRAP_PF;
486	ctxt->fi.error_code = error_code;
487	ctxt->fi.cr2 = (unsigned long)src;
488
489	return ES_EXCEPTION;
490	}
491
492	static enum es_result vc_slow_virt_to_phys(struct ghcb ghcb, struct* es_em_ctxt *ctxt,
493	unsigned long vaddr, phys_addr_t *paddr)
494	{
495	unsigned long va = (unsigned long)vaddr;
496	unsigned int level;
497	phys_addr_t pa;
498	pgd_t *pgd;
499	pte_t *pte;
500
501	pgd = __va(read_cr3_pa());
502	pgd = &pgd[pgd_index(va)];
503	pte = lookup_address_in_pgd(pgd, address: va, level: &level);
504	if (!pte) {
505	ctxt->fi.vector = X86_TRAP_PF;
506	ctxt->fi.cr2 = vaddr;
507	ctxt->fi.error_code = `0`;
508
509	if (user_mode(regs: ctxt->regs))
510	ctxt->fi.error_code \|= X86_PF_USER;
511
512	return ES_EXCEPTION;
513	}
514
515	if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
516	/ Emulated MMIO to/from encrypted memory not supported /
517	return ES_UNSUPPORTED;
518
519	pa = (phys_addr_t)pte_pfn(pte: *pte) << PAGE_SHIFT;
520	pa \|= va & ~page_level_mask(level);
521
522	*paddr = pa;
523
524	return ES_OK;
525	}
526
527	static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
528	{
529	BUG_ON(size > `4`);
530
531	if (user_mode(regs: ctxt->regs)) {
532	struct thread_struct *t = &current->thread;
533	struct io_bitmap *iobm = t->io_bitmap;
534	size_t idx;
535
536	if (!iobm)
537	goto fault;
538
539	for (idx = port; idx < port + size; ++idx) {
540	if (test_bit(idx, iobm->bitmap))
541	goto fault;
542	}
543	}
544
545	return ES_OK;
546
547	fault:
548	ctxt->fi.vector = X86_TRAP_GP;
549	ctxt->fi.error_code = `0`;
550
551	return ES_EXCEPTION;
552	}
553
/* Include code shared with pre-decompression boot stage */
555	#include "sev-shared.c"
556
557	static noinstr void __sev_put_ghcb(struct ghcb_state *state)
558	{
559	struct sev_es_runtime_data *data;
560	struct ghcb *ghcb;
561
562	WARN_ON(!irqs_disabled());
563
564	data = this_cpu_read(runtime_data);
565	ghcb = &data->ghcb_page;
566
567	if (state->ghcb) {
568	/ Restore GHCB from Backup /
569	ghcb = state->ghcb;
570	data->backup_ghcb_active = false;
571	state->ghcb = NULL;
572	} else {
573	/*
574	* Invalidate the GHCB so a VMGEXIT instruction issued
575	* from userspace won't appear to be valid.
576	*/
577	vc_ghcb_invalidate(ghcb);
578	data->ghcb_active = false;
579	}
580	}
581
582	void noinstr __sev_es_nmi_complete(void)
583	{
584	struct ghcb_state state;
585	struct ghcb *ghcb;
586
587	ghcb = __sev_get_ghcb(state: &state);
588
589	vc_ghcb_invalidate(ghcb);
590	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
591	ghcb_set_sw_exit_info_1(ghcb, value: `0`);
592	ghcb_set_sw_exit_info_2(ghcb, value: `0`);
593
594	sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
595	VMGEXIT();
596
597	__sev_put_ghcb(state: &state);
598	}
599
600	static u64 __init get_secrets_page(void)
601	{
602	u64 pa_data = boot_params.cc_blob_address;
603	struct cc_blob_sev_info info;
604	void *map;
605
606	/*
607	* The CC blob contains the address of the secrets page, check if the
608	* blob is present.
609	*/
610	if (!pa_data)
611	return `0`;
612
613	map = early_memremap(phys_addr: pa_data, size: sizeof(info));
614	if (!map) {
615	pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n");
616	return `0`;
617	}
618	memcpy(&info, map, sizeof(info));
619	early_memunmap(addr: map, size: sizeof(info));
620
621	/ smoke-test the secrets page passed /
622	if (!info.secrets_phys \|\| info.secrets_len != PAGE_SIZE)
623	return `0`;
624
625	return info.secrets_phys;
626	}
627
628	static u64 __init get_snp_jump_table_addr(void)
629	{
630	struct snp_secrets_page_layout *layout;
631	void __iomem *mem;
632	u64 pa, addr;
633
634	pa = get_secrets_page();
635	if (!pa)
636	return `0`;
637
638	mem = ioremap_encrypted(phys_addr: pa, PAGE_SIZE);
639	if (!mem) {
640	pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
641	return `0`;
642	}
643
644	layout = (__force struct snp_secrets_page_layout *)mem;
645
646	addr = layout->os_area.ap_jump_table_pa;
647	iounmap(addr: mem);
648
649	return addr;
650	}
651
652	static u64 __init get_jump_table_addr(void)
653	{
654	struct ghcb_state state;
655	unsigned long flags;
656	struct ghcb *ghcb;
657	u64 ret = `0`;
658
659	if (cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
660	return get_snp_jump_table_addr();
661
662	local_irq_save(flags);
663
664	ghcb = __sev_get_ghcb(state: &state);
665
666	vc_ghcb_invalidate(ghcb);
667	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
668	ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
669	ghcb_set_sw_exit_info_2(ghcb, value: `0`);
670
671	sev_es_wr_ghcb_msr(__pa(ghcb));
672	VMGEXIT();
673
674	if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
675	ghcb_sw_exit_info_2_is_valid(ghcb))
676	ret = ghcb->save.sw_exit_info_2;
677
678	__sev_put_ghcb(state: &state);
679
680	local_irq_restore(flags);
681
682	return ret;
683	}
684
685	static void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
686	unsigned long npages, enum psc_op op)
687	{
688	unsigned long paddr_end;
689	u64 val;
690	int ret;
691
692	vaddr = vaddr & PAGE_MASK;
693
694	paddr = paddr & PAGE_MASK;
695	paddr_end = paddr + (npages << PAGE_SHIFT);
696
697	while (paddr < paddr_end) {
698	if (op == SNP_PAGE_STATE_SHARED) {
699	/ Page validation must be rescinded before changing to shared /
700	ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate: false);
701	if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
702	goto e_term;
703	}
704
705	/*
706	* Use the MSR protocol because this function can be called before
707	* the GHCB is established.
708	*/
709	sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
710	VMGEXIT();
711
712	val = sev_es_rd_ghcb_msr();
713
714	if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
715	"Wrong PSC response code: 0x%x\n",
716	(unsigned int)GHCB_RESP_CODE(val)))
717	goto e_term;
718
719	if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
720	"Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
721	op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
722	paddr, GHCB_MSR_PSC_RESP_VAL(val)))
723	goto e_term;
724
725	if (op == SNP_PAGE_STATE_PRIVATE) {
726	/ Page validation must be performed after changing to private /
727	ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate: true);
728	if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
729	goto e_term;
730	}
731
732	vaddr += PAGE_SIZE;
733	paddr += PAGE_SIZE;
734	}
735
736	return;
737
738	e_term:
739	sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
740	}
741
742	void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
743	unsigned long npages)
744	{
745	/*
746	* This can be invoked in early boot while running identity mapped, so
747	* use an open coded check for SNP instead of using cc_platform_has().
748	* This eliminates worries about jump tables or checking boot_cpu_data
749	* in the cc_platform_has() function.
750	*/
751	if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
752	return;
753
754	/*
755	* Ask the hypervisor to mark the memory pages as private in the RMP
756	* table.
757	*/
758	early_set_pages_state(vaddr, paddr, npages, op: SNP_PAGE_STATE_PRIVATE);
759	}
760
761	void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
762	unsigned long npages)
763	{
764	/*
765	* This can be invoked in early boot while running identity mapped, so
766	* use an open coded check for SNP instead of using cc_platform_has().
767	* This eliminates worries about jump tables or checking boot_cpu_data
768	* in the cc_platform_has() function.
769	*/
770	if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
771	return;
772
773	/ Ask hypervisor to mark the memory pages shared in the RMP table. /
774	early_set_pages_state(vaddr, paddr, npages, op: SNP_PAGE_STATE_SHARED);
775	}
776
777	void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
778	{
779	unsigned long vaddr, npages;
780
781	vaddr = (unsigned long)__va(paddr);
782	npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
783
784	if (op == SNP_PAGE_STATE_PRIVATE)
785	early_snp_set_memory_private(vaddr, paddr, npages);
786	else if (op == SNP_PAGE_STATE_SHARED)
787	early_snp_set_memory_shared(vaddr, paddr, npages);
788	else
789	WARN(`1`, "invalid memory op %d\n", op);
790	}
791
792	static unsigned long __set_pages_state(struct snp_psc_desc data, unsigned* long vaddr,
793	unsigned long vaddr_end, int op)
794	{
795	struct ghcb_state state;
796	bool use_large_entry;
797	struct psc_hdr *hdr;
798	struct psc_entry *e;
799	unsigned long flags;
800	unsigned long pfn;
801	struct ghcb *ghcb;
802	int i;
803
804	hdr = &data->hdr;
805	e = data->entries;
806
807	memset(data, `0`, sizeof(*data));
808	i = `0`;
809
810	while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) {
811	hdr->end_entry = i;
812
813	if (is_vmalloc_addr(x: (void *)vaddr)) {
814	pfn = vmalloc_to_pfn(addr: (void *)vaddr);
815	use_large_entry = false;
816	} else {
817	pfn = __pa(vaddr) >> PAGE_SHIFT;
818	use_large_entry = true;
819	}
820
821	e->gfn = pfn;
822	e->operation = op;
823
824	if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) &&
825	(vaddr_end - vaddr) >= PMD_SIZE) {
826	e->pagesize = RMP_PG_SIZE_2M;
827	vaddr += PMD_SIZE;
828	} else {
829	e->pagesize = RMP_PG_SIZE_4K;
830	vaddr += PAGE_SIZE;
831	}
832
833	e++;
834	i++;
835	}
836
837	/ Page validation must be rescinded before changing to shared /
838	if (op == SNP_PAGE_STATE_SHARED)
839	pvalidate_pages(desc: data);
840
841	local_irq_save(flags);
842
843	if (sev_cfg.ghcbs_initialized)
844	ghcb = __sev_get_ghcb(state: &state);
845	else
846	ghcb = boot_ghcb;
847
848	/ Invoke the hypervisor to perform the page state changes /
849	if (!ghcb \|\| vmgexit_psc(ghcb, desc: data))
850	sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
851
852	if (sev_cfg.ghcbs_initialized)
853	__sev_put_ghcb(state: &state);
854
855	local_irq_restore(flags);
856
857	/ Page validation must be performed after changing to private /
858	if (op == SNP_PAGE_STATE_PRIVATE)
859	pvalidate_pages(desc: data);
860
861	return vaddr;
862	}
863
864	static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)
865	{
866	struct snp_psc_desc desc;
867	unsigned long vaddr_end;
868
869	/ Use the MSR protocol when a GHCB is not available. /
870	if (!boot_ghcb)
871	return early_set_pages_state(vaddr, __pa(vaddr), npages, op);
872
873	vaddr = vaddr & PAGE_MASK;
874	vaddr_end = vaddr + (npages << PAGE_SHIFT);
875
876	while (vaddr < vaddr_end)
877	vaddr = __set_pages_state(data: &desc, vaddr, vaddr_end, op);
878	}
879
880	void snp_set_memory_shared(unsigned long vaddr, unsigned long npages)
881	{
882	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
883	return;
884
885	set_pages_state(vaddr, npages, op: SNP_PAGE_STATE_SHARED);
886	}
887
888	void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
889	{
890	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
891	return;
892
893	set_pages_state(vaddr, npages, op: SNP_PAGE_STATE_PRIVATE);
894	}
895
896	void snp_accept_memory(phys_addr_t start, phys_addr_t end)
897	{
898	unsigned long vaddr, npages;
899
900	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
901	return;
902
903	vaddr = (unsigned long)__va(start);
904	npages = (end - start) >> PAGE_SHIFT;
905
906	set_pages_state(vaddr, npages, op: SNP_PAGE_STATE_PRIVATE);
907	}
908
909	static int snp_set_vmsa(void *va, bool vmsa)
910	{
911	u64 attrs;
912
913	/*
914	* Running at VMPL0 allows the kernel to change the VMSA bit for a page
915	* using the RMPADJUST instruction. However, for the instruction to
916	* succeed it must target the permissions of a lesser privileged
917	* (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
918	* instruction in the AMD64 APM Volume 3).
919	*/
920	attrs = `1`;
921	if (vmsa)
922	attrs \|= RMPADJUST_VMSA_PAGE_BIT;
923
924	return rmpadjust(vaddr: (unsigned long)va, RMP_PG_SIZE_4K, attrs);
925	}
926
/* Segment attribute values used when building the initial VMSA of an AP */
#define __ATTR_BASE		(SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
#define INIT_CS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
#define INIT_DS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)

#define INIT_LDTR_ATTRIBS	(SVM_SELECTOR_P_MASK | 2)
#define INIT_TR_ATTRIBS		(SVM_SELECTOR_P_MASK | 3)
933
934	static void snp_alloc_vmsa_page(void*)
935	{
936	struct page *p;
937
938	/*
939	* Allocate VMSA page to work around the SNP erratum where the CPU will
940	* incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
941	* collides with the RMP entry of VMSA page. The recommended workaround
942	* is to not use a large page.
943	*
944	* Allocate an 8k page which is also 8k-aligned.
945	*/
946	p = alloc_pages(GFP_KERNEL_ACCOUNT \| __GFP_ZERO, order: `1`);
947	if (!p)
948	return NULL;
949
950	split_page(page: p, order: `1`);
951
952	/ Free the first 4k. This page may be 2M/1G aligned and cannot be used. /
953	__free_page(p);
954
955	return page_address(p + `1`);
956	}
957
958	static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
959	{
960	int err;
961
962	err = snp_set_vmsa(va: vmsa, vmsa: false);
963	if (err)
964	pr_err("clear VMSA page failed (%u), leaking page\n", err);
965	else
966	free_page((unsigned long)vmsa);
967	}
968
969	static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
970	{
971	struct sev_es_save_area cur_vmsa, vmsa;
972	struct ghcb_state state;
973	unsigned long flags;
974	struct ghcb *ghcb;
975	u8 sipi_vector;
976	int cpu, ret;
977	u64 cr4;
978
979	/*
980	* The hypervisor SNP feature support check has happened earlier, just check
981	* the AP_CREATION one here.
982	*/
983	if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
984	return -EOPNOTSUPP;
985
986	/*
987	* Verify the desired start IP against the known trampoline start IP
988	* to catch any future new trampolines that may be introduced that
989	* would require a new protected guest entry point.
990	*/
991	if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
992	"Unsupported SNP start_ip: %lx\n", start_ip))
993	return -EINVAL;
994
995	/ Override start_ip with known protected guest start IP /
996	start_ip = real_mode_header->sev_es_trampoline_start;
997
998	/ Find the logical CPU for the APIC ID /
999	for_each_present_cpu(cpu) {
1000	if (arch_match_cpu_phys_id(cpu, phys_id: apic_id))
1001	break;
1002	}
1003	if (cpu >= nr_cpu_ids)
1004	return -EINVAL;
1005
1006	cur_vmsa = per_cpu(sev_vmsa, cpu);
1007
1008	/*
1009	* A new VMSA is created each time because there is no guarantee that
1010	* the current VMSA is the kernels or that the vCPU is not running. If
1011	* an attempt was done to use the current VMSA with a running vCPU, a
1012	* #VMEXIT of that vCPU would wipe out all of the settings being done
1013	* here.
1014	*/
1015	vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
1016	if (!vmsa)
1017	return -ENOMEM;
1018
1019	/ CR4 should maintain the MCE value /
1020	cr4 = native_read_cr4() & X86_CR4_MCE;
1021
1022	/ Set the CS value based on the start_ip converted to a SIPI vector /
1023	sipi_vector = (start_ip >> `12`);
1024	vmsa->cs.base = sipi_vector << `12`;
1025	vmsa->cs.limit = AP_INIT_CS_LIMIT;
1026	vmsa->cs.attrib = INIT_CS_ATTRIBS;
1027	vmsa->cs.selector = sipi_vector << `8`;
1028
1029	/ Set the RIP value based on start_ip /
1030	vmsa->rip = start_ip & `0xfff`;
1031
1032	/ Set AP INIT defaults as documented in the APM /
1033	vmsa->ds.limit = AP_INIT_DS_LIMIT;
1034	vmsa->ds.attrib = INIT_DS_ATTRIBS;
1035	vmsa->es = vmsa->ds;
1036	vmsa->fs = vmsa->ds;
1037	vmsa->gs = vmsa->ds;
1038	vmsa->ss = vmsa->ds;
1039
1040	vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT;
1041	vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT;
1042	vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS;
1043	vmsa->idtr.limit = AP_INIT_IDTR_LIMIT;
1044	vmsa->tr.limit = AP_INIT_TR_LIMIT;
1045	vmsa->tr.attrib = INIT_TR_ATTRIBS;
1046
1047	vmsa->cr4 = cr4;
1048	vmsa->cr0 = AP_INIT_CR0_DEFAULT;
1049	vmsa->dr7 = DR7_RESET_VALUE;
1050	vmsa->dr6 = AP_INIT_DR6_DEFAULT;
1051	vmsa->rflags = AP_INIT_RFLAGS_DEFAULT;
1052	vmsa->g_pat = AP_INIT_GPAT_DEFAULT;
1053	vmsa->xcr0 = AP_INIT_XCR0_DEFAULT;
1054	vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT;
1055	vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT;
1056	vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT;
1057
1058	/ SVME must be set. /
1059	vmsa->efer = EFER_SVME;
1060
1061	/*
1062	* Set the SNP-specific fields for this VMSA:
1063	* VMPL level
1064	* SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
1065	*/
1066	vmsa->vmpl = `0`;
1067	vmsa->sev_features = sev_status >> `2`;
1068
1069	/ Switch the page over to a VMSA page now that it is initialized /
1070	ret = snp_set_vmsa(va: vmsa, vmsa: true);
1071	if (ret) {
1072	pr_err("set VMSA page failed (%u)\n", ret);
1073	free_page((unsigned long)vmsa);
1074
1075	return -EINVAL;
1076	}
1077
1078	/ Issue VMGEXIT AP Creation NAE event /
1079	local_irq_save(flags);
1080
1081	ghcb = __sev_get_ghcb(state: &state);
1082
1083	vc_ghcb_invalidate(ghcb);
1084	ghcb_set_rax(ghcb, value: vmsa->sev_features);
1085	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
1086	ghcb_set_sw_exit_info_1(ghcb, value: ((u64)apic_id << `32`) \| SVM_VMGEXIT_AP_CREATE);
1087	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
1088
1089	sev_es_wr_ghcb_msr(__pa(ghcb));
1090	VMGEXIT();
1091
1092	if (!ghcb_sw_exit_info_1_is_valid(ghcb) \|\|
1093	lower_32_bits(ghcb->save.sw_exit_info_1)) {
1094	pr_err("SNP AP Creation error\n");
1095	ret = -EINVAL;
1096	}
1097
1098	__sev_put_ghcb(state: &state);
1099
1100	local_irq_restore(flags);
1101
1102	/ Perform cleanup if there was an error /
1103	if (ret) {
1104	snp_cleanup_vmsa(vmsa);
1105	vmsa = NULL;
1106	}
1107
1108	/ Free up any previous VMSA page /
1109	if (cur_vmsa)
1110	snp_cleanup_vmsa(vmsa: cur_vmsa);
1111
1112	/ Record the current VMSA page /
1113	per_cpu(sev_vmsa, cpu) = vmsa;
1114
1115	return ret;
1116	}
1117
1118	void __init snp_set_wakeup_secondary_cpu(void)
1119	{
1120	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
1121	return;
1122
1123	/*
1124	* Always set this override if SNP is enabled. This makes it the
1125	* required method to start APs under SNP. If the hypervisor does
1126	* not support AP creation, then no APs will be started.
1127	*/
1128	apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit);
1129	}
1130
1131	int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
1132	{
1133	u16 startup_cs, startup_ip;
1134	phys_addr_t jump_table_pa;
1135	u64 jump_table_addr;
1136	u16 __iomem *jump_table;
1137
1138	jump_table_addr = get_jump_table_addr();
1139
1140	/ On UP guests there is no jump table so this is not a failure /
1141	if (!jump_table_addr)
1142	return `0`;
1143
1144	/ Check if AP Jump Table is page-aligned /
1145	if (jump_table_addr & ~PAGE_MASK)
1146	return -EINVAL;
1147
1148	jump_table_pa = jump_table_addr & PAGE_MASK;
1149
1150	startup_cs = (u16)(rmh->trampoline_start >> `4`);
1151	startup_ip = (u16)(rmh->sev_es_trampoline_start -
1152	rmh->trampoline_start);
1153
1154	jump_table = ioremap_encrypted(phys_addr: jump_table_pa, PAGE_SIZE);
1155	if (!jump_table)
1156	return -EIO;
1157
1158	writew(val: startup_ip, addr: &jump_table[`0`]);
1159	writew(val: startup_cs, addr: &jump_table[`1`]);
1160
1161	iounmap(addr: jump_table);
1162
1163	return `0`;
1164	}
1165
1166	/*
1167	* This is needed by the OVMF UEFI firmware which will use whatever it finds in
1168	* the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
1169	* runtime GHCBs used by the kernel are also mapped in the EFI page-table.
1170	*/
1171	int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
1172	{
1173	struct sev_es_runtime_data *data;
1174	unsigned long address, pflags;
1175	int cpu;
1176	u64 pfn;
1177
1178	if (!cc_platform_has(attr: CC_ATTR_GUEST_STATE_ENCRYPT))
1179	return `0`;
1180
1181	pflags = _PAGE_NX \| _PAGE_RW;
1182
1183	for_each_possible_cpu(cpu) {
1184	data = per_cpu(runtime_data, cpu);
1185
1186	address = __pa(&data->ghcb_page);
1187	pfn = address >> PAGE_SHIFT;
1188
1189	if (kernel_map_pages_in_pgd(pgd, pfn, address, numpages: `1`, page_flags: pflags))
1190	return `1`;
1191	}
1192
1193	return `0`;
1194	}
1195
1196	static enum es_result vc_handle_msr(struct ghcb ghcb, struct* es_em_ctxt *ctxt)
1197	{
1198	struct pt_regs *regs = ctxt->regs;
1199	enum es_result ret;
1200	u64 exit_info_1;
1201
1202	/ Is it a WRMSR? /
1203	exit_info_1 = (ctxt->insn.opcode.bytes[`1`] == `0x30`) ? `1` : `0`;
1204
1205	ghcb_set_rcx(ghcb, value: regs->cx);
1206	if (exit_info_1) {
1207	ghcb_set_rax(ghcb, value: regs->ax);
1208	ghcb_set_rdx(ghcb, value: regs->dx);
1209	}
1210
1211	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, exit_info_2: `0`);
1212
1213	if ((ret == ES_OK) && (!exit_info_1)) {
1214	regs->ax = ghcb->save.rax;
1215	regs->dx = ghcb->save.rdx;
1216	}
1217
1218	return ret;
1219	}
1220
1221	static void snp_register_per_cpu_ghcb(void)
1222	{
1223	struct sev_es_runtime_data *data;
1224	struct ghcb *ghcb;
1225
1226	data = this_cpu_read(runtime_data);
1227	ghcb = &data->ghcb_page;
1228
1229	snp_register_ghcb_early(__pa(ghcb));
1230	}
1231
1232	void setup_ghcb(void)
1233	{
1234	if (!cc_platform_has(attr: CC_ATTR_GUEST_STATE_ENCRYPT))
1235	return;
1236
1237	/ First make sure the hypervisor talks a supported protocol. /
1238	if (!sev_es_negotiate_protocol())
1239	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
1240
1241	/*
1242	* Check whether the runtime #VC exception handler is active. It uses
1243	* the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
1244	*
1245	* If SNP is active, register the per-CPU GHCB page so that the runtime
1246	* exception handler can use it.
1247	*/
1248	if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
1249	if (cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
1250	snp_register_per_cpu_ghcb();
1251
1252	sev_cfg.ghcbs_initialized = true;
1253
1254	return;
1255	}
1256
1257	/*
1258	* Clear the boot_ghcb. The first exception comes in before the bss
1259	* section is cleared.
1260	*/
1261	memset(&boot_ghcb_page, `0`, PAGE_SIZE);
1262
1263	/ Alright - Make the boot-ghcb public /
1264	boot_ghcb = &boot_ghcb_page;
1265
1266	/ SNP guest requires that GHCB GPA must be registered. /
1267	if (cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
1268	snp_register_ghcb_early(__pa(&boot_ghcb_page));
1269	}
1270
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Park the current CPU by repeatedly issuing the AP_HLT_LOOP VMGEXIT.
 *
 * The loop only ends once the hypervisor returns a valid, non-zero
 * sw_exit_info_2, which is treated as the wakeup signal. The GHCB is held
 * for the whole duration of the loop and released afterwards.
 */
static void sev_es_ap_hlt_loop(void)
{
	struct ghcb_state state;
	struct ghcb *ghcb;

	ghcb = __sev_get_ghcb(&state);

	while (true) {
		vc_ghcb_invalidate(ghcb);
		ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
		ghcb_set_sw_exit_info_1(ghcb, 0);
		ghcb_set_sw_exit_info_2(ghcb, 0);

		sev_es_wr_ghcb_msr(__pa(ghcb));
		VMGEXIT();

		/* Wakeup signal? */
		if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
		    ghcb->save.sw_exit_info_2)
			break;
	}

	__sev_put_ghcb(&state);
}

/*
 * Play_dead handler when running under SEV-ES. This is needed because
 * the hypervisor can't deliver an SIPI request to restart the AP.
 * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
 * hypervisor wakes it up again.
 */
static void sev_es_play_dead(void)
{
	play_dead_common();

	/* IRQs now disabled */

	sev_es_ap_hlt_loop();

	/*
	 * If we get here, the VCPU was woken up again. Jump to CPU
	 * startup code to get it back online.
	 */
	soft_restart_cpu();
}
#else  /* CONFIG_HOTPLUG_CPU */
/* Without CPU hotplug, fall back to the native play_dead implementation */
#define sev_es_play_dead	native_play_dead
#endif /* CONFIG_HOTPLUG_CPU */
1320
#ifdef CONFIG_SMP
/* Route smp_ops.play_dead to the SEV-ES aware handler */
static void __init sev_es_setup_play_dead(void)
{
	smp_ops.play_dead = sev_es_play_dead;
}
#else
/* No smp_ops to patch on !SMP builds */
static inline void sev_es_setup_play_dead(void) { }
#endif
1329
1330	static void __init alloc_runtime_data(int cpu)
1331	{
1332	struct sev_es_runtime_data *data;
1333
1334	data = memblock_alloc(size: sizeof(*data), PAGE_SIZE);
1335	if (!data)
1336	panic(fmt: "Can't allocate SEV-ES runtime data");
1337
1338	per_cpu(runtime_data, cpu) = data;
1339	}
1340
1341	static void __init init_ghcb(int cpu)
1342	{
1343	struct sev_es_runtime_data *data;
1344	int err;
1345
1346	data = per_cpu(runtime_data, cpu);
1347
1348	err = early_set_memory_decrypted(vaddr: (unsigned long)&data->ghcb_page,
1349	size: sizeof(data->ghcb_page));
1350	if (err)
1351	panic(fmt: "Can't map GHCBs unencrypted");
1352
1353	memset(&data->ghcb_page, `0`, sizeof(data->ghcb_page));
1354
1355	data->ghcb_active = false;
1356	data->backup_ghcb_active = false;
1357	}
1358
1359	void __init sev_es_init_vc_handling(void)
1360	{
1361	int cpu;
1362
1363	BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
1364
1365	if (!cc_platform_has(attr: CC_ATTR_GUEST_STATE_ENCRYPT))
1366	return;
1367
1368	if (!sev_es_check_cpu_features())
1369	panic(fmt: "SEV-ES CPU Features missing");
1370
1371	/*
1372	* SNP is supported in v2 of the GHCB spec which mandates support for HV
1373	* features.
1374	*/
1375	if (cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP)) {
1376	sev_hv_features = get_hv_features();
1377
1378	if (!(sev_hv_features & GHCB_HV_FT_SNP))
1379	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
1380	}
1381
1382	/ Initialize per-cpu GHCB pages /
1383	for_each_possible_cpu(cpu) {
1384	alloc_runtime_data(cpu);
1385	init_ghcb(cpu);
1386	}
1387
1388	sev_es_setup_play_dead();
1389
1390	/ Secondary CPUs use the runtime #VC handler /
1391	initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
1392	}
1393
1394	static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
1395	{
1396	int trapnr = ctxt->fi.vector;
1397
1398	if (trapnr == X86_TRAP_PF)
1399	native_write_cr2(val: ctxt->fi.cr2);
1400
1401	ctxt->regs->orig_ax = ctxt->fi.error_code;
1402	do_early_exception(regs: ctxt->regs, trapnr);
1403	}
1404
1405	static long vc_insn_get_rm(struct* es_em_ctxt *ctxt)
1406	{
1407	long *reg_array;
1408	int offset;
1409
1410	reg_array = (long *)ctxt->regs;
1411	offset = insn_get_modrm_rm_off(insn: &ctxt->insn, regs: ctxt->regs);
1412
1413	if (offset < `0`)
1414	return NULL;
1415
1416	offset /= sizeof(long);
1417
1418	return reg_array + offset;
1419	}
1420	static enum es_result vc_do_mmio(struct ghcb ghcb, struct* es_em_ctxt *ctxt,
1421	unsigned int bytes, bool read)
1422	{
1423	u64 exit_code, exit_info_1, exit_info_2;
1424	unsigned long ghcb_pa = __pa(ghcb);
1425	enum es_result res;
1426	phys_addr_t paddr;
1427	void __user *ref;
1428
1429	ref = insn_get_addr_ref(insn: &ctxt->insn, regs: ctxt->regs);
1430	if (ref == (void __user *)-`1L`)
1431	return ES_UNSUPPORTED;
1432
1433	exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
1434
1435	res = vc_slow_virt_to_phys(ghcb, ctxt, vaddr: (unsigned long)ref, paddr: &paddr);
1436	if (res != ES_OK) {
1437	if (res == ES_EXCEPTION && !read)
1438	ctxt->fi.error_code \|= X86_PF_WRITE;
1439
1440	return res;
1441	}
1442
1443	exit_info_1 = paddr;
1444	/ Can never be greater than 8 /
1445	exit_info_2 = bytes;
1446
1447	ghcb_set_sw_scratch(ghcb, value: ghcb_pa + offsetof(struct ghcb, shared_buffer));
1448
1449	return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
1450	}
1451
1452	/*
1453	* The MOVS instruction has two memory operands, which raises the
1454	* problem that it is not known whether the access to the source or the
1455	* destination caused the #VC exception (and hence whether an MMIO read
1456	* or write operation needs to be emulated).
1457	*
1458	* Instead of playing games with walking page-tables and trying to guess
1459	* whether the source or destination is an MMIO range, split the move
1460	* into two operations, a read and a write with only one memory operand.
1461	* This will cause a nested #VC exception on the MMIO address which can
1462	* then be handled.
1463	*
1464	* This implementation has the benefit that it also supports MOVS where
1465	* source _and_ destination are MMIO regions.
1466	*
1467	* It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
1468	* rare operation. If it turns out to be a performance problem the split
1469	* operations can be moved to memcpy_fromio() and memcpy_toio().
1470	*/
1471	static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
1472	unsigned int bytes)
1473	{
1474	unsigned long ds_base, es_base;
1475	unsigned char src, dst;
1476	unsigned char buffer[`8`];
1477	enum es_result ret;
1478	bool rep;
1479	int off;
1480
1481	ds_base = insn_get_seg_base(regs: ctxt->regs, INAT_SEG_REG_DS);
1482	es_base = insn_get_seg_base(regs: ctxt->regs, INAT_SEG_REG_ES);
1483
1484	if (ds_base == -`1L` \|\| es_base == -`1L`) {
1485	ctxt->fi.vector = X86_TRAP_GP;
1486	ctxt->fi.error_code = `0`;
1487	return ES_EXCEPTION;
1488	}
1489
1490	src = ds_base + (unsigned char *)ctxt->regs->si;
1491	dst = es_base + (unsigned char *)ctxt->regs->di;
1492
1493	ret = vc_read_mem(ctxt, src, buf: buffer, size: bytes);
1494	if (ret != ES_OK)
1495	return ret;
1496
1497	ret = vc_write_mem(ctxt, dst, buf: buffer, size: bytes);
1498	if (ret != ES_OK)
1499	return ret;
1500
1501	if (ctxt->regs->flags & X86_EFLAGS_DF)
1502	off = -bytes;
1503	else
1504	off = bytes;
1505
1506	ctxt->regs->si += off;
1507	ctxt->regs->di += off;
1508
1509	rep = insn_has_rep_prefix(insn: &ctxt->insn);
1510	if (rep)
1511	ctxt->regs->cx -= `1`;
1512
1513	if (!rep \|\| ctxt->regs->cx == `0`)
1514	return ES_OK;
1515	else
1516	return ES_RETRY;
1517	}
1518
1519	static enum es_result vc_handle_mmio(struct ghcb ghcb, struct* es_em_ctxt *ctxt)
1520	{
1521	struct insn *insn = &ctxt->insn;
1522	enum insn_mmio_type mmio;
1523	unsigned int bytes = `0`;
1524	enum es_result ret;
1525	u8 sign_byte;
1526	long *reg_data;
1527
1528	mmio = insn_decode_mmio(insn, bytes: &bytes);
1529	if (mmio == INSN_MMIO_DECODE_FAILED)
1530	return ES_DECODE_FAILED;
1531
1532	if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
1533	reg_data = insn_get_modrm_reg_ptr(insn, regs: ctxt->regs);
1534	if (!reg_data)
1535	return ES_DECODE_FAILED;
1536	}
1537
1538	if (user_mode(regs: ctxt->regs))
1539	return ES_UNSUPPORTED;
1540
1541	switch (mmio) {
1542	case INSN_MMIO_WRITE:
1543	memcpy(ghcb->shared_buffer, reg_data, bytes);
1544	ret = vc_do_mmio(ghcb, ctxt, bytes, read: false);
1545	break;
1546	case INSN_MMIO_WRITE_IMM:
1547	memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
1548	ret = vc_do_mmio(ghcb, ctxt, bytes, read: false);
1549	break;
1550	case INSN_MMIO_READ:
1551	ret = vc_do_mmio(ghcb, ctxt, bytes, read: true);
1552	if (ret)
1553	break;
1554
1555	/ Zero-extend for 32-bit operation /
1556	if (bytes == `4`)
1557	*reg_data = `0`;
1558
1559	memcpy(reg_data, ghcb->shared_buffer, bytes);
1560	break;
1561	case INSN_MMIO_READ_ZERO_EXTEND:
1562	ret = vc_do_mmio(ghcb, ctxt, bytes, read: true);
1563	if (ret)
1564	break;
1565
1566	/ Zero extend based on operand size /
1567	memset(reg_data, `0`, insn->opnd_bytes);
1568	memcpy(reg_data, ghcb->shared_buffer, bytes);
1569	break;
1570	case INSN_MMIO_READ_SIGN_EXTEND:
1571	ret = vc_do_mmio(ghcb, ctxt, bytes, read: true);
1572	if (ret)
1573	break;
1574
1575	if (bytes == `1`) {
1576	u8 val = (u8 )ghcb->shared_buffer;
1577
1578	sign_byte = (*val & `0x80`) ? `0xff` : `0x00`;
1579	} else {
1580	u16 val = (u16 )ghcb->shared_buffer;
1581
1582	sign_byte = (*val & `0x8000`) ? `0xff` : `0x00`;
1583	}
1584
1585	/ Sign extend based on operand size /
1586	memset(reg_data, sign_byte, insn->opnd_bytes);
1587	memcpy(reg_data, ghcb->shared_buffer, bytes);
1588	break;
1589	case INSN_MMIO_MOVS:
1590	ret = vc_handle_mmio_movs(ctxt, bytes);
1591	break;
1592	default:
1593	ret = ES_UNSUPPORTED;
1594	break;
1595	}
1596
1597	return ret;
1598	}
1599
1600	static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
1601	struct es_em_ctxt *ctxt)
1602	{
1603	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
1604	long val, *reg = vc_insn_get_rm(ctxt);
1605	enum es_result ret;
1606
1607	if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
1608	return ES_VMM_ERROR;
1609
1610	if (!reg)
1611	return ES_DECODE_FAILED;
1612
1613	val = *reg;
1614
1615	/ Upper 32 bits must be written as zeroes /
1616	if (val >> `32`) {
1617	ctxt->fi.vector = X86_TRAP_GP;
1618	ctxt->fi.error_code = `0`;
1619	return ES_EXCEPTION;
1620	}
1621
1622	/ Clear out other reserved bits and set bit 10 /
1623	val = (val & `0xffff23ffL`) \| BIT(`10`);
1624
1625	/ Early non-zero writes to DR7 are not supported /
1626	if (!data && (val & ~DR7_RESET_VALUE))
1627	return ES_UNSUPPORTED;
1628
1629	/ Using a value of 0 for ExitInfo1 means RAX holds the value /
1630	ghcb_set_rax(ghcb, value: val);
1631	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, exit_info_1: `0`, exit_info_2: `0`);
1632	if (ret != ES_OK)
1633	return ret;
1634
1635	if (data)
1636	data->dr7 = val;
1637
1638	return ES_OK;
1639	}
1640
1641	static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
1642	struct es_em_ctxt *ctxt)
1643	{
1644	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
1645	long *reg = vc_insn_get_rm(ctxt);
1646
1647	if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
1648	return ES_VMM_ERROR;
1649
1650	if (!reg)
1651	return ES_DECODE_FAILED;
1652
1653	if (data)
1654	*reg = data->dr7;
1655	else
1656	*reg = DR7_RESET_VALUE;
1657
1658	return ES_OK;
1659	}
1660
1661	static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
1662	struct es_em_ctxt *ctxt)
1663	{
1664	return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, exit_info_1: `0`, exit_info_2: `0`);
1665	}
1666
1667	static enum es_result vc_handle_rdpmc(struct ghcb ghcb, struct* es_em_ctxt *ctxt)
1668	{
1669	enum es_result ret;
1670
1671	ghcb_set_rcx(ghcb, value: ctxt->regs->cx);
1672
1673	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, exit_info_1: `0`, exit_info_2: `0`);
1674	if (ret != ES_OK)
1675	return ret;
1676
1677	if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
1678	return ES_VMM_ERROR;
1679
1680	ctxt->regs->ax = ghcb->save.rax;
1681	ctxt->regs->dx = ghcb->save.rdx;
1682
1683	return ES_OK;
1684	}
1685
1686	static enum es_result vc_handle_monitor(struct ghcb *ghcb,
1687	struct es_em_ctxt *ctxt)
1688	{
1689	/*
1690	* Treat it as a NOP and do not leak a physical address to the
1691	* hypervisor.
1692	*/
1693	return ES_OK;
1694	}
1695
1696	static enum es_result vc_handle_mwait(struct ghcb *ghcb,
1697	struct es_em_ctxt *ctxt)
1698	{
1699	/ Treat the same as MONITOR/MONITORX /
1700	return ES_OK;
1701	}
1702
1703	static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
1704	struct es_em_ctxt *ctxt)
1705	{
1706	enum es_result ret;
1707
1708	ghcb_set_rax(ghcb, value: ctxt->regs->ax);
1709	ghcb_set_cpl(ghcb, value: user_mode(regs: ctxt->regs) ? `3` : `0`);
1710
1711	if (x86_platform.hyper.sev_es_hcall_prepare)
1712	x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
1713
1714	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, exit_info_1: `0`, exit_info_2: `0`);
1715	if (ret != ES_OK)
1716	return ret;
1717
1718	if (!ghcb_rax_is_valid(ghcb))
1719	return ES_VMM_ERROR;
1720
1721	ctxt->regs->ax = ghcb->save.rax;
1722
1723	/*
1724	* Call sev_es_hcall_finish() after regs->ax is already set.
1725	* This allows the hypervisor handler to overwrite it again if
1726	* necessary.
1727	*/
1728	if (x86_platform.hyper.sev_es_hcall_finish &&
1729	!x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
1730	return ES_VMM_ERROR;
1731
1732	return ES_OK;
1733	}
1734
1735	static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
1736	struct es_em_ctxt *ctxt)
1737	{
1738	/*
1739	* Calling ecx_alignment_check() directly does not work, because it
1740	* enables IRQs and the GHCB is active. Forward the exception and call
1741	* it later from vc_forward_exception().
1742	*/
1743	ctxt->fi.vector = X86_TRAP_AC;
1744	ctxt->fi.error_code = `0`;
1745	return ES_EXCEPTION;
1746	}
1747
1748	static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
1749	struct ghcb *ghcb,
1750	unsigned long exit_code)
1751	{
1752	enum es_result result;
1753
1754	switch (exit_code) {
1755	case SVM_EXIT_READ_DR7:
1756	result = vc_handle_dr7_read(ghcb, ctxt);
1757	break;
1758	case SVM_EXIT_WRITE_DR7:
1759	result = vc_handle_dr7_write(ghcb, ctxt);
1760	break;
1761	case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
1762	result = vc_handle_trap_ac(ghcb, ctxt);
1763	break;
1764	case SVM_EXIT_RDTSC:
1765	case SVM_EXIT_RDTSCP:
1766	result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
1767	break;
1768	case SVM_EXIT_RDPMC:
1769	result = vc_handle_rdpmc(ghcb, ctxt);
1770	break;
1771	case SVM_EXIT_INVD:
1772	pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
1773	result = ES_UNSUPPORTED;
1774	break;
1775	case SVM_EXIT_CPUID:
1776	result = vc_handle_cpuid(ghcb, ctxt);
1777	break;
1778	case SVM_EXIT_IOIO:
1779	result = vc_handle_ioio(ghcb, ctxt);
1780	break;
1781	case SVM_EXIT_MSR:
1782	result = vc_handle_msr(ghcb, ctxt);
1783	break;
1784	case SVM_EXIT_VMMCALL:
1785	result = vc_handle_vmmcall(ghcb, ctxt);
1786	break;
1787	case SVM_EXIT_WBINVD:
1788	result = vc_handle_wbinvd(ghcb, ctxt);
1789	break;
1790	case SVM_EXIT_MONITOR:
1791	result = vc_handle_monitor(ghcb, ctxt);
1792	break;
1793	case SVM_EXIT_MWAIT:
1794	result = vc_handle_mwait(ghcb, ctxt);
1795	break;
1796	case SVM_EXIT_NPF:
1797	result = vc_handle_mmio(ghcb, ctxt);
1798	break;
1799	default:
1800	/*
1801	* Unexpected #VC exception
1802	*/
1803	result = ES_UNSUPPORTED;
1804	}
1805
1806	return result;
1807	}
1808
1809	static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
1810	{
1811	long error_code = ctxt->fi.error_code;
1812	int trapnr = ctxt->fi.vector;
1813
1814	ctxt->regs->orig_ax = ctxt->fi.error_code;
1815
1816	switch (trapnr) {
1817	case X86_TRAP_GP:
1818	exc_general_protection(regs: ctxt->regs, error_code);
1819	break;
1820	case X86_TRAP_UD:
1821	exc_invalid_op(regs: ctxt->regs);
1822	break;
1823	case X86_TRAP_PF:
1824	write_cr2(x: ctxt->fi.cr2);
1825	exc_page_fault(regs: ctxt->regs, error_code);
1826	break;
1827	case X86_TRAP_AC:
1828	exc_alignment_check(regs: ctxt->regs, error_code);
1829	break;
1830	default:
1831	pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
1832	BUG();
1833	}
1834	}
1835
1836	static __always_inline bool is_vc2_stack(unsigned long sp)
1837	{
1838	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
1839	}
1840
1841	static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
1842	{
1843	unsigned long sp, prev_sp;
1844
1845	sp = (unsigned long)regs;
1846	prev_sp = regs->sp;
1847
1848	/*
1849	* If the code was already executing on the VC2 stack when the #VC
1850	* happened, let it proceed to the normal handling routine. This way the
1851	* code executing on the VC2 stack can cause #VC exceptions to get handled.
1852	*/
1853	return is_vc2_stack(sp) && !is_vc2_stack(sp: prev_sp);
1854	}
1855
1856	static bool vc_raw_handle_exception(struct pt_regs regs, unsigned* long error_code)
1857	{
1858	struct ghcb_state state;
1859	struct es_em_ctxt ctxt;
1860	enum es_result result;
1861	struct ghcb *ghcb;
1862	bool ret = true;
1863
1864	ghcb = __sev_get_ghcb(state: &state);
1865
1866	vc_ghcb_invalidate(ghcb);
1867	result = vc_init_em_ctxt(ctxt: &ctxt, regs, exit_code: error_code);
1868
1869	if (result == ES_OK)
1870	result = vc_handle_exitcode(ctxt: &ctxt, ghcb, exit_code: error_code);
1871
1872	__sev_put_ghcb(state: &state);
1873
1874	/ Done - now check the result /
1875	switch (result) {
1876	case ES_OK:
1877	vc_finish_insn(ctxt: &ctxt);
1878	break;
1879	case ES_UNSUPPORTED:
1880	pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
1881	error_code, regs->ip);
1882	ret = false;
1883	break;
1884	case ES_VMM_ERROR:
1885	pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
1886	error_code, regs->ip);
1887	ret = false;
1888	break;
1889	case ES_DECODE_FAILED:
1890	pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
1891	error_code, regs->ip);
1892	ret = false;
1893	break;
1894	case ES_EXCEPTION:
1895	vc_forward_exception(ctxt: &ctxt);
1896	break;
1897	case ES_RETRY:
1898	/ Nothing to do /
1899	break;
1900	default:
1901	pr_emerg("Unknown result in %s():%d\n", __func__, result);
1902	/*
1903	* Emulating the instruction which caused the #VC exception
1904	* failed - can't continue so print debug information
1905	*/
1906	BUG();
1907	}
1908
1909	return ret;
1910	}
1911
1912	static __always_inline bool vc_is_db(unsigned long error_code)
1913	{
1914	return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
1915	}
1916
1917	/*
1918	* Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
1919	* and will panic when an error happens.
1920	*/
1921	DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
1922	{
1923	irqentry_state_t irq_state;
1924
1925	/*
1926	* With the current implementation it is always possible to switch to a
1927	* safe stack because #VC exceptions only happen at known places, like
1928	* intercepted instructions or accesses to MMIO areas/IO ports. They can
1929	* also happen with code instrumentation when the hypervisor intercepts
1930	* #DB, but the critical paths are forbidden to be instrumented, so #DB
1931	* exceptions currently also only happen in safe places.
1932	*
1933	* But keep this here in case the noinstr annotations are violated due
1934	* to bug elsewhere.
1935	*/
1936	if (unlikely(vc_from_invalid_context(regs))) {
1937	instrumentation_begin();
1938	panic(fmt: "Can't handle #VC exception from unsupported context\n");
1939	instrumentation_end();
1940	}
1941
1942	/*
1943	* Handle #DB before calling into !noinstr code to avoid recursive #DB.
1944	*/
1945	if (vc_is_db(error_code)) {
1946	exc_debug(regs);
1947	return;
1948	}
1949
1950	irq_state = irqentry_nmi_enter(regs);
1951
1952	instrumentation_begin();
1953
1954	if (!vc_raw_handle_exception(regs, error_code)) {
1955	/ Show some debug info /
1956	show_regs(regs);
1957
1958	/ Ask hypervisor to sev_es_terminate /
1959	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
1960
1961	/ If that fails and we get here - just panic /
1962	panic(fmt: "Returned from Terminate-Request to Hypervisor\n");
1963	}
1964
1965	instrumentation_end();
1966	irqentry_nmi_exit(regs, irq_state);
1967	}
1968
1969	/*
1970	* Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
1971	* and will kill the current task with SIGBUS when an error happens.
1972	*/
1973	DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
1974	{
1975	/*
1976	* Handle #DB before calling into !noinstr code to avoid recursive #DB.
1977	*/
1978	if (vc_is_db(error_code)) {
1979	noist_exc_debug(regs);
1980	return;
1981	}
1982
1983	irqentry_enter_from_user_mode(regs);
1984	instrumentation_begin();
1985
1986	if (!vc_raw_handle_exception(regs, error_code)) {
1987	/*
1988	* Do not kill the machine if user-space triggered the
1989	* exception. Send SIGBUS instead and let user-space deal with
1990	* it.
1991	*/
1992	force_sig_fault(SIGBUS, BUS_OBJERR, addr: (void __user *)`0`);
1993	}
1994
1995	instrumentation_end();
1996	irqentry_exit_to_user_mode(regs);
1997	}
1998
1999	bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
2000	{
2001	unsigned long exit_code = regs->orig_ax;
2002	struct es_em_ctxt ctxt;
2003	enum es_result result;
2004
2005	vc_ghcb_invalidate(ghcb: boot_ghcb);
2006
2007	result = vc_init_em_ctxt(ctxt: &ctxt, regs, exit_code);
2008	if (result == ES_OK)
2009	result = vc_handle_exitcode(ctxt: &ctxt, ghcb: boot_ghcb, exit_code);
2010
2011	/ Done - now check the result /
2012	switch (result) {
2013	case ES_OK:
2014	vc_finish_insn(ctxt: &ctxt);
2015	break;
2016	case ES_UNSUPPORTED:
2017	early_printk(fmt: "PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
2018	exit_code, regs->ip);
2019	goto fail;
2020	case ES_VMM_ERROR:
2021	early_printk(fmt: "PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
2022	exit_code, regs->ip);
2023	goto fail;
2024	case ES_DECODE_FAILED:
2025	early_printk(fmt: "PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
2026	exit_code, regs->ip);
2027	goto fail;
2028	case ES_EXCEPTION:
2029	vc_early_forward_exception(ctxt: &ctxt);
2030	break;
2031	case ES_RETRY:
2032	/ Nothing to do /
2033	break;
2034	default:
2035	BUG();
2036	}
2037
2038	return true;
2039
2040	fail:
2041	show_regs(regs);
2042
2043	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
2044	}
2045
2046	/*
2047	* Initial set up of SNP relies on information provided by the
2048	* Confidential Computing blob, which can be passed to the kernel
2049	* in the following ways, depending on how it is booted:
2050	*
2051	* - when booted via the boot/decompress kernel:
2052	* - via boot_params
2053	*
2054	* - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
2055	* - via a setup_data entry, as defined by the Linux Boot Protocol
2056	*
2057	* Scan for the blob in that order.
2058	*/
2059	static __init struct cc_blob_sev_info find_cc_blob(struct* boot_params *bp)
2060	{
2061	struct cc_blob_sev_info *cc_info;
2062
2063	/ Boot kernel would have passed the CC blob via boot_params. /
2064	if (bp->cc_blob_address) {
2065	cc_info = (struct cc_blob_sev_info )(unsigned* long)bp->cc_blob_address;
2066	goto found_cc_info;
2067	}
2068
2069	/*
2070	* If kernel was booted directly, without the use of the
2071	* boot/decompression kernel, the CC blob may have been passed via
2072	* setup_data instead.
2073	*/
2074	cc_info = find_cc_blob_setup_data(bp);
2075	if (!cc_info)
2076	return NULL;
2077
2078	found_cc_info:
2079	if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
2080	snp_abort();
2081
2082	return cc_info;
2083	}
2084
2085	bool __init snp_init(struct boot_params *bp)
2086	{
2087	struct cc_blob_sev_info *cc_info;
2088
2089	if (!bp)
2090	return false;
2091
2092	cc_info = find_cc_blob(bp);
2093	if (!cc_info)
2094	return false;
2095
2096	setup_cpuid_table(cc_info);
2097
2098	/*
2099	* The CC blob will be used later to access the secrets page. Cache
2100	* it here like the boot kernel does.
2101	*/
2102	bp->cc_blob_address = (u32)(unsigned long)cc_info;
2103
2104	return true;
2105	}
2106
2107	void __init __noreturn snp_abort(void)
2108	{
2109	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
2110	}
2111
2112	static void dump_cpuid_table(void)
2113	{
2114	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
2115	int i = `0`;
2116
2117	pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
2118	cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
2119
2120	for (i = `0`; i < SNP_CPUID_COUNT_MAX; i++) {
2121	const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
2122
2123	pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
2124	i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
2125	fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
2126	}
2127	}
2128
2129	/*
2130	* It is useful from an auditing/testing perspective to provide an easy way
2131	* for the guest owner to know that the CPUID table has been initialized as
2132	* expected, but that initialization happens too early in boot to print any
2133	* sort of indicator, and there's not really any other good place to do it,
2134	* so do it here.
2135	*/
2136	static int __init report_cpuid_table(void)
2137	{
2138	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
2139
2140	if (!cpuid_table->count)
2141	return `0`;
2142
2143	pr_info("Using SNP CPUID table, %d entries present.\n",
2144	cpuid_table->count);
2145
2146	if (sev_cfg.debug)
2147	dump_cpuid_table();
2148
2149	return `0`;
2150	}
2151	arch_initcall(report_cpuid_table);
2152
2153	static int __init init_sev_config(char *str)
2154	{
2155	char *s;
2156
2157	while ((s = strsep(&str, ","))) {
2158	if (!strcmp(s, "debug")) {
2159	sev_cfg.debug = true;
2160	continue;
2161	}
2162
2163	pr_info("SEV command-line option '%s' was not recognized\n", s);
2164	}
2165
2166	return `1`;
2167	}
2168	__setup("sev=", init_sev_config);
2169
2170	int snp_issue_guest_request(u64 exit_code, struct snp_req_data input, struct* snp_guest_request_ioctl *rio)
2171	{
2172	struct ghcb_state state;
2173	struct es_em_ctxt ctxt;
2174	unsigned long flags;
2175	struct ghcb *ghcb;
2176	int ret;
2177
2178	rio->exitinfo2 = SEV_RET_NO_FW_CALL;
2179
2180	/*
2181	* __sev_get_ghcb() needs to run with IRQs disabled because it is using
2182	* a per-CPU GHCB.
2183	*/
2184	local_irq_save(flags);
2185
2186	ghcb = __sev_get_ghcb(state: &state);
2187	if (!ghcb) {
2188	ret = -EIO;
2189	goto e_restore_irq;
2190	}
2191
2192	vc_ghcb_invalidate(ghcb);
2193
2194	if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
2195	ghcb_set_rax(ghcb, value: input->data_gpa);
2196	ghcb_set_rbx(ghcb, value: input->data_npages);
2197	}
2198
2199	ret = sev_es_ghcb_hv_call(ghcb, ctxt: &ctxt, exit_code, exit_info_1: input->req_gpa, exit_info_2: input->resp_gpa);
2200	if (ret)
2201	goto e_put;
2202
2203	rio->exitinfo2 = ghcb->save.sw_exit_info_2;
2204	switch (rio->exitinfo2) {
2205	case `0`:
2206	break;
2207
2208	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
2209	ret = -EAGAIN;
2210	break;
2211
2212	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
2213	/ Number of expected pages are returned in RBX /
2214	if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
2215	input->data_npages = ghcb_get_rbx(ghcb);
2216	ret = -ENOSPC;
2217	break;
2218	}
2219	fallthrough;
2220	default:
2221	ret = -EIO;
2222	break;
2223	}
2224
2225	e_put:
2226	__sev_put_ghcb(state: &state);
2227	e_restore_irq:
2228	local_irq_restore(flags);
2229
2230	return ret;
2231	}
2232	EXPORT_SYMBOL_GPL(snp_issue_guest_request);
2233
2234	static struct platform_device sev_guest_device = {
2235	.name = "sev-guest",
2236	.id = -`1`,
2237	};
2238
2239	static int __init snp_init_platform_device(void)
2240	{
2241	struct sev_guest_platform_data data;
2242	u64 gpa;
2243
2244	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
2245	return -ENODEV;
2246
2247	gpa = get_secrets_page();
2248	if (!gpa)
2249	return -ENODEV;
2250
2251	data.secrets_gpa = gpa;
2252	if (platform_device_add_data(pdev: &sev_guest_device, data: &data, size: sizeof(data)))
2253	return -ENODEV;
2254
2255	if (platform_device_register(&sev_guest_device))
2256	return -ENODEV;
2257
2258	pr_info("SNP guest platform device initialized.\n");
2259	return `0`;
2260	}
2261	device_initcall(snp_init_platform_device);
2262

/* source code of linux/arch/x86/kernel/sev.c */