tdx.c source code [linux/arch/x86/coco/tdx/tdx.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/ Copyright (C) 2021-2022 Intel Corporation /
3
4	#undef pr_fmt
5	#define pr_fmt(fmt) "tdx: " fmt
6
7	#include <linux/cpufeature.h>
8	#include <linux/export.h>
9	#include <linux/io.h>
10	#include <asm/coco.h>
11	#include <asm/tdx.h>
12	#include <asm/vmx.h>
13	#include <asm/ia32.h>
14	#include <asm/insn.h>
15	#include <asm/insn-eval.h>
16	#include <asm/pgtable.h>
17
18	/ MMIO direction /
19	#define EPT_READ 0
20	#define EPT_WRITE 1
21
22	/ Port I/O direction /
23	#define PORT_READ 0
24	#define PORT_WRITE 1
25
26	/ See Exit Qualification for I/O Instructions in VMX documentation /
27	#define VE_IS_IO_IN(e) ((e) & BIT(3))
28	#define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1)
29	#define VE_GET_PORT_NUM(e) ((e) >> 16)
30	#define VE_IS_IO_STRING(e) ((e) & BIT(4))
31
32	#define ATTR_DEBUG BIT(0)
33	#define ATTR_SEPT_VE_DISABLE BIT(28)
34
35	/ TDX Module call error codes /
36	#define TDCALL_RETURN_CODE(a) ((a) >> 32)
37	#define TDCALL_INVALID_OPERAND 0xc0000100
38
39	#define TDREPORT_SUBTYPE_0 0
40
41	/ Called from __tdx_hypercall() for unrecoverable failure /
42	noinstr void __noreturn __tdx_hypercall_failed(void)
43	{
44	instrumentation_begin();
45	panic(fmt: "TDVMCALL failed. TDX module bug?");
46	}
47
#ifdef CONFIG_KVM_GUEST
/*
 * Issue a hypercall through __tdx_hypercall() with the hypercall number
 * and its four parameters placed in R10..R14.
 *
 * @nr:		hypercall number, passed in R10
 * @p1..@p4:	hypercall parameters, passed in R11..R14
 *
 * Returns the value produced by __tdx_hypercall() unchanged.
 */
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
		       unsigned long p3, unsigned long p4)
{
	struct tdx_module_args args = {
		.r10 = nr,
		.r11 = p1,
		.r12 = p2,
		.r13 = p3,
		.r14 = p4,
	};

	return __tdx_hypercall(&args);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif
64
65	/*
66	* Used for TDX guests to make calls directly to the TD module. This
67	* should only be used for calls that have no legitimate reason to fail
68	* or where the kernel can not survive the call failing.
69	*/
70	static inline void tdcall(u64 fn, struct tdx_module_args *args)
71	{
72	if (__tdcall_ret(fn, args))
73	panic(fmt: "TDCALL %lld failed (Buggy TDX module!)\n", fn);
74	}
75
76	/**
77	* tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
78	* subtype 0) using TDG.MR.REPORT TDCALL.
79	* @reportdata: Address of the input buffer which contains user-defined
80	* REPORTDATA to be included into TDREPORT.
81	* @tdreport: Address of the output buffer to store TDREPORT.
82	*
83	* Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
84	* v1.0 specification for more information on TDG.MR.REPORT TDCALL.
85	* It is used in the TDX guest driver module to get the TDREPORT0.
86	*
87	* Return 0 on success, -EINVAL for invalid operands, or -EIO on
88	* other TDCALL failures.
89	*/
90	int tdx_mcall_get_report0(u8 reportdata, u8 tdreport)
91	{
92	struct tdx_module_args args = {
93	.rcx = virt_to_phys(address: tdreport),
94	.rdx = virt_to_phys(address: reportdata),
95	.r8 = TDREPORT_SUBTYPE_0,
96	};
97	u64 ret;
98
99	ret = __tdcall(TDG_MR_REPORT, args: &args);
100	if (ret) {
101	if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
102	return -EINVAL;
103	return -EIO;
104	}
105
106	return `0`;
107	}
108	EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
109
110	/**
111	* tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote
112	* hypercall.
113	* @buf: Address of the directly mapped shared kernel buffer which
114	* contains TDREPORT. The same buffer will be used by VMM to
115	* store the generated TD Quote output.
116	* @size: size of the tdquote buffer (4KB-aligned).
117	*
118	* Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI
119	* v1.0 specification for more information on GetQuote hypercall.
120	* It is used in the TDX guest driver module to get the TD Quote.
121	*
122	* Return 0 on success or error code on failure.
123	*/
124	u64 tdx_hcall_get_quote(u8 *buf, size_t size)
125	{
126	/ Since buf is a shared memory, set the shared (decrypted) bits /
127	return _tdx_hypercall(TDVMCALL_GET_QUOTE, r12: cc_mkdec(virt_to_phys(address: buf)), r13: size, r14: `0`, r15: `0`);
128	}
129	EXPORT_SYMBOL_GPL(tdx_hcall_get_quote);
130
131	static void __noreturn tdx_panic(const char *msg)
132	{
133	struct tdx_module_args args = {
134	.r10 = TDX_HYPERCALL_STANDARD,
135	.r11 = TDVMCALL_REPORT_FATAL_ERROR,
136	.r12 = `0`, / Error code: 0 is Panic /
137	};
138	union {
139	/ Define register order according to the GHCI /
140	struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
141
142	char str[`64`];
143	} message;
144
145	/ VMM assumes '\0' in byte 65, if the message took all 64 bytes /
146	strtomem_pad(message.str, msg, `'\0'`);
147
148	args.r8 = message.r8;
149	args.r9 = message.r9;
150	args.r14 = message.r14;
151	args.r15 = message.r15;
152	args.rdi = message.rdi;
153	args.rsi = message.rsi;
154	args.rbx = message.rbx;
155	args.rdx = message.rdx;
156
157	/*
158	* This hypercall should never return and it is not safe
159	* to keep the guest running. Call it forever if it
160	* happens to return.
161	*/
162	while (`1`)
163	__tdx_hypercall(args: &args);
164	}
165
166	static void tdx_parse_tdinfo(u64 *cc_mask)
167	{
168	struct tdx_module_args args = {};
169	unsigned int gpa_width;
170	u64 td_attr;
171
172	/*
173	* TDINFO TDX module call is used to get the TD execution environment
174	* information like GPA width, number of available vcpus, debug mode
175	* information, etc. More details about the ABI can be found in TDX
176	* Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
177	* [TDG.VP.INFO].
178	*/
179	tdcall(TDG_VP_INFO, args: &args);
180
181	/*
182	* The highest bit of a guest physical address is the "sharing" bit.
183	* Set it for shared pages and clear it for private pages.
184	*
185	* The GPA width that comes out of this call is critical. TDX guests
186	* can not meaningfully run without it.
187	*/
188	gpa_width = args.rcx & GENMASK(`5`, `0`);
189	*cc_mask = BIT_ULL(gpa_width - `1`);
190
191	/*
192	* The kernel can not handle #VE's when accessing normal kernel
193	* memory. Ensure that no #VE will be delivered for accesses to
194	* TD-private memory. Only VMM-shared memory (MMIO) will #VE.
195	*/
196	td_attr = args.rdx;
197	if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
198	const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
199
200	/ Relax SEPT_VE_DISABLE check for debug TD. /
201	if (td_attr & ATTR_DEBUG)
202	pr_warn("%s\n", msg);
203	else
204	tdx_panic(msg);
205	}
206	}
207
208	/*
209	* The TDX module spec states that #VE may be injected for a limited set of
210	* reasons:
211	*
212	* - Emulation of the architectural #VE injection on EPT violation;
213	*
214	* - As a result of guest TD execution of a disallowed instruction,
215	* a disallowed MSR access, or CPUID virtualization;
216	*
217	* - A notification to the guest TD about anomalous behavior;
218	*
219	* The last one is opt-in and is not used by the kernel.
220	*
221	* The Intel Software Developer's Manual describes cases when instruction
222	* length field can be used in section "Information for VM Exits Due to
223	* Instruction Execution".
224	*
225	* For TDX, it ultimately means GET_VEINFO provides reliable instruction length
226	* information if #VE occurred due to instruction execution, but not for EPT
227	* violations.
228	*/
229	static int ve_instr_len(struct ve_info *ve)
230	{
231	switch (ve->exit_reason) {
232	case EXIT_REASON_HLT:
233	case EXIT_REASON_MSR_READ:
234	case EXIT_REASON_MSR_WRITE:
235	case EXIT_REASON_CPUID:
236	case EXIT_REASON_IO_INSTRUCTION:
237	/ It is safe to use ve->instr_len for #VE due instructions /
238	return ve->instr_len;
239	case EXIT_REASON_EPT_VIOLATION:
240	/*
241	* For EPT violations, ve->insn_len is not defined. For those,
242	* the kernel must decode instructions manually and should not
243	* be using this function.
244	*/
245	WARN_ONCE(`1`, "ve->instr_len is not defined for EPT violations");
246	return `0`;
247	default:
248	WARN_ONCE(`1`, "Unexpected #VE-type: %lld\n", ve->exit_reason);
249	return ve->instr_len;
250	}
251	}
252
253	static u64 __cpuidle __halt(const bool irq_disabled)
254	{
255	struct tdx_module_args args = {
256	.r10 = TDX_HYPERCALL_STANDARD,
257	.r11 = hcall_func(EXIT_REASON_HLT),
258	.r12 = irq_disabled,
259	};
260
261	/*
262	* Emulate HLT operation via hypercall. More info about ABI
263	* can be found in TDX Guest-Host-Communication Interface
264	* (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
265	*
266	* The VMM uses the "IRQ disabled" param to understand IRQ
267	* enabled status (RFLAGS.IF) of the TD guest and to determine
268	* whether or not it should schedule the halted vCPU if an
269	* IRQ becomes pending. E.g. if IRQs are disabled, the VMM
270	* can keep the vCPU in virtual HLT, even if an IRQ is
271	* pending, without hanging/breaking the guest.
272	*/
273	return __tdx_hypercall(args: &args);
274	}
275
276	static int handle_halt(struct ve_info *ve)
277	{
278	const bool irq_disabled = irqs_disabled();
279
280	if (__halt(irq_disabled))
281	return -EIO;
282
283	return ve_instr_len(ve);
284	}
285
286	void __cpuidle tdx_safe_halt(void)
287	{
288	const bool irq_disabled = false;
289
290	/*
291	* Use WARN_ONCE() to report the failure.
292	*/
293	if (__halt(irq_disabled))
294	WARN_ONCE(`1`, "HLT instruction emulation failed\n");
295	}
296
297	static int read_msr(struct pt_regs regs, struct* ve_info *ve)
298	{
299	struct tdx_module_args args = {
300	.r10 = TDX_HYPERCALL_STANDARD,
301	.r11 = hcall_func(EXIT_REASON_MSR_READ),
302	.r12 = regs->cx,
303	};
304
305	/*
306	* Emulate the MSR read via hypercall. More info about ABI
307	* can be found in TDX Guest-Host-Communication Interface
308	* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
309	*/
310	if (__tdx_hypercall(args: &args))
311	return -EIO;
312
313	regs->ax = lower_32_bits(args.r11);
314	regs->dx = upper_32_bits(args.r11);
315	return ve_instr_len(ve);
316	}
317
318	static int write_msr(struct pt_regs regs, struct* ve_info *ve)
319	{
320	struct tdx_module_args args = {
321	.r10 = TDX_HYPERCALL_STANDARD,
322	.r11 = hcall_func(EXIT_REASON_MSR_WRITE),
323	.r12 = regs->cx,
324	.r13 = (u64)regs->dx << `32` \| regs->ax,
325	};
326
327	/*
328	* Emulate the MSR write via hypercall. More info about ABI
329	* can be found in TDX Guest-Host-Communication Interface
330	* (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
331	*/
332	if (__tdx_hypercall(args: &args))
333	return -EIO;
334
335	return ve_instr_len(ve);
336	}
337
338	static int handle_cpuid(struct pt_regs regs, struct* ve_info *ve)
339	{
340	struct tdx_module_args args = {
341	.r10 = TDX_HYPERCALL_STANDARD,
342	.r11 = hcall_func(EXIT_REASON_CPUID),
343	.r12 = regs->ax,
344	.r13 = regs->cx,
345	};
346
347	/*
348	* Only allow VMM to control range reserved for hypervisor
349	* communication.
350	*
351	* Return all-zeros for any CPUID outside the range. It matches CPU
352	* behaviour for non-supported leaf.
353	*/
354	if (regs->ax < `0x40000000` \|\| regs->ax > `0x4FFFFFFF`) {
355	regs->ax = regs->bx = regs->cx = regs->dx = `0`;
356	return ve_instr_len(ve);
357	}
358
359	/*
360	* Emulate the CPUID instruction via a hypercall. More info about
361	* ABI can be found in TDX Guest-Host-Communication Interface
362	* (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
363	*/
364	if (__tdx_hypercall(args: &args))
365	return -EIO;
366
367	/*
368	* As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
369	* EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
370	* So copy the register contents back to pt_regs.
371	*/
372	regs->ax = args.r12;
373	regs->bx = args.r13;
374	regs->cx = args.r14;
375	regs->dx = args.r15;
376
377	return ve_instr_len(ve);
378	}
379
380	static bool mmio_read(int size, unsigned long addr, unsigned long *val)
381	{
382	struct tdx_module_args args = {
383	.r10 = TDX_HYPERCALL_STANDARD,
384	.r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
385	.r12 = size,
386	.r13 = EPT_READ,
387	.r14 = addr,
388	.r15 = *val,
389	};
390
391	if (__tdx_hypercall(args: &args))
392	return false;
393
394	*val = args.r11;
395	return true;
396	}
397
398	static bool mmio_write(int size, unsigned long addr, unsigned long val)
399	{
400	return !_tdx_hypercall(fn: hcall_func(EXIT_REASON_EPT_VIOLATION), r12: size,
401	EPT_WRITE, r14: addr, r15: val);
402	}
403
404	static int handle_mmio(struct pt_regs regs, struct* ve_info *ve)
405	{
406	unsigned long *reg, val, vaddr;
407	char buffer[MAX_INSN_SIZE];
408	enum insn_mmio_type mmio;
409	struct insn insn = {};
410	int size, extend_size;
411	u8 extend_val = `0`;
412
413	/ Only in-kernel MMIO is supported /
414	if (WARN_ON_ONCE(user_mode(regs)))
415	return -EFAULT;
416
417	if (copy_from_kernel_nofault(dst: buffer, src: (void *)regs->ip, MAX_INSN_SIZE))
418	return -EFAULT;
419
420	if (insn_decode(insn: &insn, kaddr: buffer, MAX_INSN_SIZE, m: INSN_MODE_64))
421	return -EINVAL;
422
423	mmio = insn_decode_mmio(insn: &insn, bytes: &size);
424	if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
425	return -EINVAL;
426
427	if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
428	reg = insn_get_modrm_reg_ptr(insn: &insn, regs);
429	if (!reg)
430	return -EINVAL;
431	}
432
433	/*
434	* Reject EPT violation #VEs that split pages.
435	*
436	* MMIO accesses are supposed to be naturally aligned and therefore
437	* never cross page boundaries. Seeing split page accesses indicates
438	* a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
439	*
440	* load_unaligned_zeropad() will recover using exception fixups.
441	*/
442	vaddr = (unsigned long)insn_get_addr_ref(insn: &insn, regs);
443	if (vaddr / PAGE_SIZE != (vaddr + size - `1`) / PAGE_SIZE)
444	return -EFAULT;
445
446	/ Handle writes first /
447	switch (mmio) {
448	case INSN_MMIO_WRITE:
449	memcpy(&val, reg, size);
450	if (!mmio_write(size, addr: ve->gpa, val))
451	return -EIO;
452	return insn.length;
453	case INSN_MMIO_WRITE_IMM:
454	val = insn.immediate.value;
455	if (!mmio_write(size, addr: ve->gpa, val))
456	return -EIO;
457	return insn.length;
458	case INSN_MMIO_READ:
459	case INSN_MMIO_READ_ZERO_EXTEND:
460	case INSN_MMIO_READ_SIGN_EXTEND:
461	/ Reads are handled below /
462	break;
463	case INSN_MMIO_MOVS:
464	case INSN_MMIO_DECODE_FAILED:
465	/*
466	* MMIO was accessed with an instruction that could not be
467	* decoded or handled properly. It was likely not using io.h
468	* helpers or accessed MMIO accidentally.
469	*/
470	return -EINVAL;
471	default:
472	WARN_ONCE(`1`, "Unknown insn_decode_mmio() decode value?");
473	return -EINVAL;
474	}
475
476	/ Handle reads /
477	if (!mmio_read(size, addr: ve->gpa, val: &val))
478	return -EIO;
479
480	switch (mmio) {
481	case INSN_MMIO_READ:
482	/ Zero-extend for 32-bit operation /
483	extend_size = size == `4` ? sizeof(*reg) : `0`;
484	break;
485	case INSN_MMIO_READ_ZERO_EXTEND:
486	/ Zero extend based on operand size /
487	extend_size = insn.opnd_bytes;
488	break;
489	case INSN_MMIO_READ_SIGN_EXTEND:
490	/ Sign extend based on operand size /
491	extend_size = insn.opnd_bytes;
492	if (size == `1` && val & BIT(`7`))
493	extend_val = `0xFF`;
494	else if (size > `1` && val & BIT(`15`))
495	extend_val = `0xFF`;
496	break;
497	default:
498	/ All other cases has to be covered with the first switch() /
499	WARN_ON_ONCE(`1`);
500	return -EINVAL;
501	}
502
503	if (extend_size)
504	memset(reg, extend_val, extend_size);
505	memcpy(reg, &val, size);
506	return insn.length;
507	}
508
509	static bool handle_in(struct pt_regs regs, int* size, int port)
510	{
511	struct tdx_module_args args = {
512	.r10 = TDX_HYPERCALL_STANDARD,
513	.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
514	.r12 = size,
515	.r13 = PORT_READ,
516	.r14 = port,
517	};
518	u64 mask = GENMASK(BITS_PER_BYTE * size, `0`);
519	bool success;
520
521	/*
522	* Emulate the I/O read via hypercall. More info about ABI can be found
523	* in TDX Guest-Host-Communication Interface (GHCI) section titled
524	* "TDG.VP.VMCALL<Instruction.IO>".
525	*/
526	success = !__tdx_hypercall(args: &args);
527
528	/ Update part of the register affected by the emulated instruction /
529	regs->ax &= ~mask;
530	if (success)
531	regs->ax \|= args.r11 & mask;
532
533	return success;
534	}
535
536	static bool handle_out(struct pt_regs regs, int* size, int port)
537	{
538	u64 mask = GENMASK(BITS_PER_BYTE * size, `0`);
539
540	/*
541	* Emulate the I/O write via hypercall. More info about ABI can be found
542	* in TDX Guest-Host-Communication Interface (GHCI) section titled
543	* "TDG.VP.VMCALL<Instruction.IO>".
544	*/
545	return !_tdx_hypercall(fn: hcall_func(EXIT_REASON_IO_INSTRUCTION), r12: size,
546	PORT_WRITE, r14: port, r15: regs->ax & mask);
547	}
548
549	/*
550	* Emulate I/O using hypercall.
551	*
552	* Assumes the IO instruction was using ax, which is enforced
553	* by the standard io.h macros.
554	*
555	* Return True on success or False on failure.
556	*/
557	static int handle_io(struct pt_regs regs, struct* ve_info *ve)
558	{
559	u32 exit_qual = ve->exit_qual;
560	int size, port;
561	bool in, ret;
562
563	if (VE_IS_IO_STRING(exit_qual))
564	return -EIO;
565
566	in = VE_IS_IO_IN(exit_qual);
567	size = VE_GET_IO_SIZE(exit_qual);
568	port = VE_GET_PORT_NUM(exit_qual);
569
570
571	if (in)
572	ret = handle_in(regs, size, port);
573	else
574	ret = handle_out(regs, size, port);
575	if (!ret)
576	return -EIO;
577
578	return ve_instr_len(ve);
579	}
580
581	/*
582	* Early #VE exception handler. Only handles a subset of port I/O.
583	* Intended only for earlyprintk. If failed, return false.
584	*/
585	__init bool tdx_early_handle_ve(struct pt_regs *regs)
586	{
587	struct ve_info ve;
588	int insn_len;
589
590	tdx_get_ve_info(ve: &ve);
591
592	if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
593	return false;
594
595	insn_len = handle_io(regs, ve: &ve);
596	if (insn_len < `0`)
597	return false;
598
599	regs->ip += insn_len;
600	return true;
601	}
602
603	void tdx_get_ve_info(struct ve_info *ve)
604	{
605	struct tdx_module_args args = {};
606
607	/*
608	* Called during #VE handling to retrieve the #VE info from the
609	* TDX module.
610	*
611	* This has to be called early in #VE handling. A "nested" #VE which
612	* occurs before this will raise a #DF and is not recoverable.
613	*
614	* The call retrieves the #VE info from the TDX module, which also
615	* clears the "#VE valid" flag. This must be done before anything else
616	* because any #VE that occurs while the valid flag is set will lead to
617	* #DF.
618	*
619	* Note, the TDX module treats virtual NMIs as inhibited if the #VE
620	* valid flag is set. It means that NMI=>#VE will not result in a #DF.
621	*/
622	tdcall(TDG_VP_VEINFO_GET, args: &args);
623
624	/ Transfer the output parameters /
625	ve->exit_reason = args.rcx;
626	ve->exit_qual = args.rdx;
627	ve->gla = args.r8;
628	ve->gpa = args.r9;
629	ve->instr_len = lower_32_bits(args.r10);
630	ve->instr_info = upper_32_bits(args.r10);
631	}
632
633	/*
634	* Handle the user initiated #VE.
635	*
636	* On success, returns the number of bytes RIP should be incremented (>=0)
637	* or -errno on error.
638	*/
639	static int virt_exception_user(struct pt_regs regs, struct* ve_info *ve)
640	{
641	switch (ve->exit_reason) {
642	case EXIT_REASON_CPUID:
643	return handle_cpuid(regs, ve);
644	default:
645	pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
646	return -EIO;
647	}
648	}
649
650	static inline bool is_private_gpa(u64 gpa)
651	{
652	return gpa == cc_mkenc(val: gpa);
653	}
654
655	/*
656	* Handle the kernel #VE.
657	*
658	* On success, returns the number of bytes RIP should be incremented (>=0)
659	* or -errno on error.
660	*/
661	static int virt_exception_kernel(struct pt_regs regs, struct* ve_info *ve)
662	{
663	switch (ve->exit_reason) {
664	case EXIT_REASON_HLT:
665	return handle_halt(ve);
666	case EXIT_REASON_MSR_READ:
667	return read_msr(regs, ve);
668	case EXIT_REASON_MSR_WRITE:
669	return write_msr(regs, ve);
670	case EXIT_REASON_CPUID:
671	return handle_cpuid(regs, ve);
672	case EXIT_REASON_EPT_VIOLATION:
673	if (is_private_gpa(gpa: ve->gpa))
674	panic(fmt: "Unexpected EPT-violation on private memory.");
675	return handle_mmio(regs, ve);
676	case EXIT_REASON_IO_INSTRUCTION:
677	return handle_io(regs, ve);
678	default:
679	pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
680	return -EIO;
681	}
682	}
683
684	bool tdx_handle_virt_exception(struct pt_regs regs, struct* ve_info *ve)
685	{
686	int insn_len;
687
688	if (user_mode(regs))
689	insn_len = virt_exception_user(regs, ve);
690	else
691	insn_len = virt_exception_kernel(regs, ve);
692	if (insn_len < `0`)
693	return false;
694
695	/ After successful #VE handling, move the IP /
696	regs->ip += insn_len;
697
698	return true;
699	}
700
/*
 * Tell the caller whether a TLB flush is needed for a conversion.
 *
 * @private: true when converting to private, false when converting to
 *           shared.
 *
 * Returns true only for the private->shared direction.
 */
static bool tdx_tlb_flush_required(bool private)
{
	/*
	 * TDX guest is responsible for flushing TLB on private->shared
	 * transition. VMM is responsible for flushing on shared->private.
	 *
	 * The VMM _can't_ flush private addresses as it can't generate PAs
	 * with the guest's HKID.  Shared memory isn't subject to integrity
	 * checking, i.e. the VMM doesn't need to flush for its own protection.
	 *
	 * There's no need to flush when converting from shared to private,
	 * as flushing is the VMM's responsibility in this case, e.g. it must
	 * flush to avoid integrity failures in the face of a buggy or
	 * malicious guest.
	 */
	return !private;
}
718
/*
 * Tell the caller whether a cache flush is needed for a conversion.
 *
 * Always returns true.
 */
static bool tdx_cache_flush_required(void)
{
	/*
	 * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
	 * TDX doesn't have such capability.
	 *
	 * Flush cache unconditionally.
	 */
	return true;
}
729
730	/*
731	* Notify the VMM about page mapping conversion. More info about ABI
732	* can be found in TDX Guest-Host-Communication Interface (GHCI),
733	* section "TDG.VP.VMCALL<MapGPA>".
734	*/
735	static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
736	{
737	/ Retrying the hypercall a second time should succeed; use 3 just in case /
738	const int max_retries_per_page = `3`;
739	int retry_count = `0`;
740
741	if (!enc) {
742	/ Set the shared (decrypted) bits: /
743	start \|= cc_mkdec(val: `0`);
744	end \|= cc_mkdec(val: `0`);
745	}
746
747	while (retry_count < max_retries_per_page) {
748	struct tdx_module_args args = {
749	.r10 = TDX_HYPERCALL_STANDARD,
750	.r11 = TDVMCALL_MAP_GPA,
751	.r12 = start,
752	.r13 = end - start };
753
754	u64 map_fail_paddr;
755	u64 ret = __tdx_hypercall(args: &args);
756
757	if (ret != TDVMCALL_STATUS_RETRY)
758	return !ret;
759	/*
760	* The guest must retry the operation for the pages in the
761	* region starting at the GPA specified in R11. R11 comes
762	* from the untrusted VMM. Sanity check it.
763	*/
764	map_fail_paddr = args.r11;
765	if (map_fail_paddr < start \|\| map_fail_paddr >= end)
766	return false;
767
768	/ "Consume" a retry without forward progress /
769	if (map_fail_paddr == start) {
770	retry_count++;
771	continue;
772	}
773
774	start = map_fail_paddr;
775	retry_count = `0`;
776	}
777
778	return false;
779	}
780
781	/*
782	* Inform the VMM of the guest's intent for this physical page: shared with
783	* the VMM or private to the guest. The VMM is expected to change its mapping
784	* of the page in response.
785	*/
786	static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
787	{
788	phys_addr_t start = __pa(vaddr);
789	phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
790
791	if (!tdx_map_gpa(start, end, enc))
792	return false;
793
794	/ shared->private conversion requires memory to be accepted before use /
795	if (enc)
796	return tdx_accept_memory(start, end);
797
798	return true;
799	}
800
801	static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
802	bool enc)
803	{
804	/*
805	* Only handle shared->private conversion here.
806	* See the comment in tdx_early_init().
807	*/
808	if (enc)
809	return tdx_enc_status_changed(vaddr, numpages, enc);
810	return true;
811	}
812
813	static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
814	bool enc)
815	{
816	/*
817	* Only handle private->shared conversion here.
818	* See the comment in tdx_early_init().
819	*/
820	if (!enc)
821	return tdx_enc_status_changed(vaddr, numpages, enc);
822	return true;
823	}
824
825	void __init tdx_early_init(void)
826	{
827	struct tdx_module_args args = {
828	.rdx = TDCS_NOTIFY_ENABLES,
829	.r9 = -`1ULL`,
830	};
831	u64 cc_mask;
832	u32 eax, sig[`3`];
833
834	cpuid_count(TDX_CPUID_LEAF_ID, count: `0`, eax: &eax, ebx: &sig[`0`], ecx: &sig[`2`], edx: &sig[`1`]);
835
836	if (memcmp(TDX_IDENT, q: sig, size: sizeof(sig)))
837	return;
838
839	setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
840
841	/ TSC is the only reliable clock in TDX guest /
842	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
843
844	cc_vendor = CC_VENDOR_INTEL;
845	tdx_parse_tdinfo(cc_mask: &cc_mask);
846	cc_set_mask(mask: cc_mask);
847
848	/ Kernel does not use NOTIFY_ENABLES and does not need random #VEs /
849	tdcall(TDG_VM_WR, args: &args);
850
851	/*
852	* All bits above GPA width are reserved and kernel treats shared bit
853	* as flag, not as part of physical address.
854	*
855	* Adjust physical mask to only cover valid GPA bits.
856	*/
857	physical_mask &= cc_mask - `1`;
858
859	/*
860	* The kernel mapping should match the TDX metadata for the page.
861	* load_unaligned_zeropad() can touch memory adjacent to that which is
862	* owned by the caller and can catch even _momentary_ mismatches. Bad
863	* things happen on mismatch:
864	*
865	* - Private mapping => Shared Page == Guest shutdown
866	* - Shared mapping => Private Page == Recoverable #VE
867	*
868	* guest.enc_status_change_prepare() converts the page from
869	* shared=>private before the mapping becomes private.
870	*
871	* guest.enc_status_change_finish() converts the page from
872	* private=>shared after the mapping becomes private.
873	*
874	* In both cases there is a temporary shared mapping to a private page,
875	* which can result in a #VE. But, there is never a private mapping to
876	* a shared page.
877	*/
878	x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
879	x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish;
880
881	x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
882	x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
883
884	/*
885	* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
886	* bringup low level code. That raises #VE which cannot be handled
887	* there.
888	*
889	* Intel-TDX has a secure RDMSR hypercall, but that needs to be
890	* implemented separately in the low level startup ASM code.
891	* Until that is in place, disable parallel bringup for TDX.
892	*/
893	x86_cpuinit.parallel_bringup = false;
894
895	pr_info("Guest detected\n");
896	}
897

source code of linux/arch/x86/coco/tdx/tdx.c