1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* Copyright (C) 2021-2022 Intel Corporation */ |
3 | |
4 | #undef pr_fmt |
5 | #define pr_fmt(fmt) "tdx: " fmt |
6 | |
7 | #include <linux/cpufeature.h> |
8 | #include <linux/export.h> |
9 | #include <linux/io.h> |
10 | #include <asm/coco.h> |
11 | #include <asm/tdx.h> |
12 | #include <asm/vmx.h> |
13 | #include <asm/ia32.h> |
14 | #include <asm/insn.h> |
15 | #include <asm/insn-eval.h> |
16 | #include <asm/pgtable.h> |
17 | |
18 | /* MMIO direction */ |
19 | #define EPT_READ 0 |
20 | #define EPT_WRITE 1 |
21 | |
22 | /* Port I/O direction */ |
23 | #define PORT_READ 0 |
24 | #define PORT_WRITE 1 |
25 | |
26 | /* See Exit Qualification for I/O Instructions in VMX documentation */ |
27 | #define VE_IS_IO_IN(e) ((e) & BIT(3)) |
28 | #define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1) |
29 | #define VE_GET_PORT_NUM(e) ((e) >> 16) |
30 | #define VE_IS_IO_STRING(e) ((e) & BIT(4)) |
31 | |
32 | #define ATTR_DEBUG BIT(0) |
33 | #define ATTR_SEPT_VE_DISABLE BIT(28) |
34 | |
35 | /* TDX Module call error codes */ |
36 | #define TDCALL_RETURN_CODE(a) ((a) >> 32) |
37 | #define TDCALL_INVALID_OPERAND 0xc0000100 |
38 | |
39 | #define TDREPORT_SUBTYPE_0 0 |
40 | |
41 | /* Called from __tdx_hypercall() for unrecoverable failure */ |
42 | noinstr void __noreturn __tdx_hypercall_failed(void) |
43 | { |
44 | instrumentation_begin(); |
45 | panic(fmt: "TDVMCALL failed. TDX module bug?" ); |
46 | } |
47 | |
#ifdef CONFIG_KVM_GUEST
/*
 * Forward a KVM hypercall through __tdx_hypercall(): the hypercall number
 * is placed in R10 and the four parameters in R11-R14. The value returned
 * by __tdx_hypercall() is handed back to the caller.
 */
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
		       unsigned long p3, unsigned long p4)
{
	struct tdx_module_args hcall = {
		.r10 = nr,
		.r11 = p1,
		.r12 = p2,
		.r13 = p3,
		.r14 = p4,
	};
	long ret;

	ret = __tdx_hypercall(&hcall);
	return ret;
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif
64 | |
65 | /* |
66 | * Used for TDX guests to make calls directly to the TD module. This |
67 | * should only be used for calls that have no legitimate reason to fail |
68 | * or where the kernel can not survive the call failing. |
69 | */ |
70 | static inline void tdcall(u64 fn, struct tdx_module_args *args) |
71 | { |
72 | if (__tdcall_ret(fn, args)) |
73 | panic(fmt: "TDCALL %lld failed (Buggy TDX module!)\n" , fn); |
74 | } |
75 | |
76 | /** |
77 | * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT |
78 | * subtype 0) using TDG.MR.REPORT TDCALL. |
79 | * @reportdata: Address of the input buffer which contains user-defined |
80 | * REPORTDATA to be included into TDREPORT. |
81 | * @tdreport: Address of the output buffer to store TDREPORT. |
82 | * |
83 | * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module |
84 | * v1.0 specification for more information on TDG.MR.REPORT TDCALL. |
85 | * It is used in the TDX guest driver module to get the TDREPORT0. |
86 | * |
87 | * Return 0 on success, -EINVAL for invalid operands, or -EIO on |
88 | * other TDCALL failures. |
89 | */ |
90 | int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport) |
91 | { |
92 | struct tdx_module_args args = { |
93 | .rcx = virt_to_phys(address: tdreport), |
94 | .rdx = virt_to_phys(address: reportdata), |
95 | .r8 = TDREPORT_SUBTYPE_0, |
96 | }; |
97 | u64 ret; |
98 | |
99 | ret = __tdcall(TDG_MR_REPORT, args: &args); |
100 | if (ret) { |
101 | if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND) |
102 | return -EINVAL; |
103 | return -EIO; |
104 | } |
105 | |
106 | return 0; |
107 | } |
108 | EXPORT_SYMBOL_GPL(tdx_mcall_get_report0); |
109 | |
110 | /** |
111 | * tdx_hcall_get_quote() - Wrapper to request TD Quote using GetQuote |
112 | * hypercall. |
113 | * @buf: Address of the directly mapped shared kernel buffer which |
114 | * contains TDREPORT. The same buffer will be used by VMM to |
115 | * store the generated TD Quote output. |
116 | * @size: size of the tdquote buffer (4KB-aligned). |
117 | * |
118 | * Refer to section titled "TDG.VP.VMCALL<GetQuote>" in the TDX GHCI |
119 | * v1.0 specification for more information on GetQuote hypercall. |
120 | * It is used in the TDX guest driver module to get the TD Quote. |
121 | * |
122 | * Return 0 on success or error code on failure. |
123 | */ |
124 | u64 tdx_hcall_get_quote(u8 *buf, size_t size) |
125 | { |
126 | /* Since buf is a shared memory, set the shared (decrypted) bits */ |
127 | return _tdx_hypercall(TDVMCALL_GET_QUOTE, r12: cc_mkdec(virt_to_phys(address: buf)), r13: size, r14: 0, r15: 0); |
128 | } |
129 | EXPORT_SYMBOL_GPL(tdx_hcall_get_quote); |
130 | |
131 | static void __noreturn tdx_panic(const char *msg) |
132 | { |
133 | struct tdx_module_args args = { |
134 | .r10 = TDX_HYPERCALL_STANDARD, |
135 | .r11 = TDVMCALL_REPORT_FATAL_ERROR, |
136 | .r12 = 0, /* Error code: 0 is Panic */ |
137 | }; |
138 | union { |
139 | /* Define register order according to the GHCI */ |
140 | struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; }; |
141 | |
142 | char str[64]; |
143 | } message; |
144 | |
145 | /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */ |
146 | strtomem_pad(message.str, msg, '\0'); |
147 | |
148 | args.r8 = message.r8; |
149 | args.r9 = message.r9; |
150 | args.r14 = message.r14; |
151 | args.r15 = message.r15; |
152 | args.rdi = message.rdi; |
153 | args.rsi = message.rsi; |
154 | args.rbx = message.rbx; |
155 | args.rdx = message.rdx; |
156 | |
157 | /* |
158 | * This hypercall should never return and it is not safe |
159 | * to keep the guest running. Call it forever if it |
160 | * happens to return. |
161 | */ |
162 | while (1) |
163 | __tdx_hypercall(args: &args); |
164 | } |
165 | |
166 | static void tdx_parse_tdinfo(u64 *cc_mask) |
167 | { |
168 | struct tdx_module_args args = {}; |
169 | unsigned int gpa_width; |
170 | u64 td_attr; |
171 | |
172 | /* |
173 | * TDINFO TDX module call is used to get the TD execution environment |
174 | * information like GPA width, number of available vcpus, debug mode |
175 | * information, etc. More details about the ABI can be found in TDX |
176 | * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL |
177 | * [TDG.VP.INFO]. |
178 | */ |
179 | tdcall(TDG_VP_INFO, args: &args); |
180 | |
181 | /* |
182 | * The highest bit of a guest physical address is the "sharing" bit. |
183 | * Set it for shared pages and clear it for private pages. |
184 | * |
185 | * The GPA width that comes out of this call is critical. TDX guests |
186 | * can not meaningfully run without it. |
187 | */ |
188 | gpa_width = args.rcx & GENMASK(5, 0); |
189 | *cc_mask = BIT_ULL(gpa_width - 1); |
190 | |
191 | /* |
192 | * The kernel can not handle #VE's when accessing normal kernel |
193 | * memory. Ensure that no #VE will be delivered for accesses to |
194 | * TD-private memory. Only VMM-shared memory (MMIO) will #VE. |
195 | */ |
196 | td_attr = args.rdx; |
197 | if (!(td_attr & ATTR_SEPT_VE_DISABLE)) { |
198 | const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set." ; |
199 | |
200 | /* Relax SEPT_VE_DISABLE check for debug TD. */ |
201 | if (td_attr & ATTR_DEBUG) |
202 | pr_warn("%s\n" , msg); |
203 | else |
204 | tdx_panic(msg); |
205 | } |
206 | } |
207 | |
208 | /* |
209 | * The TDX module spec states that #VE may be injected for a limited set of |
210 | * reasons: |
211 | * |
212 | * - Emulation of the architectural #VE injection on EPT violation; |
213 | * |
214 | * - As a result of guest TD execution of a disallowed instruction, |
215 | * a disallowed MSR access, or CPUID virtualization; |
216 | * |
217 | * - A notification to the guest TD about anomalous behavior; |
218 | * |
219 | * The last one is opt-in and is not used by the kernel. |
220 | * |
221 | * The Intel Software Developer's Manual describes cases when instruction |
222 | * length field can be used in section "Information for VM Exits Due to |
223 | * Instruction Execution". |
224 | * |
225 | * For TDX, it ultimately means GET_VEINFO provides reliable instruction length |
226 | * information if #VE occurred due to instruction execution, but not for EPT |
227 | * violations. |
228 | */ |
229 | static int ve_instr_len(struct ve_info *ve) |
230 | { |
231 | switch (ve->exit_reason) { |
232 | case EXIT_REASON_HLT: |
233 | case EXIT_REASON_MSR_READ: |
234 | case EXIT_REASON_MSR_WRITE: |
235 | case EXIT_REASON_CPUID: |
236 | case EXIT_REASON_IO_INSTRUCTION: |
237 | /* It is safe to use ve->instr_len for #VE due instructions */ |
238 | return ve->instr_len; |
239 | case EXIT_REASON_EPT_VIOLATION: |
240 | /* |
241 | * For EPT violations, ve->insn_len is not defined. For those, |
242 | * the kernel must decode instructions manually and should not |
243 | * be using this function. |
244 | */ |
245 | WARN_ONCE(1, "ve->instr_len is not defined for EPT violations" ); |
246 | return 0; |
247 | default: |
248 | WARN_ONCE(1, "Unexpected #VE-type: %lld\n" , ve->exit_reason); |
249 | return ve->instr_len; |
250 | } |
251 | } |
252 | |
253 | static u64 __cpuidle __halt(const bool irq_disabled) |
254 | { |
255 | struct tdx_module_args args = { |
256 | .r10 = TDX_HYPERCALL_STANDARD, |
257 | .r11 = hcall_func(EXIT_REASON_HLT), |
258 | .r12 = irq_disabled, |
259 | }; |
260 | |
261 | /* |
262 | * Emulate HLT operation via hypercall. More info about ABI |
263 | * can be found in TDX Guest-Host-Communication Interface |
264 | * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>. |
265 | * |
266 | * The VMM uses the "IRQ disabled" param to understand IRQ |
267 | * enabled status (RFLAGS.IF) of the TD guest and to determine |
268 | * whether or not it should schedule the halted vCPU if an |
269 | * IRQ becomes pending. E.g. if IRQs are disabled, the VMM |
270 | * can keep the vCPU in virtual HLT, even if an IRQ is |
271 | * pending, without hanging/breaking the guest. |
272 | */ |
273 | return __tdx_hypercall(args: &args); |
274 | } |
275 | |
276 | static int handle_halt(struct ve_info *ve) |
277 | { |
278 | const bool irq_disabled = irqs_disabled(); |
279 | |
280 | if (__halt(irq_disabled)) |
281 | return -EIO; |
282 | |
283 | return ve_instr_len(ve); |
284 | } |
285 | |
286 | void __cpuidle tdx_safe_halt(void) |
287 | { |
288 | const bool irq_disabled = false; |
289 | |
290 | /* |
291 | * Use WARN_ONCE() to report the failure. |
292 | */ |
293 | if (__halt(irq_disabled)) |
294 | WARN_ONCE(1, "HLT instruction emulation failed\n" ); |
295 | } |
296 | |
297 | static int read_msr(struct pt_regs *regs, struct ve_info *ve) |
298 | { |
299 | struct tdx_module_args args = { |
300 | .r10 = TDX_HYPERCALL_STANDARD, |
301 | .r11 = hcall_func(EXIT_REASON_MSR_READ), |
302 | .r12 = regs->cx, |
303 | }; |
304 | |
305 | /* |
306 | * Emulate the MSR read via hypercall. More info about ABI |
307 | * can be found in TDX Guest-Host-Communication Interface |
308 | * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>". |
309 | */ |
310 | if (__tdx_hypercall(args: &args)) |
311 | return -EIO; |
312 | |
313 | regs->ax = lower_32_bits(args.r11); |
314 | regs->dx = upper_32_bits(args.r11); |
315 | return ve_instr_len(ve); |
316 | } |
317 | |
318 | static int write_msr(struct pt_regs *regs, struct ve_info *ve) |
319 | { |
320 | struct tdx_module_args args = { |
321 | .r10 = TDX_HYPERCALL_STANDARD, |
322 | .r11 = hcall_func(EXIT_REASON_MSR_WRITE), |
323 | .r12 = regs->cx, |
324 | .r13 = (u64)regs->dx << 32 | regs->ax, |
325 | }; |
326 | |
327 | /* |
328 | * Emulate the MSR write via hypercall. More info about ABI |
329 | * can be found in TDX Guest-Host-Communication Interface |
330 | * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>". |
331 | */ |
332 | if (__tdx_hypercall(args: &args)) |
333 | return -EIO; |
334 | |
335 | return ve_instr_len(ve); |
336 | } |
337 | |
338 | static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve) |
339 | { |
340 | struct tdx_module_args args = { |
341 | .r10 = TDX_HYPERCALL_STANDARD, |
342 | .r11 = hcall_func(EXIT_REASON_CPUID), |
343 | .r12 = regs->ax, |
344 | .r13 = regs->cx, |
345 | }; |
346 | |
347 | /* |
348 | * Only allow VMM to control range reserved for hypervisor |
349 | * communication. |
350 | * |
351 | * Return all-zeros for any CPUID outside the range. It matches CPU |
352 | * behaviour for non-supported leaf. |
353 | */ |
354 | if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) { |
355 | regs->ax = regs->bx = regs->cx = regs->dx = 0; |
356 | return ve_instr_len(ve); |
357 | } |
358 | |
359 | /* |
360 | * Emulate the CPUID instruction via a hypercall. More info about |
361 | * ABI can be found in TDX Guest-Host-Communication Interface |
362 | * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>". |
363 | */ |
364 | if (__tdx_hypercall(args: &args)) |
365 | return -EIO; |
366 | |
367 | /* |
368 | * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of |
369 | * EAX, EBX, ECX, EDX registers after the CPUID instruction execution. |
370 | * So copy the register contents back to pt_regs. |
371 | */ |
372 | regs->ax = args.r12; |
373 | regs->bx = args.r13; |
374 | regs->cx = args.r14; |
375 | regs->dx = args.r15; |
376 | |
377 | return ve_instr_len(ve); |
378 | } |
379 | |
380 | static bool mmio_read(int size, unsigned long addr, unsigned long *val) |
381 | { |
382 | struct tdx_module_args args = { |
383 | .r10 = TDX_HYPERCALL_STANDARD, |
384 | .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION), |
385 | .r12 = size, |
386 | .r13 = EPT_READ, |
387 | .r14 = addr, |
388 | .r15 = *val, |
389 | }; |
390 | |
391 | if (__tdx_hypercall(args: &args)) |
392 | return false; |
393 | |
394 | *val = args.r11; |
395 | return true; |
396 | } |
397 | |
398 | static bool mmio_write(int size, unsigned long addr, unsigned long val) |
399 | { |
400 | return !_tdx_hypercall(fn: hcall_func(EXIT_REASON_EPT_VIOLATION), r12: size, |
401 | EPT_WRITE, r14: addr, r15: val); |
402 | } |
403 | |
404 | static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) |
405 | { |
406 | unsigned long *reg, val, vaddr; |
407 | char buffer[MAX_INSN_SIZE]; |
408 | enum insn_mmio_type mmio; |
409 | struct insn insn = {}; |
410 | int size, extend_size; |
411 | u8 extend_val = 0; |
412 | |
413 | /* Only in-kernel MMIO is supported */ |
414 | if (WARN_ON_ONCE(user_mode(regs))) |
415 | return -EFAULT; |
416 | |
417 | if (copy_from_kernel_nofault(dst: buffer, src: (void *)regs->ip, MAX_INSN_SIZE)) |
418 | return -EFAULT; |
419 | |
420 | if (insn_decode(insn: &insn, kaddr: buffer, MAX_INSN_SIZE, m: INSN_MODE_64)) |
421 | return -EINVAL; |
422 | |
423 | mmio = insn_decode_mmio(insn: &insn, bytes: &size); |
424 | if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED)) |
425 | return -EINVAL; |
426 | |
427 | if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { |
428 | reg = insn_get_modrm_reg_ptr(insn: &insn, regs); |
429 | if (!reg) |
430 | return -EINVAL; |
431 | } |
432 | |
433 | /* |
434 | * Reject EPT violation #VEs that split pages. |
435 | * |
436 | * MMIO accesses are supposed to be naturally aligned and therefore |
437 | * never cross page boundaries. Seeing split page accesses indicates |
438 | * a bug or a load_unaligned_zeropad() that stepped into an MMIO page. |
439 | * |
440 | * load_unaligned_zeropad() will recover using exception fixups. |
441 | */ |
442 | vaddr = (unsigned long)insn_get_addr_ref(insn: &insn, regs); |
443 | if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) |
444 | return -EFAULT; |
445 | |
446 | /* Handle writes first */ |
447 | switch (mmio) { |
448 | case INSN_MMIO_WRITE: |
449 | memcpy(&val, reg, size); |
450 | if (!mmio_write(size, addr: ve->gpa, val)) |
451 | return -EIO; |
452 | return insn.length; |
453 | case INSN_MMIO_WRITE_IMM: |
454 | val = insn.immediate.value; |
455 | if (!mmio_write(size, addr: ve->gpa, val)) |
456 | return -EIO; |
457 | return insn.length; |
458 | case INSN_MMIO_READ: |
459 | case INSN_MMIO_READ_ZERO_EXTEND: |
460 | case INSN_MMIO_READ_SIGN_EXTEND: |
461 | /* Reads are handled below */ |
462 | break; |
463 | case INSN_MMIO_MOVS: |
464 | case INSN_MMIO_DECODE_FAILED: |
465 | /* |
466 | * MMIO was accessed with an instruction that could not be |
467 | * decoded or handled properly. It was likely not using io.h |
468 | * helpers or accessed MMIO accidentally. |
469 | */ |
470 | return -EINVAL; |
471 | default: |
472 | WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?" ); |
473 | return -EINVAL; |
474 | } |
475 | |
476 | /* Handle reads */ |
477 | if (!mmio_read(size, addr: ve->gpa, val: &val)) |
478 | return -EIO; |
479 | |
480 | switch (mmio) { |
481 | case INSN_MMIO_READ: |
482 | /* Zero-extend for 32-bit operation */ |
483 | extend_size = size == 4 ? sizeof(*reg) : 0; |
484 | break; |
485 | case INSN_MMIO_READ_ZERO_EXTEND: |
486 | /* Zero extend based on operand size */ |
487 | extend_size = insn.opnd_bytes; |
488 | break; |
489 | case INSN_MMIO_READ_SIGN_EXTEND: |
490 | /* Sign extend based on operand size */ |
491 | extend_size = insn.opnd_bytes; |
492 | if (size == 1 && val & BIT(7)) |
493 | extend_val = 0xFF; |
494 | else if (size > 1 && val & BIT(15)) |
495 | extend_val = 0xFF; |
496 | break; |
497 | default: |
498 | /* All other cases has to be covered with the first switch() */ |
499 | WARN_ON_ONCE(1); |
500 | return -EINVAL; |
501 | } |
502 | |
503 | if (extend_size) |
504 | memset(reg, extend_val, extend_size); |
505 | memcpy(reg, &val, size); |
506 | return insn.length; |
507 | } |
508 | |
509 | static bool handle_in(struct pt_regs *regs, int size, int port) |
510 | { |
511 | struct tdx_module_args args = { |
512 | .r10 = TDX_HYPERCALL_STANDARD, |
513 | .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), |
514 | .r12 = size, |
515 | .r13 = PORT_READ, |
516 | .r14 = port, |
517 | }; |
518 | u64 mask = GENMASK(BITS_PER_BYTE * size, 0); |
519 | bool success; |
520 | |
521 | /* |
522 | * Emulate the I/O read via hypercall. More info about ABI can be found |
523 | * in TDX Guest-Host-Communication Interface (GHCI) section titled |
524 | * "TDG.VP.VMCALL<Instruction.IO>". |
525 | */ |
526 | success = !__tdx_hypercall(args: &args); |
527 | |
528 | /* Update part of the register affected by the emulated instruction */ |
529 | regs->ax &= ~mask; |
530 | if (success) |
531 | regs->ax |= args.r11 & mask; |
532 | |
533 | return success; |
534 | } |
535 | |
536 | static bool handle_out(struct pt_regs *regs, int size, int port) |
537 | { |
538 | u64 mask = GENMASK(BITS_PER_BYTE * size, 0); |
539 | |
540 | /* |
541 | * Emulate the I/O write via hypercall. More info about ABI can be found |
542 | * in TDX Guest-Host-Communication Interface (GHCI) section titled |
543 | * "TDG.VP.VMCALL<Instruction.IO>". |
544 | */ |
545 | return !_tdx_hypercall(fn: hcall_func(EXIT_REASON_IO_INSTRUCTION), r12: size, |
546 | PORT_WRITE, r14: port, r15: regs->ax & mask); |
547 | } |
548 | |
549 | /* |
550 | * Emulate I/O using hypercall. |
551 | * |
552 | * Assumes the IO instruction was using ax, which is enforced |
553 | * by the standard io.h macros. |
554 | * |
 * Return the instruction length on success or -EIO on failure.
556 | */ |
557 | static int handle_io(struct pt_regs *regs, struct ve_info *ve) |
558 | { |
559 | u32 exit_qual = ve->exit_qual; |
560 | int size, port; |
561 | bool in, ret; |
562 | |
563 | if (VE_IS_IO_STRING(exit_qual)) |
564 | return -EIO; |
565 | |
566 | in = VE_IS_IO_IN(exit_qual); |
567 | size = VE_GET_IO_SIZE(exit_qual); |
568 | port = VE_GET_PORT_NUM(exit_qual); |
569 | |
570 | |
571 | if (in) |
572 | ret = handle_in(regs, size, port); |
573 | else |
574 | ret = handle_out(regs, size, port); |
575 | if (!ret) |
576 | return -EIO; |
577 | |
578 | return ve_instr_len(ve); |
579 | } |
580 | |
581 | /* |
582 | * Early #VE exception handler. Only handles a subset of port I/O. |
583 | * Intended only for earlyprintk. If failed, return false. |
584 | */ |
585 | __init bool tdx_early_handle_ve(struct pt_regs *regs) |
586 | { |
587 | struct ve_info ve; |
588 | int insn_len; |
589 | |
590 | tdx_get_ve_info(ve: &ve); |
591 | |
592 | if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION) |
593 | return false; |
594 | |
595 | insn_len = handle_io(regs, ve: &ve); |
596 | if (insn_len < 0) |
597 | return false; |
598 | |
599 | regs->ip += insn_len; |
600 | return true; |
601 | } |
602 | |
603 | void tdx_get_ve_info(struct ve_info *ve) |
604 | { |
605 | struct tdx_module_args args = {}; |
606 | |
607 | /* |
608 | * Called during #VE handling to retrieve the #VE info from the |
609 | * TDX module. |
610 | * |
611 | * This has to be called early in #VE handling. A "nested" #VE which |
612 | * occurs before this will raise a #DF and is not recoverable. |
613 | * |
614 | * The call retrieves the #VE info from the TDX module, which also |
615 | * clears the "#VE valid" flag. This must be done before anything else |
616 | * because any #VE that occurs while the valid flag is set will lead to |
617 | * #DF. |
618 | * |
619 | * Note, the TDX module treats virtual NMIs as inhibited if the #VE |
620 | * valid flag is set. It means that NMI=>#VE will not result in a #DF. |
621 | */ |
622 | tdcall(TDG_VP_VEINFO_GET, args: &args); |
623 | |
624 | /* Transfer the output parameters */ |
625 | ve->exit_reason = args.rcx; |
626 | ve->exit_qual = args.rdx; |
627 | ve->gla = args.r8; |
628 | ve->gpa = args.r9; |
629 | ve->instr_len = lower_32_bits(args.r10); |
630 | ve->instr_info = upper_32_bits(args.r10); |
631 | } |
632 | |
633 | /* |
634 | * Handle the user initiated #VE. |
635 | * |
636 | * On success, returns the number of bytes RIP should be incremented (>=0) |
637 | * or -errno on error. |
638 | */ |
639 | static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve) |
640 | { |
641 | switch (ve->exit_reason) { |
642 | case EXIT_REASON_CPUID: |
643 | return handle_cpuid(regs, ve); |
644 | default: |
645 | pr_warn("Unexpected #VE: %lld\n" , ve->exit_reason); |
646 | return -EIO; |
647 | } |
648 | } |
649 | |
650 | static inline bool is_private_gpa(u64 gpa) |
651 | { |
652 | return gpa == cc_mkenc(val: gpa); |
653 | } |
654 | |
655 | /* |
656 | * Handle the kernel #VE. |
657 | * |
658 | * On success, returns the number of bytes RIP should be incremented (>=0) |
659 | * or -errno on error. |
660 | */ |
661 | static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) |
662 | { |
663 | switch (ve->exit_reason) { |
664 | case EXIT_REASON_HLT: |
665 | return handle_halt(ve); |
666 | case EXIT_REASON_MSR_READ: |
667 | return read_msr(regs, ve); |
668 | case EXIT_REASON_MSR_WRITE: |
669 | return write_msr(regs, ve); |
670 | case EXIT_REASON_CPUID: |
671 | return handle_cpuid(regs, ve); |
672 | case EXIT_REASON_EPT_VIOLATION: |
673 | if (is_private_gpa(gpa: ve->gpa)) |
674 | panic(fmt: "Unexpected EPT-violation on private memory." ); |
675 | return handle_mmio(regs, ve); |
676 | case EXIT_REASON_IO_INSTRUCTION: |
677 | return handle_io(regs, ve); |
678 | default: |
679 | pr_warn("Unexpected #VE: %lld\n" , ve->exit_reason); |
680 | return -EIO; |
681 | } |
682 | } |
683 | |
684 | bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) |
685 | { |
686 | int insn_len; |
687 | |
688 | if (user_mode(regs)) |
689 | insn_len = virt_exception_user(regs, ve); |
690 | else |
691 | insn_len = virt_exception_kernel(regs, ve); |
692 | if (insn_len < 0) |
693 | return false; |
694 | |
695 | /* After successful #VE handling, move the IP */ |
696 | regs->ip += insn_len; |
697 | |
698 | return true; |
699 | } |
700 | |
static bool tdx_tlb_flush_required(bool private)
{
	/*
	 * Flushing the TLB on a private->shared transition is the TDX
	 * guest's job; on shared->private it is the VMM's.
	 *
	 * The VMM _can't_ flush private addresses as it can't generate PAs
	 * with the guest's HKID. Shared memory isn't subject to integrity
	 * checking, i.e. the VMM doesn't need to flush for its own
	 * protection.
	 *
	 * Converting shared to private therefore needs no flush here: the
	 * VMM has to do it anyway, e.g. to avoid integrity failures in the
	 * face of a buggy or malicious guest.
	 */
	if (private)
		return false;

	return true;
}
718 | |
static bool tdx_cache_flush_required(void)
{
	/*
	 * AMD SME/SEV can skip the cache flush when HW enforces cache
	 * coherence. TDX has no such capability, so always flush.
	 */
	const bool flush = true;

	return flush;
}
729 | |
730 | /* |
731 | * Notify the VMM about page mapping conversion. More info about ABI |
732 | * can be found in TDX Guest-Host-Communication Interface (GHCI), |
733 | * section "TDG.VP.VMCALL<MapGPA>". |
734 | */ |
735 | static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc) |
736 | { |
737 | /* Retrying the hypercall a second time should succeed; use 3 just in case */ |
738 | const int max_retries_per_page = 3; |
739 | int retry_count = 0; |
740 | |
741 | if (!enc) { |
742 | /* Set the shared (decrypted) bits: */ |
743 | start |= cc_mkdec(val: 0); |
744 | end |= cc_mkdec(val: 0); |
745 | } |
746 | |
747 | while (retry_count < max_retries_per_page) { |
748 | struct tdx_module_args args = { |
749 | .r10 = TDX_HYPERCALL_STANDARD, |
750 | .r11 = TDVMCALL_MAP_GPA, |
751 | .r12 = start, |
752 | .r13 = end - start }; |
753 | |
754 | u64 map_fail_paddr; |
755 | u64 ret = __tdx_hypercall(args: &args); |
756 | |
757 | if (ret != TDVMCALL_STATUS_RETRY) |
758 | return !ret; |
759 | /* |
760 | * The guest must retry the operation for the pages in the |
761 | * region starting at the GPA specified in R11. R11 comes |
762 | * from the untrusted VMM. Sanity check it. |
763 | */ |
764 | map_fail_paddr = args.r11; |
765 | if (map_fail_paddr < start || map_fail_paddr >= end) |
766 | return false; |
767 | |
768 | /* "Consume" a retry without forward progress */ |
769 | if (map_fail_paddr == start) { |
770 | retry_count++; |
771 | continue; |
772 | } |
773 | |
774 | start = map_fail_paddr; |
775 | retry_count = 0; |
776 | } |
777 | |
778 | return false; |
779 | } |
780 | |
781 | /* |
782 | * Inform the VMM of the guest's intent for this physical page: shared with |
783 | * the VMM or private to the guest. The VMM is expected to change its mapping |
784 | * of the page in response. |
785 | */ |
786 | static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc) |
787 | { |
788 | phys_addr_t start = __pa(vaddr); |
789 | phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE); |
790 | |
791 | if (!tdx_map_gpa(start, end, enc)) |
792 | return false; |
793 | |
794 | /* shared->private conversion requires memory to be accepted before use */ |
795 | if (enc) |
796 | return tdx_accept_memory(start, end); |
797 | |
798 | return true; |
799 | } |
800 | |
801 | static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages, |
802 | bool enc) |
803 | { |
804 | /* |
805 | * Only handle shared->private conversion here. |
806 | * See the comment in tdx_early_init(). |
807 | */ |
808 | if (enc) |
809 | return tdx_enc_status_changed(vaddr, numpages, enc); |
810 | return true; |
811 | } |
812 | |
813 | static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages, |
814 | bool enc) |
815 | { |
816 | /* |
817 | * Only handle private->shared conversion here. |
818 | * See the comment in tdx_early_init(). |
819 | */ |
820 | if (!enc) |
821 | return tdx_enc_status_changed(vaddr, numpages, enc); |
822 | return true; |
823 | } |
824 | |
825 | void __init tdx_early_init(void) |
826 | { |
827 | struct tdx_module_args args = { |
828 | .rdx = TDCS_NOTIFY_ENABLES, |
829 | .r9 = -1ULL, |
830 | }; |
831 | u64 cc_mask; |
832 | u32 eax, sig[3]; |
833 | |
834 | cpuid_count(TDX_CPUID_LEAF_ID, count: 0, eax: &eax, ebx: &sig[0], ecx: &sig[2], edx: &sig[1]); |
835 | |
836 | if (memcmp(TDX_IDENT, q: sig, size: sizeof(sig))) |
837 | return; |
838 | |
839 | setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); |
840 | |
841 | /* TSC is the only reliable clock in TDX guest */ |
842 | setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); |
843 | |
844 | cc_vendor = CC_VENDOR_INTEL; |
845 | tdx_parse_tdinfo(cc_mask: &cc_mask); |
846 | cc_set_mask(mask: cc_mask); |
847 | |
848 | /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */ |
849 | tdcall(TDG_VM_WR, args: &args); |
850 | |
851 | /* |
852 | * All bits above GPA width are reserved and kernel treats shared bit |
853 | * as flag, not as part of physical address. |
854 | * |
855 | * Adjust physical mask to only cover valid GPA bits. |
856 | */ |
857 | physical_mask &= cc_mask - 1; |
858 | |
859 | /* |
860 | * The kernel mapping should match the TDX metadata for the page. |
861 | * load_unaligned_zeropad() can touch memory *adjacent* to that which is |
862 | * owned by the caller and can catch even _momentary_ mismatches. Bad |
863 | * things happen on mismatch: |
864 | * |
865 | * - Private mapping => Shared Page == Guest shutdown |
866 | * - Shared mapping => Private Page == Recoverable #VE |
867 | * |
868 | * guest.enc_status_change_prepare() converts the page from |
869 | * shared=>private before the mapping becomes private. |
870 | * |
871 | * guest.enc_status_change_finish() converts the page from |
872 | * private=>shared after the mapping becomes private. |
873 | * |
874 | * In both cases there is a temporary shared mapping to a private page, |
875 | * which can result in a #VE. But, there is never a private mapping to |
876 | * a shared page. |
877 | */ |
878 | x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare; |
879 | x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish; |
880 | |
881 | x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required; |
882 | x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required; |
883 | |
884 | /* |
885 | * TDX intercepts the RDMSR to read the X2APIC ID in the parallel |
886 | * bringup low level code. That raises #VE which cannot be handled |
887 | * there. |
888 | * |
889 | * Intel-TDX has a secure RDMSR hypercall, but that needs to be |
890 | * implemented separately in the low level startup ASM code. |
891 | * Until that is in place, disable parallel bringup for TDX. |
892 | */ |
893 | x86_cpuinit.parallel_bringup = false; |
894 | |
895 | pr_info("Guest detected\n" ); |
896 | } |
897 | |