// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2021-2022 Intel Corporation */

#undef pr_fmt
#define pr_fmt(fmt)	"tdx: " fmt

#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>

/* TDX module Call Leaf IDs */
#define TDX_GET_INFO			1
#define TDX_GET_VEINFO			3
#define TDX_GET_REPORT			4
#define TDX_ACCEPT_PAGE			6
#define TDX_WR				8

/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
#define TDCS_NOTIFY_ENABLES		0x9100000000000010

/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA		0x10001
#define TDVMCALL_REPORT_FATAL_ERROR	0x10003

/* MMIO direction */
#define EPT_READ	0
#define EPT_WRITE	1

/* Port I/O direction */
#define PORT_READ	0
#define PORT_WRITE	1

/* See Exit Qualification for I/O Instructions in VMX documentation */
#define VE_IS_IO_IN(e)		((e) & BIT(3))
#define VE_GET_IO_SIZE(e)	(((e) & GENMASK(2, 0)) + 1)
#define VE_GET_PORT_NUM(e)	((e) >> 16)
#define VE_IS_IO_STRING(e)	((e) & BIT(4))

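/* TD attribute bits, as reported in RDX by TDG.VP.INFO */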
#define ATTR_DEBUG		BIT(0)
#define ATTR_SEPT_VE_DISABLE	BIT(28)

/* TDX Module call error codes */
#define TDCALL_RETURN_CODE(a)	((a) >> 32)
#define TDCALL_INVALID_OPERAND	0xc0000100

#define TDREPORT_SUBTYPE_0	0

/*
 * Wrapper for standard use of __tdx_hypercall with no output aside from
 * return code.
 */
static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = fn,
		.r12 = r12,
		.r13 = r13,
		.r14 = r14,
		.r15 = r15,
	};

	return __tdx_hypercall(&args, 0);
}

/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __tdx_hypercall_failed(void)
{
	instrumentation_begin();
	panic("TDVMCALL failed. TDX module bug?");
}

/*
 * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined
 * independently from but are currently matched 1:1 with VMX EXIT_REASONs.
 * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
 * guest sides of these calls.
 */
static __always_inline u64 hcall_func(u64 exit_reason)
{
	return exit_reason;
}

#ifdef CONFIG_KVM_GUEST
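/*
 * Forward KVM hypercalls from the TDX guest to the host. A non-zero value
 * in R10 marks the TDVMCALL as vendor-specific (here, the KVM hypercall
 * number) rather than a standard GHCI call.
 */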
long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
		       unsigned long p3, unsigned long p4)
{
	struct tdx_hypercall_args args = {
		.r10 = nr,
		.r11 = p1,
		.r12 = p2,
		.r13 = p3,
		.r14 = p4,
	};

	return __tdx_hypercall(&args, 0);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif

/*
 * Used for TDX guests to make calls directly to the TD module. This
 * should only be used for calls that have no legitimate reason to fail
 * or where the kernel can not survive the call failing.
 */
static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
				   struct tdx_module_output *out)
{
	if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
		panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}

/**
 * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
 *                           subtype 0) using TDG.MR.REPORT TDCALL.
 * @reportdata: Address of the input buffer which contains user-defined
 *              REPORTDATA to be included into TDREPORT.
 * @tdreport: Address of the output buffer to store TDREPORT.
 *
 * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
 * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
 * It is used in the TDX guest driver module to get the TDREPORT0.
 *
 * Return 0 on success, -EINVAL for invalid operands, or -EIO on
 * other TDCALL failures.
 */
int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
{
	u64 ret;

	ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport),
				virt_to_phys(reportdata), TDREPORT_SUBTYPE_0,
				0, NULL);
	if (ret) {
		if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
			return -EINVAL;
		return -EIO;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);

static void __noreturn tdx_panic(const char *msg)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = TDVMCALL_REPORT_FATAL_ERROR,
		.r12 = 0, /* Error code: 0 is Panic */
	};
	union {
		/* Define register order according to the GHCI */
		struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };

		char str[64];
	} message;

	/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
	strncpy(message.str, msg, 64);

	args.r8 = message.r8;
	args.r9 = message.r9;
	args.r14 = message.r14;
	args.r15 = message.r15;
	args.rdi = message.rdi;
	args.rsi = message.rsi;
	args.rbx = message.rbx;
	args.rdx = message.rdx;

	/*
	 * This hypercall should never return and it is not safe
	 * to keep the guest running. Call it forever if it
	 * happens to return.
	 */
	while (1)
		__tdx_hypercall(&args, 0);
}

static void tdx_parse_tdinfo(u64 *cc_mask)
{
	struct tdx_module_output out;
	unsigned int gpa_width;
	u64 td_attr;

	/*
	 * TDINFO TDX module call is used to get the TD execution environment
	 * information like GPA width, number of available vcpus, debug mode
	 * information, etc. More details about the ABI can be found in TDX
	 * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
	 * [TDG.VP.INFO].
	 */
	tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);

	/*
	 * The highest bit of a guest physical address is the "sharing" bit.
	 * Set it for shared pages and clear it for private pages.
	 *
	 * The GPA width that comes out of this call is critical. TDX guests
	 * can not meaningfully run without it.
	 */
	gpa_width = out.rcx & GENMASK(5, 0);
	*cc_mask = BIT_ULL(gpa_width - 1);

	/*
	 * The kernel can not handle #VE's when accessing normal kernel
	 * memory. Ensure that no #VE will be delivered for accesses to
	 * TD-private memory. Only VMM-shared memory (MMIO) will #VE.
	 */
	td_attr = out.rdx;
	if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
		const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";

		/* Relax SEPT_VE_DISABLE check for debug TD. */
		if (td_attr & ATTR_DEBUG)
			pr_warn("%s\n", msg);
		else
			tdx_panic(msg);
	}
}

/*
 * The TDX module spec states that #VE may be injected for a limited set of
 * reasons:
 *
 *  - Emulation of the architectural #VE injection on EPT violation;
 *
 *  - As a result of guest TD execution of a disallowed instruction,
 *    a disallowed MSR access, or CPUID virtualization;
 *
 *  - A notification to the guest TD about anomalous behavior;
 *
 * The last one is opt-in and is not used by the kernel.
 *
 * The Intel Software Developer's Manual describes the cases in which the
 * instruction length field can be used in the section titled "Information
 * for VM Exits Due to Instruction Execution".
 *
 * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
 * information if #VE occurred due to instruction execution, but not for EPT
 * violations.
 */
static int ve_instr_len(struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_HLT:
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_CPUID:
	case EXIT_REASON_IO_INSTRUCTION:
		/* It is safe to use ve->instr_len for #VEs due to instructions */
		return ve->instr_len;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * For EPT violations, ve->insn_len is not defined. For those,
		 * the kernel must decode instructions manually and should not
		 * be using this function.
		 */
		WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
		return 0;
	default:
		WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
		return ve->instr_len;
	}
}

static u64 __cpuidle __halt(const bool irq_disabled)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_HLT),
		.r12 = irq_disabled,
	};

	/*
	 * Emulate HLT operation via hypercall. More info about ABI
	 * can be found in TDX Guest-Host-Communication Interface
	 * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
	 *
	 * The VMM uses the "IRQ disabled" param to understand IRQ
	 * enabled status (RFLAGS.IF) of the TD guest and to determine
	 * whether or not it should schedule the halted vCPU if an
	 * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
	 * can keep the vCPU in virtual HLT, even if an IRQ is
	 * pending, without hanging/breaking the guest.
	 */
	return __tdx_hypercall(&args, 0);
}

static int handle_halt(struct ve_info *ve)
{
	const bool irq_disabled = irqs_disabled();

	if (__halt(irq_disabled))
		return -EIO;

	return ve_instr_len(ve);
}

void __cpuidle tdx_safe_halt(void)
{
	const bool irq_disabled = false;

	/*
	 * Use WARN_ONCE() to report the failure.
	 */
	if (__halt(irq_disabled))
		WARN_ONCE(1, "HLT instruction emulation failed\n");
}

static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_MSR_READ),
		.r12 = regs->cx,
	};

	/*
	 * Emulate the MSR read via hypercall. More info about ABI
	 * can be found in TDX Guest-Host-Communication Interface
	 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
	 */
	if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
		return -EIO;

	regs->ax = lower_32_bits(args.r11);
	regs->dx = upper_32_bits(args.r11);
	return ve_instr_len(ve);
}

static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_MSR_WRITE),
		.r12 = regs->cx,
		.r13 = (u64)regs->dx << 32 | regs->ax,
	};

	/*
	 * Emulate the MSR write via hypercall. More info about ABI
	 * can be found in TDX Guest-Host-Communication Interface
	 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
	 */
	if (__tdx_hypercall(&args, 0))
		return -EIO;

	return ve_instr_len(ve);
}

static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_CPUID),
		.r12 = regs->ax,
		.r13 = regs->cx,
	};

	/*
	 * Only allow the VMM to control the range reserved for hypervisor
	 * communication.
	 *
	 * Return all-zeros for any CPUID leaf outside the range. This matches
	 * CPU behaviour for unsupported leaves.
	 */
	if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
		regs->ax = regs->bx = regs->cx = regs->dx = 0;
		return ve_instr_len(ve);
	}

	/*
	 * Emulate the CPUID instruction via a hypercall. More info about
	 * ABI can be found in TDX Guest-Host-Communication Interface
	 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
	 */
	if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
		return -EIO;

	/*
	 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
	 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
	 * So copy the register contents back to pt_regs.
	 */
	regs->ax = args.r12;
	regs->bx = args.r13;
	regs->cx = args.r14;
	regs->dx = args.r15;

	return ve_instr_len(ve);
}

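/*
 * MMIO reads and writes are emulated with the TDG.VP.VMCALL<#VE.RequestMMIO>
 * hypercall; see the GHCI section of the same name for the register ABI.
 */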
static bool mmio_read(int size, unsigned long addr, unsigned long *val)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
		.r12 = size,
		.r13 = EPT_READ,
		.r14 = addr,
		.r15 = *val,
	};

	if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
		return false;
	*val = args.r11;
	return true;
}

static bool mmio_write(int size, unsigned long addr, unsigned long val)
{
	return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
			       EPT_WRITE, addr, val);
}

static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
	unsigned long *reg, val, vaddr;
	char buffer[MAX_INSN_SIZE];
	enum insn_mmio_type mmio;
	struct insn insn = {};
	int size, extend_size;
	u8 extend_val = 0;

	/* Only in-kernel MMIO is supported */
	if (WARN_ON_ONCE(user_mode(regs)))
		return -EFAULT;

	if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
		return -EFAULT;

	if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
		return -EINVAL;

	mmio = insn_decode_mmio(&insn, &size);
	if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
		return -EINVAL;

	if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
		reg = insn_get_modrm_reg_ptr(&insn, regs);
		if (!reg)
			return -EINVAL;
	}

	/*
	 * Reject EPT violation #VEs that split pages.
	 *
	 * MMIO accesses are supposed to be naturally aligned and therefore
	 * never cross page boundaries. Seeing split page accesses indicates
	 * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
	 *
	 * load_unaligned_zeropad() will recover using exception fixups.
	 */
	vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
	if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
		return -EFAULT;

	/* Handle writes first */
	switch (mmio) {
	case INSN_MMIO_WRITE:
		memcpy(&val, reg, size);
		if (!mmio_write(size, ve->gpa, val))
			return -EIO;
		return insn.length;
	case INSN_MMIO_WRITE_IMM:
		val = insn.immediate.value;
		if (!mmio_write(size, ve->gpa, val))
			return -EIO;
		return insn.length;
	case INSN_MMIO_READ:
	case INSN_MMIO_READ_ZERO_EXTEND:
	case INSN_MMIO_READ_SIGN_EXTEND:
		/* Reads are handled below */
		break;
	case INSN_MMIO_MOVS:
	case INSN_MMIO_DECODE_FAILED:
		/*
		 * MMIO was accessed with an instruction that could not be
		 * decoded or handled properly. It was likely not using io.h
		 * helpers or accessed MMIO accidentally.
		 */
		return -EINVAL;
	default:
		WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
		return -EINVAL;
	}

	/* Handle reads */
	if (!mmio_read(size, ve->gpa, &val))
		return -EIO;

	switch (mmio) {
	case INSN_MMIO_READ:
		/* Zero-extend for 32-bit operation */
		extend_size = size == 4 ? sizeof(*reg) : 0;
		break;
	case INSN_MMIO_READ_ZERO_EXTEND:
		/* Zero extend based on operand size */
		extend_size = insn.opnd_bytes;
		break;
	case INSN_MMIO_READ_SIGN_EXTEND:
		/* Sign extend based on operand size */
		extend_size = insn.opnd_bytes;
		if (size == 1 && val & BIT(7))
			extend_val = 0xFF;
		else if (size > 1 && val & BIT(15))
			extend_val = 0xFF;
		break;
	default:
		/* All other cases have to be covered by the first switch() */
		WARN_ON_ONCE(1);
		return -EINVAL;
	}

	if (extend_size)
		memset(reg, extend_val, extend_size);
	memcpy(reg, &val, size);
	return insn.length;
}

static bool handle_in(struct pt_regs *regs, int size, int port)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
		.r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
		.r12 = size,
		.r13 = PORT_READ,
		.r14 = port,
	};
	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
	bool success;

	/*
	 * Emulate the I/O read via hypercall. More info about ABI can be found
	 * in TDX Guest-Host-Communication Interface (GHCI) section titled
	 * "TDG.VP.VMCALL<Instruction.IO>".
	 */
	success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT);

	/* Update part of the register affected by the emulated instruction */
	regs->ax &= ~mask;
	if (success)
		regs->ax |= args.r11 & mask;

	return success;
}

static bool handle_out(struct pt_regs *regs, int size, int port)
{
	u64 mask = GENMASK(BITS_PER_BYTE * size, 0);

	/*
	 * Emulate the I/O write via hypercall. More info about ABI can be found
	 * in TDX Guest-Host-Communication Interface (GHCI) section titled
	 * "TDG.VP.VMCALL<Instruction.IO>".
	 */
	return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
			       PORT_WRITE, port, regs->ax & mask);
}

/*
 * Emulate I/O using hypercall.
 *
 * Assumes the IO instruction was using ax, which is enforced
 * by the standard io.h macros.
 *
 * Return the number of bytes RIP should be incremented on success,
 * or -errno on failure.
 */
static int handle_io(struct pt_regs *regs, struct ve_info *ve)
{
	u32 exit_qual = ve->exit_qual;
	int size, port;
	bool in, ret;

	if (VE_IS_IO_STRING(exit_qual))
		return -EIO;

	in = VE_IS_IO_IN(exit_qual);
	size = VE_GET_IO_SIZE(exit_qual);
	port = VE_GET_PORT_NUM(exit_qual);

	if (in)
		ret = handle_in(regs, size, port);
	else
		ret = handle_out(regs, size, port);
	if (!ret)
		return -EIO;

	return ve_instr_len(ve);
}

/*
 * Early #VE exception handler. Only handles a subset of port I/O.
 * Intended only for earlyprintk. If it fails, return false.
 */
__init bool tdx_early_handle_ve(struct pt_regs *regs)
{
	struct ve_info ve;
	int insn_len;

	tdx_get_ve_info(&ve);

	if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
		return false;

	insn_len = handle_io(regs, &ve);
	if (insn_len < 0)
		return false;

	regs->ip += insn_len;
	return true;
}

void tdx_get_ve_info(struct ve_info *ve)
{
	struct tdx_module_output out;

	/*
	 * Called during #VE handling to retrieve the #VE info from the
	 * TDX module.
	 *
	 * This has to be called early in #VE handling. A "nested" #VE which
	 * occurs before this will raise a #DF and is not recoverable.
	 *
	 * The call retrieves the #VE info from the TDX module, which also
	 * clears the "#VE valid" flag. This must be done before anything else
	 * because any #VE that occurs while the valid flag is set will lead to
	 * #DF.
	 *
	 * Note, the TDX module treats virtual NMIs as inhibited if the #VE
	 * valid flag is set. It means that NMI=>#VE will not result in a #DF.
	 */
	tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);

	/* Transfer the output parameters */
	ve->exit_reason = out.rcx;
	ve->exit_qual = out.rdx;
	ve->gla = out.r8;
	ve->gpa = out.r9;
	ve->instr_len = lower_32_bits(out.r10);
	ve->instr_info = upper_32_bits(out.r10);
}

/*
 * Handle the user initiated #VE.
 *
 * On success, returns the number of bytes RIP should be incremented (>=0)
 * or -errno on error.
 */
static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_CPUID:
		return handle_cpuid(regs, ve);
	default:
		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
		return -EIO;
	}
}

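/*
 * A GPA is private if its shared bit (cc_mask) is clear: cc_mkenc() clears
 * the shared bit, so a private GPA is left unchanged by it.
 */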
static inline bool is_private_gpa(u64 gpa)
{
	return gpa == cc_mkenc(gpa);
}

/*
 * Handle the kernel #VE.
 *
 * On success, returns the number of bytes RIP should be incremented (>=0)
 * or -errno on error.
 */
static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_HLT:
		return handle_halt(ve);
	case EXIT_REASON_MSR_READ:
		return read_msr(regs, ve);
	case EXIT_REASON_MSR_WRITE:
		return write_msr(regs, ve);
	case EXIT_REASON_CPUID:
		return handle_cpuid(regs, ve);
	case EXIT_REASON_EPT_VIOLATION:
		if (is_private_gpa(ve->gpa))
			panic("Unexpected EPT-violation on private memory.");
		return handle_mmio(regs, ve);
	case EXIT_REASON_IO_INSTRUCTION:
		return handle_io(regs, ve);
	default:
		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
		return -EIO;
	}
}

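/*
 * Central #VE dispatcher. Returns true if the #VE was handled and RIP was
 * advanced past the faulting instruction, false otherwise.
 */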
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
	int insn_len;

	if (user_mode(regs))
		insn_len = virt_exception_user(regs, ve);
	else
		insn_len = virt_exception_kernel(regs, ve);
	if (insn_len < 0)
		return false;

	/* After successful #VE handling, move the IP */
	regs->ip += insn_len;

	return true;
}

static bool tdx_tlb_flush_required(bool private)
{
	/*
	 * TDX guest is responsible for flushing TLB on private->shared
	 * transition. VMM is responsible for flushing on shared->private.
	 *
	 * The VMM _can't_ flush private addresses as it can't generate PAs
	 * with the guest's HKID. Shared memory isn't subject to integrity
	 * checking, i.e. the VMM doesn't need to flush for its own protection.
	 *
	 * There's no need to flush when converting from shared to private,
	 * as flushing is the VMM's responsibility in this case, e.g. it must
	 * flush to avoid integrity failures in the face of a buggy or
	 * malicious guest.
	 */
	return !private;
}

static bool tdx_cache_flush_required(void)
{
	/*
	 * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
	 * TDX doesn't have such capability.
	 *
	 * Flush cache unconditionally.
	 */
	return true;
}

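/*
 * Try to accept one naturally aligned chunk of 'len' bytes at the given page
 * size via TDX_ACCEPT_PAGE. On success, advance *start past the accepted
 * chunk.
 */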
static bool try_accept_one(phys_addr_t *start, unsigned long len,
			   enum pg_level pg_level)
{
	unsigned long accept_size = page_level_size(pg_level);
	u64 tdcall_rcx;
	u8 page_size;

	if (!IS_ALIGNED(*start, accept_size))
		return false;

	if (len < accept_size)
		return false;

	/*
	 * Pass the page physical address to the TDX module to accept the
	 * pending, private page.
	 *
	 * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
	 */
	switch (pg_level) {
	case PG_LEVEL_4K:
		page_size = 0;
		break;
	case PG_LEVEL_2M:
		page_size = 1;
		break;
	case PG_LEVEL_1G:
		page_size = 2;
		break;
	default:
		return false;
	}

	tdcall_rcx = *start | page_size;
	if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
		return false;

	*start += accept_size;
	return true;
}

/*
 * Inform the VMM of the guest's intent for this physical page: shared with
 * the VMM or private to the guest. The VMM is expected to change its mapping
 * of the page in response.
 */
static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
{
	phys_addr_t start = __pa(vaddr);
	phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);

	if (!enc) {
		/* Set the shared (decrypted) bits: */
		start |= cc_mkdec(0);
		end |= cc_mkdec(0);
	}

	/*
	 * Notify the VMM about page mapping conversion. More info about ABI
	 * can be found in TDX Guest-Host-Communication Interface (GHCI),
	 * section "TDG.VP.VMCALL<MapGPA>".
	 */
	if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))
		return false;

	/* A private->shared conversion requires only the MapGPA call */
	if (!enc)
		return true;

	/*
	 * For shared->private conversion, accept the page using
	 * TDX_ACCEPT_PAGE TDX module call.
	 */
	while (start < end) {
		unsigned long len = end - start;

		/*
		 * Try larger accepts first. It gives the VMM a chance to keep
		 * 1G/2M SEPT entries where possible and speeds up the process
		 * by cutting the number of hypercalls (if successful).
		 */

		if (try_accept_one(&start, len, PG_LEVEL_1G))
			continue;

		if (try_accept_one(&start, len, PG_LEVEL_2M))
			continue;

		if (!try_accept_one(&start, len, PG_LEVEL_4K))
			return false;
	}

	return true;
}

void __init tdx_early_init(void)
{
	u64 cc_mask;
	u32 eax, sig[3];

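	/*
	 * The signature is spread across EBX, EDX and ECX (in that order);
	 * store them contiguously in sig[] so the memcmp() against
	 * TDX_IDENT ("IntelTDX    ") works.
	 */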
	cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);

	if (memcmp(TDX_IDENT, sig, sizeof(sig)))
		return;

	setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);

	cc_set_vendor(CC_VENDOR_INTEL);
	tdx_parse_tdinfo(&cc_mask);
	cc_set_mask(cc_mask);

	/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
	tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL);

	/*
	 * All bits above GPA width are reserved and kernel treats shared bit
	 * as flag, not as part of physical address.
	 *
	 * Adjust physical mask to only cover valid GPA bits.
	 */
	physical_mask &= cc_mask - 1;

	x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
	x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
	x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed;

	pr_info("Guest detected\n");
}