1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 /*
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * Pentium III FXSR, SSE support
24 * Gareth Hughes <gareth@valinux.com>, May 2000
25 */
26
27 #include <xen/init.h>
28 #include <xen/sched.h>
29 #include <xen/lib.h>
30 #include <xen/err.h>
31 #include <xen/errno.h>
32 #include <xen/mm.h>
33 #include <xen/param.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <xen/guest_access.h>
37 #include <asm/regs.h>
38 #include <xen/delay.h>
39 #include <xen/event.h>
40 #include <xen/spinlock.h>
41 #include <xen/irq.h>
42 #include <xen/perfc.h>
43 #include <xen/softirq.h>
44 #include <xen/domain_page.h>
45 #include <xen/symbols.h>
46 #include <xen/iocap.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <xen/virtual_region.h>
52 #include <xen/watchdog.h>
53 #include <xen/livepatch.h>
54 #include <asm/system.h>
55 #include <asm/io.h>
56 #include <asm/atomic.h>
57 #include <xen/bitops.h>
58 #include <asm/desc.h>
59 #include <asm/debugreg.h>
60 #include <asm/smp.h>
61 #include <asm/flushtlb.h>
62 #include <asm/uaccess.h>
63 #include <asm/i387.h>
64 #include <asm/xstate.h>
65 #include <asm/debugger.h>
66 #include <asm/msr.h>
67 #include <asm/nmi.h>
68 #include <asm/shared.h>
69 #include <asm/x86_emulate.h>
70 #include <asm/traps.h>
71 #include <asm/hvm/vpt.h>
72 #include <asm/hypercall.h>
73 #include <asm/mce.h>
74 #include <asm/apic.h>
75 #include <asm/mc146818rtc.h>
76 #include <asm/hpet.h>
77 #include <asm/vpmu.h>
78 #include <public/arch-x86/cpuid.h>
79 #include <public/hvm/params.h>
80 #include <asm/cpuid.h>
81 #include <xsm/xsm.h>
82 #include <asm/pv/traps.h>
83 #include <asm/pv/mm.h>
84
85 /*
86 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
 *  fatal:  Xen prints a diagnostic message and then hangs.
88 * dom0: The NMI is virtualised to DOM0.
89 * ignore: The NMI error is cleared and ignored.
90 */
91 #ifdef NDEBUG
92 static char __read_mostly opt_nmi[10] = "dom0";
93 #else
94 static char __read_mostly opt_nmi[10] = "fatal";
95 #endif
96 string_param("nmi", opt_nmi);
97
98 DEFINE_PER_CPU(uint64_t, efer);
99 static DEFINE_PER_CPU(unsigned long, last_extable_addr);
100
101 DEFINE_PER_CPU_READ_MOSTLY(seg_desc_t *, gdt);
102 DEFINE_PER_CPU_READ_MOSTLY(l1_pgentry_t, gdt_l1e);
103 #ifdef CONFIG_PV32
104 DEFINE_PER_CPU_READ_MOSTLY(seg_desc_t *, compat_gdt);
105 DEFINE_PER_CPU_READ_MOSTLY(l1_pgentry_t, compat_gdt_l1e);
106 #endif
107
108 /* Master table, used by CPU0. */
109 idt_entry_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
110 idt_table[IDT_ENTRIES];
111
112 /* Pointer to the IDT of every CPU. */
113 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
114
115 /*
116 * The TSS is smaller than a page, but we give it a full page to avoid
117 * adjacent per-cpu data leaking via Meltdown when XPTI is in use.
118 */
119 DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_page, tss_page);
120
121 static int debug_stack_lines = 20;
122 integer_param("debug_stack_lines", debug_stack_lines);
123
124 static bool opt_ler;
125 boolean_param("ler", opt_ler);
126
127 /* LastExceptionFromIP on this hardware. Zero if LER is not in use. */
128 unsigned int __read_mostly ler_msr;
129
130 const unsigned int nmi_cpu;
131
132 #define stack_words_per_line 4
133 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
134
135 static void do_trap(struct cpu_user_regs *regs);
136 static void do_reserved_trap(struct cpu_user_regs *regs);
137
138 void (* const exception_table[TRAP_nr])(struct cpu_user_regs *regs) = {
139 [TRAP_divide_error] = do_trap,
140 [TRAP_debug] = do_debug,
141 [TRAP_nmi] = (void *)do_nmi,
142 [TRAP_int3] = do_int3,
143 [TRAP_overflow] = do_trap,
144 [TRAP_bounds] = do_trap,
145 [TRAP_invalid_op] = do_invalid_op,
146 [TRAP_no_device] = do_device_not_available,
147 [TRAP_double_fault] = do_reserved_trap,
148 [TRAP_copro_seg] = do_reserved_trap,
149 [TRAP_invalid_tss] = do_trap,
150 [TRAP_no_segment] = do_trap,
151 [TRAP_stack_error] = do_trap,
152 [TRAP_gp_fault] = do_general_protection,
153 [TRAP_page_fault] = do_page_fault,
154 [TRAP_spurious_int] = do_reserved_trap,
155 [TRAP_copro_error] = do_trap,
156 [TRAP_alignment_check] = do_trap,
157 [TRAP_machine_check] = (void *)do_machine_check,
158 [TRAP_simd_error] = do_trap,
159 [TRAP_virtualisation] = do_reserved_trap,
160 [X86_EXC_CP] = do_entry_CP,
161 [X86_EXC_CP + 1 ...
162 (ARRAY_SIZE(exception_table) - 1)] = do_reserved_trap,
163 };
164
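/*
 * Dump the code bytes around regs->rip: up to 8 bytes before and 16 bytes
 * from the instruction pointer onwards, marking any bytes which could not be
 * read.  Only used for faults in Xen (non-guest) context.
 */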
void show_code(const struct cpu_user_regs *regs)
166 {
167 unsigned char insns_before[8] = {}, insns_after[16] = {};
168 unsigned int i, tmp, missing_before, missing_after;
169
170 if ( guest_mode(regs) )
171 return;
172
173 stac();
174
175 /*
176 * Copy forward from regs->rip. In the case of a fault, %ecx contains the
177 * number of bytes remaining to copy.
178 */
179 asm volatile ("1: rep movsb; 2:"
180 _ASM_EXTABLE(1b, 2b)
181 : "=&c" (missing_after),
182 "=&D" (tmp), "=&S" (tmp)
183 : "0" (ARRAY_SIZE(insns_after)),
184 "1" (insns_after),
185 "2" (regs->rip));
186
187 /*
188 * Copy backwards from regs->rip - 1. In the case of a fault, %ecx
189 * contains the number of bytes remaining to copy.
190 */
191 asm volatile ("std;"
192 "1: rep movsb;"
193 "2: cld;"
194 _ASM_EXTABLE(1b, 2b)
195 : "=&c" (missing_before),
196 "=&D" (tmp), "=&S" (tmp)
197 : "0" (ARRAY_SIZE(insns_before)),
198 "1" (insns_before + ARRAY_SIZE(insns_before) - 1),
199 "2" (regs->rip - 1));
200 clac();
201
202 printk("Xen code around <%p> (%ps)%s:\n",
203 _p(regs->rip), _p(regs->rip),
204 (missing_before || missing_after) ? " [fault on access]" : "");
205
206 /* Print bytes from insns_before[]. */
207 for ( i = 0; i < ARRAY_SIZE(insns_before); ++i )
208 {
209 if ( i < missing_before )
210 printk(" --");
211 else
212 printk(" %02x", insns_before[i]);
213 }
214
215 /* Print the byte under %rip. */
216 if ( missing_after != ARRAY_SIZE(insns_after) )
217 printk(" <%02x>", insns_after[0]);
218 else
219 printk(" <-->");
220
221 /* Print bytes from insns_after[]. */
222 for ( i = 1; i < ARRAY_SIZE(insns_after); ++i )
223 {
224 if ( i < (ARRAY_SIZE(insns_after) - missing_after) )
225 printk(" %02x", insns_after[i]);
226 else
227 printk(" --");
228 }
229
230 printk("\n");
231 }
232
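/* Dump the stack of a 32-bit PV guest, eight 32-bit words per line. */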
static void compat_show_guest_stack(struct vcpu *v,
234 const struct cpu_user_regs *regs,
235 int debug_stack_lines)
236 {
237 unsigned int i, *stack, addr, mask = STACK_SIZE;
238 void *stack_page = NULL;
239
240 stack = (unsigned int *)(unsigned long)regs->esp;
241 printk("Guest stack trace from esp=%08lx:\n ", (unsigned long)stack);
242
243 if ( !__compat_access_ok(v->domain, stack, sizeof(*stack)) )
244 {
245 printk("Guest-inaccessible memory.\n");
246 return;
247 }
248
249 if ( v != current )
250 {
251 struct vcpu *vcpu;
252 unsigned long mfn;
253
254 ASSERT(guest_kernel_mode(v, regs));
255 mfn = read_cr3() >> PAGE_SHIFT;
256 for_each_vcpu( v->domain, vcpu )
257 if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn )
258 break;
259 if ( !vcpu )
260 {
261 stack_page = stack = do_page_walk(v, (unsigned long)stack);
262 if ( (unsigned long)stack < PAGE_SIZE )
263 {
264 printk("Inaccessible guest memory.\n");
265 return;
266 }
267 mask = PAGE_SIZE;
268 }
269 }
270
271 for ( i = 0; i < debug_stack_lines * 8; i++ )
272 {
273 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
274 break;
275 if ( __get_user(addr, stack) )
276 {
277 if ( i != 0 )
278 printk("\n ");
279 printk("Fault while accessing guest memory.");
280 i = 1;
281 break;
282 }
283 if ( (i != 0) && ((i % 8) == 0) )
284 printk("\n ");
285 printk(" %08x", addr);
286 stack++;
287 }
288
289 UNMAP_DOMAIN_PAGE(stack_page);
290
291 if ( i == 0 )
292 printk("Stack empty.");
293 printk("\n");
294 }
295
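/*
 * Dump the guest stack.  HVM guests are skipped, as we can't make any
 * assumptions about their stack layout.
 */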
static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
297 {
298 int i;
299 unsigned long *stack, addr;
300 unsigned long mask = STACK_SIZE;
301 void *stack_page = NULL;
302
303 /* Avoid HVM as we don't know what the stack looks like. */
304 if ( is_hvm_vcpu(v) )
305 return;
306
307 if ( is_pv_32bit_vcpu(v) )
308 {
309 compat_show_guest_stack(v, regs, debug_stack_lines);
310 return;
311 }
312
313 stack = (unsigned long *)regs->rsp;
314 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
315
316 if ( !access_ok(stack, sizeof(*stack)) )
317 {
318 printk("Guest-inaccessible memory.\n");
319 return;
320 }
321
322 if ( v != current )
323 {
324 struct vcpu *vcpu;
325
326 ASSERT(guest_kernel_mode(v, regs));
327 vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
328 if ( !vcpu )
329 {
330 stack_page = stack = do_page_walk(v, (unsigned long)stack);
331 if ( (unsigned long)stack < PAGE_SIZE )
332 {
333 printk("Inaccessible guest memory.\n");
334 return;
335 }
336 mask = PAGE_SIZE;
337 }
338 }
339
340 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
341 {
342 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
343 break;
344 if ( __get_user(addr, stack) )
345 {
346 if ( i != 0 )
347 printk("\n ");
348 printk("Fault while accessing guest memory.");
349 i = 1;
350 break;
351 }
352 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
353 printk("\n ");
354 printk(" %p", _p(addr));
355 stack++;
356 }
357
358 UNMAP_DOMAIN_PAGE(stack_page);
359
360 if ( i == 0 )
361 printk("Stack empty.");
362 printk("\n");
363 }
364
365 /*
366 * Notes for get_{stack,shstk}*_bottom() helpers
367 *
368 * Stack pages 1 - 4:
369 * These are all 1-page IST stacks. Each of these stacks have an exception
370 * frame and saved register state at the top. The interesting bound for a
371 * trace is the word adjacent to this, while the bound for a dump is the
372 * very top, including the exception frame.
373 *
374 * Stack pages 0 and 5:
375 * Shadow stacks. These are mapped read-only, and used by CET-SS capable
376 * processors. They will never contain regular stack data.
377 *
378 * Stack pages 6 and 7:
379 * These form the primary stack, and have a cpu_info at the top. For a
380 * trace, the interesting bound is adjacent to the cpu_info, while for a
381 * dump, the entire cpu_info is interesting.
382 *
383 * For the cases where the stack should not be inspected, pretend that the
384 * passed stack pointer is already out of reasonable bounds.
385 */
unsigned long get_stack_trace_bottom(unsigned long sp)
387 {
388 switch ( get_stack_page(sp) )
389 {
390 case 1 ... 4:
391 return ROUNDUP(sp, PAGE_SIZE) -
392 offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
393
394 case 6 ... 7:
395 return ROUNDUP(sp, STACK_SIZE) -
396 sizeof(struct cpu_info) - sizeof(unsigned long);
397
398 default:
399 return sp - sizeof(unsigned long);
400 }
401 }
402
static unsigned long get_shstk_bottom(unsigned long sp)
404 {
405 switch ( get_stack_page(sp) )
406 {
407 #ifdef CONFIG_XEN_SHSTK
408 case 0: return ROUNDUP(sp, IST_SHSTK_SIZE) - sizeof(unsigned long);
409 case 5: return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
410 #endif
411 default: return sp - sizeof(unsigned long);
412 }
413 }
414
unsigned long get_stack_dump_bottom(unsigned long sp)
416 {
417 switch ( get_stack_page(sp) )
418 {
419 case 1 ... 4:
420 return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
421
422 case 6 ... 7:
423 return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
424
425 default:
426 return sp - sizeof(unsigned long);
427 }
428 }
429
430 #if !defined(CONFIG_FRAME_POINTER)
431
432 /*
433 * Stack trace from pointers found in stack, unaided by frame pointers. For
434 * caller convenience, this has the same prototype as its alternative, and
435 * simply ignores the base pointer parameter.
436 */
static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp)
438 {
439 unsigned long *stack = (unsigned long *)sp, addr;
440 unsigned long *bottom = (unsigned long *)get_stack_trace_bottom(sp);
441
442 while ( stack <= bottom )
443 {
444 addr = *stack++;
445 if ( is_active_kernel_text(addr) )
446 printk(" [<%p>] S %pS\n", _p(addr), _p(addr));
447 }
448 }
449
450 #else
451
452 /* Stack trace from frames in the stack, using frame pointers */
static void _show_trace(unsigned long sp, unsigned long bp)
454 {
455 unsigned long *frame, next, addr;
456
457 /* Bounds for range of valid frame pointer. */
458 unsigned long low = sp, high = get_stack_trace_bottom(sp);
459
460 /* The initial frame pointer. */
461 next = bp;
462
463 for ( ; ; )
464 {
465 /* Valid frame pointer? */
466 if ( (next < low) || (next >= high) )
467 {
468 /*
469 * Exception stack frames have a different layout, denoted by an
470 * inverted frame pointer.
471 */
472 next = ~next;
473 if ( (next < low) || (next >= high) )
474 break;
475 frame = (unsigned long *)next;
476 next = frame[0];
477 addr = frame[(offsetof(struct cpu_user_regs, rip) -
478 offsetof(struct cpu_user_regs, rbp))
479 / BYTES_PER_LONG];
480 }
481 else
482 {
483 /* Ordinary stack frame. */
484 frame = (unsigned long *)next;
485 next = frame[0];
486 addr = frame[1];
487 }
488
489 printk(" [<%p>] F %pS\n", _p(addr), _p(addr));
490
491 low = (unsigned long)&frame[2];
492 }
493 }
494
495 #endif
496
static void show_trace(const struct cpu_user_regs *regs)
498 {
499 unsigned long *sp = ESP_BEFORE_EXCEPTION(regs), tos = 0;
500 bool fault = false;
501
502 printk("Xen call trace:\n");
503
504 /* Guarded read of the stack top. */
505 asm ( "1: mov %[data], %[tos]; 2:\n"
506 ".pushsection .fixup,\"ax\"\n"
507 "3: movb $1, %[fault]; jmp 2b\n"
508 ".popsection\n"
509 _ASM_EXTABLE(1b, 3b)
510 : [tos] "+r" (tos), [fault] "+qm" (fault) : [data] "m" (*sp) );
511
512 /*
513 * If RIP looks sensible, or the top of the stack doesn't, print RIP at
514 * the top of the stack trace.
515 */
516 if ( is_active_kernel_text(regs->rip) ||
517 !is_active_kernel_text(tos) )
518 printk(" [<%p>] R %pS\n", _p(regs->rip), _p(regs->rip));
519
520 if ( fault )
521 {
522 printk(" [Fault on access]\n");
523 return;
524 }
525
526 /*
527 * If RIP looks bad or the top of the stack looks good, log the top of
528 * stack as well. Perhaps we followed a wild function pointer, or we're
529 * in a function without frame pointer, or in a function prologue before
530 * the frame pointer gets set up? Let's assume the top of the stack is a
531 * return address; print it and skip past so _show_trace() doesn't print
532 * it again.
533 */
534 if ( !is_active_kernel_text(regs->rip) ||
535 is_active_kernel_text(tos) )
536 {
537 printk(" [<%p>] S %pS\n", _p(tos), _p(tos));
538 sp++;
539 }
540
541 _show_trace((unsigned long)sp, regs->rbp);
542
543 printk("\n");
544 }
545
void show_stack(const struct cpu_user_regs *regs)
547 {
548 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), *stack_bottom, addr;
549 int i;
550
551 if ( guest_mode(regs) )
552 return show_guest_stack(current, regs);
553
554 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
555
556 stack_bottom = _p(get_stack_dump_bottom(regs->rsp));
557
558 for ( i = 0; i < (debug_stack_lines*stack_words_per_line) &&
559 (stack <= stack_bottom); i++ )
560 {
561 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
562 printk("\n ");
563 addr = *stack++;
564 printk(" %p", _p(addr));
565 }
566 if ( i == 0 )
567 printk("Stack empty.");
568 printk("\n");
569
570 show_trace(regs);
571 }
572
void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs)
574 {
575 unsigned long esp = regs->rsp;
576 unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1);
577 #ifdef MEMORY_GUARD
578 unsigned long esp_top, esp_bottom;
579 #endif
580
581 if ( _p(curr_stack_base) != stack_base[cpu] )
582 printk("Current stack base %p differs from expected %p\n",
583 _p(curr_stack_base), stack_base[cpu]);
584
585 #ifdef MEMORY_GUARD
586 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
587 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
588
589 printk("Valid stack range: %p-%p, sp=%p, tss.rsp0=%p\n",
590 (void *)esp_top, (void *)esp_bottom, (void *)esp,
591 (void *)per_cpu(tss_page, cpu).tss.rsp0);
592
593 /*
594 * Trigger overflow trace if %esp is anywhere within the guard page, or
595 * with fewer than 512 bytes remaining on the primary stack.
596 */
597 if ( (esp > (esp_top + 512)) ||
598 (esp < (esp_top - PAGE_SIZE)) )
599 {
600 printk("No stack overflow detected. Skipping stack trace.\n");
601 return;
602 }
603
604 if ( esp < esp_top )
605 esp = esp_top;
606
607 printk("Xen stack overflow (dumping trace %p-%p):\n",
608 (void *)esp, (void *)esp_bottom);
609
610 _show_trace(esp, regs->rbp);
611
612 printk("\n");
613 #endif
614 }
615
void show_execution_state(const struct cpu_user_regs *regs)
617 {
618 /* Prevent interleaving of output. */
619 unsigned long flags = console_lock_recursive_irqsave();
620
621 show_registers(regs);
622 show_code(regs);
623 show_stack(regs);
624
625 console_unlock_recursive_irqrestore(flags);
626 }
627
void vcpu_show_execution_state(struct vcpu *v)
629 {
630 unsigned long flags;
631
632 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
633 v->domain->domain_id, v->vcpu_id);
634
635 if ( v == current )
636 {
637 show_execution_state(guest_cpu_user_regs());
638 return;
639 }
640
641 vcpu_pause(v); /* acceptably dangerous */
642
643 /* Prevent interleaving of output. */
644 flags = console_lock_recursive_irqsave();
645
646 vcpu_show_registers(v);
647 if ( guest_kernel_mode(v, &v->arch.user_regs) )
648 show_guest_stack(v, &v->arch.user_regs);
649
650 console_unlock_recursive_irqrestore(flags);
651
652 vcpu_unpause(v);
653 }
654
655 static cpumask_t show_state_mask;
656 static bool opt_show_all;
657 boolean_param("async-show-all", opt_show_all);
658
static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
660 {
661 if ( !cpumask_test_cpu(cpu, &show_state_mask) )
662 return 0;
663
664 if ( opt_show_all )
665 show_execution_state(regs);
666 else
667 printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs,
668 regs->rip, guest_mode(regs) ? NULL : _p(regs->rip));
669 cpumask_clear_cpu(cpu, &show_state_mask);
670
671 return 1;
672 }
673
const char *trapstr(unsigned int trapnr)
675 {
676 static const char * const strings[] = {
677 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
678 "invalid opcode", "device not available", "double fault",
679 "coprocessor segment", "invalid tss", "segment not found",
680 "stack error", "general protection fault", "page fault",
681 "spurious interrupt", "coprocessor error", "alignment check",
682 "machine check", "simd error", "virtualisation exception"
683 };
684
685 return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
686 }
687
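/*
 * Map an exception vector to its architectural mnemonic (e.g. "#GP", "#PF"),
 * or "???" for vectors without one.
 */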
static const char *vec_name(unsigned int vec)
689 {
690 static const char names[][4] = {
691 #define P(x) [X86_EXC_ ## x] = "#" #x
692 #define N(x) [X86_EXC_ ## x] = #x
693 P(DE), P(DB), N(NMI), P(BP), P(OF), P(BR), P(UD), P(NM),
694 P(DF), N(CSO), P(TS), P(NP), P(SS), P(GP), P(PF), N(SPV),
695 P(MF), P(AC), P(MC), P(XM), P(VE), P(CP),
696 P(HV), P(VC), P(SX),
697 #undef N
698 #undef P
699 };
700
701 return (vec < ARRAY_SIZE(names) && names[vec][0]) ? names[vec] : "???";
702 }
703
704 /*
705 * This is called for faults at very unexpected times (e.g., when interrupts
706 * are disabled). In such situations we can't do much that is safe. We try to
707 * print out some tracing and then we just spin.
708 */
void fatal_trap(const struct cpu_user_regs *regs, bool show_remote)
710 {
711 static DEFINE_PER_CPU(char, depth);
712 unsigned int trapnr = regs->entry_vector;
713
714 /* Set AC to reduce chance of further SMAP faults */
715 stac();
716
717 /*
718 * In some cases, we can end up in a vicious cycle of fatal_trap()s
719 * within fatal_trap()s. We give the problem a couple of iterations to
720 * bottom out, and then we just panic.
721 */
722 if ( ++this_cpu(depth) < 3 )
723 {
724 watchdog_disable();
725 console_start_sync();
726
727 show_execution_state(regs);
728
729 if ( trapnr == TRAP_page_fault )
730 show_page_walk(read_cr2());
731
732 if ( show_remote )
733 {
734 unsigned int msecs, pending;
735
736 cpumask_andnot(&show_state_mask, &cpu_online_map,
737 cpumask_of(smp_processor_id()));
738 set_nmi_callback(nmi_show_execution_state);
739 /* Ensure new callback is set before sending out the NMI. */
740 smp_wmb();
741 smp_send_nmi_allbutself();
742
743 /* Wait at most 10ms for some other CPU to respond. */
744 msecs = 10;
745 pending = cpumask_weight(&show_state_mask);
746 while ( pending && msecs-- )
747 {
748 unsigned int left;
749
750 mdelay(1);
751 left = cpumask_weight(&show_state_mask);
752 if ( left < pending )
753 {
754 pending = left;
755 msecs = 10;
756 }
757 }
758 }
759 }
760
761 panic("FATAL TRAP: vec %u, %s[%04x]%s\n",
762 trapnr, vec_name(trapnr), regs->error_code,
763 (regs->eflags & X86_EFLAGS_IF) ? "" : " IN INTERRUPT CONTEXT");
764 }
765
static void do_reserved_trap(struct cpu_user_regs *regs)
767 {
768 unsigned int trapnr = regs->entry_vector;
769
770 if ( debugger_trap_fatal(trapnr, regs) )
771 return;
772
773 show_execution_state(regs);
774 panic("FATAL RESERVED TRAP: vec %u, %s[%04x]\n",
775 trapnr, vec_name(trapnr), regs->error_code);
776 }
777
static void extable_shstk_fixup(struct cpu_user_regs *regs, unsigned long fixup)
779 {
780 unsigned long ssp, *ptr, *base;
781
782 asm ( "rdsspq %0" : "=r" (ssp) : "0" (1) );
783 if ( ssp == 1 )
784 return;
785
786 ptr = _p(ssp);
787 base = _p(get_shstk_bottom(ssp));
788
789 for ( ; ptr < base; ++ptr )
790 {
791 /*
792 * Search for %rip. The shstk currently looks like this:
793 *
794 * ... [Likely pointed to by SSP]
795 * %cs [== regs->cs]
796 * %rip [== regs->rip]
797 * SSP [Likely points to 3 slots higher, above %cs]
798 * ... [call tree to this function, likely 2/3 slots]
799 *
800 * and we want to overwrite %rip with fixup. There are two
801 * complications:
 * 1) We can't depend on SSP values, because they won't differ by 3
803 * slots if the exception is taken on an IST stack.
804 * 2) There are synthetic (unrealistic but not impossible) scenarios
805 * where %rip can end up in the call tree to this function, so we
806 * can't check against regs->rip alone.
807 *
808 * Check for both regs->rip and regs->cs matching.
809 */
810 if ( ptr[0] == regs->rip && ptr[1] == regs->cs )
811 {
812 asm ( "wrssq %[fix], %[stk]"
813 : [stk] "=m" (ptr[0])
814 : [fix] "r" (fixup) );
815 return;
816 }
817 }
818
819 /*
820 * We failed to locate and fix up the shadow IRET frame. This could be
821 * due to shadow stack corruption, or bad logic above. We cannot continue
822 * executing the interrupted context.
823 */
824 BUG();
825 }
826
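/*
 * Search the exception table for a fixup matching regs->rip.  If one is
 * found, rewrite regs->rip (and, with CET-SS active, the saved %rip in the
 * shadow IRET frame) to continue at the fixup address, and return true.
 */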
static bool extable_fixup(struct cpu_user_regs *regs, bool print)
828 {
829 unsigned long fixup = search_exception_table(regs);
830
831 if ( unlikely(fixup == 0) )
832 return false;
833
834 /*
835 * Don't use dprintk() because the __FILE__ reference is unhelpful.
836 * Can currently be triggered by guests. Make sure we ratelimit.
837 */
838 if ( IS_ENABLED(CONFIG_DEBUG) && print )
839 printk(XENLOG_GUEST XENLOG_WARNING "Fixup %s[%04x]: %p [%ps] -> %p\n",
840 vec_name(regs->entry_vector), regs->error_code,
841 _p(regs->rip), _p(regs->rip), _p(fixup));
842
843 if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
844 extable_shstk_fixup(regs, fixup);
845
846 regs->rip = fixup;
847 this_cpu(last_extable_addr) = regs->rip;
848
849 return true;
850 }
851
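/*
 * Common handler for exception vectors without a dedicated handler.  Guest
 * faults are reflected back into the guest; faults in Xen context are fixed
 * up via the exception table where possible, and are fatal otherwise.
 */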
static void do_trap(struct cpu_user_regs *regs)
853 {
854 unsigned int trapnr = regs->entry_vector;
855
856 if ( regs->error_code & X86_XEC_EXT )
857 goto hardware_trap;
858
859 if ( debugger_trap_entry(trapnr, regs) )
860 return;
861
862 ASSERT(trapnr < 32);
863
864 if ( guest_mode(regs) )
865 {
866 pv_inject_hw_exception(trapnr,
867 (TRAP_HAVE_EC & (1u << trapnr))
868 ? regs->error_code : X86_EVENT_NO_EC);
869 return;
870 }
871
872 if ( likely(extable_fixup(regs, true)) )
873 return;
874
875 hardware_trap:
876 if ( debugger_trap_fatal(trapnr, regs) )
877 return;
878
879 show_execution_state(regs);
880 panic("FATAL TRAP: vector = %d (%s)\n"
881 "[error_code=%04x]\n",
882 trapnr, trapstr(trapnr), regs->error_code);
883 }
884
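/*
 * Xen-defined MSRs.  These live at 0x40000000, or at 0x40000200 when the
 * Viridian extensions occupy the lower range.  Index 0 is the hypercall page
 * MSR: reads yield zero, while writes install the hypercall page.
 */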
int guest_rdmsr_xen(const struct vcpu *v, uint32_t idx, uint64_t *val)
886 {
887 const struct domain *d = v->domain;
888 /* Optionally shift out of the way of Viridian architectural MSRs. */
889 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
890
891 switch ( idx - base )
892 {
893 case 0: /* Write hypercall page MSR. Read as zero. */
894 *val = 0;
895 return X86EMUL_OKAY;
896 }
897
898 return X86EMUL_EXCEPTION;
899 }
900
int guest_wrmsr_xen(struct vcpu *v, uint32_t idx, uint64_t val)
902 {
903 struct domain *d = v->domain;
904 /* Optionally shift out of the way of Viridian architectural MSRs. */
905 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
906
907 switch ( idx - base )
908 {
909 case 0: /* Write hypercall page */
910 {
911 void *hypercall_page;
912 unsigned long gmfn = val >> PAGE_SHIFT;
913 unsigned int page_index = val & (PAGE_SIZE - 1);
914 struct page_info *page;
915 p2m_type_t t;
916
917 if ( page_index > 0 )
918 {
919 gdprintk(XENLOG_WARNING,
920 "wrmsr hypercall page index %#x unsupported\n",
921 page_index);
922 return X86EMUL_EXCEPTION;
923 }
924
925 page = get_page_from_gfn(d, gmfn, &t, P2M_ALLOC);
926
927 if ( !page || !get_page_type(page, PGT_writable_page) )
928 {
929 if ( page )
930 put_page(page);
931
932 if ( p2m_is_paging(t) )
933 {
934 p2m_mem_paging_populate(d, _gfn(gmfn));
935 return X86EMUL_RETRY;
936 }
937
938 gdprintk(XENLOG_WARNING,
939 "Bad GMFN %lx (MFN %#"PRI_mfn") to MSR %08x\n",
940 gmfn, mfn_x(page ? page_to_mfn(page) : INVALID_MFN), base);
941 return X86EMUL_EXCEPTION;
942 }
943
944 hypercall_page = __map_domain_page(page);
945 init_hypercall_page(d, hypercall_page);
946 unmap_domain_page(hypercall_page);
947
948 put_page_and_type(page);
949 return X86EMUL_OKAY;
950 }
951
952 default:
953 return X86EMUL_EXCEPTION;
954 }
955 }
956
void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf,
958 uint32_t subleaf, struct cpuid_leaf *res)
959 {
960 const struct domain *d = v->domain;
961 const struct cpuid_policy *p = d->arch.cpuid;
962 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
963 uint32_t idx = leaf - base;
964 unsigned int limit = is_viridian_domain(d) ? p->hv2_limit : p->hv_limit;
965
966 if ( limit == 0 )
967 /* Default number of leaves */
968 limit = XEN_CPUID_MAX_NUM_LEAVES;
969 else
970 /* Clamp toolstack value between 2 and MAX_NUM_LEAVES. */
971 limit = min(max(limit, 2u), XEN_CPUID_MAX_NUM_LEAVES + 0u);
972
973 if ( idx > limit )
974 return;
975
976 switch ( idx )
977 {
978 case 0:
979 res->a = base + limit; /* Largest leaf */
980 res->b = XEN_CPUID_SIGNATURE_EBX;
981 res->c = XEN_CPUID_SIGNATURE_ECX;
982 res->d = XEN_CPUID_SIGNATURE_EDX;
983 break;
984
985 case 1:
986 res->a = (xen_major_version() << 16) | xen_minor_version();
987 break;
988
989 case 2:
990 res->a = 1; /* Number of hypercall-transfer pages */
991 /* MSR base address */
992 res->b = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
993 if ( is_pv_domain(d) ) /* Features */
994 res->c |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
995 break;
996
997 case 3: /* Time leaf. */
998 switch ( subleaf )
999 {
1000 case 0: /* features */
1001 res->a = ((d->arch.vtsc << 0) |
1002 (!!host_tsc_is_safe() << 1) |
1003 (!!boot_cpu_has(X86_FEATURE_RDTSCP) << 2));
1004 res->b = d->arch.tsc_mode;
1005 res->c = d->arch.tsc_khz;
1006 res->d = d->arch.incarnation;
1007 break;
1008
1009 case 1: /* scale and offset */
1010 {
1011 uint64_t offset;
1012
1013 if ( !d->arch.vtsc )
1014 offset = d->arch.vtsc_offset;
1015 else
1016 /* offset already applied to value returned by virtual rdtscp */
1017 offset = 0;
1018 res->a = offset;
1019 res->b = offset >> 32;
1020 res->c = d->arch.vtsc_to_ns.mul_frac;
1021 res->d = (s8)d->arch.vtsc_to_ns.shift;
1022 break;
1023 }
1024
1025 case 2: /* physical cpu_khz */
1026 res->a = cpu_khz;
1027 break;
1028 }
1029 break;
1030
1031 case 4: /* HVM hypervisor leaf. */
1032 if ( !is_hvm_domain(d) || subleaf != 0 )
1033 break;
1034
1035 if ( cpu_has_vmx_apic_reg_virt )
1036 res->a |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
1037
1038 /*
1039 * We want to claim that x2APIC is virtualized if APIC MSR accesses
1040 * are not intercepted. When all three of these are true both rdmsr
1041 * and wrmsr in the guest will run without VMEXITs (see
1042 * vmx_vlapic_msr_changed()).
1043 */
1044 if ( cpu_has_vmx_virtualize_x2apic_mode &&
1045 cpu_has_vmx_apic_reg_virt &&
1046 cpu_has_vmx_virtual_intr_delivery )
1047 res->a |= XEN_HVM_CPUID_X2APIC_VIRT;
1048
1049 /*
1050 * Indicate that memory mapped from other domains (either grants or
1051 * foreign pages) has valid IOMMU entries.
1052 */
1053 res->a |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
1054
1055 /* Indicate presence of vcpu id and set it in ebx */
1056 res->a |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
1057 res->b = v->vcpu_id;
1058
1059 /* Indicate presence of domain id and set it in ecx */
1060 res->a |= XEN_HVM_CPUID_DOMID_PRESENT;
1061 res->c = d->domain_id;
1062
1063 break;
1064
1065 case 5: /* PV-specific parameters */
1066 if ( is_hvm_domain(d) || subleaf != 0 )
1067 break;
1068
1069 res->b = flsl(get_upper_mfn_bound()) + PAGE_SHIFT;
1070 break;
1071
1072 default:
1073 ASSERT_UNREACHABLE();
1074 }
1075 }
1076
void do_invalid_op(struct cpu_user_regs *regs)
1078 {
1079 const struct bug_frame *bug = NULL;
1080 u8 bug_insn[2];
1081 const char *prefix = "", *filename, *predicate, *eip = (char *)regs->rip;
1082 unsigned long fixup;
1083 int id = -1, lineno;
1084 const struct virtual_region *region;
1085
1086 if ( debugger_trap_entry(TRAP_invalid_op, regs) )
1087 return;
1088
1089 if ( likely(guest_mode(regs)) )
1090 {
1091 if ( pv_emulate_invalid_op(regs) )
1092 pv_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
1093 return;
1094 }
1095
1096 if ( !is_active_kernel_text(regs->rip) ||
1097 __copy_from_user(bug_insn, eip, sizeof(bug_insn)) ||
1098 memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) )
1099 goto die;
1100
1101 region = find_text_region(regs->rip);
1102 if ( region )
1103 {
1104 for ( id = 0; id < BUGFRAME_NR; id++ )
1105 {
1106 const struct bug_frame *b;
1107 unsigned int i;
1108
1109 for ( i = 0, b = region->frame[id].bugs;
1110 i < region->frame[id].n_bugs; b++, i++ )
1111 {
1112 if ( bug_loc(b) == eip )
1113 {
1114 bug = b;
1115 goto found;
1116 }
1117 }
1118 }
1119 }
1120
1121 found:
1122 if ( !bug )
1123 goto die;
1124 eip += sizeof(bug_insn);
1125 if ( id == BUGFRAME_run_fn )
1126 {
1127 void (*fn)(struct cpu_user_regs *) = bug_ptr(bug);
1128
1129 fn(regs);
1130 regs->rip = (unsigned long)eip;
1131 return;
1132 }
1133
1134 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
1135 filename = bug_ptr(bug);
1136 if ( !is_kernel(filename) && !is_patch(filename) )
1137 goto die;
1138 fixup = strlen(filename);
1139 if ( fixup > 50 )
1140 {
1141 filename += fixup - 47;
1142 prefix = "...";
1143 }
1144 lineno = bug_line(bug);
1145
1146 switch ( id )
1147 {
1148 case BUGFRAME_warn:
1149 printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno);
1150 show_execution_state(regs);
1151 regs->rip = (unsigned long)eip;
1152 return;
1153
1154 case BUGFRAME_bug:
1155 printk("Xen BUG at %s%s:%d\n", prefix, filename, lineno);
1156
1157 if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1158 return;
1159
1160 show_execution_state(regs);
1161 panic("Xen BUG at %s%s:%d\n", prefix, filename, lineno);
1162
1163 case BUGFRAME_assert:
1164 /* ASSERT: decode the predicate string pointer. */
1165 predicate = bug_msg(bug);
1166 if ( !is_kernel(predicate) && !is_patch(predicate) )
1167 predicate = "<unknown>";
1168
1169 printk("Assertion '%s' failed at %s%s:%d\n",
1170 predicate, prefix, filename, lineno);
1171
1172 if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1173 return;
1174
1175 show_execution_state(regs);
1176 panic("Assertion '%s' failed at %s%s:%d\n",
1177 predicate, prefix, filename, lineno);
1178 }
1179
1180 die:
1181 if ( likely(extable_fixup(regs, true)) )
1182 return;
1183
1184 if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1185 return;
1186
1187 show_execution_state(regs);
1188 panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
1189 }
1190
void do_int3(struct cpu_user_regs *regs)
1192 {
1193 if ( debugger_trap_entry(TRAP_int3, regs) )
1194 return;
1195
1196 if ( !guest_mode(regs) )
1197 {
1198 if ( likely(extable_fixup(regs, true)) )
1199 return;
1200
1201 if ( !debugger_trap_fatal(TRAP_int3, regs) )
1202 printk(XENLOG_DEBUG "Hit embedded breakpoint at %p [%ps]\n",
1203 _p(regs->rip), _p(regs->rip));
1204
1205 return;
1206 }
1207
1208 pv_inject_hw_exception(TRAP_int3, X86_EVENT_NO_EC);
1209 }
1210
1211 #ifdef CONFIG_PV
static int handle_ldt_mapping_fault(unsigned int offset,
1213 struct cpu_user_regs *regs)
1214 {
1215 struct vcpu *curr = current;
1216
1217 /*
1218 * Not in PV context? Something is very broken. Leave it to the #PF
1219 * handler, which will probably result in a panic().
1220 */
1221 if ( !is_pv_vcpu(curr) )
1222 return 0;
1223
1224 /* Try to copy a mapping from the guest's LDT, if it is valid. */
1225 if ( likely(pv_map_ldt_shadow_page(offset)) )
1226 {
1227 if ( guest_mode(regs) )
1228 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1229 regs->rip, offset);
1230 }
1231 else
1232 {
1233 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1234 if ( !guest_mode(regs) )
1235 return 0;
1236
1237 /* Access would have become non-canonical? Pass #GP[sel] back. */
1238 if ( unlikely(!is_canonical_address(curr->arch.pv.ldt_base + offset)) )
1239 {
1240 uint16_t ec = (offset & ~(X86_XEC_EXT | X86_XEC_IDT)) | X86_XEC_TI;
1241
1242 pv_inject_hw_exception(TRAP_gp_fault, ec);
1243 }
1244 else
1245 /* else pass the #PF back, with adjusted %cr2. */
1246 pv_inject_page_fault(regs->error_code,
1247 curr->arch.pv.ldt_base + offset);
1248 }
1249
1250 return EXCRET_fault_fixed;
1251 }
1252
static int handle_gdt_ldt_mapping_fault(unsigned long offset,
1254 struct cpu_user_regs *regs)
1255 {
1256 struct vcpu *curr = current;
1257 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1258 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1259 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1260
1261 /*
1262 * If the fault is in another vcpu's area, it cannot be due to
1263 * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and
1264 * indeed we have to since pv_map_ldt_shadow_page() works correctly only on
1265 * accesses to a vcpu's own area.
1266 */
1267 if ( vcpu_area != curr->vcpu_id )
1268 return 0;
1269
1270 /* Byte offset within the gdt/ldt sub-area. */
1271 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1272
1273 if ( likely(is_ldt_area) )
1274 return handle_ldt_mapping_fault(offset, regs);
1275
1276 /* GDT fault: handle the fault as #GP[sel]. */
1277 regs->error_code = offset & ~(X86_XEC_EXT | X86_XEC_IDT | X86_XEC_TI);
1278 do_general_protection(regs);
1279
1280 return EXCRET_fault_fixed;
1281 }
1282 #endif
1283
1284 #define IN_HYPERVISOR_RANGE(va) \
1285 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1286
1287 enum pf_type {
1288 real_fault,
1289 smep_fault,
1290 smap_fault,
1291 spurious_fault
1292 };
1293
static enum pf_type __page_fault_type(unsigned long addr,
1295 const struct cpu_user_regs *regs)
1296 {
1297 unsigned long mfn, cr3 = read_cr3();
1298 l4_pgentry_t l4e, *l4t;
1299 l3_pgentry_t l3e, *l3t;
1300 l2_pgentry_t l2e, *l2t;
1301 l1_pgentry_t l1e, *l1t;
1302 unsigned int required_flags, disallowed_flags, page_user;
1303 unsigned int error_code = regs->error_code;
1304
1305 /*
1306 * We do not take spurious page faults in IRQ handlers as we do not
1307 * modify page tables in IRQ context. We therefore bail here because
1308 * map_domain_page() is not IRQ-safe.
1309 */
1310 if ( in_irq() )
1311 return real_fault;
1312
1313 required_flags = _PAGE_PRESENT;
1314 if ( error_code & PFEC_write_access )
1315 required_flags |= _PAGE_RW;
1316 if ( error_code & PFEC_user_mode )
1317 required_flags |= _PAGE_USER;
1318
1319 disallowed_flags = 0;
1320 if ( error_code & PFEC_insn_fetch )
1321 disallowed_flags |= _PAGE_NX_BIT;
1322
1323 page_user = _PAGE_USER;
1324
1325 mfn = cr3 >> PAGE_SHIFT;
1326
1327 l4t = map_domain_page(_mfn(mfn));
1328 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1329 mfn = l4e_get_pfn(l4e);
1330 unmap_domain_page(l4t);
1331 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1332 (l4e_get_flags(l4e) & disallowed_flags) )
1333 return real_fault;
1334 page_user &= l4e_get_flags(l4e);
1335
1336 l3t = map_domain_page(_mfn(mfn));
1337 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1338 mfn = l3e_get_pfn(l3e);
1339 unmap_domain_page(l3t);
1340 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1341 (l3e_get_flags(l3e) & disallowed_flags) )
1342 return real_fault;
1343 page_user &= l3e_get_flags(l3e);
1344 if ( l3e_get_flags(l3e) & _PAGE_PSE )
1345 goto leaf;
1346
1347 l2t = map_domain_page(_mfn(mfn));
1348 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1349 mfn = l2e_get_pfn(l2e);
1350 unmap_domain_page(l2t);
1351 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1352 (l2e_get_flags(l2e) & disallowed_flags) )
1353 return real_fault;
1354 page_user &= l2e_get_flags(l2e);
1355 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1356 goto leaf;
1357
1358 l1t = map_domain_page(_mfn(mfn));
1359 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1360 mfn = l1e_get_pfn(l1e);
1361 unmap_domain_page(l1t);
1362 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1363 (l1e_get_flags(l1e) & disallowed_flags) )
1364 return real_fault;
1365 page_user &= l1e_get_flags(l1e);
1366
1367 leaf:
1368 if ( page_user )
1369 {
1370 unsigned long cr4 = read_cr4();
1371 /*
1372 * Supervisor Mode Execution Prevention (SMEP):
1373 * Disallow supervisor execution from user-accessible mappings
1374 */
1375 if ( (cr4 & X86_CR4_SMEP) &&
1376 ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) )
1377 return smep_fault;
1378
1379 /*
1380 * Supervisor Mode Access Prevention (SMAP):
 * Disallow supervisor access to user-accessible mappings.
 * A fault is considered an SMAP violation if the following
 * conditions are true:
1384 * - X86_CR4_SMAP is set in CR4
1385 * - A user page is being accessed
1386 * - CPL=3 or X86_EFLAGS_AC is clear
1387 * - Page fault in kernel mode
1388 */
1389 if ( (cr4 & X86_CR4_SMAP) && !(error_code & PFEC_user_mode) &&
1390 (((regs->cs & 3) == 3) || !(regs->eflags & X86_EFLAGS_AC)) )
1391 return smap_fault;
1392 }
1393
1394 return spurious_fault;
1395 }
1396
static enum pf_type spurious_page_fault(unsigned long addr,
1398 const struct cpu_user_regs *regs)
1399 {
1400 unsigned long flags;
1401 enum pf_type pf_type;
1402
1403 /*
1404 * Disabling interrupts prevents TLB flushing, and hence prevents
1405 * page tables from becoming invalid under our feet during the walk.
1406 */
1407 local_irq_save(flags);
1408 pf_type = __page_fault_type(addr, regs);
1409 local_irq_restore(flags);
1410
1411 return pf_type;
1412 }
1413
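/*
 * Try to resolve a page fault without injecting it into the guest: GDT/LDT
 * mapping faults, writable-pagetable and read-only MMIO emulation, and
 * shadow-paging faults are all handled here.  Returns non-zero if the fault
 * was fixed up.
 */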
static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1415 {
1416 struct vcpu *v = current;
1417 struct domain *d = v->domain;
1418
1419 /* No fixups in interrupt context or when interrupts are disabled. */
1420 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1421 return 0;
1422
1423 if ( !(regs->error_code & PFEC_page_present) &&
1424 (pagefault_by_memadd(addr, regs)) )
1425 return handle_memadd_fault(addr, regs);
1426
1427 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1428 {
1429 #ifdef CONFIG_PV
1430 if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) &&
1431 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1432 return handle_gdt_ldt_mapping_fault(
1433 addr - GDT_LDT_VIRT_START, regs);
1434 #endif
1435 return 0;
1436 }
1437
1438 if ( guest_kernel_mode(v, regs) &&
1439 !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
1440 (regs->error_code & PFEC_write_access) )
1441 {
1442 bool ptwr, mmio_ro;
1443
1444 ptwr = VM_ASSIST(d, writable_pagetables) &&
1445 /* Do not check if access-protection fault since the page may
1446 legitimately be not present in shadow page tables */
1447 (paging_mode_enabled(d) ||
1448 (regs->error_code & PFEC_page_present));
1449
1450 mmio_ro = is_hardware_domain(d) &&
1451 (regs->error_code & PFEC_page_present);
1452
1453 if ( (ptwr || mmio_ro) && pv_ro_page_fault(addr, regs) )
1454 return EXCRET_fault_fixed;
1455 }
1456
1457 /*
1458 * For non-external shadowed guests, we fix up both their own pagefaults
1459 * and Xen's, since they share the pagetables. This includes hypervisor
1460 * faults, e.g. from copy_to_user().
1461 */
1462 if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1463 {
1464 int ret = paging_fault(addr, regs);
1465
1466 if ( ret == EXCRET_fault_fixed )
1467 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->rip, addr);
1468 return ret;
1469 }
1470
1471 return 0;
1472 }
1473
void do_page_fault(struct cpu_user_regs *regs)
1475 {
1476 unsigned long addr;
1477 unsigned int error_code;
1478
1479 addr = read_cr2();
1480
1481 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1482 error_code = regs->error_code;
1483
1484 if ( debugger_trap_entry(TRAP_page_fault, regs) )
1485 return;
1486
1487 perfc_incr(page_faults);
1488
1489 /* Any shadow stack access fault is a bug in Xen. */
1490 if ( error_code & PFEC_shstk )
1491 goto fatal;
1492
1493 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1494 return;
1495
1496 /*
1497 * Xen doesn't have reserved bits set in its pagetables, nor do we permit
1498 * PV guests to write any. Such entries would generally be vulnerable to
1499 * the L1TF sidechannel.
1500 *
1501 * The shadow pagetable logic may use reserved bits as part of
1502 * SHOPT_FAST_FAULT_PATH. Pagefaults arising from these will be resolved
1503 * via the fixup_page_fault() path.
1504 *
1505 * Anything remaining is an error, constituting corruption of the
1506 * pagetables and probably an L1TF vulnerable gadget.
1507 */
1508 if ( error_code & PFEC_reserved_bit )
1509 goto fatal;
1510
1511 if ( unlikely(!guest_mode(regs)) )
1512 {
1513 enum pf_type pf_type = spurious_page_fault(addr, regs);
1514
1515 if ( (pf_type == smep_fault) || (pf_type == smap_fault) )
1516 {
1517 console_start_sync();
1518 printk("Xen SM%cP violation\n",
1519 (pf_type == smep_fault) ? 'E' : 'A');
1520 fatal_trap(regs, 0);
1521 }
1522
1523 if ( pf_type != real_fault )
1524 return;
1525
1526 if ( likely(extable_fixup(regs, false)) )
1527 {
1528 perfc_incr(copy_user_faults);
1529 return;
1530 }
1531
1532 fatal:
1533 if ( debugger_trap_fatal(TRAP_page_fault, regs) )
1534 return;
1535
1536 show_execution_state(regs);
1537 show_page_walk(addr);
1538 panic("FATAL PAGE FAULT\n"
1539 "[error_code=%04x]\n"
1540 "Faulting linear address: %p\n",
1541 error_code, _p(addr));
1542 }
1543
1544 pv_inject_page_fault(regs->error_code, addr);
1545 }
1546
1547 /*
1548 * Early #PF handler to print CR2, error code, and stack.
1549 *
1550 * We also deal with spurious faults here, even though they should never happen
1551 * during early boot (an issue was seen once, but was most likely a hardware
1552 * problem).
1553 */
void __init do_early_page_fault(struct cpu_user_regs *regs)
1555 {
1556 static unsigned int __initdata stuck;
1557 static unsigned long __initdata prev_eip, prev_cr2;
1558 unsigned long cr2 = read_cr2();
1559
1560 BUG_ON(smp_processor_id() != 0);
1561
1562 if ( (regs->rip != prev_eip) || (cr2 != prev_cr2) )
1563 {
1564 prev_eip = regs->rip;
1565 prev_cr2 = cr2;
1566 stuck = 0;
1567 return;
1568 }
1569
1570 if ( stuck++ == 1000 )
1571 {
1572 console_start_sync();
1573 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1574 regs->cs, _p(regs->rip), _p(cr2), regs->error_code);
1575 fatal_trap(regs, 0);
1576 }
1577 }
1578
void do_general_protection(struct cpu_user_regs *regs)
1580 {
1581 #ifdef CONFIG_PV
1582 struct vcpu *v = current;
1583 #endif
1584
1585 if ( debugger_trap_entry(TRAP_gp_fault, regs) )
1586 return;
1587
1588 if ( regs->error_code & X86_XEC_EXT )
1589 goto hardware_gp;
1590
1591 if ( !guest_mode(regs) )
1592 goto gp_in_kernel;
1593
1594 #ifdef CONFIG_PV
1595 /*
1596 * Cunning trick to allow arbitrary "INT n" handling.
1597 *
1598 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1599 * instruction from trapping to the appropriate vector, when that might not
1600 * be expected by Xen or the guest OS. For example, that entry might be for
1601 * a fault handler (unlike traps, faults don't increment EIP), or might
1602 * expect an error code on the stack (which a software trap never
1603 * provides), or might be a hardware interrupt handler that doesn't like
1604 * being called spuriously.
1605 *
1606 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1607 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1608 * clear (which got already checked above) to indicate that it's a software
1609 * fault, not a hardware one.
1610 *
1611 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1612 * okay because they can only be triggered by an explicit DPL-checked
1613 * instruction. The DPL specified by the guest OS for these vectors is NOT
1614 * CHECKED!!
1615 */
1616 if ( regs->error_code & X86_XEC_IDT )
1617 {
1618 /* This fault must be due to <INT n> instruction. */
1619 uint8_t vector = regs->error_code >> 3;
1620 const struct trap_info *ti = &v->arch.pv.trap_ctxt[vector];
1621
1622 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1623 {
1624 regs->rip += 2;
1625 pv_inject_sw_interrupt(vector);
1626 return;
1627 }
1628 }
1629 else if ( is_pv_32bit_vcpu(v) && regs->error_code )
1630 {
1631 pv_emulate_gate_op(regs);
1632 return;
1633 }
1634
1635 /* Emulate some simple privileged and I/O instructions. */
1636 if ( (regs->error_code == 0) &&
1637 pv_emulate_privileged_op(regs) )
1638 {
1639 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
1640 return;
1641 }
1642
1643 /* Pass on GPF as is. */
1644 pv_inject_hw_exception(TRAP_gp_fault, regs->error_code);
1645 return;
1646 #endif
1647
1648 gp_in_kernel:
1649
1650 if ( likely(extable_fixup(regs, true)) )
1651 return;
1652
1653 hardware_gp:
1654 if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
1655 return;
1656
1657 show_execution_state(regs);
1658 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1659 }
1660
static void pci_serr_softirq(void)
1662 {
1663 printk("\n\nNMI - PCI system error (SERR)\n");
1664 outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
1665 }
1666
static void nmi_hwdom_report(unsigned int reason_idx)
1668 {
1669 struct domain *d = hardware_domain;
1670
1671 if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
1672 return;
1673
1674 set_bit(reason_idx, nmi_reason(d));
1675
1676 pv_raise_nmi(d->vcpu[0]);
1677 }
1678
static void pci_serr_error(const struct cpu_user_regs *regs)
1680 {
1681 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */
1682
1683 switch ( opt_nmi[0] )
1684 {
1685 case 'd': /* 'dom0' */
1686 nmi_hwdom_report(_XEN_NMIREASON_pci_serr);
1687 /* fallthrough */
1688 case 'i': /* 'ignore' */
1689 /* Would like to print a diagnostic here but can't call printk()
1690 from NMI context -- raise a softirq instead. */
1691 raise_softirq(PCI_SERR_SOFTIRQ);
1692 break;
1693 default: /* 'fatal' */
1694 console_force_unlock();
1695 printk("\n\nNMI - PCI system error (SERR)\n");
1696 fatal_trap(regs, 0);
1697 }
1698 }
1699
static void io_check_error(const struct cpu_user_regs *regs)
1701 {
1702 switch ( opt_nmi[0] )
1703 {
1704 case 'd': /* 'dom0' */
1705 nmi_hwdom_report(_XEN_NMIREASON_io_error);
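        /* fallthrough */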
1706 case 'i': /* 'ignore' */
1707 break;
1708 default: /* 'fatal' */
1709 console_force_unlock();
1710 printk("\n\nNMI - I/O ERROR\n");
1711 fatal_trap(regs, 0);
1712 }
1713
1714 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1715 mdelay(1);
1716 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1717 }
1718
static void unknown_nmi_error(const struct cpu_user_regs *regs,
1720 unsigned char reason)
1721 {
1722 switch ( opt_nmi[0] )
1723 {
1724 case 'd': /* 'dom0' */
1725 nmi_hwdom_report(_XEN_NMIREASON_unknown);
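        /* fallthrough */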
1726 case 'i': /* 'ignore' */
1727 break;
1728 default: /* 'fatal' */
1729 console_force_unlock();
1730 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1731 printk("Do you have a strange power saving mode enabled?\n");
1732 fatal_trap(regs, 0);
1733 }
1734 }
1735
static int dummy_nmi_callback(const struct cpu_user_regs *regs, int cpu)
1737 {
1738 return 0;
1739 }
1740
1741 static nmi_callback_t *nmi_callback = dummy_nmi_callback;
1742
1743 DEFINE_PER_CPU(unsigned int, nmi_count);
1744
void do_nmi(const struct cpu_user_regs *regs)
1746 {
1747 unsigned int cpu = smp_processor_id();
1748 unsigned char reason = 0;
1749 bool handle_unknown = false;
1750
1751 this_cpu(nmi_count)++;
1752 nmi_enter();
1753
1754 if ( nmi_callback(regs, cpu) )
1755 {
1756 nmi_exit();
1757 return;
1758 }
1759
1760 /*
 * Accessing port 0x61 may trap to SMM, which has actually been
 * observed on some production SKX servers. This SMI sometimes
1763 * takes enough time for the next NMI tick to happen. By reading
1764 * this port before we re-arm the NMI watchdog, we reduce the chance
1765 * of having an NMI watchdog expire while in the SMI handler.
1766 */
1767 if ( cpu == nmi_cpu )
1768 reason = inb(0x61);
1769
1770 if ( (nmi_watchdog == NMI_NONE) ||
1771 (!nmi_watchdog_tick(regs) && watchdog_force) )
1772 handle_unknown = true;
1773
1774 /* Only the BSP gets external NMIs from the system. */
1775 if ( cpu == nmi_cpu )
1776 {
1777 if ( reason & 0x80 )
1778 pci_serr_error(regs);
1779 if ( reason & 0x40 )
1780 io_check_error(regs);
1781 if ( !(reason & 0xc0) && handle_unknown )
1782 unknown_nmi_error(regs, reason);
1783 }
1784
1785 nmi_exit();
1786 }
1787
nmi_callback_t *set_nmi_callback(nmi_callback_t *callback)
1789 {
1790 nmi_callback_t *old_nmi_callback = nmi_callback;
1791
1792 nmi_callback = callback;
1793
1794 return old_nmi_callback;
1795 }
1796
void unset_nmi_callback(void)
1798 {
1799 nmi_callback = dummy_nmi_callback;
1800 }
1801
void do_device_not_available(struct cpu_user_regs *regs)
1803 {
1804 #ifdef CONFIG_PV
1805 struct vcpu *curr = current;
1806 #endif
1807
1808 if ( !guest_mode(regs) )
1809 {
1810 /*
1811 * We shouldn't be able to reach here, but for release builds have
1812 * the recovery logic in place nevertheless.
1813 */
1814 if ( extable_fixup(regs, true) )
1815 {
1816 ASSERT_UNREACHABLE();
1817 return;
1818 }
1819
1820 fatal_trap(regs, false);
1821 }
1822
1823 #ifdef CONFIG_PV
1824 vcpu_restore_fpu_lazy(curr);
1825
1826 if ( curr->arch.pv.ctrlreg[0] & X86_CR0_TS )
1827 {
1828 pv_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC);
1829 curr->arch.pv.ctrlreg[0] &= ~X86_CR0_TS;
1830 }
1831 else
1832 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
1833 #else
1834 ASSERT_UNREACHABLE();
1835 #endif
1836 }
1837
void do_debug(struct cpu_user_regs *regs)
1839 {
1840 unsigned long dr6;
1841 struct vcpu *v = current;
1842
1843 /* Stash dr6 as early as possible. */
1844 dr6 = read_debugreg(6);
1845
1846 if ( debugger_trap_entry(TRAP_debug, regs) )
1847 return;
1848
1849 /*
1850 * At the time of writing (March 2018), on the subject of %dr6:
1851 *
1852 * The Intel manual says:
1853 * Certain debug exceptions may clear bits 0-3. The remaining contents
1854 * of the DR6 register are never cleared by the processor. To avoid
1855 * confusion in identifying debug exceptions, debug handlers should
1856 * clear the register (except bit 16, which they should set) before
1857 * returning to the interrupted task.
1858 *
1859 * The AMD manual says:
1860 * Bits 15:13 of the DR6 register are not cleared by the processor and
1861 * must be cleared by software after the contents have been read.
1862 *
1863 * Some bits are reserved set, some are reserved clear, and some bits
1864 * which were previously reserved set are reused and cleared by hardware.
1865 * For future compatibility, reset to the default value, which will allow
1866 * us to spot any bit being changed by hardware to its non-default value.
1867 */
1868 write_debugreg(6, X86_DR6_DEFAULT);
1869
1870 /* #DB automatically disabled LBR. Reinstate it if debugging Xen. */
1871 if ( cpu_has_xen_lbr )
1872 wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
1873
1874 if ( !guest_mode(regs) )
1875 {
1876 /*
1877 * !!! WARNING !!!
1878 *
 * %dr6 is mostly guest controlled at this point.  Any decisions based
 * on its value must be cross-checked with non-guest controlled state.
         */

        if ( regs->eflags & X86_EFLAGS_TF )
        {
#ifdef CONFIG_PV
            /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
            if ( (regs->rip >= (unsigned long)sysenter_entry) &&
                 (regs->rip <= (unsigned long)sysenter_eflags_saved) )
            {
                if ( regs->rip == (unsigned long)sysenter_eflags_saved )
                    regs->eflags &= ~X86_EFLAGS_TF;
                return;
            }
#endif
            if ( !debugger_trap_fatal(TRAP_debug, regs) )
            {
                WARN();
                regs->eflags &= ~X86_EFLAGS_TF;
            }
        }

        /*
         * Check for fault conditions.  General Detect, and instruction
         * breakpoints are faults rather than traps, at which point attempting
         * to ignore and continue will result in a livelock.
         *
         * However, on entering the #DB handler, hardware clears %dr7.gd for
         * us (as confirmed by the earlier %dr6 accesses succeeding), meaning
         * that a real General Detect exception is restartable.
         *
         * PV guests are not permitted to point %dr{0..3} at Xen linear
         * addresses, and Instruction Breakpoints (being faults) don't get
         * delayed by a MovSS shadow, so we should never encounter one in
         * hypervisor context.
         *
         * If however we do, safety measures need to be enacted.  Use a big
         * hammer and clear all debug settings.
         */
        if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
        {
            unsigned int bp, dr7 = read_debugreg(7);

            for ( bp = 0; bp < 4; ++bp )
            {
                if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
                     (dr7 & (3u << (bp * DR_ENABLE_SIZE))) && /* Enabled? */
                     ((dr7 & (3u << ((bp * DR_CONTROL_SIZE) + /* Insn? */
                                     DR_CONTROL_SHIFT))) == DR_RW_EXECUTE) )
                {
                    ASSERT_UNREACHABLE();

                    printk(XENLOG_ERR
                           "Hit instruction breakpoint in Xen context\n");
                    write_debugreg(7, 0);
                    break;
                }
            }
        }

        /*
         * Whatever caused this #DB should be restartable by this point. Note
         * it and continue. Guests can trigger this in certain corner cases,
         * so ensure the message is ratelimited.
         */
        gprintk(XENLOG_WARNING,
                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
                regs->cs, _p(regs->rip), _p(regs->rip),
                regs->ss, _p(regs->rsp), dr6);

        return;
    }

    /* Save debug status register where guest OS can peek at it */
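    /*
     * Accumulate only the bits which differ from X86_DR6_DEFAULT: set the
     * default-clear bits which hardware set, and clear the default-set bits
     * which hardware cleared.
     */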
    v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
    v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);

    pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
}

void do_entry_CP(struct cpu_user_regs *regs)
{
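    /* #CP error code values, as architecturally defined for CET. */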
    static const char errors[][10] = {
        [1] = "near ret",
        [2] = "far/iret",
        [3] = "endbranch",
        [4] = "rstorssp",
        [5] = "setssbsy",
    };
    const char *err = "??";
    unsigned int ec = regs->error_code;

    if ( debugger_trap_entry(TRAP_debug, regs) )
        return;

    /* Decode ec if possible */
    if ( ec < ARRAY_SIZE(errors) && errors[ec][0] )
        err = errors[ec];

    /*
     * For now, only supervisor shadow stacks should be active.  A #CP from
     * guest context is probably a Xen bug, but kill the guest in an attempt
     * to recover.
     */
    if ( guest_mode(regs) )
    {
        gprintk(XENLOG_ERR, "Hit #CP[%04x] in guest context %04x:%p\n",
                ec, regs->cs, _p(regs->rip));
        ASSERT_UNREACHABLE();
        domain_crash(current->domain);
        return;
    }

    show_execution_state(regs);
    panic("CONTROL-FLOW PROTECTION FAULT: #CP[%04x] %s\n", ec, err);
}

static void __init noinline __set_intr_gate(unsigned int n,
                                            uint32_t dpl, void *addr)
{
    _set_gate(&idt_table[n], SYS_DESC_irq_gate, dpl, addr);
}

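/* DPL 3 gate: the vector may also be raised with "int $n" from any ring. */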
static void __init set_swint_gate(unsigned int n, void *addr)
{
    __set_intr_gate(n, 3, addr);
}

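/* DPL 0 gate: only hardware events (or Xen itself) may raise the vector. */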
static void __init set_intr_gate(unsigned int n, void *addr)
{
    __set_intr_gate(n, 0, addr);
}

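/*
 * Pick the MSR holding the "from" address of the Last Exception Record for
 * the boot CPU's vendor/family, or return 0 if none is known/supported.
 */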
static unsigned int calc_ler_msr(void)
{
    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
            return MSR_IA32_LASTINTFROMIP;

        case 15:
            return MSR_P4_LER_FROM_LIP;
        }
        break;

    case X86_VENDOR_AMD:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
        case 0xf ... 0x19:
            return MSR_IA32_LASTINTFROMIP;
        }
        break;

    case X86_VENDOR_HYGON:
        return MSR_IA32_LASTINTFROMIP;
    }

    return 0;
}

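/*
 * Per-CPU trap setup.  Also enables the Last Exception Record / LBR MSR when
 * the "ler" command line option was given and a suitable MSR exists.
 */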
void percpu_traps_init(void)
{
    subarch_percpu_traps_init();

    if ( !opt_ler )
        return;

    if ( !ler_msr && (ler_msr = calc_ler_msr()) )
        setup_force_cpu_cap(X86_FEATURE_XEN_LBR);

    if ( cpu_has_xen_lbr )
        wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
}

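/* Populate the boot CPU's master IDT and cache the boot GDT pointers. */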
void __init init_idt_traps(void)
{
    /*
     * Note that interrupt gates are always used, rather than trap gates. We
     * must have interrupts disabled until DS/ES/FS/GS are saved because the
     * first activation must have the "bad" value(s) for these registers and
     * we may lose them if another activation is installed before they are
     * saved. The page-fault handler also needs interrupts disabled until %cr2
     * has been read and saved on the stack.
     */
    set_intr_gate(TRAP_divide_error,&divide_error);
    set_intr_gate(TRAP_debug,&debug);
    set_intr_gate(TRAP_nmi,&nmi);
    set_swint_gate(TRAP_int3,&int3);         /* usable from all privileges */
    set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
    set_intr_gate(TRAP_bounds,&bounds);
    set_intr_gate(TRAP_invalid_op,&invalid_op);
    set_intr_gate(TRAP_no_device,&device_not_available);
    set_intr_gate(TRAP_double_fault,&double_fault);
    set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
    set_intr_gate(TRAP_no_segment,&segment_not_present);
    set_intr_gate(TRAP_stack_error,&stack_segment);
    set_intr_gate(TRAP_gp_fault,&general_protection);
    set_intr_gate(TRAP_page_fault,&early_page_fault);
    set_intr_gate(TRAP_copro_error,&coprocessor_error);
    set_intr_gate(TRAP_alignment_check,&alignment_check);
    set_intr_gate(TRAP_machine_check,&machine_check);
    set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
    set_intr_gate(X86_EXC_CP, entry_CP);

    /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */
    enable_each_ist(idt_table);

    /* CPU0 uses the master IDT. */
    idt_tables[0] = idt_table;

    this_cpu(gdt) = boot_gdt;
    if ( IS_ENABLED(CONFIG_PV32) )
        this_cpu(compat_gdt) = boot_compat_gdt;
}

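/*
 * Entry stubs generated in assembly (entry.S) for every vector; slots which
 * already have a bespoke handler installed are left NULL.
 */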
extern void (*const autogen_entrypoints[X86_NR_VECTORS])(void);
void __init trap_init(void)
{
    unsigned int vector;

    /* Replace early pagefault with real pagefault handler. */
    set_intr_gate(TRAP_page_fault, &page_fault);

    pv_trap_init();

    for ( vector = 0; vector < X86_NR_VECTORS; ++vector )
    {
        if ( autogen_entrypoints[vector] )
        {
            /* Found autogen entry: check we won't clobber an existing trap. */
            ASSERT(idt_table[vector].b == 0);
            set_intr_gate(vector, autogen_entrypoints[vector]);
        }
        else
        {
            /* No entry point: confirm we have an existing trap in place. */
            ASSERT(idt_table[vector].b != 0);
        }
    }

    /* Cache {,compat_}gdt_l1e now that physical relocation is done. */
    this_cpu(gdt_l1e) =
        l1e_from_pfn(virt_to_mfn(boot_gdt), __PAGE_HYPERVISOR_RW);
    if ( IS_ENABLED(CONFIG_PV32) )
        this_cpu(compat_gdt_l1e) =
            l1e_from_pfn(virt_to_mfn(boot_compat_gdt), __PAGE_HYPERVISOR_RW);

    percpu_traps_init();

    cpu_init();

    open_softirq(PCI_SERR_SOFTIRQ, pci_serr_softirq);
}

void activate_debugregs(const struct vcpu *curr)
{
    ASSERT(curr == current);

    write_debugreg(0, curr->arch.dr[0]);
    write_debugreg(1, curr->arch.dr[1]);
    write_debugreg(2, curr->arch.dr[2]);
    write_debugreg(3, curr->arch.dr[3]);
    write_debugreg(6, curr->arch.dr6);

    /*
     * Avoid writing a value which will immediately be replaced again when
     * called from set_debugreg() below.  Eventual future callers will need
     * to take this into account.
     */
    if ( curr->arch.dr7 & DR7_ACTIVE_MASK )
        write_debugreg(7, curr->arch.dr7);

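    /*
     * AMD Debug Extensions: each of %dr0-3 has an associated address mask
     * MSR, which forms part of the vCPU's debug state as well.
     */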
    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
    {
        wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.msrs->dr_mask[0]);
        wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.msrs->dr_mask[1]);
        wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.msrs->dr_mask[2]);
        wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.msrs->dr_mask[3]);
    }
}

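/*
 * Called from entry.S (see the printk below) when continuing the current
 * guest is not possible: crash the domain and process softirqs forever
 * instead of returning to the faulting context.
 */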
void asm_domain_crash_synchronous(unsigned long addr)
{
    /*
     * We need to clear the AC bit here because, in entry.S, AC is set by
     * ASM_STAC to temporarily allow accesses to user pages, which SMAP
     * prevents by default.
     *
     * On some code paths leading here clac() is not strictly needed, but
     * doing it once in asm_domain_crash_synchronous() rather than at every
     * call site reduces redundancy, and it is harmless as well.
     */
    clac();

    if ( addr == 0 )
        addr = this_cpu(last_extable_addr);

    printk("domain_crash_sync called from entry.S: fault at %p %pS\n",
           _p(addr), _p(addr));

    __domain_crash(current->domain);

    for ( ; ; )
        do_softirq();
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */