1 /******************************************************************************
2  * arch/x86/traps.c
3  *
4  * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 /*
21  *  Copyright (C) 1991, 1992  Linus Torvalds
22  *
23  *  Pentium III FXSR, SSE support
24  * Gareth Hughes <gareth@valinux.com>, May 2000
25  */
26 
27 #include <xen/init.h>
28 #include <xen/sched.h>
29 #include <xen/lib.h>
30 #include <xen/err.h>
31 #include <xen/errno.h>
32 #include <xen/mm.h>
33 #include <xen/param.h>
34 #include <xen/console.h>
35 #include <xen/shutdown.h>
36 #include <xen/guest_access.h>
37 #include <asm/regs.h>
38 #include <xen/delay.h>
39 #include <xen/event.h>
40 #include <xen/spinlock.h>
41 #include <xen/irq.h>
42 #include <xen/perfc.h>
43 #include <xen/softirq.h>
44 #include <xen/domain_page.h>
45 #include <xen/symbols.h>
46 #include <xen/iocap.h>
47 #include <xen/version.h>
48 #include <xen/kexec.h>
49 #include <xen/trace.h>
50 #include <xen/paging.h>
51 #include <xen/virtual_region.h>
52 #include <xen/watchdog.h>
53 #include <xen/livepatch.h>
54 #include <asm/system.h>
55 #include <asm/io.h>
56 #include <asm/atomic.h>
57 #include <xen/bitops.h>
58 #include <asm/desc.h>
59 #include <asm/debugreg.h>
60 #include <asm/smp.h>
61 #include <asm/flushtlb.h>
62 #include <asm/uaccess.h>
63 #include <asm/i387.h>
64 #include <asm/xstate.h>
65 #include <asm/debugger.h>
66 #include <asm/msr.h>
67 #include <asm/nmi.h>
68 #include <asm/shared.h>
69 #include <asm/x86_emulate.h>
70 #include <asm/traps.h>
71 #include <asm/hvm/vpt.h>
72 #include <asm/hypercall.h>
73 #include <asm/mce.h>
74 #include <asm/apic.h>
75 #include <asm/mc146818rtc.h>
76 #include <asm/hpet.h>
77 #include <asm/vpmu.h>
78 #include <public/arch-x86/cpuid.h>
79 #include <public/hvm/params.h>
80 #include <asm/cpuid.h>
81 #include <xsm/xsm.h>
82 #include <asm/pv/traps.h>
83 #include <asm/pv/mm.h>
84 
85 /*
86  * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
87  *  fatal:  Xen prints diagnostic message and then hangs.
88  *  dom0:   The NMI is virtualised to DOM0.
89  *  ignore: The NMI error is cleared and ignored.
90  */
91 #ifdef NDEBUG
92 static char __read_mostly opt_nmi[10] = "dom0";
93 #else
94 static char __read_mostly opt_nmi[10] = "fatal";
95 #endif
96 string_param("nmi", opt_nmi);
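/*
 * Example: booting with "nmi=ignore" on the Xen command line selects the
 * 'ignore' behaviour above; the built-in default is "dom0" for release
 * (NDEBUG) builds and "fatal" for debug builds, per the #ifdef above.
 */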
97 
98 DEFINE_PER_CPU(uint64_t, efer);
99 static DEFINE_PER_CPU(unsigned long, last_extable_addr);
100 
101 DEFINE_PER_CPU_READ_MOSTLY(seg_desc_t *, gdt);
102 DEFINE_PER_CPU_READ_MOSTLY(l1_pgentry_t, gdt_l1e);
103 #ifdef CONFIG_PV32
104 DEFINE_PER_CPU_READ_MOSTLY(seg_desc_t *, compat_gdt);
105 DEFINE_PER_CPU_READ_MOSTLY(l1_pgentry_t, compat_gdt_l1e);
106 #endif
107 
108 /* Master table, used by CPU0. */
109 idt_entry_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
110     idt_table[IDT_ENTRIES];
111 
112 /* Pointer to the IDT of every CPU. */
113 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
114 
115 /*
116  * The TSS is smaller than a page, but we give it a full page to avoid
117  * adjacent per-cpu data leaking via Meltdown when XPTI is in use.
118  */
119 DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_page, tss_page);
120 
121 static int debug_stack_lines = 20;
122 integer_param("debug_stack_lines", debug_stack_lines);
123 
124 static bool opt_ler;
125 boolean_param("ler", opt_ler);
126 
127 /* LastExceptionFromIP on this hardware.  Zero if LER is not in use. */
128 unsigned int __read_mostly ler_msr;
129 
130 const unsigned int nmi_cpu;
131 
132 #define stack_words_per_line 4
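/*
 * regs->rsp is the stack pointer of the interrupted context, i.e. its value
 * before the CPU pushed the exception frame.
 */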
133 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
134 
135 static void do_trap(struct cpu_user_regs *regs);
136 static void do_reserved_trap(struct cpu_user_regs *regs);
137 
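/*
 * Note: the (void *) casts below paper over a prototype mismatch --
 * presumably do_nmi() and do_machine_check() taking a const regs pointer.
 */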
138 void (* const exception_table[TRAP_nr])(struct cpu_user_regs *regs) = {
139     [TRAP_divide_error]                 = do_trap,
140     [TRAP_debug]                        = do_debug,
141     [TRAP_nmi]                          = (void *)do_nmi,
142     [TRAP_int3]                         = do_int3,
143     [TRAP_overflow]                     = do_trap,
144     [TRAP_bounds]                       = do_trap,
145     [TRAP_invalid_op]                   = do_invalid_op,
146     [TRAP_no_device]                    = do_device_not_available,
147     [TRAP_double_fault]                 = do_reserved_trap,
148     [TRAP_copro_seg]                    = do_reserved_trap,
149     [TRAP_invalid_tss]                  = do_trap,
150     [TRAP_no_segment]                   = do_trap,
151     [TRAP_stack_error]                  = do_trap,
152     [TRAP_gp_fault]                     = do_general_protection,
153     [TRAP_page_fault]                   = do_page_fault,
154     [TRAP_spurious_int]                 = do_reserved_trap,
155     [TRAP_copro_error]                  = do_trap,
156     [TRAP_alignment_check]              = do_trap,
157     [TRAP_machine_check]                = (void *)do_machine_check,
158     [TRAP_simd_error]                   = do_trap,
159     [TRAP_virtualisation]               = do_reserved_trap,
160     [X86_EXC_CP]                        = do_entry_CP,
161     [X86_EXC_CP + 1 ...
162      (ARRAY_SIZE(exception_table) - 1)] = do_reserved_trap,
163 };
164 
165 void show_code(const struct cpu_user_regs *regs)
166 {
167     unsigned char insns_before[8] = {}, insns_after[16] = {};
168     unsigned int i, tmp, missing_before, missing_after;
169 
170     if ( guest_mode(regs) )
171         return;
172 
173     stac();
174 
175     /*
176      * Copy forward from regs->rip.  In the case of a fault, %ecx contains the
177      * number of bytes remaining to copy.
178      */
179     asm volatile ("1: rep movsb; 2:"
180                   _ASM_EXTABLE(1b, 2b)
181                   : "=&c" (missing_after),
182                     "=&D" (tmp), "=&S" (tmp)
183                   : "0" (ARRAY_SIZE(insns_after)),
184                     "1" (insns_after),
185                     "2" (regs->rip));
186 
187     /*
188      * Copy backwards from regs->rip - 1.  In the case of a fault, %ecx
189      * contains the number of bytes remaining to copy.
190      */
191     asm volatile ("std;"
192                   "1: rep movsb;"
193                   "2: cld;"
194                   _ASM_EXTABLE(1b, 2b)
195                   : "=&c" (missing_before),
196                     "=&D" (tmp), "=&S" (tmp)
197                   : "0" (ARRAY_SIZE(insns_before)),
198                     "1" (insns_before + ARRAY_SIZE(insns_before) - 1),
199                     "2" (regs->rip - 1));
200     clac();
201 
202     printk("Xen code around <%p> (%ps)%s:\n",
203            _p(regs->rip), _p(regs->rip),
204            (missing_before || missing_after) ? " [fault on access]" : "");
205 
206     /* Print bytes from insns_before[]. */
207     for ( i = 0; i < ARRAY_SIZE(insns_before); ++i )
208     {
209         if ( i < missing_before )
210             printk(" --");
211         else
212             printk(" %02x", insns_before[i]);
213     }
214 
215     /* Print the byte under %rip. */
216     if ( missing_after != ARRAY_SIZE(insns_after) )
217         printk(" <%02x>", insns_after[0]);
218     else
219         printk(" <-->");
220 
221     /* Print bytes from insns_after[]. */
222     for ( i = 1; i < ARRAY_SIZE(insns_after); ++i )
223     {
224         if ( i < (ARRAY_SIZE(insns_after) - missing_after) )
225             printk(" %02x", insns_after[i]);
226         else
227             printk(" --");
228     }
229 
230     printk("\n");
231 }
232 
233 static void compat_show_guest_stack(struct vcpu *v,
234                                     const struct cpu_user_regs *regs,
235                                     int debug_stack_lines)
236 {
237     unsigned int i, *stack, addr, mask = STACK_SIZE;
238     void *stack_page = NULL;
239 
240     stack = (unsigned int *)(unsigned long)regs->esp;
241     printk("Guest stack trace from esp=%08lx:\n ", (unsigned long)stack);
242 
243     if ( !__compat_access_ok(v->domain, stack, sizeof(*stack)) )
244     {
245         printk("Guest-inaccessible memory.\n");
246         return;
247     }
248 
249     if ( v != current )
250     {
251         struct vcpu *vcpu;
252         unsigned long mfn;
253 
254         ASSERT(guest_kernel_mode(v, regs));
255         mfn = read_cr3() >> PAGE_SHIFT;
256         for_each_vcpu( v->domain, vcpu )
257             if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn )
258                 break;
259         if ( !vcpu )
260         {
261             stack_page = stack = do_page_walk(v, (unsigned long)stack);
262             if ( (unsigned long)stack < PAGE_SIZE )
263             {
264                 printk("Inaccessible guest memory.\n");
265                 return;
266             }
267             mask = PAGE_SIZE;
268         }
269     }
270 
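    /*
     * Walk the guest stack one 32-bit word at a time, stopping at the first
     * page/stack boundary (per 'mask'), on a faulting access, or after
     * debug_stack_lines * 8 words.
     */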
271     for ( i = 0; i < debug_stack_lines * 8; i++ )
272     {
273         if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
274             break;
275         if ( __get_user(addr, stack) )
276         {
277             if ( i != 0 )
278                 printk("\n    ");
279             printk("Fault while accessing guest memory.");
280             i = 1;
281             break;
282         }
283         if ( (i != 0) && ((i % 8) == 0) )
284             printk("\n ");
285         printk(" %08x", addr);
286         stack++;
287     }
288 
289     UNMAP_DOMAIN_PAGE(stack_page);
290 
291     if ( i == 0 )
292         printk("Stack empty.");
293     printk("\n");
294 }
295 
296 static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
297 {
298     int i;
299     unsigned long *stack, addr;
300     unsigned long mask = STACK_SIZE;
301     void *stack_page = NULL;
302 
303     /* Avoid HVM as we don't know what the stack looks like. */
304     if ( is_hvm_vcpu(v) )
305         return;
306 
307     if ( is_pv_32bit_vcpu(v) )
308     {
309         compat_show_guest_stack(v, regs, debug_stack_lines);
310         return;
311     }
312 
313     stack = (unsigned long *)regs->rsp;
314     printk("Guest stack trace from "__OP"sp=%p:\n  ", stack);
315 
316     if ( !access_ok(stack, sizeof(*stack)) )
317     {
318         printk("Guest-inaccessible memory.\n");
319         return;
320     }
321 
322     if ( v != current )
323     {
324         struct vcpu *vcpu;
325 
326         ASSERT(guest_kernel_mode(v, regs));
327         vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
328         if ( !vcpu )
329         {
330             stack_page = stack = do_page_walk(v, (unsigned long)stack);
331             if ( (unsigned long)stack < PAGE_SIZE )
332             {
333                 printk("Inaccessible guest memory.\n");
334                 return;
335             }
336             mask = PAGE_SIZE;
337         }
338     }
339 
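    /* Same walk as above, but in native 64-bit stack words. */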
340     for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
341     {
342         if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
343             break;
344         if ( __get_user(addr, stack) )
345         {
346             if ( i != 0 )
347                 printk("\n    ");
348             printk("Fault while accessing guest memory.");
349             i = 1;
350             break;
351         }
352         if ( (i != 0) && ((i % stack_words_per_line) == 0) )
353             printk("\n  ");
354         printk(" %p", _p(addr));
355         stack++;
356     }
357 
358     UNMAP_DOMAIN_PAGE(stack_page);
359 
360     if ( i == 0 )
361         printk("Stack empty.");
362     printk("\n");
363 }
364 
365 /*
366  * Notes for get_{stack,shstk}*_bottom() helpers
367  *
368  * Stack pages 1 - 4:
369  *   These are all 1-page IST stacks.  Each of these stacks have an exception
370  *   frame and saved register state at the top.  The interesting bound for a
371  *   trace is the word adjacent to this, while the bound for a dump is the
372  *   very top, including the exception frame.
373  *
374  * Stack pages 0 and 5:
375  *   Shadow stacks.  These are mapped read-only, and used by CET-SS capable
376  *   processors.  They will never contain regular stack data.
377  *
378  * Stack pages 6 and 7:
379  *   These form the primary stack, and have a cpu_info at the top.  For a
380  *   trace, the interesting bound is adjacent to the cpu_info, while for a
381  *   dump, the entire cpu_info is interesting.
382  *
383  * For the cases where the stack should not be inspected, pretend that the
384  * passed stack pointer is already out of reasonable bounds.
385  */
386 unsigned long get_stack_trace_bottom(unsigned long sp)
387 {
388     switch ( get_stack_page(sp) )
389     {
390     case 1 ... 4:
391         return ROUNDUP(sp, PAGE_SIZE) -
392             offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
393 
394     case 6 ... 7:
395         return ROUNDUP(sp, STACK_SIZE) -
396             sizeof(struct cpu_info) - sizeof(unsigned long);
397 
398     default:
399         return sp - sizeof(unsigned long);
400     }
401 }
402 
403 static unsigned long get_shstk_bottom(unsigned long sp)
404 {
405     switch ( get_stack_page(sp) )
406     {
407 #ifdef CONFIG_XEN_SHSTK
408     case 0:  return ROUNDUP(sp, IST_SHSTK_SIZE) - sizeof(unsigned long);
409     case 5:  return ROUNDUP(sp, PAGE_SIZE)      - sizeof(unsigned long);
410 #endif
411     default: return sp - sizeof(unsigned long);
412     }
413 }
414 
415 unsigned long get_stack_dump_bottom(unsigned long sp)
416 {
417     switch ( get_stack_page(sp) )
418     {
419     case 1 ... 4:
420         return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
421 
422     case 6 ... 7:
423         return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
424 
425     default:
426         return sp - sizeof(unsigned long);
427     }
428 }
429 
430 #if !defined(CONFIG_FRAME_POINTER)
431 
432 /*
433  * Stack trace from pointers found in stack, unaided by frame pointers.  For
434  * caller convenience, this has the same prototype as its alternative, and
435  * simply ignores the base pointer parameter.
436  */
437 static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp)
438 {
439     unsigned long *stack = (unsigned long *)sp, addr;
440     unsigned long *bottom = (unsigned long *)get_stack_trace_bottom(sp);
441 
442     while ( stack <= bottom )
443     {
444         addr = *stack++;
445         if ( is_active_kernel_text(addr) )
446             printk("   [<%p>] S %pS\n", _p(addr), _p(addr));
447     }
448 }
449 
450 #else
451 
452 /* Stack trace from frames in the stack, using frame pointers */
453 static void _show_trace(unsigned long sp, unsigned long bp)
454 {
455     unsigned long *frame, next, addr;
456 
457     /* Bounds for range of valid frame pointer. */
458     unsigned long low = sp, high = get_stack_trace_bottom(sp);
459 
460     /* The initial frame pointer. */
461     next = bp;
462 
463     for ( ; ; )
464     {
465         /* Valid frame pointer? */
466         if ( (next < low) || (next >= high) )
467         {
468             /*
469              * Exception stack frames have a different layout, denoted by an
470              * inverted frame pointer.
471              */
472             next = ~next;
473             if ( (next < low) || (next >= high) )
474                 break;
475             frame = (unsigned long *)next;
476             next  = frame[0];
477             addr  = frame[(offsetof(struct cpu_user_regs, rip) -
478                            offsetof(struct cpu_user_regs, rbp))
479                          / BYTES_PER_LONG];
480         }
481         else
482         {
483             /* Ordinary stack frame. */
484             frame = (unsigned long *)next;
485             next  = frame[0];
486             addr  = frame[1];
487         }
488 
489         printk("   [<%p>] F %pS\n", _p(addr), _p(addr));
490 
491         low = (unsigned long)&frame[2];
492     }
493 }
494 
495 #endif
496 
497 static void show_trace(const struct cpu_user_regs *regs)
498 {
499     unsigned long *sp = ESP_BEFORE_EXCEPTION(regs), tos = 0;
500     bool fault = false;
501 
502     printk("Xen call trace:\n");
503 
504     /* Guarded read of the stack top. */
505     asm ( "1: mov %[data], %[tos]; 2:\n"
506           ".pushsection .fixup,\"ax\"\n"
507           "3: movb $1, %[fault]; jmp 2b\n"
508           ".popsection\n"
509           _ASM_EXTABLE(1b, 3b)
510           : [tos] "+r" (tos), [fault] "+qm" (fault) : [data] "m" (*sp) );
511 
512     /*
513      * If RIP looks sensible, or the top of the stack doesn't, print RIP at
514      * the top of the stack trace.
515      */
516     if ( is_active_kernel_text(regs->rip) ||
517          !is_active_kernel_text(tos) )
518         printk("   [<%p>] R %pS\n", _p(regs->rip), _p(regs->rip));
519 
520     if ( fault )
521     {
522         printk("   [Fault on access]\n");
523         return;
524     }
525 
526     /*
527      * If RIP looks bad or the top of the stack looks good, log the top of
528      * stack as well.  Perhaps we followed a wild function pointer, or we're
529      * in a function without frame pointer, or in a function prologue before
530      * the frame pointer gets set up?  Let's assume the top of the stack is a
531      * return address; print it and skip past so _show_trace() doesn't print
532      * it again.
533      */
534     if ( !is_active_kernel_text(regs->rip) ||
535          is_active_kernel_text(tos) )
536     {
537         printk("   [<%p>] S %pS\n", _p(tos), _p(tos));
538         sp++;
539     }
540 
541     _show_trace((unsigned long)sp, regs->rbp);
542 
543     printk("\n");
544 }
545 
546 void show_stack(const struct cpu_user_regs *regs)
547 {
548     unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), *stack_bottom, addr;
549     int i;
550 
551     if ( guest_mode(regs) )
552         return show_guest_stack(current, regs);
553 
554     printk("Xen stack trace from "__OP"sp=%p:\n  ", stack);
555 
556     stack_bottom = _p(get_stack_dump_bottom(regs->rsp));
557 
558     for ( i = 0; i < (debug_stack_lines*stack_words_per_line) &&
559               (stack <= stack_bottom); i++ )
560     {
561         if ( (i != 0) && ((i % stack_words_per_line) == 0) )
562             printk("\n  ");
563         addr = *stack++;
564         printk(" %p", _p(addr));
565     }
566     if ( i == 0 )
567         printk("Stack empty.");
568     printk("\n");
569 
570     show_trace(regs);
571 }
572 
573 void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs)
574 {
575     unsigned long esp = regs->rsp;
576     unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1);
577 #ifdef MEMORY_GUARD
578     unsigned long esp_top, esp_bottom;
579 #endif
580 
581     if ( _p(curr_stack_base) != stack_base[cpu] )
582         printk("Current stack base %p differs from expected %p\n",
583                _p(curr_stack_base), stack_base[cpu]);
584 
585 #ifdef MEMORY_GUARD
586     esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
587     esp_top    = esp_bottom - PRIMARY_STACK_SIZE;
588 
589     printk("Valid stack range: %p-%p, sp=%p, tss.rsp0=%p\n",
590            (void *)esp_top, (void *)esp_bottom, (void *)esp,
591            (void *)per_cpu(tss_page, cpu).tss.rsp0);
592 
593     /*
594      * Trigger overflow trace if %esp is anywhere within the guard page, or
595      * with fewer than 512 bytes remaining on the primary stack.
596      */
597     if ( (esp > (esp_top + 512)) ||
598          (esp < (esp_top - PAGE_SIZE)) )
599     {
600         printk("No stack overflow detected. Skipping stack trace.\n");
601         return;
602     }
603 
604     if ( esp < esp_top )
605         esp = esp_top;
606 
607     printk("Xen stack overflow (dumping trace %p-%p):\n",
608            (void *)esp, (void *)esp_bottom);
609 
610     _show_trace(esp, regs->rbp);
611 
612     printk("\n");
613 #endif
614 }
615 
616 void show_execution_state(const struct cpu_user_regs *regs)
617 {
618     /* Prevent interleaving of output. */
619     unsigned long flags = console_lock_recursive_irqsave();
620 
621     show_registers(regs);
622     show_code(regs);
623     show_stack(regs);
624 
625     console_unlock_recursive_irqrestore(flags);
626 }
627 
628 void vcpu_show_execution_state(struct vcpu *v)
629 {
630     unsigned long flags;
631 
632     printk("*** Dumping Dom%d vcpu#%d state: ***\n",
633            v->domain->domain_id, v->vcpu_id);
634 
635     if ( v == current )
636     {
637         show_execution_state(guest_cpu_user_regs());
638         return;
639     }
640 
641     vcpu_pause(v); /* acceptably dangerous */
642 
643     /* Prevent interleaving of output. */
644     flags = console_lock_recursive_irqsave();
645 
646     vcpu_show_registers(v);
647     if ( guest_kernel_mode(v, &v->arch.user_regs) )
648         show_guest_stack(v, &v->arch.user_regs);
649 
650     console_unlock_recursive_irqrestore(flags);
651 
652     vcpu_unpause(v);
653 }
654 
655 static cpumask_t show_state_mask;
656 static bool opt_show_all;
657 boolean_param("async-show-all", opt_show_all);
658 
659 static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
660 {
661     if ( !cpumask_test_cpu(cpu, &show_state_mask) )
662         return 0;
663 
664     if ( opt_show_all )
665         show_execution_state(regs);
666     else
667         printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs,
668                regs->rip, guest_mode(regs) ? NULL : _p(regs->rip));
669     cpumask_clear_cpu(cpu, &show_state_mask);
670 
671     return 1;
672 }
673 
674 const char *trapstr(unsigned int trapnr)
675 {
676     static const char * const strings[] = {
677         "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
678         "invalid opcode", "device not available", "double fault",
679         "coprocessor segment", "invalid tss", "segment not found",
680         "stack error", "general protection fault", "page fault",
681         "spurious interrupt", "coprocessor error", "alignment check",
682         "machine check", "simd error", "virtualisation exception"
683     };
684 
685     return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
686 }
687 
688 static const char *vec_name(unsigned int vec)
689 {
690     static const char names[][4] = {
691 #define P(x) [X86_EXC_ ## x] = "#" #x
692 #define N(x) [X86_EXC_ ## x] = #x
693         P(DE),  P(DB),  N(NMI), P(BP),  P(OF),  P(BR),  P(UD),  P(NM),
694         P(DF),  N(CSO), P(TS),  P(NP),  P(SS),  P(GP),  P(PF),  N(SPV),
695         P(MF),  P(AC),  P(MC),  P(XM),  P(VE),  P(CP),
696                                         P(HV),  P(VC),  P(SX),
697 #undef N
698 #undef P
699     };
700 
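    /* e.g. X86_EXC_PF yields "#PF"; X86_EXC_NMI (no '#' prefix) yields "NMI". */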
701     return (vec < ARRAY_SIZE(names) && names[vec][0]) ? names[vec] : "???";
702 }
703 
704 /*
705  * This is called for faults at very unexpected times (e.g., when interrupts
706  * are disabled). In such situations we can't do much that is safe. We try to
707  * print out some tracing and then we just spin.
708  */
709 void fatal_trap(const struct cpu_user_regs *regs, bool show_remote)
710 {
711     static DEFINE_PER_CPU(char, depth);
712     unsigned int trapnr = regs->entry_vector;
713 
714     /* Set AC to reduce chance of further SMAP faults */
715     stac();
716 
717     /*
718      * In some cases, we can end up in a vicious cycle of fatal_trap()s
719      * within fatal_trap()s. We give the problem a couple of iterations to
720      * bottom out, and then we just panic.
721      */
722     if ( ++this_cpu(depth) < 3 )
723     {
724         watchdog_disable();
725         console_start_sync();
726 
727         show_execution_state(regs);
728 
729         if ( trapnr == TRAP_page_fault )
730             show_page_walk(read_cr2());
731 
732         if ( show_remote )
733         {
734             unsigned int msecs, pending;
735 
736             cpumask_andnot(&show_state_mask, &cpu_online_map,
737                            cpumask_of(smp_processor_id()));
738             set_nmi_callback(nmi_show_execution_state);
739             /* Ensure new callback is set before sending out the NMI. */
740             smp_wmb();
741             smp_send_nmi_allbutself();
742 
743             /* Wait at most 10ms for some other CPU to respond. */
744             msecs = 10;
745             pending = cpumask_weight(&show_state_mask);
746             while ( pending && msecs-- )
747             {
748                 unsigned int left;
749 
750                 mdelay(1);
751                 left = cpumask_weight(&show_state_mask);
752                 if ( left < pending )
753                 {
754                     pending = left;
755                     msecs = 10;
756                 }
757             }
758         }
759     }
760 
761     panic("FATAL TRAP: vec %u, %s[%04x]%s\n",
762           trapnr, vec_name(trapnr), regs->error_code,
763           (regs->eflags & X86_EFLAGS_IF) ? "" : " IN INTERRUPT CONTEXT");
764 }
765 
766 static void do_reserved_trap(struct cpu_user_regs *regs)
767 {
768     unsigned int trapnr = regs->entry_vector;
769 
770     if ( debugger_trap_fatal(trapnr, regs) )
771         return;
772 
773     show_execution_state(regs);
774     panic("FATAL RESERVED TRAP: vec %u, %s[%04x]\n",
775           trapnr, vec_name(trapnr), regs->error_code);
776 }
777 
778 static void extable_shstk_fixup(struct cpu_user_regs *regs, unsigned long fixup)
779 {
780     unsigned long ssp, *ptr, *base;
781 
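    /*
     * RDSSP is a NOP (leaving its destination unmodified) when CET-SS is not
     * active.  Seed the destination with 1, which can never be a valid
     * 8-byte-aligned SSP, so that case can be detected just below.
     */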
782     asm ( "rdsspq %0" : "=r" (ssp) : "0" (1) );
783     if ( ssp == 1 )
784         return;
785 
786     ptr = _p(ssp);
787     base = _p(get_shstk_bottom(ssp));
788 
789     for ( ; ptr < base; ++ptr )
790     {
791         /*
792          * Search for %rip.  The shstk currently looks like this:
793          *
794          *   ...  [Likely pointed to by SSP]
795          *   %cs  [== regs->cs]
796          *   %rip [== regs->rip]
797          *   SSP  [Likely points to 3 slots higher, above %cs]
798          *   ...  [call tree to this function, likely 2/3 slots]
799          *
800          * and we want to overwrite %rip with fixup.  There are two
801          * complications:
802          *   1) We can't depend on SSP values, because they won't differ by 3
803          *      slots if the exception is taken on an IST stack.
804          *   2) There are synthetic (unrealistic but not impossible) scenarios
805          *      where %rip can end up in the call tree to this function, so we
806          *      can't check against regs->rip alone.
807          *
808          * Check for both regs->rip and regs->cs matching.
809          */
810         if ( ptr[0] == regs->rip && ptr[1] == regs->cs )
811         {
812             asm ( "wrssq %[fix], %[stk]"
813                   : [stk] "=m" (ptr[0])
814                   : [fix] "r" (fixup) );
815             return;
816         }
817     }
818 
819     /*
820      * We failed to locate and fix up the shadow IRET frame.  This could be
821      * due to shadow stack corruption, or bad logic above.  We cannot continue
822      * executing the interrupted context.
823      */
824     BUG();
825 }
826 
827 static bool extable_fixup(struct cpu_user_regs *regs, bool print)
828 {
829     unsigned long fixup = search_exception_table(regs);
830 
831     if ( unlikely(fixup == 0) )
832         return false;
833 
834     /*
835      * Don't use dprintk() because the __FILE__ reference is unhelpful.
836      * Can currently be triggered by guests.  Make sure we ratelimit.
837      */
838     if ( IS_ENABLED(CONFIG_DEBUG) && print )
839         printk(XENLOG_GUEST XENLOG_WARNING "Fixup %s[%04x]: %p [%ps] -> %p\n",
840                vec_name(regs->entry_vector), regs->error_code,
841                _p(regs->rip), _p(regs->rip), _p(fixup));
842 
843     if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
844         extable_shstk_fixup(regs, fixup);
845 
846     regs->rip = fixup;
847     this_cpu(last_extable_addr) = regs->rip;
848 
849     return true;
850 }
851 
852 static void do_trap(struct cpu_user_regs *regs)
853 {
854     unsigned int trapnr = regs->entry_vector;
855 
856     if ( regs->error_code & X86_XEC_EXT )
857         goto hardware_trap;
858 
859     if ( debugger_trap_entry(trapnr, regs) )
860         return;
861 
862     ASSERT(trapnr < 32);
863 
864     if ( guest_mode(regs) )
865     {
866         pv_inject_hw_exception(trapnr,
867                                (TRAP_HAVE_EC & (1u << trapnr))
868                                ? regs->error_code : X86_EVENT_NO_EC);
869         return;
870     }
871 
872     if ( likely(extable_fixup(regs, true)) )
873         return;
874 
875  hardware_trap:
876     if ( debugger_trap_fatal(trapnr, regs) )
877         return;
878 
879     show_execution_state(regs);
880     panic("FATAL TRAP: vector = %d (%s)\n"
881           "[error_code=%04x]\n",
882           trapnr, trapstr(trapnr), regs->error_code);
883 }
884 
885 int guest_rdmsr_xen(const struct vcpu *v, uint32_t idx, uint64_t *val)
886 {
887     const struct domain *d = v->domain;
888     /* Optionally shift out of the way of Viridian architectural MSRs. */
889     uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
890 
891     switch ( idx - base )
892     {
893     case 0: /* Write hypercall page MSR.  Read as zero. */
894         *val = 0;
895         return X86EMUL_OKAY;
896     }
897 
898     return X86EMUL_EXCEPTION;
899 }
900 
901 int guest_wrmsr_xen(struct vcpu *v, uint32_t idx, uint64_t val)
902 {
903     struct domain *d = v->domain;
904     /* Optionally shift out of the way of Viridian architectural MSRs. */
905     uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
906 
907     switch ( idx - base )
908     {
909     case 0: /* Write hypercall page */
910     {
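        /*
         * The written value encodes the target: bits 11:0 select a page index
         * (only index 0 is supported) and the remaining bits give the GFN.
         */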
911         void *hypercall_page;
912         unsigned long gmfn = val >> PAGE_SHIFT;
913         unsigned int page_index = val & (PAGE_SIZE - 1);
914         struct page_info *page;
915         p2m_type_t t;
916 
917         if ( page_index > 0 )
918         {
919             gdprintk(XENLOG_WARNING,
920                      "wrmsr hypercall page index %#x unsupported\n",
921                      page_index);
922             return X86EMUL_EXCEPTION;
923         }
924 
925         page = get_page_from_gfn(d, gmfn, &t, P2M_ALLOC);
926 
927         if ( !page || !get_page_type(page, PGT_writable_page) )
928         {
929             if ( page )
930                 put_page(page);
931 
932             if ( p2m_is_paging(t) )
933             {
934                 p2m_mem_paging_populate(d, _gfn(gmfn));
935                 return X86EMUL_RETRY;
936             }
937 
938             gdprintk(XENLOG_WARNING,
939                      "Bad GMFN %lx (MFN %#"PRI_mfn") to MSR %08x\n",
940                      gmfn, mfn_x(page ? page_to_mfn(page) : INVALID_MFN), base);
941             return X86EMUL_EXCEPTION;
942         }
943 
944         hypercall_page = __map_domain_page(page);
945         init_hypercall_page(d, hypercall_page);
946         unmap_domain_page(hypercall_page);
947 
948         put_page_and_type(page);
949         return X86EMUL_OKAY;
950     }
951 
952     default:
953         return X86EMUL_EXCEPTION;
954     }
955 }
956 
957 void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf,
958                              uint32_t subleaf, struct cpuid_leaf *res)
959 {
960     const struct domain *d = v->domain;
961     const struct cpuid_policy *p = d->arch.cpuid;
962     uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
963     uint32_t idx  = leaf - base;
964     unsigned int limit = is_viridian_domain(d) ? p->hv2_limit : p->hv_limit;
965 
966     if ( limit == 0 )
967         /* Default number of leaves */
968         limit = XEN_CPUID_MAX_NUM_LEAVES;
969     else
970         /* Clamp toolstack value between 2 and MAX_NUM_LEAVES. */
971         limit = min(max(limit, 2u), XEN_CPUID_MAX_NUM_LEAVES + 0u);
972 
973     if ( idx > limit )
974         return;
975 
976     switch ( idx )
977     {
978     case 0:
979         res->a = base + limit; /* Largest leaf */
980         res->b = XEN_CPUID_SIGNATURE_EBX;
981         res->c = XEN_CPUID_SIGNATURE_ECX;
982         res->d = XEN_CPUID_SIGNATURE_EDX;
983         break;
984 
985     case 1:
986         res->a = (xen_major_version() << 16) | xen_minor_version();
987         break;
988 
989     case 2:
990         res->a = 1;            /* Number of hypercall-transfer pages */
991                                /* MSR base address */
992         res->b = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
993         if ( is_pv_domain(d) ) /* Features */
994             res->c |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
995         break;
996 
997     case 3: /* Time leaf. */
998         switch ( subleaf )
999         {
1000         case 0: /* features */
1001             res->a = ((d->arch.vtsc << 0) |
1002                       (!!host_tsc_is_safe() << 1) |
1003                       (!!boot_cpu_has(X86_FEATURE_RDTSCP) << 2));
1004             res->b = d->arch.tsc_mode;
1005             res->c = d->arch.tsc_khz;
1006             res->d = d->arch.incarnation;
1007             break;
1008 
1009         case 1: /* scale and offset */
1010         {
1011             uint64_t offset;
1012 
1013             if ( !d->arch.vtsc )
1014                 offset = d->arch.vtsc_offset;
1015             else
1016                 /* offset already applied to value returned by virtual rdtscp */
1017                 offset = 0;
1018             res->a = offset;
1019             res->b = offset >> 32;
1020             res->c = d->arch.vtsc_to_ns.mul_frac;
1021             res->d = (s8)d->arch.vtsc_to_ns.shift;
1022             break;
1023         }
1024 
1025         case 2: /* physical cpu_khz */
1026             res->a = cpu_khz;
1027             break;
1028         }
1029         break;
1030 
1031     case 4: /* HVM hypervisor leaf. */
1032         if ( !is_hvm_domain(d) || subleaf != 0 )
1033             break;
1034 
1035         if ( cpu_has_vmx_apic_reg_virt )
1036             res->a |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
1037 
1038         /*
1039          * We want to claim that x2APIC is virtualized if APIC MSR accesses
1040          * are not intercepted. When all three of these are true both rdmsr
1041          * and wrmsr in the guest will run without VMEXITs (see
1042          * vmx_vlapic_msr_changed()).
1043          */
1044         if ( cpu_has_vmx_virtualize_x2apic_mode &&
1045              cpu_has_vmx_apic_reg_virt &&
1046              cpu_has_vmx_virtual_intr_delivery )
1047             res->a |= XEN_HVM_CPUID_X2APIC_VIRT;
1048 
1049         /*
1050          * Indicate that memory mapped from other domains (either grants or
1051          * foreign pages) has valid IOMMU entries.
1052          */
1053         res->a |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
1054 
1055         /* Indicate presence of vcpu id and set it in ebx */
1056         res->a |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
1057         res->b = v->vcpu_id;
1058 
1059         /* Indicate presence of domain id and set it in ecx */
1060         res->a |= XEN_HVM_CPUID_DOMID_PRESENT;
1061         res->c = d->domain_id;
1062 
1063         break;
1064 
1065     case 5: /* PV-specific parameters */
1066         if ( is_hvm_domain(d) || subleaf != 0 )
1067             break;
1068 
1069         res->b = flsl(get_upper_mfn_bound()) + PAGE_SHIFT;
1070         break;
1071 
1072     default:
1073         ASSERT_UNREACHABLE();
1074     }
1075 }
1076 
1077 void do_invalid_op(struct cpu_user_regs *regs)
1078 {
1079     const struct bug_frame *bug = NULL;
1080     u8 bug_insn[2];
1081     const char *prefix = "", *filename, *predicate, *eip = (char *)regs->rip;
1082     unsigned long fixup;
1083     int id = -1, lineno;
1084     const struct virtual_region *region;
1085 
1086     if ( debugger_trap_entry(TRAP_invalid_op, regs) )
1087         return;
1088 
1089     if ( likely(guest_mode(regs)) )
1090     {
1091         if ( pv_emulate_invalid_op(regs) )
1092             pv_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
1093         return;
1094     }
1095 
1096     if ( !is_active_kernel_text(regs->rip) ||
1097          __copy_from_user(bug_insn, eip, sizeof(bug_insn)) ||
1098          memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) )
1099         goto die;
1100 
1101     region = find_text_region(regs->rip);
1102     if ( region )
1103     {
1104         for ( id = 0; id < BUGFRAME_NR; id++ )
1105         {
1106             const struct bug_frame *b;
1107             unsigned int i;
1108 
1109             for ( i = 0, b = region->frame[id].bugs;
1110                   i < region->frame[id].n_bugs; b++, i++ )
1111             {
1112                 if ( bug_loc(b) == eip )
1113                 {
1114                     bug = b;
1115                     goto found;
1116                 }
1117             }
1118         }
1119     }
1120 
1121  found:
1122     if ( !bug )
1123         goto die;
1124     eip += sizeof(bug_insn);
1125     if ( id == BUGFRAME_run_fn )
1126     {
1127         void (*fn)(struct cpu_user_regs *) = bug_ptr(bug);
1128 
1129         fn(regs);
1130         regs->rip = (unsigned long)eip;
1131         return;
1132     }
1133 
1134     /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
1135     filename = bug_ptr(bug);
1136     if ( !is_kernel(filename) && !is_patch(filename) )
1137         goto die;
1138     fixup = strlen(filename);
1139     if ( fixup > 50 )
1140     {
1141         filename += fixup - 47;
1142         prefix = "...";
1143     }
1144     lineno = bug_line(bug);
1145 
1146     switch ( id )
1147     {
1148     case BUGFRAME_warn:
1149         printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno);
1150         show_execution_state(regs);
1151         regs->rip = (unsigned long)eip;
1152         return;
1153 
1154     case BUGFRAME_bug:
1155         printk("Xen BUG at %s%s:%d\n", prefix, filename, lineno);
1156 
1157         if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1158             return;
1159 
1160         show_execution_state(regs);
1161         panic("Xen BUG at %s%s:%d\n", prefix, filename, lineno);
1162 
1163     case BUGFRAME_assert:
1164         /* ASSERT: decode the predicate string pointer. */
1165         predicate = bug_msg(bug);
1166         if ( !is_kernel(predicate) && !is_patch(predicate) )
1167             predicate = "<unknown>";
1168 
1169         printk("Assertion '%s' failed at %s%s:%d\n",
1170                predicate, prefix, filename, lineno);
1171 
1172         if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1173             return;
1174 
1175         show_execution_state(regs);
1176         panic("Assertion '%s' failed at %s%s:%d\n",
1177               predicate, prefix, filename, lineno);
1178     }
1179 
1180  die:
1181     if ( likely(extable_fixup(regs, true)) )
1182         return;
1183 
1184     if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1185         return;
1186 
1187     show_execution_state(regs);
1188     panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op);
1189 }
1190 
1191 void do_int3(struct cpu_user_regs *regs)
1192 {
1193     if ( debugger_trap_entry(TRAP_int3, regs) )
1194         return;
1195 
1196     if ( !guest_mode(regs) )
1197     {
1198         if ( likely(extable_fixup(regs, true)) )
1199             return;
1200 
1201         if ( !debugger_trap_fatal(TRAP_int3, regs) )
1202             printk(XENLOG_DEBUG "Hit embedded breakpoint at %p [%ps]\n",
1203                    _p(regs->rip), _p(regs->rip));
1204 
1205         return;
1206     }
1207 
1208     pv_inject_hw_exception(TRAP_int3, X86_EVENT_NO_EC);
1209 }
1210 
1211 #ifdef CONFIG_PV
1212 static int handle_ldt_mapping_fault(unsigned int offset,
1213                                     struct cpu_user_regs *regs)
1214 {
1215     struct vcpu *curr = current;
1216 
1217     /*
1218      * Not in PV context?  Something is very broken.  Leave it to the #PF
1219      * handler, which will probably result in a panic().
1220      */
1221     if ( !is_pv_vcpu(curr) )
1222         return 0;
1223 
1224     /* Try to copy a mapping from the guest's LDT, if it is valid. */
1225     if ( likely(pv_map_ldt_shadow_page(offset)) )
1226     {
1227         if ( guest_mode(regs) )
1228             trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1229                                 regs->rip, offset);
1230     }
1231     else
1232     {
1233         /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1234         if ( !guest_mode(regs) )
1235             return 0;
1236 
1237         /* Access would have become non-canonical? Pass #GP[sel] back. */
1238         if ( unlikely(!is_canonical_address(curr->arch.pv.ldt_base + offset)) )
1239         {
1240             uint16_t ec = (offset & ~(X86_XEC_EXT | X86_XEC_IDT)) | X86_XEC_TI;
1241 
1242             pv_inject_hw_exception(TRAP_gp_fault, ec);
1243         }
1244         else
1245             /* else pass the #PF back, with adjusted %cr2. */
1246             pv_inject_page_fault(regs->error_code,
1247                                  curr->arch.pv.ldt_base + offset);
1248     }
1249 
1250     return EXCRET_fault_fixed;
1251 }
1252 
1253 static int handle_gdt_ldt_mapping_fault(unsigned long offset,
1254                                         struct cpu_user_regs *regs)
1255 {
1256     struct vcpu *curr = current;
1257     /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1258     unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1259     unsigned int vcpu_area   = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1260 
1261     /*
1262      * If the fault is in another vcpu's area, it cannot be due to
1263      * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and
1264      * indeed we have to since pv_map_ldt_shadow_page() works correctly only on
1265      * accesses to a vcpu's own area.
1266      */
1267     if ( vcpu_area != curr->vcpu_id )
1268         return 0;
1269 
1270     /* Byte offset within the gdt/ldt sub-area. */
1271     offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1272 
1273     if ( likely(is_ldt_area) )
1274         return handle_ldt_mapping_fault(offset, regs);
1275 
1276     /* GDT fault: handle the fault as #GP[sel]. */
1277     regs->error_code = offset & ~(X86_XEC_EXT | X86_XEC_IDT | X86_XEC_TI);
1278     do_general_protection(regs);
1279 
1280     return EXCRET_fault_fixed;
1281 }
1282 #endif
1283 
1284 #define IN_HYPERVISOR_RANGE(va) \
1285     (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1286 
1287 enum pf_type {
1288     real_fault,
1289     smep_fault,
1290     smap_fault,
1291     spurious_fault
1292 };
1293 
1294 static enum pf_type __page_fault_type(unsigned long addr,
1295                                       const struct cpu_user_regs *regs)
1296 {
1297     unsigned long mfn, cr3 = read_cr3();
1298     l4_pgentry_t l4e, *l4t;
1299     l3_pgentry_t l3e, *l3t;
1300     l2_pgentry_t l2e, *l2t;
1301     l1_pgentry_t l1e, *l1t;
1302     unsigned int required_flags, disallowed_flags, page_user;
1303     unsigned int error_code = regs->error_code;
1304 
1305     /*
1306      * We do not take spurious page faults in IRQ handlers as we do not
1307      * modify page tables in IRQ context. We therefore bail here because
1308      * map_domain_page() is not IRQ-safe.
1309      */
1310     if ( in_irq() )
1311         return real_fault;
1312 
1313     required_flags  = _PAGE_PRESENT;
1314     if ( error_code & PFEC_write_access )
1315         required_flags |= _PAGE_RW;
1316     if ( error_code & PFEC_user_mode )
1317         required_flags |= _PAGE_USER;
1318 
1319     disallowed_flags = 0;
1320     if ( error_code & PFEC_insn_fetch )
1321         disallowed_flags |= _PAGE_NX_BIT;
1322 
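    /*
     * Accumulate _PAGE_USER across every level of the walk: the final mapping
     * is user-accessible only if all levels have the bit set.
     */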
1323     page_user = _PAGE_USER;
1324 
1325     mfn = cr3 >> PAGE_SHIFT;
1326 
1327     l4t = map_domain_page(_mfn(mfn));
1328     l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1329     mfn = l4e_get_pfn(l4e);
1330     unmap_domain_page(l4t);
1331     if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1332          (l4e_get_flags(l4e) & disallowed_flags) )
1333         return real_fault;
1334     page_user &= l4e_get_flags(l4e);
1335 
1336     l3t  = map_domain_page(_mfn(mfn));
1337     l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1338     mfn = l3e_get_pfn(l3e);
1339     unmap_domain_page(l3t);
1340     if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1341          (l3e_get_flags(l3e) & disallowed_flags) )
1342         return real_fault;
1343     page_user &= l3e_get_flags(l3e);
1344     if ( l3e_get_flags(l3e) & _PAGE_PSE )
1345         goto leaf;
1346 
1347     l2t = map_domain_page(_mfn(mfn));
1348     l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1349     mfn = l2e_get_pfn(l2e);
1350     unmap_domain_page(l2t);
1351     if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1352          (l2e_get_flags(l2e) & disallowed_flags) )
1353         return real_fault;
1354     page_user &= l2e_get_flags(l2e);
1355     if ( l2e_get_flags(l2e) & _PAGE_PSE )
1356         goto leaf;
1357 
1358     l1t = map_domain_page(_mfn(mfn));
1359     l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1360     mfn = l1e_get_pfn(l1e);
1361     unmap_domain_page(l1t);
1362     if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1363          (l1e_get_flags(l1e) & disallowed_flags) )
1364         return real_fault;
1365     page_user &= l1e_get_flags(l1e);
1366 
1367  leaf:
1368     if ( page_user )
1369     {
1370         unsigned long cr4 = read_cr4();
1371         /*
1372          * Supervisor Mode Execution Prevention (SMEP):
1373          * Disallow supervisor execution from user-accessible mappings
1374          */
1375         if ( (cr4 & X86_CR4_SMEP) &&
1376              ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) )
1377             return smep_fault;
1378 
1379         /*
1380          * Supervisor Mode Access Prevention (SMAP):
1381          * Disallow supervisor access to user-accessible mappings
1382          * A fault is considered as an SMAP violation if the following
1383          * conditions are true:
1384          *   - X86_CR4_SMAP is set in CR4
1385          *   - A user page is being accessed
1386          *   - CPL=3 or X86_EFLAGS_AC is clear
1387          *   - Page fault in kernel mode
1388          */
1389         if ( (cr4 & X86_CR4_SMAP) && !(error_code & PFEC_user_mode) &&
1390              (((regs->cs & 3) == 3) || !(regs->eflags & X86_EFLAGS_AC)) )
1391             return smap_fault;
1392     }
1393 
1394     return spurious_fault;
1395 }
1396 
1397 static enum pf_type spurious_page_fault(unsigned long addr,
1398                                         const struct cpu_user_regs *regs)
1399 {
1400     unsigned long flags;
1401     enum pf_type pf_type;
1402 
1403     /*
1404      * Disabling interrupts prevents TLB flushing, and hence prevents
1405      * page tables from becoming invalid under our feet during the walk.
1406      */
1407     local_irq_save(flags);
1408     pf_type = __page_fault_type(addr, regs);
1409     local_irq_restore(flags);
1410 
1411     return pf_type;
1412 }
1413 
1414 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1415 {
1416     struct vcpu   *v = current;
1417     struct domain *d = v->domain;
1418 
1419     /* No fixups in interrupt context or when interrupts are disabled. */
1420     if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1421         return 0;
1422 
1423     if ( !(regs->error_code & PFEC_page_present) &&
1424           (pagefault_by_memadd(addr, regs)) )
1425         return handle_memadd_fault(addr, regs);
1426 
1427     if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1428     {
1429 #ifdef CONFIG_PV
1430         if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) &&
1431              (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1432             return handle_gdt_ldt_mapping_fault(
1433                 addr - GDT_LDT_VIRT_START, regs);
1434 #endif
1435         return 0;
1436     }
1437 
1438     if ( guest_kernel_mode(v, regs) &&
1439          !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
1440          (regs->error_code & PFEC_write_access) )
1441     {
1442         bool ptwr, mmio_ro;
1443 
1444         ptwr = VM_ASSIST(d, writable_pagetables) &&
1445                /* Do not insist on PFEC_page_present here: under shadow paging
1446                   the page may legitimately be absent from the shadow tables. */
1447                (paging_mode_enabled(d) ||
1448                 (regs->error_code & PFEC_page_present));
1449 
1450         mmio_ro = is_hardware_domain(d) &&
1451                   (regs->error_code & PFEC_page_present);
1452 
1453         if ( (ptwr || mmio_ro) && pv_ro_page_fault(addr, regs) )
1454             return EXCRET_fault_fixed;
1455     }
1456 
1457     /*
1458      * For non-external shadowed guests, we fix up both their own pagefaults
1459      * and Xen's, since they share the pagetables.  This includes hypervisor
1460      * faults, e.g. from copy_to_user().
1461      */
1462     if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1463     {
1464         int ret = paging_fault(addr, regs);
1465 
1466         if ( ret == EXCRET_fault_fixed )
1467             trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->rip, addr);
1468         return ret;
1469     }
1470 
1471     return 0;
1472 }
1473 
1474 void do_page_fault(struct cpu_user_regs *regs)
1475 {
1476     unsigned long addr;
1477     unsigned int error_code;
1478 
1479     addr = read_cr2();
1480 
1481     /* fixup_page_fault() might change regs->error_code, so cache it here. */
1482     error_code = regs->error_code;
1483 
1484     if ( debugger_trap_entry(TRAP_page_fault, regs) )
1485         return;
1486 
1487     perfc_incr(page_faults);
1488 
1489     /* Any shadow stack access fault is a bug in Xen. */
1490     if ( error_code & PFEC_shstk )
1491         goto fatal;
1492 
1493     if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1494         return;
1495 
1496     /*
1497      * Xen doesn't have reserved bits set in its pagetables, nor do we permit
1498      * PV guests to write any.  Such entries would generally be vulnerable to
1499      * the L1TF sidechannel.
1500      *
1501      * The shadow pagetable logic may use reserved bits as part of
1502      * SHOPT_FAST_FAULT_PATH.  Pagefaults arising from these will be resolved
1503      * via the fixup_page_fault() path.
1504      *
1505      * Anything remaining is an error, constituting corruption of the
1506      * pagetables and probably an L1TF vulnerable gadget.
1507      */
1508     if ( error_code & PFEC_reserved_bit )
1509         goto fatal;
1510 
1511     if ( unlikely(!guest_mode(regs)) )
1512     {
1513         enum pf_type pf_type = spurious_page_fault(addr, regs);
1514 
1515         if ( (pf_type == smep_fault) || (pf_type == smap_fault) )
1516         {
1517             console_start_sync();
1518             printk("Xen SM%cP violation\n",
1519                    (pf_type == smep_fault) ? 'E' : 'A');
1520             fatal_trap(regs, 0);
1521         }
1522 
1523         if ( pf_type != real_fault )
1524             return;
1525 
1526         if ( likely(extable_fixup(regs, false)) )
1527         {
1528             perfc_incr(copy_user_faults);
1529             return;
1530         }
1531 
1532     fatal:
1533         if ( debugger_trap_fatal(TRAP_page_fault, regs) )
1534             return;
1535 
1536         show_execution_state(regs);
1537         show_page_walk(addr);
1538         panic("FATAL PAGE FAULT\n"
1539               "[error_code=%04x]\n"
1540               "Faulting linear address: %p\n",
1541               error_code, _p(addr));
1542     }
1543 
1544     pv_inject_page_fault(regs->error_code, addr);
1545 }
1546 
1547 /*
1548  * Early #PF handler to print CR2, error code, and stack.
1549  *
1550  * We also deal with spurious faults here, even though they should never happen
1551  * during early boot (an issue was seen once, but was most likely a hardware
1552  * problem).
1553  */
1554 void __init do_early_page_fault(struct cpu_user_regs *regs)
1555 {
1556     static unsigned int __initdata stuck;
1557     static unsigned long __initdata prev_eip, prev_cr2;
1558     unsigned long cr2 = read_cr2();
1559 
1560     BUG_ON(smp_processor_id() != 0);
1561 
1562     if ( (regs->rip != prev_eip) || (cr2 != prev_cr2) )
1563     {
1564         prev_eip = regs->rip;
1565         prev_cr2 = cr2;
1566         stuck    = 0;
1567         return;
1568     }
1569 
1570     if ( stuck++ == 1000 )
1571     {
1572         console_start_sync();
1573         printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1574                regs->cs, _p(regs->rip), _p(cr2), regs->error_code);
1575         fatal_trap(regs, 0);
1576     }
1577 }
1578 
1579 void do_general_protection(struct cpu_user_regs *regs)
1580 {
1581 #ifdef CONFIG_PV
1582     struct vcpu *v = current;
1583 #endif
1584 
1585     if ( debugger_trap_entry(TRAP_gp_fault, regs) )
1586         return;
1587 
1588     if ( regs->error_code & X86_XEC_EXT )
1589         goto hardware_gp;
1590 
1591     if ( !guest_mode(regs) )
1592         goto gp_in_kernel;
1593 
1594 #ifdef CONFIG_PV
1595     /*
1596      * Cunning trick to allow arbitrary "INT n" handling.
1597      *
1598      * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1599      * instruction from trapping to the appropriate vector, when that might not
1600      * be expected by Xen or the guest OS. For example, that entry might be for
1601      * a fault handler (unlike traps, faults don't increment EIP), or might
1602      * expect an error code on the stack (which a software trap never
1603      * provides), or might be a hardware interrupt handler that doesn't like
1604      * being called spuriously.
1605      *
1606      * Instead, a GPF occurs with the faulting IDT vector in the error code.
1607      * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1608      * clear (which got already checked above) to indicate that it's a software
1609      * fault, not a hardware one.
1610      *
1611      * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1612      * okay because they can only be triggered by an explicit DPL-checked
1613      * instruction. The DPL specified by the guest OS for these vectors is NOT
1614      * CHECKED!!
1615      */
1616     if ( regs->error_code & X86_XEC_IDT )
1617     {
1618         /* This fault must be due to <INT n> instruction. */
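        /*
         * IDT-sourced error code layout: bit 0 (EXT) is clear for a software
         * INT, bit 1 (IDT) is set, and bits 15:3 hold the selector index,
         * i.e. the vector, hence the shift by 3 below.
         */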
1619         uint8_t vector = regs->error_code >> 3;
1620         const struct trap_info *ti = &v->arch.pv.trap_ctxt[vector];
1621 
1622         if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1623         {
1624             regs->rip += 2;
1625             pv_inject_sw_interrupt(vector);
1626             return;
1627         }
1628     }
1629     else if ( is_pv_32bit_vcpu(v) && regs->error_code )
1630     {
1631         pv_emulate_gate_op(regs);
1632         return;
1633     }
1634 
1635     /* Emulate some simple privileged and I/O instructions. */
1636     if ( (regs->error_code == 0) &&
1637          pv_emulate_privileged_op(regs) )
1638     {
1639         trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
1640         return;
1641     }
1642 
1643     /* Pass on GPF as is. */
1644     pv_inject_hw_exception(TRAP_gp_fault, regs->error_code);
1645     return;
1646 #endif
1647 
1648  gp_in_kernel:
1649 
1650     if ( likely(extable_fixup(regs, true)) )
1651         return;
1652 
1653  hardware_gp:
1654     if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
1655         return;
1656 
1657     show_execution_state(regs);
1658     panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
1659 }
1660 
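/*
 * The NMI reason handling below uses the legacy System Control Port B
 * (I/O port 0x61).  On read, bit 7 signals a PCI SERR#/parity NMI and bit 6
 * an I/O channel check (IOCHK) NMI.  On write, setting bit 2 clears and
 * disables SERR# reporting, and setting bit 3 does the same for IOCHK;
 * clearing those bits re-enables the respective source.
 */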
1661 static void pci_serr_softirq(void)
1662 {
1663     printk("\n\nNMI - PCI system error (SERR)\n");
1664     outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
1665 }
1666 
1667 static void nmi_hwdom_report(unsigned int reason_idx)
1668 {
1669     struct domain *d = hardware_domain;
1670 
1671     if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
1672         return;
1673 
1674     set_bit(reason_idx, nmi_reason(d));
1675 
1676     pv_raise_nmi(d->vcpu[0]);
1677 }
1678 
1679 static void pci_serr_error(const struct cpu_user_regs *regs)
1680 {
1681     outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */
1682 
1683     switch ( opt_nmi[0] )
1684     {
1685     case 'd': /* 'dom0' */
1686         nmi_hwdom_report(_XEN_NMIREASON_pci_serr);
1687         /* fallthrough */
1688     case 'i': /* 'ignore' */
1689         /* Would like to print a diagnostic here but can't call printk()
1690            from NMI context -- raise a softirq instead. */
1691         raise_softirq(PCI_SERR_SOFTIRQ);
1692         break;
1693     default:  /* 'fatal' */
1694         console_force_unlock();
1695         printk("\n\nNMI - PCI system error (SERR)\n");
1696         fatal_trap(regs, 0);
1697     }
1698 }
1699 
1700 static void io_check_error(const struct cpu_user_regs *regs)
1701 {
1702     switch ( opt_nmi[0] )
1703     {
1704     case 'd': /* 'dom0' */
1705         nmi_hwdom_report(_XEN_NMIREASON_io_error);
1706     case 'i': /* 'ignore' */
1707         break;
1708     default:  /* 'fatal' */
1709         console_force_unlock();
1710         printk("\n\nNMI - I/O ERROR\n");
1711         fatal_trap(regs, 0);
1712     }
1713 
1714     outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1715     mdelay(1);
1716     outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1717 }
1718 
1719 static void unknown_nmi_error(const struct cpu_user_regs *regs,
1720                               unsigned char reason)
1721 {
1722     switch ( opt_nmi[0] )
1723     {
1724     case 'd': /* 'dom0' */
1725         nmi_hwdom_report(_XEN_NMIREASON_unknown);
1726     case 'i': /* 'ignore' */
1727         break;
1728     default:  /* 'fatal' */
1729         console_force_unlock();
1730         printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1731         printk("Do you have a strange power saving mode enabled?\n");
1732         fatal_trap(regs, 0);
1733     }
1734 }
1735 
1736 static int dummy_nmi_callback(const struct cpu_user_regs *regs, int cpu)
1737 {
1738     return 0;
1739 }
1740 
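/*
 * Hook allowing other subsystems to intercept NMIs ahead of the default
 * reason-port processing in do_nmi(); installed and removed via
 * set_nmi_callback() / unset_nmi_callback() below.
 */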
1741 static nmi_callback_t *nmi_callback = dummy_nmi_callback;
1742 
1743 DEFINE_PER_CPU(unsigned int, nmi_count);
1744 
1745 void do_nmi(const struct cpu_user_regs *regs)
1746 {
1747     unsigned int cpu = smp_processor_id();
1748     unsigned char reason = 0;
1749     bool handle_unknown = false;
1750 
1751     this_cpu(nmi_count)++;
1752     nmi_enter();
1753 
1754     if ( nmi_callback(regs, cpu) )
1755     {
1756         nmi_exit();
1757         return;
1758     }
1759 
1760     /*
1761      * Accessing port 0x61 may trap to SMM, which has actually been
1762      * observed on some production SKX servers. This SMI sometimes
1763      * takes enough time for the next NMI tick to happen. By reading
1764      * this port before we re-arm the NMI watchdog, we reduce the chance
1765      * of having an NMI watchdog expire while in the SMI handler.
1766      */
1767     if ( cpu == nmi_cpu )
1768         reason = inb(0x61);
1769 
1770     if ( (nmi_watchdog == NMI_NONE) ||
1771          (!nmi_watchdog_tick(regs) && watchdog_force) )
1772         handle_unknown = true;
1773 
1774     /* Only the BSP gets external NMIs from the system. */
1775     if ( cpu == nmi_cpu )
1776     {
1777         if ( reason & 0x80 )
1778             pci_serr_error(regs);
1779         if ( reason & 0x40 )
1780             io_check_error(regs);
1781         if ( !(reason & 0xc0) && handle_unknown )
1782             unknown_nmi_error(regs, reason);
1783     }
1784 
1785     nmi_exit();
1786 }
1787 
1788 nmi_callback_t *set_nmi_callback(nmi_callback_t *callback)
1789 {
1790     nmi_callback_t *old_nmi_callback = nmi_callback;
1791 
1792     nmi_callback = callback;
1793 
1794     return old_nmi_callback;
1795 }
1796 
1797 void unset_nmi_callback(void)
1798 {
1799     nmi_callback = dummy_nmi_callback;
1800 }
1801 
1802 void do_device_not_available(struct cpu_user_regs *regs)
1803 {
1804 #ifdef CONFIG_PV
1805     struct vcpu *curr = current;
1806 #endif
1807 
1808     if ( !guest_mode(regs) )
1809     {
1810         /*
1811          * We shouldn't be able to reach here, but for release builds have
1812          * the recovery logic in place nevertheless.
1813          */
1814         if ( extable_fixup(regs, true) )
1815         {
1816             ASSERT_UNREACHABLE();
1817             return;
1818         }
1819 
1820         fatal_trap(regs, false);
1821     }
1822 
1823 #ifdef CONFIG_PV
1824     vcpu_restore_fpu_lazy(curr);
1825 
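    /*
     * If the guest has set its virtual %cr0.TS, reflect the #NM to the guest
     * so it can do its own lazy FPU handling, and clear the virtual TS bit;
     * otherwise the lazy restore above suffices and a trace record is
     * emitted.
     */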
1826     if ( curr->arch.pv.ctrlreg[0] & X86_CR0_TS )
1827     {
1828         pv_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC);
1829         curr->arch.pv.ctrlreg[0] &= ~X86_CR0_TS;
1830     }
1831     else
1832         TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
1833 #else
1834     ASSERT_UNREACHABLE();
1835 #endif
1836 }
1837 
1838 void do_debug(struct cpu_user_regs *regs)
1839 {
1840     unsigned long dr6;
1841     struct vcpu *v = current;
1842 
1843     /* Stash dr6 as early as possible. */
1844     dr6 = read_debugreg(6);
1845 
1846     if ( debugger_trap_entry(TRAP_debug, regs) )
1847         return;
1848 
1849     /*
1850      * At the time of writing (March 2018), on the subject of %dr6:
1851      *
1852      * The Intel manual says:
1853      *   Certain debug exceptions may clear bits 0-3. The remaining contents
1854      *   of the DR6 register are never cleared by the processor. To avoid
1855      *   confusion in identifying debug exceptions, debug handlers should
1856      *   clear the register (except bit 16, which they should set) before
1857      *   returning to the interrupted task.
1858      *
1859      * The AMD manual says:
1860      *   Bits 15:13 of the DR6 register are not cleared by the processor and
1861      *   must be cleared by software after the contents have been read.
1862      *
1863      * Some bits are reserved set, some are reserved clear, and some bits
1864      * which were previously reserved set are reused and cleared by hardware.
1865      * For future compatibility, reset to the default value, which will allow
1866      * us to spot any bit being changed by hardware to its non-default value.
1867      */
1868     write_debugreg(6, X86_DR6_DEFAULT);
1869 
1870     /* #DB automatically disabled LBR.  Reinstate it if debugging Xen. */
1871     if ( cpu_has_xen_lbr )
1872         wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
1873 
1874     if ( !guest_mode(regs) )
1875     {
1876         /*
1877          * !!! WARNING !!!
1878          *
1879          * %dr6 is mostly guest controlled at this point.  Any decisions based
1880          * on its value must be cross-checked against non-guest controlled state.
1881          */
1882 
1883         if ( regs->eflags & X86_EFLAGS_TF )
1884         {
1885 #ifdef CONFIG_PV
1886             /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
1887             if ( (regs->rip >= (unsigned long)sysenter_entry) &&
1888                  (regs->rip <= (unsigned long)sysenter_eflags_saved) )
1889             {
1890                 if ( regs->rip == (unsigned long)sysenter_eflags_saved )
1891                     regs->eflags &= ~X86_EFLAGS_TF;
1892                 return;
1893             }
1894 #endif
1895             if ( !debugger_trap_fatal(TRAP_debug, regs) )
1896             {
1897                 WARN();
1898                 regs->eflags &= ~X86_EFLAGS_TF;
1899             }
1900         }
1901 
1902         /*
1903          * Check for fault conditions.  General Detect and instruction
1904          * breakpoints are faults rather than traps, so attempting to ignore
1905          * them and continue will result in a livelock.
1906          *
1907          * However, on entering the #DB handler, hardware clears %dr7.gd for
1908          * us (as confirmed by the earlier %dr6 accesses succeeding), meaning
1909          * that a real General Detect exception is restartable.
1910          *
1911          * PV guests are not permitted to point %dr{0..3} at Xen linear
1912          * addresses, and Instruction Breakpoints (being faults) don't get
1913          * delayed by a MovSS shadow, so we should never encounter one in
1914          * hypervisor context.
1915          *
1916          * If however we do, safety measures need to be enacted.  Use a big
1917          * hammer and clear all debug settings.
1918          */
1919         if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
1920         {
1921             unsigned int bp, dr7 = read_debugreg(7);
1922 
1923             for ( bp = 0; bp < 4; ++bp )
1924             {
1925                 if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
1926                      (dr7 & (3u << (bp * DR_ENABLE_SIZE))) && /* Enabled? */
1927                      ((dr7 & (3u << ((bp * DR_CONTROL_SIZE) + /* Insn? */
1928                                      DR_CONTROL_SHIFT))) == DR_RW_EXECUTE) )
1929                 {
1930                     ASSERT_UNREACHABLE();
1931 
1932                     printk(XENLOG_ERR
1933                            "Hit instruction breakpoint in Xen context\n");
1934                     write_debugreg(7, 0);
1935                     break;
1936                 }
1937             }
1938         }
1939 
1940         /*
1941          * Whatever caused this #DB should be restartable by this point.  Note
1942          * it and continue.  Guests can trigger this in certain corner cases,
1943          * so ensure the message is ratelimited.
1944          */
1945         gprintk(XENLOG_WARNING,
1946                 "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
1947                 regs->cs, _p(regs->rip), _p(regs->rip),
1948                 regs->ss, _p(regs->rsp), dr6);
1949 
1950         return;
1951     }
1952 
1953     /* Save debug status register where guest OS can peek at it */
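    /*
     * Merge against X86_DR6_DEFAULT: bits which hardware reports differently
     * from their default value are copied in, while bits matching the default
     * keep the value previously saved for this vCPU.
     */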
1954     v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
1955     v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
1956 
1957     pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
1958 }
1959 
1960 void do_entry_CP(struct cpu_user_regs *regs)
1961 {
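    /* #CP error code values as defined for CET violations; index 0 is unused. */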
1962     static const char errors[][10] = {
1963         [1] = "near ret",
1964         [2] = "far/iret",
1965         [3] = "endbranch",
1966         [4] = "rstorssp",
1967         [5] = "setssbsy",
1968     };
1969     const char *err = "??";
1970     unsigned int ec = regs->error_code;
1971 
1972     if ( debugger_trap_entry(TRAP_debug, regs) )
1973         return;
1974 
1975     /* Decode ec if possible */
1976     if ( ec < ARRAY_SIZE(errors) && errors[ec][0] )
1977         err = errors[ec];
1978 
1979     /*
1980      * For now, only supervisor shadow stacks should be active.  A #CP from
1981      * guest context is probably a Xen bug, but kill the guest in an attempt
1982      * to recover.
1983      */
1984     if ( guest_mode(regs) )
1985     {
1986         gprintk(XENLOG_ERR, "Hit #CP[%04x] in guest context %04x:%p\n",
1987                 ec, regs->cs, _p(regs->rip));
1988         ASSERT_UNREACHABLE();
1989         domain_crash(current->domain);
1990         return;
1991     }
1992 
1993     show_execution_state(regs);
1994     panic("CONTROL-FLOW PROTECTION FAULT: #CP[%04x] %s\n", ec, err);
1995 }
1996 
1997 static void __init noinline __set_intr_gate(unsigned int n,
1998                                             uint32_t dpl, void *addr)
1999 {
2000     _set_gate(&idt_table[n], SYS_DESC_irq_gate, dpl, addr);
2001 }
2002 
2003 static void __init set_swint_gate(unsigned int n, void *addr)
2004 {
2005     __set_intr_gate(n, 3, addr);
2006 }
2007 
2008 static void __init set_intr_gate(unsigned int n, void *addr)
2009 {
2010     __set_intr_gate(n, 0, addr);
2011 }
2012 
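/*
 * Work out which MSR holds the "from" IP of the Last Exception Record on
 * this CPU, for use with the "ler" command line option; returning 0 means
 * LER is not known to be supported for this vendor/family combination.
 */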
2013 static unsigned int calc_ler_msr(void)
2014 {
2015     switch ( boot_cpu_data.x86_vendor )
2016     {
2017     case X86_VENDOR_INTEL:
2018         switch ( boot_cpu_data.x86 )
2019         {
2020         case 6:
2021             return MSR_IA32_LASTINTFROMIP;
2022 
2023         case 15:
2024             return MSR_P4_LER_FROM_LIP;
2025         }
2026         break;
2027 
2028     case X86_VENDOR_AMD:
2029         switch ( boot_cpu_data.x86 )
2030         {
2031         case 6:
2032         case 0xf ... 0x19:
2033             return MSR_IA32_LASTINTFROMIP;
2034         }
2035         break;
2036 
2037     case X86_VENDOR_HYGON:
2038         return MSR_IA32_LASTINTFROMIP;
2039     }
2040 
2041     return 0;
2042 }
2043 
2044 void percpu_traps_init(void)
2045 {
2046     subarch_percpu_traps_init();
2047 
2048     if ( !opt_ler )
2049         return;
2050 
2051     if ( !ler_msr && (ler_msr = calc_ler_msr()) )
2052         setup_force_cpu_cap(X86_FEATURE_XEN_LBR);
2053 
2054     if ( cpu_has_xen_lbr )
2055         wrmsrl(MSR_IA32_DEBUGCTLMSR, IA32_DEBUGCTLMSR_LBR);
2056 }
2057 
2058 void __init init_idt_traps(void)
2059 {
2060     /*
2061      * Note that interrupt gates are always used, rather than trap gates. We
2062      * must have interrupts disabled until DS/ES/FS/GS are saved because the
2063      * first activation must have the "bad" value(s) for these registers and
2064      * we may lose them if another activation is installed before they are
2065      * saved. The page-fault handler also needs interrupts disabled until %cr2
2066      * has been read and saved on the stack.
2067      */
2068     set_intr_gate(TRAP_divide_error, &divide_error);
2069     set_intr_gate(TRAP_debug, &debug);
2070     set_intr_gate(TRAP_nmi, &nmi);
2071     set_swint_gate(TRAP_int3, &int3);         /* usable from all privileges */
2072     set_swint_gate(TRAP_overflow, &overflow); /* usable from all privileges */
2073     set_intr_gate(TRAP_bounds, &bounds);
2074     set_intr_gate(TRAP_invalid_op, &invalid_op);
2075     set_intr_gate(TRAP_no_device, &device_not_available);
2076     set_intr_gate(TRAP_double_fault, &double_fault);
2077     set_intr_gate(TRAP_invalid_tss, &invalid_TSS);
2078     set_intr_gate(TRAP_no_segment, &segment_not_present);
2079     set_intr_gate(TRAP_stack_error, &stack_segment);
2080     set_intr_gate(TRAP_gp_fault, &general_protection);
2081     set_intr_gate(TRAP_page_fault, &early_page_fault);
2082     set_intr_gate(TRAP_copro_error, &coprocessor_error);
2083     set_intr_gate(TRAP_alignment_check, &alignment_check);
2084     set_intr_gate(TRAP_machine_check, &machine_check);
2085     set_intr_gate(TRAP_simd_error, &simd_coprocessor_error);
2086     set_intr_gate(X86_EXC_CP, entry_CP);
2087 
2088     /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */
2089     enable_each_ist(idt_table);
2090 
2091     /* CPU0 uses the master IDT. */
2092     idt_tables[0] = idt_table;
2093 
2094     this_cpu(gdt) = boot_gdt;
2095     if ( IS_ENABLED(CONFIG_PV32) )
2096         this_cpu(compat_gdt) = boot_compat_gdt;
2097 }
2098 
2099 extern void (*const autogen_entrypoints[X86_NR_VECTORS])(void);
2100 void __init trap_init(void)
2101 {
2102     unsigned int vector;
2103 
2104     /* Replace early pagefault with real pagefault handler. */
2105     set_intr_gate(TRAP_page_fault, &page_fault);
2106 
2107     pv_trap_init();
2108 
2109     for ( vector = 0; vector < X86_NR_VECTORS; ++vector )
2110     {
2111         if ( autogen_entrypoints[vector] )
2112         {
2113             /* Found autogen entry: check we won't clobber an existing trap. */
2114             ASSERT(idt_table[vector].b == 0);
2115             set_intr_gate(vector, autogen_entrypoints[vector]);
2116         }
2117         else
2118         {
2119             /* No entry point: confirm we have an existing trap in place. */
2120             ASSERT(idt_table[vector].b != 0);
2121         }
2122     }
2123 
2124     /* Cache {,compat_}gdt_l1e now that physical relocation is done. */
2125     this_cpu(gdt_l1e) =
2126         l1e_from_pfn(virt_to_mfn(boot_gdt), __PAGE_HYPERVISOR_RW);
2127     if ( IS_ENABLED(CONFIG_PV32) )
2128         this_cpu(compat_gdt_l1e) =
2129             l1e_from_pfn(virt_to_mfn(boot_compat_gdt), __PAGE_HYPERVISOR_RW);
2130 
2131     percpu_traps_init();
2132 
2133     cpu_init();
2134 
2135     open_softirq(PCI_SERR_SOFTIRQ, pci_serr_softirq);
2136 }
2137 
2138 void activate_debugregs(const struct vcpu *curr)
2139 {
2140     ASSERT(curr == current);
2141 
2142     write_debugreg(0, curr->arch.dr[0]);
2143     write_debugreg(1, curr->arch.dr[1]);
2144     write_debugreg(2, curr->arch.dr[2]);
2145     write_debugreg(3, curr->arch.dr[3]);
2146     write_debugreg(6, curr->arch.dr6);
2147 
2148     /*
2149      * Avoid writing a %dr7 value that is about to be replaced when called
2150      * from set_debugreg(). Any future callers will need to take this into
2151      * account.
2152      */
2153     if ( curr->arch.dr7 & DR7_ACTIVE_MASK )
2154         write_debugreg(7, curr->arch.dr7);
2155 
2156     if ( boot_cpu_has(X86_FEATURE_DBEXT) )
2157     {
2158         wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.msrs->dr_mask[0]);
2159         wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.msrs->dr_mask[1]);
2160         wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.msrs->dr_mask[2]);
2161         wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.msrs->dr_mask[3]);
2162     }
2163 }
2164 
2165 void asm_domain_crash_synchronous(unsigned long addr)
2166 {
2167     /*
2168      * We need to clear the AC bit here because in entry.S AC is set
2169      * by ASM_STAC to temporarily allow accesses to user pages, which
2170      * SMAP would otherwise prevent.
2171      *
2172      * On some of the code paths leading here, clac() is not strictly
2173      * needed, but doing it once here, rather than at every call site
2174      * of asm_domain_crash_synchronous(), reduces the redundancy and is
2175      * harmless as well.
2176      */
2177     clac();
2178 
2179     if ( addr == 0 )
2180         addr = this_cpu(last_extable_addr);
2181 
2182     printk("domain_crash_sync called from entry.S: fault at %p %pS\n",
2183            _p(addr), _p(addr));
2184 
2185     __domain_crash(current->domain);
2186 
2187     for ( ; ; )
2188         do_softirq();
2189 }
2190 
2191 /*
2192  * Local variables:
2193  * mode: C
2194  * c-file-style: "BSD"
2195  * c-basic-offset: 4
2196  * tab-width: 4
2197  * indent-tabs-mode: nil
2198  * End:
2199  */
2200