#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/param.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <xen/cpu.h>
#include <asm/processor.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include "mce.h"
#include "x86_mca.h"
#include "barrier.h"
#include "util.h"
#include "vmce.h"
#include "mcaction.h"

static DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_banks_owned);
bool __read_mostly cmci_support;
static bool __read_mostly ser_support;
static bool __read_mostly mce_force_broadcast;
boolean_param("mce_fb", mce_force_broadcast);
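/*
 * Example: booting Xen with "mce_fb" (or "mce_fb=1") on the command line
 * forces broadcast MCE handling even when the CPU model does not imply it;
 * as noted below this also disables LMCE support.
 */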

static int __read_mostly nr_intel_ext_msrs;

/* If mce_force_broadcast == 1, lmce_support will be forcibly disabled. */
bool __read_mostly lmce_support;

/* The Intel SDM defines bits 15:0 of IA32_MCi_STATUS as the MC error code. */
#define INTEL_MCCOD_MASK 0xFFFF

/*
 * Currently the Intel SDM defines 2 kinds of SRAO errors:
 * 1) Memory scrubbing errors, error code = 0xC0 ~ 0xCF
 * 2) L3 explicit writeback error, error code = 0x17A
 */
#define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
#define INTEL_SRAO_L3_EWB    0x17A
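/*
 * Note: INTEL_SRAO_MEM_SCRUB expands to a GCC case range (0xC0 ... 0xCF),
 * so it is only usable as a switch case label (see intel_srao_dhandler()).
 */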

/*
 * Currently the Intel SDM defines 2 kinds of SRAR errors:
 * 1) Data Load error, error code = 0x134
 * 2) Instruction Fetch error, error code = 0x150
 */
#define INTEL_SRAR_DATA_LOAD    0x134
#define INTEL_SRAR_INSTR_FETCH  0x150

#define MCE_RING                0x1
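/* MCE_RING masks bit 0 of IA32_THERM_STATUS: the current "temperature above threshold" status flag. */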
static DEFINE_PER_CPU(int, last_state);

static void intel_thermal_interrupt(struct cpu_user_regs *regs)
{
    uint64_t msr_content;
    unsigned int cpu = smp_processor_id();
    static DEFINE_PER_CPU(s_time_t, next);
    int *this_last_state;

    ack_APIC_irq();

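    /* Rate-limit the reporting below to at most once every 5 seconds per CPU. */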
    if ( NOW() < per_cpu(next, cpu) )
        return;

    per_cpu(next, cpu) = NOW() + MILLISECS(5000);
    rdmsrl(MSR_IA32_THERM_STATUS, msr_content);
    this_last_state = &per_cpu(last_state, cpu);
    if ( *this_last_state == (msr_content & MCE_RING) )
        return;
    *this_last_state = msr_content & MCE_RING;
    if ( msr_content & MCE_RING )
    {
        printk(KERN_EMERG "CPU%u: Temperature above threshold\n", cpu);
        printk(KERN_EMERG "CPU%u: Running in modulated clock mode\n", cpu);
        add_taint(TAINT_MACHINE_CHECK);
    } else
        printk(KERN_INFO "CPU%u: Temperature/speed normal\n", cpu);
}

/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static bool intel_thermal_supported(struct cpuinfo_x86 *c)
{
    if ( !cpu_has_apic )
        return false;
    if ( !cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_TM1) )
        return false;
    return true;
}

static u32 __read_mostly lvtthmr_init;

static void __init mcheck_intel_therm_init(void)
{
    /*
     * This function is only called on the boot CPU.  Save the initial thermal
     * LVT value on the BSP and use it later to restore the thermal LVT entry
     * that the BIOS programmed on the APs.
     */
    if ( intel_thermal_supported(&boot_cpu_data) )
        lvtthmr_init = apic_read(APIC_LVTTHMR);
}

/* P4/Xeon Thermal regulation detect and init */
static void intel_init_thermal(struct cpuinfo_x86 *c)
{
    uint64_t msr_content;
    uint32_t val;
    int tm2 = 0;
    unsigned int cpu = smp_processor_id();
    static uint8_t thermal_apic_vector;

    if ( !intel_thermal_supported(c) )
        return; /* -ENODEV */

    /*
     * First check if it's enabled already, in which case there might
     * be some SMM goo which handles it, so we can't even put a handler
     * since it might be delivered via SMI already -zwanem.
     */
    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    val = lvtthmr_init;
    /*
     * The initial value of the thermal LVT entry on all APs always reads
     * 0x10000 because APs are woken up by the BSP issuing an INIT-SIPI-SIPI
     * sequence to them and the LVT registers are reset to 0s except for
     * the mask bits, which are set to 1s when APs receive the INIT IPI.
     * If the BIOS takes over the thermal interrupt and sets its delivery
     * mode to SMI (not fixed), restore the value the BIOS programmed on
     * this AP from the BSP value we saved (the BIOS is required to set
     * the same value for all threads/cores).
     */
    if ( (val & APIC_MODE_MASK) != APIC_DM_FIXED
         || (val & APIC_VECTOR_MASK) > 0xf )
        apic_write(APIC_LVTTHMR, val);

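    /*
     * IA32_MISC_ENABLE bit 3 is the automatic Thermal Control Circuit (TCC)
     * enable.  If it is set and the thermal LVT entry is routed to SMI,
     * firmware already handles thermal events and we must not take them over.
     */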
    if ( (msr_content & (1ULL<<3))
         && (val & APIC_MODE_MASK) == APIC_DM_SMI )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal monitoring handled by SMI\n");
        return; /* -EBUSY */
    }

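    /* IA32_MISC_ENABLE bit 13 indicates that TM2 is enabled. */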
    if ( cpu_has(c, X86_FEATURE_TM2) && (msr_content & (1ULL << 13)) )
        tm2 = 1;

    /* Check whether a vector already exists, temporarily masked? */
    if ( val & APIC_VECTOR_MASK )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal LVT vector (%#x) already installed\n",
                   val & APIC_VECTOR_MASK);
        return; /* -EBUSY */
    }

    alloc_direct_apic_vector(&thermal_apic_vector, intel_thermal_interrupt);

    /* The temperature transition interrupt handler setup */
    val = thermal_apic_vector;    /* our delivery vector */
    val |= (APIC_DM_FIXED | APIC_LVT_MASKED);  /* we'll mask till we're ready */
    apic_write(APIC_LVTTHMR, val);

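    /* Enable the high- and low-temperature threshold interrupts (bits 1:0 of IA32_THERM_INTERRUPT). */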
    rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_content);
    wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_content | 0x03);

    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    wrmsrl(MSR_IA32_MISC_ENABLE, msr_content | (1ULL<<3));

    apic_write(APIC_LVTTHMR, val & ~APIC_LVT_MASKED);
    if ( opt_cpu_info )
        printk(KERN_INFO "CPU%u: Thermal monitoring enabled (%s)\n",
               cpu, tm2 ? "TM2" : "TM1");
}

/* Intel MCE handler */
static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
{
    if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
         && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs )
    {
        ext->mc_msr[ext->mc_msrs].reg = msr;
        rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
        ++ext->mc_msrs;
    }
}

struct mcinfo_extended *
intel_get_extended_msrs(struct mcinfo_global *mig, struct mc_info *mi)
{
    struct mcinfo_extended *mc_ext;
    int i;

    /*
     * According to the spec, processors supporting 64-bit mode will always
     * have MSRs beyond IA32_MCG_MISC.
     */
    if ( !mi || !mig || nr_intel_ext_msrs == 0 ||
         !(mig->mc_gstatus & MCG_STATUS_EIPV) )
        return NULL;

    mc_ext = x86_mcinfo_reserve(mi, sizeof(*mc_ext), MC_TYPE_EXTENDED);
    if ( !mc_ext )
    {
        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
        return NULL;
    }

    for ( i = MSR_IA32_MCG_EAX; i <= MSR_IA32_MCG_MISC; i++ )
        intel_get_extended_msr(mc_ext, i);

    for ( i = MSR_IA32_MCG_R8; i <= MSR_IA32_MCG_R15; i++ )
        intel_get_extended_msr(mc_ext, i);

    return mc_ext;
}

enum intel_mce_type
{
    intel_mce_invalid,
    intel_mce_fatal,
    intel_mce_corrected,
    intel_mce_ucr_ucna,
    intel_mce_ucr_srao,
    intel_mce_ucr_srar,
};

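/*
 * Classify a raw MCi_STATUS value into the categories above, following the
 * SDM severity rules: VAL and PCC decide validity and fatality, and with SER
 * support the S, AR and OVER bits distinguish UCNA, SRAO and SRAR errors.
 */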
static enum intel_mce_type intel_check_mce_type(uint64_t status)
{
    if ( !(status & MCi_STATUS_VAL) )
        return intel_mce_invalid;

    if ( status & MCi_STATUS_PCC )
        return intel_mce_fatal;

    /* Corrected error? */
    if ( !(status & MCi_STATUS_UC) )
        return intel_mce_corrected;

    if ( !ser_support )
        return intel_mce_fatal;

    if ( status & MCi_STATUS_S )
    {
        if ( status & MCi_STATUS_AR )
        {
            if ( status & MCi_STATUS_OVER )
                return intel_mce_fatal;
            else
                return intel_mce_ucr_srar;
        } else
            return intel_mce_ucr_srao;
    }
    else
        return intel_mce_ucr_ucna;

    /* Any type not included above? */
    return intel_mce_fatal;
}

static void intel_memerr_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
    mc_memerr_dhandler(binfo, result, regs);
}

static bool intel_srar_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srar);
}

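/*
 * Tell whether the address logged in MCi_ADDR matches the requested type:
 * it is physical only when both ADDRV and MISCV are set and MCi_MISC
 * reports physical address mode, otherwise it is treated as virtual.
 */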
static bool intel_checkaddr(uint64_t status, uint64_t misc, int addrtype)
{
    if ( !(status & MCi_STATUS_ADDRV) ||
         !(status & MCi_STATUS_MISCV) ||
         ((misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
        /* addr is virtual */
        return (addrtype == MC_ADDR_VIRTUAL);

    return (addrtype == MC_ADDR_PHYSICAL);
}

static void intel_srar_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For unknown srar error code, reset system */
    *result = MCER_RESET;

    switch ( status & INTEL_MCCOD_MASK )
    {
    case INTEL_SRAR_DATA_LOAD:
    case INTEL_SRAR_INSTR_FETCH:
        intel_memerr_dhandler(binfo, result, regs);
        break;
    }
}

static bool intel_srao_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srao);
}

static void intel_srao_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For unknown srao error code, no action required */
    *result = MCER_CONTINUE;

    if ( status & MCi_STATUS_VAL )
    {
        switch ( status & INTEL_MCCOD_MASK )
        {
        case INTEL_SRAO_MEM_SCRUB:
        case INTEL_SRAO_L3_EWB:
            intel_memerr_dhandler(binfo, result, regs);
            break;
        }
    }
}

static bool intel_default_check(uint64_t status)
{
    return true;
}

static void intel_default_mce_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    if ( type == intel_mce_fatal )
        *result = MCER_RESET;
    else
        *result = MCER_CONTINUE;
}

static const struct mca_error_handler intel_mce_dhandlers[] = {
    {intel_srao_check, intel_srao_dhandler},
    {intel_srar_check, intel_srar_dhandler},
    {intel_default_check, intel_default_mce_dhandler}
};

static void intel_default_mce_uhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    switch ( type )
    {
    case intel_mce_fatal:
        *result = MCER_RESET;
        break;

    default:
        *result = MCER_CONTINUE;
        break;
    }
}

static const struct mca_error_handler intel_mce_uhandlers[] = {
    {intel_default_check, intel_default_mce_uhandler}
};

/*
 * According to the MCA OS writer's guide, the CMCI handler needs to clear
 * the bank when
 * 1) CE (UC = 0)
 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
 * 3) ser_support = 1, UCNA, OVER = 0, S = 0, AR = 0, PCC = 0, [UC = 1, EN = 1]
 * The MCA handler needs to clear the bank when
 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
 */

static bool intel_need_clearbank_scan(enum mca_source who, u64 status)
{
    if ( who == MCA_CMCI_HANDLER )
    {
        /* CMCI needs to clear the bank for a CE */
        if ( !(status & MCi_STATUS_UC) )
            return true;
        /* Spurious errors need the bank cleared */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_EN) )
            return true;
        /* UCNA with OVER = 0 needs the bank cleared */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
                  && !(status & MCi_STATUS_AR) )
            return true;
        /* Only log, no clear */
        else return false;
    }
    else if ( who == MCA_MCE_SCAN )
    {
        if ( !ser_support )
            return false;
        /*
         * A fatal error shouldn't be cleared, so that the sticky bank
         * has a chance to be handled by polling after reboot.
         */
        if ( (status & MCi_STATUS_UC) && (status & MCi_STATUS_PCC) )
            return false;
        /* Spurious errors need the bank cleared */
        else if ( !(status & MCi_STATUS_OVER)
                  && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN) )
            return true;
        /* SRAR with OVER = 0 clears the bank; OVER = 1 will have caused a reset */
        else if ( (status & MCi_STATUS_UC)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR)
                  && !(status & MCi_STATUS_OVER) )
            return true;
        /* SRAO needs the bank cleared */
        else if ( !(status & MCi_STATUS_AR)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC) )
            return true;
        else
            return false;
    }

    return true;
}

/*
 * MCE continues/is recoverable when
 * 1) CE, UC = 0
 * 2) Spurious: ser_support = 1, OVER = 0, EN = 0 [UC = 1]
 * 3) SRAR: ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
 * 4) SRAO: ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
 * 5) UCNA: ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0 [UC = 1]
 */
static bool intel_recoverable_scan(uint64_t status)
{

    if ( !(status & MCi_STATUS_UC) )
        return true;
    else if ( ser_support && !(status & MCi_STATUS_EN)
              && !(status & MCi_STATUS_OVER) )
        return true;
    /* SRAR error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
              && (status & MCi_STATUS_AR) && (status & MCi_STATUS_EN) )
        return true;
    /* SRAO error */
    else if ( ser_support && !(status & MCi_STATUS_PCC)
              && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
              && (status & MCi_STATUS_EN) )
        return true;
    /* UCNA error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
              && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR) )
        return true;
    return false;
}

/* CMCI */
static DEFINE_SPINLOCK(cmci_discover_lock);

/*
 * Discover bank sharing using the algorithm recommended in the SDM.
 */
static int do_cmci_discover(int i)
{
    unsigned msr = MSR_IA32_MCx_CTL2(i);
    u64 val;
    unsigned int threshold, max_threshold;
    unsigned int cpu = smp_processor_id();
    static unsigned int cmci_threshold = 2;
    integer_param("cmci-threshold", cmci_threshold);
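    /*
     * The default threshold of 2 corrected errors can be overridden with
     * "cmci-threshold=<n>" on the Xen command line; a value of 0 skips
     * enabling CMCI here, leaving the bank to the polling timer.
     */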

    rdmsrl(msr, val);
    /* Some other CPU already owns this bank. */
    if ( val & CMCI_EN )
    {
        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
        goto out;
    }

    if ( cmci_threshold )
    {
        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
        rdmsrl(msr, val);
    }

    if ( !(val & CMCI_EN) )
    {
        /* This bank does not support CMCI. Polling timer has to handle it. */
        mcabanks_set(i, per_cpu(no_cmci_banks, cpu));
        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
        return 0;
    }
    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
    threshold = cmci_threshold;
    if ( threshold > max_threshold )
    {
        mce_printk(MCE_QUIET,
                   "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
                   threshold, cpu, i, max_threshold);
        threshold = max_threshold;
    }
    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
    mcabanks_set(i, per_cpu(mce_banks_owned, cpu));
out:
    mcabanks_clear(i, per_cpu(no_cmci_banks, cpu));
    return 1;
}

static void cmci_discover(void)
{
    unsigned long flags;
    unsigned int i, cpu = smp_processor_id();
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%u\n", cpu);

    spin_lock_irqsave(&cmci_discover_lock, flags);

    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
            do_cmci_discover(i);

    spin_unlock_irqrestore(&cmci_discover_lock, flags);

    /*
     * In case a CMCI arrived during the owner change: if a CMCI fired but
     * was not processed immediately, MCi_STATUS (error count, bits 52:38)
     * is not cleared and the CMCI interrupt will never be triggered again.
     */

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, per_cpu(mce_banks_owned, cpu), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);

    mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
               cpu,
               per_cpu(mce_banks_owned, cpu)->bank_map[0],
               per_cpu(no_cmci_banks, cpu)->bank_map[0]);
}

/*
 * Define an owner for each bank. Banks can be shared between CPUs,
 * and to avoid reporting events multiple times always set up one
 * CPU as the owner.
 *
 * The assignment has to be redone when CPUs go offline and
 * any of the owners goes away. Also pollers run in parallel, so we
 * have to be careful to update the banks in a way that doesn't
 * lose or duplicate events.
 */

static void mce_set_owner(void)
{
    if ( !cmci_support || !opt_mce )
        return;

    cmci_discover();
}

static void __cpu_mcheck_distribute_cmci(void *unused)
{
    cmci_discover();
}

static void cpu_mcheck_distribute_cmci(void)
{
    if ( cmci_support && opt_mce )
        on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
}

static void clear_cmci(void)
{
    unsigned int i, cpu = smp_processor_id();

    if ( !cmci_support || !opt_mce )
        return;

    mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%u\n", cpu);

    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
    {
        unsigned msr = MSR_IA32_MCx_CTL2(i);
        u64 val;

        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
            continue;
        rdmsrl(msr, val);
        if ( val & (CMCI_EN|CMCI_THRESHOLD_MASK) )
            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
    }
}

static void cpu_mcheck_disable(void)
{
    if ( cmci_support && opt_mce )
        clear_cmci();
}

static void cmci_interrupt(struct cpu_user_regs *regs)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    ack_APIC_irq();

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, this_cpu(mce_banks_owned), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);
}

static void intel_init_cmci(struct cpuinfo_x86 *c)
{
    u32 l, apic;
    int cpu = smp_processor_id();

    if ( !mce_available(c) || !cmci_support )
    {
        if ( opt_cpu_info )
            mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
        return;
    }

    apic = apic_read(APIC_CMCI);
    if ( apic & APIC_VECTOR_MASK )
    {
        mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
                   cpu, ( apic & APIC_VECTOR_MASK ));
        return;
    }

    alloc_direct_apic_vector(&cmci_apic_vector, cmci_interrupt);

    apic = cmci_apic_vector;
    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
    apic_write(APIC_CMCI, apic);

    l = apic_read(APIC_CMCI);
    apic_write(APIC_CMCI, l & ~APIC_LVT_MASKED);

    mce_set_owner();
}

/* MCA */

static bool mce_is_broadcast(struct cpuinfo_x86 *c)
{
    if ( mce_force_broadcast )
        return true;

    /*
     * According to the Intel SDM (Dec. 2009, 15.10.4.1), for processors with
     * a DisplayFamily_DisplayModel encoding of 06H_EH and above, an MCA
     * signal is broadcast to all logical processors in the system.
     */
    if ( c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
         c->x86_model >= 0xe )
        return true;
    return false;
}

static bool intel_enable_lmce(void)
{
    uint64_t msr_content;

    /*
     * The section "Enabling Local Machine Check" in Intel SDM Vol 3
     * requires that software ensure the LOCK bit and the LMCE_ON bit
     * of MSR_IA32_FEATURE_CONTROL are set before setting
     * MSR_IA32_MCG_EXT_CTL.LMCE_EN.
     */

    if ( rdmsr_safe(MSR_IA32_FEATURE_CONTROL, msr_content) )
        return false;

    if ( (msr_content & IA32_FEATURE_CONTROL_LOCK) &&
         (msr_content & IA32_FEATURE_CONTROL_LMCE_ON) )
    {
        wrmsrl(MSR_IA32_MCG_EXT_CTL, MCG_EXT_CTL_LMCE_EN);
        return true;
    }

    return false;
}


/* Check and init MCA */
static void intel_init_mca(struct cpuinfo_x86 *c)
{
    bool broadcast, cmci = false, ser = false, lmce = false;
    int ext_num = 0, first;
    uint64_t msr_content;

    broadcast = mce_is_broadcast(c);

    rdmsrl(MSR_IA32_MCG_CAP, msr_content);

    if ( (msr_content & MCG_CMCI_P) && cpu_has_apic )
        cmci = true;

    /* Support Software Error Recovery */
    if ( msr_content & MCG_SER_P )
        ser = true;

    if ( msr_content & MCG_EXT_P )
        ext_num = (msr_content >> MCG_EXT_CNT) & 0xff;

    first = mce_firstbank(c);

    if ( !mce_force_broadcast && (msr_content & MCG_LMCE_P) )
        lmce = intel_enable_lmce();

#define CAP(enabled, name) ((enabled) ? ", " name : "")
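/* CAP() expands to ", <name>" when the feature is present and to "" otherwise. */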
    if ( smp_processor_id() == 0 )
    {
        dprintk(XENLOG_INFO,
                "MCA Capability: firstbank %d, extended MCE MSR %d%s%s%s%s\n",
                first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));

        mce_broadcast = broadcast;
        cmci_support = cmci;
        ser_support = ser;
        lmce_support = lmce;
        nr_intel_ext_msrs = ext_num;
        firstbank = first;
    }
    else if ( cmci != cmci_support || ser != ser_support ||
              broadcast != mce_broadcast ||
              first != firstbank || ext_num != nr_intel_ext_msrs ||
              lmce != lmce_support )
        dprintk(XENLOG_WARNING,
                "CPU%u has different MCA capability "
                "(firstbank %d, extended MCE MSR %d%s%s%s%s)"
                " than BSP, may cause undetermined result!!!\n",
                smp_processor_id(), first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));
#undef CAP
}

static void intel_mce_post_reset(void)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);

    /* At boot, print the records and also commit them so Dom0 can log them. */
    if ( bs.errcnt && mctc != NULL )
    {
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        mctelem_commit(mctc);
    }
    return;
}

static void intel_init_mce(void)
{
    uint64_t msr_content;
    int i;

    intel_mce_post_reset();

    /* Clear all banks */
    for ( i = firstbank; i < this_cpu(nr_mce_banks); i++ )
    {
        /*
         * Some banks are shared across cores, use MCi_CTRL to judge whether
         * this bank has been initialized by other cores already.
         */
        rdmsrl(MSR_IA32_MCx_CTL(i), msr_content);
        if ( !msr_content )
        {
            /* If CTL is 0, this bank has never been initialized */
            mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
            wrmsrl(MSR_IA32_MCx_CTL(i), 0xffffffffffffffffULL);
            wrmsrl(MSR_IA32_MCx_STATUS(i), 0x0ULL);
        }
    }
    if ( firstbank ) /* if cmci enabled, firstbank = 0 */
        wrmsrl(MSR_IA32_MC0_STATUS, 0x0ULL);

    x86_mce_vector_register(mcheck_cmn_handler);
    mce_recoverable_register(intel_recoverable_scan);
    mce_need_clearbank_register(intel_need_clearbank_scan);
    mce_register_addrcheck(intel_checkaddr);

    mce_dhandlers = intel_mce_dhandlers;
    mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers);
    mce_uhandlers = intel_mce_uhandlers;
    mce_uhandler_num = ARRAY_SIZE(intel_mce_uhandlers);
}

static void intel_init_ppin(const struct cpuinfo_x86 *c)
{
    /*
     * Even if testing the presence of the MSR would be enough, we don't
     * want to risk the situation where other models reuse this MSR for
     * other purposes.
     */
    switch ( c->x86_model )
    {
        uint64_t val;

    case 0x3e: /* IvyBridge X */
    case 0x3f: /* Haswell X */
    case 0x4f: /* Broadwell X */
    case 0x55: /* Skylake X */
    case 0x56: /* Broadwell Xeon D */
    case 0x57: /* Knights Landing */
    case 0x6a: /* Icelake X */
    case 0x85: /* Knights Mill */

        if ( (c != &boot_cpu_data && !ppin_msr) ||
             rdmsr_safe(MSR_PPIN_CTL, val) )
            return;

        /* If PPIN is disabled, but not locked, try to enable. */
        if ( !(val & (PPIN_ENABLE | PPIN_LOCKOUT)) )
        {
            wrmsr_safe(MSR_PPIN_CTL, val | PPIN_ENABLE);
            rdmsr_safe(MSR_PPIN_CTL, val);
        }

        if ( !(val & PPIN_ENABLE) )
            ppin_msr = 0;
        else if ( c == &boot_cpu_data )
            ppin_msr = MSR_PPIN;
    }
}

static void cpu_mcabank_free(unsigned int cpu)
{
    struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu);
    struct mca_banks *owned = per_cpu(mce_banks_owned, cpu);

    mcabanks_free(cmci);
    mcabanks_free(owned);
}

static int cpu_mcabank_alloc(unsigned int cpu)
{
    unsigned int nr = per_cpu(nr_mce_banks, cpu);
    struct mca_banks *cmci = mcabanks_alloc(nr);
    struct mca_banks *owned = mcabanks_alloc(nr);

    if ( !cmci || !owned )
        goto out;

    per_cpu(no_cmci_banks, cpu) = cmci;
    per_cpu(mce_banks_owned, cpu) = owned;
    per_cpu(last_state, cpu) = -1;

    return 0;
 out:
    mcabanks_free(cmci);
    mcabanks_free(owned);
    return -ENOMEM;
}

static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_mcabank_alloc(cpu);
        break;

    case CPU_DYING:
        cpu_mcheck_disable();
        break;

    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_mcheck_distribute_cmci();
        cpu_mcabank_free(cpu);
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

/* P4/P6 families have a similar MCA initialization process */
enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c, bool bsp)
{
    if ( bsp )
    {
        /* Early MCE initialisation for BSP. */
        if ( cpu_mcabank_alloc(0) )
            BUG();
        register_cpu_notifier(&cpu_nfb);
        mcheck_intel_therm_init();
    }
    else
    {
        unsigned int cpu = smp_processor_id();

        per_cpu(no_cmci_banks, cpu)->num = per_cpu(nr_mce_banks, cpu);
        per_cpu(mce_banks_owned, cpu)->num = per_cpu(nr_mce_banks, cpu);
    }

    intel_init_mca(c);

    mce_handler_init();

    intel_init_mce();

    intel_init_cmci(c);

    intel_init_thermal(c);

    intel_init_ppin(c);

    return mcheck_intel;
}

/* Intel-specific MCA MSRs */
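/*
 * Guest accesses to the virtual MC<i>_CTL2 registers only touch the
 * per-vCPU vmce state below; nothing is written through to hardware.
 */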
int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        v->arch.vmce.bank[bank].mci_ctl2 = val;
        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %#"PRIx64"\n", bank, val);
    }

    return 1;
}

int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        *val = v->arch.vmce.bank[bank].mci_ctl2;
        mce_printk(MCE_VERBOSE, "MCE: rd MC%u_CTL2 %#"PRIx64"\n", bank, *val);
    }

    return 1;
}

bool vmce_has_lmce(const struct vcpu *v)
{
    return v->arch.vmce.mcg_cap & MCG_LMCE_P;
}