#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/param.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <xen/cpu.h>
#include <asm/processor.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include "mce.h"
#include "x86_mca.h"
#include "barrier.h"
#include "util.h"
#include "vmce.h"
#include "mcaction.h"

static DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_banks_owned);
bool __read_mostly cmci_support;
static bool __read_mostly ser_support;
static bool __read_mostly mce_force_broadcast;
boolean_param("mce_fb", mce_force_broadcast);

static int __read_mostly nr_intel_ext_msrs;

/* If mce_force_broadcast == 1, lmce_support will be disabled forcibly. */
bool __read_mostly lmce_support;

/* The Intel SDM defines bits 15:0 of IA32_MCi_STATUS as the MC error code. */
#define INTEL_MCCOD_MASK 0xFFFF

/*
 * Currently the Intel SDM defines 2 kinds of SRAO errors:
 * 1) Memory scrubbing errors, error code = 0xC0 ~ 0xCF
 * 2) L3 explicit writeback errors, error code = 0x17A
 */
#define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
#define INTEL_SRAO_L3_EWB 0x17A

/*
 * Currently the Intel SDM defines 2 kinds of SRAR errors:
 * 1) Data load errors, error code = 0x134
 * 2) Instruction fetch errors, error code = 0x150
 */
#define INTEL_SRAR_DATA_LOAD 0x134
#define INTEL_SRAR_INSTR_FETCH 0x150

#define MCE_RING 0x1
static DEFINE_PER_CPU(int, last_state);

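/*
 * Thermal throttle interrupt handler: ack the local APIC, rate-limit
 * reporting to once per 5s per CPU, and log transitions of the
 * IA32_THERM_STATUS thermal event bit.
 */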
static void intel_thermal_interrupt(struct cpu_user_regs *regs)
{
    uint64_t msr_content;
    unsigned int cpu = smp_processor_id();
    static DEFINE_PER_CPU(s_time_t, next);
    int *this_last_state;

    ack_APIC_irq();

    if ( NOW() < per_cpu(next, cpu) )
        return;

    per_cpu(next, cpu) = NOW() + MILLISECS(5000);
    rdmsrl(MSR_IA32_THERM_STATUS, msr_content);
    this_last_state = &per_cpu(last_state, cpu);
    if ( *this_last_state == (msr_content & MCE_RING) )
        return;
    *this_last_state = msr_content & MCE_RING;
    if ( msr_content & MCE_RING )
    {
        printk(KERN_EMERG "CPU%u: Temperature above threshold\n", cpu);
        printk(KERN_EMERG "CPU%u: Running in modulated clock mode\n", cpu);
        add_taint(TAINT_MACHINE_CHECK);
    } else
        printk(KERN_INFO "CPU%u: Temperature/speed normal\n", cpu);
}

/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static bool intel_thermal_supported(struct cpuinfo_x86 *c)
{
    if ( !cpu_has_apic )
        return false;
    if ( !cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_TM1) )
        return false;
    return true;
}

static u32 __read_mostly lvtthmr_init;

static void __init mcheck_intel_therm_init(void)
{
    /*
     * This function is only called on the boot CPU.  Save the initial
     * thermal LVT value on the BSP and use it later to restore the
     * BIOS-programmed thermal LVT entry on the APs.
     */
    if ( intel_thermal_supported(&boot_cpu_data) )
        lvtthmr_init = apic_read(APIC_LVTTHMR);
}

/* P4/Xeon Thermal regulation detect and init */
static void intel_init_thermal(struct cpuinfo_x86 *c)
{
    uint64_t msr_content;
    uint32_t val;
    int tm2 = 0;
    unsigned int cpu = smp_processor_id();
    static uint8_t thermal_apic_vector;

    if ( !intel_thermal_supported(c) )
        return; /* -ENODEV */

    /*
     * First check if it's enabled already, in which case there might
     * be some SMM goo which handles it, so we can't even put a handler,
     * since it might be delivered via SMI already. -zwanem
     */
    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    val = lvtthmr_init;
    /*
     * The initial value of thermal LVT entries on all APs always reads
     * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
     * sequence to them and LVT registers are reset to 0s except for
     * the mask bits which are set to 1s when APs receive INIT IPI.
     * If BIOS takes over the thermal interrupt and sets its interrupt
     * delivery mode to SMI (not fixed), it restores the value that the
     * BIOS has programmed on AP based on BSP's info we saved (since BIOS
     * is required to set the same value for all threads/cores).
     */
    if ( (val & APIC_MODE_MASK) != APIC_DM_FIXED
         || (val & APIC_VECTOR_MASK) > 0xf )
        apic_write(APIC_LVTTHMR, val);

    if ( (msr_content & (1ULL << 3))
         && (val & APIC_MODE_MASK) == APIC_DM_SMI )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal monitoring handled by SMI\n");
        return; /* -EBUSY */
    }

    if ( cpu_has(c, X86_FEATURE_TM2) && (msr_content & (1ULL << 13)) )
        tm2 = 1;

    /* Check whether a vector already exists, temporarily masked? */
    if ( val & APIC_VECTOR_MASK )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal LVT vector (%#x) already installed\n",
                   val & APIC_VECTOR_MASK);
        return; /* -EBUSY */
    }

    alloc_direct_apic_vector(&thermal_apic_vector, intel_thermal_interrupt);

    /* The temperature transition interrupt handler setup */
    val = thermal_apic_vector;    /* our delivery vector */
    val |= (APIC_DM_FIXED | APIC_LVT_MASKED);  /* we'll mask till we're ready */
    apic_write(APIC_LVTTHMR, val);

    rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_content);
    wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_content | 0x03);

    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    wrmsrl(MSR_IA32_MISC_ENABLE, msr_content | (1ULL << 3));

    apic_write(APIC_LVTTHMR, val & ~APIC_LVT_MASKED);
    if ( opt_cpu_info )
        printk(KERN_INFO "CPU%u: Thermal monitoring enabled (%s)\n",
               cpu, tm2 ? "TM2" : "TM1");
}

/* Intel MCE handler */
static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
{
    if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
         && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs )
    {
        ext->mc_msr[ext->mc_msrs].reg = msr;
        rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
        ++ext->mc_msrs;
    }
}


struct mcinfo_extended *
intel_get_extended_msrs(struct mcinfo_global *mig, struct mc_info *mi)
{
    struct mcinfo_extended *mc_ext;
    int i;

    /*
     * According to the spec, processors supporting 64-bit will always
     * have MSRs beyond IA32_MCG_MISC.
     */
    if ( !mi || !mig || nr_intel_ext_msrs == 0 ||
         !(mig->mc_gstatus & MCG_STATUS_EIPV) )
        return NULL;

    mc_ext = x86_mcinfo_reserve(mi, sizeof(*mc_ext), MC_TYPE_EXTENDED);
    if ( !mc_ext )
    {
        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
        return NULL;
    }

    for ( i = MSR_IA32_MCG_EAX; i <= MSR_IA32_MCG_MISC; i++ )
        intel_get_extended_msr(mc_ext, i);

    for ( i = MSR_IA32_MCG_R8; i <= MSR_IA32_MCG_R15; i++ )
        intel_get_extended_msr(mc_ext, i);

    return mc_ext;
}

enum intel_mce_type
{
    intel_mce_invalid,
    intel_mce_fatal,
    intel_mce_corrected,
    intel_mce_ucr_ucna,
    intel_mce_ucr_srao,
    intel_mce_ucr_srar,
};

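/*
 * Classify an IA32_MCi_STATUS value into corrected, UCNA, SRAO, SRAR or
 * fatal, following the software error recovery (SER) signalling rules.
 */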
static enum intel_mce_type intel_check_mce_type(uint64_t status)
{
    if ( !(status & MCi_STATUS_VAL) )
        return intel_mce_invalid;

    if ( status & MCi_STATUS_PCC )
        return intel_mce_fatal;

    /* Corrected error? */
    if ( !(status & MCi_STATUS_UC) )
        return intel_mce_corrected;

    if ( !ser_support )
        return intel_mce_fatal;

    if ( status & MCi_STATUS_S )
    {
        if ( status & MCi_STATUS_AR )
        {
            if ( status & MCi_STATUS_OVER )
                return intel_mce_fatal;
            else
                return intel_mce_ucr_srar;
        } else
            return intel_mce_ucr_srao;
    }
    else
        return intel_mce_ucr_ucna;

    /* Any type not included above? */
    return intel_mce_fatal;
}

static void intel_memerr_dhandler(
    struct mca_binfo *binfo,
    enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
    mc_memerr_dhandler(binfo, result, regs);
}

static bool intel_srar_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srar);
}

static bool intel_checkaddr(uint64_t status, uint64_t misc, int addrtype)
{
    if ( !(status & MCi_STATUS_ADDRV) ||
         !(status & MCi_STATUS_MISCV) ||
         ((misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
        /* addr is virtual */
        return (addrtype == MC_ADDR_VIRTUAL);

    return (addrtype == MC_ADDR_PHYSICAL);
}

static void intel_srar_dhandler(
    struct mca_binfo *binfo,
    enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For unknown SRAR error codes, reset the system. */
    *result = MCER_RESET;

    switch ( status & INTEL_MCCOD_MASK )
    {
    case INTEL_SRAR_DATA_LOAD:
    case INTEL_SRAR_INSTR_FETCH:
        intel_memerr_dhandler(binfo, result, regs);
        break;
    }
}

static bool intel_srao_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srao);
}

static void intel_srao_dhandler(
    struct mca_binfo *binfo,
    enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For unknown SRAO error codes, no action is required. */
    *result = MCER_CONTINUE;

    if ( status & MCi_STATUS_VAL )
    {
        switch ( status & INTEL_MCCOD_MASK )
        {
        case INTEL_SRAO_MEM_SCRUB:
        case INTEL_SRAO_L3_EWB:
            intel_memerr_dhandler(binfo, result, regs);
            break;
        }
    }
}

static bool intel_default_check(uint64_t status)
{
    return true;
}

static void intel_default_mce_dhandler(
    struct mca_binfo *binfo,
    enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    if ( type == intel_mce_fatal )
        *result = MCER_RESET;
    else
        *result = MCER_CONTINUE;
}

static const struct mca_error_handler intel_mce_dhandlers[] = {
    {intel_srao_check, intel_srao_dhandler},
    {intel_srar_check, intel_srar_dhandler},
    {intel_default_check, intel_default_mce_dhandler}
};

static void intel_default_mce_uhandler(
    struct mca_binfo *binfo,
    enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    switch ( type )
    {
    case intel_mce_fatal:
        *result = MCER_RESET;
        break;

    default:
        *result = MCER_CONTINUE;
        break;
    }
}

static const struct mca_error_handler intel_mce_uhandlers[] = {
    {intel_default_check, intel_default_mce_uhandler}
};

/*
 * According to the MCA OS writer guide, the CMCI handler needs to clear the
 * bank when
 * 1) CE (UC = 0)
 * 2) ser_support = 1, Spurious error, OVER = 0, EN = 0, [UC = 1]
 * 3) ser_support = 1, UCNA, OVER = 0, S = 1, AR = 0, PCC = 0, [UC = 1, EN = 1]
 * The MCA handler needs to clear the bank when
 * 1) ser_support = 1, Spurious error, OVER = 0, EN = 0, UC = 1
 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
 */

static bool intel_need_clearbank_scan(enum mca_source who, u64 status)
{
    if ( who == MCA_CMCI_HANDLER )
    {
        /* CMCI needs to clear the bank */
        if ( !(status & MCi_STATUS_UC) )
            return true;
        /* Spurious errors need the bank cleared */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_EN) )
            return true;
        /* UCNA with OVER = 0 needs the bank cleared */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
                  && !(status & MCi_STATUS_AR) )
            return true;
        /* Only log, no clear */
        else return false;
    }
    else if ( who == MCA_MCE_SCAN )
    {
        if ( !ser_support )
            return false;
        /*
         * Fatal errors shouldn't be cleared, so that sticky banks
         * have a chance to be handled after reboot by polling.
         */
        if ( (status & MCi_STATUS_UC) && (status & MCi_STATUS_PCC) )
            return false;
        /* Spurious errors need the bank cleared */
        else if ( !(status & MCi_STATUS_OVER)
                  && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN) )
            return true;
        /* SRAR with OVER = 0 clears the bank; OVER = 1 would have caused a reset */
        else if ( (status & MCi_STATUS_UC)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR)
                  && !(status & MCi_STATUS_OVER) )
            return true;
        /* SRAO needs the bank cleared */
        else if ( !(status & MCi_STATUS_AR)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC) )
            return true;
        else
            return false;
    }

    return true;
}

/*
 * MCE continues/is recoverable when
 * 1) CE, UC = 0
 * 2) Spurious: ser_support = 1, OVER = 0, EN = 0 [UC = 1]
 * 3) SRAR: ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
 * 4) SRAO: ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
 * 5) UCNA: ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0, [UC = 1]
 */
static bool intel_recoverable_scan(uint64_t status)
{

    if ( !(status & MCi_STATUS_UC) )
        return true;
    else if ( ser_support && !(status & MCi_STATUS_EN)
              && !(status & MCi_STATUS_OVER) )
        return true;
    /* SRAR error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
              && (status & MCi_STATUS_AR) && (status & MCi_STATUS_EN) )
        return true;
    /* SRAO error */
    else if ( ser_support && !(status & MCi_STATUS_PCC)
              && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
              && (status & MCi_STATUS_EN) )
        return true;
    /* UCNA error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
              && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR) )
        return true;
    return false;
}

/* CMCI */
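/* Serialises CMCI bank ownership (re)discovery across CPUs. */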
static DEFINE_SPINLOCK(cmci_discover_lock);

/*
 * Discover bank sharing using the algorithm recommended in the SDM.
 */
static int do_cmci_discover(int i)
{
    unsigned msr = MSR_IA32_MCx_CTL2(i);
    u64 val;
    unsigned int threshold, max_threshold;
    unsigned int cpu = smp_processor_id();
    static unsigned int cmci_threshold = 2;
    integer_param("cmci-threshold", cmci_threshold);

    rdmsrl(msr, val);
    /* Some other CPU already owns this bank. */
    if ( val & CMCI_EN )
    {
        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
        goto out;
    }

    if ( cmci_threshold )
    {
        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
        rdmsrl(msr, val);
    }

    if ( !(val & CMCI_EN) )
    {
        /* This bank does not support CMCI. Polling timer has to handle it. */
        mcabanks_set(i, per_cpu(no_cmci_banks, cpu));
        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
        return 0;
    }
    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
    threshold = cmci_threshold;
    if ( threshold > max_threshold )
    {
        mce_printk(MCE_QUIET,
                   "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
                   threshold, cpu, i, max_threshold);
        threshold = max_threshold;
    }
    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
    mcabanks_set(i, per_cpu(mce_banks_owned, cpu));
 out:
    mcabanks_clear(i, per_cpu(no_cmci_banks, cpu));
    return 1;
}

static void cmci_discover(void)
{
    unsigned long flags;
    unsigned int i, cpu = smp_processor_id();
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%u\n", cpu);

    spin_lock_irqsave(&cmci_discover_lock, flags);

    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
            do_cmci_discover(i);

    spin_unlock_irqrestore(&cmci_discover_lock, flags);

    /*
     * In case a CMCI was raised while the ownership change was in progress:
     * if a CMCI happened but was not processed immediately, MCi_STATUS
     * (error count, bits 38~52) is not cleared and the CMCI interrupt will
     * never be triggered again.
     */

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, per_cpu(mce_banks_owned, cpu), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);

    mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
               cpu,
               per_cpu(mce_banks_owned, cpu)->bank_map[0],
               per_cpu(no_cmci_banks, cpu)->bank_map[0]);
}

/*
 * Define an owner for each bank. Banks can be shared between CPUs
 * and, to avoid reporting events multiple times, always set up one
 * CPU as owner.
 *
 * The assignment has to be redone when CPUs go offline and
 * any of the owners goes away. Also pollers run in parallel so we
 * have to be careful to update the banks in a way that doesn't
 * lose or duplicate events.
 */

static void mce_set_owner(void)
{
    if ( !cmci_support || !opt_mce )
        return;

    cmci_discover();
}

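/* on_each_cpu() callback: re-run CMCI bank ownership discovery locally. */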
static void __cpu_mcheck_distribute_cmci(void *unused)
{
    cmci_discover();
}

static void cpu_mcheck_distribute_cmci(void)
{
    if ( cmci_support && opt_mce )
        on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
}

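/*
 * Disable CMCI generation on all banks owned by this CPU and drop the
 * ownership, so that discovery can assign the shared banks elsewhere.
 */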
static void clear_cmci(void)
{
    unsigned int i, cpu = smp_processor_id();

    if ( !cmci_support || !opt_mce )
        return;

    mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%u\n", cpu);

    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
    {
        unsigned msr = MSR_IA32_MCx_CTL2(i);
        u64 val;

        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
            continue;
        rdmsrl(msr, val);
        if ( val & (CMCI_EN|CMCI_THRESHOLD_MASK) )
            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
    }
}

static void cpu_mcheck_disable(void)
{
    if ( cmci_support && opt_mce )
        clear_cmci();
}

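/*
 * CMCI handler: log corrected errors from the banks this CPU owns and
 * either forward the telemetry to Dom0 via VIRQ_MCA or dump and discard it.
 */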
static void cmci_interrupt(struct cpu_user_regs *regs)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    ack_APIC_irq();

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, this_cpu(mce_banks_owned), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);
}

static void intel_init_cmci(struct cpuinfo_x86 *c)
{
    u32 l, apic;
    int cpu = smp_processor_id();

    if ( !mce_available(c) || !cmci_support )
    {
        if ( opt_cpu_info )
            mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
        return;
    }

    apic = apic_read(APIC_CMCI);
    if ( apic & APIC_VECTOR_MASK )
    {
        mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
                   cpu, ( apic & APIC_VECTOR_MASK ));
        return;
    }

    alloc_direct_apic_vector(&cmci_apic_vector, cmci_interrupt);

    apic = cmci_apic_vector;
    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
    apic_write(APIC_CMCI, apic);

    l = apic_read(APIC_CMCI);
    apic_write(APIC_CMCI, l & ~APIC_LVT_MASKED);

    mce_set_owner();
}

/* MCA */

static bool mce_is_broadcast(struct cpuinfo_x86 *c)
{
    if ( mce_force_broadcast )
        return true;

    /*
     * According to Intel SDM Dec, 2009, 15.10.4.1, for processors with
     * a DisplayFamily_DisplayModel encoding of 06H_EH and above, an MCA
     * signal is broadcast to all logical processors in the system.
     */
    if ( c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
         c->x86_model >= 0xe )
        return true;
    return false;
}

static bool intel_enable_lmce(void)
{
    uint64_t msr_content;

    /*
     * Section "Enabling Local Machine Check" in Intel SDM Vol 3 requires
     * that software ensure the LOCK bit and LMCE_ON bit of
     * MSR_IA32_FEATURE_CONTROL are set before setting
     * MSR_IA32_MCG_EXT_CTL.LMCE_EN.
     */

    if ( rdmsr_safe(MSR_IA32_FEATURE_CONTROL, msr_content) )
        return false;

    if ( (msr_content & IA32_FEATURE_CONTROL_LOCK) &&
         (msr_content & IA32_FEATURE_CONTROL_LMCE_ON) )
    {
        wrmsrl(MSR_IA32_MCG_EXT_CTL, MCG_EXT_CTL_LMCE_EN);
        return true;
    }

    return false;
}

/* Check and init MCA */
static void intel_init_mca(struct cpuinfo_x86 *c)
{
    bool broadcast, cmci = false, ser = false, lmce = false;
    int ext_num = 0, first;
    uint64_t msr_content;

    broadcast = mce_is_broadcast(c);

    rdmsrl(MSR_IA32_MCG_CAP, msr_content);

    if ( (msr_content & MCG_CMCI_P) && cpu_has_apic )
        cmci = true;

    /* Support Software Error Recovery */
    if ( msr_content & MCG_SER_P )
        ser = true;

    if ( msr_content & MCG_EXT_P )
        ext_num = (msr_content >> MCG_EXT_CNT) & 0xff;

    first = mce_firstbank(c);

    if ( !mce_force_broadcast && (msr_content & MCG_LMCE_P) )
        lmce = intel_enable_lmce();

#define CAP(enabled, name) ((enabled) ? ", " name : "")
    if ( smp_processor_id() == 0 )
    {
        dprintk(XENLOG_INFO,
                "MCA Capability: firstbank %d, extended MCE MSR %d%s%s%s%s\n",
                first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));

        mce_broadcast = broadcast;
        cmci_support = cmci;
        ser_support = ser;
        lmce_support = lmce;
        nr_intel_ext_msrs = ext_num;
        firstbank = first;
    }
    else if ( cmci != cmci_support || ser != ser_support ||
              broadcast != mce_broadcast ||
              first != firstbank || ext_num != nr_intel_ext_msrs ||
              lmce != lmce_support )
        dprintk(XENLOG_WARNING,
                "CPU%u has different MCA capability "
                "(firstbank %d, extended MCE MSR %d%s%s%s%s)"
                " than BSP, may cause undetermined result!!!\n",
                smp_processor_id(), first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));
#undef CAP
}

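/* Scan all banks for MCA records left over from before this boot/reset. */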
static void intel_mce_post_reset(void)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);

    /* In the boot-up stage, print out and also log for the Dom0 boot process. */
    if ( bs.errcnt && mctc != NULL )
    {
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        mctelem_commit(mctc);
    }
    return;
}

static void intel_init_mce(void)
{
    uint64_t msr_content;
    int i;

    intel_mce_post_reset();

    /* Clear all banks */
    for ( i = firstbank; i < this_cpu(nr_mce_banks); i++ )
    {
        /*
         * Some banks are shared across cores; use MCi_CTL to judge whether
         * this bank has already been initialized by another core.
         */
        rdmsrl(MSR_IA32_MCx_CTL(i), msr_content);
        if ( !msr_content )
        {
            /* If CTL is 0, this bank has never been initialized. */
            mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
            wrmsrl(MSR_IA32_MCx_CTL(i), 0xffffffffffffffffULL);
            wrmsrl(MSR_IA32_MCx_STATUS(i), 0x0ULL);
        }
    }
    if ( firstbank ) /* if cmci enabled, firstbank = 0 */
        wrmsrl(MSR_IA32_MC0_STATUS, 0x0ULL);

    x86_mce_vector_register(mcheck_cmn_handler);
    mce_recoverable_register(intel_recoverable_scan);
    mce_need_clearbank_register(intel_need_clearbank_scan);
    mce_register_addrcheck(intel_checkaddr);

    mce_dhandlers = intel_mce_dhandlers;
    mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers);
    mce_uhandlers = intel_mce_uhandlers;
    mce_uhandler_num = ARRAY_SIZE(intel_mce_uhandlers);
}

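/*
 * Enable reporting of the Protected Processor Inventory Number (PPIN) on
 * models known to implement MSR_PPIN_CTL / MSR_PPIN.
 */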
static void intel_init_ppin(const struct cpuinfo_x86 *c)
{
    /*
     * Even if testing the presence of the MSR would be enough, we don't
     * want to risk the situation where other models reuse this MSR for
     * other purposes.
     */
    switch ( c->x86_model )
    {
        uint64_t val;

    case 0x3e: /* IvyBridge X */
    case 0x3f: /* Haswell X */
    case 0x4f: /* Broadwell X */
    case 0x55: /* Skylake X */
    case 0x56: /* Broadwell Xeon D */
    case 0x57: /* Knights Landing */
    case 0x6a: /* Icelake X */
    case 0x85: /* Knights Mill */

        if ( (c != &boot_cpu_data && !ppin_msr) ||
             rdmsr_safe(MSR_PPIN_CTL, val) )
            return;

        /* If PPIN is disabled, but not locked, try to enable. */
        if ( !(val & (PPIN_ENABLE | PPIN_LOCKOUT)) )
        {
            wrmsr_safe(MSR_PPIN_CTL, val | PPIN_ENABLE);
            rdmsr_safe(MSR_PPIN_CTL, val);
        }

        if ( !(val & PPIN_ENABLE) )
            ppin_msr = 0;
        else if ( c == &boot_cpu_data )
            ppin_msr = MSR_PPIN;
    }
}

static void cpu_mcabank_free(unsigned int cpu)
{
    struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu);
    struct mca_banks *owned = per_cpu(mce_banks_owned, cpu);

    mcabanks_free(cmci);
    mcabanks_free(owned);
}

static int cpu_mcabank_alloc(unsigned int cpu)
{
    unsigned int nr = per_cpu(nr_mce_banks, cpu);
    struct mca_banks *cmci = mcabanks_alloc(nr);
    struct mca_banks *owned = mcabanks_alloc(nr);

    if ( !cmci || !owned )
        goto out;

    per_cpu(no_cmci_banks, cpu) = cmci;
    per_cpu(mce_banks_owned, cpu) = owned;
    per_cpu(last_state, cpu) = -1;

    return 0;
 out:
    mcabanks_free(cmci);
    mcabanks_free(owned);
    return -ENOMEM;
}

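/*
 * CPU hotplug notifier: allocate the per-CPU bank bitmaps before a CPU
 * comes up, disable CMCI while it is dying, and redistribute bank
 * ownership and free the bitmaps once it is gone.
 */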
static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_mcabank_alloc(cpu);
        break;

    case CPU_DYING:
        cpu_mcheck_disable();
        break;

    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_mcheck_distribute_cmci();
        cpu_mcabank_free(cpu);
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

/* The P4/P6 families have a similar MCA initialization process. */
enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c, bool bsp)
{
    if ( bsp )
    {
        /* Early MCE initialisation for BSP. */
        if ( cpu_mcabank_alloc(0) )
            BUG();
        register_cpu_notifier(&cpu_nfb);
        mcheck_intel_therm_init();
    }
    else
    {
        unsigned int cpu = smp_processor_id();

        per_cpu(no_cmci_banks, cpu)->num = per_cpu(nr_mce_banks, cpu);
        per_cpu(mce_banks_owned, cpu)->num = per_cpu(nr_mce_banks, cpu);
    }

    intel_init_mca(c);

    mce_handler_init();

    intel_init_mce();

    intel_init_cmci(c);

    intel_init_thermal(c);

    intel_init_ppin(c);

    return mcheck_intel;
}

/* Intel-specific MCA MSRs */
int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        v->arch.vmce.bank[bank].mci_ctl2 = val;
        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %#"PRIx64"\n", bank, val);
    }

    return 1;
}

int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        *val = v->arch.vmce.bank[bank].mci_ctl2;
        mce_printk(MCE_VERBOSE, "MCE: rd MC%u_CTL2 %#"PRIx64"\n", bank, *val);
    }

    return 1;
}

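/* Whether the LMCE capability is exposed in the vCPU's virtual MCG_CAP. */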
bool vmce_has_lmce(const struct vcpu *v)
{
    return v->arch.vmce.mcg_cap & MCG_LMCE_P;
}
