/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, version 2 of the
 * License.
 */

/*
 * mctelem.c - x86 Machine Check Telemetry Transport
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/kernel.h>
#include <xen/smp.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/cpumask.h>
#include <xen/event.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>

#include "mce.h"

struct mctelem_ent {
    struct mctelem_ent *mcte_next;  /* next in chronological order */
    struct mctelem_ent *mcte_prev;  /* previous in chronological order */
    uint32_t mcte_flags;            /* See MCTE_F_* below */
    uint32_t mcte_refcnt;           /* Reference count */
    void *mcte_data;                /* corresponding data payload */
};

#define MCTE_F_CLASS_URGENT      0x0001U /* in use - urgent errors */
#define MCTE_F_CLASS_NONURGENT   0x0002U /* in use - nonurgent errors */
#define MCTE_F_STATE_FREE        0x0010U /* on a freelist */
#define MCTE_F_STATE_UNCOMMITTED 0x0020U /* reserved; on no list */
#define MCTE_F_STATE_COMMITTED   0x0040U /* on a committed list */
#define MCTE_F_STATE_PROCESSING  0x0080U /* on a processing list */

#define MCTE_F_MASK_CLASS (MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
#define MCTE_F_MASK_STATE (MCTE_F_STATE_FREE | \
                           MCTE_F_STATE_UNCOMMITTED | \
                           MCTE_F_STATE_COMMITTED | \
                           MCTE_F_STATE_PROCESSING)

#define MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
#define MCTE_SET_CLASS(tep, new) do { \
    (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
    (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)

#define MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
#define MCTE_TRANSITION_STATE(tep, old, new) do { \
    BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
    (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
    (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)

#define MC_URGENT_NENT    10
#define MC_NONURGENT_NENT 20

#define MC_NENT (MC_URGENT_NENT + MC_NONURGENT_NENT)

#define MC_NCLASSES (MC_NONURGENT + 1)

#define COOKIE2MCTE(c)   ((struct mctelem_ent *)(c))
#define MCTE2COOKIE(tep) ((mctelem_cookie_t)(tep))

static struct mc_telem_ctl {
    /* Linked lists that thread the array members together.
     *
     * The free list is a bit array in which a set bit means the
     * corresponding element is free.  Because the number of elements
     * is quite small, this makes it easy to allocate entries
     * atomically.
     *
     * The committed list grows at the head and we do not maintain a
     * tail pointer; insertions are performed atomically.  The head
     * thus has the most-recently committed telemetry, i.e. the
     * list is in reverse chronological order.  The committed list
     * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
     * When we move telemetry from the committed list to the processing
     * list we atomically unlink the committed list and keep a pointer
     * to the head of that list; we then traverse the list following
     * mcte_prev and fill in mcte_next to doubly-link the list, and then
     * append the tail of the list onto the processing list.  If we panic
     * during this manipulation of the committed list we still have
     * the pointer to its head so we can recover all entries during
     * the panic flow (albeit in reverse chronological order).
     *
     * The processing list is updated in a controlled context, and
     * we can lock it for updates.  The head of the processing list
     * always has the oldest telemetry, and we append (as above)
     * at the tail of the processing list. */
    DECLARE_BITMAP(mctc_free, MC_NENT);
    struct mctelem_ent *mctc_committed[MC_NCLASSES];
    struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
    struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
    /*
     * Telemetry array
     */
    struct mctelem_ent *mctc_elems;
} mctctl;

struct mc_telem_cpu_ctl {
    /*
     * Per-CPU processing lists, used for deferred (softirq)
     * processing of telemetry.
     *
     * The two pending lists @lmce_pending and @pending grow at
     * the head in the reverse chronological order.
     *
     * @pending and @lmce_pending on the same CPU are mutually
     * exclusive, i.e. deferred MCE on a CPU are either all in
     * @lmce_pending or all in @pending.  In the former case, all
     * deferred MCE are LMCE.  In the latter case, both LMCE and
     * non-local MCE can be in @pending, and @pending contains at
     * least one non-local MCE if it's not empty.
     *
     * Changes to @pending and @lmce_pending should be performed
     * via mctelem_process_deferred() and mctelem_defer(), in order
     * to guarantee the above mutual exclusivity.
     */
    struct mctelem_ent *pending, *lmce_pending;
    struct mctelem_ent *processing;
};
static DEFINE_PER_CPU(struct mc_telem_cpu_ctl, mctctl);

/* Lock protecting all processing lists */
static DEFINE_SPINLOCK(processing_lock);

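/*
 * Atomically replace the head of the lock-free list at *headp with @new,
 * publishing the old head through *linkp.  When inserting an entry,
 * *linkp is that entry's own link field, so the old head becomes its
 * successor; when called with @new == NULL this unhooks the whole list
 * and leaves its former head in *linkp.  The cmpxchg loop retries if
 * another CPU updates the head concurrently.
 */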
static void mctelem_xchg_head(struct mctelem_ent **headp,
                              struct mctelem_ent **linkp,
                              struct mctelem_ent *new)
{
    for (;;) {
        struct mctelem_ent *old;

        *linkp = old = *headp;
        if (cmpxchgptr(headp, old, new) == old)
            break;
    }
}

/**
 * Append the telemetry of a deferred MCE to a per-cpu pending list,
 * either @pending or @lmce_pending, according to the rules below:
 * - if @pending is not empty, then the new telemetry will be
 *   appended to @pending;
 * - if @pending is empty and the new telemetry is for a deferred
 *   LMCE, then the new telemetry will be appended to @lmce_pending;
 * - if @pending is empty and the new telemetry is for a deferred
 *   non-local MCE, all existing telemetries in @lmce_pending will be
 *   moved to @pending and then the new telemetry will be appended to
 *   @pending.
 *
 * This function must be called with the MCIP bit set, so that it does
 * not need to worry about MC# re-occurring in this function.
 *
 * As a result, this function can preserve the mutual exclusivity
 * between @pending and @lmce_pending (see their comments in struct
 * mc_telem_cpu_ctl).
 *
 * Parameters:
 *  @cookie: telemetry of the deferred MCE
 *  @lmce:   indicates whether the telemetry is for an LMCE
 */
void mctelem_defer(mctelem_cookie_t cookie, bool lmce)
{
    struct mctelem_ent *tep = COOKIE2MCTE(cookie);
    struct mc_telem_cpu_ctl *mctctl = &this_cpu(mctctl);

    ASSERT(mctctl->pending == NULL || mctctl->lmce_pending == NULL);

    if (mctctl->pending)
        mctelem_xchg_head(&mctctl->pending, &tep->mcte_next, tep);
    else if (lmce)
        mctelem_xchg_head(&mctctl->lmce_pending, &tep->mcte_next, tep);
    else {
        /*
         * LMCE is supported on Skylake-server and later CPUs, on
         * which mce_broadcast is always true.  Therefore, non-empty
         * mctctl->lmce_pending in this branch implies a broadcasting
         * MC# is being handled, every CPU is in the exception
         * context, and no one is consuming mctctl->pending at this
         * moment.  As a result, the following two exchanges together
         * can be treated as atomic.
         */
        if (mctctl->lmce_pending)
            mctelem_xchg_head(&mctctl->lmce_pending,
                              &mctctl->pending, NULL);
        mctelem_xchg_head(&mctctl->pending, &tep->mcte_next, tep);
    }
}

/**
 * Move telemetries of deferred MCE from the per-cpu pending list on
 * this or another CPU to the per-cpu processing list on this CPU, and
 * then process all deferred MCE on the processing list.
 *
 * This function can be called with the MCIP bit set (e.g. from the MC#
 * handler) or cleared (from the MCE softirq handler).  In the latter
 * case, MC# may re-occur in this function.
 *
 * Parameters:
 *  @cpu:  the CPU whose pending list is to be processed
 *  @fn:   the function to handle the deferred MCE
 *  @lmce: indicates which pending list on @cpu is handled
 *         (@lmce_pending if true, @pending otherwise)
 */
void mctelem_process_deferred(unsigned int cpu,
                              int (*fn)(mctelem_cookie_t),
                              bool lmce)
{
    struct mctelem_ent *tep;
    struct mctelem_ent *head, *prev;
    struct mc_telem_cpu_ctl *mctctl = &per_cpu(mctctl, cpu);
    int ret;

    /*
     * First, unhook the list of telemetry structures, and
     * hook it up to the processing list head for this CPU.
     *
     * If @lmce is true and a non-local MC# occurs before the
     * following atomic exchange, @lmce will not hold after
     * resumption, because all telemetries in @lmce_pending on
     * @cpu are moved to @pending on @cpu in mcheck_cmn_handler().
     * In such a case, no telemetries will be handled in this
     * function after resumption.  Another round of MCE softirq,
     * which was raised by above mcheck_cmn_handler(), will handle
     * those moved telemetries in @pending on @cpu.
     *
     * Any MC# occurring after the following atomic exchange will be
     * handled by another round of MCE softirq.
     */
    mctelem_xchg_head(lmce ? &mctctl->lmce_pending : &mctctl->pending,
                      &this_cpu(mctctl.processing), NULL);

    head = this_cpu(mctctl.processing);

    /*
     * Then, fix up the list to include prev pointers, to make
     * things a little easier, as the list must be traversed in
     * chronological order, which is backward from the order they
     * are in.
     */
    for (tep = head, prev = NULL; tep != NULL; tep = tep->mcte_next) {
        tep->mcte_prev = prev;
        prev = tep;
    }

    /*
     * Now walk the list of telemetry structures, handling each
     * one of them.  Unhooking the structure here does not need to
     * be atomic, as this list is only accessed from a softirq
     * context; the MCE handler does not touch it.
     */
    for (tep = prev; tep != NULL; tep = prev) {
        prev = tep->mcte_prev;
        tep->mcte_next = tep->mcte_prev = NULL;

        ret = fn(MCTE2COOKIE(tep));
        if (prev != NULL)
            prev->mcte_next = NULL;
        tep->mcte_prev = tep->mcte_next = NULL;
        if (ret != 0)
            mctelem_commit(MCTE2COOKIE(tep));
        else
            mctelem_dismiss(MCTE2COOKIE(tep));
    }
}

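/* Return whether @cpu has any telemetry on its @pending list
 * (mctelem_has_deferred) or on its @lmce_pending list
 * (mctelem_has_deferred_lmce) waiting to be processed.
 */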
bool mctelem_has_deferred(unsigned int cpu)
{
    if (per_cpu(mctctl.pending, cpu) != NULL)
        return true;
    return false;
}

bool mctelem_has_deferred_lmce(unsigned int cpu)
{
    return per_cpu(mctctl.lmce_pending, cpu) != NULL;
}

/* Free an entry to its native free list; the entry must not be linked on
 * any list.
 */
static void mctelem_free(struct mctelem_ent *tep)
{
    BUG_ON(tep->mcte_refcnt != 0);
    BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);

    tep->mcte_prev = NULL;
    tep->mcte_next = NULL;

    /* set free in array */
    set_bit(tep - mctctl.mctc_elems, mctctl.mctc_free);
}

/* Increment the reference count of an entry that is not linked on to
 * any list and which only the caller has a pointer to.
 */
static void mctelem_hold(struct mctelem_ent *tep)
{
    tep->mcte_refcnt++;
}

/* Increment the reference count on an entry that is linked at the head of
 * a processing list.  The caller is responsible for locking the list.
 */
static void mctelem_processing_hold(struct mctelem_ent *tep)
{
    int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
        MC_URGENT : MC_NONURGENT;

    BUG_ON(tep != mctctl.mctc_processing_head[which]);
    tep->mcte_refcnt++;
}

/* Decrement the reference count on an entry that is linked at the head of
 * a processing list.  The caller is responsible for locking the list.
 */
static void mctelem_processing_release(struct mctelem_ent *tep)
{
    int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
        MC_URGENT : MC_NONURGENT;

    BUG_ON(tep != mctctl.mctc_processing_head[which]);
    if (--tep->mcte_refcnt == 0) {
        MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
        mctctl.mctc_processing_head[which] = tep->mcte_next;
        mctelem_free(tep);
    }
}

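/*
 * One-time initialisation: allocate the telemetry entry array and the
 * backing data buffers.  @datasz is the size of each per-entry data
 * payload, rounded up to a multiple of 16 bytes.  On allocation failure
 * the transport is left unusable and mctelem_reserve() will always
 * return NULL.
 */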
void __init mctelem_init(unsigned int datasz)
{
    char *datarr;
    unsigned int i;

    BUILD_BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);

    datasz = (datasz & ~0xf) + 0x10; /* 16 byte roundup */

    if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
                                           MC_NENT)) == NULL ||
        (datarr = xmalloc_bytes(MC_NENT * datasz)) == NULL) {
        xfree(mctctl.mctc_elems);
        printk("Allocations for MCA telemetry failed\n");
        return;
    }

    for (i = 0; i < MC_NENT; i++) {
        struct mctelem_ent *tep;

        tep = mctctl.mctc_elems + i;
        tep->mcte_flags = MCTE_F_STATE_FREE;
        tep->mcte_refcnt = 0;
        tep->mcte_data = datarr + i * datasz;

        __set_bit(i, mctctl.mctc_free);
        tep->mcte_next = NULL;
        tep->mcte_prev = NULL;
    }
}

/* incremented non-atomically when reserve fails */
static int mctelem_drop_count;

/* Reserve a telemetry entry, or return NULL if none available.
 * If we return an entry then the caller must subsequently call exactly one of
 * mctelem_dismiss or mctelem_commit for that entry.
 */
mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
{
    unsigned bit;
    unsigned start_bit = (which == MC_URGENT) ? 0 : MC_URGENT_NENT;

    for (;;) {
        bit = find_next_bit(mctctl.mctc_free, MC_NENT, start_bit);

        if (bit >= MC_NENT) {
            mctelem_drop_count++;
            return NULL;
        }

        /* try to allocate, atomically clear free bit */
        if (test_and_clear_bit(bit, mctctl.mctc_free)) {
            /* return element we got */
            struct mctelem_ent *tep = mctctl.mctc_elems + bit;

            mctelem_hold(tep);
            MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
            tep->mcte_next = NULL;
            tep->mcte_prev = NULL;
            if (which == MC_URGENT)
                MCTE_SET_CLASS(tep, URGENT);
            else
                MCTE_SET_CLASS(tep, NONURGENT);
            return MCTE2COOKIE(tep);
        }
    }
}

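/* Return the data payload area associated with a reserved telemetry entry. */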
void *mctelem_dataptr(mctelem_cookie_t cookie)
{
    struct mctelem_ent *tep = COOKIE2MCTE(cookie);

    return tep->mcte_data;
}

/* Release a previously reserved entry back to the freelist without
 * submitting it for logging.  The entry must not be linked on to any
 * list - that's how mctelem_reserve handed it out.
 */
void mctelem_dismiss(mctelem_cookie_t cookie)
{
    struct mctelem_ent *tep = COOKIE2MCTE(cookie);

    tep->mcte_refcnt--;
    MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
    mctelem_free(tep);
}

/* Commit an entry with completed telemetry for logging.  The caller must
 * not reference the entry after this call.  Note that we add entries
 * at the head of the committed list, so that list therefore has entries
 * in reverse chronological order.
 */
void mctelem_commit(mctelem_cookie_t cookie)
{
    struct mctelem_ent *tep = COOKIE2MCTE(cookie);
    mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
        MC_URGENT : MC_NONURGENT;

    BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
    MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);

    mctelem_xchg_head(&mctctl.mctc_committed[target], &tep->mcte_prev, tep);
}

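/*
 * Illustrative sketch (not compiled, and not part of the transport
 * itself): how a producer on the logging path might use the
 * reserve/commit interface.  The collect_mca_records() helper is
 * hypothetical; the real callers live in mce.c and fill the payload
 * with a struct mc_info.
 */
#if 0
static void example_log_telemetry(void)
{
    mctelem_cookie_t cookie = mctelem_reserve(MC_URGENT);

    if (cookie == NULL)
        return;                         /* no free entries; event dropped */

    if (collect_mca_records(mctelem_dataptr(cookie)))   /* hypothetical */
        mctelem_commit(cookie);         /* queue for logging */
    else
        mctelem_dismiss(cookie);        /* nothing useful; back to freelist */
}
#endif
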
/* Move telemetry from the committed list to the processing list, reversing
 * the list into chronological order.  The processing list has been
 * locked by the caller, and may be non-empty.  We append the
 * reversed committed list on to the tail of the processing list.
 * The committed list may grow even while we run, so use an atomic
 * exchange to swap NULL into the committed list head.
 *
 * Note that "chronological order" means the order in which producers
 * won additions to the processing list, which may not reflect the
 * strict chronological order of the associated events if events are
 * closely spaced in time and contend for the processing list at once.
 */

static struct mctelem_ent *dangling[MC_NCLASSES];

static void mctelem_append_processing(mctelem_class_t which)
{
    mctelem_class_t target = which == MC_URGENT ?
        MC_URGENT : MC_NONURGENT;
    struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
    struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
    struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
    struct mctelem_ent *tep, *ltep;

    /* Check for an empty list; no race since we hold the processing lock */
    if (*commlp == NULL)
        return;

    /* Atomically unlink the committed list, and keep a pointer to
     * the list we unlink in a well-known location so it can be
     * picked up in panic code should we panic between this unlink
     * and the append to the processing list. */
    mctelem_xchg_head(commlp, &dangling[target], NULL);

    if (dangling[target] == NULL)
        return;

    /* Traverse the list following the previous pointers (reverse
     * chronological order).  For each entry fill in the next pointer
     * and transition the element state. */
    for (tep = dangling[target], ltep = NULL; tep != NULL;
         tep = tep->mcte_prev) {
        MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
        tep->mcte_next = ltep;
        ltep = tep;
    }

    /* ltep points to the head of a chronologically ordered linked
     * list of telemetry entries ending at the most recent entry
     * dangling[target] if mcte_next is followed; tack this on to
     * the processing list.
     */
    if (*proclhp == NULL) {
        *proclhp = ltep;
        *procltp = dangling[target];
    } else {
        (*procltp)->mcte_next = ltep;
        ltep->mcte_prev = *procltp;
        *procltp = dangling[target];
    }
    smp_wmb();
    dangling[target] = NULL;
    smp_wmb();
}

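/* Take a reference on the oldest entry of the given class's processing
 * list (after pulling over anything newly committed) and return it as a
 * cookie for the caller to read, or NULL if no telemetry is pending.
 * The caller must pair this with mctelem_consume_oldest_end().
 */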
mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
{
    mctelem_class_t target = (which == MC_URGENT) ?
        MC_URGENT : MC_NONURGENT;
    struct mctelem_ent *tep;

    spin_lock(&processing_lock);
    mctelem_append_processing(target);
    if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
        spin_unlock(&processing_lock);
        return NULL;
    }

    mctelem_processing_hold(tep);
    spin_unlock(&processing_lock);
    return MCTE2COOKIE(tep);
}

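/* Drop the reference taken by mctelem_consume_oldest_begin().  The entry
 * is only freed (and the processing list head advanced) once both this
 * release and the matching mctelem_ack() have brought its reference
 * count to zero.
 */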
void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
{
    struct mctelem_ent *tep = COOKIE2MCTE(cookie);

    spin_lock(&processing_lock);
    mctelem_processing_release(tep);
    spin_unlock(&processing_lock);
}

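/* Acknowledge that the telemetry identified by @cookie has been consumed,
 * dropping the reference taken at reservation time.  Once the entry at
 * the head of the processing list reaches a zero reference count it is
 * returned to the free list and the head advances.  If the entry is no
 * longer at the head of the processing list the call is a no-op.
 */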
void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
{
    mctelem_class_t target = (which == MC_URGENT) ?
        MC_URGENT : MC_NONURGENT;
    struct mctelem_ent *tep = COOKIE2MCTE(cookie);

    if (tep == NULL)
        return;

    spin_lock(&processing_lock);
    if (tep == mctctl.mctc_processing_head[target])
        mctelem_processing_release(tep);
    spin_unlock(&processing_lock);
}

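/*
 * Illustrative sketch (not compiled, and not part of the transport
 * itself): how a consumer might drain the urgent processing list, oldest
 * entry first.  The deliver_to_dom0() helper is hypothetical; the real
 * consumer is the XEN_MC_fetch hypercall path in mce.c, where the
 * acknowledgement arrives later from dom0 rather than immediately.
 */
#if 0
static void example_drain_urgent(void)
{
    mctelem_cookie_t cookie;

    while ((cookie = mctelem_consume_oldest_begin(MC_URGENT)) != NULL) {
        deliver_to_dom0(mctelem_dataptr(cookie));   /* hypothetical */
        mctelem_consume_oldest_end(cookie);         /* drop the begin hold */
        mctelem_ack(MC_URGENT, cookie);             /* fully retire the entry */
    }
}
#endif
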
/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: t
 * tab-width: 8
 * End:
 */