/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, version 2 of the
 * License.
 */

/*
 * mctelem.c - x86 Machine Check Telemetry Transport
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/kernel.h>
#include <xen/smp.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/cpumask.h>
#include <xen/event.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>

#include "mce.h"

struct mctelem_ent {
	struct mctelem_ent *mcte_next;	/* next in chronological order */
	struct mctelem_ent *mcte_prev;	/* previous in chronological order */
	uint32_t mcte_flags;		/* See MCTE_F_* below */
	uint32_t mcte_refcnt;		/* Reference count */
	void *mcte_data;		/* corresponding data payload */
};

#define	MCTE_F_CLASS_URGENT		0x0001U /* in use - urgent errors */
#define	MCTE_F_CLASS_NONURGENT		0x0002U /* in use - nonurgent errors */
#define	MCTE_F_STATE_FREE		0x0010U	/* on a freelist */
#define	MCTE_F_STATE_UNCOMMITTED	0x0020U	/* reserved; on no list */
#define	MCTE_F_STATE_COMMITTED		0x0040U	/* on a committed list */
#define	MCTE_F_STATE_PROCESSING		0x0080U	/* on a processing list */

#define	MCTE_F_MASK_CLASS	(MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
#define	MCTE_F_MASK_STATE	(MCTE_F_STATE_FREE | \
				MCTE_F_STATE_UNCOMMITTED | \
				MCTE_F_STATE_COMMITTED | \
				MCTE_F_STATE_PROCESSING)

#define	MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
#define	MCTE_SET_CLASS(tep, new) do { \
    (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
    (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)

#define	MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
#define	MCTE_TRANSITION_STATE(tep, old, new) do { \
    BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
    (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
    (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
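
/*
 * Lifecycle of an entry, as enforced by MCTE_TRANSITION_STATE() above:
 * FREE -> UNCOMMITTED (mctelem_reserve), then either UNCOMMITTED -> FREE
 * (mctelem_dismiss) or UNCOMMITTED -> COMMITTED (mctelem_commit),
 * COMMITTED -> PROCESSING (mctelem_append_processing), and finally
 * PROCESSING -> FREE once the last reference is dropped
 * (mctelem_processing_release).
 */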

#define	MC_URGENT_NENT		10
#define	MC_NONURGENT_NENT	20

#define MC_NENT (MC_URGENT_NENT + MC_NONURGENT_NENT)

#define	MC_NCLASSES		(MC_NONURGENT + 1)

#define	COOKIE2MCTE(c)		((struct mctelem_ent *)(c))
#define	MCTE2COOKIE(tep)	((mctelem_cookie_t)(tep))

static struct mc_telem_ctl {
	/* Linked lists that thread the array members together.
	 *
	 * The free list is a bit array in which a set bit means the
	 * corresponding entry is free.  Because the number of elements
	 * is quite small, entries can easily be allocated atomically
	 * that way.
	 *
	 * The committed list grows at the head and we do not maintain a
	 * tail pointer; insertions are performed atomically.  The head
	 * thus has the most-recently committed telemetry, i.e. the
	 * list is in reverse chronological order.  The committed list
	 * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
	 * When we move telemetry from the committed list to the processing
	 * list we atomically unlink the committed list and keep a pointer
	 * to the head of that list;  we then traverse the list following
	 * mcte_prev and fill in mcte_next to doubly-link the list, and then
	 * append the tail of the list onto the processing list.  If we panic
	 * during this manipulation of the committed list we still have
	 * the pointer to its head so we can recover all entries during
	 * the panic flow (albeit in reverse chronological order).
	 *
	 * The processing list is updated in a controlled context, and
	 * we can lock it for updates.  The head of the processing list
	 * always has the oldest telemetry, and we append (as above)
	 * at the tail of the processing list. */
	DECLARE_BITMAP(mctc_free, MC_NENT);
	struct mctelem_ent *mctc_committed[MC_NCLASSES];
	struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
	struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
	/*
	 * Telemetry array
	 */
	struct mctelem_ent *mctc_elems;
} mctctl;

struct mc_telem_cpu_ctl {
	/*
	 * Per-CPU processing lists, used for deferred (softirq)
	 * processing of telemetry.
	 *
	 * The two pending lists @lmce_pending and @pending grow at
	 * the head in reverse chronological order.
	 *
	 * @pending and @lmce_pending on the same CPU are mutually
	 * exclusive, i.e. deferred MCEs on a CPU are either all in
	 * @lmce_pending or all in @pending. In the former case, all
	 * deferred MCEs are LMCEs. In the latter case, both LMCEs and
	 * non-local MCEs can be in @pending, and @pending contains at
	 * least one non-local MCE if it's not empty.
	 *
	 * Changes to @pending and @lmce_pending should be performed
	 * via mctelem_process_deferred() and mctelem_defer(), in order
	 * to guarantee the above mutual exclusivity.
	 */
	struct mctelem_ent *pending, *lmce_pending;
	struct mctelem_ent *processing;
};
static DEFINE_PER_CPU(struct mc_telem_cpu_ctl, mctctl);

/* Lock protecting all processing lists */
static DEFINE_SPINLOCK(processing_lock);

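/*
 * Lock-free push of @new onto the singly-linked list headed at *headp:
 * publish the current head through *linkp, then try to atomically swap
 * the head to @new, retrying if another CPU won the race.  Callers also
 * use this with @new == NULL to atomically unlink an entire list while
 * recording its old head via *linkp.
 */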
static void mctelem_xchg_head(struct mctelem_ent **headp,
				struct mctelem_ent **linkp,
				struct mctelem_ent *new)
{
	for (;;) {
		struct mctelem_ent *old;

		*linkp = old = *headp;
		if (cmpxchgptr(headp, old, new) == old)
			break;
	}
}

/**
 * Append the telemetry of a deferred MCE to a per-cpu pending list,
 * either @pending or @lmce_pending, according to the rules below:
 *  - if @pending is not empty, then the new telemetry will be
 *    appended to @pending;
 *  - if @pending is empty and the new telemetry is for a deferred
 *    LMCE, then the new telemetry will be appended to @lmce_pending;
 *  - if @pending is empty and the new telemetry is for a deferred
 *    non-local MCE, all existing telemetries in @lmce_pending will be
 *    moved to @pending and then the new telemetry will be appended to
 *    @pending.
 *
 * This function must be called with MCIP bit set, so that it does not
 * need to worry about MC# re-occurring in this function.
 *
 * As a result, this function can preserve the mutual exclusivity
 * between @pending and @lmce_pending (see their comments in struct
 * mc_telem_cpu_ctl).
 *
 * Parameters:
 *  @cookie: telemetry of the deferred MCE
 *  @lmce:   indicate whether the telemetry is for LMCE
 */
void mctelem_defer(mctelem_cookie_t cookie, bool lmce)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
	struct mc_telem_cpu_ctl *mctctl = &this_cpu(mctctl);

	ASSERT(mctctl->pending == NULL || mctctl->lmce_pending == NULL);

	if (mctctl->pending)
		mctelem_xchg_head(&mctctl->pending, &tep->mcte_next, tep);
	else if (lmce)
		mctelem_xchg_head(&mctctl->lmce_pending, &tep->mcte_next, tep);
	else {
		/*
		 * LMCE is supported on Skylake-server and later CPUs, on
		 * which mce_broadcast is always true. Therefore, non-empty
		 * mctctl->lmce_pending in this branch implies a broadcasting
		 * MC# is being handled, every CPU is in the exception
		 * context, and no one is consuming mctctl->pending at this
		 * moment. As a result, the following two exchanges together
		 * can be treated as atomic.
		 */
		if (mctctl->lmce_pending)
			mctelem_xchg_head(&mctctl->lmce_pending,
					  &mctctl->pending, NULL);
		mctelem_xchg_head(&mctctl->pending, &tep->mcte_next, tep);
	}
}

/**
 * Move the telemetry of deferred MCEs from the per-cpu pending list on
 * this or another CPU to the per-cpu processing list on this CPU, and
 * then process all deferred MCEs on the processing list.
 *
 * This function can be called with MCIP bit set (e.g. from MC#
 * handler) or cleared (from MCE softirq handler). In the latter case,
 * MC# may re-occur in this function.
 *
 * Parameters:
 *  @cpu:  indicate the CPU where the pending list is
 *  @fn:   the function to handle the deferred MCE
 *  @lmce: indicate which pending list on @cpu is handled
 */
void mctelem_process_deferred(unsigned int cpu,
			      int (*fn)(mctelem_cookie_t),
			      bool lmce)
{
	struct mctelem_ent *tep;
	struct mctelem_ent *head, *prev;
	struct mc_telem_cpu_ctl *mctctl = &per_cpu(mctctl, cpu);
	int ret;

	/*
	 * First, unhook the list of telemetry structures, and
	 * hook it up to the processing list head for this CPU.
	 *
	 * If @lmce is true and a non-local MC# occurs before the
	 * following atomic exchange, @lmce will not hold after
	 * resumption, because all telemetries in @lmce_pending on
	 * @cpu are moved to @pending on @cpu in mcheck_cmn_handler().
	 * In such a case, no telemetries will be handled in this
	 * function after resumption. Another round of MCE softirq,
	 * which was raised by the above mcheck_cmn_handler(), will handle
	 * those moved telemetries in @pending on @cpu.
	 *
	 * Any MC# occurring after the following atomic exchange will be
	 * handled by another round of MCE softirq.
	 */
	mctelem_xchg_head(lmce ? &mctctl->lmce_pending : &mctctl->pending,
			  &this_cpu(mctctl.processing), NULL);

	head = this_cpu(mctctl.processing);

	/*
	 * Then, fix up the list to include prev pointers, to make
	 * things a little easier, as the list must be traversed in
	 * chronological order, which is backward from the order they
	 * are in.
	 */
	for (tep = head, prev = NULL; tep != NULL; tep = tep->mcte_next) {
		tep->mcte_prev = prev;
		prev = tep;
	}

	/*
	 * Now walk the list of telemetry structures, handling each
	 * one of them. Unhooking the structure here does not need to
	 * be atomic, as this list is only accessed from a softirq
	 * context; the MCE handler does not touch it.
	 */
	for (tep = prev; tep != NULL; tep = prev) {
		prev = tep->mcte_prev;
		tep->mcte_next = tep->mcte_prev = NULL;

		ret = fn(MCTE2COOKIE(tep));
		if (prev != NULL)
			prev->mcte_next = NULL;
		tep->mcte_prev = tep->mcte_next = NULL;
		if (ret != 0)
			mctelem_commit(MCTE2COOKIE(tep));
		else
			mctelem_dismiss(MCTE2COOKIE(tep));
	}
}

bool mctelem_has_deferred(unsigned int cpu)
{
	if (per_cpu(mctctl.pending, cpu) != NULL)
		return true;
	return false;
}

bool mctelem_has_deferred_lmce(unsigned int cpu)
{
	return per_cpu(mctctl.lmce_pending, cpu) != NULL;
}

/* Free an entry to its native free list; the entry must not be linked on
 * any list.
 */
static void mctelem_free(struct mctelem_ent *tep)
{
	BUG_ON(tep->mcte_refcnt != 0);
	BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);

	tep->mcte_prev = NULL;
	tep->mcte_next = NULL;

	/* set free in array */
	set_bit(tep - mctctl.mctc_elems, mctctl.mctc_free);
}

/* Increment the reference count of an entry that is not linked on to
 * any list and which only the caller has a pointer to.
 */
static void mctelem_hold(struct mctelem_ent *tep)
{
	tep->mcte_refcnt++;
}

/* Increment the reference count on an entry that is linked at the head of
 * a processing list.  The caller is responsible for locking the list.
 */
static void mctelem_processing_hold(struct mctelem_ent *tep)
{
	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep != mctctl.mctc_processing_head[which]);
	tep->mcte_refcnt++;
}

/* Decrement the reference count on an entry that is linked at the head of
 * a processing list.  The caller is responsible for locking the list.
 */
static void mctelem_processing_release(struct mctelem_ent *tep)
{
	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep != mctctl.mctc_processing_head[which]);
	if (--tep->mcte_refcnt == 0) {
		MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
		mctctl.mctc_processing_head[which] = tep->mcte_next;
		mctelem_free(tep);
	}
}

void __init mctelem_init(unsigned int datasz)
{
	char *datarr;
	unsigned int i;

	BUILD_BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);

	datasz = (datasz & ~0xf) + 0x10;	/* 16 byte roundup */

	if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
	    MC_NENT)) == NULL ||
	    (datarr = xmalloc_bytes(MC_NENT * datasz)) == NULL) {
		xfree(mctctl.mctc_elems);
		printk("Allocations for MCA telemetry failed\n");
		return;
	}

	for (i = 0; i < MC_NENT; i++) {
		struct mctelem_ent *tep;

		tep = mctctl.mctc_elems + i;
		tep->mcte_flags = MCTE_F_STATE_FREE;
		tep->mcte_refcnt = 0;
		tep->mcte_data = datarr + i * datasz;

		__set_bit(i, mctctl.mctc_free);
		tep->mcte_next = NULL;
		tep->mcte_prev = NULL;
	}
}
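
/*
 * mctelem_init() is called once during MCA initialisation with the size
 * of the per-event payload that callers later obtain through
 * mctelem_dataptr().  An illustrative call (the payload type is an
 * assumption here; see the actual caller in the common MCE setup code):
 *
 *	mctelem_init(sizeof(struct mc_info));
 */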

/* incremented non-atomically when reserve fails */
static int mctelem_drop_count;

/* Reserve a telemetry entry, or return NULL if none available.
 * If we return an entry then the caller must subsequently call exactly one of
 * mctelem_dismiss or mctelem_commit for that entry.
 */
mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
{
	unsigned bit;
	unsigned start_bit = (which == MC_URGENT) ? 0 : MC_URGENT_NENT;

	for (;;) {
		bit = find_next_bit(mctctl.mctc_free, MC_NENT, start_bit);

		if (bit >= MC_NENT) {
			mctelem_drop_count++;
			return NULL;
		}

		/* try to allocate, atomically clear free bit */
		if (test_and_clear_bit(bit, mctctl.mctc_free)) {
			/* return element we got */
			struct mctelem_ent *tep = mctctl.mctc_elems + bit;

			mctelem_hold(tep);
			MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
			tep->mcte_next = NULL;
			tep->mcte_prev = NULL;
			if (which == MC_URGENT)
				MCTE_SET_CLASS(tep, URGENT);
			else
				MCTE_SET_CLASS(tep, NONURGENT);
			return MCTE2COOKIE(tep);
		}
	}
}
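
/*
 * Illustrative producer-side usage (a sketch only, not code lifted from
 * the MC# handlers; the payload layout is whatever the caller agreed on
 * via the datasz passed to mctelem_init()):
 *
 *	mctelem_cookie_t cookie = mctelem_reserve(MC_NONURGENT);
 *
 *	if (cookie != NULL) {
 *		void *data = mctelem_dataptr(cookie);
 *
 *		... fill in up to datasz bytes of telemetry at data ...
 *		mctelem_commit(cookie);   (or mctelem_dismiss() to drop it)
 *	}
 */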

void *mctelem_dataptr(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	return tep->mcte_data;
}

/* Release a previously reserved entry back to the freelist without
 * submitting it for logging.  The entry must not be linked on to any
 * list - that's how mctelem_reserve handed it out.
 */
void mctelem_dismiss(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	tep->mcte_refcnt--;
	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
	mctelem_free(tep);
}

/* Commit an entry with completed telemetry for logging.  The caller must
 * not reference the entry after this call.  Note that we add entries
 * at the head of the committed list, so that list has entries
 * in reverse chronological order.
 */
void mctelem_commit(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
	mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);

	mctelem_xchg_head(&mctctl.mctc_committed[target], &tep->mcte_prev, tep);
}

/* Move telemetry from the committed list to the processing list, reversing
 * the list into chronological order.  The processing list has been
 * locked by the caller, and may be non-empty.  We append the
 * reversed committed list on to the tail of the processing list.
 * The committed list may grow even while we run, so use atomic
 * operations to swap NULL into the committed list head.
 *
 * Note that "chronological order" means the order in which producers
 * won additions to the processing list, which may not reflect the
 * strict chronological order of the associated events if events are
 * closely spaced in time and contend for the processing list at once.
 */
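
/*
 * Worked example (illustrative): if entries were committed in the order
 * A, B, C then the committed list reads C -> B -> A when followed via
 * mcte_prev.  After mctelem_append_processing() the processing list has
 * gained A -> B -> C at its tail, linked via mcte_next, i.e. oldest
 * first, ready for mctelem_consume_oldest_begin().
 */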

static struct mctelem_ent *dangling[MC_NCLASSES];

static void mctelem_append_processing(mctelem_class_t which)
{
	mctelem_class_t target = which == MC_URGENT ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
	struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
	struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
	struct mctelem_ent *tep, *ltep;

	/* Check for an empty list; no race since we hold the processing lock */
	if (*commlp == NULL)
		return;

	/* Atomically unlink the committed list, and keep a pointer to
	 * the list we unlink in a well-known location so it can be
	 * picked up in panic code should we panic between this unlink
	 * and the append to the processing list. */
	mctelem_xchg_head(commlp, &dangling[target], NULL);

	if (dangling[target] == NULL)
		return;

	/* Traverse the list following the previous pointers (reverse
	 * chronological order).  For each entry fill in the next pointer
	 * and transition the element state.  */
	for (tep = dangling[target], ltep = NULL; tep != NULL;
	    tep = tep->mcte_prev) {
		MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
		tep->mcte_next = ltep;
		ltep = tep;
	}

	/* ltep points to the head of a chronologically ordered linked
	 * list of telemetry entries ending at the most recent entry
	 * dangling[target] if mcte_next is followed; tack this on to
	 * the processing list.
	 */
	if (*proclhp == NULL) {
		*proclhp = ltep;
		*procltp = dangling[target];
	} else {
		(*procltp)->mcte_next = ltep;
		ltep->mcte_prev = *procltp;
		*procltp = dangling[target];
	}
	smp_wmb();
	dangling[target] = NULL;
	smp_wmb();
}

mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
{
	mctelem_class_t target = (which == MC_URGENT) ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent *tep;

	spin_lock(&processing_lock);
	mctelem_append_processing(target);
	if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
		spin_unlock(&processing_lock);
		return NULL;
	}

	mctelem_processing_hold(tep);
	spin_unlock(&processing_lock);
	return MCTE2COOKIE(tep);
}

void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	spin_lock(&processing_lock);
	mctelem_processing_release(tep);
	spin_unlock(&processing_lock);
}

void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
{
	mctelem_class_t target = (which == MC_URGENT) ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	if (tep == NULL)
		return;

	spin_lock(&processing_lock);
	if (tep == mctctl.mctc_processing_head[target])
		mctelem_processing_release(tep);
	spin_unlock(&processing_lock);
}
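
/*
 * Illustrative consumer-side usage (a sketch only; the real consumer is
 * the MCA logging/hypercall code, which may defer the final ack rather
 * than issue it immediately as shown here):
 *
 *	mctelem_cookie_t cookie;
 *
 *	while ((cookie = mctelem_consume_oldest_begin(MC_URGENT)) != NULL) {
 *		void *data = mctelem_dataptr(cookie);
 *
 *		... copy or log the telemetry payload at data ...
 *		mctelem_consume_oldest_end(cookie);
 *		mctelem_ack(MC_URGENT, cookie);
 *	}
 */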

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: t
 * tab-width: 8
 * End:
 */