author     Keir Fraser <keir.fraser@citrix.com>   2009-03-17 14:22:50 +0000
committer  Keir Fraser <keir.fraser@citrix.com>   2009-03-17 14:22:50 +0000
commit     e114bca753116936a99531ec27c591949b5c11c9 (patch)
tree       5a6256158d2f183d6431900567a2891ac6b1793d
parent     60158182b4ff9dfeea12703d040ece4461c795ad (diff)
x86 mcheck: Replace hypervisor MCA telemetry structures with something
more robust, designed to make terminal error telemetry available to the
dom0 panic flow for diagnosis on reboot. Use common code for much of the
AMD and Intel MCE handling.

Signed-off-by: Gavin Maltby <gavin.maltby@sun.com>
Signed-off-by: Frank van der Linden <frank.vanderlinden@sun.com>
-rw-r--r--  xen/arch/x86/cpu/mcheck/Makefile       |   1
-rw-r--r--  xen/arch/x86/cpu/mcheck/amd_f10.c      |  37
-rw-r--r--  xen/arch/x86/cpu/mcheck/amd_k8.c       | 229
-rw-r--r--  xen/arch/x86/cpu/mcheck/amd_nonfatal.c | 132
-rw-r--r--  xen/arch/x86/cpu/mcheck/k7.c           |  11
-rw-r--r--  xen/arch/x86/cpu/mcheck/mce.c          | 955
-rw-r--r--  xen/arch/x86/cpu/mcheck/mce.h          |  98
-rw-r--r--  xen/arch/x86/cpu/mcheck/mce_intel.c    | 379
-rw-r--r--  xen/arch/x86/cpu/mcheck/mctelem.c      | 443
-rw-r--r--  xen/arch/x86/cpu/mcheck/mctelem.h      |  71
-rw-r--r--  xen/arch/x86/cpu/mcheck/non-fatal.c    |  87
-rw-r--r--  xen/arch/x86/cpu/mcheck/p5.c           |  15
-rw-r--r--  xen/arch/x86/cpu/mcheck/winchip.c      |   8
-rw-r--r--  xen/arch/x86/cpu/mcheck/x86_mca.h      |   8
-rw-r--r--  xen/include/asm-x86/traps.h            |   2
-rw-r--r--  xen/include/public/arch-x86/xen-mca.h  |  47
16 files changed, 1439 insertions(+), 1084 deletions(-)
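
At the heart of the change is the new mctelem producer/consumer interface
(mctelem.c/mctelem.h, added by this patch): a handler or poller reserves a
telemetry slot, fills it via mcheck_mca_logout(), and then either commits it
for dom0 to fetch or dismisses it. The following is a minimal sketch of that
flow as the reworked AMD poller (mce_amd_checkregs) uses it; all names are
taken from the patch and the error handling is simplified.

/* Minimal sketch of the new telemetry flow; simplified from
 * mce_amd_checkregs() as modified by this patch. */
static void example_poll_once(void)
{
	struct mca_summary bs;
	mctelem_cookie_t mctc;

	/* Read out all MCA banks; the returned cookie references a reserved
	 * telemetry slot (NULL if nothing was logged or no slot was free). */
	mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
	if (mctc == NULL)
		return;

	if (bs.errcnt && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
		/* Commit the record so dom0 can retrieve it via the
		 * XEN_MC_fetch hypercall, then notify dom0. */
		mctelem_commit(mctc);
		send_guest_global_virq(dom0, VIRQ_MCA);
	} else {
		/* Nobody is listening; release the reservation. */
		mctelem_dismiss(mctc);
	}
}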
diff --git a/xen/arch/x86/cpu/mcheck/Makefile b/xen/arch/x86/cpu/mcheck/Makefile
index 15fed6eb0b..ed0ae00058 100644
--- a/xen/arch/x86/cpu/mcheck/Makefile
+++ b/xen/arch/x86/cpu/mcheck/Makefile
@@ -2,6 +2,7 @@ obj-y += amd_nonfatal.o
obj-y += k7.o
obj-y += amd_k8.o
obj-y += amd_f10.o
+obj-y += mctelem.o
obj-y += mce.o
obj-y += mce_intel.o
obj-y += non-fatal.o
diff --git a/xen/arch/x86/cpu/mcheck/amd_f10.c b/xen/arch/x86/cpu/mcheck/amd_f10.c
index 9c26ef9fe8..dd2a54b8fa 100644
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c
@@ -49,20 +49,21 @@
#include "x86_mca.h"
-static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+static enum mca_extinfo
+amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
{
struct mcinfo_extended mc_ext;
/* Family 0x10 introduced additional MSR that belong to the
* northbridge bank (4). */
- if (bank != 4)
- return 0;
+ if (mi == NULL || bank != 4)
+ return MCA_EXTINFO_IGNORED;
if (!(status & MCi_STATUS_VAL))
- return 0;
+ return MCA_EXTINFO_IGNORED;
if (!(status & MCi_STATUS_MISCV))
- return 0;
+ return MCA_EXTINFO_IGNORED;
memset(&mc_ext, 0, sizeof(mc_ext));
mc_ext.common.type = MC_TYPE_EXTENDED;
@@ -78,23 +79,25 @@ static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
x86_mcinfo_add(mi, &mc_ext);
- return 1;
+ return MCA_EXTINFO_LOCAL;
}
extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
/* AMD Family10 machine check */
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c)
{
uint64_t value;
uint32_t i;
int cpu_nr;
- machine_check_vector = k8_machine_check;
- mc_callback_bank_extended = amd_f10_handler;
+ if (!cpu_has(c, X86_FEATURE_MCA))
+ return 0;
+
+ x86_mce_vector_register(k8_machine_check);
+ x86_mce_callback_register(amd_f10_handler);
cpu_nr = smp_processor_id();
- wmb();
rdmsrl(MSR_IA32_MCG_CAP, value);
if (value & MCG_CTL_P) /* Control register present ? */
@@ -104,18 +107,9 @@ void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
for (i = 0; i < nr_mce_banks; i++) {
switch (i) {
case 4: /* Northbridge */
- /* Enable error reporting of all errors,
- * enable error checking and
- * disable sync flooding */
- wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+ /* Enable error reporting of all errors */
+ wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
-
- /* XXX: We should write the value 0x1087821UL into
- * to register F3x180 here, which sits in
- * the PCI extended configuration space.
- * Since this is not possible here, we can only hope,
- * Dom0 is doing that.
- */
break;
default:
@@ -128,4 +122,5 @@ void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
set_in_cr4(X86_CR4_MCE);
printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr);
+ return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/amd_k8.c b/xen/arch/x86/cpu/mcheck/amd_k8.c
index 768bfadb2a..03c36d3a1d 100644
--- a/xen/arch/x86/cpu/mcheck/amd_k8.c
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c
@@ -67,234 +67,27 @@
#include <asm/msr.h>
#include "mce.h"
-#include "x86_mca.h"
/* Machine Check Handler for AMD K8 family series */
void k8_machine_check(struct cpu_user_regs *regs, long error_code)
{
- struct vcpu *vcpu = current;
- struct domain *curdom;
- struct mc_info *mc_data;
- struct mcinfo_global mc_global;
- struct mcinfo_bank mc_info;
- uint64_t status, addrv, miscv, uc;
- uint32_t i;
- unsigned int cpu_nr;
- uint32_t xen_impacted = 0;
-#define DOM_NORMAL 0
-#define DOM0_TRAP 1
-#define DOMU_TRAP 2
-#define DOMU_KILLED 4
- uint32_t dom_state = DOM_NORMAL;
-
- /* This handler runs as interrupt gate. So IPIs from the
- * polling service routine are defered until we finished.
- */
-
- /* Disable interrupts for the _vcpu_. It may not re-scheduled to
- * an other physical CPU or the impacted process in the guest
- * continues running with corrupted data, otherwise. */
- vcpu_schedule_lock_irq(vcpu);
-
- mc_data = x86_mcinfo_getptr();
- cpu_nr = smp_processor_id();
- BUG_ON(cpu_nr != vcpu->processor);
-
- curdom = vcpu->domain;
-
- memset(&mc_global, 0, sizeof(mc_global));
- mc_global.common.type = MC_TYPE_GLOBAL;
- mc_global.common.size = sizeof(mc_global);
-
- mc_global.mc_domid = curdom->domain_id; /* impacted domain */
-
- x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
- &mc_global.mc_coreid, &mc_global.mc_core_threadid,
- &mc_global.mc_apicid, NULL, NULL, NULL);
-
- mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
- mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
- rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
- /* Quick check, who is impacted */
- xen_impacted = is_idle_domain(curdom);
-
- /* Dom0 */
- x86_mcinfo_clear(mc_data);
- x86_mcinfo_add(mc_data, &mc_global);
-
- for (i = 0; i < nr_mce_banks; i++) {
- struct domain *d;
-
- rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
- if (!(status & MCi_STATUS_VAL))
- continue;
-
- /* An error happened in this bank.
- * This is expected to be an uncorrectable error,
- * since correctable errors get polled.
- */
- uc = status & MCi_STATUS_UC;
-
- memset(&mc_info, 0, sizeof(mc_info));
- mc_info.common.type = MC_TYPE_BANK;
- mc_info.common.size = sizeof(mc_info);
- mc_info.mc_bank = i;
- mc_info.mc_status = status;
-
- addrv = 0;
- if (status & MCi_STATUS_ADDRV) {
- rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
-
- d = maddr_get_owner(addrv);
- if (d != NULL)
- mc_info.mc_domid = d->domain_id;
- }
-
- miscv = 0;
- if (status & MCi_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
-
- mc_info.mc_addr = addrv;
- mc_info.mc_misc = miscv;
-
- x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
-
- if (mc_callback_bank_extended)
- mc_callback_bank_extended(mc_data, i, status);
-
- /* clear status */
- wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
-
- status = mc_global.mc_gstatus;
-
- /* clear MCIP or cpu enters shutdown state
- * in case another MCE occurs. */
- status &= ~MCG_STATUS_MCIP;
- wrmsrl(MSR_IA32_MCG_STATUS, status);
- wmb();
-
- /* For the details see the discussion "MCE/MCA concept" on xen-devel.
- * The thread started here:
- * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
- */
-
- /* MCG_STATUS_RIPV:
- * When this bit is not set, then the instruction pointer onto the stack
- * to resume at is not valid. If xen is interrupted, then we panic anyway
- * right below. Otherwise it is up to the guest to figure out if
- * guest kernel or guest userland is affected and should kill either
- * itself or the affected process.
- */
-
- /* MCG_STATUS_EIPV:
- * Evaluation of EIPV is the job of the guest.
- */
-
- if (xen_impacted) {
- /* Now we are going to panic anyway. Allow interrupts, so that
- * printk on serial console can work. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* Uh, that means, machine check exception
- * inside Xen occured. */
- printk("Machine check exception occured in Xen.\n");
-
- /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
- * to the error then it makes sense to print a stack trace.
- * That can be useful for more detailed error analysis and/or
- * error case studies to figure out, if we can clear
- * xen_impacted and kill a DomU instead
- * (i.e. if a guest only control structure is affected, but then
- * we must ensure the bad pages are not re-used again).
- */
- if (status & MCG_STATUS_EIPV) {
- printk("MCE: Instruction Pointer is related to the error. "
- "Therefore, print the execution state.\n");
- show_execution_state(regs);
- }
- x86_mcinfo_dump(mc_data);
- mc_panic("End of MCE. Use mcelog to decode above error codes.\n");
- }
-
- /* If Dom0 registered a machine check handler, which is only possible
- * with a PV MCA driver, then ... */
- if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
- dom_state = DOM0_TRAP;
-
- /* ... deliver machine check trap to Dom0. */
- send_guest_trap(dom0, 0, TRAP_machine_check);
-
- /* Xen may tell Dom0 now to notify the DomU.
- * But this will happen through a hypercall. */
- } else
- /* Dom0 did not register a machine check handler, but if DomU
- * did so, then... */
- if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
- dom_state = DOMU_TRAP;
-
- /* ... deliver machine check trap to DomU */
- send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
- } else {
- /* hmm... noone feels responsible to handle the error.
- * So, do a quick check if a DomU is impacted or not.
- */
- if (curdom == dom0) {
- /* Dom0 is impacted. Since noone can't handle
- * this error, panic! */
- x86_mcinfo_dump(mc_data);
- mc_panic("MCE occured in Dom0, which it can't handle\n");
-
- /* UNREACHED */
- } else {
- dom_state = DOMU_KILLED;
-
- /* Enable interrupts. This basically results in
- * calling sti on the *physical* cpu. But after
- * domain_crash() the vcpu pointer is invalid.
- * Therefore, we must unlock the irqs before killing
- * it. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* DomU is impacted. Kill it and continue. */
- domain_crash(curdom);
- }
- }
-
-
- switch (dom_state) {
- case DOM0_TRAP:
- case DOMU_TRAP:
- /* Enable interrupts. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* guest softirqs and event callbacks are scheduled
- * immediately after this handler exits. */
- break;
- case DOMU_KILLED:
- /* Nothing to do here. */
- break;
- default:
- BUG();
- }
+ mcheck_cmn_handler(regs, error_code, mca_allbanks);
}
-
/* AMD K8 machine check */
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c)
{
uint64_t value;
uint32_t i;
int cpu_nr;
- machine_check_vector = k8_machine_check;
+ /* Check for PPro style MCA; our caller has confirmed MCE support. */
+ if (!cpu_has(c, X86_FEATURE_MCA))
+ return 0;
+
+ x86_mce_vector_register(k8_machine_check);
cpu_nr = smp_processor_id();
- wmb();
rdmsrl(MSR_IA32_MCG_CAP, value);
if (value & MCG_CTL_P) /* Control register present ? */
@@ -304,10 +97,8 @@ void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
for (i = 0; i < nr_mce_banks; i++) {
switch (i) {
case 4: /* Northbridge */
- /* Enable error reporting of all errors,
- * enable error checking and
- * disable sync flooding */
- wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+ /* Enable error reporting of all errors */
+ wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
break;
@@ -321,4 +112,6 @@ void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
set_in_cr4(X86_CR4_MCE);
printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+
+ return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
index f57e4e3811..01766c2a45 100644
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
@@ -58,22 +58,23 @@
#include <xen/smp.h>
#include <xen/timer.h>
#include <xen/event.h>
-#include <asm/processor.h>
+
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
-#include "x86_mca.h"
static struct timer mce_timer;
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(10000)
#define MCE_MIN MILLISECS(2000)
#define MCE_MAX MILLISECS(30000)
static s_time_t period = MCE_PERIOD;
static int hw_threshold = 0;
static int adjust = 0;
+static int variable_period = 1;
/* The polling service routine:
* Collects information of correctable errors and notifies
@@ -81,99 +82,46 @@ static int adjust = 0;
*/
void mce_amd_checkregs(void *info)
{
- struct vcpu *vcpu = current;
- struct mc_info *mc_data;
- struct mcinfo_global mc_global;
- struct mcinfo_bank mc_info;
- uint64_t status, addrv, miscv;
- unsigned int i;
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
unsigned int event_enabled;
- unsigned int cpu_nr;
- int error_found;
- /* We don't need a slot yet. Only allocate one on error. */
- mc_data = NULL;
+ mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
- cpu_nr = smp_processor_id();
- BUG_ON(cpu_nr != vcpu->processor);
event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
- error_found = 0;
-
- memset(&mc_global, 0, sizeof(mc_global));
- mc_global.common.type = MC_TYPE_GLOBAL;
- mc_global.common.size = sizeof(mc_global);
-
- mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
- mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-
- x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
- &mc_global.mc_coreid, &mc_global.mc_core_threadid,
- &mc_global.mc_apicid, NULL, NULL, NULL);
-
- mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
- rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
- for (i = 0; i < nr_mce_banks; i++) {
- struct domain *d;
-
- rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
-
- if (!(status & MCi_STATUS_VAL))
- continue;
-
- if (mc_data == NULL) {
- /* Now we need a slot to fill in error telemetry. */
- mc_data = x86_mcinfo_getptr();
- BUG_ON(mc_data == NULL);
- x86_mcinfo_clear(mc_data);
- x86_mcinfo_add(mc_data, &mc_global);
- }
-
- memset(&mc_info, 0, sizeof(mc_info));
- mc_info.common.type = MC_TYPE_BANK;
- mc_info.common.size = sizeof(mc_info);
- mc_info.mc_bank = i;
- mc_info.mc_status = status;
- /* Increase polling frequency */
- error_found = 1;
-
- addrv = 0;
- if (status & MCi_STATUS_ADDRV) {
- rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
-
- d = maddr_get_owner(addrv);
- if (d != NULL)
- mc_info.mc_domid = d->domain_id;
- }
-
- miscv = 0;
- if (status & MCi_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
-
- mc_info.mc_addr = addrv;
- mc_info.mc_misc = miscv;
- x86_mcinfo_add(mc_data, &mc_info);
+ if (bs.errcnt && mctc != NULL) {
+ static uint64_t dumpcount = 0;
- if (mc_callback_bank_extended)
- mc_callback_bank_extended(mc_data, i, status);
+ /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+ * Otherwise, if dom0 has had plenty of time to register
+ * the virq handler but still hasn't then dump telemetry
+ * to the Xen console. The call count may be incremented
+ * on multiple cpus at once and is indicative only - just
+ * a simple-minded attempt to avoid spamming the console
+ * for corrected errors in early startup. */
- /* clear status */
- wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
- wmb();
- }
-
- if (error_found > 0) {
- /* If Dom0 enabled the VIRQ_MCA event, then ... */
- if (event_enabled)
- /* ... notify it. */
+ if (event_enabled) {
+ mctelem_commit(mctc);
send_guest_global_virq(dom0, VIRQ_MCA);
- else
- /* ... or dump it */
- x86_mcinfo_dump(mc_data);
+ } else if (++dumpcount >= 10) {
+ x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ } else {
+ mctelem_dismiss(mctc);
+ }
+
+ } else if (mctc != NULL) {
+ mctelem_dismiss(mctc);
}
- adjust += error_found;
+ /* adjust is global and all cpus may attempt to increment it without
+ * synchronisation, so they race and the final adjust count
+ * (number of cpus seeing any error) is approximate. We can
+ * guarantee that if any cpu observes an error that the
+ * adjust count is at least 1. */
+ if (bs.errcnt)
+ adjust++;
}
/* polling service routine invoker:
@@ -188,7 +136,7 @@ static void mce_amd_work_fn(void *data)
on_each_cpu(mce_amd_checkregs, data, 1, 1);
if (adjust > 0) {
- if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+ if (!guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
/* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
printk("MCE: polling routine found correctable error. "
" Use mcelog to parse above error output.\n");
@@ -229,19 +177,19 @@ static void mce_amd_work_fn(void *data)
}
}
- if (adjust > 0) {
+ if (variable_period && adjust > 0) {
/* Increase polling frequency */
adjust++; /* adjust == 1 must have an effect */
period /= adjust;
- } else {
+ } else if (variable_period) {
/* Decrease polling frequency */
period *= 2;
}
- if (period > MCE_MAX) {
+ if (variable_period && period > MCE_MAX) {
/* limit: Poll at least every 30s */
period = MCE_MAX;
}
- if (period < MCE_MIN) {
+ if (variable_period && period < MCE_MIN) {
/* limit: Poll every 2s.
* When this is reached an uncorrectable error
* is expected to happen, if Dom0 does nothing.
@@ -262,7 +210,7 @@ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
/* The threshold bitfields in MSR_IA32_MC4_MISC has
* been introduced along with the SVME feature bit. */
- if (cpu_has(c, X86_FEATURE_SVME)) {
+ if (variable_period && cpu_has(c, X86_FEATURE_SVME)) {
uint64_t value;
/* hw threshold registers present */
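
The polling interval above is now adjusted only when variable_period is set.
The computation spread across mce_amd_work_fn() boils down to the following
sketch; it restates the logic in this patch (constants as defined above) and
is not itself part of the change.

/* Sketch of the adaptive polling period used by mce_amd_work_fn(). */
static s_time_t next_poll_period(s_time_t period, int adjust)
{
	if (!variable_period)
		return period;

	if (adjust > 0) {
		adjust++;		/* adjust == 1 must still have an effect */
		period /= adjust;	/* errors seen: poll more often */
	} else {
		period *= 2;		/* quiet interval: back off */
	}

	if (period > MCE_MAX)
		period = MCE_MAX;	/* poll at least every 30s */
	if (period < MCE_MIN)
		period = MCE_MIN;	/* poll at most every 2s */

	return period;
}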
diff --git a/xen/arch/x86/cpu/mcheck/k7.c b/xen/arch/x86/cpu/mcheck/k7.c
index aedd0a0e1f..1a0a0a5fef 100644
--- a/xen/arch/x86/cpu/mcheck/k7.c
+++ b/xen/arch/x86/cpu/mcheck/k7.c
@@ -68,13 +68,16 @@ static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_co
/* AMD K7 machine check */
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
- machine_check_vector = k7_machine_check;
- wmb();
+ /* Check for PPro style MCA; our caller has confirmed MCE support. */
+ if (!cpu_has(c, X86_FEATURE_MCA))
+ return 0;
+
+ x86_mce_vector_register(k7_machine_check);
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
@@ -92,4 +95,6 @@ void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
smp_processor_id());
+
+ return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index 3b622adc50..a6051d9755 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -10,104 +10,490 @@
#include <xen/smp.h>
#include <xen/errno.h>
#include <xen/console.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+#include <xen/guest_access.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
+#include <asm/msr.h>
#include "mce.h"
-#include "x86_mca.h"
int mce_disabled = 0;
unsigned int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
-/* XXX For now a fixed array is used. Later this should be changed
- * to a dynamic allocated array with the size calculated in relation
- * to physical cpus present in the machine.
- * The more physical cpus are available, the more entries you need.
- */
-#define MAX_MCINFO 20
-
-struct mc_machine_notify {
- struct mc_info mc;
- uint32_t fetch_idx;
- uint32_t valid;
-};
+static void mcinfo_clear(struct mc_info *);
-struct mc_machine {
+#define SEG_PL(segsel) ((segsel) & 0x3)
- /* Array structure used for collecting machine check error telemetry. */
- struct mc_info mc[MAX_MCINFO];
+#if 1 /* XXFM switch to 0 for putback */
- /* We handle multiple machine check reports lockless by
- * iterating through the array using the producer/consumer concept.
- */
- /* Producer array index to fill with machine check error data.
- * Index must be increased atomically. */
- uint32_t error_idx;
-
- /* Consumer array index to fetch machine check error data from.
- * Index must be increased atomically. */
- uint32_t fetch_idx;
-
- /* Integer array holding the indeces of the mc array that allows
- * a Dom0 to notify a DomU to re-fetch the same machine check error
- * data. The notification and refetch also uses its own
- * producer/consumer mechanism, because Dom0 may decide to not report
- * every error to the impacted DomU.
- */
- struct mc_machine_notify notify[MAX_MCINFO];
+#define x86_mcerr(str, err) _x86_mcerr(str, err)
- /* Array index to get fetch_idx from.
- * Index must be increased atomically. */
- uint32_t notifyproducer_idx;
- uint32_t notifyconsumer_idx;
-};
+static int _x86_mcerr(const char *msg, int err)
+{
+ printk("x86_mcerr: %s, returning %d\n",
+ msg != NULL ? msg : "", err);
+ return err;
+}
+#else
+#define x86_mcerr(str,err)
+#endif
-/* Global variable with machine check information. */
-struct mc_machine mc_data;
+cpu_banks_t mca_allbanks;
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
-{
+{
printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
smp_processor_id());
}
+static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
+
+void x86_mce_vector_register(x86_mce_vector_t hdlr)
+{
+ _machine_check_vector = hdlr;
+ wmb();
+}
+
/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+void machine_check_vector(struct cpu_user_regs *regs, long error_code)
+{
+ _machine_check_vector(regs, error_code);
+}
/* Init machine check callback handler
* It is used to collect additional information provided by newer
* CPU families/models without the need to duplicate the whole handler.
* This avoids having many handlers doing almost nearly the same and each
* with its own tweaks ands bugs. */
-int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
+static x86_mce_callback_t mc_callback_bank_extended = NULL;
+
+void x86_mce_callback_register(x86_mce_callback_t cbfunc)
+{
+ mc_callback_bank_extended = cbfunc;
+}
+
+/* Utility function to perform MCA bank telemetry readout and to push that
+ * telemetry towards an interested dom0 for logging and diagnosis.
+ * The caller - #MC handler or MCA poll function - must arrange that we
+ * do not migrate cpus. */
+
+/* XXFM Could add overflow counting? */
+mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
+ struct mca_summary *sp)
+{
+ struct vcpu *v = current;
+ struct domain *d;
+ uint64_t gstatus, status, addr, misc;
+ struct mcinfo_global mcg; /* on stack */
+ struct mcinfo_common *mic;
+ struct mcinfo_global *mig; /* on stack */
+ mctelem_cookie_t mctc = NULL;
+ uint32_t uc = 0, pcc = 0;
+ struct mc_info *mci = NULL;
+ mctelem_class_t which = MC_URGENT; /* XXXgcc */
+ unsigned int cpu_nr;
+ int errcnt = 0;
+ int i;
+ enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
+
+ cpu_nr = smp_processor_id();
+ BUG_ON(cpu_nr != v->processor);
+
+ rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+
+ memset(&mcg, 0, sizeof (mcg));
+ mcg.common.type = MC_TYPE_GLOBAL;
+ mcg.common.size = sizeof (mcg);
+ if (v != NULL && ((d = v->domain) != NULL)) {
+ mcg.mc_domid = d->domain_id;
+ mcg.mc_vcpuid = v->vcpu_id;
+ } else {
+ mcg.mc_domid = -1;
+ mcg.mc_vcpuid = -1;
+ }
+ mcg.mc_gstatus = gstatus; /* MCG_STATUS */
+
+ switch (who) {
+ case MCA_MCE_HANDLER:
+ mcg.mc_flags = MC_FLAG_MCE;
+ which = MC_URGENT;
+ break;
+
+ case MCA_POLLER:
+ case MCA_RESET:
+ mcg.mc_flags = MC_FLAG_POLLED;
+ which = MC_NONURGENT;
+ break;
+
+ case MCA_CMCI_HANDLER:
+ mcg.mc_flags = MC_FLAG_CMCI;
+ which = MC_NONURGENT;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Retrieve detector information */
+ x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
+ &mcg.mc_coreid, &mcg.mc_core_threadid,
+ &mcg.mc_apicid, NULL, NULL, NULL);
+
+ for (i = 0; i < 32 && i < nr_mce_banks; i++) {
+ struct mcinfo_bank mcb; /* on stack */
+
+ /* Skip bank if corresponding bit in bankmask is clear */
+ if (!test_bit(i, bankmask))
+ continue;
+
+ rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+ if (!(status & MCi_STATUS_VAL))
+ continue; /* this bank has no valid telemetry */
+
+ /* If this is the first bank with valid MCA DATA, then
+ * try to reserve an entry from the urgent/nonurgent queue
+ * depending on whethere we are called from an exception or
+ * a poller; this can fail (for example dom0 may not
+ * yet have consumed past telemetry). */
+ if (errcnt == 0) {
+ if ((mctc = mctelem_reserve(which)) != NULL) {
+ mci = mctelem_dataptr(mctc);
+ mcinfo_clear(mci);
+ }
+ }
+
+ memset(&mcb, 0, sizeof (mcb));
+ mcb.common.type = MC_TYPE_BANK;
+ mcb.common.size = sizeof (mcb);
+ mcb.mc_bank = i;
+ mcb.mc_status = status;
+
+ /* form a mask of which banks have logged uncorrected errors */
+ if ((status & MCi_STATUS_UC) != 0)
+ uc |= (1 << i);
+
+ /* likewise for those with processor context corrupt */
+ if ((status & MCi_STATUS_PCC) != 0)
+ pcc |= (1 << i);
+
+ addr = misc = 0;
+
+ if (status & MCi_STATUS_ADDRV) {
+ rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
+ d = maddr_get_owner(addr);
+ if (d != NULL && (who == MCA_POLLER ||
+ who == MCA_CMCI_HANDLER))
+ mcb.mc_domid = d->domain_id;
+ }
+
+ if (status & MCi_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
+
+ mcb.mc_addr = addr;
+ mcb.mc_misc = misc;
+
+ if (who == MCA_CMCI_HANDLER) {
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
+ rdtscll(mcb.mc_tsc);
+ }
+ /* Increment the error count; if this is the first bank
+ * with a valid error then add the global info to the mcinfo. */
+ if (errcnt++ == 0 && mci != NULL)
+ x86_mcinfo_add(mci, &mcg);
-static void amd_mcheck_init(struct cpuinfo_x86 *ci)
+ /* Add the bank data */
+ if (mci != NULL)
+ x86_mcinfo_add(mci, &mcb);
+
+ if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
+ cbret = mc_callback_bank_extended(mci, i, status);
+ }
+
+ /* Clear status */
+ wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+ wmb();
+ }
+
+ if (mci != NULL && errcnt > 0) {
+ x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
+ mig = (struct mcinfo_global *)mic;
+ if (pcc)
+ mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
+ else if (uc)
+ mcg.mc_flags |= MC_FLAG_RECOVERABLE;
+ else
+ mcg.mc_flags |= MC_FLAG_CORRECTABLE;
+ }
+
+
+ if (sp) {
+ sp->errcnt = errcnt;
+ sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
+ sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
+ sp->uc = uc;
+ sp->pcc = pcc;
+ }
+
+ return mci != NULL ? mctc : NULL; /* may be NULL */
+}
+
+#define DOM_NORMAL 0
+#define DOM0_TRAP 1
+#define DOMU_TRAP 2
+#define DOMU_KILLED 4
+
+/* Shared #MC handler. */
+void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
+ cpu_banks_t bankmask)
{
+ int xen_state_lost, dom0_state_lost, domU_state_lost;
+ struct vcpu *v = current;
+ struct domain *curdom = v->domain;
+ domid_t domid = curdom->domain_id;
+ int ctx_xen, ctx_dom0, ctx_domU;
+ uint32_t dom_state = DOM_NORMAL;
+ mctelem_cookie_t mctc = NULL;
+ struct mca_summary bs;
+ struct mc_info *mci = NULL;
+ int irqlocked = 0;
+ uint64_t gstatus;
+ int ripv;
+
+ /* This handler runs as interrupt gate. So IPIs from the
+ * polling service routine are defered until we're finished.
+ */
+
+ /* Disable interrupts for the _vcpu_. It may not re-scheduled to
+ * another physical CPU. */
+ vcpu_schedule_lock_irq(v);
+ irqlocked = 1;
+
+ /* Read global status; if it does not indicate machine check
+ * in progress then bail as long as we have a valid ip to return to. */
+ rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+ ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
+ if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
+ add_taint(TAINT_MACHINE_CHECK); /* questionable */
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+ goto cmn_handler_done;
+ }
+
+ /* Go and grab error telemetry. We must choose whether to commit
+ * for logging or dismiss the cookie that is returned, and must not
+ * reference the cookie after that action.
+ */
+ mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs);
+ if (mctc != NULL)
+ mci = (struct mc_info *)mctelem_dataptr(mctc);
+
+ /* Clear MCIP or another #MC will enter shutdown state */
+ gstatus &= ~MCG_STATUS_MCIP;
+ wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
+ wmb();
+
+ /* If no valid errors and our stack is intact, we're done */
+ if (ripv && bs.errcnt == 0) {
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+ goto cmn_handler_done;
+ }
+
+ if (bs.uc || bs.pcc)
+ add_taint(TAINT_MACHINE_CHECK);
+
+ /* Machine check exceptions will usually be for UC and/or PCC errors,
+ * but it is possible to configure machine check for some classes
+ * of corrected error.
+ *
+ * UC errors could compromise any domain or the hypervisor
+ * itself - for example a cache writeback of modified data that
+ * turned out to be bad could be for data belonging to anyone, not
+ * just the current domain. In the absence of known data poisoning
+ * to prevent consumption of such bad data in the system we regard
+ * all UC errors as terminal. It may be possible to attempt some
+ * heuristics based on the address affected, which guests have
+ * mappings to that mfn etc.
+ *
+ * PCC errors apply to the current context.
+ *
+ * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
+ * and not PCC is terminal - the return instruction pointer
+ * pushed onto the stack is bogus. If the interrupt context is
+ * the hypervisor or dom0 the game is over, otherwise we can
+ * limit the impact to a single domU but only if we trampoline
+ * somewhere safely - we can't return and unwind the stack.
+ * Since there is no trampoline in place we will treat !RIPV
+ * as terminal for any context.
+ */
+ ctx_xen = SEG_PL(regs->cs) == 0;
+ ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
+ ctx_domU = !ctx_xen && !ctx_dom0;
+
+ xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
+ !ripv;
+ dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
+ domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
+
+ if (xen_state_lost) {
+ /* Now we are going to panic anyway. Allow interrupts, so that
+ * printk on serial console can work. */
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+
+ printk("Terminal machine check exception occured in "
+ "hypervisor context.\n");
+
+ /* If MCG_STATUS_EIPV indicates, the IP on the stack is related
+ * to the error then it makes sense to print a stack trace.
+ * That can be useful for more detailed error analysis and/or
+ * error case studies to figure out, if we can clear
+ * xen_impacted and kill a DomU instead
+ * (i.e. if a guest only control structure is affected, but then
+ * we must ensure the bad pages are not re-used again).
+ */
+ if (bs.eipv & MCG_STATUS_EIPV) {
+ printk("MCE: Instruction Pointer is related to the "
+ "error, therefore print the execution state.\n");
+ show_execution_state(regs);
+ }
+
+ /* Commit the telemetry so that panic flow can find it. */
+ if (mctc != NULL) {
+ x86_mcinfo_dump(mci);
+ mctelem_commit(mctc);
+ }
+ mc_panic("Hypervisor state lost due to machine check "
+ "exception.\n");
+ /*NOTREACHED*/
+ }
+
+ /*
+ * Xen hypervisor state is intact. If dom0 state is lost then
+ * give it a chance to decide what to do if it has registered
+ * a handler for this event, otherwise panic.
+ *
+ * XXFM Could add some Solaris dom0 contract kill here?
+ */
+ if (dom0_state_lost) {
+ if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+ dom_state = DOM0_TRAP;
+ send_guest_trap(dom0, 0, TRAP_machine_check);
+ /* XXFM case of return with !ripv ??? */
+ } else {
+ /* Commit telemetry for panic flow. */
+ if (mctc != NULL) {
+ x86_mcinfo_dump(mci);
+ mctelem_commit(mctc);
+ }
+ mc_panic("Dom0 state lost due to machine check "
+ "exception\n");
+ /*NOTREACHED*/
+ }
+ }
+
+ /*
+ * If a domU has lost state then send it a trap if it has registered
+ * a handler, otherwise crash the domain.
+ * XXFM Revisit this functionality.
+ */
+ if (domU_state_lost) {
+ if (guest_has_trap_callback(v->domain, v->vcpu_id,
+ TRAP_machine_check)) {
+ dom_state = DOMU_TRAP;
+ send_guest_trap(curdom, v->vcpu_id,
+ TRAP_machine_check);
+ } else {
+ dom_state = DOMU_KILLED;
+ /* Enable interrupts. This basically results in
+ * calling sti on the *physical* cpu. But after
+ * domain_crash() the vcpu pointer is invalid.
+ * Therefore, we must unlock the irqs before killing
+ * it. */
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+
+ /* DomU is impacted. Kill it and continue. */
+ domain_crash(curdom);
+ }
+ }
+
+ switch (dom_state) {
+ case DOM0_TRAP:
+ case DOMU_TRAP:
+ /* Enable interrupts. */
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+
+ /* guest softirqs and event callbacks are scheduled
+ * immediately after this handler exits. */
+ break;
+ case DOMU_KILLED:
+ /* Nothing to do here. */
+ break;
+
+ case DOM_NORMAL:
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+ break;
+ }
+
+cmn_handler_done:
+ BUG_ON(irqlocked);
+ BUG_ON(!ripv);
+
+ if (bs.errcnt) {
+ /* Not panicing, so forward telemetry to dom0 now if it
+ * is interested. */
+ if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ if (mctc != NULL)
+ mctelem_commit(mctc);
+ send_guest_global_virq(dom0, VIRQ_MCA);
+ } else {
+ x86_mcinfo_dump(mci);
+ if (mctc != NULL)
+ mctelem_dismiss(mctc);
+ }
+ } else if (mctc != NULL) {
+ mctelem_dismiss(mctc);
+ }
+}
+
+static int amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+ int rc = 0;
switch (ci->x86) {
case 6:
- amd_k7_mcheck_init(ci);
+ rc = amd_k7_mcheck_init(ci);
break;
case 0xf:
- amd_k8_mcheck_init(ci);
+ rc = amd_k8_mcheck_init(ci);
break;
case 0x10:
- amd_f10_mcheck_init(ci);
+ rc = amd_f10_mcheck_init(ci);
break;
default:
/* Assume that machine check support is available.
* The minimum provided support is at least the K8. */
- amd_k8_mcheck_init(ci);
+ rc = amd_k8_mcheck_init(ci);
}
+
+ return rc;
}
/*check the existence of Machine Check*/
@@ -116,50 +502,81 @@ int mce_available(struct cpuinfo_x86 *c)
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
+/*
+ * Check if bank 0 is usable for MCE. It isn't for AMD K7,
+ * and Intel P6 family before model 0x1a.
+ */
+int mce_firstbank(struct cpuinfo_x86 *c)
+{
+ if (c->x86 == 6) {
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return 1;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
+ return 1;
+ }
+
+ return 0;
+}
+
/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
+ int inited = 0, i;
+
if (mce_disabled == 1) {
printk(XENLOG_INFO "MCE support disabled by bootparam\n");
return;
}
+ for (i = 0; i < MAX_NR_BANKS; i++)
+ set_bit(i,mca_allbanks);
+
+ /* Enforce at least MCE support in CPUID information. Individual
+ * families may also need to enforce a check for MCA support. */
if (!cpu_has(c, X86_FEATURE_MCE)) {
printk(XENLOG_INFO "CPU%i: No machine check support available\n",
smp_processor_id());
return;
}
- memset(&mc_data, 0, sizeof(struct mc_machine));
+ mctelem_init(sizeof (struct mc_info));
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
- amd_mcheck_init(c);
+ inited = amd_mcheck_init(c);
break;
case X86_VENDOR_INTEL:
+ switch (c->x86) {
+ case 5:
#ifndef CONFIG_X86_64
- if (c->x86==5)
- intel_p5_mcheck_init(c);
+ inited = intel_p5_mcheck_init(c);
#endif
- /*If it is P6 or P4 family, including CORE 2 DUO series*/
- if (c->x86 == 6 || c->x86==15)
- {
- printk(KERN_DEBUG "MCE: Intel newly family MC Init\n");
- intel_mcheck_init(c);
+ break;
+
+ case 6:
+ case 15:
+ inited = intel_mcheck_init(c);
+ break;
}
break;
#ifndef CONFIG_X86_64
case X86_VENDOR_CENTAUR:
- if (c->x86==5)
- winchip_mcheck_init(c);
+ if (c->x86==5) {
+ inited = winchip_mcheck_init(c);
+ }
break;
#endif
default:
break;
}
+
+ if (!inited)
+ printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
+ smp_processor_id());
}
@@ -176,191 +593,12 @@ static void __init mcheck_enable(char *str)
custom_param("nomce", mcheck_disable);
custom_param("mce", mcheck_enable);
-
-#include <xen/guest_access.h>
-#include <asm/traps.h>
-
-struct mc_info *x86_mcinfo_getptr(void)
-{
- struct mc_info *mi;
- uint32_t entry, next;
-
- for (;;) {
- entry = mc_data.error_idx;
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
- break;
- }
-
- mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
- BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
-
- return mi;
-}
-
-static int x86_mcinfo_matches_guest(const struct mc_info *mi,
- const struct domain *d, const struct vcpu *v)
-{
- struct mcinfo_common *mic;
- struct mcinfo_global *mig;
-
- x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
- mig = (struct mcinfo_global *)mic;
- if (mig == NULL)
- return 0;
-
- if (d->domain_id != mig->mc_domid)
- return 0;
-
- if (v->vcpu_id != mig->mc_vcpuid)
- return 0;
-
- return 1;
-}
-
-
-#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
-
-static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
- const struct domain *d, const struct vcpu *v)
-{
- struct mc_info *mi;
-
- /* This function is called from the fetch hypercall with
- * the mc_lock spinlock held. Thus, no need for locking here.
- */
- mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
- if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
- /* Bogus domU command detected. */
- *fetch_idx = 0;
- return NULL;
- }
-
- *fetch_idx = mc_data.fetch_idx;
- mc_data.fetch_idx++;
- BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
-
- return mi;
-}
-
-
-static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
-{
- struct mc_machine_notify *mn;
- struct mcinfo_common *mic = NULL;
- struct mcinfo_global *mig;
- struct domain *d;
- int i;
-
- /* This function is called from the notifier hypercall with
- * the mc_notify_lock spinlock held. Thus, no need for locking here.
- */
-
- /* First invalidate entries for guests that disappeared after
- * notification (e.g. shutdown/crash). This step prevents the
- * notification array from filling up with stalling/leaking entries.
- */
- for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
- mn = &(mc_data.notify[(i % MAX_MCINFO)]);
- x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
- BUG_ON(mic == NULL);
- mig = (struct mcinfo_global *)mic;
- d = get_domain_by_id(mig->mc_domid);
- if (d == NULL) {
- /* Domain does not exist. */
- mn->valid = 0;
- }
- if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
- mc_data.notifyconsumer_idx++;
- }
-
- /* Now put in the error telemetry. Since all error data fetchable
- * by domUs are uncorrectable errors, they are very important.
- * So we dump them before overriding them. When a guest takes that long,
- * then we can assume something bad already happened (crash, hang, etc.)
- */
- mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
-
- if (mn->valid) {
- struct mcinfo_common *mic = NULL;
- struct mcinfo_global *mig;
-
- /* To not loose the information, we dump it. */
- x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
- BUG_ON(mic == NULL);
- mig = (struct mcinfo_global *)mic;
- printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
- "fetch machine check error telemetry. But Domain ID "
- "did not do that in time.\n",
- mig->mc_domid);
- x86_mcinfo_dump(&mn->mc);
- }
-
- memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
- sizeof(struct mc_info));
- mn->fetch_idx = mc_notifydomain->fetch_idx;
- mn->valid = 1;
-
- mc_data.notifyproducer_idx++;
-
- /* By design there can never be more notifies than machine check errors.
- * If that ever happens, then we hit a bug. */
- BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
- BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
-}
-
-static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
- const struct domain *d, const struct vcpu *v)
-{
- struct mc_machine_notify *mn = NULL;
- uint32_t i;
- int found;
-
- /* This function is called from the fetch hypercall with
- * the mc_notify_lock spinlock held. Thus, no need for locking here.
- */
-
- /* The notifier data is filled in the order guests get notified, but
- * guests may fetch them in a different order. That's why we need
- * the game with valid/invalid entries. */
- found = 0;
- for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
- mn = &(mc_data.notify[(i % MAX_MCINFO)]);
- if (!mn->valid) {
- if (i == mc_data.notifyconsumer_idx)
- mc_data.notifyconsumer_idx++;
- continue;
- }
- if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
- found = 1;
- break;
- }
- }
-
- if (!found) {
- /* This domain has never been notified. This must be
- * a bogus domU command. */
- *fetch_idx = 0;
- return NULL;
- }
-
- BUG_ON(mn == NULL);
- *fetch_idx = mn->fetch_idx;
- mn->valid = 0;
-
- BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
- return &mn->mc;
-}
-
-
-void x86_mcinfo_clear(struct mc_info *mi)
+static void mcinfo_clear(struct mc_info *mi)
{
memset(mi, 0, sizeof(struct mc_info));
x86_mcinfo_nentries(mi) = 0;
}
-
int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
{
int i;
@@ -380,7 +618,7 @@ int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
if (end1 < end2)
- return -ENOSPC; /* No space. Can't add entry. */
+ return x86_mcerr("mcinfo_add: no more sparc", -ENOSPC);
/* there's enough space. add entry. */
memcpy(mic_index, mic, mic->size);
@@ -389,7 +627,6 @@ int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
return 0;
}
-
/* Dump machine check information in a format,
* mcelog can parse. This is used only when
* Dom0 does not take the notification. */
@@ -404,7 +641,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
if (mic == NULL)
return;
mc_global = (struct mcinfo_global *)mic;
- if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+ if (mc_global->mc_flags & MC_FLAG_MCE) {
printk(XENLOG_WARNING
"CPU%d: Machine Check Exception: %16"PRIx64"\n",
mc_global->mc_coreid, mc_global->mc_gstatus);
@@ -424,7 +661,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
goto next;
mc_bank = (struct mcinfo_bank *)mic;
-
+
printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
mc_bank->mc_bank,
mc_bank->mc_status);
@@ -441,8 +678,6 @@ next:
} while (1);
}
-
-
static void do_mc_get_cpu_info(void *v)
{
int cpu = smp_processor_id();
@@ -533,183 +768,141 @@ void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
}
}
+#if BITS_PER_LONG == 64
+
+#define ID2COOKIE(id) ((mctelem_cookie_t)(id))
+#define COOKIE2ID(c) ((uint64_t)(c))
+
+#elif BITS_PER_LONG == 32
+
+#define ID2COOKIE(id) ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
+#define COOKIE2ID(c) ((uint64_t)(uint32_t)(c))
+
+#elif defined(BITS_PER_LONG)
+#error BITS_PER_LONG has unexpected value
+#else
+#error BITS_PER_LONG definition absent
+#endif
+
/* Machine Check Architecture Hypercall */
long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
{
long ret = 0;
struct xen_mc curop, *op = &curop;
struct vcpu *v = current;
- struct domain *domU;
struct xen_mc_fetch *mc_fetch;
- struct xen_mc_notifydomain *mc_notifydomain;
struct xen_mc_physcpuinfo *mc_physcpuinfo;
- struct mc_info *mi;
- uint32_t flags;
- uint32_t fetch_idx;
- uint16_t vcpuid;
- /* Use a different lock for the notify hypercall in order to allow
- * a DomU to fetch mc data while Dom0 notifies another DomU. */
- static DEFINE_SPINLOCK(mc_lock);
- static DEFINE_SPINLOCK(mc_notify_lock);
+ uint32_t flags, cmdflags;
int nlcpu;
xen_mc_logical_cpu_t *log_cpus = NULL;
+ mctelem_cookie_t mctc;
+ mctelem_class_t which;
if ( copy_from_guest(op, u_xen_mc, 1) )
- return -EFAULT;
+ return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
- return -EACCES;
+ return x86_mcerr("do_mca: interface version mismatch", -EACCES);
- switch ( op->cmd ) {
+ switch (op->cmd) {
case XEN_MC_fetch:
- /* This hypercall is for any domain */
mc_fetch = &op->u.mc_fetch;
+ cmdflags = mc_fetch->flags;
+
+ /* This hypercall is for Dom0 only */
+ if (!IS_PRIV(v->domain) )
+ return x86_mcerr(NULL, -EPERM);
- switch (mc_fetch->flags) {
- case XEN_MC_CORRECTABLE:
- /* But polling mode is Dom0 only, because
- * correctable errors are reported to Dom0 only */
- if ( !IS_PRIV(v->domain) )
- return -EPERM;
+ switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
+ case XEN_MC_NONURGENT:
+ which = MC_NONURGENT;
break;
- case XEN_MC_TRAP:
+ case XEN_MC_URGENT:
+ which = MC_URGENT;
break;
+
default:
- return -EFAULT;
+ return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
}
flags = XEN_MC_OK;
- spin_lock(&mc_lock);
- if ( IS_PRIV(v->domain) ) {
- /* this must be Dom0. So a notify hypercall
- * can't have happened before. */
- mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+ if (cmdflags & XEN_MC_ACK) {
+ mctelem_cookie_t cookie = ID2COOKIE(mc_fetch->fetch_id);
+ mctelem_ack(which, cookie);
} else {
- /* Hypercall comes from an unprivileged domain */
- domU = v->domain;
- if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
- /* Dom0 must have notified this DomU before
- * via the notify hypercall. */
- mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
+ if (guest_handle_is_null(mc_fetch->data))
+ return x86_mcerr("do_mca fetch: guest buffer "
+ "invalid", -EINVAL);
+
+ if ((mctc = mctelem_consume_oldest_begin(which))) {
+ struct mc_info *mcip = mctelem_dataptr(mctc);
+ if (copy_to_guest(mc_fetch->data, mcip, 1)) {
+ ret = -EFAULT;
+ flags |= XEN_MC_FETCHFAILED;
+ mc_fetch->fetch_id = 0;
+ } else {
+ mc_fetch->fetch_id = COOKIE2ID(mctc);
+ }
+ mctelem_consume_oldest_end(mctc);
} else {
- /* Xen notified the DomU. */
- mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
+ /* There is no data */
+ flags |= XEN_MC_NODATA;
+ mc_fetch->fetch_id = 0;
}
- }
- if (mi) {
- memcpy(&mc_fetch->mc_info, mi,
- sizeof(struct mc_info));
- } else {
- /* There is no data for a bogus DomU command. */
- flags |= XEN_MC_NODATA;
- memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
+ mc_fetch->flags = flags;
+ if (copy_to_guest(u_xen_mc, op, 1) != 0)
+ ret = -EFAULT;
}
- mc_fetch->flags = flags;
- mc_fetch->fetch_idx = fetch_idx;
-
- if ( copy_to_guest(u_xen_mc, op, 1) )
- ret = -EFAULT;
-
- spin_unlock(&mc_lock);
break;
case XEN_MC_notifydomain:
- /* This hypercall is for Dom0 only */
- if ( !IS_PRIV(v->domain) )
- return -EPERM;
-
- spin_lock(&mc_notify_lock);
-
- mc_notifydomain = &op->u.mc_notifydomain;
- domU = get_domain_by_id(mc_notifydomain->mc_domid);
- vcpuid = mc_notifydomain->mc_vcpuid;
-
- if ((domU == NULL) || (domU == dom0)) {
- /* It's not possible to notify a non-existent domain
- * or the dom0. */
- spin_unlock(&mc_notify_lock);
- return -EACCES;
- }
+ return x86_mcerr("do_mca notify unsupported", -EINVAL);
- if (vcpuid >= MAX_VIRT_CPUS) {
- /* It's not possible to notify a vcpu, Xen can't
- * assign to a domain. */
- spin_unlock(&mc_notify_lock);
- return -EACCES;
+ case XEN_MC_physcpuinfo:
+ if ( !IS_PRIV(v->domain) )
+ return x86_mcerr("do_mca cpuinfo", -EPERM);
+
+ mc_physcpuinfo = &op->u.mc_physcpuinfo;
+ nlcpu = num_online_cpus();
+
+ if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+ if (mc_physcpuinfo->ncpus <= 0)
+ return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
+ -EINVAL);
+ nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
+ log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
+ if (log_cpus == NULL)
+ return x86_mcerr("do_mca cpuinfo", -ENOMEM);
+
+ if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
+ 1, 1) != 0) {
+ xfree(log_cpus);
+ return x86_mcerr("do_mca cpuinfo", -EIO);
+ }
}
- mc_notifydomain->flags = XEN_MC_OK;
-
- mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
- if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
- /* The error telemetry is not for the guest, Dom0
- * wants to notify. */
- mc_notifydomain->flags |= XEN_MC_NOMATCH;
- } else if ( guest_has_trap_callback(domU, vcpuid,
- TRAP_machine_check) )
- {
- /* Send notification */
- if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
- mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
- } else
- mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
-
-#ifdef DEBUG
- /* sanity check - these two flags are mutually exclusive */
- if ((flags & XEN_MC_CANNOTHANDLE) && (flags & XEN_MC_NOTDELIVERED))
- BUG();
-#endif
+ mc_physcpuinfo->ncpus = nlcpu;
- if ( copy_to_guest(u_xen_mc, op, 1) )
- ret = -EFAULT;
-
- if (ret == 0) {
- x86_mcinfo_marknotified(mc_notifydomain);
+ if (copy_to_guest(u_xen_mc, op, 1)) {
+ if (log_cpus != NULL)
+ xfree(log_cpus);
+ return x86_mcerr("do_mca cpuinfo", -EFAULT);
}
- spin_unlock(&mc_notify_lock);
+ if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+ if (copy_to_guest(mc_physcpuinfo->info,
+ log_cpus, nlcpu))
+ ret = -EFAULT;
+ xfree(log_cpus);
+ }
break;
- case XEN_MC_physcpuinfo:
- if ( !IS_PRIV(v->domain) )
- return -EPERM;
-
- mc_physcpuinfo = &op->u.mc_physcpuinfo;
- nlcpu = num_online_cpus();
-
- if (!guest_handle_is_null(mc_physcpuinfo->info)) {
- if (mc_physcpuinfo->ncpus <= 0)
- return -EINVAL;
- nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
- log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
- if (log_cpus == NULL)
- return -ENOMEM;
-
- if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
- 1, 1) != 0) {
- xfree(log_cpus);
- return -EIO;
- }
- }
-
- mc_physcpuinfo->ncpus = nlcpu;
-
- if (copy_to_guest(u_xen_mc, op, 1)) {
- if (log_cpus != NULL)
- xfree(log_cpus);
- return -EFAULT;
- }
-
- if (!guest_handle_is_null(mc_physcpuinfo->info)) {
- if (copy_to_guest(mc_physcpuinfo->info,
- log_cpus, nlcpu))
- ret = -EFAULT;
- xfree(log_cpus);
- }
+ default:
+ return x86_mcerr("do_mca: bad command", -EINVAL);
}
return ret;
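
The reworked XEN_MC_fetch above is dom0-only and cookie-based: dom0 fetches the
oldest record of a class (urgent or non-urgent) and later acknowledges it by
fetch_id so the hypervisor can recycle the slot. A rough sketch of the consumer
loop this implies follows; do_mca_hypercall() and fetch_buf are stand-ins (how
dom0 actually issues the MCA hypercall is not part of this patch), while the
commands, flags and fields are those defined here.

static struct mc_info fetch_buf;	/* illustrative buffer for one record */

static void drain_nonurgent_telemetry(void)
{
	struct xen_mc mc;

	for (;;) {
		memset(&mc, 0, sizeof(mc));
		mc.cmd = XEN_MC_fetch;
		mc.interface_version = XEN_MCA_INTERFACE_VERSION;
		mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
		set_xen_guest_handle(mc.u.mc_fetch.data, &fetch_buf);

		if (do_mca_hypercall(&mc) != 0 ||
		    (mc.u.mc_fetch.flags & (XEN_MC_NODATA | XEN_MC_FETCHFAILED)))
			break;		/* queue drained, or fetch failed */

		/* ... decode fetch_buf (a struct mc_info) here ... */

		/* Acknowledge the record so the hypervisor can reuse its slot;
		 * fetch_id was filled in by the successful fetch above. */
		mc.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
		do_mca_hypercall(&mc);
	}
}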
diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h
index f2e29897bd..f360268821 100644
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -1,38 +1,98 @@
+#ifndef _MCE_H
+
+#define _MCE_H
+
#include <xen/init.h>
+#include <xen/smp.h>
#include <asm/types.h>
#include <asm/traps.h>
#include <asm/atomic.h>
#include <asm/percpu.h>
+#include "x86_mca.h"
+#include "mctelem.h"
/* Init functions */
-void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c);
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+int winchip_mcheck_init(struct cpuinfo_x86 *c);
+int intel_mcheck_init(struct cpuinfo_x86 *c);
void intel_mcheck_timer(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void intel_mcheck_init(struct cpuinfo_x86 *c);
void mce_intel_feature_init(struct cpuinfo_x86 *c);
-
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-
-/* Function pointer used in the handlers to collect additional information
- * provided by newer CPU families/models without the need to duplicate
- * the whole handler resulting in various handlers each with its own
- * tweaks and bugs */
-extern int (*mc_callback_bank_extended)(struct mc_info *mi,
- uint16_t bank, uint64_t status);
-
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
int mce_available(struct cpuinfo_x86 *c);
+int mce_firstbank(struct cpuinfo_x86 *c);
/* Helper functions used for collecting error telemetry */
struct mc_info *x86_mcinfo_getptr(void);
-void x86_mcinfo_clear(struct mc_info *mi);
-int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
-void x86_mcinfo_dump(struct mc_info *mi);
void mc_panic(char *s);
void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *,
uint32_t *, uint32_t *, uint32_t *, uint32_t *);
+
+
+/* Register a handler for machine check exceptions. */
+typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long);
+extern void x86_mce_vector_register(x86_mce_vector_t);
+
+/* Common generic MCE handler that implementations may nominate
+ * via x86_mce_vector_register. */
+extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
+
+/* Utility function to "logout" all architectural MCA telemetry from the MCA
+ * banks of the current processor. A cookie is returned which may be
+ * uses to reference the data so logged (the cookie can be NULL if
+ * no logout structures were available). The caller can also pass a pointer
+ * to a structure which will be completed with some summary information
+ * of the MCA data observed in the logout operation. */
+
+enum mca_source {
+ MCA_MCE_HANDLER,
+ MCA_POLLER,
+ MCA_CMCI_HANDLER,
+ MCA_RESET
+};
+
+enum mca_extinfo {
+ MCA_EXTINFO_LOCAL,
+ MCA_EXTINFO_GLOBAL,
+ MCA_EXTINFO_IGNORED
+};
+
+struct mca_summary {
+ uint32_t errcnt; /* number of banks with valid errors */
+ int ripv; /* meaningful on #MC */
+ int eipv; /* meaningful on #MC */
+ uint32_t uc; /* bitmask of banks with UC */
+ uint32_t pcc; /* bitmask of banks with PCC */
+};
+
+extern cpu_banks_t mca_allbanks;
+
+extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
+ struct mca_summary *);
+
+/* Register a callback to be made during bank telemetry logout.
+ * This callback is only available to those machine check handlers
+ * that call to the common mcheck_cmn_handler or who use the common
+ * telemetry logout function mcheck_mca_logout in error polling.
+ *
+ * This can be used to collect additional information (typically non-
+ * architectural) provided by newer CPU families/models without the need
+ * to duplicate the whole handler resulting in various handlers each with
+ * its own tweaks and bugs. The callback receives an struct mc_info pointer
+ * which it can use with x86_mcinfo_add to add additional telemetry,
+ * the current MCA bank number we are reading telemetry from, and the
+ * MCi_STATUS value for that bank.
+ */
+typedef enum mca_extinfo (*x86_mce_callback_t)
+ (struct mc_info *, uint16_t, uint64_t);
+extern void x86_mce_callback_register(x86_mce_callback_t);
+
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
+void x86_mcinfo_dump(struct mc_info *mi);
+
+#endif /* _MCE_H */
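
The declarations above define how family-specific code plugs into the common
machinery: register a #MC vector (typically delegating to mcheck_cmn_handler)
and optionally an extended-info callback invoked during bank logout. A minimal
sketch, modelled on the amd_f10.c changes earlier in this patch, follows; the
example_* names are illustrative only.

static void example_machine_check(struct cpu_user_regs *regs, long error_code)
{
	/* Delegate to the shared #MC handler over all banks. */
	mcheck_cmn_handler(regs, error_code, mca_allbanks);
}

static enum mca_extinfo
example_extinfo(struct mc_info *mi, uint16_t bank, uint64_t status)
{
	/* Add model-specific telemetry with x86_mcinfo_add(mi, ...) here and
	 * return whether it was bank-local or global. */
	return MCA_EXTINFO_IGNORED;
}

int example_mcheck_init(struct cpuinfo_x86 *c)
{
	if (!cpu_has(c, X86_FEATURE_MCA))
		return 0;	/* caller reports "no machine check initialization" */

	x86_mce_vector_register(example_machine_check);
	x86_mce_callback_register(example_extinfo);

	/* ... per-bank MSR setup, set_in_cr4(X86_CR4_MCE), etc. ... */
	return 1;
}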
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
index dabac9ac5c..e1c41cd01d 100644
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -14,6 +14,7 @@ DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
static int nr_intel_ext_msrs = 0;
static int cmci_support = 0;
+static int firstbank;
#ifdef CONFIG_X86_MCE_THERMAL
static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
@@ -115,222 +116,51 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_MCE_THERMAL */
-static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext)
+static enum mca_extinfo
+intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
{
- if (nr_intel_ext_msrs == 0)
- return;
-
- /* this function will called when CAP(9).MCG_EXT_P = 1 */
- memset(mc_ext, 0, sizeof(struct mcinfo_extended));
- mc_ext->common.type = MC_TYPE_EXTENDED;
- mc_ext->common.size = sizeof(mc_ext);
- mc_ext->mc_msrs = 10;
-
- mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX;
- rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value);
- mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX;
- rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value);
- mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX;
- rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value);
-
- mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX;
- rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value);
- mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI;
- rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value);
- mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI;
- rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value);
-
- mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP;
- rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value);
- mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP;
- rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value);
- mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
- rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value);
- mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP;
- rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value);
-}
-
-/* machine_check_poll might be called by following types:
- * 1. called when do mcheck_init.
- * 2. called in cmci interrupt handler
- * 3. called in polling handler
- * It will generate a new mc_info item if found CE/UC errors. DOM0 is the
- * consumer.
- */
-static struct mc_info *machine_check_poll(int calltype)
-{
- struct mc_info *mi = NULL;
- int exceptions = (read_cr4() & X86_CR4_MCE);
- int i, nr_unit = 0, uc = 0, pcc = 0;
- uint64_t status, addr;
- struct mcinfo_global mcg;
- struct mcinfo_extended mce;
- unsigned int cpu;
- struct domain *d;
-
- cpu = smp_processor_id();
-
- memset(&mcg, 0, sizeof(mcg));
- mcg.common.type = MC_TYPE_GLOBAL;
- mcg.common.size = sizeof(mcg);
- /* If called from cpu-reset check, don't need to fill them.
- * If called from cmci context, we'll try to fill domid by memory addr
- */
- mcg.mc_domid = -1;
- mcg.mc_vcpuid = -1;
- if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET)
- mcg.mc_flags = MC_FLAG_POLLED;
- else if (calltype == MC_FLAG_CMCI)
- mcg.mc_flags = MC_FLAG_CMCI;
- x86_mc_get_cpu_info(
- cpu, &mcg.mc_socketid, &mcg.mc_coreid,
- &mcg.mc_core_threadid, &mcg.mc_apicid, NULL, NULL, NULL);
- rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus);
-
- for ( i = 0; i < nr_mce_banks; i++ ) {
- struct mcinfo_bank mcb;
- /* For CMCI, only owners checks the owned MSRs */
- if ( !test_bit(i, __get_cpu_var(mce_banks_owned)) &&
- (calltype & MC_FLAG_CMCI) )
- continue;
- rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+ struct mcinfo_extended mc_ext;
- if (! (status & MCi_STATUS_VAL) )
- continue;
- /*
- * Uncorrected events are handled by the exception
- * handler when it is enabled. But when the exception
- * is disabled such as when mcheck_init, log everything.
- */
- if ((status & MCi_STATUS_UC) && exceptions)
- continue;
+ if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
+ return MCA_EXTINFO_IGNORED;
- if (status & MCi_STATUS_UC)
- uc = 1;
- if (status & MCi_STATUS_PCC)
- pcc = 1;
-
- if (!mi) {
- mi = x86_mcinfo_getptr();
- if (!mi) {
- printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n");
- return NULL;
- }
- x86_mcinfo_clear(mi);
- }
- memset(&mcb, 0, sizeof(mcb));
- mcb.common.type = MC_TYPE_BANK;
- mcb.common.size = sizeof(mcb);
- mcb.mc_bank = i;
- mcb.mc_status = status;
- if (status & MCi_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc);
- if (status & MCi_STATUS_ADDRV) {
- rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
- d = maddr_get_owner(addr);
- if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) )
- mcb.mc_domid = d->domain_id;
- }
- if (cmci_support)
- rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
- if (calltype == MC_FLAG_CMCI)
- rdtscll(mcb.mc_tsc);
- x86_mcinfo_add(mi, &mcb);
- nr_unit++;
- add_taint(TAINT_MACHINE_CHECK);
- /* Clear state for this bank */
- wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0);
- printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%"PRIx64"]\n",
- i, cpu, status);
- printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], "
- "thread[%d]\n", cpu, mcg.mc_socketid,
- mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid);
-
- }
- /* if pcc = 1, uc must be 1 */
- if (pcc)
- mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
- else if (uc)
- mcg.mc_flags |= MC_FLAG_RECOVERABLE;
- else /* correctable */
- mcg.mc_flags |= MC_FLAG_CORRECTABLE;
-
- if (nr_unit && nr_intel_ext_msrs &&
- (mcg.mc_gstatus & MCG_STATUS_EIPV)) {
- intel_get_extended_msrs(&mce);
- x86_mcinfo_add(mi, &mce);
- }
- if (nr_unit)
- x86_mcinfo_add(mi, &mcg);
- /* Clear global state */
- return mi;
+	/* this function is called only when CAP(9).MCG_EXT_P = 1 */
+ memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
+ mc_ext.common.type = MC_TYPE_EXTENDED;
+ mc_ext.common.size = sizeof(mc_ext);
+ mc_ext.mc_msrs = 10;
+
+ mc_ext.mc_msr[0].reg = MSR_IA32_MCG_EAX;
+ rdmsrl(MSR_IA32_MCG_EAX, mc_ext.mc_msr[0].value);
+ mc_ext.mc_msr[1].reg = MSR_IA32_MCG_EBX;
+ rdmsrl(MSR_IA32_MCG_EBX, mc_ext.mc_msr[1].value);
+ mc_ext.mc_msr[2].reg = MSR_IA32_MCG_ECX;
+ rdmsrl(MSR_IA32_MCG_ECX, mc_ext.mc_msr[2].value);
+
+ mc_ext.mc_msr[3].reg = MSR_IA32_MCG_EDX;
+ rdmsrl(MSR_IA32_MCG_EDX, mc_ext.mc_msr[3].value);
+ mc_ext.mc_msr[4].reg = MSR_IA32_MCG_ESI;
+ rdmsrl(MSR_IA32_MCG_ESI, mc_ext.mc_msr[4].value);
+ mc_ext.mc_msr[5].reg = MSR_IA32_MCG_EDI;
+ rdmsrl(MSR_IA32_MCG_EDI, mc_ext.mc_msr[5].value);
+
+ mc_ext.mc_msr[6].reg = MSR_IA32_MCG_EBP;
+ rdmsrl(MSR_IA32_MCG_EBP, mc_ext.mc_msr[6].value);
+ mc_ext.mc_msr[7].reg = MSR_IA32_MCG_ESP;
+ rdmsrl(MSR_IA32_MCG_ESP, mc_ext.mc_msr[7].value);
+ mc_ext.mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
+ rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext.mc_msr[8].value);
+ mc_ext.mc_msr[9].reg = MSR_IA32_MCG_EIP;
+ rdmsrl(MSR_IA32_MCG_EIP, mc_ext.mc_msr[9].value);
+
+ x86_mcinfo_add(mci, &mc_ext);
+
+ return MCA_EXTINFO_GLOBAL;
}
-static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
+static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
{
- /* MACHINE CHECK Error handler will be sent in another patch,
- * simply copy old solutions here. This code will be replaced
- * by upcoming machine check patches
- */
-
- int recover=1;
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int i;
-
- rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover=0;
-
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- for (i=0; i<nr_mce_banks; i++) {
- rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
- if (high & (1<<31)) {
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
- high &= ~(1<<31);
- if (high & (1<<27)) {
- rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- printk ("[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- printk (" at %08x%08x", ahigh, alow);
- }
- printk ("\n");
- }
- }
-
- if (recover & 2)
- mc_panic ("CPU context corrupt");
- if (recover & 1)
- mc_panic ("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
- * recoverable/continuable.This will allow BIOS to look at the MSRs
- * for errors if the OS could not log the error.
- */
- for (i=0; i<nr_mce_banks; i++) {
- u32 msr;
- msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr (msr, low, high);
- if (high&(1<<31)) {
- /* Clear it */
- wrmsr(msr, 0UL, 0UL);
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
- mcgstl &= ~(1<<2);
- wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+ mcheck_cmn_handler(regs, error_code, mca_allbanks);
}
static DEFINE_SPINLOCK(cmci_discover_lock);
@@ -369,6 +199,8 @@ static void cmci_discover(void)
unsigned long flags;
int i;
struct mc_info *mi = NULL;
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
@@ -385,12 +217,20 @@ static void cmci_discover(void)
* MCi_status (error_count bit 38~52) is not cleared,
* the CMCI interrupt will never be triggered again.
*/
- mi = machine_check_poll(MC_FLAG_CMCI);
- if (mi) {
- x86_mcinfo_dump(mi);
- if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+
+ mctc = mcheck_mca_logout(
+ MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+ if (bs.errcnt && mctc != NULL) {
+ if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ mctelem_commit(mctc);
send_guest_global_virq(dom0, VIRQ_MCA);
- }
+ } else {
+			x86_mcinfo_dump(mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ }
+ } else if (mctc != NULL)
+ mctelem_dismiss(mctc);
printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
smp_processor_id(),
@@ -487,17 +327,26 @@ static void intel_init_cmci(struct cpuinfo_x86 *c)
fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
{
struct mc_info *mi = NULL;
- int cpu = smp_processor_id();
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
ack_APIC_irq();
irq_enter();
- printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu);
- mi = machine_check_poll(MC_FLAG_CMCI);
- if (mi) {
- x86_mcinfo_dump(mi);
- if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+
+ mctc = mcheck_mca_logout(
+ MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+ if (bs.errcnt && mctc != NULL) {
+ if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ mctelem_commit(mctc);
send_guest_global_virq(dom0, VIRQ_MCA);
- }
+ } else {
+			x86_mcinfo_dump(mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ }
+ } else if (mctc != NULL)
+ mctelem_dismiss(mctc);
+
irq_exit();
}
@@ -527,28 +376,28 @@ static void mce_cap_init(struct cpuinfo_x86 *c)
printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
smp_processor_id(), nr_intel_ext_msrs);
}
- /* for most of p6 family, bank 0 is an alias bios MSR.
- * But after model>1a, bank 0 is available*/
- if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL
- && c->x86_model < 0x1A)
- firstbank = 1;
- else
- firstbank = 0;
+ firstbank = mce_firstbank(c);
}
static void mce_init(void)
{
u32 l, h;
int i;
- struct mc_info *mi;
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
+
clear_in_cr4(X86_CR4_MCE);
+
/* log the machine checks left over from the previous reset.
* This also clears all registers*/
- mi = machine_check_poll(MC_FLAG_RESET);
+ mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs);
+
/* in the boot up stage, don't inject to DOM0, but print out */
- if (mi)
- x86_mcinfo_dump(mi);
+ if (bs.errcnt && mctc != NULL) {
+ x86_mcinfo_dump(mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ }
set_in_cr4(X86_CR4_MCE);
rdmsr (MSR_IA32_MCG_CAP, l, h);
@@ -573,71 +422,19 @@ static void mce_init(void)
}
/* p4/p6 family have similar MCA initialization process */
-void intel_mcheck_init(struct cpuinfo_x86 *c)
+int intel_mcheck_init(struct cpuinfo_x86 *c)
{
mce_cap_init(c);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
+
/* machine check is available */
- machine_check_vector = intel_machine_check;
+ x86_mce_vector_register(intel_machine_check);
+ x86_mce_callback_register(intel_get_extended_msrs);
+
mce_init();
mce_intel_feature_init(c);
mce_set_owner();
-}
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll faster. When the poller finds no more
- * errors, poll slower
-*/
-static struct timer mce_timer;
-
-#define MCE_PERIOD 4000
-#define MCE_MIN 2000
-#define MCE_MAX 32000
-
-static u64 period = MCE_PERIOD;
-static int adjust = 0;
-
-static void mce_intel_checkregs(void *info)
-{
- struct mc_info *mi;
-
- if( !mce_available(&current_cpu_data))
- return;
- mi = machine_check_poll(MC_FLAG_POLLED);
- if (mi)
- {
- x86_mcinfo_dump(mi);
- adjust++;
- if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
- send_guest_global_virq(dom0, VIRQ_MCA);
- }
-}
-static void mce_intel_work_fn(void *data)
-{
- on_each_cpu(mce_intel_checkregs, data, 1, 1);
- if (adjust) {
- period = period / (adjust + 1);
- printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval "
- "to %"PRIu64"\n", period);
- }
- else {
- period *= 2;
- }
- if (period > MCE_MAX)
- period = MCE_MAX;
- if (period < MCE_MIN)
- period = MCE_MIN;
- set_timer(&mce_timer, NOW() + MILLISECS(period));
- adjust = 0;
-}
-
-void intel_mcheck_timer(struct cpuinfo_x86 *c)
-{
- printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n");
- init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
- set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD));
+ return 1;
}
-
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c
new file mode 100644
index 0000000000..4111ddcbb7
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+/*
+ * mctelem.c - x86 Machine Check Telemetry Transport
+ */
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+#include <xen/errno.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+
+struct mctelem_ent {
+ struct mctelem_ent *mcte_next; /* next in chronological order */
+ struct mctelem_ent *mcte_prev; /* previous in chronological order */
+ uint32_t mcte_flags; /* See MCTE_F_* below */
+ uint32_t mcte_refcnt; /* Reference count */
+ void *mcte_data; /* corresponding data payload */
+};
+
+#define MCTE_F_HOME_URGENT 0x0001U /* free to urgent freelist */
+#define MCTE_F_HOME_NONURGENT 0x0002U /* free to nonurgent freelist */
+#define MCTE_F_CLASS_URGENT 0x0004U /* in use - urgent errors */
+#define MCTE_F_CLASS_NONURGENT 0x0008U /* in use - nonurgent errors */
+#define MCTE_F_STATE_FREE 0x0010U /* on a freelist */
+#define MCTE_F_STATE_UNCOMMITTED 0x0020U /* reserved; on no list */
+#define MCTE_F_STATE_COMMITTED 0x0040U /* on a committed list */
+#define MCTE_F_STATE_PROCESSING 0x0080U /* on a processing list */
+
+#define MCTE_F_MASK_HOME (MCTE_F_HOME_URGENT | MCTE_F_HOME_NONURGENT)
+#define MCTE_F_MASK_CLASS (MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
+#define MCTE_F_MASK_STATE (MCTE_F_STATE_FREE | \
+ MCTE_F_STATE_UNCOMMITTED | \
+ MCTE_F_STATE_COMMITTED | \
+ MCTE_F_STATE_PROCESSING)
+
+#define MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME)
+
+#define MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
+#define MCTE_SET_CLASS(tep, new) do { \
+ (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
+ (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)
+
+#define MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
+#define MCTE_TRANSITION_STATE(tep, old, new) do { \
+ BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
+ (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
+ (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
+
+#define MC_URGENT_NENT 10
+#define MC_NONURGENT_NENT 20
+
+#define MC_NCLASSES (MC_NONURGENT + 1)
+
+#define COOKIE2MCTE(c) ((struct mctelem_ent *)(c))
+#define MCTE2COOKIE(tep) ((mctelem_cookie_t)(tep))
+
+static struct mc_telem_ctl {
+ /* Linked lists that thread the array members together.
+ *
+ * The free lists are singly-linked via mcte_next, and we allocate
+ * from them by atomically unlinking an element from the head.
+ * Consumed entries are returned to the head of the free list.
+ * When an entry is reserved off the free list it is not linked
+ * on any list until it is committed or dismissed.
+ *
+ * The committed list grows at the head and we do not maintain a
+ * tail pointer; insertions are performed atomically. The head
+ * thus has the most-recently committed telemetry, i.e. the
+ * list is in reverse chronological order. The committed list
+ * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
+ * When we move telemetry from the committed list to the processing
+ * list we atomically unlink the committed list and keep a pointer
+ * to the head of that list; we then traverse the list following
+ * mcte_prev and fill in mcte_next to doubly-link the list, and then
+ * append the tail of the list onto the processing list. If we panic
+ * during this manipulation of the committed list we still have
+ * the pointer to its head so we can recover all entries during
+ * the panic flow (albeit in reverse chronological order).
+ *
+ * The processing list is updated in a controlled context, and
+ * we can lock it for updates. The head of the processing list
+ * always has the oldest telemetry, and we append (as above)
+ * at the tail of the processing list. */
+ struct mctelem_ent *mctc_free[MC_NCLASSES];
+ struct mctelem_ent *mctc_committed[MC_NCLASSES];
+ struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
+ struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
+ /*
+ * Telemetry array
+ */
+ struct mctelem_ent *mctc_elems;
+} mctctl;
+
+/* Lock protecting all processing lists */
+static DEFINE_SPINLOCK(processing_lock);
+
+static void *cmpxchgptr(void *ptr, void *old, void *new)
+{
+ unsigned long *ulp = (unsigned long *)ptr;
+ unsigned long a = (unsigned long)old;
+ unsigned long b = (unsigned long)new;
+
+ return (void *)cmpxchg(ulp, a, b);
+}
+
+/* Free an entry to its native free list; the entry must not be linked on
+ * any list.
+ */
+static void mctelem_free(struct mctelem_ent *tep)
+{
+ mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+ struct mctelem_ent **freelp;
+ struct mctelem_ent *oldhead;
+
+ BUG_ON(tep->mcte_refcnt != 0);
+ BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
+
+ tep->mcte_prev = NULL;
+ freelp = &mctctl.mctc_free[target];
+ for (;;) {
+ oldhead = *freelp;
+ tep->mcte_next = oldhead;
+ wmb();
+ if (cmpxchgptr(freelp, oldhead, tep) == oldhead)
+ break;
+ }
+}
+
+/* Increment the reference count of an entry that is not linked on to
+ * any list and which only the caller has a pointer to.
+ */
+static void mctelem_hold(struct mctelem_ent *tep)
+{
+ tep->mcte_refcnt++;
+}
+
+/* Increment the reference count on an entry that is linked at the head of
+ * a processing list. The caller is responsible for locking the list.
+ */
+static void mctelem_processing_hold(struct mctelem_ent *tep)
+{
+ int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+
+ BUG_ON(tep != mctctl.mctc_processing_head[which]);
+ tep->mcte_refcnt++;
+}
+
+/* Decrement the reference count on an entry that is linked at the head of
+ * a processing list. The caller is responsible for locking the list.
+ */
+static void mctelem_processing_release(struct mctelem_ent *tep)
+{
+ int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+
+ BUG_ON(tep != mctctl.mctc_processing_head[which]);
+ if (--tep->mcte_refcnt == 0) {
+ MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
+ mctctl.mctc_processing_head[which] = tep->mcte_next;
+ mctelem_free(tep);
+ }
+}
+
+void mctelem_init(int reqdatasz)
+{
+ static int called = 0;
+ static int datasz = 0, realdatasz = 0;
+ char *datarr;
+ int i;
+
+ BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);
+
+ /* Called from mcheck_init for all processors; initialize for the
+ * first call only (no race here since the boot cpu completes
+ * init before others start up). */
+ if (++called == 1) {
+ realdatasz = reqdatasz;
+ datasz = (reqdatasz & ~0xf) + 0x10; /* 16 byte roundup */
+ } else {
+ BUG_ON(reqdatasz != realdatasz);
+ return;
+ }
+
+ if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
+ MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL ||
+ (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT) *
+ datasz)) == NULL) {
+ if (mctctl.mctc_elems)
+ xfree(mctctl.mctc_elems);
+ printk("Allocations for MCA telemetry failed\n");
+ return;
+ }
+
+ for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
+ struct mctelem_ent *tep, **tepp;
+
+ tep = mctctl.mctc_elems + i;
+ tep->mcte_flags = MCTE_F_STATE_FREE;
+ tep->mcte_refcnt = 0;
+ tep->mcte_data = datarr + i * datasz;
+
+ if (i < MC_URGENT_NENT) {
+ tepp = &mctctl.mctc_free[MC_URGENT];
+ tep->mcte_flags |= MCTE_F_HOME_URGENT;
+ } else {
+ tepp = &mctctl.mctc_free[MC_NONURGENT];
+ tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
+ }
+
+ tep->mcte_next = *tepp;
+ tep->mcte_prev = NULL;
+ *tepp = tep;
+ }
+}
+
+/* incremented non-atomically when reserve fails */
+static int mctelem_drop_count;
+
+/* Reserve a telemetry entry, or return NULL if none available.
+ * If we return an entry then the caller must subsequently call exactly one of
+ * mctelem_unreserve or mctelem_commit for that entry.
+ */
+mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
+{
+ struct mctelem_ent **freelp;
+ struct mctelem_ent *oldhead, *newhead;
+ mctelem_class_t target = (which == MC_URGENT) ?
+ MC_URGENT : MC_NONURGENT;
+
+ freelp = &mctctl.mctc_free[target];
+ for (;;) {
+ if ((oldhead = *freelp) == NULL) {
+ if (which == MC_URGENT && target == MC_URGENT) {
+ /* raid the non-urgent freelist */
+ target = MC_NONURGENT;
+ freelp = &mctctl.mctc_free[target];
+ continue;
+ } else {
+ mctelem_drop_count++;
+ return (NULL);
+ }
+ }
+
+ newhead = oldhead->mcte_next;
+ if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
+ struct mctelem_ent *tep = oldhead;
+
+ mctelem_hold(tep);
+ MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
+ tep->mcte_next = NULL;
+ tep->mcte_prev = NULL;
+ if (which == MC_URGENT)
+ MCTE_SET_CLASS(tep, URGENT);
+ else
+ MCTE_SET_CLASS(tep, NONURGENT);
+ return MCTE2COOKIE(tep);
+ }
+ }
+}
+
+void *mctelem_dataptr(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ return tep->mcte_data;
+}
+
+/* Release a previously reserved entry back to the freelist without
+ * submitting it for logging. The entry must not be linked on to any
+ * list - that's how mctelem_reserve handed it out.
+ */
+void mctelem_dismiss(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ tep->mcte_refcnt--;
+ MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
+ mctelem_free(tep);
+}
+
+/* Commit an entry with completed telemetry for logging. The caller must
+ * not reference the entry after this call. Note that we add entries
+ * at the head of the committed list, so that list therefore has entries
+ * in reverse chronological order.
+ */
+void mctelem_commit(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+ struct mctelem_ent **commlp;
+ struct mctelem_ent *oldhead;
+ mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+
+ BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
+ MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);
+
+ commlp = &mctctl.mctc_committed[target];
+ for (;;) {
+ oldhead = *commlp;
+ tep->mcte_prev = oldhead;
+ wmb();
+ if (cmpxchgptr(commlp, oldhead, tep) == oldhead)
+ break;
+ }
+}
+
+/* Move telemetry from committed list to processing list, reversing the
+ * list into chronological order. The processing list has been
+ * locked by the caller, and may be non-empty. We append the
+ * reversed committed list on to the tail of the processing list.
+ * The committed list may grow even while we run, so use atomic
+ * operations to swap NULL to the committed list head.
+ *
+ * Note that "chronological order" means the order in which producers
+ * won additions to the processing list, which may not reflect the
+ * strict chronological order of the associated events if events are
+ * closely spaced in time and contend for the processing list at once.
+ */
+
+static struct mctelem_ent *dangling[MC_NCLASSES];
+
+static void mctelem_append_processing(mctelem_class_t which)
+{
+ mctelem_class_t target = which == MC_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+ struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
+ struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
+ struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
+ struct mctelem_ent *tep, *ltep;
+
+ /* Check for an empty list; no race since we hold the processing lock */
+ if (*commlp == NULL)
+ return;
+
+ /* Atomically unlink the committed list, and keep a pointer to
+ * the list we unlink in a well-known location so it can be
+ * picked up in panic code should we panic between this unlink
+ * and the append to the processing list. */
+ for (;;) {
+ dangling[target] = *commlp;
+ wmb();
+ if (cmpxchgptr(commlp, dangling[target], NULL) ==
+ dangling[target])
+ break;
+ }
+
+ if (dangling[target] == NULL)
+ return;
+
+ /* Traverse the list following the previous pointers (reverse
+ * chronological order). For each entry fill in the next pointer
+ * and transition the element state. */
+ for (tep = dangling[target], ltep = NULL; tep != NULL;
+ tep = tep->mcte_prev) {
+ MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
+ tep->mcte_next = ltep;
+ ltep = tep;
+ }
+
+ /* ltep points to the head of a chronologically ordered linked
+ * list of telemetry entries ending at the most recent entry
+ * dangling[target] if mcte_next is followed; tack this on to
+ * the processing list.
+ */
+ if (*proclhp == NULL) {
+ *proclhp = ltep;
+ *procltp = dangling[target];
+ } else {
+ (*procltp)->mcte_next = ltep;
+ ltep->mcte_prev = *procltp;
+ *procltp = dangling[target];
+ }
+ wmb();
+ dangling[target] = NULL;
+ wmb();
+}
+
+mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
+{
+ mctelem_class_t target = (which == MC_URGENT) ?
+ MC_URGENT : MC_NONURGENT;
+ struct mctelem_ent *tep;
+
+ spin_lock(&processing_lock);
+ mctelem_append_processing(target);
+ if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
+ spin_unlock(&processing_lock);
+ return NULL;
+ }
+
+ mctelem_processing_hold(tep);
+ wmb();
+ spin_unlock(&processing_lock);
+ return MCTE2COOKIE(tep);
+}
+
+void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ spin_lock(&processing_lock);
+ mctelem_processing_release(tep);
+ wmb();
+ spin_unlock(&processing_lock);
+}
+
+void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
+{
+ mctelem_class_t target = (which == MC_URGENT) ?
+ MC_URGENT : MC_NONURGENT;
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ if (tep == NULL)
+ return;
+
+ spin_lock(&processing_lock);
+ if (tep == mctctl.mctc_processing_head[target])
+ mctelem_processing_release(tep);
+ wmb();
+ spin_unlock(&processing_lock);
+}
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.h b/xen/arch/x86/cpu/mcheck/mctelem.h
new file mode 100644
index 0000000000..e3270f606c
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mctelem.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#ifndef _MCTELEM_H
+
+#define _MCTELEM_H
+
+#include <xen/init.h>
+#include <xen/smp.h>
+#include <asm/traps.h>
+
+/* Helper functions used for collecting error telemetry.
+ *
+ * mctelem_init preallocates a number of data areas for use during
+ * machine check data "logout". Two classes are distinguished -
+ * urgent uses, intended for use from machine check exception handlers,
+ * and non-urgent uses intended for use from error pollers.
+ * Associated with each logout entry of whatever class is a data area
+ * sized per the single argument to mctelem_init.  mctelem_init should be
+ * called from MCA init code before anybody has the chance to change the
+ * machine check vector or to use mcheck_mca_logout.
+ *
+ * To reserve an entry of a given class for use in logout, call
+ * mctelem_reserve (or use the common handler functions which do all this
+ * for you). This returns an opaque cookie, or NULL if no elements are
+ * available. Elements are reserved with an atomic operation so no deadlock
+ * will occur if, for example, a machine check exception interrupts a
+ * scheduled error poll. The implementation will raid free non-urgent
+ * entries if all urgent entries are in use when an urgent request is received.
+ * Once an entry is reserved the caller must eventually perform exactly
+ * one of two actions: mctelem_commit or mctelem_dismiss.
+ *
+ * On mctelem_commit the entry is queued for later processing; mctelem_dismiss
+ * frees the element without processing. After either call the cookie
+ * must not be referenced again.
+ *
+ * To consume committed telemetry call mctelem_consume_oldest_begin
+ * which will return a cookie referencing the oldest (first committed)
+ * entry of the requested class. Access the associated data using
+ * mctelem_dataptr and when finished use mctelem_consume_oldest_end - in the
+ * begin .. end bracket you are guaranteed that the entry cannot be freed
+ * even if it is ack'd elsewhere.  Once the ultimate consumer of the
+ * telemetry has processed it to stable storage it should acknowledge
+ * the telemetry quoting the cookie id, at which point we will free
+ * the element from the processing list.
+ */
+
+typedef struct mctelem_cookie *mctelem_cookie_t;
+
+typedef enum mctelem_class {
+ MC_URGENT,
+ MC_NONURGENT
+} mctelem_class_t;
+
+extern void mctelem_init(int);
+extern mctelem_cookie_t mctelem_reserve(mctelem_class_t);
+extern void *mctelem_dataptr(mctelem_cookie_t);
+extern void mctelem_commit(mctelem_cookie_t);
+extern void mctelem_dismiss(mctelem_cookie_t);
+extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t);
+extern void mctelem_consume_oldest_end(mctelem_cookie_t);
+extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
+
+#endif
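
The reserve/commit and consume/ack protocol described above is most easily seen as a pair of short sketches (not part of the patch); found_errors is a hypothetical flag standing in for whatever test the caller applies, and error handling is elided.

/* Producer side, e.g. a poller running in a non-urgent context. */
mctelem_cookie_t mctc = mctelem_reserve(MC_NONURGENT);
if (mctc != NULL) {
	struct mc_info *mci = mctelem_dataptr(mctc);

	/* ... populate mci via x86_mcinfo_add() ... */

	if (found_errors)			/* hypothetical condition */
		mctelem_commit(mctc);		/* cookie must not be reused */
	else
		mctelem_dismiss(mctc);		/* return entry to its freelist */
}

/* Consumer side, e.g. the path that hands telemetry to dom0. */
mctelem_cookie_t cookie = mctelem_consume_oldest_begin(MC_NONURGENT);
if (cookie != NULL) {
	struct mc_info *data = mctelem_dataptr(cookie);

	/* ... copy *data out; the entry cannot be freed in this bracket ... */

	mctelem_consume_oldest_end(cookie);

	/* Later, once the consumer has the data in stable storage: */
	mctelem_ack(MC_NONURGENT, cookie);
}
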
diff --git a/xen/arch/x86/cpu/mcheck/non-fatal.c b/xen/arch/x86/cpu/mcheck/non-fatal.c
index 35982a461b..167b1cea2a 100644
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c
@@ -14,46 +14,76 @@
#include <xen/smp.h>
#include <xen/timer.h>
#include <xen/errno.h>
+#include <xen/event.h>
+#include <xen/sched.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
-#include "x86_mca.h"
-int firstbank = 0;
+
+static cpu_banks_t bankmask;
static struct timer mce_timer;
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(8000)
+#define MCE_PERIOD_MIN MILLISECS(2000)
+#define MCE_PERIOD_MAX MILLISECS(16000)
+
+static uint64_t period = MCE_PERIOD;
+static int adjust = 0;
+static int variable_period = 1;
static void mce_checkregs (void *info)
{
- u32 low, high;
- int i;
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
+ static uint64_t dumpcount = 0;
- for (i=firstbank; i<nr_mce_banks; i++) {
- rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
+ mctc = mcheck_mca_logout(MCA_POLLER, bankmask, &bs);
- if (high & (1<<31)) {
- printk(KERN_INFO "MCE: The hardware reports a non "
- "fatal, correctable incident occurred on "
- "CPU %d.\n",
- smp_processor_id());
- printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
+ if (bs.errcnt && mctc != NULL) {
+ adjust++;
- /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
- wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+ /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+ * Otherwise, if dom0 has had plenty of time to register
+	 * the virq handler but still hasn't, then dump telemetry
+ * to the Xen console. The call count may be incremented
+ * on multiple cpus at once and is indicative only - just
+ * a simple-minded attempt to avoid spamming the console
+ * for corrected errors in early startup.
+ */
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
+ if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ mctelem_commit(mctc);
+ send_guest_global_virq(dom0, VIRQ_MCA);
+ } else if (++dumpcount >= 10) {
+ x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ } else {
+ mctelem_dismiss(mctc);
}
+ } else if (mctc != NULL) {
+ mctelem_dismiss(mctc);
}
}
static void mce_work_fn(void *data)
{
on_each_cpu(mce_checkregs, NULL, 1, 1);
- set_timer(&mce_timer, NOW() + MCE_PERIOD);
+
+ if (variable_period) {
+ if (adjust)
+ period /= (adjust + 1);
+ else
+ period *= 2;
+ if (period > MCE_PERIOD_MAX)
+ period = MCE_PERIOD_MAX;
+ if (period < MCE_PERIOD_MIN)
+ period = MCE_PERIOD_MIN;
+ }
+
+ set_timer(&mce_timer, NOW() + period);
+ adjust = 0;
}
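
With these constants the poller starts at an 8000ms interval; a pass in which no CPU logs an error doubles it (capped at MCE_PERIOD_MAX, 16000ms), while a pass in which one CPU finds errors divides it by two (8000ms to 4000ms), two reporting CPUs divide it by three (about 2666ms), and so on, with MCE_PERIOD_MIN (2000ms) as the floor.
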
static int __init init_nonfatal_mce_checker(void)
@@ -63,13 +93,17 @@ static int __init init_nonfatal_mce_checker(void)
/* Check for MCE support */
if (!mce_available(c))
return -ENODEV;
+
+ memcpy(&bankmask, &mca_allbanks, sizeof (cpu_banks_t));
+ if (mce_firstbank(c) == 1)
+ clear_bit(0, bankmask);
+
/*
* Check for non-fatal errors every MCE_RATE s
*/
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
if (c->x86 == 6) { /* K7 */
- firstbank = 1;
init_timer(&mce_timer, mce_work_fn, NULL, 0);
set_timer(&mce_timer, NOW() + MCE_PERIOD);
break;
@@ -80,15 +114,14 @@ static int __init init_nonfatal_mce_checker(void)
break;
case X86_VENDOR_INTEL:
- /* p5 family is different. P4/P6 and latest CPUs shares the
- * same polling methods
- */
+ /*
+ * The P5 family is different. P4/P6 and latest CPUs share the
+ * same polling methods.
+ */
if ( c->x86 != 5 )
{
- /* some CPUs or banks don't support cmci, we need to
- * enable this feature anyway
- */
- intel_mcheck_timer(c);
+ init_timer(&mce_timer, mce_work_fn, NULL, 0);
+ set_timer(&mce_timer, NOW() + MCE_PERIOD);
}
break;
}
diff --git a/xen/arch/x86/cpu/mcheck/p5.c b/xen/arch/x86/cpu/mcheck/p5.c
index 97360bc7e7..4106cbcf53 100644
--- a/xen/arch/x86/cpu/mcheck/p5.c
+++ b/xen/arch/x86/cpu/mcheck/p5.c
@@ -16,7 +16,7 @@
#include "x86_mca.h"
/* Machine check handler for Pentium class Intel */
-static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
+static void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
@@ -28,19 +28,14 @@ static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long err
}
/* Set up machine check reporting for processors with Intel style MCE */
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
- /*Check for MCE support */
- if( !cpu_has(c, X86_FEATURE_MCE) )
- return;
-
/* Default P5 to off as its often misconnected */
if(mce_disabled != -1)
- return;
- machine_check_vector = pentium_machine_check;
- wmb();
+ return 0;
+ x86_mce_vector_register(pentium_machine_check);
/* Read registers before enabling */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
@@ -50,4 +45,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
/* Enable MCE */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+
+ return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/winchip.c b/xen/arch/x86/cpu/mcheck/winchip.c
index 12b3e6db24..6dede3796f 100644
--- a/xen/arch/x86/cpu/mcheck/winchip.c
+++ b/xen/arch/x86/cpu/mcheck/winchip.c
@@ -16,22 +16,24 @@
#include "mce.h"
/* Machine check handler for WinChip C6 */
-static fastcall void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
+static void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK);
}
/* Set up machine check reporting on the Winchip C6 series */
-void winchip_mcheck_init(struct cpuinfo_x86 *c)
+int winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
- machine_check_vector = winchip_machine_check;
+
wmb();
+ x86_mce_vector_register(winchip_machine_check);
rdmsr(MSR_IDT_FCR1, lo, hi);
lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo&= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+ return (1);
}
diff --git a/xen/arch/x86/cpu/mcheck/x86_mca.h b/xen/arch/x86/cpu/mcheck/x86_mca.h
index df3899bbfe..7661d57e06 100644
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -17,6 +17,10 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#ifndef X86_MCA_H
+
+#define X86_MCA_H
+
/* The MCA/MCE MSRs should not be used anywhere else.
* They are cpu family/model specific and are only for use
@@ -73,6 +77,9 @@
/* reserved bits */
#define MCi_STATUS_OTHER_RESERVED2 0x0180000000000000ULL
+/* Bitfield of MSR_K8_HWCR register */
+#define K8_HWCR_MCi_STATUS_WREN (1ULL << 18)
+
/*Intel Specific bitfield*/
#define CMCI_THRESHOLD 0x2
@@ -87,3 +94,4 @@ extern int mce_disabled;
extern unsigned int nr_mce_banks;
extern int firstbank;
+#endif /* X86_MCA_H */
diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h
index 2d055301f2..85a422363f 100644
--- a/xen/include/asm-x86/traps.h
+++ b/xen/include/asm-x86/traps.h
@@ -28,7 +28,7 @@ struct softirq_trap {
struct cpu_user_regs;
-extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+extern void machine_check_vector(struct cpu_user_regs *regs, long error_code);
/**
* guest_has_trap_callback
diff --git a/xen/include/public/arch-x86/xen-mca.h b/xen/include/public/arch-x86/xen-mca.h
index e1f5297bfa..6a55f32ef3 100644
--- a/xen/include/public/arch-x86/xen-mca.h
+++ b/xen/include/public/arch-x86/xen-mca.h
@@ -56,13 +56,20 @@
/* Hypercall */
#define __HYPERVISOR_mca __HYPERVISOR_arch_0
-#define XEN_MCA_INTERFACE_VERSION 0x03000002
+/*
+ * The xen-unstable repo has interface version 0x03000001; our interface
+ * is incompatible with that and any future minor revisions, so we
+ * choose a different version number range that is numerically less
+ * than that used in xen-unstable.
+ */
+#define XEN_MCA_INTERFACE_VERSION 0x01ecc002
-/* IN: Dom0 calls hypercall from MC event handler. */
-#define XEN_MC_CORRECTABLE 0x0
-/* IN: Dom0/DomU calls hypercall from MC trap handler. */
-#define XEN_MC_TRAP 0x1
-/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */
+#define XEN_MC_NONURGENT 0x0001
+/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */
+#define XEN_MC_URGENT 0x0002
+/* IN: Dom0 acknowledges previously-fetched telemetry */
+#define XEN_MC_ACK 0x0004
/* OUT: All is ok */
#define XEN_MC_OK 0x0
@@ -110,6 +117,7 @@ struct mcinfo_common {
#define MC_FLAG_POLLED (1 << 3)
#define MC_FLAG_RESET (1 << 4)
#define MC_FLAG_CMCI (1 << 5)
+#define MC_FLAG_MCE (1 << 6)
/* contains global x86 mc information */
struct mcinfo_global {
struct mcinfo_common common;
@@ -174,6 +182,7 @@ struct mc_info {
uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
};
typedef struct mc_info mc_info_t;
+DEFINE_XEN_GUEST_HANDLE(mc_info_t);
#define __MC_MSR_ARRAYSIZE 8
#define __MC_NMSRS 1
@@ -274,14 +283,14 @@ DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_cpu_t);
#define XEN_MC_fetch 1
struct xen_mc_fetch {
/* IN/OUT variables. */
- uint32_t flags;
-
-/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
-/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+ uint32_t flags; /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+ XEN_MC_ACK if ack'ing an earlier fetch */
+ /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
+ XEN_MC_NODATA, XEN_MC_NOMATCH */
+ uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */
/* OUT variables. */
- uint32_t fetch_idx; /* only useful for Dom0 for the notify hypercall */
- struct mc_info mc_info;
+ XEN_GUEST_HANDLE(mc_info_t) data;
};
typedef struct xen_mc_fetch xen_mc_fetch_t;
DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
@@ -296,7 +305,6 @@ struct xen_mc_notifydomain {
uint16_t mc_domid; /* The unprivileged domain to notify. */
uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify.
* Usually echo'd value from the fetch hypercall. */
- uint32_t fetch_idx; /* echo'd value from the fetch hypercall. */
/* IN/OUT variables. */
uint32_t flags;
@@ -316,15 +324,16 @@ struct xen_mc_physcpuinfo {
XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
};
+typedef union {
+ struct xen_mc_fetch mc_fetch;
+ struct xen_mc_notifydomain mc_notifydomain;
+ struct xen_mc_physcpuinfo mc_physcpuinfo;
+} xen_mc_arg_t;
+
struct xen_mc {
uint32_t cmd;
uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
- union {
- struct xen_mc_fetch mc_fetch;
- struct xen_mc_notifydomain mc_notifydomain;
- struct xen_mc_physcpuinfo mc_physcpuinfo;
- uint8_t pad[MCINFO_HYPERCALLSIZE];
- } u;
+ xen_mc_arg_t u;
};
typedef struct xen_mc xen_mc_t;
DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
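
For orientation, a dom0-side use of the reworked fetch/ack interface might look like the sketch below (not part of the patch): HYPERVISOR_mca() stands in for whatever hypercall wrapper the dom0 kernel provides, the guest handle is initialised with the usual set_xen_guest_handle() macro from the public headers, and buffer and error handling are simplified.

/* Sketch only: fetch non-urgent telemetry, persist it, then ack it. */
struct mc_info mi;
struct xen_mc mc;

memset(&mc, 0, sizeof(mc));
mc.cmd = XEN_MC_fetch;
mc.interface_version = XEN_MCA_INTERFACE_VERSION;
mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
set_xen_guest_handle(mc.u.mc_fetch.data, &mi);	/* guest buffer for mc_info */

if (HYPERVISOR_mca(&mc) == 0) {			/* hypothetical wrapper */
	uint64_t id = mc.u.mc_fetch.fetch_id;	/* OUT: id to quote in the ack */

	/* mc.u.mc_fetch.flags now reports XEN_MC_OK, XEN_MC_NODATA, ... */
	/* ... decode mi and write the telemetry to stable storage ... */

	mc.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
	mc.u.mc_fetch.fetch_id = id;
	HYPERVISOR_mca(&mc);			/* lets Xen free the entry */
}
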