author     Keir Fraser <keir.fraser@citrix.com>   2009-03-17 14:22:50 +0000
committer  Keir Fraser <keir.fraser@citrix.com>   2009-03-17 14:22:50 +0000
commit     e114bca753116936a99531ec27c591949b5c11c9 (patch)
tree       5a6256158d2f183d6431900567a2891ac6b1793d
parent     60158182b4ff9dfeea12703d040ece4461c795ad (diff)
x86 mcheck: Replace hypervisor MCA telemetry structures with something
more robust, designed to make terminal error telemetry available to the
dom0 panic flow for diagnosis on reboot. Use common code for much of the
AMD and Intel MCE handling.

Signed-off-by: Gavin Maltby <gavin.maltby@sun.com>
Signed-off-by: Frank van der Linden <frank.vanderlinden@sun.com>
-rw-r--r--  xen/arch/x86/cpu/mcheck/Makefile       |   1
-rw-r--r--  xen/arch/x86/cpu/mcheck/amd_f10.c      |  37
-rw-r--r--  xen/arch/x86/cpu/mcheck/amd_k8.c       | 229
-rw-r--r--  xen/arch/x86/cpu/mcheck/amd_nonfatal.c | 132
-rw-r--r--  xen/arch/x86/cpu/mcheck/k7.c           |  11
-rw-r--r--  xen/arch/x86/cpu/mcheck/mce.c          | 955
-rw-r--r--  xen/arch/x86/cpu/mcheck/mce.h          |  98
-rw-r--r--  xen/arch/x86/cpu/mcheck/mce_intel.c    | 379
-rw-r--r--  xen/arch/x86/cpu/mcheck/mctelem.c      | 443
-rw-r--r--  xen/arch/x86/cpu/mcheck/mctelem.h      |  71
-rw-r--r--  xen/arch/x86/cpu/mcheck/non-fatal.c    |  87
-rw-r--r--  xen/arch/x86/cpu/mcheck/p5.c           |  15
-rw-r--r--  xen/arch/x86/cpu/mcheck/winchip.c      |   8
-rw-r--r--  xen/arch/x86/cpu/mcheck/x86_mca.h      |   8
-rw-r--r--  xen/include/asm-x86/traps.h            |   2
-rw-r--r--  xen/include/public/arch-x86/xen-mca.h  |  47
16 files changed, 1439 insertions(+), 1084 deletions(-)
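
At the heart of the change is the new mctelem producer/consumer interface
(mctelem.c/mctelem.h, added by this patch): a handler or poller reserves a
telemetry slot, fills it via mcheck_mca_logout(), and then either commits it
for dom0 to fetch or dismisses it. The following is a minimal sketch of that
flow as the reworked AMD poller (mce_amd_checkregs) uses it; all names are
taken from the patch and the error handling is simplified.

/* Minimal sketch of the new telemetry flow; simplified from
 * mce_amd_checkregs() as modified by this patch. */
static void example_poll_once(void)
{
	struct mca_summary bs;
	mctelem_cookie_t mctc;

	/* Read out all MCA banks; the returned cookie references a reserved
	 * telemetry slot (NULL if nothing was logged or no slot was free). */
	mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
	if (mctc == NULL)
		return;

	if (bs.errcnt && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
		/* Commit the record so dom0 can retrieve it via the
		 * XEN_MC_fetch hypercall, then notify dom0. */
		mctelem_commit(mctc);
		send_guest_global_virq(dom0, VIRQ_MCA);
	} else {
		/* Nobody is listening; release the reservation. */
		mctelem_dismiss(mctc);
	}
}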
diff --git a/xen/arch/x86/cpu/mcheck/Makefile b/xen/arch/x86/cpu/mcheck/Makefile
index 15fed6eb0b..ed0ae00058 100644
--- a/xen/arch/x86/cpu/mcheck/Makefile
+++ b/xen/arch/x86/cpu/mcheck/Makefile
@@ -2,6 +2,7 @@ obj-y += amd_nonfatal.o
obj-y += k7.o
obj-y += amd_k8.o
obj-y += amd_f10.o
+obj-y += mctelem.o
obj-y += mce.o
obj-y += mce_intel.o
obj-y += non-fatal.o
diff --git a/xen/arch/x86/cpu/mcheck/amd_f10.c b/xen/arch/x86/cpu/mcheck/amd_f10.c
index 9c26ef9fe8..dd2a54b8fa 100644
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c
@@ -49,20 +49,21 @@
#include "x86_mca.h"
-static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+static enum mca_extinfo
+amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
{
struct mcinfo_extended mc_ext;
/* Family 0x10 introduced additional MSR that belong to the
* northbridge bank (4). */
- if (bank != 4)
- return 0;
+ if (mi == NULL || bank != 4)
+ return MCA_EXTINFO_IGNORED;
if (!(status & MCi_STATUS_VAL))
- return 0;
+ return MCA_EXTINFO_IGNORED;
if (!(status & MCi_STATUS_MISCV))
- return 0;
+ return MCA_EXTINFO_IGNORED;
memset(&mc_ext, 0, sizeof(mc_ext));
mc_ext.common.type = MC_TYPE_EXTENDED;
@@ -78,23 +79,25 @@ static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
x86_mcinfo_add(mi, &mc_ext);
- return 1;
+ return MCA_EXTINFO_LOCAL;
}
extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
/* AMD Family10 machine check */
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c)
{
uint64_t value;
uint32_t i;
int cpu_nr;
- machine_check_vector = k8_machine_check;
- mc_callback_bank_extended = amd_f10_handler;
+ if (!cpu_has(c, X86_FEATURE_MCA))
+ return 0;
+
+ x86_mce_vector_register(k8_machine_check);
+ x86_mce_callback_register(amd_f10_handler);
cpu_nr = smp_processor_id();
- wmb();
rdmsrl(MSR_IA32_MCG_CAP, value);
if (value & MCG_CTL_P) /* Control register present ? */
@@ -104,18 +107,9 @@ void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
for (i = 0; i < nr_mce_banks; i++) {
switch (i) {
case 4: /* Northbridge */
- /* Enable error reporting of all errors,
- * enable error checking and
- * disable sync flooding */
- wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+ /* Enable error reporting of all errors */
+ wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
-
- /* XXX: We should write the value 0x1087821UL into
- * to register F3x180 here, which sits in
- * the PCI extended configuration space.
- * Since this is not possible here, we can only hope,
- * Dom0 is doing that.
- */
break;
default:
@@ -128,4 +122,5 @@ void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
set_in_cr4(X86_CR4_MCE);
printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr);
+ return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/amd_k8.c b/xen/arch/x86/cpu/mcheck/amd_k8.c
index 768bfadb2a..03c36d3a1d 100644
--- a/xen/arch/x86/cpu/mcheck/amd_k8.c
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c
@@ -67,234 +67,27 @@
#include <asm/msr.h>
#include "mce.h"
-#include "x86_mca.h"
/* Machine Check Handler for AMD K8 family series */
void k8_machine_check(struct cpu_user_regs *regs, long error_code)
{
- struct vcpu *vcpu = current;
- struct domain *curdom;
- struct mc_info *mc_data;
- struct mcinfo_global mc_global;
- struct mcinfo_bank mc_info;
- uint64_t status, addrv, miscv, uc;
- uint32_t i;
- unsigned int cpu_nr;
- uint32_t xen_impacted = 0;
-#define DOM_NORMAL 0
-#define DOM0_TRAP 1
-#define DOMU_TRAP 2
-#define DOMU_KILLED 4
- uint32_t dom_state = DOM_NORMAL;
-
- /* This handler runs as interrupt gate. So IPIs from the
- * polling service routine are defered until we finished.
- */
-
- /* Disable interrupts for the _vcpu_. It may not re-scheduled to
- * an other physical CPU or the impacted process in the guest
- * continues running with corrupted data, otherwise. */
- vcpu_schedule_lock_irq(vcpu);
-
- mc_data = x86_mcinfo_getptr();
- cpu_nr = smp_processor_id();
- BUG_ON(cpu_nr != vcpu->processor);
-
- curdom = vcpu->domain;
-
- memset(&mc_global, 0, sizeof(mc_global));
- mc_global.common.type = MC_TYPE_GLOBAL;
- mc_global.common.size = sizeof(mc_global);
-
- mc_global.mc_domid = curdom->domain_id; /* impacted domain */
-
- x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
- &mc_global.mc_coreid, &mc_global.mc_core_threadid,
- &mc_global.mc_apicid, NULL, NULL, NULL);
-
- mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
- mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
- rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
- /* Quick check, who is impacted */
- xen_impacted = is_idle_domain(curdom);
-
- /* Dom0 */
- x86_mcinfo_clear(mc_data);
- x86_mcinfo_add(mc_data, &mc_global);
-
- for (i = 0; i < nr_mce_banks; i++) {
- struct domain *d;
-
- rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
- if (!(status & MCi_STATUS_VAL))
- continue;
-
- /* An error happened in this bank.
- * This is expected to be an uncorrectable error,
- * since correctable errors get polled.
- */
- uc = status & MCi_STATUS_UC;
-
- memset(&mc_info, 0, sizeof(mc_info));
- mc_info.common.type = MC_TYPE_BANK;
- mc_info.common.size = sizeof(mc_info);
- mc_info.mc_bank = i;
- mc_info.mc_status = status;
-
- addrv = 0;
- if (status & MCi_STATUS_ADDRV) {
- rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
-
- d = maddr_get_owner(addrv);
- if (d != NULL)
- mc_info.mc_domid = d->domain_id;
- }
-
- miscv = 0;
- if (status & MCi_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
-
- mc_info.mc_addr = addrv;
- mc_info.mc_misc = miscv;
-
- x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
-
- if (mc_callback_bank_extended)
- mc_callback_bank_extended(mc_data, i, status);
-
- /* clear status */
- wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
-
- status = mc_global.mc_gstatus;
-
- /* clear MCIP or cpu enters shutdown state
- * in case another MCE occurs. */
- status &= ~MCG_STATUS_MCIP;
- wrmsrl(MSR_IA32_MCG_STATUS, status);
- wmb();
-
- /* For the details see the discussion "MCE/MCA concept" on xen-devel.
- * The thread started here:
- * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
- */
-
- /* MCG_STATUS_RIPV:
- * When this bit is not set, then the instruction pointer onto the stack
- * to resume at is not valid. If xen is interrupted, then we panic anyway
- * right below. Otherwise it is up to the guest to figure out if
- * guest kernel or guest userland is affected and should kill either
- * itself or the affected process.
- */
-
- /* MCG_STATUS_EIPV:
- * Evaluation of EIPV is the job of the guest.
- */
-
- if (xen_impacted) {
- /* Now we are going to panic anyway. Allow interrupts, so that
- * printk on serial console can work. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* Uh, that means, machine check exception
- * inside Xen occured. */
- printk("Machine check exception occured in Xen.\n");
-
- /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
- * to the error then it makes sense to print a stack trace.
- * That can be useful for more detailed error analysis and/or
- * error case studies to figure out, if we can clear
- * xen_impacted and kill a DomU instead
- * (i.e. if a guest only control structure is affected, but then
- * we must ensure the bad pages are not re-used again).
- */
- if (status & MCG_STATUS_EIPV) {
- printk("MCE: Instruction Pointer is related to the error. "
- "Therefore, print the execution state.\n");
- show_execution_state(regs);
- }
- x86_mcinfo_dump(mc_data);
- mc_panic("End of MCE. Use mcelog to decode above error codes.\n");
- }
-
- /* If Dom0 registered a machine check handler, which is only possible
- * with a PV MCA driver, then ... */
- if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
- dom_state = DOM0_TRAP;
-
- /* ... deliver machine check trap to Dom0. */
- send_guest_trap(dom0, 0, TRAP_machine_check);
-
- /* Xen may tell Dom0 now to notify the DomU.
- * But this will happen through a hypercall. */
- } else
- /* Dom0 did not register a machine check handler, but if DomU
- * did so, then... */
- if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
- dom_state = DOMU_TRAP;
-
- /* ... deliver machine check trap to DomU */
- send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
- } else {
- /* hmm... noone feels responsible to handle the error.
- * So, do a quick check if a DomU is impacted or not.
- */
- if (curdom == dom0) {
- /* Dom0 is impacted. Since noone can't handle
- * this error, panic! */
- x86_mcinfo_dump(mc_data);
- mc_panic("MCE occured in Dom0, which it can't handle\n");
-
- /* UNREACHED */
- } else {
- dom_state = DOMU_KILLED;
-
- /* Enable interrupts. This basically results in
- * calling sti on the *physical* cpu. But after
- * domain_crash() the vcpu pointer is invalid.
- * Therefore, we must unlock the irqs before killing
- * it. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* DomU is impacted. Kill it and continue. */
- domain_crash(curdom);
- }
- }
-
-
- switch (dom_state) {
- case DOM0_TRAP:
- case DOMU_TRAP:
- /* Enable interrupts. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* guest softirqs and event callbacks are scheduled
- * immediately after this handler exits. */
- break;
- case DOMU_KILLED:
- /* Nothing to do here. */
- break;
- default:
- BUG();
- }
+ mcheck_cmn_handler(regs, error_code, mca_allbanks);
}
-
/* AMD K8 machine check */
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c)
{
uint64_t value;
uint32_t i;
int cpu_nr;
- machine_check_vector = k8_machine_check;
+ /* Check for PPro style MCA; our caller has confirmed MCE support. */
+ if (!cpu_has(c, X86_FEATURE_MCA))
+ return 0;
+
+ x86_mce_vector_register(k8_machine_check);
cpu_nr = smp_processor_id();
- wmb();
rdmsrl(MSR_IA32_MCG_CAP, value);
if (value & MCG_CTL_P) /* Control register present ? */
@@ -304,10 +97,8 @@ void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
for (i = 0; i < nr_mce_banks; i++) {
switch (i) {
case 4: /* Northbridge */
- /* Enable error reporting of all errors,
- * enable error checking and
- * disable sync flooding */
- wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+ /* Enable error reporting of all errors */
+ wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
break;
@@ -321,4 +112,6 @@ void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
set_in_cr4(X86_CR4_MCE);
printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+
+ return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
index f57e4e3811..01766c2a45 100644
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
@@ -58,22 +58,23 @@
#include <xen/smp.h>
#include <xen/timer.h>
#include <xen/event.h>
-#include <asm/processor.h>
+
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
-#include "x86_mca.h"
static struct timer mce_timer;
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(10000)
#define MCE_MIN MILLISECS(2000)
#define MCE_MAX MILLISECS(30000)
static s_time_t period = MCE_PERIOD;
static int hw_threshold = 0;
static int adjust = 0;
+static int variable_period = 1;
/* The polling service routine:
* Collects information of correctable errors and notifies
@@ -81,99 +82,46 @@ static int adjust = 0;
*/
void mce_amd_checkregs(void *info)
{
- struct vcpu *vcpu = current;
- struct mc_info *mc_data;
- struct mcinfo_global mc_global;
- struct mcinfo_bank mc_info;
- uint64_t status, addrv, miscv;
- unsigned int i;
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
unsigned int event_enabled;
- unsigned int cpu_nr;
- int error_found;
- /* We don't need a slot yet. Only allocate one on error. */
- mc_data = NULL;
+ mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
- cpu_nr = smp_processor_id();
- BUG_ON(cpu_nr != vcpu->processor);
event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
- error_found = 0;
-
- memset(&mc_global, 0, sizeof(mc_global));
- mc_global.common.type = MC_TYPE_GLOBAL;
- mc_global.common.size = sizeof(mc_global);
-
- mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
- mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-
- x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
- &mc_global.mc_coreid, &mc_global.mc_core_threadid,
- &mc_global.mc_apicid, NULL, NULL, NULL);
-
- mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
- rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
- for (i = 0; i < nr_mce_banks; i++) {
- struct domain *d;
-
- rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
-
- if (!(status & MCi_STATUS_VAL))
- continue;
-
- if (mc_data == NULL) {
- /* Now we need a slot to fill in error telemetry. */
- mc_data = x86_mcinfo_getptr();
- BUG_ON(mc_data == NULL);
- x86_mcinfo_clear(mc_data);
- x86_mcinfo_add(mc_data, &mc_global);
- }
-
- memset(&mc_info, 0, sizeof(mc_info));
- mc_info.common.type = MC_TYPE_BANK;
- mc_info.common.size = sizeof(mc_info);
- mc_info.mc_bank = i;
- mc_info.mc_status = status;
- /* Increase polling frequency */
- error_found = 1;
-
- addrv = 0;
- if (status & MCi_STATUS_ADDRV) {
- rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
-
- d = maddr_get_owner(addrv);
- if (d != NULL)
- mc_info.mc_domid = d->domain_id;
- }
-
- miscv = 0;
- if (status & MCi_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
-
- mc_info.mc_addr = addrv;
- mc_info.mc_misc = miscv;
- x86_mcinfo_add(mc_data, &mc_info);
+ if (bs.errcnt && mctc != NULL) {
+ static uint64_t dumpcount = 0;
- if (mc_callback_bank_extended)
- mc_callback_bank_extended(mc_data, i, status);
+ /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+ * Otherwise, if dom0 has had plenty of time to register
+ * the virq handler but still hasn't then dump telemetry
+ * to the Xen console. The call count may be incremented
+ * on multiple cpus at once and is indicative only - just
+ * a simple-minded attempt to avoid spamming the console
+ * for corrected errors in early startup. */
- /* clear status */
- wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
- wmb();
- }
-
- if (error_found > 0) {
- /* If Dom0 enabled the VIRQ_MCA event, then ... */
- if (event_enabled)
- /* ... notify it. */
+ if (event_enabled) {
+ mctelem_commit(mctc);
send_guest_global_virq(dom0, VIRQ_MCA);
- else
- /* ... or dump it */
- x86_mcinfo_dump(mc_data);
+ } else if (++dumpcount >= 10) {
+ x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ } else {
+ mctelem_dismiss(mctc);
+ }
+
+ } else if (mctc != NULL) {
+ mctelem_dismiss(mctc);
}
- adjust += error_found;
+ /* adjust is global and all cpus may attempt to increment it without
+ * synchronisation, so they race and the final adjust count
+ * (number of cpus seeing any error) is approximate. We can
+ * guarantee that if any cpu observes an error that the
+ * adjust count is at least 1. */
+ if (bs.errcnt)
+ adjust++;
}
/* polling service routine invoker:
@@ -188,7 +136,7 @@ static void mce_amd_work_fn(void *data)
on_each_cpu(mce_amd_checkregs, data, 1, 1);
if (adjust > 0) {
- if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+ if (!guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
/* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
printk("MCE: polling routine found correctable error. "
" Use mcelog to parse above error output.\n");
@@ -229,19 +177,19 @@ static void mce_amd_work_fn(void *data)
}
}
- if (adjust > 0) {
+ if (variable_period && adjust > 0) {
/* Increase polling frequency */
adjust++; /* adjust == 1 must have an effect */
period /= adjust;
- } else {
+ } else if (variable_period) {
/* Decrease polling frequency */
period *= 2;
}
- if (period > MCE_MAX) {
+ if (variable_period && period > MCE_MAX) {
/* limit: Poll at least every 30s */
period = MCE_MAX;
}
- if (period < MCE_MIN) {
+ if (variable_period && period < MCE_MIN) {
/* limit: Poll every 2s.
* When this is reached an uncorrectable error
* is expected to happen, if Dom0 does nothing.
@@ -262,7 +210,7 @@ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
/* The threshold bitfields in MSR_IA32_MC4_MISC has
* been introduced along with the SVME feature bit. */
- if (cpu_has(c, X86_FEATURE_SVME)) {
+ if (variable_period && cpu_has(c, X86_FEATURE_SVME)) {
uint64_t value;
/* hw threshold registers present */
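
The polling interval above is now adjusted only when variable_period is set.
The computation spread across mce_amd_work_fn() boils down to the following
sketch; it restates the logic in this patch (constants as defined above) and
is not itself part of the change.

/* Sketch of the adaptive polling period used by mce_amd_work_fn(). */
static s_time_t next_poll_period(s_time_t period, int adjust)
{
	if (!variable_period)
		return period;

	if (adjust > 0) {
		adjust++;		/* adjust == 1 must still have an effect */
		period /= adjust;	/* errors seen: poll more often */
	} else {
		period *= 2;		/* quiet interval: back off */
	}

	if (period > MCE_MAX)
		period = MCE_MAX;	/* poll at least every 30s */
	if (period < MCE_MIN)
		period = MCE_MIN;	/* poll at most every 2s */

	return period;
}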
diff --git a/xen/arch/x86/cpu/mcheck/k7.c b/xen/arch/x86/cpu/mcheck/k7.c
index aedd0a0e1f..1a0a0a5fef 100644
--- a/xen/arch/x86/cpu/mcheck/k7.c
+++ b/xen/arch/x86/cpu/mcheck/k7.c
@@ -68,13 +68,16 @@ static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_co
/* AMD K7 machine check */
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
- machine_check_vector = k7_machine_check;
- wmb();
+ /* Check for PPro style MCA; our caller has confirmed MCE support. */
+ if (!cpu_has(c, X86_FEATURE_MCA))
+ return 0;
+
+ x86_mce_vector_register(k7_machine_check);
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
@@ -92,4 +95,6 @@ void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
smp_processor_id());
+
+ return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index 3b622adc50..a6051d9755 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -10,104 +10,490 @@
#include <xen/smp.h>
#include <xen/errno.h>
#include <xen/console.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+#include <xen/guest_access.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
+#include <asm/msr.h>
#include "mce.h"
-#include "x86_mca.h"
int mce_disabled = 0;
unsigned int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
-/* XXX For now a fixed array is used. Later this should be changed
- * to a dynamic allocated array with the size calculated in relation
- * to physical cpus present in the machine.
- * The more physical cpus are available, the more entries you need.
- */
-#define MAX_MCINFO 20
-
-struct mc_machine_notify {
- struct mc_info mc;
- uint32_t fetch_idx;
- uint32_t valid;
-};
+static void mcinfo_clear(struct mc_info *);
-struct mc_machine {
+#define SEG_PL(segsel) ((segsel) & 0x3)
- /* Array structure used for collecting machine check error telemetry. */
- struct mc_info mc[MAX_MCINFO];
+#if 1 /* XXFM switch to 0 for putback */
- /* We handle multiple machine check reports lockless by
- * iterating through the array using the producer/consumer concept.
- */
- /* Producer array index to fill with machine check error data.
- * Index must be increased atomically. */
- uint32_t error_idx;
-
- /* Consumer array index to fetch machine check error data from.
- * Index must be increased atomically. */
- uint32_t fetch_idx;
-
- /* Integer array holding the indeces of the mc array that allows
- * a Dom0 to notify a DomU to re-fetch the same machine check error
- * data. The notification and refetch also uses its own
- * producer/consumer mechanism, because Dom0 may decide to not report
- * every error to the impacted DomU.
- */
- struct mc_machine_notify notify[MAX_MCINFO];
+#define x86_mcerr(str, err) _x86_mcerr(str, err)
- /* Array index to get fetch_idx from.
- * Index must be increased atomically. */
- uint32_t notifyproducer_idx;
- uint32_t notifyconsumer_idx;
-};
+static int _x86_mcerr(const char *msg, int err)
+{
+ printk("x86_mcerr: %s, returning %d\n",
+ msg != NULL ? msg : "", err);
+ return err;
+}
+#else
+#define x86_mcerr(str,err)
+#endif
-/* Global variable with machine check information. */
-struct mc_machine mc_data;
+cpu_banks_t mca_allbanks;
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
-{
+{
printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
smp_processor_id());
}
+static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
+
+void x86_mce_vector_register(x86_mce_vector_t hdlr)
+{
+ _machine_check_vector = hdlr;
+ wmb();
+}
+
/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+void machine_check_vector(struct cpu_user_regs *regs, long error_code)
+{
+ _machine_check_vector(regs, error_code);
+}
/* Init machine check callback handler
* It is used to collect additional information provided by newer
* CPU families/models without the need to duplicate the whole handler.
* This avoids having many handlers doing almost nearly the same and each
* with its own tweaks ands bugs. */
-int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
+static x86_mce_callback_t mc_callback_bank_extended = NULL;
+
+void x86_mce_callback_register(x86_mce_callback_t cbfunc)
+{
+ mc_callback_bank_extended = cbfunc;
+}
+
+/* Utility function to perform MCA bank telemetry readout and to push that
+ * telemetry towards an interested dom0 for logging and diagnosis.
+ * The caller - #MC handler or MCA poll function - must arrange that we
+ * do not migrate cpus. */
+
+/* XXFM Could add overflow counting? */
+mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
+ struct mca_summary *sp)
+{
+ struct vcpu *v = current;
+ struct domain *d;
+ uint64_t gstatus, status, addr, misc;
+ struct mcinfo_global mcg; /* on stack */
+ struct mcinfo_common *mic;
+ struct mcinfo_global *mig; /* on stack */
+ mctelem_cookie_t mctc = NULL;
+ uint32_t uc = 0, pcc = 0;
+ struct mc_info *mci = NULL;
+ mctelem_class_t which = MC_URGENT; /* XXXgcc */
+ unsigned int cpu_nr;
+ int errcnt = 0;
+ int i;
+ enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
+
+ cpu_nr = smp_processor_id();
+ BUG_ON(cpu_nr != v->processor);
+
+ rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+
+ memset(&mcg, 0, sizeof (mcg));
+ mcg.common.type = MC_TYPE_GLOBAL;
+ mcg.common.size = sizeof (mcg);
+ if (v != NULL && ((d = v->domain) != NULL)) {
+ mcg.mc_domid = d->domain_id;
+ mcg.mc_vcpuid = v->vcpu_id;
+ } else {
+ mcg.mc_domid = -1;
+ mcg.mc_vcpuid = -1;
+ }
+ mcg.mc_gstatus = gstatus; /* MCG_STATUS */
+
+ switch (who) {
+ case MCA_MCE_HANDLER:
+ mcg.mc_flags = MC_FLAG_MCE;
+ which = MC_URGENT;
+ break;
+
+ case MCA_POLLER:
+ case MCA_RESET:
+ mcg.mc_flags = MC_FLAG_POLLED;
+ which = MC_NONURGENT;
+ break;
+
+ case MCA_CMCI_HANDLER:
+ mcg.mc_flags = MC_FLAG_CMCI;
+ which = MC_NONURGENT;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Retrieve detector information */
+ x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
+ &mcg.mc_coreid, &mcg.mc_core_threadid,
+ &mcg.mc_apicid, NULL, NULL, NULL);
+
+ for (i = 0; i < 32 && i < nr_mce_banks; i++) {
+ struct mcinfo_bank mcb; /* on stack */
+
+ /* Skip bank if corresponding bit in bankmask is clear */
+ if (!test_bit(i, bankmask))
+ continue;
+
+ rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+ if (!(status & MCi_STATUS_VAL))
+ continue; /* this bank has no valid telemetry */
+
+ /* If this is the first bank with valid MCA DATA, then
+ * try to reserve an entry from the urgent/nonurgent queue
+ * depending on whethere we are called from an exception or
+ * a poller; this can fail (for example dom0 may not
+ * yet have consumed past telemetry). */
+ if (errcnt == 0) {
+ if ((mctc = mctelem_reserve(which)) != NULL) {
+ mci = mctelem_dataptr(mctc);
+ mcinfo_clear(mci);
+ }
+ }
+
+ memset(&mcb, 0, sizeof (mcb));
+ mcb.common.type = MC_TYPE_BANK;
+ mcb.common.size = sizeof (mcb);
+ mcb.mc_bank = i;
+ mcb.mc_status = status;
+
+ /* form a mask of which banks have logged uncorrected errors */
+ if ((status & MCi_STATUS_UC) != 0)
+ uc |= (1 << i);
+
+ /* likewise for those with processor context corrupt */
+ if ((status & MCi_STATUS_PCC) != 0)
+ pcc |= (1 << i);
+
+ addr = misc = 0;
+
+ if (status & MCi_STATUS_ADDRV) {
+ rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
+ d = maddr_get_owner(addr);
+ if (d != NULL && (who == MCA_POLLER ||
+ who == MCA_CMCI_HANDLER))
+ mcb.mc_domid = d->domain_id;
+ }
+
+ if (status & MCi_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
+
+ mcb.mc_addr = addr;
+ mcb.mc_misc = misc;
+
+ if (who == MCA_CMCI_HANDLER) {
+ rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
+ rdtscll(mcb.mc_tsc);
+ }
+ /* Increment the error count; if this is the first bank
+ * with a valid error then add the global info to the mcinfo. */
+ if (errcnt++ == 0 && mci != NULL)
+ x86_mcinfo_add(mci, &mcg);
-static void amd_mcheck_init(struct cpuinfo_x86 *ci)
+ /* Add the bank data */
+ if (mci != NULL)
+ x86_mcinfo_add(mci, &mcb);
+
+ if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
+ cbret = mc_callback_bank_extended(mci, i, status);
+ }
+
+ /* Clear status */
+ wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+ wmb();
+ }
+
+ if (mci != NULL && errcnt > 0) {
+ x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
+ mig = (struct mcinfo_global *)mic;
+ if (pcc)
+ mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
+ else if (uc)
+ mcg.mc_flags |= MC_FLAG_RECOVERABLE;
+ else
+ mcg.mc_flags |= MC_FLAG_CORRECTABLE;
+ }
+
+
+ if (sp) {
+ sp->errcnt = errcnt;
+ sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
+ sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
+ sp->uc = uc;
+ sp->pcc = pcc;
+ }
+
+ return mci != NULL ? mctc : NULL; /* may be NULL */
+}
+
+#define DOM_NORMAL 0
+#define DOM0_TRAP 1
+#define DOMU_TRAP 2
+#define DOMU_KILLED 4
+
+/* Shared #MC handler. */
+void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
+ cpu_banks_t bankmask)
{
+ int xen_state_lost, dom0_state_lost, domU_state_lost;
+ struct vcpu *v = current;
+ struct domain *curdom = v->domain;
+ domid_t domid = curdom->domain_id;
+ int ctx_xen, ctx_dom0, ctx_domU;
+ uint32_t dom_state = DOM_NORMAL;
+ mctelem_cookie_t mctc = NULL;
+ struct mca_summary bs;
+ struct mc_info *mci = NULL;
+ int irqlocked = 0;
+ uint64_t gstatus;
+ int ripv;
+
+ /* This handler runs as interrupt gate. So IPIs from the
+ * polling service routine are defered until we're finished.
+ */
+
+ /* Disable interrupts for the _vcpu_. It may not re-scheduled to
+ * another physical CPU. */
+ vcpu_schedule_lock_irq(v);
+ irqlocked = 1;
+
+ /* Read global status; if it does not indicate machine check
+ * in progress then bail as long as we have a valid ip to return to. */
+ rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+ ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
+ if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
+ add_taint(TAINT_MACHINE_CHECK); /* questionable */
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+ goto cmn_handler_done;
+ }
+
+ /* Go and grab error telemetry. We must choose whether to commit
+ * for logging or dismiss the cookie that is returned, and must not
+ * reference the cookie after that action.
+ */
+ mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs);
+ if (mctc != NULL)
+ mci = (struct mc_info *)mctelem_dataptr(mctc);
+
+ /* Clear MCIP or another #MC will enter shutdown state */
+ gstatus &= ~MCG_STATUS_MCIP;
+ wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
+ wmb();
+
+ /* If no valid errors and our stack is intact, we're done */
+ if (ripv && bs.errcnt == 0) {
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+ goto cmn_handler_done;
+ }
+
+ if (bs.uc || bs.pcc)
+ add_taint(TAINT_MACHINE_CHECK);
+
+ /* Machine check exceptions will usually be for UC and/or PCC errors,
+ * but it is possible to configure machine check for some classes
+ * of corrected error.
+ *
+ * UC errors could compromise any domain or the hypervisor
+ * itself - for example a cache writeback of modified data that
+ * turned out to be bad could be for data belonging to anyone, not
+ * just the current domain. In the absence of known data poisoning
+ * to prevent consumption of such bad data in the system we regard
+ * all UC errors as terminal. It may be possible to attempt some
+ * heuristics based on the address affected, which guests have
+ * mappings to that mfn etc.
+ *
+ * PCC errors apply to the current context.
+ *
+ * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
+ * and not PCC is terminal - the return instruction pointer
+ * pushed onto the stack is bogus. If the interrupt context is
+ * the hypervisor or dom0 the game is over, otherwise we can
+ * limit the impact to a single domU but only if we trampoline
+ * somewhere safely - we can't return and unwind the stack.
+ * Since there is no trampoline in place we will treat !RIPV
+ * as terminal for any context.
+ */
+ ctx_xen = SEG_PL(regs->cs) == 0;
+ ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
+ ctx_domU = !ctx_xen && !ctx_dom0;
+
+ xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
+ !ripv;
+ dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
+ domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
+
+ if (xen_state_lost) {
+ /* Now we are going to panic anyway. Allow interrupts, so that
+ * printk on serial console can work. */
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+
+ printk("Terminal machine check exception occured in "
+ "hypervisor context.\n");
+
+ /* If MCG_STATUS_EIPV indicates, the IP on the stack is related
+ * to the error then it makes sense to print a stack trace.
+ * That can be useful for more detailed error analysis and/or
+ * error case studies to figure out, if we can clear
+ * xen_impacted and kill a DomU instead
+ * (i.e. if a guest only control structure is affected, but then
+ * we must ensure the bad pages are not re-used again).
+ */
+ if (bs.eipv & MCG_STATUS_EIPV) {
+ printk("MCE: Instruction Pointer is related to the "
+ "error, therefore print the execution state.\n");
+ show_execution_state(regs);
+ }
+
+ /* Commit the telemetry so that panic flow can find it. */
+ if (mctc != NULL) {
+ x86_mcinfo_dump(mci);
+ mctelem_commit(mctc);
+ }
+ mc_panic("Hypervisor state lost due to machine check "
+ "exception.\n");
+ /*NOTREACHED*/
+ }
+
+ /*
+ * Xen hypervisor state is intact. If dom0 state is lost then
+ * give it a chance to decide what to do if it has registered
+ * a handler for this event, otherwise panic.
+ *
+ * XXFM Could add some Solaris dom0 contract kill here?
+ */
+ if (dom0_state_lost) {
+ if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+ dom_state = DOM0_TRAP;
+ send_guest_trap(dom0, 0, TRAP_machine_check);
+ /* XXFM case of return with !ripv ??? */
+ } else {
+ /* Commit telemetry for panic flow. */
+ if (mctc != NULL) {
+ x86_mcinfo_dump(mci);
+ mctelem_commit(mctc);
+ }
+ mc_panic("Dom0 state lost due to machine check "
+ "exception\n");
+ /*NOTREACHED*/
+ }
+ }
+
+ /*
+ * If a domU has lost state then send it a trap if it has registered
+ * a handler, otherwise crash the domain.
+ * XXFM Revisit this functionality.
+ */
+ if (domU_state_lost) {
+ if (guest_has_trap_callback(v->domain, v->vcpu_id,
+ TRAP_machine_check)) {
+ dom_state = DOMU_TRAP;
+ send_guest_trap(curdom, v->vcpu_id,
+ TRAP_machine_check);
+ } else {
+ dom_state = DOMU_KILLED;
+ /* Enable interrupts. This basically results in
+ * calling sti on the *physical* cpu. But after
+ * domain_crash() the vcpu pointer is invalid.
+ * Therefore, we must unlock the irqs before killing
+ * it. */
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+
+ /* DomU is impacted. Kill it and continue. */
+ domain_crash(curdom);
+ }
+ }
+
+ switch (dom_state) {
+ case DOM0_TRAP:
+ case DOMU_TRAP:
+ /* Enable interrupts. */
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+
+ /* guest softirqs and event callbacks are scheduled
+ * immediately after this handler exits. */
+ break;
+ case DOMU_KILLED:
+ /* Nothing to do here. */
+ break;
+
+ case DOM_NORMAL:
+ vcpu_schedule_unlock_irq(v);
+ irqlocked = 0;
+ break;
+ }
+
+cmn_handler_done:
+ BUG_ON(irqlocked);
+ BUG_ON(!ripv);
+
+ if (bs.errcnt) {
+ /* Not panicing, so forward telemetry to dom0 now if it
+ * is interested. */
+ if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ if (mctc != NULL)
+ mctelem_commit(mctc);
+ send_guest_global_virq(dom0, VIRQ_MCA);
+ } else {
+ x86_mcinfo_dump(mci);
+ if (mctc != NULL)
+ mctelem_dismiss(mctc);
+ }
+ } else if (mctc != NULL) {
+ mctelem_dismiss(mctc);
+ }
+}
+
+static int amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+ int rc = 0;
switch (ci->x86) {
case 6:
- amd_k7_mcheck_init(ci);
+ rc = amd_k7_mcheck_init(ci);
break;
case 0xf:
- amd_k8_mcheck_init(ci);
+ rc = amd_k8_mcheck_init(ci);
break;
case 0x10:
- amd_f10_mcheck_init(ci);
+ rc = amd_f10_mcheck_init(ci);
break;
default:
/* Assume that machine check support is available.
* The minimum provided support is at least the K8. */
- amd_k8_mcheck_init(ci);
+ rc = amd_k8_mcheck_init(ci);
}
+
+ return rc;
}
/*check the existence of Machine Check*/
@@ -116,50 +502,81 @@ int mce_available(struct cpuinfo_x86 *c)
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
+/*
+ * Check if bank 0 is usable for MCE. It isn't for AMD K7,
+ * and Intel P6 family before model 0x1a.
+ */
+int mce_firstbank(struct cpuinfo_x86 *c)
+{
+ if (c->x86 == 6) {
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return 1;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
+ return 1;
+ }
+
+ return 0;
+}
+
/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
+ int inited = 0, i;
+
if (mce_disabled == 1) {
printk(XENLOG_INFO "MCE support disabled by bootparam\n");
return;
}
+ for (i = 0; i < MAX_NR_BANKS; i++)
+ set_bit(i,mca_allbanks);
+
+ /* Enforce at least MCE support in CPUID information. Individual
+ * families may also need to enforce a check for MCA support. */
if (!cpu_has(c, X86_FEATURE_MCE)) {
printk(XENLOG_INFO "CPU%i: No machine check support available\n",
smp_processor_id());
return;
}
- memset(&mc_data, 0, sizeof(struct mc_machine));
+ mctelem_init(sizeof (struct mc_info));
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
- amd_mcheck_init(c);
+ inited = amd_mcheck_init(c);
break;
case X86_VENDOR_INTEL:
+ switch (c->x86) {
+ case 5:
#ifndef CONFIG_X86_64
- if (c->x86==5)
- intel_p5_mcheck_init(c);
+ inited = intel_p5_mcheck_init(c);
#endif
- /*If it is P6 or P4 family, including CORE 2 DUO series*/
- if (c->x86 == 6 || c->x86==15)
- {
- printk(KERN_DEBUG "MCE: Intel newly family MC Init\n");
- intel_mcheck_init(c);
+ break;
+
+ case 6:
+ case 15:
+ inited = intel_mcheck_init(c);
+ break;
}
break;
#ifndef CONFIG_X86_64
case X86_VENDOR_CENTAUR:
- if (c->x86==5)
- winchip_mcheck_init(c);
+ if (c->x86==5) {
+ inited = winchip_mcheck_init(c);
+ }
break;
#endif
default:
break;
}
+
+ if (!inited)
+ printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
+ smp_processor_id());
}
@@ -176,191 +593,12 @@ static void __init mcheck_enable(char *str)
custom_param("nomce", mcheck_disable);
custom_param("mce", mcheck_enable);
-
-#include <xen/guest_access.h>
-#include <asm/traps.h>
-
-struct mc_info *x86_mcinfo_getptr(void)
-{
- struct mc_info *mi;
- uint32_t entry, next;
-
- for (;;) {
- entry = mc_data.error_idx;
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
- break;
- }
-
- mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
- BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
-
- return mi;
-}
-
-static int x86_mcinfo_matches_guest(const struct mc_info *mi,
- const struct domain *d, const struct vcpu *v)
-{
- struct mcinfo_common *mic;
- struct mcinfo_global *mig;
-
- x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
- mig = (struct mcinfo_global *)mic;
- if (mig == NULL)
- return 0;
-
- if (d->domain_id != mig->mc_domid)
- return 0;
-
- if (v->vcpu_id != mig->mc_vcpuid)
- return 0;
-
- return 1;
-}
-
-
-#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
-
-static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
- const struct domain *d, const struct vcpu *v)
-{
- struct mc_info *mi;
-
- /* This function is called from the fetch hypercall with
- * the mc_lock spinlock held. Thus, no need for locking here.
- */
- mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
- if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
- /* Bogus domU command detected. */
- *fetch_idx = 0;
- return NULL;
- }
-
- *fetch_idx = mc_data.fetch_idx;
- mc_data.fetch_idx++;
- BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
-
- return mi;
-}
-
-
-static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
-{
- struct mc_machine_notify *mn;
- struct mcinfo_common *mic = NULL;
- struct mcinfo_global *mig;
- struct domain *d;
- int i;
-
- /* This function is called from the notifier hypercall with
- * the mc_notify_lock spinlock held. Thus, no need for locking here.
- */
-
- /* First invalidate entries for guests that disappeared after
- * notification (e.g. shutdown/crash). This step prevents the
- * notification array from filling up with stalling/leaking entries.
- */
- for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
- mn = &(mc_data.notify[(i % MAX_MCINFO)]);
- x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
- BUG_ON(mic == NULL);
- mig = (struct mcinfo_global *)mic;
- d = get_domain_by_id(mig->mc_domid);
- if (d == NULL) {
- /* Domain does not exist. */
- mn->valid = 0;
- }
- if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
- mc_data.notifyconsumer_idx++;
- }
-
- /* Now put in the error telemetry. Since all error data fetchable
- * by domUs are uncorrectable errors, they are very important.
- * So we dump them before overriding them. When a guest takes that long,
- * then we can assume something bad already happened (crash, hang, etc.)
- */
- mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
-
- if (mn->valid) {
- struct mcinfo_common *mic = NULL;
- struct mcinfo_global *mig;
-
- /* To not loose the information, we dump it. */
- x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
- BUG_ON(mic == NULL);
- mig = (struct mcinfo_global *)mic;
- printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
- "fetch machine check error telemetry. But Domain ID "
- "did not do that in time.\n",
- mig->mc_domid);
- x86_mcinfo_dump(&mn->mc);
- }
-
- memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
- sizeof(struct mc_info));
- mn->fetch_idx = mc_notifydomain->fetch_idx;
- mn->valid = 1;
-
- mc_data.notifyproducer_idx++;
-
- /* By design there can never be more notifies than machine check errors.
- * If that ever happens, then we hit a bug. */
- BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
- BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
-}
-
-static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
- const struct domain *d, const struct vcpu *v)
-{
- struct mc_machine_notify *mn = NULL;
- uint32_t i;
- int found;
-
- /* This function is called from the fetch hypercall with
- * the mc_notify_lock spinlock held. Thus, no need for locking here.
- */
-
- /* The notifier data is filled in the order guests get notified, but
- * guests may fetch them in a different order. That's why we need
- * the game with valid/invalid entries. */
- found = 0;
- for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
- mn = &(mc_data.notify[(i % MAX_MCINFO)]);
- if (!mn->valid) {
- if (i == mc_data.notifyconsumer_idx)
- mc_data.notifyconsumer_idx++;
- continue;
- }
- if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
- found = 1;
- break;
- }
- }
-
- if (!found) {
- /* This domain has never been notified. This must be
- * a bogus domU command. */
- *fetch_idx = 0;
- return NULL;
- }
-
- BUG_ON(mn == NULL);
- *fetch_idx = mn->fetch_idx;
- mn->valid = 0;
-
- BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
- return &mn->mc;
-}
-
-
-void x86_mcinfo_clear(struct mc_info *mi)
+static void mcinfo_clear(struct mc_info *mi)
{
memset(mi, 0, sizeof(struct mc_info));
x86_mcinfo_nentries(mi) = 0;
}
-
int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
{
int i;
@@ -380,7 +618,7 @@ int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
if (end1 < end2)
- return -ENOSPC; /* No space. Can't add entry. */
+ return x86_mcerr("mcinfo_add: no more sparc", -ENOSPC);
/* there's enough space. add entry. */
memcpy(mic_index, mic, mic->size);
@@ -389,7 +627,6 @@ int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
return 0;
}
-
/* Dump machine check information in a format,
* mcelog can parse. This is used only when
* Dom0 does not take the notification. */
@@ -404,7 +641,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
if (mic == NULL)
return;
mc_global = (struct mcinfo_global *)mic;
- if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+ if (mc_global->mc_flags & MC_FLAG_MCE) {
printk(XENLOG_WARNING
"CPU%d: Machine Check Exception: %16"PRIx64"\n",
mc_global->mc_coreid, mc_global->mc_gstatus);
@@ -424,7 +661,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
goto next;
mc_bank = (struct mcinfo_bank *)mic;
-
+
printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
mc_bank->mc_bank,
mc_bank->mc_status);
@@ -441,8 +678,6 @@ next:
} while (1);
}
-
-
static void do_mc_get_cpu_info(void *v)
{
int cpu = smp_processor_id();
@@ -533,183 +768,141 @@ void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
}
}
+#if BITS_PER_LONG == 64
+
+#define ID2COOKIE(id) ((mctelem_cookie_t)(id))
+#define COOKIE2ID(c) ((uint64_t)(c))
+
+#elif BITS_PER_LONG == 32
+
+#define ID2COOKIE(id) ((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
+#define COOKIE2ID(c) ((uint64_t)(uint32_t)(c))
+
+#elif defined(BITS_PER_LONG)
+#error BITS_PER_LONG has unexpected value
+#else
+#error BITS_PER_LONG definition absent
+#endif
+
/* Machine Check Architecture Hypercall */
long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
{
long ret = 0;
struct xen_mc curop, *op = &curop;
struct vcpu *v = current;
- struct domain *domU;
struct xen_mc_fetch *mc_fetch;
- struct xen_mc_notifydomain *mc_notifydomain;
struct xen_mc_physcpuinfo *mc_physcpuinfo;
- struct mc_info *mi;
- uint32_t flags;
- uint32_t fetch_idx;
- uint16_t vcpuid;
- /* Use a different lock for the notify hypercall in order to allow
- * a DomU to fetch mc data while Dom0 notifies another DomU. */
- static DEFINE_SPINLOCK(mc_lock);
- static DEFINE_SPINLOCK(mc_notify_lock);
+ uint32_t flags, cmdflags;
int nlcpu;
xen_mc_logical_cpu_t *log_cpus = NULL;
+ mctelem_cookie_t mctc;
+ mctelem_class_t which;
if ( copy_from_guest(op, u_xen_mc, 1) )
- return -EFAULT;
+ return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
- return -EACCES;
+ return x86_mcerr("do_mca: interface version mismatch", -EACCES);
- switch ( op->cmd ) {
+ switch (op->cmd) {
case XEN_MC_fetch:
- /* This hypercall is for any domain */
mc_fetch = &op->u.mc_fetch;
+ cmdflags = mc_fetch->flags;
+
+ /* This hypercall is for Dom0 only */
+ if (!IS_PRIV(v->domain) )
+ return x86_mcerr(NULL, -EPERM);
- switch (mc_fetch->flags) {
- case XEN_MC_CORRECTABLE:
- /* But polling mode is Dom0 only, because
- * correctable errors are reported to Dom0 only */
- if ( !IS_PRIV(v->domain) )
- return -EPERM;
+ switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
+ case XEN_MC_NONURGENT:
+ which = MC_NONURGENT;
break;
- case XEN_MC_TRAP:
+ case XEN_MC_URGENT:
+ which = MC_URGENT;
break;
+
default:
- return -EFAULT;
+ return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
}
flags = XEN_MC_OK;
- spin_lock(&mc_lock);
- if ( IS_PRIV(v->domain) ) {
- /* this must be Dom0. So a notify hypercall
- * can't have happened before. */
- mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+ if (cmdflags & XEN_MC_ACK) {
+ mctelem_cookie_t cookie = ID2COOKIE(mc_fetch->fetch_id);
+ mctelem_ack(which, cookie);
} else {
- /* Hypercall comes from an unprivileged domain */
- domU = v->domain;
- if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
- /* Dom0 must have notified this DomU before
- * via the notify hypercall. */
- mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
+ if (guest_handle_is_null(mc_fetch->data))
+ return x86_mcerr("do_mca fetch: guest buffer "
+ "invalid", -EINVAL);
+
+ if ((mctc = mctelem_consume_oldest_begin(which))) {
+ struct mc_info *mcip = mctelem_dataptr(mctc);
+ if (copy_to_guest(mc_fetch->data, mcip, 1)) {
+ ret = -EFAULT;
+ flags |= XEN_MC_FETCHFAILED;
+ mc_fetch->fetch_id = 0;
+ } else {
+ mc_fetch->fetch_id = COOKIE2ID(mctc);
+ }
+ mctelem_consume_oldest_end(mctc);
} else {
- /* Xen notified the DomU. */
- mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
+ /* There is no data */
+ flags |= XEN_MC_NODATA;
+ mc_fetch->fetch_id = 0;
}
- }
- if (mi) {
- memcpy(&mc_fetch->mc_info, mi,
- sizeof(struct mc_info));
- } else {
- /* There is no data for a bogus DomU command. */
- flags |= XEN_MC_NODATA;
- memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
+ mc_fetch->flags = flags;
+ if (copy_to_guest(u_xen_mc, op, 1) != 0)
+ ret = -EFAULT;
}
- mc_fetch->flags = flags;
- mc_fetch->fetch_idx = fetch_idx;
-
- if ( copy_to_guest(u_xen_mc, op, 1) )
- ret = -EFAULT;
-
- spin_unlock(&mc_lock);
break;
case XEN_MC_notifydomain:
- /* This hypercall is for Dom0 only */
- if ( !IS_PRIV(v->domain) )
- return -EPERM;
-
- spin_lock(&mc_notify_lock);
-
- mc_notifydomain = &op->u.mc_notifydomain;
- domU = get_domain_by_id(mc_notifydomain->mc_domid);
- vcpuid = mc_notifydomain->mc_vcpuid;
-
- if ((domU == NULL) || (domU == dom0)) {
- /* It's not possible to notify a non-existent domain
- * or the dom0. */
- spin_unlock(&mc_notify_lock);
- return -EACCES;
- }
+ return x86_mcerr("do_mca notify unsupported", -EINVAL);
- if (vcpuid >= MAX_VIRT_CPUS) {
- /* It's not possible to notify a vcpu, Xen can't
- * assign to a domain. */
- spin_unlock(&mc_notify_lock);
- return -EACCES;
+ case XEN_MC_physcpuinfo:
+ if ( !IS_PRIV(v->domain) )
+ return x86_mcerr("do_mca cpuinfo", -EPERM);
+
+ mc_physcpuinfo = &op->u.mc_physcpuinfo;
+ nlcpu = num_online_cpus();
+
+ if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+ if (mc_physcpuinfo->ncpus <= 0)
+ return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
+ -EINVAL);
+ nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
+ log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
+ if (log_cpus == NULL)
+ return x86_mcerr("do_mca cpuinfo", -ENOMEM);
+
+ if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
+ 1, 1) != 0) {
+ xfree(log_cpus);
+ return x86_mcerr("do_mca cpuinfo", -EIO);
+ }
}
- mc_notifydomain->flags = XEN_MC_OK;
-
- mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
- if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
- /* The error telemetry is not for the guest, Dom0
- * wants to notify. */
- mc_notifydomain->flags |= XEN_MC_NOMATCH;
- } else if ( guest_has_trap_callback(domU, vcpuid,
- TRAP_machine_check) )
- {
- /* Send notification */
- if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
- mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
- } else
- mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
-
-#ifdef DEBUG
- /* sanity check - these two flags are mutually exclusive */
- if ((flags & XEN_MC_CANNOTHANDLE) && (flags & XEN_MC_NOTDELIVERED))
- BUG();
-#endif
+ mc_physcpuinfo->ncpus = nlcpu;
- if ( copy_to_guest(u_xen_mc, op, 1) )
- ret = -EFAULT;
-
- if (ret == 0) {
- x86_mcinfo_marknotified(mc_notifydomain);
+ if (copy_to_guest(u_xen_mc, op, 1)) {
+ if (log_cpus != NULL)
+ xfree(log_cpus);
+ return x86_mcerr("do_mca cpuinfo", -EFAULT);
}
- spin_unlock(&mc_notify_lock);
+ if (!guest_handle_is_null(mc_physcpuinfo->info)) {
+ if (copy_to_guest(mc_physcpuinfo->info,
+ log_cpus, nlcpu))
+ ret = -EFAULT;
+ xfree(log_cpus);
+ }
break;
- case XEN_MC_physcpuinfo:
- if ( !IS_PRIV(v->domain) )
- return -EPERM;
-
- mc_physcpuinfo = &op->u.mc_physcpuinfo;
- nlcpu = num_online_cpus();
-
- if (!guest_handle_is_null(mc_physcpuinfo->info)) {
- if (mc_physcpuinfo->ncpus <= 0)
- return -EINVAL;
- nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
- log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
- if (log_cpus == NULL)
- return -ENOMEM;
-
- if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
- 1, 1) != 0) {
- xfree(log_cpus);
- return -EIO;
- }
- }
-
- mc_physcpuinfo->ncpus = nlcpu;
-
- if (copy_to_guest(u_xen_mc, op, 1)) {
- if (log_cpus != NULL)
- xfree(log_cpus);
- return -EFAULT;
- }
-
- if (!guest_handle_is_null(mc_physcpuinfo->info)) {
- if (copy_to_guest(mc_physcpuinfo->info,
- log_cpus, nlcpu))
- ret = -EFAULT;
- xfree(log_cpus);
- }
+ default:
+ return x86_mcerr("do_mca: bad command", -EINVAL);
}
return ret;
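
The reworked XEN_MC_fetch above is dom0-only and cookie-based: dom0 fetches the
oldest record of a class (urgent or non-urgent) and later acknowledges it by
fetch_id so the hypervisor can recycle the slot. A rough sketch of the consumer
loop this implies follows; do_mca_hypercall() and fetch_buf are stand-ins (how
dom0 actually issues the MCA hypercall is not part of this patch), while the
commands, flags and fields are those defined here.

static struct mc_info fetch_buf;	/* illustrative buffer for one record */

static void drain_nonurgent_telemetry(void)
{
	struct xen_mc mc;

	for (;;) {
		memset(&mc, 0, sizeof(mc));
		mc.cmd = XEN_MC_fetch;
		mc.interface_version = XEN_MCA_INTERFACE_VERSION;
		mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
		set_xen_guest_handle(mc.u.mc_fetch.data, &fetch_buf);

		if (do_mca_hypercall(&mc) != 0 ||
		    (mc.u.mc_fetch.flags & (XEN_MC_NODATA | XEN_MC_FETCHFAILED)))
			break;		/* queue drained, or fetch failed */

		/* ... decode fetch_buf (a struct mc_info) here ... */

		/* Acknowledge the record so the hypervisor can reuse its slot;
		 * fetch_id was filled in by the successful fetch above. */
		mc.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
		do_mca_hypercall(&mc);
	}
}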
diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h
index f2e29897bd..f360268821 100644
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -1,38 +1,98 @@
+#ifndef _MCE_H
+
+#define _MCE_H
+
#include <xen/init.h>
+#include <xen/smp.h>
#include <asm/types.h>
#include <asm/traps.h>
#include <asm/atomic.h>
#include <asm/percpu.h>
+#include "x86_mca.h"
+#include "mctelem.h"
/* Init functions */
-void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
-void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
-void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+int amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+int amd_f10_mcheck_init(struct cpuinfo_x86 *c);
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+int winchip_mcheck_init(struct cpuinfo_x86 *c);
+int intel_mcheck_init(struct cpuinfo_x86 *c);
void intel_mcheck_timer(struct cpuinfo_x86 *c);
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
-void intel_mcheck_init(struct cpuinfo_x86 *c);
void mce_intel_feature_init(struct cpuinfo_x86 *c);
-
-void winchip_mcheck_init(struct cpuinfo_x86 *c);
-
-/* Function pointer used in the handlers to collect additional information
- * provided by newer CPU families/models without the need to duplicate
- * the whole handler resulting in various handlers each with its own
- * tweaks and bugs */
-extern int (*mc_callback_bank_extended)(struct mc_info *mi,
- uint16_t bank, uint64_t status);
-
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
int mce_available(struct cpuinfo_x86 *c);
+int mce_firstbank(struct cpuinfo_x86 *c);
/* Helper functions used for collecting error telemetry */
struct mc_info *x86_mcinfo_getptr(void);
-void x86_mcinfo_clear(struct mc_info *mi);
-int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
-void x86_mcinfo_dump(struct mc_info *mi);
void mc_panic(char *s);
void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *,
uint32_t *, uint32_t *, uint32_t *, uint32_t *);
+
+
+/* Register a handler for machine check exceptions. */
+typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long);
+extern void x86_mce_vector_register(x86_mce_vector_t);
+
+/* Common generic MCE handler that implementations may nominate
+ * via x86_mce_vector_register. */
+extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
+
+/* Utility function to "logout" all architectural MCA telemetry from the MCA
+ * banks of the current processor. A cookie is returned which may be
+ * uses to reference the data so logged (the cookie can be NULL if
+ * no logout structures were available). The caller can also pass a pointer
+ * to a structure which will be completed with some summary information
+ * of the MCA data observed in the logout operation. */
+
+enum mca_source {
+ MCA_MCE_HANDLER,
+ MCA_POLLER,
+ MCA_CMCI_HANDLER,
+ MCA_RESET
+};
+
+enum mca_extinfo {
+ MCA_EXTINFO_LOCAL,
+ MCA_EXTINFO_GLOBAL,
+ MCA_EXTINFO_IGNORED
+};
+
+struct mca_summary {
+ uint32_t errcnt; /* number of banks with valid errors */
+ int ripv; /* meaningful on #MC */
+ int eipv; /* meaningful on #MC */
+ uint32_t uc; /* bitmask of banks with UC */
+ uint32_t pcc; /* bitmask of banks with PCC */
+};
+
+extern cpu_banks_t mca_allbanks;
+
+extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
+ struct mca_summary *);
+
+/* Register a callback to be made during bank telemetry logout.
+ * This callback is only available to those machine check handlers
+ * that call to the common mcheck_cmn_handler or who use the common
+ * telemetry logout function mcheck_mca_logout in error polling.
+ *
+ * This can be used to collect additional information (typically non-
+ * architectural) provided by newer CPU families/models without the need
+ * to duplicate the whole handler resulting in various handlers each with
+ * its own tweaks and bugs. The callback receives an struct mc_info pointer
+ * which it can use with x86_mcinfo_add to add additional telemetry,
+ * the current MCA bank number we are reading telemetry from, and the
+ * MCi_STATUS value for that bank.
+ */
+typedef enum mca_extinfo (*x86_mce_callback_t)
+ (struct mc_info *, uint16_t, uint64_t);
+extern void x86_mce_callback_register(x86_mce_callback_t);
+
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
+void x86_mcinfo_dump(struct mc_info *mi);
+
+#endif /* _MCE_H */
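
The declarations above define how family-specific code plugs into the common
machinery: register a #MC vector (typically delegating to mcheck_cmn_handler)
and optionally an extended-info callback invoked during bank logout. A minimal
sketch, modelled on the amd_f10.c changes earlier in this patch, follows; the
example_* names are illustrative only.

static void example_machine_check(struct cpu_user_regs *regs, long error_code)
{
	/* Delegate to the shared #MC handler over all banks. */
	mcheck_cmn_handler(regs, error_code, mca_allbanks);
}

static enum mca_extinfo
example_extinfo(struct mc_info *mi, uint16_t bank, uint64_t status)
{
	/* Add model-specific telemetry with x86_mcinfo_add(mi, ...) here and
	 * return whether it was bank-local or global. */
	return MCA_EXTINFO_IGNORED;
}

int example_mcheck_init(struct cpuinfo_x86 *c)
{
	if (!cpu_has(c, X86_FEATURE_MCA))
		return 0;	/* caller reports "no machine check initialization" */

	x86_mce_vector_register(example_machine_check);
	x86_mce_callback_register(example_extinfo);

	/* ... per-bank MSR setup, set_in_cr4(X86_CR4_MCE), etc. ... */
	return 1;
}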
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
index dabac9ac5c..e1c41cd01d 100644
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -14,6 +14,7 @@ DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
static int nr_intel_ext_msrs = 0;
static int cmci_support = 0;
+static int firstbank;
#ifdef CONFIG_X86_MCE_THERMAL
static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
@@ -115,222 +116,51 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_MCE_THERMAL */
-static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext)
+static enum mca_extinfo
+intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
{
- if (nr_intel_ext_msrs == 0)
- return;
-
- /* this function will called when CAP(9).MCG_EXT_P = 1 */
- memset(mc_ext, 0, sizeof(struct mcinfo_extended));
- mc_ext->common.type = MC_TYPE_EXTENDED;
- mc_ext->common.size = sizeof(mc_ext);
- mc_ext->mc_msrs = 10;
-
- mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX;
- rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value);
- mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX;
- rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value);
- mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX;
- rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value);
-
- mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX;
- rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value);
- mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI;
- rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value);
- mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI;
- rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value);
-
- mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP;
- rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value);
- mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP;
- rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value);
- mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
- rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value);
- mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP;
- rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value);
-}
-
-/* machine_check_poll might be called by following types:
- * 1. called when do mcheck_init.
- * 2. called in cmci interrupt handler
- * 3. called in polling handler
- * It will generate a new mc_info item if found CE/UC errors. DOM0 is the
- * consumer.
- */
-static struct mc_info *machine_check_poll(int calltype)
-{
- struct mc_info *mi = NULL;
- int exceptions = (read_cr4() & X86_CR4_MCE);
- int i, nr_unit = 0, uc = 0, pcc = 0;
- uint64_t status, addr;
- struct mcinfo_global mcg;
- struct mcinfo_extended mce;
- unsigned int cpu;
- struct domain *d;
-
- cpu = smp_processor_id();
-
- memset(&mcg, 0, sizeof(mcg));
- mcg.common.type = MC_TYPE_GLOBAL;
- mcg.common.size = sizeof(mcg);
- /* If called from cpu-reset check, don't need to fill them.
- * If called from cmci context, we'll try to fill domid by memory addr
- */
- mcg.mc_domid = -1;
- mcg.mc_vcpuid = -1;
- if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET)
- mcg.mc_flags = MC_FLAG_POLLED;
- else if (calltype == MC_FLAG_CMCI)
- mcg.mc_flags = MC_FLAG_CMCI;
- x86_mc_get_cpu_info(
- cpu, &mcg.mc_socketid, &mcg.mc_coreid,
- &mcg.mc_core_threadid, &mcg.mc_apicid, NULL, NULL, NULL);
- rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus);
-
- for ( i = 0; i < nr_mce_banks; i++ ) {
- struct mcinfo_bank mcb;
- /* For CMCI, only owners checks the owned MSRs */
- if ( !test_bit(i, __get_cpu_var(mce_banks_owned)) &&
- (calltype & MC_FLAG_CMCI) )
- continue;
- rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+ struct mcinfo_extended mc_ext;
- if (! (status & MCi_STATUS_VAL) )
- continue;
- /*
- * Uncorrected events are handled by the exception
- * handler when it is enabled. But when the exception
- * is disabled such as when mcheck_init, log everything.
- */
- if ((status & MCi_STATUS_UC) && exceptions)
- continue;
+ if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
+ return MCA_EXTINFO_IGNORED;
- if (status & MCi_STATUS_UC)
- uc = 1;
- if (status & MCi_STATUS_PCC)
- pcc = 1;
-
- if (!mi) {
- mi = x86_mcinfo_getptr();
- if (!mi) {
- printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n");
- return NULL;
- }
- x86_mcinfo_clear(mi);
- }
- memset(&mcb, 0, sizeof(mcb));
- mcb.common.type = MC_TYPE_BANK;
- mcb.common.size = sizeof(mcb);
- mcb.mc_bank = i;
- mcb.mc_status = status;
- if (status & MCi_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc);
- if (status & MCi_STATUS_ADDRV) {
- rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
- d = maddr_get_owner(addr);
- if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) )
- mcb.mc_domid = d->domain_id;
- }
- if (cmci_support)
- rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
- if (calltype == MC_FLAG_CMCI)
- rdtscll(mcb.mc_tsc);
- x86_mcinfo_add(mi, &mcb);
- nr_unit++;
- add_taint(TAINT_MACHINE_CHECK);
- /* Clear state for this bank */
- wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0);
- printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%"PRIx64"]\n",
- i, cpu, status);
- printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], "
- "thread[%d]\n", cpu, mcg.mc_socketid,
- mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid);
-
- }
- /* if pcc = 1, uc must be 1 */
- if (pcc)
- mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
- else if (uc)
- mcg.mc_flags |= MC_FLAG_RECOVERABLE;
- else /* correctable */
- mcg.mc_flags |= MC_FLAG_CORRECTABLE;
-
- if (nr_unit && nr_intel_ext_msrs &&
- (mcg.mc_gstatus & MCG_STATUS_EIPV)) {
- intel_get_extended_msrs(&mce);
- x86_mcinfo_add(mi, &mce);
- }
- if (nr_unit)
- x86_mcinfo_add(mi, &mcg);
- /* Clear global state */
- return mi;
+	/* this function is called only when CAP(9).MCG_EXT_P = 1 */
+ memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
+ mc_ext.common.type = MC_TYPE_EXTENDED;
+ mc_ext.common.size = sizeof(mc_ext);
+ mc_ext.mc_msrs = 10;
+
+ mc_ext.mc_msr[0].reg = MSR_IA32_MCG_EAX;
+ rdmsrl(MSR_IA32_MCG_EAX, mc_ext.mc_msr[0].value);
+ mc_ext.mc_msr[1].reg = MSR_IA32_MCG_EBX;
+ rdmsrl(MSR_IA32_MCG_EBX, mc_ext.mc_msr[1].value);
+ mc_ext.mc_msr[2].reg = MSR_IA32_MCG_ECX;
+ rdmsrl(MSR_IA32_MCG_ECX, mc_ext.mc_msr[2].value);
+
+ mc_ext.mc_msr[3].reg = MSR_IA32_MCG_EDX;
+ rdmsrl(MSR_IA32_MCG_EDX, mc_ext.mc_msr[3].value);
+ mc_ext.mc_msr[4].reg = MSR_IA32_MCG_ESI;
+ rdmsrl(MSR_IA32_MCG_ESI, mc_ext.mc_msr[4].value);
+ mc_ext.mc_msr[5].reg = MSR_IA32_MCG_EDI;
+ rdmsrl(MSR_IA32_MCG_EDI, mc_ext.mc_msr[5].value);
+
+ mc_ext.mc_msr[6].reg = MSR_IA32_MCG_EBP;
+ rdmsrl(MSR_IA32_MCG_EBP, mc_ext.mc_msr[6].value);
+ mc_ext.mc_msr[7].reg = MSR_IA32_MCG_ESP;
+ rdmsrl(MSR_IA32_MCG_ESP, mc_ext.mc_msr[7].value);
+ mc_ext.mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
+ rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext.mc_msr[8].value);
+ mc_ext.mc_msr[9].reg = MSR_IA32_MCG_EIP;
+ rdmsrl(MSR_IA32_MCG_EIP, mc_ext.mc_msr[9].value);
+
+ x86_mcinfo_add(mci, &mc_ext);
+
+ return MCA_EXTINFO_GLOBAL;
}
-static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
+static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
{
- /* MACHINE CHECK Error handler will be sent in another patch,
- * simply copy old solutions here. This code will be replaced
- * by upcoming machine check patches
- */
-
- int recover=1;
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int i;
-
- rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover=0;
-
- printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- for (i=0; i<nr_mce_banks; i++) {
- rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
- if (high & (1<<31)) {
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
- high &= ~(1<<31);
- if (high & (1<<27)) {
- rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- printk ("[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- printk (" at %08x%08x", ahigh, alow);
- }
- printk ("\n");
- }
- }
-
- if (recover & 2)
- mc_panic ("CPU context corrupt");
- if (recover & 1)
- mc_panic ("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
- * recoverable/continuable.This will allow BIOS to look at the MSRs
- * for errors if the OS could not log the error.
- */
- for (i=0; i<nr_mce_banks; i++) {
- u32 msr;
- msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr (msr, low, high);
- if (high&(1<<31)) {
- /* Clear it */
- wrmsr(msr, 0UL, 0UL);
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
- mcgstl &= ~(1<<2);
- wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+ mcheck_cmn_handler(regs, error_code, mca_allbanks);
}
static DEFINE_SPINLOCK(cmci_discover_lock);
@@ -369,6 +199,8 @@ static void cmci_discover(void)
unsigned long flags;
int i;
struct mc_info *mi = NULL;
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
@@ -385,12 +217,20 @@ static void cmci_discover(void)
* MCi_status (error_count bit 38~52) is not cleared,
* the CMCI interrupt will never be triggered again.
*/
- mi = machine_check_poll(MC_FLAG_CMCI);
- if (mi) {
- x86_mcinfo_dump(mi);
- if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+
+ mctc = mcheck_mca_logout(
+ MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+ if (bs.errcnt && mctc != NULL) {
+ if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ mctelem_commit(mctc);
send_guest_global_virq(dom0, VIRQ_MCA);
- }
+ } else {
+			x86_mcinfo_dump(mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ }
+ } else if (mctc != NULL)
+ mctelem_dismiss(mctc);
printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
smp_processor_id(),
@@ -487,17 +327,26 @@ static void intel_init_cmci(struct cpuinfo_x86 *c)
fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
{
struct mc_info *mi = NULL;
- int cpu = smp_processor_id();
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
ack_APIC_irq();
irq_enter();
- printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu);
- mi = machine_check_poll(MC_FLAG_CMCI);
- if (mi) {
- x86_mcinfo_dump(mi);
- if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
+
+ mctc = mcheck_mca_logout(
+ MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+
+ if (bs.errcnt && mctc != NULL) {
+ if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ mctelem_commit(mctc);
send_guest_global_virq(dom0, VIRQ_MCA);
- }
+ } else {
+			x86_mcinfo_dump(mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ }
+ } else if (mctc != NULL)
+ mctelem_dismiss(mctc);
+
irq_exit();
}
@@ -527,28 +376,28 @@ static void mce_cap_init(struct cpuinfo_x86 *c)
printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
smp_processor_id(), nr_intel_ext_msrs);
}
- /* for most of p6 family, bank 0 is an alias bios MSR.
- * But after model>1a, bank 0 is available*/
- if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL
- && c->x86_model < 0x1A)
- firstbank = 1;
- else
- firstbank = 0;
+ firstbank = mce_firstbank(c);
}
static void mce_init(void)
{
u32 l, h;
int i;
- struct mc_info *mi;
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
+
clear_in_cr4(X86_CR4_MCE);
+
/* log the machine checks left over from the previous reset.
* This also clears all registers*/
- mi = machine_check_poll(MC_FLAG_RESET);
+ mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs);
+
/* in the boot up stage, don't inject to DOM0, but print out */
- if (mi)
- x86_mcinfo_dump(mi);
+ if (bs.errcnt && mctc != NULL) {
+ x86_mcinfo_dump(mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ }
set_in_cr4(X86_CR4_MCE);
rdmsr (MSR_IA32_MCG_CAP, l, h);
@@ -573,71 +422,19 @@ static void mce_init(void)
}
/* p4/p6 family have similar MCA initialization process */
-void intel_mcheck_init(struct cpuinfo_x86 *c)
+int intel_mcheck_init(struct cpuinfo_x86 *c)
{
mce_cap_init(c);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
+
/* machine check is available */
- machine_check_vector = intel_machine_check;
+ x86_mce_vector_register(intel_machine_check);
+ x86_mce_callback_register(intel_get_extended_msrs);
+
mce_init();
mce_intel_feature_init(c);
mce_set_owner();
-}
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll faster. When the poller finds no more
- * errors, poll slower
-*/
-static struct timer mce_timer;
-
-#define MCE_PERIOD 4000
-#define MCE_MIN 2000
-#define MCE_MAX 32000
-
-static u64 period = MCE_PERIOD;
-static int adjust = 0;
-
-static void mce_intel_checkregs(void *info)
-{
- struct mc_info *mi;
-
- if( !mce_available(&current_cpu_data))
- return;
- mi = machine_check_poll(MC_FLAG_POLLED);
- if (mi)
- {
- x86_mcinfo_dump(mi);
- adjust++;
- if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
- send_guest_global_virq(dom0, VIRQ_MCA);
- }
-}
-static void mce_intel_work_fn(void *data)
-{
- on_each_cpu(mce_intel_checkregs, data, 1, 1);
- if (adjust) {
- period = period / (adjust + 1);
- printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval "
- "to %"PRIu64"\n", period);
- }
- else {
- period *= 2;
- }
- if (period > MCE_MAX)
- period = MCE_MAX;
- if (period < MCE_MIN)
- period = MCE_MIN;
- set_timer(&mce_timer, NOW() + MILLISECS(period));
- adjust = 0;
-}
-
-void intel_mcheck_timer(struct cpuinfo_x86 *c)
-{
- printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n");
- init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
- set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD));
+ return 1;
}
-
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c
new file mode 100644
index 0000000000..4111ddcbb7
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+/*
+ * mctelem.c - x86 Machine Check Telemetry Transport
+ */
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+#include <xen/errno.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/cpumask.h>
+#include <xen/event.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+
+struct mctelem_ent {
+ struct mctelem_ent *mcte_next; /* next in chronological order */
+ struct mctelem_ent *mcte_prev; /* previous in chronological order */
+ uint32_t mcte_flags; /* See MCTE_F_* below */
+ uint32_t mcte_refcnt; /* Reference count */
+ void *mcte_data; /* corresponding data payload */
+};
+
+#define MCTE_F_HOME_URGENT 0x0001U /* free to urgent freelist */
+#define MCTE_F_HOME_NONURGENT 0x0002U /* free to nonurgent freelist */
+#define MCTE_F_CLASS_URGENT 0x0004U /* in use - urgent errors */
+#define MCTE_F_CLASS_NONURGENT 0x0008U /* in use - nonurgent errors */
+#define MCTE_F_STATE_FREE 0x0010U /* on a freelist */
+#define MCTE_F_STATE_UNCOMMITTED 0x0020U /* reserved; on no list */
+#define MCTE_F_STATE_COMMITTED 0x0040U /* on a committed list */
+#define MCTE_F_STATE_PROCESSING 0x0080U /* on a processing list */
+
+#define MCTE_F_MASK_HOME (MCTE_F_HOME_URGENT | MCTE_F_HOME_NONURGENT)
+#define MCTE_F_MASK_CLASS (MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
+#define MCTE_F_MASK_STATE (MCTE_F_STATE_FREE | \
+ MCTE_F_STATE_UNCOMMITTED | \
+ MCTE_F_STATE_COMMITTED | \
+ MCTE_F_STATE_PROCESSING)
+
+#define MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME)
+
+#define MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
+#define MCTE_SET_CLASS(tep, new) do { \
+ (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
+ (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)
+
+#define MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
+#define MCTE_TRANSITION_STATE(tep, old, new) do { \
+ BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
+ (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
+ (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
+
+#define MC_URGENT_NENT 10
+#define MC_NONURGENT_NENT 20
+
+#define MC_NCLASSES (MC_NONURGENT + 1)
+
+#define COOKIE2MCTE(c) ((struct mctelem_ent *)(c))
+#define MCTE2COOKIE(tep) ((mctelem_cookie_t)(tep))
+
+static struct mc_telem_ctl {
+ /* Linked lists that thread the array members together.
+ *
+ * The free lists are singly-linked via mcte_next, and we allocate
+ * from them by atomically unlinking an element from the head.
+ * Consumed entries are returned to the head of the free list.
+ * When an entry is reserved off the free list it is not linked
+ * on any list until it is committed or dismissed.
+ *
+ * The committed list grows at the head and we do not maintain a
+ * tail pointer; insertions are performed atomically. The head
+ * thus has the most-recently committed telemetry, i.e. the
+ * list is in reverse chronological order. The committed list
+ * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
+ * When we move telemetry from the committed list to the processing
+ * list we atomically unlink the committed list and keep a pointer
+ * to the head of that list; we then traverse the list following
+ * mcte_prev and fill in mcte_next to doubly-link the list, and then
+ * append the tail of the list onto the processing list. If we panic
+ * during this manipulation of the committed list we still have
+ * the pointer to its head so we can recover all entries during
+ * the panic flow (albeit in reverse chronological order).
+ *
+ * The processing list is updated in a controlled context, and
+ * we can lock it for updates. The head of the processing list
+ * always has the oldest telemetry, and we append (as above)
+ * at the tail of the processing list. */
+ struct mctelem_ent *mctc_free[MC_NCLASSES];
+ struct mctelem_ent *mctc_committed[MC_NCLASSES];
+ struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
+ struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
+ /*
+ * Telemetry array
+ */
+ struct mctelem_ent *mctc_elems;
+} mctctl;
+
+/* Lock protecting all processing lists */
+static DEFINE_SPINLOCK(processing_lock);
+
+static void *cmpxchgptr(void *ptr, void *old, void *new)
+{
+ unsigned long *ulp = (unsigned long *)ptr;
+ unsigned long a = (unsigned long)old;
+ unsigned long b = (unsigned long)new;
+
+ return (void *)cmpxchg(ulp, a, b);
+}
+
+/* Free an entry to its native free list; the entry must not be linked on
+ * any list.
+ */
+static void mctelem_free(struct mctelem_ent *tep)
+{
+ mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+ struct mctelem_ent **freelp;
+ struct mctelem_ent *oldhead;
+
+ BUG_ON(tep->mcte_refcnt != 0);
+ BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
+
+ tep->mcte_prev = NULL;
+ freelp = &mctctl.mctc_free[target];
+ for (;;) {
+ oldhead = *freelp;
+ tep->mcte_next = oldhead;
+ wmb();
+ if (cmpxchgptr(freelp, oldhead, tep) == oldhead)
+ break;
+ }
+}
+
+/* Increment the reference count of an entry that is not linked on to
+ * any list and which only the caller has a pointer to.
+ */
+static void mctelem_hold(struct mctelem_ent *tep)
+{
+ tep->mcte_refcnt++;
+}
+
+/* Increment the reference count on an entry that is linked at the head of
+ * a processing list. The caller is responsible for locking the list.
+ */
+static void mctelem_processing_hold(struct mctelem_ent *tep)
+{
+ int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+
+ BUG_ON(tep != mctctl.mctc_processing_head[which]);
+ tep->mcte_refcnt++;
+}
+
+/* Decrement the reference count on an entry that is linked at the head of
+ * a processing list. The caller is responsible for locking the list.
+ */
+static void mctelem_processing_release(struct mctelem_ent *tep)
+{
+ int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+
+ BUG_ON(tep != mctctl.mctc_processing_head[which]);
+ if (--tep->mcte_refcnt == 0) {
+ MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
+ mctctl.mctc_processing_head[which] = tep->mcte_next;
+ mctelem_free(tep);
+ }
+}
+
+void mctelem_init(int reqdatasz)
+{
+ static int called = 0;
+ static int datasz = 0, realdatasz = 0;
+ char *datarr;
+ int i;
+
+ BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);
+
+ /* Called from mcheck_init for all processors; initialize for the
+ * first call only (no race here since the boot cpu completes
+ * init before others start up). */
+ if (++called == 1) {
+ realdatasz = reqdatasz;
+ datasz = (reqdatasz & ~0xf) + 0x10; /* 16 byte roundup */
+ } else {
+ BUG_ON(reqdatasz != realdatasz);
+ return;
+ }
+
+ if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
+ MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL ||
+ (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT) *
+ datasz)) == NULL) {
+ if (mctctl.mctc_elems)
+ xfree(mctctl.mctc_elems);
+ printk("Allocations for MCA telemetry failed\n");
+ return;
+ }
+
+ for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
+ struct mctelem_ent *tep, **tepp;
+
+ tep = mctctl.mctc_elems + i;
+ tep->mcte_flags = MCTE_F_STATE_FREE;
+ tep->mcte_refcnt = 0;
+ tep->mcte_data = datarr + i * datasz;
+
+ if (i < MC_URGENT_NENT) {
+ tepp = &mctctl.mctc_free[MC_URGENT];
+ tep->mcte_flags |= MCTE_F_HOME_URGENT;
+ } else {
+ tepp = &mctctl.mctc_free[MC_NONURGENT];
+ tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
+ }
+
+ tep->mcte_next = *tepp;
+ tep->mcte_prev = NULL;
+ *tepp = tep;
+ }
+}
+
+/* incremented non-atomically when reserve fails */
+static int mctelem_drop_count;
+
+/* Reserve a telemetry entry, or return NULL if none available.
+ * If we return an entry then the caller must subsequently call exactly one of
+ * mctelem_unreserve or mctelem_commit for that entry.
+ */
+mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
+{
+ struct mctelem_ent **freelp;
+ struct mctelem_ent *oldhead, *newhead;
+ mctelem_class_t target = (which == MC_URGENT) ?
+ MC_URGENT : MC_NONURGENT;
+
+ freelp = &mctctl.mctc_free[target];
+ for (;;) {
+ if ((oldhead = *freelp) == NULL) {
+ if (which == MC_URGENT && target == MC_URGENT) {
+ /* raid the non-urgent freelist */
+ target = MC_NONURGENT;
+ freelp = &mctctl.mctc_free[target];
+ continue;
+ } else {
+ mctelem_drop_count++;
+ return (NULL);
+ }
+ }
+
+ newhead = oldhead->mcte_next;
+ if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
+ struct mctelem_ent *tep = oldhead;
+
+ mctelem_hold(tep);
+ MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
+ tep->mcte_next = NULL;
+ tep->mcte_prev = NULL;
+ if (which == MC_URGENT)
+ MCTE_SET_CLASS(tep, URGENT);
+ else
+ MCTE_SET_CLASS(tep, NONURGENT);
+ return MCTE2COOKIE(tep);
+ }
+ }
+}
+
+void *mctelem_dataptr(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ return tep->mcte_data;
+}
+
+/* Release a previously reserved entry back to the freelist without
+ * submitting it for logging. The entry must not be linked on to any
+ * list - that's how mctelem_reserve handed it out.
+ */
+void mctelem_dismiss(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ tep->mcte_refcnt--;
+ MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
+ mctelem_free(tep);
+}
+
+/* Commit an entry with completed telemetry for logging. The caller must
+ * not reference the entry after this call. Note that we add entries
+ * at the head of the committed list, so that list therefore has entries
+ * in reverse chronological order.
+ */
+void mctelem_commit(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+ struct mctelem_ent **commlp;
+ struct mctelem_ent *oldhead;
+ mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+
+ BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
+ MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);
+
+ commlp = &mctctl.mctc_committed[target];
+ for (;;) {
+ oldhead = *commlp;
+ tep->mcte_prev = oldhead;
+ wmb();
+ if (cmpxchgptr(commlp, oldhead, tep) == oldhead)
+ break;
+ }
+}
+
+/* Move telemetry from committed list to processing list, reversing the
+ * list into chronological order. The processing list has been
+ * locked by the caller, and may be non-empty. We append the
+ * reversed committed list on to the tail of the processing list.
+ * The committed list may grow even while we run, so use atomic
+ * operations to swap NULL to the committed list head.
+ *
+ * Note that "chronological order" means the order in which producers
+ * won additions to the processing list, which may not reflect the
+ * strict chronological order of the associated events if events are
+ * closely spaced in time and contend for the processing list at once.
+ */
+
+static struct mctelem_ent *dangling[MC_NCLASSES];
+
+static void mctelem_append_processing(mctelem_class_t which)
+{
+ mctelem_class_t target = which == MC_URGENT ?
+ MC_URGENT : MC_NONURGENT;
+ struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
+ struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
+ struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
+ struct mctelem_ent *tep, *ltep;
+
+ /* Check for an empty list; no race since we hold the processing lock */
+ if (*commlp == NULL)
+ return;
+
+ /* Atomically unlink the committed list, and keep a pointer to
+ * the list we unlink in a well-known location so it can be
+ * picked up in panic code should we panic between this unlink
+ * and the append to the processing list. */
+ for (;;) {
+ dangling[target] = *commlp;
+ wmb();
+ if (cmpxchgptr(commlp, dangling[target], NULL) ==
+ dangling[target])
+ break;
+ }
+
+ if (dangling[target] == NULL)
+ return;
+
+ /* Traverse the list following the previous pointers (reverse
+ * chronological order). For each entry fill in the next pointer
+ * and transition the element state. */
+ for (tep = dangling[target], ltep = NULL; tep != NULL;
+ tep = tep->mcte_prev) {
+ MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
+ tep->mcte_next = ltep;
+ ltep = tep;
+ }
+
+ /* ltep points to the head of a chronologically ordered linked
+ * list of telemetry entries ending at the most recent entry
+ * dangling[target] if mcte_next is followed; tack this on to
+ * the processing list.
+ */
+ if (*proclhp == NULL) {
+ *proclhp = ltep;
+ *procltp = dangling[target];
+ } else {
+ (*procltp)->mcte_next = ltep;
+ ltep->mcte_prev = *procltp;
+ *procltp = dangling[target];
+ }
+ wmb();
+ dangling[target] = NULL;
+ wmb();
+}
+
+mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
+{
+ mctelem_class_t target = (which == MC_URGENT) ?
+ MC_URGENT : MC_NONURGENT;
+ struct mctelem_ent *tep;
+
+ spin_lock(&processing_lock);
+ mctelem_append_processing(target);
+ if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
+ spin_unlock(&processing_lock);
+ return NULL;
+ }
+
+ mctelem_processing_hold(tep);
+ wmb();
+ spin_unlock(&processing_lock);
+ return MCTE2COOKIE(tep);
+}
+
+void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ spin_lock(&processing_lock);
+ mctelem_processing_release(tep);
+ wmb();
+ spin_unlock(&processing_lock);
+}
+
+void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
+{
+ mctelem_class_t target = (which == MC_URGENT) ?
+ MC_URGENT : MC_NONURGENT;
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ if (tep == NULL)
+ return;
+
+ spin_lock(&processing_lock);
+ if (tep == mctctl.mctc_processing_head[target])
+ mctelem_processing_release(tep);
+ wmb();
+ spin_unlock(&processing_lock);
+}
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.h b/xen/arch/x86/cpu/mcheck/mctelem.h
new file mode 100644
index 0000000000..e3270f606c
--- /dev/null
+++ b/xen/arch/x86/cpu/mcheck/mctelem.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#ifndef _MCTELEM_H
+
+#define _MCTELEM_H
+
+#include <xen/init.h>
+#include <xen/smp.h>
+#include <asm/traps.h>
+
+/* Helper functions used for collecting error telemetry.
+ *
+ * mctelem_init preallocates a number of data areas for use during
+ * machine check data "logout". Two classes are distinguished -
+ * urgent uses, intended for use from machine check exception handlers,
+ * and non-urgent uses intended for use from error pollers.
+ * Associated with each logout entry of whatever class is a data area
+ * sized per the single argument to mctelem_init.  mctelem_init should be
+ * called from MCA init code before anybody has the chance to change the
+ * machine check vector or to use mcheck_mca_logout.
+ *
+ * To reserve an entry of a given class for use in logout, call
+ * mctelem_reserve (or use the common handler functions which do all this
+ * for you). This returns an opaque cookie, or NULL if no elements are
+ * available. Elements are reserved with an atomic operation so no deadlock
+ * will occur if, for example, a machine check exception interrupts a
+ * scheduled error poll. The implementation will raid free non-urgent
+ * entries if all urgent entries are in use when an urgent request is received.
+ * Once an entry is reserved the caller must eventually perform exactly
+ * one of two actions: mctelem_commit or mctelem_dismiss.
+ *
+ * On mctelem_commit the entry is queued for later processing; mctelem_dismiss
+ * frees the element without processing. After either call the cookie
+ * must not be referenced again.
+ *
+ * To consume committed telemetry call mctelem_consume_oldest_begin
+ * which will return a cookie referencing the oldest (first committed)
+ * entry of the requested class. Access the associated data using
+ * mctelem_dataptr and when finished use mctelem_consume_oldest_end - in the
+ * begin .. end bracket you are guaranteed that the entry cannot be freed
+ * even if it is ack'd elsewhere.  Once the ultimate consumer of the
+ * telemetry has processed it to stable storage it should acknowledge
+ * the telemetry quoting the cookie id, at which point we will free
+ * the element from the processing list.
+ */
+
+typedef struct mctelem_cookie *mctelem_cookie_t;
+
+typedef enum mctelem_class {
+ MC_URGENT,
+ MC_NONURGENT
+} mctelem_class_t;
+
+extern void mctelem_init(int);
+extern mctelem_cookie_t mctelem_reserve(mctelem_class_t);
+extern void *mctelem_dataptr(mctelem_cookie_t);
+extern void mctelem_commit(mctelem_cookie_t);
+extern void mctelem_dismiss(mctelem_cookie_t);
+extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t);
+extern void mctelem_consume_oldest_end(mctelem_cookie_t);
+extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
+
+#endif
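
The reserve/commit and consume/ack protocol described above is most easily seen as a pair of short sketches (not part of the patch); found_errors is a hypothetical flag standing in for whatever test the caller applies, and error handling is elided.

/* Producer side, e.g. a poller running in a non-urgent context. */
mctelem_cookie_t mctc = mctelem_reserve(MC_NONURGENT);
if (mctc != NULL) {
	struct mc_info *mci = mctelem_dataptr(mctc);

	/* ... populate mci via x86_mcinfo_add() ... */

	if (found_errors)			/* hypothetical condition */
		mctelem_commit(mctc);		/* cookie must not be reused */
	else
		mctelem_dismiss(mctc);		/* return entry to its freelist */
}

/* Consumer side, e.g. the path that hands telemetry to dom0. */
mctelem_cookie_t cookie = mctelem_consume_oldest_begin(MC_NONURGENT);
if (cookie != NULL) {
	struct mc_info *data = mctelem_dataptr(cookie);

	/* ... copy *data out; the entry cannot be freed in this bracket ... */

	mctelem_consume_oldest_end(cookie);

	/* Later, once the consumer has the data in stable storage: */
	mctelem_ack(MC_NONURGENT, cookie);
}
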
diff --git a/xen/arch/x86/cpu/mcheck/non-fatal.c b/xen/arch/x86/cpu/mcheck/non-fatal.c
index 35982a461b..167b1cea2a 100644
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c
@@ -14,46 +14,76 @@
#include <xen/smp.h>
#include <xen/timer.h>
#include <xen/errno.h>
+#include <xen/event.h>
+#include <xen/sched.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
-#include "x86_mca.h"
-int firstbank = 0;
+
+static cpu_banks_t bankmask;
static struct timer mce_timer;
-#define MCE_PERIOD MILLISECS(15000)
+#define MCE_PERIOD MILLISECS(8000)
+#define MCE_PERIOD_MIN MILLISECS(2000)
+#define MCE_PERIOD_MAX MILLISECS(16000)
+
+static uint64_t period = MCE_PERIOD;
+static int adjust = 0;
+static int variable_period = 1;
static void mce_checkregs (void *info)
{
- u32 low, high;
- int i;
+ mctelem_cookie_t mctc;
+ struct mca_summary bs;
+ static uint64_t dumpcount = 0;
- for (i=firstbank; i<nr_mce_banks; i++) {
- rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
+ mctc = mcheck_mca_logout(MCA_POLLER, bankmask, &bs);
- if (high & (1<<31)) {
- printk(KERN_INFO "MCE: The hardware reports a non "
- "fatal, correctable incident occurred on "
- "CPU %d.\n",
- smp_processor_id());
- printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
+ if (bs.errcnt && mctc != NULL) {
+ adjust++;
- /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
- wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+ /* If Dom0 enabled the VIRQ_MCA event, then notify it.
+ * Otherwise, if dom0 has had plenty of time to register
+	 * the virq handler but still hasn't, then dump telemetry
+ * to the Xen console. The call count may be incremented
+ * on multiple cpus at once and is indicative only - just
+ * a simple-minded attempt to avoid spamming the console
+ * for corrected errors in early startup.
+ */
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
+ if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ mctelem_commit(mctc);
+ send_guest_global_virq(dom0, VIRQ_MCA);
+ } else if (++dumpcount >= 10) {
+ x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
+ mctelem_dismiss(mctc);
+ } else {
+ mctelem_dismiss(mctc);
}
+ } else if (mctc != NULL) {
+ mctelem_dismiss(mctc);
}
}
static void mce_work_fn(void *data)
{
on_each_cpu(mce_checkregs, NULL, 1, 1);
- set_timer(&mce_timer, NOW() + MCE_PERIOD);
+
+ if (variable_period) {
+ if (adjust)
+ period /= (adjust + 1);
+ else
+ period *= 2;
+ if (period > MCE_PERIOD_MAX)
+ period = MCE_PERIOD_MAX;
+ if (period < MCE_PERIOD_MIN)
+ period = MCE_PERIOD_MIN;
+ }
+
+ set_timer(&mce_timer, NOW() + period);
+ adjust = 0;
}
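
With these constants the poller starts at an 8000ms interval; a pass in which no CPU logs an error doubles it (capped at MCE_PERIOD_MAX, 16000ms), while a pass in which one CPU finds errors divides it by two (8000ms to 4000ms), two reporting CPUs divide it by three (about 2666ms), and so on, with MCE_PERIOD_MIN (2000ms) as the floor.
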
static int __init init_nonfatal_mce_checker(void)
@@ -63,13 +93,17 @@ static int __init init_nonfatal_mce_checker(void)
/* Check for MCE support */
if (!mce_available(c))
return -ENODEV;
+
+ memcpy(&bankmask, &mca_allbanks, sizeof (cpu_banks_t));
+ if (mce_firstbank(c) == 1)
+ clear_bit(0, bankmask);
+
/*
* Check for non-fatal errors every MCE_RATE s
*/
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
if (c->x86 == 6) { /* K7 */
- firstbank = 1;
init_timer(&mce_timer, mce_work_fn, NULL, 0);
set_timer(&mce_timer, NOW() + MCE_PERIOD);
break;
@@ -80,15 +114,14 @@ static int __init init_nonfatal_mce_checker(void)
break;
case X86_VENDOR_INTEL:
- /* p5 family is different. P4/P6 and latest CPUs shares the
- * same polling methods
- */
+ /*
+ * The P5 family is different. P4/P6 and latest CPUs share the
+ * same polling methods.
+ */
if ( c->x86 != 5 )
{
- /* some CPUs or banks don't support cmci, we need to
- * enable this feature anyway
- */
- intel_mcheck_timer(c);
+ init_timer(&mce_timer, mce_work_fn, NULL, 0);
+ set_timer(&mce_timer, NOW() + MCE_PERIOD);
}
break;
}
diff --git a/xen/arch/x86/cpu/mcheck/p5.c b/xen/arch/x86/cpu/mcheck/p5.c
index 97360bc7e7..4106cbcf53 100644
--- a/xen/arch/x86/cpu/mcheck/p5.c
+++ b/xen/arch/x86/cpu/mcheck/p5.c
@@ -16,7 +16,7 @@
#include "x86_mca.h"
/* Machine check handler for Pentium class Intel */
-static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
+static void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
@@ -28,19 +28,14 @@ static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long err
}
/* Set up machine check reporting for processors with Intel style MCE */
-void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
+int intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
- /*Check for MCE support */
- if( !cpu_has(c, X86_FEATURE_MCE) )
- return;
-
/* Default P5 to off as its often misconnected */
if(mce_disabled != -1)
- return;
- machine_check_vector = pentium_machine_check;
- wmb();
+ return 0;
+ x86_mce_vector_register(pentium_machine_check);
/* Read registers before enabling */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
@@ -50,4 +45,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
/* Enable MCE */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
+
+ return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/winchip.c b/xen/arch/x86/cpu/mcheck/winchip.c
index 12b3e6db24..6dede3796f 100644
--- a/xen/arch/x86/cpu/mcheck/winchip.c
+++ b/xen/arch/x86/cpu/mcheck/winchip.c
@@ -16,22 +16,24 @@
#include "mce.h"
/* Machine check handler for WinChip C6 */
-static fastcall void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
+static void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK);
}
/* Set up machine check reporting on the Winchip C6 series */
-void winchip_mcheck_init(struct cpuinfo_x86 *c)
+int winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
- machine_check_vector = winchip_machine_check;
+
wmb();
+ x86_mce_vector_register(winchip_machine_check);
rdmsr(MSR_IDT_FCR1, lo, hi);
lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo&= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
+ return (1);
}
diff --git a/xen/arch/x86/cpu/mcheck/x86_mca.h b/xen/arch/x86/cpu/mcheck/x86_mca.h
index df3899bbfe..7661d57e06 100644
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -17,6 +17,10 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#ifndef X86_MCA_H
+
+#define X86_MCA_H
+
/* The MCA/MCE MSRs should not be used anywhere else.
* They are cpu family/model specific and are only for use
@@ -73,6 +77,9 @@
/* reserved bits */
#define MCi_STATUS_OTHER_RESERVED2 0x0180000000000000ULL
+/* Bitfield of MSR_K8_HWCR register */
+#define K8_HWCR_MCi_STATUS_WREN (1ULL << 18)
+
/*Intel Specific bitfield*/
#define CMCI_THRESHOLD 0x2
@@ -87,3 +94,4 @@ extern int mce_disabled;
extern unsigned int nr_mce_banks;
extern int firstbank;
+#endif /* X86_MCA_H */
diff --git a/xen/include/asm-x86/traps.h b/xen/include/asm-x86/traps.h
index 2d055301f2..85a422363f 100644
--- a/xen/include/asm-x86/traps.h
+++ b/xen/include/asm-x86/traps.h
@@ -28,7 +28,7 @@ struct softirq_trap {
struct cpu_user_regs;
-extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+extern void machine_check_vector(struct cpu_user_regs *regs, long error_code);
/**
* guest_has_trap_callback
diff --git a/xen/include/public/arch-x86/xen-mca.h b/xen/include/public/arch-x86/xen-mca.h
index e1f5297bfa..6a55f32ef3 100644
--- a/xen/include/public/arch-x86/xen-mca.h
+++ b/xen/include/public/arch-x86/xen-mca.h
@@ -56,13 +56,20 @@
/* Hypercall */
#define __HYPERVISOR_mca __HYPERVISOR_arch_0
-#define XEN_MCA_INTERFACE_VERSION 0x03000002
+/*
+ * The xen-unstable repo has interface version 0x03000001; our interface
+ * is incompatible with that and any future minor revisions, so we
+ * choose a different version number range that is numerically less
+ * than that used in xen-unstable.
+ */
+#define XEN_MCA_INTERFACE_VERSION 0x01ecc002
-/* IN: Dom0 calls hypercall from MC event handler. */
-#define XEN_MC_CORRECTABLE 0x0
-/* IN: Dom0/DomU calls hypercall from MC trap handler. */
-#define XEN_MC_TRAP 0x1
-/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */
+#define XEN_MC_NONURGENT 0x0001
+/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */
+#define XEN_MC_URGENT 0x0002
+/* IN: Dom0 acknowledges previously-fetched telemetry */
+#define XEN_MC_ACK 0x0004
/* OUT: All is ok */
#define XEN_MC_OK 0x0
@@ -110,6 +117,7 @@ struct mcinfo_common {
#define MC_FLAG_POLLED (1 << 3)
#define MC_FLAG_RESET (1 << 4)
#define MC_FLAG_CMCI (1 << 5)
+#define MC_FLAG_MCE (1 << 6)
/* contains global x86 mc information */
struct mcinfo_global {
struct mcinfo_common common;
@@ -174,6 +182,7 @@ struct mc_info {
uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
};
typedef struct mc_info mc_info_t;
+DEFINE_XEN_GUEST_HANDLE(mc_info_t);
#define __MC_MSR_ARRAYSIZE 8
#define __MC_NMSRS 1
@@ -274,14 +283,14 @@ DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_cpu_t);
#define XEN_MC_fetch 1
struct xen_mc_fetch {
/* IN/OUT variables. */
- uint32_t flags;
-
-/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
-/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+ uint32_t flags; /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+ XEN_MC_ACK if ack'ing an earlier fetch */
+ /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
+ XEN_MC_NODATA, XEN_MC_NOMATCH */
+ uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */
/* OUT variables. */
- uint32_t fetch_idx; /* only useful for Dom0 for the notify hypercall */
- struct mc_info mc_info;
+ XEN_GUEST_HANDLE(mc_info_t) data;
};
typedef struct xen_mc_fetch xen_mc_fetch_t;
DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
@@ -296,7 +305,6 @@ struct xen_mc_notifydomain {
uint16_t mc_domid; /* The unprivileged domain to notify. */
uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify.
* Usually echo'd value from the fetch hypercall. */
- uint32_t fetch_idx; /* echo'd value from the fetch hypercall. */
/* IN/OUT variables. */
uint32_t flags;
@@ -316,15 +324,16 @@ struct xen_mc_physcpuinfo {
XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
};
+typedef union {
+ struct xen_mc_fetch mc_fetch;
+ struct xen_mc_notifydomain mc_notifydomain;
+ struct xen_mc_physcpuinfo mc_physcpuinfo;
+} xen_mc_arg_t;
+
struct xen_mc {
uint32_t cmd;
uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
- union {
- struct xen_mc_fetch mc_fetch;
- struct xen_mc_notifydomain mc_notifydomain;
- struct xen_mc_physcpuinfo mc_physcpuinfo;
- uint8_t pad[MCINFO_HYPERCALLSIZE];
- } u;
+ xen_mc_arg_t u;
};
typedef struct xen_mc xen_mc_t;
DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
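
For orientation, a dom0-side use of the reworked fetch/ack interface might look like the sketch below (not part of the patch): HYPERVISOR_mca() stands in for whatever hypercall wrapper the dom0 kernel provides, the guest handle is initialised with the usual set_xen_guest_handle() macro from the public headers, and buffer and error handling are simplified.

/* Sketch only: fetch non-urgent telemetry, persist it, then ack it. */
struct mc_info mi;
struct xen_mc mc;

memset(&mc, 0, sizeof(mc));
mc.cmd = XEN_MC_fetch;
mc.interface_version = XEN_MCA_INTERFACE_VERSION;
mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
set_xen_guest_handle(mc.u.mc_fetch.data, &mi);	/* guest buffer for mc_info */

if (HYPERVISOR_mca(&mc) == 0) {			/* hypothetical wrapper */
	uint64_t id = mc.u.mc_fetch.fetch_id;	/* OUT: id to quote in the ack */

	/* mc.u.mc_fetch.flags now reports XEN_MC_OK, XEN_MC_NODATA, ... */
	/* ... decode mi and write the telemetry to stable storage ... */

	mc.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
	mc.u.mc_fetch.fetch_id = id;
	HYPERVISOR_mca(&mc);			/* lets Xen free the entry */
}
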