aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--xen/arch/x86/cpu/mcheck/mce_intel.c382
-rw-r--r--xen/arch/x86/cpu/mcheck/x86_mca.h4
-rw-r--r--xen/arch/x86/domain.c11
-rw-r--r--xen/arch/x86/x86_64/traps.c47
-rw-r--r--xen/include/asm-x86/domain.h28
-rw-r--r--xen/include/asm-x86/softirq.h3
6 files changed, 468 insertions, 7 deletions
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
index 01360d5ff5..5a9960fe6a 100644
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -3,6 +3,7 @@
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
+#include <xen/delay.h>
#include <xen/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -158,9 +159,378 @@ intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
return MCA_EXTINFO_GLOBAL;
}
+/* Below are for MCE handling */
+
+/* Log worst error severity and offending CPU.
+ * Pick this CPU for further processing in softirq */
+static int severity_cpu = -1;
+static int worst = 0;
+
+/* Lock of entry@second round scanning in MCE# handler */
+static cpumask_t scanned_cpus;
+/* Lock for entry@Critical Section in MCE# handler */
+static bool_t mce_enter_lock = 0;
+/* Record how many CPUs impacted in this MCE# */
+static cpumask_t impact_map;
+
+/* Lock of softirq rendezvous entering point */
+static cpumask_t mced_cpus;
+/* Lock of softirq rendezvous leaving point */
+static cpumask_t finished_cpus;
+/* Lock for picking one processing CPU */
+static bool_t mce_process_lock = 0;
+
+/* Spinlock for vMCE# MSR virtualization data */
+static DEFINE_SPINLOCK(mce_locks);
+
+/* Local buffer for holding MCE# data temporarily, sharing between mce
+ * handler and softirq handler. Those data will be finally committed
+ * for DOM0 Log and copied to per_dom related data for guest vMCE#
+ * MSR virtualization.
+ * Note: When local buffer is still in processing in softirq, another
+ * MCA comes, simply panic.
+ */
+
+struct mc_local_t
+{
+ bool_t in_use;
+ mctelem_cookie_t mctc[NR_CPUS];
+};
+static struct mc_local_t mc_local;
+
+/* This node list records errors impacting a domain. When one
+ * MCE# happens, one error bank impacts a domain. This error node
+ * will be inserted to the tail of the per_dom data for vMCE# MSR
+ * virtualization. When one vMCE# injection has been fully
+ * processed by the guest, the corresponding node will be deleted.
+ * This node list is for GUEST vMCE# MSRS virtualization.
+ */
+static struct bank_entry* alloc_bank_entry(void) {
+ struct bank_entry *entry;
+
+ entry = xmalloc(struct bank_entry);
+ if (!entry) {
+ printk(KERN_ERR "MCE: malloc bank_entry failed\n");
+ return NULL;
+ }
+ memset(entry, 0x0, sizeof(entry));
+ INIT_LIST_HEAD(&entry->list);
+ return entry;
+}
+
+/* Fill error bank info for #vMCE injection and GUEST vMCE#
+ * MSR virtualization data
+ * 1) Log down how many nr_injections of the impacted.
+ * 2) Copy MCE# error bank to impacted DOM node list,
+ for vMCE# MSRs virtualization
+*/
+
+static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank,
+ uint64_t gstatus) {
+ struct domain *d;
+ struct bank_entry *entry;
+
+ /* This error bank impacts one domain, we need to fill domain related
+ * data for vMCE MSRs virtualization and vMCE# injection */
+ if (mc_bank->mc_domid != (uint16_t)~0) {
+ d = get_domain_by_id(mc_bank->mc_domid);
+
+ /* Not impact a valid domain, skip this error of the bank */
+ if (!d) {
+ printk(KERN_DEBUG "MCE: Not found valid impacted DOM\n");
+ return 0;
+ }
+
+ entry = alloc_bank_entry();
+ entry->mci_status = mc_bank->mc_status;
+ entry->mci_addr = mc_bank->mc_addr;
+ entry->mci_misc = mc_bank->mc_misc;
+ entry->cpu = cpu;
+ entry->bank = mc_bank->mc_bank;
+
+ /* New error Node, insert to the tail of the per_dom data */
+ list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
+ /* Fill MSR global status */
+ d->arch.vmca_msrs.mcg_status = gstatus;
+ /* New node impact the domain, need another vMCE# injection*/
+ d->arch.vmca_msrs.nr_injection++;
+
+ printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d "
+ "status %lx addr %lx domid %d]\n ",
+ entry->cpu, mc_bank->mc_bank,
+ mc_bank->mc_status, mc_bank->mc_addr, mc_bank->mc_domid);
+ }
+ return 0;
+}
+
+/* Walk the telemetry reserved by each impacted CPU, fill the per-domain
+ * vMCE# data via fill_vmsr_data(), then commit every reserved telemetry
+ * entry for the DOM0 log. Returns 0 on success, -1 on failure. */
+static int mce_actions(void) {
+ int32_t cpu, ret;
+ struct mc_info *local_mi;
+ struct mcinfo_common *mic = NULL;
+ struct mcinfo_global *mc_global;
+ struct mcinfo_bank *mc_bank;
+
+ /* Spinlock is used for exclusive read/write of vMSR virtualization
+ * (per_dom vMCE# data)
+ */
+ spin_lock(&mce_locks);
+
+ /*
+ * Mark the local buffer as busy. If another MCE# arrives while it is
+ * still being processed, intel_machine_check() sees in_use set and
+ * panics there.
+ */
+ test_and_set_bool(mc_local.in_use);
+
+ for_each_cpu_mask(cpu, impact_map) {
+ if (mc_local.mctc[cpu] == NULL) {
+ printk(KERN_ERR "MCE: get reserved entry failed\n ");
+ ret = -1;
+ goto end;
+ }
+ local_mi = (struct mc_info*)mctelem_dataptr(mc_local.mctc[cpu]);
+ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
+ if (mic == NULL) {
+ printk(KERN_ERR "MCE: get local buffer entry failed\n ");
+ ret = -1;
+ goto end;
+ }
+
+ mc_global = (struct mcinfo_global *)mic;
+
+ /* Processing bank information */
+ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
+
+ for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
+ if (mic->type != MC_TYPE_BANK) {
+ continue;
+ }
+ mc_bank = (struct mcinfo_bank*)mic;
+ /* Fill vMCE# injection and vMCE# MSR virtualization related data */
+ if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1) {
+ ret = -1;
+ goto end;
+ }
+
+ /* TODO: Add recovery actions here, such as page-offline, etc */
+ }
+ } /* end of impact_map loop */
+
+ ret = 0;
+
+end:
+
+ /* Runs on both the success and the error path: commit whatever
+ * telemetry was reserved so it is not lost */
+ for_each_cpu_mask(cpu, impact_map) {
+ /* This reserved entry is processed, commit it */
+ if (mc_local.mctc[cpu] != NULL) {
+ mctelem_commit(mc_local.mctc[cpu]);
+ printk(KERN_DEBUG "MCE: Commit one URGENT ENTRY\n");
+ }
+ }
+
+ test_and_clear_bool(mc_local.in_use);
+ spin_unlock(&mce_locks);
+ return ret;
+}
+
+/* Softirq Handler for this MCE# processing.
+ * All online CPUs rendezvous here; the CPU recorded as severity_cpu
+ * performs logging, DOM0 notification and vMCE# injection, then all
+ * CPUs leave together through a second barrier. */
+static void mce_softirq(void)
+{
+ int cpu = smp_processor_id();
+ cpumask_t affinity;
+
+ /* Wait until all cpus entered softirq */
+ while ( cpus_weight(mced_cpus) != num_online_cpus() ) {
+ cpu_relax();
+ }
+ /* Not Found worst error on severity_cpu, it's weird */
+ if (severity_cpu == -1) {
+ printk(KERN_WARNING "MCE: not found severity_cpu!\n");
+ mc_panic("MCE: not found severity_cpu!");
+ return;
+ }
+ /* We choose severity_cpu for further processing */
+ if (severity_cpu == cpu) {
+
+ /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
+ * vMCE MSRs virtualization buffer
+ */
+ if (mce_actions())
+ mc_panic("MCE recovery actions or Filling vMCE MSRS "
+ "virtualization data failed!\n");
+
+ /* Step2: Send Log to DOM0 through vIRQ */
+ if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+ printk(KERN_DEBUG "MCE: send MCE# to DOM0 through virq\n");
+ send_guest_global_virq(dom0, VIRQ_MCA);
+ }
+
+ /* Step3: Inject vMCE to impacted DOM. Currently we care about DOM0 only */
+ if (guest_has_trap_callback
+ (dom0, 0, TRAP_machine_check) &&
+ !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
+ /* Pin DOM0's vcpu0 to this CPU so the trap is delivered here;
+ * the saved affinity is restored in do_iret() */
+ dom0->vcpu[0]->cpu_affinity_tmp =
+ dom0->vcpu[0]->cpu_affinity;
+ cpus_clear(affinity);
+ cpu_set(cpu, affinity);
+ printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
+ dom0->vcpu[0]->processor);
+ vcpu_set_affinity(dom0->vcpu[0], &affinity);
+ vcpu_kick(dom0->vcpu[0]);
+ }
+
+ /* Clean Data */
+ test_and_clear_bool(mce_process_lock);
+ cpus_clear(impact_map);
+ cpus_clear(scanned_cpus);
+ worst = 0;
+ cpus_clear(mced_cpus);
+ memset(&mc_local, 0x0, sizeof(mc_local));
+ }
+
+ cpu_set(cpu, finished_cpus);
+ wmb();
+ /* Leave until all cpus finished recovery actions in softirq */
+ while ( cpus_weight(finished_cpus) != num_online_cpus() ) {
+ cpu_relax();
+ }
+
+ /* NOTE(review): every CPU clears finished_cpus and severity_cpu after
+ * the barrier; safe only if no new MCE# can rendezvous in between —
+ * confirm. */
+ cpus_clear(finished_cpus);
+ severity_cpu = -1;
+ printk(KERN_DEBUG "CPU%d exit softirq \n", cpu);
+}
+
+/* Machine Check owner judge algorithm:
+ * When error happens, all cpus serially read its msr banks.
+ * The first CPU who fetches the error bank's info will clear
+ * this bank. Later readers can't get any info again.
+ * The first CPU is the actual mce_owner
+ *
+ * For Fatal (pcc=1) error, it might cause machine crash
+ * before we're able to log. For avoiding log missing, we adopt two
+ * round scanning:
+ * Round1: simply scan. If found pcc = 1 or ripv = 0, simply reset.
+ * All MCE banks are sticky, when boot up, MCE polling mechanism
+ * will help to collect and log those MCE errors.
+ * Round2: Do all MCE processing logic as normal.
+ */
+
+/* Simple scan. Panic when a non-recoverable error is found, to avoid
+ * losing the DOM0 log.
+ */
+static void severity_scan(void)
+{
+ uint64_t status;
+ int32_t i;
+
+ /* TODO: for PCC = 0 further judgement is needed; if the error can't
+ * be recovered we need to RESET to avoid losing the DOM0 log
+ */
+ for ( i = 0; i < nr_mce_banks; i++) {
+ rdmsrl(MSR_IA32_MC0_STATUS + 4 * i , status);
+ /* Skip banks with no valid error logged */
+ if ( !(status & MCi_STATUS_VAL) )
+ continue;
+ /* MCE handler only handles UC error */
+ if ( !(status & MCi_STATUS_UC) )
+ continue;
+ if ( !(status & MCi_STATUS_EN) )
+ continue;
+ /* Processor context corrupt: this CPU cannot safely continue */
+ if (status & MCi_STATUS_PCC)
+ mc_panic("pcc = 1, cpu unable to continue\n");
+ }
+
+ /* TODO: Further judgement for later CPUs here, maybe need MCACOD assistance */
+ /* EIPV and RIPV is not a reliable way to judge the error severity */
+
+}
+
+
static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
{
- mcheck_cmn_handler(regs, error_code, mca_allbanks);
+ unsigned int cpu = smp_processor_id();
+ int32_t severity = 0;
+ uint64_t gstatus;
+ mctelem_cookie_t mctc = NULL;
+ struct mca_summary bs;
+
+ /* First round scanning */
+ severity_scan();
+ cpu_set(cpu, scanned_cpus);
+ /* NOTE(review): this rendezvous assumes every online CPU receives the
+ * MCE#; a CPU that never enters would make us spin forever — confirm. */
+ while (cpus_weight(scanned_cpus) < num_online_cpus())
+ cpu_relax();
+
+ wmb();
+ /* All CPUs Finished first round scanning */
+ /* NOTE(review): mc_local.in_use is read without holding mce_locks —
+ * confirm this race against mce_actions() is acceptable. */
+ if (mc_local.in_use != 0) {
+ mc_panic("MCE: Local buffer is being processed, can't handle new MCE!\n");
+ return;
+ }
+
+ /* Enter Critical Section */
+ while (test_and_set_bool(mce_enter_lock)) {
+ udelay (1);
+ }
+
+ mctc = mcheck_mca_logout(MCA_MCE_HANDLER, mca_allbanks, &bs);
+ /* local data point to the reserved entry, let softirq to
+ * process the local data */
+ if (!bs.errcnt) {
+ if (mctc != NULL)
+ mctelem_dismiss(mctc);
+ mc_local.mctc[cpu] = NULL;
+ cpu_set(cpu, mced_cpus);
+ test_and_clear_bool(mce_enter_lock);
+ raise_softirq(MACHINE_CHECK_SOFTIRQ);
+ return;
+ }
+ else if ( mctc != NULL) {
+ mc_local.mctc[cpu] = mctc;
+ }
+
+ if (bs.uc || bs.pcc)
+ add_taint(TAINT_MACHINE_CHECK);
+
+ if (bs.pcc) {
+ printk(KERN_WARNING "PCC=1 should have caused reset\n");
+ severity = 3;
+ }
+ else if (bs.uc) {
+ severity = 2;
+ }
+ else {
+ printk(KERN_WARNING "We should skip Correctable Error\n");
+ severity = 1;
+ }
+ /* This is the offending cpu! */
+ cpu_set(cpu, impact_map);
+
+ /* Track the worst severity seen so far and the CPU that saw it;
+ * that CPU becomes severity_cpu for the softirq phase */
+ if ( severity > worst) {
+ worst = severity;
+ severity_cpu = cpu;
+ }
+ cpu_set(cpu, mced_cpus);
+ test_and_clear_bool(mce_enter_lock);
+ wmb();
+
+ /* Wait for all cpus Leave Critical */
+ while (cpus_weight(mced_cpus) < num_online_cpus())
+ cpu_relax();
+ /* Print MCE error */
+ /* NOTE(review): if bs.errcnt != 0 but mctc == NULL this passes a NULL
+ * cookie to mctelem_dataptr() — confirm mcheck_mca_logout() always
+ * returns a cookie when errcnt is non-zero. */
+ x86_mcinfo_dump(mctelem_dataptr(mctc));
+
+ /* Pick one CPU to clear MCIP */
+ if (!test_and_set_bool(mce_process_lock)) {
+ rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+ wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
+
+ if (worst >= 3) {
+ printk(KERN_WARNING "worst=3 should have caused RESET\n");
+ mc_panic("worst=3 should have caused RESET");
+ }
+ else {
+ printk(KERN_DEBUG "MCE: trying to recover\n");
+ }
+ }
+ raise_softirq(MACHINE_CHECK_SOFTIRQ);
}
static DEFINE_SPINLOCK(cmci_discover_lock);
@@ -227,7 +597,7 @@ static void cmci_discover(void)
} else {
x86_mcinfo_dump(mctelem_dataptr(mctc));
mctelem_dismiss(mctc);
- }
+ }
} else if (mctc != NULL)
mctelem_dismiss(mctc);
@@ -337,11 +707,12 @@ fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
if (bs.errcnt && mctc != NULL) {
if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
mctelem_commit(mctc);
+ printk(KERN_DEBUG "CMCI: send CMCI to DOM0 through virq\n");
send_guest_global_virq(dom0, VIRQ_MCA);
} else {
x86_mcinfo_dump(mctelem_dataptr(mctc));
mctelem_dismiss(mctc);
- }
+ }
} else if (mctc != NULL)
mctelem_dismiss(mctc);
@@ -357,11 +728,15 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c)
intel_init_cmci(c);
}
+uint64_t g_mcg_cap;
static void mce_cap_init(struct cpuinfo_x86 *c)
{
u32 l, h;
rdmsr (MSR_IA32_MCG_CAP, l, h);
+ /* For Guest vMCE usage */
+ g_mcg_cap = ((u64)h << 32 | l) & (~MCG_CMCI_P);
+
if ((l & MCG_CMCI_P) && cpu_has_apic)
cmci_support = 1;
@@ -434,5 +809,6 @@ int intel_mcheck_init(struct cpuinfo_x86 *c)
mce_intel_feature_init(c);
mce_set_owner();
+ open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
return 1;
}
diff --git a/xen/arch/x86/cpu/mcheck/x86_mca.h b/xen/arch/x86/cpu/mcheck/x86_mca.h
index e9a21d17f3..a84c9dc100 100644
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -83,9 +83,7 @@
/*Intel Specific bitfield*/
#define CMCI_THRESHOLD 0x2
-
-#define MAX_NR_BANKS 128
-
+#include <asm/domain.h>
typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS);
DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned);
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 84d08dd97d..8d7a166f65 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -373,6 +373,7 @@ void vcpu_destroy(struct vcpu *v)
hvm_vcpu_destroy(v);
}
+extern uint64_t g_mcg_cap;
int arch_domain_create(struct domain *d, unsigned int domcr_flags)
{
#ifdef __x86_64__
@@ -455,6 +456,16 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
if ( (rc = iommu_domain_init(d)) != 0 )
goto fail;
+
+ /* For Guest vMCE MSRs virtualization */
+ d->arch.vmca_msrs.mcg_status = 0x0;
+ d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
+ d->arch.vmca_msrs.mcg_ctl = (uint64_t)~0x0;
+ d->arch.vmca_msrs.nr_injection = 0;
+ memset(d->arch.vmca_msrs.mci_ctl, 0x1,
+ sizeof(d->arch.vmca_msrs.mci_ctl));
+ INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
+
}
if ( is_hvm_domain(d) )
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 34f4ecf59c..70fef66f84 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -14,6 +14,8 @@
#include <xen/nmi.h>
#include <asm/current.h>
#include <asm/flushtlb.h>
+#include <asm/traps.h>
+#include <asm/event.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/shared.h>
@@ -265,6 +267,9 @@ unsigned long do_iret(void)
struct cpu_user_regs *regs = guest_cpu_user_regs();
struct iret_context iret_saved;
struct vcpu *v = current;
+ struct domain *d = v->domain;
+ struct bank_entry *entry;
+ int cpu = smp_processor_id();
if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
sizeof(iret_saved))) )
@@ -304,6 +309,48 @@ unsigned long do_iret(void)
&& !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
vcpu_set_affinity(v, &v->cpu_affinity_tmp);
+ /*Currently, only inject vMCE to DOM0.*/
+ if (v->trap_priority >= VCPU_TRAP_NMI) {
+ printk(KERN_DEBUG "MCE: Return from vMCE# trap!");
+ if (d->domain_id == 0 && v->vcpu_id == 0) {
+ if ( !d->arch.vmca_msrs.nr_injection ) {
+ printk(KERN_WARNING "MCE: Ret from vMCE#, nr_injection is 0\n");
+ goto end;
+ }
+
+ d->arch.vmca_msrs.nr_injection--;
+ if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
+ entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+ struct bank_entry, list);
+ printk(KERN_DEBUG "MCE: Delete last injection Node\n");
+ list_del(&entry->list);
+ }
+ else
+ printk(KERN_DEBUG "MCE: Not found last injection "
+ "Node, something Wrong!\n");
+
+ /* further injection */
+ if ( d->arch.vmca_msrs.nr_injection > 0) {
+ if ( d->arch.vmca_msrs.nr_injection > 0 &&
+ guest_has_trap_callback(d, v->vcpu_id,
+ TRAP_machine_check) &&
+ !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
+ cpumask_t affinity;
+
+ dom0->vcpu[0]->cpu_affinity_tmp =
+ dom0->vcpu[0]->cpu_affinity;
+ cpus_clear(affinity);
+ cpu_set(cpu, affinity);
+ printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
+ dom0->vcpu[0]->processor);
+ vcpu_set_affinity(dom0->vcpu[0], &affinity);
+ vcpu_kick(dom0->vcpu[0]);
+ }
+ }
+ }
+ } /* end of outer-if */
+
+end:
/* Restore previous trap priority */
v->trap_priority = v->old_trap_priority;
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index ec70469308..2bf5b1c823 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -203,6 +203,31 @@ typedef xen_domctl_cpuid_t cpuid_input_t;
struct p2m_domain;
+/* Define for GUEST MCA handling */
+#define MAX_NR_BANKS 30
+
+/* This entry is for recording bank nodes for the impacted domain,
+ * put into impact_header list. One node per impacted MCA bank. */
+struct bank_entry {
+ struct list_head list;
+ int32_t cpu; /* physical CPU the error was logged on */
+ uint16_t bank; /* MCA bank number */
+ uint64_t mci_status; /* saved MCi_STATUS value */
+ uint64_t mci_addr; /* saved MCi_ADDR value */
+ uint64_t mci_misc; /* saved MCi_MISC value */
+};
+
+/* Per-domain state backing the guest-visible MCA MSRs (vMCE). */
+struct domain_mca_msrs
+{
+ /* Guest should not change below values after DOM boot up */
+ uint64_t mcg_cap;
+ uint64_t mcg_ctl;
+ uint64_t mcg_status;
+ uint64_t mci_ctl[MAX_NR_BANKS];
+ uint16_t nr_injection; /* vMCE# injections still pending for this domain */
+ struct list_head impact_header; /* list of struct bank_entry nodes */
+};
+
struct arch_domain
{
l1_pgentry_t *mm_perdomain_pt;
@@ -269,6 +294,9 @@ struct arch_domain
struct page_list_head relmem_list;
cpuid_input_t cpuids[MAX_CPUID_INPUT];
+
+ /* For Guest vMCA handling */
+ struct domain_mca_msrs vmca_msrs;
} __cacheline_aligned;
#define has_arch_pdevs(d) (!list_empty(&(d)->arch.pdev_list))
diff --git a/xen/include/asm-x86/softirq.h b/xen/include/asm-x86/softirq.h
index 84b540587d..4387803910 100644
--- a/xen/include/asm-x86/softirq.h
+++ b/xen/include/asm-x86/softirq.h
@@ -5,6 +5,7 @@
#define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
#define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2)
-#define NR_ARCH_SOFTIRQS 3
+#define MACHINE_CHECK_SOFTIRQ (NR_COMMON_SOFTIRQS + 3)
+#define NR_ARCH_SOFTIRQS 4
#endif /* __ASM_SOFTIRQ_H__ */