author     Keir Fraser <keir.fraser@citrix.com>    2009-04-09 08:41:28 +0100
committer  Keir Fraser <keir.fraser@citrix.com>    2009-04-09 08:41:28 +0100
commit     7d20f72ca3a521699b5231d1a61656eb18424421 (patch)
tree       7fe7d3b687915a50b93d080f174b0742e4148363
parent     11e1149fe08e2e9a478696cc1c3b331e6b8e4efa (diff)
x86 mce: fix a few possible issues
Do not share synchronization variables between the trap handler and the
softirq handler, as this will cause problems. Abstract the synchronization
bits into functions. Make the synchronization code aware of a panic, so that
spinning with interrupts disabled is avoided.

To avoid problems with MCEs happening while we're doing recovery actions in
the softirq handler, implement a deferred list of telemetry structures, using
the mctelem interfaces. This list is updated atomically, so any additional
MCEs will not cause error telemetry to be missed or go unhandled.

Signed-off-by: Frank van der Linden <frank.vanderlinden@sun.com>
Signed-off-by: Liping Ke <liping.ke@intel.com>
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
-rw-r--r--xen/arch/x86/cpu/mcheck/mce.c28
-rw-r--r--xen/arch/x86/cpu/mcheck/mce.h5
-rw-r--r--xen/arch/x86/cpu/mcheck/mce_intel.c446
-rw-r--r--xen/arch/x86/cpu/mcheck/mctelem.c114
-rw-r--r--xen/arch/x86/cpu/mcheck/mctelem.h4
-rw-r--r--xen/include/asm-x86/domain.h1
6 files changed, 337 insertions, 261 deletions
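
The "updated atomically" claim in the description above refers to the per-CPU deferred list added in mctelem.c below: the #MC trap handler prepends telemetry entries with a compare-and-swap loop, and the softirq handler later detaches the whole list in a single swap. As a rough standalone sketch of that push/drain pattern, using C11 atomics and made-up names rather than Xen's atomic_t helpers:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct telem {
    struct telem *next;
    int id;                                 /* stand-in for real telemetry */
};

static _Atomic(struct telem *) deferred;    /* one such head per CPU in the patch */

/* Prepend an entry; safe against another push racing in (e.g. a further MCE). */
static void defer_push(struct telem *t)
{
    struct telem *old = atomic_load(&deferred);
    do {
        t->next = old;                      /* link before publishing */
    } while (!atomic_compare_exchange_weak(&deferred, &old, t));
}

/* Detach the whole list in one swap for later (softirq-time) processing. */
static struct telem *defer_drain(void)
{
    return atomic_exchange(&deferred, (struct telem *)NULL);
}

int main(void)
{
    struct telem a = { .id = 1 }, b = { .id = 2 };

    defer_push(&a);
    defer_push(&b);
    for (struct telem *t = defer_drain(); t != NULL; t = t->next)
        printf("entry %d\n", t->id);        /* newest first: 2, then 1 */
    return 0;
}

The patch folds the same loop into mctelem_xchg_head(), which is then shared by the free list, the committed list and the new deferred list.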
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
index de5c432141..49f3f84899 100644
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -23,6 +23,7 @@
#include "mce.h"
int mce_disabled = 0;
+int is_mc_panic = 0;
unsigned int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
@@ -124,6 +125,7 @@ mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
switch (who) {
case MCA_MCE_HANDLER:
+ case MCA_MCE_SCAN:
mcg.mc_flags = MC_FLAG_MCE;
which = MC_URGENT;
break;
@@ -219,8 +221,9 @@ mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
cbret = mc_callback_bank_extended(mci, i, status);
}
- /* Clear status */
- mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+ if (who != MCA_MCE_SCAN)
+ /* Clear status */
+ mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
wmb();
}
@@ -469,6 +472,21 @@ cmn_handler_done:
}
}
+void mcheck_mca_clearbanks(cpu_banks_t bankmask)
+{
+ int i;
+ uint64_t status;
+
+ for (i = 0; i < 32 && i < nr_mce_banks; i++) {
+ if (!test_bit(i, bankmask))
+ continue;
+ mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+ if (!(status & MCi_STATUS_VAL))
+ continue;
+ mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+ }
+}
+
static int amd_mcheck_init(struct cpuinfo_x86 *ci)
{
int rc = 0;
@@ -1207,9 +1225,8 @@ long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
add_taint(TAINT_ERROR_INJECT);
- on_selected_cpus(cpumask_of_cpu(target),
- x86_mc_mceinject, mc_mceinject, 1, 1);
-
+ on_selected_cpus(cpumask_of_cpu(target), x86_mc_mceinject,
+ mc_mceinject, 1, 1);
break;
default:
@@ -1233,6 +1250,7 @@ void set_poll_bankmask(struct cpuinfo_x86 *c)
}
void mc_panic(char *s)
{
+ is_mc_panic = 1;
console_start_sync();
printk("Fatal machine check: %s\n", s);
printk("\n"
diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h
index 6b79e673e2..044d11a3e4 100644
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -70,7 +70,8 @@ enum mca_source {
MCA_MCE_HANDLER,
MCA_POLLER,
MCA_CMCI_HANDLER,
- MCA_RESET
+ MCA_RESET,
+ MCA_MCE_SCAN
};
enum mca_extinfo {
@@ -92,6 +93,8 @@ void set_poll_bankmask(struct cpuinfo_x86 *c);
DECLARE_PER_CPU(cpu_banks_t, poll_bankmask);
DECLARE_PER_CPU(cpu_banks_t, no_cmci_banks);
extern int cmci_support;
+extern int is_mc_panic;
+extern void mcheck_mca_clearbanks(cpu_banks_t);
extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
struct mca_summary *);
diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
index 67d0342d99..6bf8c87add 100644
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -18,6 +18,29 @@ int cmci_support = 0;
static int nr_intel_ext_msrs = 0;
static int firstbank;
+/* Below are for MCE handling */
+struct mce_softirq_barrier {
+ atomic_t val;
+ atomic_t ingen;
+ atomic_t outgen;
+};
+
+static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
+static struct mce_softirq_barrier mce_trap_bar;
+
+/*
+ * mce_logout_lock should only be used in the trap handler,
+ * while MCIP has not been cleared yet in the global status
+ * register. Other use is not safe, since an MCE trap can
+ * happen at any moment, which would cause lock recursion.
+ */
+static DEFINE_SPINLOCK(mce_logout_lock);
+
+static atomic_t severity_cpu = ATOMIC_INIT(-1);
+
+static void mce_barrier_enter(struct mce_softirq_barrier *);
+static void mce_barrier_exit(struct mce_softirq_barrier *);
+
#ifdef CONFIG_X86_MCE_THERMAL
static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
{
@@ -123,7 +146,7 @@ static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
&& msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs ) {
ext->mc_msr[ext->mc_msrs].reg = msr;
- rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
+ mca_rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
++ext->mc_msrs;
}
}
@@ -169,45 +192,6 @@ intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
return MCA_EXTINFO_GLOBAL;
}
-/* Below are for MCE handling */
-
-/* Log worst error severity and offending CPU.,
- * Pick this CPU for further processing in softirq */
-static int severity_cpu = -1;
-static int worst = 0;
-
-/* Lock of entry@second round scanning in MCE# handler */
-static cpumask_t scanned_cpus;
-/* Lock for entry@Critical Section in MCE# handler */
-static bool_t mce_enter_lock = 0;
-/* Record how many CPUs impacted in this MCE# */
-static cpumask_t impact_map;
-
-/* Lock of softirq rendezvous entering point */
-static cpumask_t mced_cpus;
-/*Lock of softirq rendezvous leaving point */
-static cpumask_t finished_cpus;
-/* Lock for picking one processing CPU */
-static bool_t mce_process_lock = 0;
-
-/* Spinlock for vMCE# MSR virtualization data */
-static DEFINE_SPINLOCK(mce_locks);
-
-/* Local buffer for holding MCE# data temporarily, sharing between mce
- * handler and softirq handler. Those data will be finally committed
- * for DOM0 Log and coped to per_dom related data for guest vMCE#
- * MSR virtualization.
- * Note: When local buffer is still in processing in softirq, another
- * MCA comes, simply panic.
- */
-
-struct mc_local_t
-{
- bool_t in_use;
- mctelem_cookie_t mctc[NR_CPUS];
-};
-static struct mc_local_t mc_local;
-
/* This node list records errors impacting a domain. when one
* MCE# happens, one error bank impacts a domain. This error node
* will be inserted to the tail of the per_dom data for vMCE# MSR
@@ -252,18 +236,22 @@ static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank,
}
entry = alloc_bank_entry();
+ if (entry == NULL)
+ return -1;
entry->mci_status = mc_bank->mc_status;
entry->mci_addr = mc_bank->mc_addr;
entry->mci_misc = mc_bank->mc_misc;
entry->cpu = cpu;
entry->bank = mc_bank->mc_bank;
+ spin_lock(&d->arch.vmca_msrs.lock);
/* New error Node, insert to the tail of the per_dom data */
list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
/* Fill MSR global status */
d->arch.vmca_msrs.mcg_status = gstatus;
/* New node impact the domain, need another vMCE# injection*/
d->arch.vmca_msrs.nr_injection++;
+ spin_unlock(&d->arch.vmca_msrs.lock);
printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d "
"status %"PRIx64" addr %"PRIx64" domid %d]\n ",
@@ -273,100 +261,83 @@ static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank,
return 0;
}
-static int mce_actions(void) {
- int32_t cpu, ret;
+/*
+ * Called from mctelem_process_deferred. Return 1 if the telemetry
+ * should be committed for dom0 consumption, 0 if it should be
+ * dismissed.
+ */
+static int mce_action(unsigned int cpu, mctelem_cookie_t mctc)
+{
struct mc_info *local_mi;
struct mcinfo_common *mic = NULL;
struct mcinfo_global *mc_global;
struct mcinfo_bank *mc_bank;
- /* Spinlock is used for exclusive read/write of vMSR virtualization
- * (per_dom vMCE# data)
- */
- spin_lock(&mce_locks);
-
- /*
- * If softirq is filling this buffer while another MCE# comes,
- * simply panic
- */
- test_and_set_bool(mc_local.in_use);
-
- for_each_cpu_mask(cpu, impact_map) {
- if (mc_local.mctc[cpu] == NULL) {
- printk(KERN_ERR "MCE: get reserved entry failed\n ");
- ret = -1;
- goto end;
- }
- local_mi = (struct mc_info*)mctelem_dataptr(mc_local.mctc[cpu]);
- x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
- if (mic == NULL) {
- printk(KERN_ERR "MCE: get local buffer entry failed\n ");
- ret = -1;
- goto end;
- }
-
- mc_global = (struct mcinfo_global *)mic;
+ local_mi = (struct mc_info*)mctelem_dataptr(mctc);
+ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
+ if (mic == NULL) {
+ printk(KERN_ERR "MCE: get local buffer entry failed\n ");
+ return 0;
+ }
- /* Processing bank information */
- x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
+ mc_global = (struct mcinfo_global *)mic;
- for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
- if (mic->type != MC_TYPE_BANK) {
- continue;
- }
- mc_bank = (struct mcinfo_bank*)mic;
- /* Fill vMCE# injection and vMCE# MSR virtualization related data */
- if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1) {
- ret = -1;
- goto end;
- }
+ /* Processing bank information */
+ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
- /* TODO: Add recovery actions here, such as page-offline, etc */
+ for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
+ if (mic->type != MC_TYPE_BANK) {
+ continue;
}
- } /* end of impact_map loop */
-
- ret = 0;
-
-end:
+ mc_bank = (struct mcinfo_bank*)mic;
+ /* Fill vMCE# injection and vMCE# MSR virtualization related data */
+ if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1)
+ break;
- for_each_cpu_mask(cpu, impact_map) {
- /* This reserved entry is processed, commit it */
- if (mc_local.mctc[cpu] != NULL) {
- mctelem_commit(mc_local.mctc[cpu]);
- printk(KERN_DEBUG "MCE: Commit one URGENT ENTRY\n");
- }
+ /* TODO: Add recovery actions here, such as page-offline, etc */
}
- test_and_clear_bool(mc_local.in_use);
- spin_unlock(&mce_locks);
- return ret;
+ return 1;
}
/* Softirq Handler for this MCE# processing */
static void mce_softirq(void)
{
int cpu = smp_processor_id();
+ unsigned int workcpu;
cpumask_t affinity;
- /* Wait until all cpus entered softirq */
- while ( cpus_weight(mced_cpus) != num_online_cpus() ) {
- cpu_relax();
- }
- /* Not Found worst error on severity_cpu, it's weird */
- if (severity_cpu == -1) {
- printk(KERN_WARNING "MCE: not found severity_cpu!\n");
- mc_panic("MCE: not found severity_cpu!");
- return;
- }
+ printk(KERN_DEBUG "CPU%d enter softirq\n", cpu);
+
+ mce_barrier_enter(&mce_inside_bar);
+
+ /*
+ * Everybody is here. Now let's see who gets to do the
+ * recovery work. Right now we just see if there's a CPU
+ * that did not have any problems, and pick that one.
+ *
+ * First, just set a default value: the last CPU who reaches this
+ * will overwrite the value and become the default.
+ */
+
+ atomic_set(&severity_cpu, cpu);
+
+ mce_barrier_enter(&mce_severity_bar);
+ if (!mctelem_has_deferred(cpu))
+ atomic_set(&severity_cpu, cpu);
+ mce_barrier_exit(&mce_severity_bar);
+
/* We choose severity_cpu for further processing */
- if (severity_cpu == cpu) {
+ if (atomic_read(&severity_cpu) == cpu) {
+
+ printk(KERN_DEBUG "CPU%d handling errors\n", cpu);
/* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
* vMCE MSRs virtualization buffer
*/
- if (mce_actions())
- mc_panic("MCE recovery actions or Filling vMCE MSRS "
- "virtualization data failed!\n");
+ for_each_online_cpu(workcpu) {
+ mctelem_process_deferred(workcpu, mce_action);
+ }
/* Step2: Send Log to DOM0 through vIRQ */
if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
@@ -387,26 +358,9 @@ static void mce_softirq(void)
vcpu_set_affinity(dom0->vcpu[0], &affinity);
vcpu_kick(dom0->vcpu[0]);
}
-
- /* Clean Data */
- test_and_clear_bool(mce_process_lock);
- cpus_clear(impact_map);
- cpus_clear(scanned_cpus);
- worst = 0;
- cpus_clear(mced_cpus);
- memset(&mc_local, 0x0, sizeof(mc_local));
- }
-
- cpu_set(cpu, finished_cpus);
- wmb();
- /* Leave until all cpus finished recovery actions in softirq */
- while ( cpus_weight(finished_cpus) != num_online_cpus() ) {
- cpu_relax();
}
- cpus_clear(finished_cpus);
- severity_cpu = -1;
- printk(KERN_DEBUG "CPU%d exit softirq \n", cpu);
+ mce_barrier_exit(&mce_inside_bar);
}
/* Machine Check owner judge algorithm:
@@ -424,127 +378,157 @@ static void mce_softirq(void)
* Round2: Do all MCE processing logic as normal.
*/
-/* Simple Scan. Panic when found non-recovery errors. Doing this for
- * avoiding LOG missing
+static void mce_panic_check(void)
+{
+ if (is_mc_panic) {
+ local_irq_enable();
+ for ( ; ; )
+ halt();
+ }
+}
+
+/*
+ * Initialize a barrier. Just set it to 0.
*/
-static void severity_scan(void)
+static void mce_barrier_init(struct mce_softirq_barrier *bar)
{
- uint64_t status;
- int32_t i;
+ atomic_set(&bar->val, 0);
+ atomic_set(&bar->ingen, 0);
+ atomic_set(&bar->outgen, 0);
+}
- /* TODO: For PCC = 0, we need to have further judge. If it is can't be
- * recovered, we need to RESET for avoiding DOM0 LOG missing
- */
- for ( i = 0; i < nr_mce_banks; i++) {
- mca_rdmsrl(MSR_IA32_MC0_STATUS + 4 * i , status);
- if ( !(status & MCi_STATUS_VAL) )
- continue;
- /* MCE handler only handles UC error */
- if ( !(status & MCi_STATUS_UC) )
- continue;
- if ( !(status & MCi_STATUS_EN) )
- continue;
- /*
- * If this was an injected error, keep going, since the
- * interposed value will be lost at reboot.
- */
- if (status & MCi_STATUS_PCC && intpose_lookup(smp_processor_id(),
- MSR_IA32_MC0_STATUS + 4 * i, NULL) == NULL)
- mc_panic("pcc = 1, cpu unable to continue\n");
- }
+#if 0
+/*
+ * This function will need to be used when offlining a CPU in the
+ * recovery actions.
+ *
+ * Decrement a barrier only. Needed for cases where the CPU
+ * in question can't do it itself (e.g. it is being offlined).
+ */
+static void mce_barrier_dec(struct mce_softirq_barrier *bar)
+{
+ atomic_inc(&bar->outgen);
+ wmb();
+ atomic_dec(&bar->val);
+}
+#endif
- /* TODO: Further judgement for later CPUs here, maybe need MCACOD assistence */
- /* EIPV and RIPV is not a reliable way to judge the error severity */
+static void mce_spin_lock(spinlock_t *lk)
+{
+ while (!spin_trylock(lk)) {
+ cpu_relax();
+ mce_panic_check();
+ }
+}
+
+static void mce_spin_unlock(spinlock_t *lk)
+{
+ spin_unlock(lk);
+}
+
+/*
+ * Increment the generation number and the value. The generation number
+ * is incremented when entering a barrier. This way, it can be checked
+ * on exit if a CPU is trying to re-enter the barrier. This can happen
+ * if the first CPU to make it out immediately exits or re-enters, while
+ * another CPU that is still in the loop becomes otherwise occupied
+ * (e.g. it needs to service an interrupt, etc), missing the value
+ * it's waiting for.
+ *
+ * These barrier functions should always be paired, so that the
+ * counter value will reach 0 again after all CPUs have exited.
+ */
+static void mce_barrier_enter(struct mce_softirq_barrier *bar)
+{
+ int gen;
+
+ atomic_inc(&bar->ingen);
+ gen = atomic_read(&bar->outgen);
+ mb();
+ atomic_inc(&bar->val);
+ while ( atomic_read(&bar->val) != num_online_cpus() &&
+ atomic_read(&bar->outgen) == gen) {
+ mb();
+ mce_panic_check();
+ }
+}
+static void mce_barrier_exit(struct mce_softirq_barrier *bar)
+{
+ int gen;
+
+ atomic_inc(&bar->outgen);
+ gen = atomic_read(&bar->ingen);
+ mb();
+ atomic_dec(&bar->val);
+ while ( atomic_read(&bar->val) != 0 &&
+ atomic_read(&bar->ingen) == gen ) {
+ mb();
+ mce_panic_check();
+ }
}
+static void mce_barrier(struct mce_softirq_barrier *bar)
+{
+ mce_barrier_enter(bar);
+ mce_barrier_exit(bar);
+}
static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
{
- unsigned int cpu = smp_processor_id();
- int32_t severity = 0;
uint64_t gstatus;
mctelem_cookie_t mctc = NULL;
struct mca_summary bs;
- /* First round scanning */
- severity_scan();
- cpu_set(cpu, scanned_cpus);
- while (cpus_weight(scanned_cpus) < num_online_cpus())
- cpu_relax();
+ mce_spin_lock(&mce_logout_lock);
- wmb();
- /* All CPUs Finished first round scanning */
- if (mc_local.in_use != 0) {
- mc_panic("MCE: Local buffer is being processed, can't handle new MCE!\n");
- return;
- }
+ mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs);
- /* Enter Critical Section */
- while (test_and_set_bool(mce_enter_lock)) {
- udelay (1);
- }
-
- mctc = mcheck_mca_logout(MCA_MCE_HANDLER, mca_allbanks, &bs);
- /* local data point to the reserved entry, let softirq to
- * process the local data */
- if (!bs.errcnt) {
+ if (bs.errcnt) {
+ /*
+ * Uncorrected errors must be dealt with in softirq context.
+ */
+ if (bs.uc || bs.pcc) {
+ add_taint(TAINT_MACHINE_CHECK);
+ if (mctc != NULL)
+ mctelem_defer(mctc);
+ /*
+ * For PCC=1, context is lost, so reboot now without clearing
+ * the banks, and deal with the telemetry after reboot
+ * (the MSRs are sticky)
+ */
+ if (bs.pcc)
+ mc_panic("State lost due to machine check exception.\n");
+ } else {
+ if (mctc != NULL)
+ mctelem_commit(mctc);
+ }
+ mcheck_mca_clearbanks(mca_allbanks);
+ } else {
if (mctc != NULL)
mctelem_dismiss(mctc);
- mc_local.mctc[cpu] = NULL;
- cpu_set(cpu, mced_cpus);
- test_and_clear_bool(mce_enter_lock);
- raise_softirq(MACHINE_CHECK_SOFTIRQ);
- return;
- }
- else if ( mctc != NULL) {
- mc_local.mctc[cpu] = mctc;
}
- if (bs.uc || bs.pcc)
- add_taint(TAINT_MACHINE_CHECK);
+ mce_spin_unlock(&mce_logout_lock);
- if (bs.pcc) {
- printk(KERN_WARNING "PCC=1 should have caused reset\n");
- severity = 3;
- }
- else if (bs.uc) {
- severity = 2;
- }
- else {
- printk(KERN_WARNING "We should skip Correctable Error\n");
- severity = 1;
- }
- /* This is the offending cpu! */
- cpu_set(cpu, impact_map);
+ /*
+ * Wait until everybody has processed the trap.
+ */
+ mce_barrier(&mce_trap_bar);
- if ( severity > worst) {
- worst = severity;
- severity_cpu = cpu;
- }
- cpu_set(cpu, mced_cpus);
- test_and_clear_bool(mce_enter_lock);
- wmb();
-
- /* Wait for all cpus Leave Critical */
- while (cpus_weight(mced_cpus) < num_online_cpus())
- cpu_relax();
- /* Print MCE error */
- x86_mcinfo_dump(mctelem_dataptr(mctc));
-
- /* Pick one CPU to clear MCIP */
- if (!test_and_set_bool(mce_process_lock)) {
+ /*
+ * Clear MCIP if it wasn't already. There is a small
+ * chance that more than 1 CPU will end up doing this,
+ * but that's OK.
+ */
+ if (bs.errcnt) {
mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
- mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
-
- if (worst >= 3) {
- printk(KERN_WARNING "worst=3 should have caused RESET\n");
- mc_panic("worst=3 should have caused RESET");
- }
- else {
- printk(KERN_DEBUG "MCE: trying to recover\n");
- }
+ if ((gstatus & MCG_STATUS_MCIP) != 0)
+ mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
+ /* Print MCE error */
+ x86_mcinfo_dump(mctelem_dataptr(mctc));
}
+
raise_softirq(MACHINE_CHECK_SOFTIRQ);
}
@@ -778,6 +762,11 @@ static void mce_init(void)
clear_in_cr4(X86_CR4_MCE);
+ mce_barrier_init(&mce_inside_bar);
+ mce_barrier_init(&mce_severity_bar);
+ mce_barrier_init(&mce_trap_bar);
+ spin_lock_init(&mce_logout_lock);
+
/* log the machine checks left over from the previous reset.
* This also clears all registers*/
@@ -840,6 +829,7 @@ void intel_mce_init_msr(struct domain *d)
memset(d->arch.vmca_msrs.mci_ctl, ~0,
sizeof(d->arch.vmca_msrs.mci_ctl));
INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
+ spin_lock_init(&d->arch.vmca_msrs.lock);
}
int intel_mce_wrmsr(u32 msr, u64 value)
@@ -849,7 +839,7 @@ int intel_mce_wrmsr(u32 msr, u64 value)
unsigned int bank;
int ret = 1;
- spin_lock(&mce_locks);
+ spin_lock(&d->arch.vmca_msrs.lock);
switch(msr)
{
case MSR_IA32_MCG_CTL:
@@ -924,7 +914,7 @@ int intel_mce_wrmsr(u32 msr, u64 value)
ret = 0;
break;
}
- spin_unlock(&mce_locks);
+ spin_unlock(&d->arch.vmca_msrs.lock);
return ret;
}
@@ -936,7 +926,7 @@ int intel_mce_rdmsr(u32 msr, u32 *lo, u32 *hi)
struct bank_entry *entry = NULL;
*lo = *hi = 0x0;
- spin_lock(&mce_locks);
+ spin_lock(&d->arch.vmca_msrs.lock);
switch(msr)
{
case MSR_IA32_MCG_STATUS:
@@ -1022,7 +1012,7 @@ int intel_mce_rdmsr(u32 msr, u32 *lo, u32 *hi)
ret = 0;
break;
}
- spin_unlock(&mce_locks);
+ spin_unlock(&d->arch.vmca_msrs.lock);
return ret;
}
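
The mce_softirq_barrier introduced above is a counting rendezvous plus two generation counters. The generations close the race described in the in-code comment: a CPU that leaves the barrier and immediately re-enters it (or its exit) can move the counter on before a slower CPU has observed the value it was spinning for, leaving that CPU stuck. By snapshotting the opposite generation before spinning, a waiter can also stop as soon as that generation advances, which proves the phase it was waiting for has already come and gone. A standalone sketch of the same enter/exit pairing, using C11 atomics and a fixed NCPUS in place of num_online_cpus() (illustrative only, not the Xen atomic API):

#include <stdatomic.h>

#define NCPUS 4                 /* stand-in for num_online_cpus() */

struct barrier {
    atomic_int val;             /* CPUs currently inside the barrier */
    atomic_int ingen;           /* bumped on every enter */
    atomic_int outgen;          /* bumped on every exit */
};

void barrier_enter(struct barrier *b)
{
    atomic_fetch_add(&b->ingen, 1);
    int gen = atomic_load(&b->outgen);
    atomic_fetch_add(&b->val, 1);
    /* Wait for everyone, or give up once somebody has already started
     * leaving (outgen moved): the rendezvous happened without us ever
     * seeing val == NCPUS. The real loop also calls mce_panic_check(). */
    while (atomic_load(&b->val) != NCPUS &&
           atomic_load(&b->outgen) == gen)
        ;
}

void barrier_exit(struct barrier *b)
{
    atomic_fetch_add(&b->outgen, 1);
    int gen = atomic_load(&b->ingen);
    atomic_fetch_sub(&b->val, 1);
    /* Symmetric wait for the count to drain, with the same escape hatch. */
    while (atomic_load(&b->val) != 0 &&
           atomic_load(&b->ingen) == gen)
        ;
}

The patch always pairs enter and exit (mce_barrier() is simply the two back to back), so val returns to zero after each event, and mce_panic_check() in the real spin loops keeps a CPU from spinning forever with interrupts disabled once mc_panic() has run.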
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c
index 4111ddcbb7..d8dd482bc6 100644
--- a/xen/arch/x86/cpu/mcheck/mctelem.c
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c
@@ -109,6 +109,14 @@ static struct mc_telem_ctl {
* Telemetry array
*/
struct mctelem_ent *mctc_elems;
+ /*
+ * Per-CPU processing lists, used for deferred (softirq)
+ * processing of telemetry. mctc_cpu is indexed by the
+ * CPU that the telemetry belongs to. mctc_cpu_processing
+ * is indexed by the CPU that is processing the telemetry.
+ */
+ struct mctelem_ent *mctc_cpu[NR_CPUS];
+ struct mctelem_ent *mctc_cpu_processing[NR_CPUS];
} mctctl;
/* Lock protecting all processing lists */
@@ -123,6 +131,82 @@ static void *cmpxchgptr(void *ptr, void *old, void *new)
return (void *)cmpxchg(ulp, a, b);
}
+static void mctelem_xchg_head(struct mctelem_ent **headp,
+ struct mctelem_ent **old,
+ struct mctelem_ent *new)
+{
+ for (;;) {
+ *old = *headp;
+ wmb();
+ if (cmpxchgptr(headp, *old, new) == *old)
+ break;
+ }
+}
+
+
+void mctelem_defer(mctelem_cookie_t cookie)
+{
+ struct mctelem_ent *tep = COOKIE2MCTE(cookie);
+
+ mctelem_xchg_head(&mctctl.mctc_cpu[smp_processor_id()],
+ &tep->mcte_next, tep);
+}
+
+void mctelem_process_deferred(unsigned int cpu,
+ int (*fn)(unsigned int, mctelem_cookie_t))
+{
+ struct mctelem_ent *tep;
+ struct mctelem_ent *head, *prev;
+ int ret;
+
+ /*
+ * First, unhook the list of telemetry structures, and
+ * hook it up to the processing list head for this CPU.
+ */
+ mctelem_xchg_head(&mctctl.mctc_cpu[cpu],
+ &mctctl.mctc_cpu_processing[smp_processor_id()], NULL);
+
+ head = mctctl.mctc_cpu_processing[smp_processor_id()];
+
+ /*
+ * Then, fix up the list to include prev pointers, to make
+ * things a little easier, as the list must be traversed in
+ * chronological order, which is backward from the order they
+ * are in.
+ */
+ for (tep = head, prev = NULL; tep != NULL; tep = tep->mcte_next) {
+ tep->mcte_prev = prev;
+ prev = tep;
+ }
+
+ /*
+ * Now walk the list of telemetry structures, handling each
+ * one of them. Unhooking the structure here does not need to
+ * be atomic, as this list is only accessed from a softirq
+ * context; the MCE handler does not touch it.
+ */
+ for (tep = prev; tep != NULL; tep = prev) {
+ prev = tep->mcte_prev;
+ tep->mcte_next = tep->mcte_prev = NULL;
+
+ ret = fn(cpu, MCTE2COOKIE(tep));
+ if (prev != NULL)
+ prev->mcte_next = NULL;
+ tep->mcte_prev = tep->mcte_next = NULL;
+ if (ret != 0)
+ mctelem_commit(MCTE2COOKIE(tep));
+ else
+ mctelem_dismiss(MCTE2COOKIE(tep));
+ }
+}
+
+int mctelem_has_deferred(unsigned int cpu)
+{
+ if (mctctl.mctc_cpu[cpu] != NULL)
+ return 1;
+ return 0;
+}
+
/* Free an entry to its native free list; the entry must not be linked on
* any list.
*/
@@ -130,21 +214,12 @@ static void mctelem_free(struct mctelem_ent *tep)
{
mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
MC_URGENT : MC_NONURGENT;
- struct mctelem_ent **freelp;
- struct mctelem_ent *oldhead;
BUG_ON(tep->mcte_refcnt != 0);
BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
tep->mcte_prev = NULL;
- freelp = &mctctl.mctc_free[target];
- for (;;) {
- oldhead = *freelp;
- tep->mcte_next = oldhead;
- wmb();
- if (cmpxchgptr(freelp, oldhead, tep) == oldhead)
- break;
- }
+ mctelem_xchg_head(&mctctl.mctc_free[target], &tep->mcte_next, tep);
}
/* Increment the reference count of an entry that is not linked on to
@@ -308,22 +383,13 @@ void mctelem_dismiss(mctelem_cookie_t cookie)
void mctelem_commit(mctelem_cookie_t cookie)
{
struct mctelem_ent *tep = COOKIE2MCTE(cookie);
- struct mctelem_ent **commlp;
- struct mctelem_ent *oldhead;
mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
MC_URGENT : MC_NONURGENT;
BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);
- commlp = &mctctl.mctc_committed[target];
- for (;;) {
- oldhead = *commlp;
- tep->mcte_prev = oldhead;
- wmb();
- if (cmpxchgptr(commlp, oldhead, tep) == oldhead)
- break;
- }
+ mctelem_xchg_head(&mctctl.mctc_committed[target], &tep->mcte_prev, tep);
}
/* Move telemetry from committed list to processing list, reversing the
@@ -358,13 +424,7 @@ static void mctelem_append_processing(mctelem_class_t which)
* the list we unlink in a well-known location so it can be
* picked up in panic code should we panic between this unlink
* and the append to the processing list. */
- for (;;) {
- dangling[target] = *commlp;
- wmb();
- if (cmpxchgptr(commlp, dangling[target], NULL) ==
- dangling[target])
- break;
- }
+ mctelem_xchg_head(commlp, &dangling[target], NULL);
if (dangling[target] == NULL)
return;
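
Since mctelem_defer() pushes at the head, a drained per-CPU list comes out newest-first; mctelem_process_deferred() therefore builds mcte_prev links and walks them backward so telemetry is handled in chronological order. An equivalent way to express that step, shown here as a small sketch on a hypothetical node type rather than the patch's own structures:

#include <stddef.h>

struct node {
    struct node *next;
};

/* Reverse a newest-first list so it can be processed oldest-first. */
struct node *reverse_oldest_first(struct node *head)
{
    struct node *prev = NULL;

    while (head != NULL) {
        struct node *next = head->next;
        head->next = prev;          /* re-point this node backward */
        prev = head;
        head = next;
    }
    return prev;                    /* oldest entry is now at the head */
}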
diff --git a/xen/arch/x86/cpu/mcheck/mctelem.h b/xen/arch/x86/cpu/mcheck/mctelem.h
index e3270f606c..fb381a10a0 100644
--- a/xen/arch/x86/cpu/mcheck/mctelem.h
+++ b/xen/arch/x86/cpu/mcheck/mctelem.h
@@ -67,5 +67,9 @@ extern void mctelem_dismiss(mctelem_cookie_t);
extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t);
extern void mctelem_consume_oldest_end(mctelem_cookie_t);
extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
+extern void mctelem_defer(mctelem_cookie_t);
+extern void mctelem_process_deferred(unsigned int,
+ int (*)(unsigned int, mctelem_cookie_t));
+int mctelem_has_deferred(unsigned int);
#endif
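
Taken together, the new interface separates logging from handling: intel_machine_check() logs with MCA_MCE_SCAN and calls mctelem_defer() while MCIP is still set, and mce_softirq() later feeds the deferred entries to a callback that decides whether each one is committed for dom0 or dismissed. A condensed sketch of that softirq side, mirroring mce_action()/mce_softirq() from this patch (the callback body is a placeholder and the snippet relies on Xen-internal definitions, so it is not standalone):

/* Return 1 to commit the telemetry for dom0, 0 to dismiss it. */
static int handle_deferred(unsigned int cpu, mctelem_cookie_t mctc)
{
    struct mc_info *mi = (struct mc_info *)mctelem_dataptr(mctc);

    /* Decode mi, fill vMCE data, attempt recovery actions (placeholder). */
    return mi != NULL;
}

static void drain_all_cpus(void)
{
    unsigned int cpu;

    for_each_online_cpu(cpu)
        mctelem_process_deferred(cpu, handle_deferred);
}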
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 2bf5b1c823..8508a1d946 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -226,6 +226,7 @@ struct domain_mca_msrs
uint64_t mci_ctl[MAX_NR_BANKS];
uint16_t nr_injection;
struct list_head impact_header;
+ spinlock_t lock;
};
struct arch_domain