diff options
-rw-r--r-- | xen/arch/x86/cpu/mcheck/mce_intel.c | 388 | ||||
-rw-r--r-- | xen/arch/x86/cpu/mcheck/x86_mca.h | 4 | ||||
-rw-r--r-- | xen/arch/x86/domain.c | 11 | ||||
-rw-r--r-- | xen/arch/x86/x86_64/traps.c | 49 | ||||
-rw-r--r-- | xen/include/asm-x86/domain.h | 28 | ||||
-rw-r--r-- | xen/include/asm-x86/softirq.h | 3 |
6 files changed, 476 insertions, 7 deletions

Reviewer notes (free text ahead of the first "diff --git" line, not part of any hunk):
  * Line breaks, indentation and blank context lines were rebuilt from the
    hunk headers. Whitespace inside context lines, and the exact whitespace
    of the two removed "}" lines in mce_intel.c, are best guesses and must
    be checked against the target tree before applying.
  * Hunk headers and the diffstat were recomputed for the fixes listed
    below; the "index" blob hashes still name the pre-review blobs.
  * alloc_bank_entry(): memset() now clears sizeof(*entry) bytes instead of
    the size of the pointer.
  * fill_vmsr_data(): the result of alloc_bank_entry() is checked for NULL
    and the domain reference taken by get_domain_by_id() is released.
  * intel_machine_check(): telemetry is no longer dumped through a NULL
    cookie.
  * arch_domain_create(): mci_ctl[] is filled with 0xff bytes (all bits
    set, like mcg_ctl) instead of 0x01 bytes.
  * do_iret(): the bank_entry is freed after it is unlinked.

diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c
index 01360d5ff5..5a9960fe6a 100644
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -3,6 +3,7 @@
 #include <xen/irq.h>
 #include <xen/event.h>
 #include <xen/kernel.h>
+#include <xen/delay.h>
 #include <xen/smp.h>
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -158,9 +159,384 @@ intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
     return MCA_EXTINFO_GLOBAL;
 }
 
+/* Below are for MCE handling */
+
+/* Log worst error severity and offending CPU.
+ * Pick this CPU for further processing in softirq */
+static int severity_cpu = -1;
+static int worst = 0;
+
+/* Lock of entry@second round scanning in MCE# handler */
+static cpumask_t scanned_cpus;
+/* Lock for entry@Critical Section in MCE# handler */
+static bool_t mce_enter_lock = 0;
+/* Record how many CPUs impacted in this MCE# */
+static cpumask_t impact_map;
+
+/* Lock of softirq rendezvous entering point */
+static cpumask_t mced_cpus;
+/* Lock of softirq rendezvous leaving point */
+static cpumask_t finished_cpus;
+/* Lock for picking one processing CPU */
+static bool_t mce_process_lock = 0;
+
+/* Spinlock for vMCE# MSR virtualization data */
+static DEFINE_SPINLOCK(mce_locks);
+
+/* Local buffer for holding MCE# data temporarily, sharing between mce
+ * handler and softirq handler. Those data will be finally committed
+ * for DOM0 Log and copied to per_dom related data for guest vMCE#
+ * MSR virtualization.
+ * Note: When local buffer is still in processing in softirq, another
+ * MCA comes, simply panic.
+ */
+
+struct mc_local_t
+{
+    bool_t in_use;
+    mctelem_cookie_t mctc[NR_CPUS];
+};
+static struct mc_local_t mc_local;
+
+/* This node list records errors impacting a domain. when one
+ * MCE# happens, one error bank impacts a domain. This error node
+ * will be inserted to the tail of the per_dom data for vMCE# MSR
+ * virtualization. When one vMCE# injection has been
+ * processed by guest, the corresponding node will be deleted.
+ * This node list is for GUEST vMCE# MSRS virtualization.
+ */
+static struct bank_entry* alloc_bank_entry(void) {
+    struct bank_entry *entry;
+
+    entry = xmalloc(struct bank_entry);
+    if (!entry) {
+        printk(KERN_ERR "MCE: malloc bank_entry failed\n");
+        return NULL;
+    }
+    memset(entry, 0x0, sizeof(*entry));
+    INIT_LIST_HEAD(&entry->list);
+    return entry;
+}
+
+/* Fill error bank info for #vMCE injection and GUEST vMCE#
+ * MSR virtualization data
+ * 1) Log down how many nr_injections of the impacted.
+ * 2) Copy MCE# error bank to impacted DOM node list,
+      for vMCE# MSRs virtualization
+*/
+
+static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank,
+        uint64_t gstatus) {
+    struct domain *d;
+    struct bank_entry *entry;
+
+    /* This error bank impacts one domain, we need to fill domain related
+     * data for vMCE MSRs virtualization and vMCE# injection */
+    if (mc_bank->mc_domid != (uint16_t)~0) {
+        d = get_domain_by_id(mc_bank->mc_domid);
+
+        /* Not impact a valid domain, skip this error of the bank */
+        if (!d) {
+            printk(KERN_DEBUG "MCE: Not found valid impacted DOM\n");
+            return 0;
+        }
+
+        entry = alloc_bank_entry();
+        if (entry == NULL) {
+            put_domain(d);
+            return -1;
+        }
+        entry->mci_status = mc_bank->mc_status;
+        entry->mci_addr = mc_bank->mc_addr;
+        entry->mci_misc = mc_bank->mc_misc;
+        entry->cpu = cpu;
+        entry->bank = mc_bank->mc_bank;
+
+        /* New error Node, insert to the tail of the per_dom data */
+        list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
+        /* Fill MSR global status */
+        d->arch.vmca_msrs.mcg_status = gstatus;
+        /* New node impact the domain, need another vMCE# injection*/
+        d->arch.vmca_msrs.nr_injection++;
+
+        printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d "
+                "status %lx addr %lx domid %d]\n ",
+                entry->cpu, mc_bank->mc_bank,
+                mc_bank->mc_status, mc_bank->mc_addr, mc_bank->mc_domid);
+        put_domain(d);
+    }
+    return 0;
+}
+
+static int mce_actions(void) {
+    int32_t cpu, ret;
+    struct mc_info *local_mi;
+    struct mcinfo_common *mic = NULL;
+    struct mcinfo_global *mc_global;
+    struct mcinfo_bank *mc_bank;
+
+    /* Spinlock is used for exclusive read/write of vMSR virtualization
+     * (per_dom vMCE# data)
+     */
+    spin_lock(&mce_locks);
+
+    /*
+     * If softirq is filling this buffer while another MCE# comes,
+     * simply panic
+     */
+    test_and_set_bool(mc_local.in_use);
+
+    for_each_cpu_mask(cpu, impact_map) {
+        if (mc_local.mctc[cpu] == NULL) {
+            printk(KERN_ERR "MCE: get reserved entry failed\n ");
+            ret = -1;
+            goto end;
+        }
+        local_mi = (struct mc_info*)mctelem_dataptr(mc_local.mctc[cpu]);
+        x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
+        if (mic == NULL) {
+            printk(KERN_ERR "MCE: get local buffer entry failed\n ");
+            ret = -1;
+            goto end;
+        }
+
+        mc_global = (struct mcinfo_global *)mic;
+
+        /* Processing bank information */
+        x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
+
+        for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
+            if (mic->type != MC_TYPE_BANK) {
+                continue;
+            }
+            mc_bank = (struct mcinfo_bank*)mic;
+            /* Fill vMCE# injection and vMCE# MSR virtualization related data */
+            if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1) {
+                ret = -1;
+                goto end;
+            }
+
+            /* TODO: Add recovery actions here, such as page-offline, etc */
+        }
+    } /* end of impact_map loop */
+
+    ret = 0;
+
+end:
+
+    for_each_cpu_mask(cpu, impact_map) {
+        /* This reserved entry is processed, commit it */
+        if (mc_local.mctc[cpu] != NULL) {
+            mctelem_commit(mc_local.mctc[cpu]);
+            printk(KERN_DEBUG "MCE: Commit one URGENT ENTRY\n");
+        }
+    }
+
+    test_and_clear_bool(mc_local.in_use);
+    spin_unlock(&mce_locks);
+    return ret;
+}
+
+/* Softirq Handler for this MCE# processing */
+static void mce_softirq(void)
+{
+    int cpu = smp_processor_id();
+    cpumask_t affinity;
+
+    /* Wait until all cpus entered softirq */
+    while ( cpus_weight(mced_cpus) != num_online_cpus() ) {
+        cpu_relax();
+    }
+    /* Not Found worst error on severity_cpu, it's weird */
+    if (severity_cpu == -1) {
+        printk(KERN_WARNING "MCE: not found severity_cpu!\n");
+        mc_panic("MCE: not found severity_cpu!");
+        return;
+    }
+    /* We choose severity_cpu for further processing */
+    if (severity_cpu == cpu) {
+
+        /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
+         * vMCE MSRs virtualization buffer
+         */
+        if (mce_actions())
+            mc_panic("MCE recovery actions or Filling vMCE MSRS "
+                     "virtualization data failed!\n");
+
+        /* Step2: Send Log to DOM0 through vIRQ */
+        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
+            printk(KERN_DEBUG "MCE: send MCE# to DOM0 through virq\n");
+            send_guest_global_virq(dom0, VIRQ_MCA);
+        }
+
+        /* Step3: Inject vMCE to impacted DOM. Currently we cares DOM0 only */
+        if (guest_has_trap_callback
+               (dom0, 0, TRAP_machine_check) &&
+                 !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
+            dom0->vcpu[0]->cpu_affinity_tmp =
+                    dom0->vcpu[0]->cpu_affinity;
+            cpus_clear(affinity);
+            cpu_set(cpu, affinity);
+            printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
+                dom0->vcpu[0]->processor);
+            vcpu_set_affinity(dom0->vcpu[0], &affinity);
+            vcpu_kick(dom0->vcpu[0]);
+        }
+
+        /* Clean Data */
+        test_and_clear_bool(mce_process_lock);
+        cpus_clear(impact_map);
+        cpus_clear(scanned_cpus);
+        worst = 0;
+        cpus_clear(mced_cpus);
+        memset(&mc_local, 0x0, sizeof(mc_local));
+    }
+
+    cpu_set(cpu, finished_cpus);
+    wmb();
+    /* Leave until all cpus finished recovery actions in softirq */
+    while ( cpus_weight(finished_cpus) != num_online_cpus() ) {
+        cpu_relax();
+    }
+
+    cpus_clear(finished_cpus);
+    severity_cpu = -1;
+    printk(KERN_DEBUG "CPU%d exit softirq \n", cpu);
+}
+
+/* Machine Check owner judge algorithm:
+ * When error happens, all cpus serially read its msr banks.
+ * The first CPU who fetches the error bank's info will clear
+ * this bank. Later readers can't get any info again.
+ * The first CPU is the actual mce_owner
+ *
+ * For Fatal (pcc=1) error, it might cause machine crash
+ * before we're able to log. For avoiding log missing, we adopt two
+ * round scanning:
+ * Round1: simply scan. If found pcc = 1 or ripv = 0, simply reset.
+ * All MCE banks are sticky, when boot up, MCE polling mechanism
+ * will help to collect and log those MCE errors.
+ * Round2: Do all MCE processing logic as normal.
+ */
+
+/* Simple Scan. Panic when found non-recovery errors. Doing this for
+ * avoiding LOG missing
+ */
+static void severity_scan(void)
+{
+    uint64_t status;
+    int32_t i;
+
+    /* TODO: For PCC = 0, we need to have further judge. If it is can't be
+     * recovered, we need to RESET for avoiding DOM0 LOG missing
+     */
+    for ( i = 0; i < nr_mce_banks; i++) {
+        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i , status);
+        if ( !(status & MCi_STATUS_VAL) )
+            continue;
+        /* MCE handler only handles UC error */
+        if ( !(status & MCi_STATUS_UC) )
+            continue;
+        if ( !(status & MCi_STATUS_EN) )
+            continue;
+        if (status & MCi_STATUS_PCC)
+            mc_panic("pcc = 1, cpu unable to continue\n");
+    }
+
+    /* TODO: Further judgement for later CPUs here, maybe need MCACOD assistance */
+    /* EIPV and RIPV is not a reliable way to judge the error severity */
+
+}
+
+
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
-    mcheck_cmn_handler(regs, error_code, mca_allbanks);
+    unsigned int cpu = smp_processor_id();
+    int32_t severity = 0;
+    uint64_t gstatus;
+    mctelem_cookie_t mctc = NULL;
+    struct mca_summary bs;
+
+    /* First round scanning */
+    severity_scan();
+    cpu_set(cpu, scanned_cpus);
+    while (cpus_weight(scanned_cpus) < num_online_cpus())
+        cpu_relax();
+
+    wmb();
+    /* All CPUs Finished first round scanning */
+    if (mc_local.in_use != 0) {
+        mc_panic("MCE: Local buffer is being processed, can't handle new MCE!\n");
+        return;
+    }
+
+    /* Enter Critical Section */
+    while (test_and_set_bool(mce_enter_lock)) {
+        udelay (1);
+    }
+
+    mctc = mcheck_mca_logout(MCA_MCE_HANDLER, mca_allbanks, &bs);
+    /* local data point to the reserved entry, let softirq to
+     * process the local data */
+    if (!bs.errcnt) {
+        if (mctc != NULL)
+            mctelem_dismiss(mctc);
+        mc_local.mctc[cpu] = NULL;
+        cpu_set(cpu, mced_cpus);
+        test_and_clear_bool(mce_enter_lock);
+        raise_softirq(MACHINE_CHECK_SOFTIRQ);
+        return;
+    }
+    else if ( mctc != NULL) {
+        mc_local.mctc[cpu] = mctc;
+    }
+
+    if (bs.uc || bs.pcc)
+        add_taint(TAINT_MACHINE_CHECK);
+
+    if (bs.pcc) {
+        printk(KERN_WARNING "PCC=1 should have caused reset\n");
+        severity = 3;
+    }
+    else if (bs.uc) {
+        severity = 2;
+    }
+    else {
+        printk(KERN_WARNING "We should skip Correctable Error\n");
+        severity = 1;
+    }
+    /* This is the offending cpu! */
+    cpu_set(cpu, impact_map);
+
+    if ( severity > worst) {
+        worst = severity;
+        severity_cpu = cpu;
+    }
+    cpu_set(cpu, mced_cpus);
+    test_and_clear_bool(mce_enter_lock);
+    wmb();
+
+    /* Wait for all cpus Leave Critical */
+    while (cpus_weight(mced_cpus) < num_online_cpus())
+        cpu_relax();
+    /* Print MCE error */
+    if (mctc != NULL)
+        x86_mcinfo_dump(mctelem_dataptr(mctc));
+
+    /* Pick one CPU to clear MCIP */
+    if (!test_and_set_bool(mce_process_lock)) {
+        rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+        wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
+
+        if (worst >= 3) {
+            printk(KERN_WARNING "worst=3 should have caused RESET\n");
+            mc_panic("worst=3 should have caused RESET");
+        }
+        else {
+            printk(KERN_DEBUG "MCE: trying to recover\n");
+        }
+    }
+    raise_softirq(MACHINE_CHECK_SOFTIRQ);
 }
 
 static DEFINE_SPINLOCK(cmci_discover_lock);
@@ -227,7 +603,7 @@ static void cmci_discover(void)
         } else {
             x86_mcinfo_dump(mctelem_dataptr(mctc));
             mctelem_dismiss(mctc);
-       }
+        }
     } else if (mctc != NULL)
         mctelem_dismiss(mctc);
 
@@ -337,11 +713,12 @@ fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
     if (bs.errcnt && mctc != NULL) {
         if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
             mctelem_commit(mctc);
+            printk(KERN_DEBUG "CMCI: send CMCI to DOM0 through virq\n");
             send_guest_global_virq(dom0, VIRQ_MCA);
         } else {
             x86_mcinfo_dump(mctelem_dataptr(mctc));
             mctelem_dismiss(mctc);
-       }
+        }
     } else if (mctc != NULL)
         mctelem_dismiss(mctc);
 
@@ -357,11 +734,15 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c)
     intel_init_cmci(c);
 }
 
+uint64_t g_mcg_cap;
 static void mce_cap_init(struct cpuinfo_x86 *c)
 {
     u32 l, h;
 
     rdmsr (MSR_IA32_MCG_CAP, l, h);
+    /* For Guest vMCE usage */
+    g_mcg_cap = ((u64)h << 32 | l) & (~MCG_CMCI_P);
+
     if ((l & MCG_CMCI_P) && cpu_has_apic)
         cmci_support = 1;
 
@@ -434,5 +815,6 @@ int intel_mcheck_init(struct cpuinfo_x86 *c)
     mce_intel_feature_init(c);
     mce_set_owner();
 
+    open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
     return 1;
 }
diff --git a/xen/arch/x86/cpu/mcheck/x86_mca.h b/xen/arch/x86/cpu/mcheck/x86_mca.h
index e9a21d17f3..a84c9dc100 100644
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -83,9 +83,7 @@
 
 /*Intel Specific bitfield*/
 #define CMCI_THRESHOLD 0x2
-
-#define MAX_NR_BANKS 128
-
+#include <asm/domain.h>
 typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS);
 DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned);
 
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 84d08dd97d..8d7a166f65 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -373,6 +373,7 @@ void vcpu_destroy(struct vcpu *v)
         hvm_vcpu_destroy(v);
 }
 
+extern uint64_t g_mcg_cap;
 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
 {
 #ifdef __x86_64__
@@ -455,6 +456,16 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
 
         if ( (rc = iommu_domain_init(d)) != 0 )
             goto fail;
+
+        /* For Guest vMCE MSRs virtualization */
+        d->arch.vmca_msrs.mcg_status = 0x0;
+        d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
+        d->arch.vmca_msrs.mcg_ctl = (uint64_t)~0x0;
+        d->arch.vmca_msrs.nr_injection = 0;
+        memset(d->arch.vmca_msrs.mci_ctl, 0xff,
+            sizeof(d->arch.vmca_msrs.mci_ctl));
+        INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
+
     }
 
     if ( is_hvm_domain(d) )
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 34f4ecf59c..70fef66f84 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -14,6 +14,9 @@
 #include <xen/nmi.h>
 #include <asm/current.h>
 #include <asm/flushtlb.h>
+#include <asm/traps.h>
+#include <asm/event.h>
+#include <xen/xmalloc.h>
 #include <asm/msr.h>
 #include <asm/page.h>
 #include <asm/shared.h>
@@ -265,6 +268,9 @@ unsigned long do_iret(void)
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     struct iret_context iret_saved;
     struct vcpu *v = current;
+    struct domain *d = v->domain;
+    struct bank_entry *entry;
+    int cpu = smp_processor_id();
 
     if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
                                  sizeof(iret_saved))) )
@@ -304,6 +310,49 @@ unsigned long do_iret(void)
         && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
         vcpu_set_affinity(v, &v->cpu_affinity_tmp);
 
+    /*Currently, only inject vMCE to DOM0.*/
+    if (v->trap_priority >= VCPU_TRAP_NMI) {
+        printk(KERN_DEBUG "MCE: Return from vMCE# trap!");
+        if (d->domain_id == 0 && v->vcpu_id == 0) {
+            if ( !d->arch.vmca_msrs.nr_injection ) {
+                printk(KERN_WARNING "MCE: Ret from vMCE#, nr_injection is 0\n");
+                goto end;
+            }
+
+            d->arch.vmca_msrs.nr_injection--;
+            if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
+                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+                    struct bank_entry, list);
+                printk(KERN_DEBUG "MCE: Delete last injection Node\n");
+                list_del(&entry->list);
+                xfree(entry);
+            }
+            else
+                printk(KERN_DEBUG "MCE: Not found last injection "
+                    "Node, something Wrong!\n");
+
+            /* further injection*/
+            if ( d->arch.vmca_msrs.nr_injection > 0) {
+                if ( d->arch.vmca_msrs.nr_injection > 0 &&
+                        guest_has_trap_callback(d, v->vcpu_id,
+                            TRAP_machine_check) &&
+                        !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
+                    cpumask_t affinity;
+
+                    dom0->vcpu[0]->cpu_affinity_tmp =
+                            dom0->vcpu[0]->cpu_affinity;
+                    cpus_clear(affinity);
+                    cpu_set(cpu, affinity);
+                    printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
+                        dom0->vcpu[0]->processor);
+                    vcpu_set_affinity(dom0->vcpu[0], &affinity);
+                    vcpu_kick(dom0->vcpu[0]);
+                }
+            }
+        }
+    } /* end of outer-if */
+
+end:
     /* Restore previous trap priority */
     v->trap_priority = v->old_trap_priority;
 
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index ec70469308..2bf5b1c823 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -203,6 +203,31 @@ typedef xen_domctl_cpuid_t cpuid_input_t;
 
 struct p2m_domain;
 
+/* Define for GUEST MCA handling */
+#define MAX_NR_BANKS 30
+
+/* This entry is for recording bank nodes for the impacted domain,
+ * put into impact_header list. */
+struct bank_entry {
+    struct list_head list;
+    int32_t cpu;
+    uint16_t bank;
+    uint64_t mci_status;
+    uint64_t mci_addr;
+    uint64_t mci_misc;
+};
+
+struct domain_mca_msrs
+{
+    /* Guest should not change below values after DOM boot up */
+    uint64_t mcg_cap;
+    uint64_t mcg_ctl;
+    uint64_t mcg_status;
+    uint64_t mci_ctl[MAX_NR_BANKS];
+    uint16_t nr_injection;
+    struct list_head impact_header;
+};
+
 struct arch_domain
 {
     l1_pgentry_t *mm_perdomain_pt;
@@ -269,6 +294,9 @@ struct arch_domain
     struct page_list_head relmem_list;
 
     cpuid_input_t cpuids[MAX_CPUID_INPUT];
+
+    /* For Guest vMCA handling */
+    struct domain_mca_msrs vmca_msrs;
 } __cacheline_aligned;
 
 #define has_arch_pdevs(d)    (!list_empty(&(d)->arch.pdev_list))
diff --git a/xen/include/asm-x86/softirq.h b/xen/include/asm-x86/softirq.h
index 84b540587d..4387803910 100644
--- a/xen/include/asm-x86/softirq.h
+++ b/xen/include/asm-x86/softirq.h
@@ -5,6 +5,7 @@
 #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
 #define VCPU_KICK_SOFTIRQ      (NR_COMMON_SOFTIRQS + 2)
 
-#define NR_ARCH_SOFTIRQS       3
+#define MACHINE_CHECK_SOFTIRQ  (NR_COMMON_SOFTIRQS + 3)
+#define NR_ARCH_SOFTIRQS       4
 
 #endif /* __ASM_SOFTIRQ_H__ */