-rw-r--r--  xen/arch/x86/crash.c             | 119
-rw-r--r--  xen/arch/x86/machine_kexec.c     |  22
-rw-r--r--  xen/arch/x86/x86_64/entry.S      |  33
-rw-r--r--  xen/include/asm-x86/desc.h       |  45
-rw-r--r--  xen/include/asm-x86/processor.h  |   4
5 files changed, 208 insertions(+), 15 deletions(-)
diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index 88cba19a40..b6c0a7f569 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -32,41 +32,130 @@
 static atomic_t waiting_for_crash_ipi;
 static unsigned int crashing_cpu;
+static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
 
-static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+/* This becomes the NMI handler for non-crashing CPUs, when Xen is crashing. */
+void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
 {
-    /* Don't do anything if this handler is invoked on crashing cpu.
-     * Otherwise, system will completely hang. Crashing cpu can get
-     * an NMI if system was initially booted with nmi_watchdog parameter.
+    int cpu = smp_processor_id();
+
+    /* nmi_shootdown_cpus() should ensure that this assertion is correct. */
+    ASSERT(cpu != crashing_cpu);
+
+    /* Save crash information and shut down CPU.  Attempt only once. */
+    if ( !this_cpu(crash_save_done) )
+    {
+        /* Disable the interrupt stack table for the MCE handler.  This
+         * prevents race conditions between clearing MCIP and receiving a
+         * new MCE, during which the exception frame would be clobbered
+         * and the MCE handler fall into an infinite loop.  We are soon
+         * going to disable the NMI watchdog, so the loop would not be
+         * caught.
+         *
+         * We do not need to change the NMI IST, as the nmi_crash
+         * handler is immune to corrupt exception frames, by virtue of
+         * being designed never to return.
+         *
+         * This update is safe from a security point of view, as this
+         * pcpu is never going to try to sysret back to a PV vcpu.
+         */
+        set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
+
+        kexec_crash_save_cpu();
+        __stop_this_cpu();
+
+        this_cpu(crash_save_done) = 1;
+        atomic_dec(&waiting_for_crash_ipi);
+    }
+
+    /* Poor man's self_nmi().  __stop_this_cpu() has reverted the LAPIC
+     * back to its boot state, so we are unable to rely on the regular
+     * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
+     * (The likely scenario is that we have reverted from x2apic mode to
+     * xapic, at which point #GPFs will occur if we use the apic_*
+     * functions)
+     *
+     * The ICR and APIC ID of the LAPIC are still valid even during
+     * software disable (Intel SDM Vol 3, 10.4.7.2).  As a result, we
+     * can deliberately queue up another NMI at the LAPIC which will not
+     * be delivered as the hardware NMI latch is currently in effect.
+     * This means that if NMIs become unlatched (e.g. following a
+     * non-fatal MCE), the LAPIC will force us back here rather than
+     * wandering back into regular Xen code.
      */
-    if ( cpu == crashing_cpu )
-        return 1;
-    local_irq_disable();
+    switch ( current_local_apic_mode() )
+    {
+        u32 apic_id;
 
-    kexec_crash_save_cpu();
+    case APIC_MODE_X2APIC:
+        apic_id = apic_rdmsr(APIC_ID);
 
-    __stop_this_cpu();
+        apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL
+                   | ((u64)apic_id << 32));
+        break;
+
+    case APIC_MODE_XAPIC:
+        apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));
 
-    atomic_dec(&waiting_for_crash_ipi);
+        while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
+            cpu_relax();
+
+        apic_mem_write(APIC_ICR2, apic_id << 24);
+        apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
+        break;
+
+    default:
+        break;
+    }
 
     for ( ; ; )
         halt();
-
-    return 1;
 }
 
 static void nmi_shootdown_cpus(void)
 {
     unsigned long msecs;
+    int i, cpu = smp_processor_id();
 
     local_irq_disable();
 
-    crashing_cpu = smp_processor_id();
+    crashing_cpu = cpu;
     local_irq_count(crashing_cpu) = 0;
 
     atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-    /* Would it be better to replace the trap vector here? */
-    set_nmi_callback(crash_nmi_callback);
+
+    /* Change NMI trap handlers.  Non-crashing pcpus get nmi_crash which
+     * invokes do_nmi_crash (above), which causes them to save state and
+     * fall into a loop.  The crashing pcpu gets the nop handler to
+     * cause it to return to this function ASAP.
+     */
+    for ( i = 0; i < nr_cpu_ids; i++ )
+    {
+        if ( idt_tables[i] == NULL )
+            continue;
+
+        if ( i == cpu )
+        {
+            /*
+             * Disable the interrupt stack tables for this cpu's MCE and NMI
+             * handlers, and alter the NMI handler to have no operation.
+             * Disabling the stack tables prevents stack corruption race
+             * conditions, while changing the handler helps prevent cascading
+             * faults; we are certainly going to crash by this point.
+             *
+             * This update is safe from a security point of view, as this pcpu
+             * is never going to try to sysret back to a PV vcpu.
+             */
+            _set_gate_lower(&idt_tables[i][TRAP_nmi], 14, 0, &trap_nop);
+            set_ist(&idt_tables[i][TRAP_machine_check], IST_NONE);
+        }
+        else
+        {
+            /* Do not update stack table for other pcpus. */
+            _update_gate_addr_lower(&idt_tables[i][TRAP_nmi], &nmi_crash);
+        }
+    }
+
     /* Ensure the new callback function is set before sending out the NMI. */
     wmb();
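Note: the xAPIC branch above is the standard memory-mapped ICR sequence. For
readers unfamiliar with it, here is a minimal standalone sketch of the same
self-NMI technique, using illustrative register offsets from the Intel SDM
(Vol 3, Table 10-1) rather than Xen's apic_mem_* accessors; 'apic_base' is an
assumed pointer to the mapped xAPIC page, and this is a sketch only, not part
of the patch.

    #include <stdint.h>

    /* Illustrative xAPIC MMIO offsets (Intel SDM Vol 3, Table 10-1). */
    #define APIC_REG_ID    0x020      /* Local APIC ID, in bits 31:24     */
    #define APIC_REG_ICR0  0x300      /* ICR low: delivery mode, status   */
    #define APIC_REG_ICR1  0x310      /* ICR high: destination field      */
    #define ICR_DM_NMI     (4u << 8)  /* Delivery mode 100b = NMI         */
    #define ICR_BUSY       (1u << 12) /* Delivery status: send pending    */

    /* Queue an NMI at our own LAPIC via the memory-mapped interface. */
    static void self_nmi_xapic(volatile uint32_t *apic_base)
    {
        uint32_t id = apic_base[APIC_REG_ID / 4] & 0xff000000u;

        /* Wait for any previous IPI to leave the ICR. */
        while ( apic_base[APIC_REG_ICR0 / 4] & ICR_BUSY )
            ;

        apic_base[APIC_REG_ICR1 / 4] = id;          /* Destination: self. */
        apic_base[APIC_REG_ICR0 / 4] = ICR_DM_NMI;  /* Physical, NMI.     */
    }

The write ordering matters: ICR2 (the high dword) is written first because it
is the write to the low dword that actually triggers delivery.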
diff --git a/xen/arch/x86/machine_kexec.c b/xen/arch/x86/machine_kexec.c
index 8d99cc4b11..8191ef1b96 100644
--- a/xen/arch/x86/machine_kexec.c
+++ b/xen/arch/x86/machine_kexec.c
@@ -81,12 +81,34 @@ void machine_kexec(xen_kexec_image_t *image)
         .base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY),
         .limit = LAST_RESERVED_GDT_BYTE
     };
+    int i;
 
     /* We are about to permenantly jump out of the Xen context into the kexec
      * purgatory code. We really dont want to be still servicing interupts.
      */
     local_irq_disable();
 
+    /* Now regular interrupts are disabled, we need to reduce the impact
+     * of interrupts not disabled by 'cli'.
+     *
+     * The NMI handlers have already been set up by nmi_shootdown_cpus().
+     * All pcpus other than us have the nmi_crash handler, while we have
+     * the nop handler.
+     *
+     * The MCE handlers touch extensive areas of Xen code and data.  At
+     * this point, there is nothing we can usefully do, so set the nop
+     * handler.
+     */
+    for ( i = 0; i < nr_cpu_ids; i++ )
+    {
+        if ( idt_tables[i] == NULL )
+            continue;
+        _update_gate_addr_lower(&idt_tables[i][TRAP_machine_check], &trap_nop);
+    }
+
+    /* Explicitly enable NMIs on this CPU.  Some crashdump kernels do
+     * not like running with NMIs disabled.
+     */
+    enable_nmis();
+
     /*
      * compat_machine_kexec() returns to idle pagetables, which requires us
      * to be running on a static GDT mapping (idle pagetables have no GDT
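Note: enable_nmis() (added to entry.S below) works by executing iretq, which
is what clears the CPU's internal NMI-blocking latch. A hedged inline-asm
sketch of the same trick, assuming a GCC-style x86-64 compiler and a
caller-supplied flat code segment selector 'cs_sel' (a hypothetical parameter;
the real routine pushes $__HYPERVISOR_CS):

    /* Build an iretq frame that returns to the next instruction, solely
     * to drop the hardware NMI latch.  Sketch only, not patch code. */
    static inline void enable_nmis_sketch(unsigned long cs_sel)
    {
        unsigned long tmp;

        asm volatile (
            "movq  %%rsp, %[tmp]    \n\t" /* RSP before pushing the frame */
            "pushq $0               \n\t" /* SS                           */
            "pushq %[tmp]           \n\t" /* RSP                          */
            "pushfq                 \n\t" /* RFLAGS                       */
            "pushq %[cs]            \n\t" /* CS                           */
            "leaq  1f(%%rip), %[tmp]\n\t"
            "pushq %[tmp]           \n\t" /* RIP = label 1 below          */
            "iretq                  \n\t" /* unblocks NMIs, lands at 1:   */
            "1:"
            : [tmp] "=&r" (tmp)
            : [cs] "r" (cs_sel)
            : "cc", "memory" );
    }

Because pushfq captures the current RFLAGS (with IF still clear), the iretq
changes nothing observable except the NMI latch and the control flow.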
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 29b26579e1..462b16fe65 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -635,11 +635,44 @@ ENTRY(nmi)
         movl  $TRAP_nmi,4(%rsp)
         jmp   handle_ist_exception
 
+ENTRY(nmi_crash)
+        pushq $0
+        movl  $TRAP_nmi,4(%rsp)
+        SAVE_ALL
+        movq  %rsp,%rdi
+        callq do_nmi_crash /* Does not return */
+        ud2
+
 ENTRY(machine_check)
         pushq $0
         movl  $TRAP_machine_check,4(%rsp)
         jmp   handle_ist_exception
 
+/* Enable NMIs.  No special register assumptions.  Only %rax is not preserved. */
+ENTRY(enable_nmis)
+        movq  %rsp, %rax /* Grab RSP before pushing */
+
+        /* Set up stack frame */
+        pushq $0               /* SS */
+        pushq %rax             /* RSP */
+        pushfq                 /* RFLAGS */
+        pushq $__HYPERVISOR_CS /* CS */
+        leaq  1f(%rip),%rax
+        pushq %rax             /* RIP */
+
+        iretq /* Disable the hardware NMI latch */
+1:
+        retq
+
+/* No-op trap handler.  Required for the kexec crash path.  This is not
+ * declared with the ENTRY() macro to avoid wasted alignment space.
+ */
+.globl trap_nop
+trap_nop:
+        iretq
+
+
 .section .rodata, "a", @progbits
 
 ENTRY(exception_table)
diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h
index 7466ba8c94..354b8897ee 100644
--- a/xen/include/asm-x86/desc.h
+++ b/xen/include/asm-x86/desc.h
@@ -106,6 +106,21 @@
 typedef struct { u64 a, b; } idt_entry_t;
 
+/* Write the lower 64 bits of an IDT Entry.  This relies on the upper 32
+ * bits of the address not changing, which is a safe assumption as all
+ * functions we are likely to load will live inside the 1GB
+ * code/data/bss address range.
+ *
+ * Ideally, we would use cmpxchg16b, but this is not supported on some
+ * old AMD 64bit capable processors, and has no safe equivalent.
+ */
+static inline void _write_gate_lower(volatile idt_entry_t *gate,
+                                     const idt_entry_t *new)
+{
+    ASSERT(gate->b == new->b);
+    gate->a = new->a;
+}
+
 #define _set_gate(gate_addr,type,dpl,addr) \
 do {                                       \
     (gate_addr)->a = 0;                    \
@@ -122,6 +137,36 @@ do { \
     (1UL << 47);                           \
 } while (0)
 
+static inline void _set_gate_lower(idt_entry_t *gate, unsigned long type,
+                                   unsigned long dpl, void *addr)
+{
+    idt_entry_t idte;
+    idte.b = gate->b;
+    idte.a =
+        (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
+        ((unsigned long)(dpl) << 45) |
+        ((unsigned long)(type) << 40) |
+        ((unsigned long)(addr) & 0xFFFFUL) |
+        ((unsigned long)__HYPERVISOR_CS64 << 16) |
+        (1UL << 47);
+    _write_gate_lower(gate, &idte);
+}
+
+/* Update the lower half handler of an IDT Entry, without changing any
+ * other configuration.
+ */
+static inline void _update_gate_addr_lower(idt_entry_t *gate, void *addr)
+{
+    idt_entry_t idte;
+    idte.a = gate->a;
+
+    idte.b = ((unsigned long)(addr) >> 32);
+    idte.a &= 0x0000FFFFFFFF0000ULL;
+    idte.a |= (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
+              ((unsigned long)(addr) & 0xFFFFUL);
+
+    _write_gate_lower(gate, &idte);
+}
+
 #define _set_tssldt_desc(desc,addr,limit,type) \
 do { \
     (desc)[0].b = (desc)[1].b = 0; \
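Note: the helpers above rely on the 64bit IDT gate layout, in which word 'a'
holds address bits 31:16 and 15:0 (split around the selector and the
P/DPL/type fields) while word 'b' holds bits 63:32. Any two handlers whose
addresses agree in bits 63:32 therefore differ only in 'a', which is what
makes the single 8-byte write safe. A self-contained model of that packing
('idt_entry_model' and the constants below are illustrative stand-ins, not
Xen's definitions):

    #include <assert.h>
    #include <stdint.h>

    /* Model of a 64bit IDT gate (Intel SDM Vol 3, Fig 6-7). */
    typedef struct { uint64_t a, b; } idt_entry_model;

    static idt_entry_model make_gate(uint64_t addr, uint64_t sel,
                                     uint64_t type, uint64_t dpl)
    {
        idt_entry_model g;
        g.a = ((addr & 0xFFFF0000ULL) << 32) | (dpl << 45) | (type << 40) |
              (addr & 0xFFFFULL) | (sel << 16) | (1ULL << 47);
        g.b = addr >> 32;
        return g;
    }

    static uint64_t gate_addr(const idt_entry_model *g)
    {
        return (g->b << 32) | ((g->a >> 32) & 0xFFFF0000ULL) |
               (g->a & 0xFFFFULL);
    }

    int main(void)
    {
        /* Two handlers agreeing in address bits 63:32 (guaranteed by
         * Xen's 1GB code region) share an identical upper word, so
         * swapping between them only touches 'a'. */
        idt_entry_model g1 = make_gate(0xFFFF82D080201234ULL, 0xE008, 14, 0);
        idt_entry_model g2 = make_gate(0xFFFF82D080205678ULL, 0xE008, 14, 0);

        assert(g1.b == g2.b);
        assert(gate_addr(&g1) == 0xFFFF82D080201234ULL);
        return 0;
    }

Compiled and run, the asserts confirm both that the round-trip decoding is
correct and that the two gates share an identical upper word.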
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 6f8121ee59..9b3e4fc0ba 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -527,6 +527,7 @@ void do_ ## _name(struct cpu_user_regs *regs)
 DECLARE_TRAP_HANDLER(divide_error);
 DECLARE_TRAP_HANDLER(debug);
 DECLARE_TRAP_HANDLER(nmi);
+DECLARE_TRAP_HANDLER(nmi_crash);
 DECLARE_TRAP_HANDLER(int3);
 DECLARE_TRAP_HANDLER(overflow);
 DECLARE_TRAP_HANDLER(bounds);
@@ -545,6 +546,9 @@ DECLARE_TRAP_HANDLER(alignment_check);
 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
 #undef DECLARE_TRAP_HANDLER
 
+void trap_nop(void);
+void enable_nmis(void);
+
 void syscall_enter(void);
 void sysenter_entry(void);
 void sysenter_eflags_saved(void);
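Note: the DECLARE_TRAP_HANDLER(nmi_crash) line gives C code both the asm
entry point (nmi_crash, defined in entry.S above) and the prototype for the
C-level handler (do_nmi_crash). A simplified sketch of the expansion pattern
(the stub struct and macro body here are abbreviated, not the exact
processor.h definition):

    struct cpu_user_regs;

    #define DECLARE_TRAP_HANDLER(_name)                      \
        void _name(void);                  /* asm entry   */ \
        void do_ ## _name(struct cpu_user_regs *regs)        /* C handler */

    DECLARE_TRAP_HANDLER(nmi_crash);  /* nmi_crash() and do_nmi_crash() */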