-rw-r--r--   xen/arch/x86/crash.c              119
-rw-r--r--   xen/arch/x86/machine_kexec.c       22
-rw-r--r--   xen/arch/x86/x86_64/entry.S        33
-rw-r--r--   xen/include/asm-x86/desc.h         45
-rw-r--r--   xen/include/asm-x86/processor.h     4
5 files changed, 208 insertions, 15 deletions
diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index 88cba19a40..b6c0a7f569 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -32,41 +32,130 @@
static atomic_t waiting_for_crash_ipi;
static unsigned int crashing_cpu;
+static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
-static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+/* This becomes the NMI handler for non-crashing CPUs, when Xen is crashing. */
+void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
{
- /* Don't do anything if this handler is invoked on crashing cpu.
- * Otherwise, system will completely hang. Crashing cpu can get
- * an NMI if system was initially booted with nmi_watchdog parameter.
+ int cpu = smp_processor_id();
+
+ /* nmi_shootdown_cpus() should ensure that this assertion is correct. */
+ ASSERT(cpu != crashing_cpu);
+
+ /* Save crash information and shut down CPU. Attempt only once. */
+ if ( !this_cpu(crash_save_done) )
+ {
+ /* Disable the interrupt stack table for the MCE handler. This
+ * prevents race conditions between clearing MCIP and receiving a
+ * new MCE, during which the exception frame would be clobbered and
+ * the MCE handler would fall into an infinite loop. We are soon
+ * going to disable the NMI watchdog, so the loop would not be
+ * caught.
+ *
+ * We do not need to change the NMI IST, as the nmi_crash
+ * handler is immune to corrupt exception frames, by virtue of
+ * being designed never to return.
+ *
+ * This update is safe from a security point of view, as this
+ * pcpu is never going to try to sysret back to a PV vcpu.
+ */
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
+
+ kexec_crash_save_cpu();
+ __stop_this_cpu();
+
+ this_cpu(crash_save_done) = 1;
+ atomic_dec(&waiting_for_crash_ipi);
+ }
+
+ /* Poor man's self_nmi(). __stop_this_cpu() has reverted the LAPIC
+ * back to its boot state, so we are unable to rely on the regular
+ * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
+ * (The likely scenario is that we have reverted from x2apic mode to
+ * xapic, at which point #GPFs will occur if we use the apic_*
+ * functions)
+ *
+ * The ICR and APIC ID of the LAPIC are still valid even during
+ * software disable (Intel SDM Vol 3, 10.4.7.2). As a result, we
+ * can deliberately queue up another NMI at the LAPIC which will not
+ * be delivered as the hardware NMI latch is currently in effect.
+ * This means that if NMIs become unlatched (e.g. following a
+ * non-fatal MCE), the LAPIC will force us back here rather than
+ * wandering back into regular Xen code.
*/
- if ( cpu == crashing_cpu )
- return 1;
- local_irq_disable();
+ switch ( current_local_apic_mode() )
+ {
+ u32 apic_id;
- kexec_crash_save_cpu();
+ case APIC_MODE_X2APIC:
+ apic_id = apic_rdmsr(APIC_ID);
- __stop_this_cpu();
+ apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL
+ | ((u64)apic_id << 32));
+ break;
+
+ case APIC_MODE_XAPIC:
+ apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));
- atomic_dec(&waiting_for_crash_ipi);
+ while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
+ cpu_relax();
+
+ apic_mem_write(APIC_ICR2, apic_id << 24);
+ apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
+ break;
+
+ default:
+ break;
+ }
for ( ; ; )
halt();
-
- return 1;
}
static void nmi_shootdown_cpus(void)
{
unsigned long msecs;
+ int i, cpu = smp_processor_id();
local_irq_disable();
- crashing_cpu = smp_processor_id();
+ crashing_cpu = cpu;
local_irq_count(crashing_cpu) = 0;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- set_nmi_callback(crash_nmi_callback);
+
+ /* Change NMI trap handlers. Non-crashing pcpus get nmi_crash which
+ * invokes do_nmi_crash (above), which causes them to save state and
+ * fall into a loop. The crashing pcpu gets the nop handler to
+ * cause it to return to this function ASAP.
+ */
+ for ( i = 0; i < nr_cpu_ids; i++ )
+ {
+ if ( idt_tables[i] == NULL )
+ continue;
+
+ if ( i == cpu )
+ {
+ /*
+ * Disable the interrupt stack tables for this cpu's MCE and NMI
+ * handlers, and alter the NMI handler to have no operation.
+ * Disabling the stack tables prevents stack corruption race
+ * conditions, while changing the handler helps prevent cascading
+ * faults; we are certainly going to crash by this point.
+ *
+ * This update is safe from a security point of view, as this pcpu
+ * is never going to try to sysret back to a PV vcpu.
+ */
+ _set_gate_lower(&idt_tables[i][TRAP_nmi], 14, 0, &trap_nop);
+ set_ist(&idt_tables[i][TRAP_machine_check], IST_NONE);
+ }
+ else
+ {
+ /* Do not update stack table for other pcpus. */
+ _update_gate_addr_lower(&idt_tables[i][TRAP_nmi], &nmi_crash);
+ }
+ }
+
/* Ensure the new callback function is set before sending out the NMI. */
wmb();
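
A note on set_ist(): the crash path above relies on set_ist() to drop the IST selection for a vector, but that helper already exists in Xen and is not part of this diff. Below is a minimal sketch of what it does, assuming the idt_entry_t layout introduced in desc.h later in this patch (the IST selector is the 3-bit field at bits 32-34 of the low quadword); the exact in-tree definition may differ.

#include <stdint.h>

typedef struct { uint64_t a, b; } idt_entry_t;   /* mirrors desc.h */

#define IST_NONE 0UL    /* 0 selects the regular stack, i.e. no IST switch */

static inline void set_ist(idt_entry_t *idt, unsigned long ist)
{
    /* Replace the 3-bit IST field (bits 32-34 of the low quadword). */
    idt->a = (idt->a & ~(7UL << 32)) | (ist << 32);
}
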
diff --git a/xen/arch/x86/machine_kexec.c b/xen/arch/x86/machine_kexec.c
index 8d99cc4b11..8191ef1b96 100644
--- a/xen/arch/x86/machine_kexec.c
+++ b/xen/arch/x86/machine_kexec.c
@@ -81,12 +81,34 @@ void machine_kexec(xen_kexec_image_t *image)
.base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY),
.limit = LAST_RESERVED_GDT_BYTE
};
+ int i;
/* We are about to permanently jump out of the Xen context into the kexec
* purgatory code. We really don't want to still be servicing interrupts.
*/
local_irq_disable();
+ /* Now that regular interrupts are disabled, we need to reduce the impact
+ * of interrupts not disabled by 'cli'.
+ *
+ * The NMI handlers have already been set up by nmi_shootdown_cpus(). All
+ * pcpus other than us have the nmi_crash handler, while we have the nop
+ * handler.
+ *
+ * The MCE handlers touch extensive areas of Xen code and data. At this
+ * point, there is nothing we can usefully do, so set the nop handler.
+ */
+ for ( i = 0; i < nr_cpu_ids; i++ )
+ {
+ if ( idt_tables[i] == NULL )
+ continue;
+ _update_gate_addr_lower(&idt_tables[i][TRAP_machine_check], &trap_nop);
+ }
+
+ /* Explicitly enable NMIs on this CPU. Some crashdump kernels do
+ * not like running with NMIs disabled. */
+ enable_nmis();
+
/*
* compat_machine_kexec() returns to idle pagetables, which requires us
* to be running on a static GDT mapping (idle pagetables have no GDT
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 29b26579e1..462b16fe65 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -635,11 +635,44 @@ ENTRY(nmi)
movl $TRAP_nmi,4(%rsp)
jmp handle_ist_exception
+ENTRY(nmi_crash)
+ pushq $0
+ movl $TRAP_nmi,4(%rsp)
+ SAVE_ALL
+ movq %rsp,%rdi
+ callq do_nmi_crash /* Does not return */
+ ud2
+
ENTRY(machine_check)
pushq $0
movl $TRAP_machine_check,4(%rsp)
jmp handle_ist_exception
+/* Enable NMIs. No special register assumptions. Only %rax is not preserved. */
+ENTRY(enable_nmis)
+ movq %rsp, %rax /* Grab RSP before pushing */
+
+ /* Set up stack frame */
+ pushq $0 /* SS */
+ pushq %rax /* RSP */
+ pushfq /* RFLAGS */
+ pushq $__HYPERVISOR_CS /* CS */
+ leaq 1f(%rip),%rax
+ pushq %rax /* RIP */
+
+ iretq /* Disable the hardware NMI latch */
+1:
+ retq
+
+/* No-op trap handler. Required for the kexec crash path. This is not
+ * declared with the ENTRY() macro to avoid wasted alignment space.
+ */
+.globl trap_nop
+trap_nop:
+ iretq
+
+
+
.section .rodata, "a", @progbits
ENTRY(exception_table)
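
The frame that enable_nmis() pushes by hand is just an ordinary 64-bit iret frame; executing iretq is what clears the hardware NMI latch so that further NMIs can be delivered. A C view of the five quadwords built above, purely illustrative (this struct is not part of the patch):

#include <stdint.h>

/* Consumed by iretq; the last value pushed (RIP) sits at the lowest address. */
struct iret_frame {
    uint64_t rip;      /* label 1:, the retq immediately after iretq        */
    uint64_t cs;       /* __HYPERVISOR_CS                                   */
    uint64_t rflags;   /* current flags, captured with pushfq               */
    uint64_t rsp;      /* the pre-push stack pointer saved in %rax          */
    uint64_t ss;       /* 0; a null SS is fine when staying in 64-bit ring 0 */
};
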
diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h
index 7466ba8c94..354b8897ee 100644
--- a/xen/include/asm-x86/desc.h
+++ b/xen/include/asm-x86/desc.h
@@ -106,6 +106,21 @@ typedef struct {
u64 a, b;
} idt_entry_t;
+/* Write the lower 64 bits of an IDT Entry. This relies on the upper 32
+ * bits of the address not changing, which is a safe assumption as all
+ * functions we are likely to load will live inside the 1GB
+ * code/data/bss address range.
+ *
+ * Ideally, we would use cmpxchg16b, but this is not supported on some
+ * older 64-bit-capable AMD processors, and has no safe equivalent.
+ */
+static inline void _write_gate_lower(volatile idt_entry_t *gate,
+ const idt_entry_t *new)
+{
+ ASSERT(gate->b == new->b);
+ gate->a = new->a;
+}
+
#define _set_gate(gate_addr,type,dpl,addr) \
do { \
(gate_addr)->a = 0; \
@@ -122,6 +137,36 @@ do { \
(1UL << 47); \
} while (0)
+static inline void _set_gate_lower(idt_entry_t *gate, unsigned long type,
+ unsigned long dpl, void *addr)
+{
+ idt_entry_t idte;
+ idte.b = gate->b;
+ idte.a =
+ (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
+ ((unsigned long)(dpl) << 45) |
+ ((unsigned long)(type) << 40) |
+ ((unsigned long)(addr) & 0xFFFFUL) |
+ ((unsigned long)__HYPERVISOR_CS64 << 16) |
+ (1UL << 47);
+ _write_gate_lower(gate, &idte);
+}
+
+/* Update the handler address in the lower half of an IDT entry, without
+ * changing any other configuration. */
+static inline void _update_gate_addr_lower(idt_entry_t *gate, void *addr)
+{
+ idt_entry_t idte;
+ idte.a = gate->a;
+
+ idte.b = ((unsigned long)(addr) >> 32);
+ idte.a &= 0x0000FFFFFFFF0000ULL;
+ idte.a |= (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
+ ((unsigned long)(addr) & 0xFFFFUL);
+
+ _write_gate_lower(gate, &idte);
+}
+
#define _set_tssldt_desc(desc,addr,limit,type) \
do { \
(desc)[0].b = (desc)[1].b = 0; \
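
To make the bit-fiddling in these helpers easier to follow: a gate's handler address is split as bits 0-15 in a[15:0], bits 16-31 in a[63:48], and bits 32-63 in b[31:0]. That is why a new handler whose top 32 bits match the old one can be installed with a single aligned 64-bit store to the low quadword. An illustrative inverse of _update_gate_addr_lower(), not part of the patch:

#include <stdint.h>

typedef struct { uint64_t a, b; } idt_entry_t;

/* Reassemble the handler address currently programmed into a gate. */
static inline uint64_t gate_addr(const idt_entry_t *gate)
{
    return  (gate->a & 0xffffULL)                 /* offset[15:0]  */
          | ((gate->a >> 32) & 0xffff0000ULL)     /* offset[31:16] */
          | ((gate->b & 0xffffffffULL) << 32);    /* offset[63:32] */
}
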
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 6f8121ee59..9b3e4fc0ba 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -527,6 +527,7 @@ void do_ ## _name(struct cpu_user_regs *regs)
DECLARE_TRAP_HANDLER(divide_error);
DECLARE_TRAP_HANDLER(debug);
DECLARE_TRAP_HANDLER(nmi);
+DECLARE_TRAP_HANDLER(nmi_crash);
DECLARE_TRAP_HANDLER(int3);
DECLARE_TRAP_HANDLER(overflow);
DECLARE_TRAP_HANDLER(bounds);
@@ -545,6 +546,9 @@ DECLARE_TRAP_HANDLER(alignment_check);
DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
#undef DECLARE_TRAP_HANDLER
+void trap_nop(void);
+void enable_nmis(void);
+
void syscall_enter(void);
void sysenter_entry(void);
void sysenter_eflags_saved(void);
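
For context, DECLARE_TRAP_HANDLER() (the macro shown in the hunk header above) declares both the assembly entry stub and its C handler, so the new nmi_crash line expands roughly as below. trap_nop and enable_nmis are assembly-only stubs with no do_* C handler, which is why they are declared by hand instead.

/* Rough expansion of DECLARE_TRAP_HANDLER(nmi_crash): */
void nmi_crash(void);                             /* entry stub in entry.S */
void do_nmi_crash(struct cpu_user_regs *regs);    /* C handler in crash.c  */
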