-rw-r--r--  xen/arch/x86/crash.c             | 119
-rw-r--r--  xen/arch/x86/machine_kexec.c     |  22
-rw-r--r--  xen/arch/x86/x86_64/entry.S      |  33
-rw-r--r--  xen/include/asm-x86/desc.h       |  45
-rw-r--r--  xen/include/asm-x86/processor.h  |   4
5 files changed, 208 insertions(+), 15 deletions(-)
diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index 88cba19a40..b6c0a7f569 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -32,41 +32,130 @@
 static atomic_t waiting_for_crash_ipi;
 static unsigned int crashing_cpu;
+static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
 
-static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+/* This becomes the NMI handler for non-crashing CPUs, when Xen is crashing. */
+void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
 {
-    /* Don't do anything if this handler is invoked on crashing cpu.
-     * Otherwise, system will completely hang. Crashing cpu can get
-     * an NMI if system was initially booted with nmi_watchdog parameter.
+    int cpu = smp_processor_id();
+
+    /* nmi_shootdown_cpus() should ensure that this assertion is correct. */
+    ASSERT(cpu != crashing_cpu);
+
+    /* Save crash information and shut down CPU.  Attempt only once. */
+    if ( !this_cpu(crash_save_done) )
+    {
+        /* Disable the interrupt stack table for the MCE handler.  This
+         * prevents race conditions between clearing MCIP and receiving a
+         * new MCE, during which the exception frame would be clobbered
+         * and the MCE handler fall into an infinite loop.  We are soon
+         * going to disable the NMI watchdog, so the loop would not be
+         * caught.
+         *
+         * We do not need to change the NMI IST, as the nmi_crash
+         * handler is immune to corrupt exception frames, by virtue of
+         * being designed never to return.
+         *
+         * This update is safe from a security point of view, as this
+         * pcpu is never going to try to sysret back to a PV vcpu.
+         */
+        set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
+
+        kexec_crash_save_cpu();
+        __stop_this_cpu();
+
+        this_cpu(crash_save_done) = 1;
+        atomic_dec(&waiting_for_crash_ipi);
+    }
+
+    /* Poor man's self_nmi().  __stop_this_cpu() has reverted the LAPIC
+     * back to its boot state, so we are unable to rely on the regular
+     * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
+     * (The likely scenario is that we have reverted from x2apic mode to
+     * xapic, at which point #GPFs will occur if we use the apic_*
+     * functions)
+     *
+     * The ICR and APIC ID of the LAPIC are still valid even during
+     * software disable (Intel SDM Vol 3, 10.4.7.2).  As a result, we
+     * can deliberately queue up another NMI at the LAPIC which will not
+     * be delivered as the hardware NMI latch is currently in effect.
+     * This means that if NMIs become unlatched (e.g. following a
+     * non-fatal MCE), the LAPIC will force us back here rather than
+     * wandering back into regular Xen code.
      */
-    if ( cpu == crashing_cpu )
-        return 1;
-    local_irq_disable();
+    switch ( current_local_apic_mode() )
+    {
+        u32 apic_id;
 
-    kexec_crash_save_cpu();
+    case APIC_MODE_X2APIC:
+        apic_id = apic_rdmsr(APIC_ID);
 
-    __stop_this_cpu();
+        apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL
+                   | ((u64)apic_id << 32));
+        break;
+
+    case APIC_MODE_XAPIC:
+        apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));
 
-    atomic_dec(&waiting_for_crash_ipi);
+        while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
+            cpu_relax();
+
+        apic_mem_write(APIC_ICR2, apic_id << 24);
+        apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
+        break;
+
+    default:
+        break;
+    }
 
     for ( ; ; )
         halt();
-
-    return 1;
 }
 
 static void nmi_shootdown_cpus(void)
 {
     unsigned long msecs;
+    int i, cpu = smp_processor_id();
 
     local_irq_disable();
 
-    crashing_cpu = smp_processor_id();
+    crashing_cpu = cpu;
     local_irq_count(crashing_cpu) = 0;
 
     atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-    /* Would it be better to replace the trap vector here? */
-    set_nmi_callback(crash_nmi_callback);
+
+    /* Change NMI trap handlers.  Non-crashing pcpus get nmi_crash which
+     * invokes do_nmi_crash (above), which causes them to save state and
+     * fall into a loop.  The crashing pcpu gets the nop handler to
+     * cause it to return to this function ASAP.
+     */
+    for ( i = 0; i < nr_cpu_ids; i++ )
+    {
+        if ( idt_tables[i] == NULL )
+            continue;
+
+        if ( i == cpu )
+        {
+            /*
+             * Disable the interrupt stack tables for this cpu's MCE and NMI
+             * handlers, and alter the NMI handler to have no operation.
+             * Disabling the stack tables prevents stack corruption race
+             * conditions, while changing the handler helps prevent cascading
+             * faults; we are certainly going to crash by this point.
+             *
+             * This update is safe from a security point of view, as this pcpu
+             * is never going to try to sysret back to a PV vcpu.
+             */
+            _set_gate_lower(&idt_tables[i][TRAP_nmi], 14, 0, &trap_nop);
+            set_ist(&idt_tables[i][TRAP_machine_check], IST_NONE);
+        }
+        else
+        {
+            /* Do not update stack table for other pcpus. */
+            _update_gate_addr_lower(&idt_tables[i][TRAP_nmi], &nmi_crash);
+        }
+    }
+
     /* Ensure the new callback function is set before sending out the NMI. */
     wmb();
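Note: the xAPIC branch above is the standard memory-mapped ICR sequence. For
readers unfamiliar with it, here is a minimal standalone sketch of the same
self-NMI technique, using illustrative register offsets from the Intel SDM
(Vol 3, Table 10-1) rather than Xen's apic_mem_* accessors; 'apic_base' is an
assumed pointer to the mapped xAPIC page, and this is a sketch only, not part
of the patch.

    #include <stdint.h>

    /* Illustrative xAPIC MMIO offsets (Intel SDM Vol 3, Table 10-1). */
    #define APIC_REG_ID    0x020      /* Local APIC ID, in bits 31:24     */
    #define APIC_REG_ICR0  0x300      /* ICR low: delivery mode, status   */
    #define APIC_REG_ICR1  0x310      /* ICR high: destination field      */
    #define ICR_DM_NMI     (4u << 8)  /* Delivery mode 100b = NMI         */
    #define ICR_BUSY       (1u << 12) /* Delivery status: send pending    */

    /* Queue an NMI at our own LAPIC via the memory-mapped interface. */
    static void self_nmi_xapic(volatile uint32_t *apic_base)
    {
        uint32_t id = apic_base[APIC_REG_ID / 4] & 0xff000000u;

        /* Wait for any previous IPI to leave the ICR. */
        while ( apic_base[APIC_REG_ICR0 / 4] & ICR_BUSY )
            ;

        apic_base[APIC_REG_ICR1 / 4] = id;          /* Destination: self. */
        apic_base[APIC_REG_ICR0 / 4] = ICR_DM_NMI;  /* Physical, NMI.     */
    }

The write ordering matters: ICR2 (the high dword) is written first because it
is the write to the low dword that actually triggers delivery.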
diff --git a/xen/arch/x86/machine_kexec.c b/xen/arch/x86/machine_kexec.c
index 8d99cc4b11..8191ef1b96 100644
--- a/xen/arch/x86/machine_kexec.c
+++ b/xen/arch/x86/machine_kexec.c
@@ -81,12 +81,34 @@ void machine_kexec(xen_kexec_image_t *image)
         .base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY),
         .limit = LAST_RESERVED_GDT_BYTE
     };
+    int i;
 
     /* We are about to permenantly jump out of the Xen context into the kexec
      * purgatory code. We really dont want to be still servicing interupts.
      */
     local_irq_disable();
 
+    /* Now regular interrupts are disabled, we need to reduce the impact
+     * of interrupts not disabled by 'cli'.
+     *
+     * The NMI handlers have already been set up by nmi_shootdown_cpus().
+     * All pcpus other than us have the nmi_crash handler, while we have
+     * the nop handler.
+     *
+     * The MCE handlers touch extensive areas of Xen code and data.  At
+     * this point, there is nothing we can usefully do, so set the nop
+     * handler.
+     */
+    for ( i = 0; i < nr_cpu_ids; i++ )
+    {
+        if ( idt_tables[i] == NULL )
+            continue;
+        _update_gate_addr_lower(&idt_tables[i][TRAP_machine_check], &trap_nop);
+    }
+
+    /* Explicitly enable NMIs on this CPU.  Some crashdump kernels do
+     * not like running with NMIs disabled.
+     */
+    enable_nmis();
+
     /*
      * compat_machine_kexec() returns to idle pagetables, which requires us
      * to be running on a static GDT mapping (idle pagetables have no GDT
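Note: enable_nmis() (added to entry.S below) works by executing iretq, which
is what clears the CPU's internal NMI-blocking latch. A hedged inline-asm
sketch of the same trick, assuming a GCC-style x86-64 compiler and a
caller-supplied flat code segment selector 'cs_sel' (a hypothetical parameter;
the real routine pushes $__HYPERVISOR_CS):

    /* Build an iretq frame that returns to the next instruction, solely
     * to drop the hardware NMI latch.  Sketch only, not patch code. */
    static inline void enable_nmis_sketch(unsigned long cs_sel)
    {
        unsigned long tmp;

        asm volatile (
            "movq  %%rsp, %[tmp]    \n\t" /* RSP before pushing the frame */
            "pushq $0               \n\t" /* SS                           */
            "pushq %[tmp]           \n\t" /* RSP                          */
            "pushfq                 \n\t" /* RFLAGS                       */
            "pushq %[cs]            \n\t" /* CS                           */
            "leaq  1f(%%rip), %[tmp]\n\t"
            "pushq %[tmp]           \n\t" /* RIP = label 1 below          */
            "iretq                  \n\t" /* unblocks NMIs, lands at 1:   */
            "1:"
            : [tmp] "=&r" (tmp)
            : [cs] "r" (cs_sel)
            : "cc", "memory" );
    }

Because pushfq captures the current RFLAGS (with IF still clear), the iretq
changes nothing observable except the NMI latch and the control flow.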
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index 29b26579e1..462b16fe65 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -635,11 +635,44 @@ ENTRY(nmi)
         movl  $TRAP_nmi,4(%rsp)
         jmp   handle_ist_exception
 
+ENTRY(nmi_crash)
+        pushq $0
+        movl  $TRAP_nmi,4(%rsp)
+        SAVE_ALL
+        movq  %rsp,%rdi
+        callq do_nmi_crash /* Does not return */
+        ud2
+
 ENTRY(machine_check)
         pushq $0
         movl  $TRAP_machine_check,4(%rsp)
         jmp   handle_ist_exception
 
+/* Enable NMIs.  No special register assumptions.  Only %rax is not preserved. */
+ENTRY(enable_nmis)
+        movq  %rsp, %rax /* Grab RSP before pushing */
+
+        /* Set up stack frame */
+        pushq $0               /* SS */
+        pushq %rax             /* RSP */
+        pushfq                 /* RFLAGS */
+        pushq $__HYPERVISOR_CS /* CS */
+        leaq  1f(%rip),%rax
+        pushq %rax             /* RIP */
+
+        iretq /* Disable the hardware NMI latch */
+1:
+        retq
+
+/* No-op trap handler.  Required for the kexec crash path.  This is not
+ * declared with the ENTRY() macro to avoid wasted alignment space.
+ */
+.globl trap_nop
+trap_nop:
+        iretq
+
+
 .section .rodata, "a", @progbits
 
 ENTRY(exception_table)
diff --git a/xen/include/asm-x86/desc.h b/xen/include/asm-x86/desc.h
index 7466ba8c94..354b8897ee 100644
--- a/xen/include/asm-x86/desc.h
+++ b/xen/include/asm-x86/desc.h
@@ -106,6 +106,21 @@
 typedef struct { u64 a, b; } idt_entry_t;
 
+/* Write the lower 64 bits of an IDT Entry.  This relies on the upper 32
+ * bits of the address not changing, which is a safe assumption as all
+ * functions we are likely to load will live inside the 1GB
+ * code/data/bss address range.
+ *
+ * Ideally, we would use cmpxchg16b, but this is not supported on some
+ * old AMD 64bit capable processors, and has no safe equivalent.
+ */
+static inline void _write_gate_lower(volatile idt_entry_t *gate,
+                                     const idt_entry_t *new)
+{
+    ASSERT(gate->b == new->b);
+    gate->a = new->a;
+}
+
 #define _set_gate(gate_addr,type,dpl,addr) \
 do {                                       \
     (gate_addr)->a = 0;                    \
@@ -122,6 +137,36 @@ do { \
     (1UL << 47);                           \
 } while (0)
 
+static inline void _set_gate_lower(idt_entry_t *gate, unsigned long type,
+                                   unsigned long dpl, void *addr)
+{
+    idt_entry_t idte;
+    idte.b = gate->b;
+    idte.a =
+        (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
+        ((unsigned long)(dpl) << 45) |
+        ((unsigned long)(type) << 40) |
+        ((unsigned long)(addr) & 0xFFFFUL) |
+        ((unsigned long)__HYPERVISOR_CS64 << 16) |
+        (1UL << 47);
+    _write_gate_lower(gate, &idte);
+}
+
+/* Update the lower half handler of an IDT Entry, without changing any
+ * other configuration.
+ */
+static inline void _update_gate_addr_lower(idt_entry_t *gate, void *addr)
+{
+    idt_entry_t idte;
+    idte.a = gate->a;
+
+    idte.b = ((unsigned long)(addr) >> 32);
+    idte.a &= 0x0000FFFFFFFF0000ULL;
+    idte.a |= (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
+              ((unsigned long)(addr) & 0xFFFFUL);
+
+    _write_gate_lower(gate, &idte);
+}
+
 #define _set_tssldt_desc(desc,addr,limit,type) \
 do { \
     (desc)[0].b = (desc)[1].b = 0; \
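Note: the helpers above rely on the 64bit IDT gate layout, in which word 'a'
holds address bits 31:16 and 15:0 (split around the selector and the
P/DPL/type fields) while word 'b' holds bits 63:32. Any two handlers whose
addresses agree in bits 63:32 therefore differ only in 'a', which is what
makes the single 8-byte write safe. A self-contained model of that packing
('idt_entry_model' and the constants below are illustrative stand-ins, not
Xen's definitions):

    #include <assert.h>
    #include <stdint.h>

    /* Model of a 64bit IDT gate (Intel SDM Vol 3, Fig 6-7). */
    typedef struct { uint64_t a, b; } idt_entry_model;

    static idt_entry_model make_gate(uint64_t addr, uint64_t sel,
                                     uint64_t type, uint64_t dpl)
    {
        idt_entry_model g;
        g.a = ((addr & 0xFFFF0000ULL) << 32) | (dpl << 45) | (type << 40) |
              (addr & 0xFFFFULL) | (sel << 16) | (1ULL << 47);
        g.b = addr >> 32;
        return g;
    }

    static uint64_t gate_addr(const idt_entry_model *g)
    {
        return (g->b << 32) | ((g->a >> 32) & 0xFFFF0000ULL) |
               (g->a & 0xFFFFULL);
    }

    int main(void)
    {
        /* Two handlers agreeing in address bits 63:32 (guaranteed by
         * Xen's 1GB code region) share an identical upper word, so
         * swapping between them only touches 'a'. */
        idt_entry_model g1 = make_gate(0xFFFF82D080201234ULL, 0xE008, 14, 0);
        idt_entry_model g2 = make_gate(0xFFFF82D080205678ULL, 0xE008, 14, 0);

        assert(g1.b == g2.b);
        assert(gate_addr(&g1) == 0xFFFF82D080201234ULL);
        return 0;
    }

Compiled and run, the asserts confirm both that the round-trip decoding is
correct and that the two gates share an identical upper word.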
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 6f8121ee59..9b3e4fc0ba 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -527,6 +527,7 @@ void do_ ## _name(struct cpu_user_regs *regs)
 DECLARE_TRAP_HANDLER(divide_error);
 DECLARE_TRAP_HANDLER(debug);
 DECLARE_TRAP_HANDLER(nmi);
+DECLARE_TRAP_HANDLER(nmi_crash);
 DECLARE_TRAP_HANDLER(int3);
 DECLARE_TRAP_HANDLER(overflow);
 DECLARE_TRAP_HANDLER(bounds);
@@ -545,6 +546,9 @@ DECLARE_TRAP_HANDLER(alignment_check);
 DECLARE_TRAP_HANDLER(spurious_interrupt_bug);
 #undef DECLARE_TRAP_HANDLER
 
+void trap_nop(void);
+void enable_nmis(void);
+
 void syscall_enter(void);
 void sysenter_entry(void);
 void sysenter_eflags_saved(void);
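Note: the DECLARE_TRAP_HANDLER(nmi_crash) line gives C code both the asm
entry point (nmi_crash, defined in entry.S above) and the prototype for the
C-level handler (do_nmi_crash). A simplified sketch of the expansion pattern
(the stub struct and macro body here are abbreviated, not the exact
processor.h definition):

    struct cpu_user_regs;

    #define DECLARE_TRAP_HANDLER(_name)                      \
        void _name(void);                  /* asm entry   */ \
        void do_ ## _name(struct cpu_user_regs *regs)        /* C handler */

    DECLARE_TRAP_HANDLER(nmi_crash);  /* nmi_crash() and do_nmi_crash() */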