xen/arch/x86/crash.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217

/******************************************************************************
 * crash.c
 *
 * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
 *
 * Xen port written by:
 * - Simon 'Horms' Horman <horms@verge.net.au>
 * - Magnus Damm <magnus@valinux.co.jp>
 */

#include <asm/atomic.h>
#include <asm/elf.h>
#include <asm/percpu.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <asm/nmi.h>
#include <xen/string.h>
#include <xen/elf.h>
#include <xen/elfcore.h>
#include <xen/smp.h>
#include <xen/delay.h>
#include <xen/perfc.h>
#include <xen/kexec.h>
#include <xen/sched.h>
#include <xen/keyhandler.h>
#include <public/xen.h>
#include <asm/shared.h>
#include <asm/hvm/support.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <xen/iommu.h>
#include <asm/hpet.h>

/*
 * CPUs which have not yet completed their crash shutdown.  Populated by
 * nmi_shootdown_cpus() with every online CPU except the crashing one; each
 * CPU clears its own bit in do_nmi_crash() once it has saved its state.
 */
static cpumask_t waiting_to_crash;
/* The CPU which is running nmi_shootdown_cpus(); set there before NMIs go out. */
static unsigned int crashing_cpu;
/* Per-CPU flag set by do_nmi_crash() so state is saved at most once per CPU. */
static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);

/*
 * This becomes the NMI handler for non-crashing CPUs, when Xen is crashing.
 *
 * On the first entry on a given CPU it disables the IST for that CPU's
 * machine-check vector, saves crash state via kexec_crash_save_cpu(), calls
 * __stop_this_cpu(), and then marks the CPU as done by setting its
 * crash_save_done flag and clearing its bit in waiting_to_crash.  On every
 * entry it then queues a self-NMI (when the LAPIC is in xAPIC or x2APIC
 * mode) and halts forever; the function never returns.
 *
 * 'regs' is not referenced directly by this function.
 */
void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
{
    int cpu = smp_processor_id();

    /* nmi_shootdown_cpus() should ensure that this assertion is correct. */
    ASSERT(cpu != crashing_cpu);

    /* Save crash information and shut down CPU.  Attempt only once. */
    if ( !this_cpu(crash_save_done) )
    {
        /* Disable the interrupt stack table for the MCE handler.  This
         * prevents race conditions between clearing MCIP and receiving a
         * new MCE, during which the exception frame would be clobbered
         * and the MCE handler fall into an infinite loop.  We are soon
         * going to disable the NMI watchdog, so the loop would not be
         * caught.
         *
         * We do not need to change the NMI IST, as the nmi_crash
         * handler is immune to corrupt exception frames, by virtue of
         * being designed never to return.
         *
         * This update is safe from a security point of view, as this
         * pcpu is never going to try to sysret back to a PV vcpu.
         */
        set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);

        kexec_crash_save_cpu();
        __stop_this_cpu();

        /* Only report completion after the state has been saved and the CPU
         * stopped; nmi_shootdown_cpus() polls waiting_to_crash. */
        this_cpu(crash_save_done) = 1;
        cpumask_clear_cpu(cpu, &waiting_to_crash);
    }

    /* Poor man's self_nmi().  __stop_this_cpu() has reverted the LAPIC
     * back to its boot state, so we are unable to rely on the regular
     * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
     * (The likely scenario is that we have reverted from x2apic mode to
     * xapic, at which point #GPFs will occur if we use the apic_*
     * functions)
     *
     * The ICR and APIC ID of the LAPIC are still valid even during
     * software disable (Intel SDM Vol 3, 10.4.7.2).  As a result, we
     * can deliberately queue up another NMI at the LAPIC which will not
     * be delivered as the hardware NMI latch is currently in effect.
     * This means that if NMIs become unlatched (e.g. following a
     * non-fatal MCE), the LAPIC will force us back here rather than
     * wandering back into regular Xen code.
     */
    switch ( current_local_apic_mode() )
    {
        /* Declared at switch scope so that both cases below can share it. */
        u32 apic_id;

    case APIC_MODE_X2APIC:
        apic_id = apic_rdmsr(APIC_ID);

        /* Single 64-bit ICR write, with the destination ID in bits 63:32. */
        apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL
                   | ((u64)apic_id << 32));
        break;

    case APIC_MODE_XAPIC:
        apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));

        /* Spin until APIC_ICR_BUSY clears before touching the ICR. */
        while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
            cpu_relax();

        /* Destination is written to ICR2 before the command goes to ICR. */
        apic_mem_write(APIC_ICR2, apic_id << 24);
        apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
        break;

    default:
        /* Any other reported mode: no self-NMI is queued. */
        break;
    }

    /* Park this CPU; the loop re-halts should halt() ever return. */
    for ( ; ; )
        halt();
}

/*
 * Bring every other online CPU to a halt by NMI, then shut down the
 * interrupt hardware of the crashing CPU itself.
 *
 * The calling CPU is recorded as 'crashing_cpu'.  All other online CPUs are
 * placed in 'waiting_to_crash' and are expected to remove themselves from
 * that mask from within do_nmi_crash().  The wait for them is bounded to one
 * second; the outcome is reported on the console either way.
 */
static void nmi_shootdown_cpus(void)
{
    int self_cpu = smp_processor_id();
    int pcpu;
    unsigned long remaining_ms;

    local_irq_disable();

    crashing_cpu = self_cpu;
    local_irq_count(crashing_cpu) = 0;

    /* Everyone online, apart from ourselves, has to check in. */
    cpumask_andnot(&waiting_to_crash, &cpu_online_map, cpumask_of(self_cpu));

    /*
     * Re-point the NMI vector in every IDT which exists.  Other pcpus are
     * given nmi_crash, which invokes do_nmi_crash(): they write out their
     * state and then loop forever.  The crashing pcpu is given the nop
     * handler, so that any NMI it takes returns here as soon as possible.
     */
    for ( pcpu = 0; pcpu < nr_cpu_ids; pcpu++ )
    {
        if ( !idt_tables[pcpu] )
            continue;

        if ( pcpu != self_cpu )
        {
            /* Only the handler address changes; the stack table is left. */
            _update_gate_addr_lower(&idt_tables[pcpu][TRAP_nmi], &nmi_crash);
            continue;
        }

        /*
         * Crashing pcpu: drop the interrupt stack tables for both the NMI
         * and MCE vectors, and make the NMI handler a no-op.  Without the
         * stack tables there is no stack corruption race, and a no-op
         * handler helps to avoid cascading faults; by this point we are
         * certainly going to crash.
         *
         * From a security point of view this is safe, because this pcpu
         * will never try to sysret back to a PV vcpu.
         */
        _set_gate_lower(&idt_tables[pcpu][TRAP_nmi], 14, 0, &trap_nop);
        set_ist(&idt_tables[pcpu][TRAP_machine_check], IST_NONE);
    }

    /* The new handlers must be visible before any NMI is sent. */
    wmb();

    smp_send_nmi_allbutself();

    /* Poll in 1ms steps, for at most one second, for the others to stop. */
    for ( remaining_ms = 1000;
          !cpumask_empty(&waiting_to_crash) && remaining_ms;
          remaining_ms-- )
        mdelay(1);

    /* Record on the console how successful the shootdown was. */
    if ( !cpumask_empty(&waiting_to_crash) )
    {
        cpulist_scnprintf(keyhandler_scratch, sizeof(keyhandler_scratch),
                          &waiting_to_crash);
        printk("Failed to shoot down CPUs {%s}\n", keyhandler_scratch);
    }
    else
        printk("Shot down all CPUs\n");

    /*
     * The crashdump kernel does not boot happily with interrupt/dma
     * remapping left enabled, so crash-shutdown all IOMMU functionality.
     */
    iommu_crash_shutdown();

    __stop_this_cpu();

    /*
     * Somewhat of a hack, owing to the problems with the x2apic_enabled
     * variable: resynchronise it with the mode the LAPIC is really in.
     * Doing better would need a significant refactoring of the APIC code.
     */
    x2apic_enabled = (current_local_apic_mode() == APIC_MODE_X2APIC);

    disable_IO_APIC();
    hpet_disable();
}

/*
 * Stop all other CPUs and quiesce this one via nmi_shootdown_cpus(), then
 * fill in the fields of the crash info record returned by
 * kexec_crash_save_info(): Xen's physical start address and dom0's
 * pfn-to-mfn frame list list.
 */
void machine_crash_shutdown(void)
{
    crash_xen_info_t *crash_info;

    /* Must come first: the record is only written once CPUs are stopped. */
    nmi_shootdown_cpus();

    crash_info = kexec_crash_save_info();

    crash_info->xen_phys_start = xen_phys_start;
    crash_info->dom0_pfn_to_mfn_frame_list_list =
        arch_get_pfn_to_mfn_frame_list_list(dom0);
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */