aboutsummaryrefslogtreecommitdiffstats
path: root/stubdom/grub/kexec.c
blob: cef357eff99b2c70cd8ca1ce5b40fa38baf0a9e8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
/*
 * This supports booting another PV kernel from Mini-OS
 *
 * The idea is to setup it using libxc, answer to day0 memory allocation
 * requests, and using a trampoline boot page to switch to the new page table.
 *
 * The procedure of the boot page is:
 * - map itself at the target position (that may overwrite some C stuff, but we
 *   do not care any more)
 * - jump there
 * - switch to the target page table
 * - unpin the old page table
 * - jump to the new kernel
 *
 * Samuel Thibault <Samuel.Thibault@eu.citrix.com>, May 2008
 */
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mman.h>

#include <xenctrl.h>
#include <xc_dom.h>

#include <kernel.h>
#include <console.h>
#include <os.h>
#include <blkfront.h>
#include <netfront.h>
#include <fbfront.h>
#include <tpmfront.h>
#include <shared.h>
#include <byteswap.h>

#include "mini-os.h"

#if 0
#define DEBUG(fmt, ...) printk(fmt, ## __VA_ARGS__)
#else
#define DEBUG(fmt, ...) (void)0
#endif

/* Assembly boot page from boot.S */
extern void _boot_page;
extern pgentry_t _boot_page_entry;
extern unsigned long _boot_pdmfn;
extern unsigned long _boot_stack, _boot_target, _boot_start_info, _boot_start;
extern xen_pfn_t _boot_oldpdmfn;
extern void _boot(void);

static unsigned long *pages;
static unsigned long *pages_mfns;
static xen_pfn_t *pages_moved2pfns;
static unsigned long allocated;

int pin_table(xc_interface *xc_handle, unsigned int type, unsigned long mfn,
              domid_t dom);

#define TPM_TAG_RQU_COMMAND 0xC1
#define TPM_ORD_Extend 20

struct pcr_extend_cmd {
	uint16_t tag;
	uint32_t size;
	uint32_t ord;

	uint32_t pcr;
	unsigned char hash[20];
} __attribute__((packed));

/* Not imported from polarssl's header since the prototype unhelpfully defines
 * the input as unsigned char, which causes pointer type mismatches */
void sha1(const void *input, size_t ilen, unsigned char output[20]);

/* We need mfn to appear as target_pfn, so exchange with the MFN there */
static void do_exchange(struct xc_dom_image *dom, xen_pfn_t target_pfn, xen_pfn_t source_mfn)
{
    xen_pfn_t source_pfn;
    xen_pfn_t target_mfn;

    for (source_pfn = 0; source_pfn < start_info.nr_pages; source_pfn++)
        if (dom->p2m_host[source_pfn] == source_mfn)
            break;
    ASSERT(source_pfn < start_info.nr_pages);

    target_mfn = dom->p2m_host[target_pfn];

    /* Put target MFN at source PFN */
    dom->p2m_host[source_pfn] = target_mfn;

    /* Put source MFN at target PFN */
    dom->p2m_host[target_pfn] = source_mfn;
}

int kexec_allocate(struct xc_dom_image *dom, xen_vaddr_t up_to)
{
    unsigned long new_allocated = (up_to - dom->parms.virt_base) / PAGE_SIZE;
    unsigned long i;

    pages = realloc(pages, new_allocated * sizeof(*pages));
    pages_mfns = realloc(pages_mfns, new_allocated * sizeof(*pages_mfns));
    pages_moved2pfns = realloc(pages_moved2pfns, new_allocated * sizeof(*pages_moved2pfns));
    for (i = allocated; i < new_allocated; i++) {
        /* Exchange old page of PFN i with a newly allocated page.  */
        xen_pfn_t old_mfn = dom->p2m_host[i];
        xen_pfn_t new_pfn;
        xen_pfn_t new_mfn;

        pages[i] = alloc_page();
        memset((void*) pages[i], 0, PAGE_SIZE);
        new_pfn = PHYS_PFN(to_phys(pages[i]));
        pages_mfns[i] = new_mfn = pfn_to_mfn(new_pfn);

	/*
	 * If PFN of newly allocated page (new_pfn) is less then currently
	 * requested PFN (i) then look for relevant PFN/MFN pair. In this
	 * situation dom->p2m_host[new_pfn] no longer contains proper MFN
	 * because original page with new_pfn was moved earlier
	 * to different location.
	 */
	for (; new_pfn < i; new_pfn = pages_moved2pfns[new_pfn]);

	/* Store destination PFN of currently requested page. */
	pages_moved2pfns[i] = new_pfn;

        /* Put old page at new PFN */
        dom->p2m_host[new_pfn] = old_mfn;

        /* Put new page at PFN i */
        dom->p2m_host[i] = new_mfn;
    }

    allocated = new_allocated;

    return 0;
}

static void tpm_hash2pcr(struct xc_dom_image *dom, char *cmdline)
{
	struct tpmfront_dev* tpm = init_tpmfront(NULL);
	uint8_t *resp;
	size_t resplen = 0;
	struct pcr_extend_cmd cmd;

	/* If all guests have access to a vTPM, it may be useful to replace this
	 * with ASSERT(tpm) to prevent configuration errors from allowing a guest
	 * to boot without a TPM (or with a TPM that has not been sent any
	 * measurements, which could allow forging the measurements).
	 */
	if (!tpm)
		return;

	cmd.tag = bswap_16(TPM_TAG_RQU_COMMAND);
	cmd.size = bswap_32(sizeof(cmd));
	cmd.ord = bswap_32(TPM_ORD_Extend);
	cmd.pcr = bswap_32(4); // PCR #4 for kernel
	sha1(dom->kernel_blob, dom->kernel_size, cmd.hash);

	tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), &resp, &resplen);

	cmd.pcr = bswap_32(5); // PCR #5 for cmdline
	sha1(cmdline, strlen(cmdline), cmd.hash);
	tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), &resp, &resplen);

	cmd.pcr = bswap_32(5); // PCR #5 for initrd
	sha1(dom->ramdisk_blob, dom->ramdisk_size, cmd.hash);
	tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), &resp, &resplen);

	shutdown_tpmfront(tpm);
}

void kexec(void *kernel, long kernel_size, void *module, long module_size, char *cmdline, unsigned long flags)
{
    struct xc_dom_image *dom;
    int rc;
    domid_t domid = DOMID_SELF;
    xen_pfn_t pfn;
    xc_interface *xc_handle;
    unsigned long i;
    void *seg;
    xen_pfn_t boot_page_mfn = virt_to_mfn(&_boot_page);
    char features[] = "";
    struct mmu_update *m2p_updates;
    unsigned long nr_m2p_updates;

    DEBUG("booting with cmdline %s\n", cmdline);
    xc_handle = xc_interface_open(0,0,0);

    dom = xc_dom_allocate(xc_handle, cmdline, features);
    dom->allocate = kexec_allocate;

    /* We are using guest owned memory, therefore no limits. */
    xc_dom_kernel_max_size(dom, 0);
    xc_dom_ramdisk_max_size(dom, 0);

    dom->kernel_blob = kernel;
    dom->kernel_size = kernel_size;

    dom->ramdisk_blob = module;
    dom->ramdisk_size = module_size;

    dom->flags = flags;
    dom->console_evtchn = start_info.console.domU.evtchn;
    dom->xenstore_evtchn = start_info.store_evtchn;

    tpm_hash2pcr(dom, cmdline);

    if ( (rc = xc_dom_boot_xen_init(dom, xc_handle, domid)) != 0 ) {
        grub_printf("xc_dom_boot_xen_init returned %d\n", rc);
        errnum = ERR_BOOT_FAILURE;
        goto out;
    }
    if ( (rc = xc_dom_parse_image(dom)) != 0 ) {
        grub_printf("xc_dom_parse_image returned %d\n", rc);
        errnum = ERR_BOOT_FAILURE;
        goto out;
    }

#ifdef __i386__
    if (strcmp(dom->guest_type, "xen-3.0-x86_32p")) {
        grub_printf("can only boot x86 32 PAE kernels, not %s\n", dom->guest_type);
        errnum = ERR_EXEC_FORMAT;
        goto out;
    }
#endif
#ifdef __x86_64__
    if (strcmp(dom->guest_type, "xen-3.0-x86_64")) {
        grub_printf("can only boot x86 64 kernels, not %s\n", dom->guest_type);
        errnum = ERR_EXEC_FORMAT;
        goto out;
    }
#endif

    /* equivalent of xc_dom_mem_init */
    dom->arch_hooks = xc_dom_find_arch_hooks(xc_handle, dom->guest_type);
    dom->total_pages = start_info.nr_pages;

    /* equivalent of arch_setup_meminit */

    /* setup initial p2m */
    dom->p2m_host = malloc(sizeof(*dom->p2m_host) * dom->total_pages);

    /* Start with our current P2M */
    for (i = 0; i < dom->total_pages; i++)
        dom->p2m_host[i] = pfn_to_mfn(i);

    if ( (rc = xc_dom_build_image(dom)) != 0 ) {
        grub_printf("xc_dom_build_image returned %d\n", rc);
        errnum = ERR_BOOT_FAILURE;
        goto out;
    }

    /* copy hypercall page */
    /* TODO: domctl instead, but requires privileges */
    if (dom->parms.virt_hypercall != -1) {
        pfn = PHYS_PFN(dom->parms.virt_hypercall - dom->parms.virt_base);
        memcpy((void *) pages[pfn], hypercall_page, PAGE_SIZE);
    }

    /* Equivalent of xc_dom_boot_image */
    dom->shared_info_mfn = PHYS_PFN(start_info.shared_info);

    if (!xc_dom_compat_check(dom)) {
        grub_printf("xc_dom_compat_check failed\n");
        errnum = ERR_EXEC_FORMAT;
        goto out;
    }

    /* Move current console, xenstore and boot MFNs to the allocated place */
    do_exchange(dom, dom->console_pfn, start_info.console.domU.mfn);
    do_exchange(dom, dom->xenstore_pfn, start_info.store_mfn);
    DEBUG("virt base at %llx\n", dom->parms.virt_base);
    DEBUG("bootstack_pfn %lx\n", dom->bootstack_pfn);
    _boot_target = dom->parms.virt_base + PFN_PHYS(dom->bootstack_pfn);
    DEBUG("_boot_target %lx\n", _boot_target);
    do_exchange(dom, PHYS_PFN(_boot_target - dom->parms.virt_base),
            virt_to_mfn(&_boot_page));

    /* Make sure the bootstrap page table does not RW-map any of our current
     * page table frames */
    kexec_allocate(dom, dom->virt_pgtab_end);

    if ( (rc = xc_dom_update_guest_p2m(dom))) {
        grub_printf("xc_dom_update_guest_p2m returned %d\n", rc);
        errnum = ERR_BOOT_FAILURE;
        goto out;
    }

    if ( dom->arch_hooks->setup_pgtables )
        if ( (rc = dom->arch_hooks->setup_pgtables(dom))) {
            grub_printf("setup_pgtables returned %d\n", rc);
            errnum = ERR_BOOT_FAILURE;
            goto out;
        }

    /* start info page */
#undef start_info
    if ( dom->arch_hooks->start_info )
        dom->arch_hooks->start_info(dom);
#define start_info (start_info_union.start_info)

    xc_dom_log_memory_footprint(dom);

    /* Unmap libxc's projection of the boot page table */
    seg = xc_dom_seg_to_ptr(dom, &dom->pgtables_seg);
    munmap(seg, dom->pgtables_seg.vend - dom->pgtables_seg.vstart);

    /* Unmap day0 pages to avoid having a r/w mapping of the future page table */
    for (pfn = 0; pfn < allocated; pfn++)
        munmap((void*) pages[pfn], PAGE_SIZE);

    /* Pin the boot page table base */
    if ( (rc = pin_table(dom->xch,
#ifdef __i386__
                MMUEXT_PIN_L3_TABLE,
#endif
#ifdef __x86_64__
                MMUEXT_PIN_L4_TABLE,
#endif
                xc_dom_p2m_host(dom, dom->pgtables_seg.pfn),
                dom->guest_domid)) != 0 ) {
        grub_printf("pin_table(%lx) returned %d\n", xc_dom_p2m_host(dom,
                    dom->pgtables_seg.pfn), rc);
        errnum = ERR_BOOT_FAILURE;
        goto out_remap;
    }

    /* We populate the Mini-OS page table here so that boot.S can just call
     * update_va_mapping to project itself there.  */
    need_pgt(_boot_target);
    DEBUG("day0 pages %lx\n", allocated);
    DEBUG("boot target page %lx\n", _boot_target);
    DEBUG("boot page %p\n", &_boot_page);
    DEBUG("boot page mfn %lx\n", boot_page_mfn);
    _boot_page_entry = PFN_PHYS(boot_page_mfn) | L1_PROT;
    DEBUG("boot page entry %llx\n", _boot_page_entry);
    _boot_oldpdmfn = virt_to_mfn(start_info.pt_base);
    DEBUG("boot old pd mfn %lx\n", _boot_oldpdmfn);
    DEBUG("boot pd virt %lx\n", dom->pgtables_seg.vstart);
    _boot_pdmfn = dom->p2m_host[PHYS_PFN(dom->pgtables_seg.vstart - dom->parms.virt_base)];
    DEBUG("boot pd mfn %lx\n", _boot_pdmfn);
    _boot_stack = _boot_target + PAGE_SIZE;
    DEBUG("boot stack %lx\n", _boot_stack);
    _boot_start_info = dom->parms.virt_base + PFN_PHYS(dom->start_info_pfn);
    DEBUG("boot start info %lx\n", _boot_start_info);
    _boot_start = dom->parms.virt_entry;
    DEBUG("boot start %lx\n", _boot_start);

    /* Keep only useful entries */
    for (nr_m2p_updates = pfn = 0; pfn < start_info.nr_pages; pfn++)
        if (dom->p2m_host[pfn] != pfn_to_mfn(pfn))
            nr_m2p_updates++;

    m2p_updates = malloc(sizeof(*m2p_updates) * nr_m2p_updates);
    for (i = pfn = 0; pfn < start_info.nr_pages; pfn++)
        if (dom->p2m_host[pfn] != pfn_to_mfn(pfn)) {
            m2p_updates[i].ptr = PFN_PHYS(dom->p2m_host[pfn]) | MMU_MACHPHYS_UPDATE;
            m2p_updates[i].val = pfn;
            i++;
        }

    for (i = 0; i < blk_nb; i++)
        shutdown_blkfront(blk_dev[i]);
    if (net_dev)
        shutdown_netfront(net_dev);
    if (kbd_dev)
        shutdown_kbdfront(kbd_dev);
    stop_kernel();

    /* Update M2P */
    if ((rc = HYPERVISOR_mmu_update(m2p_updates, nr_m2p_updates, NULL, DOMID_SELF)) < 0) {
        xprintk("Could not update M2P\n");
        ASSERT(0);
    }

    xprintk("go!\n");

    /* Jump to trampoline boot page */
    _boot();

    ASSERT(0);

out_remap:
    for (pfn = 0; pfn < allocated; pfn++)
        do_map_frames(pages[pfn], &pages_mfns[pfn], 1, 0, 0, DOMID_SELF, 0, L1_PROT);
out:
    xc_dom_release(dom);
    for (pfn = 0; pfn < allocated; pfn++)
        free_page((void*)pages[pfn]);
    free(pages);
    free(pages_mfns);
    pages = NULL;
    pages_mfns = NULL;
    allocated = 0;
    xc_interface_close(xc_handle );
}