From 6fc8621b96e18503bf92a3b1651f35bf17af41a2 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 30 Nov 2006 12:38:51 +0000 Subject: [XEN] Kexec / Kdump: x86_64 specific code This patch contains the x86_64 implementation of Kexec / Kdump for Xen. Signed-Off-By: Magnus Damm Signed-Off-By: Simon Horman --- buildconfigs/linux-defconfig_xen_x86_64 | 1 + linux-2.6-xen-sparse/arch/x86_64/Kconfig | 2 +- linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c | 6 +- .../arch/x86_64/kernel/setup-xen.c | 13 + .../include/asm-x86_64/mach-xen/asm/hypercall.h | 7 + .../include/asm-x86_64/mach-xen/asm/ptrace.h | 2 + ...-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch | 375 +++++++++++++++++++++ ...2.6.19-rc1-kexec-move_segment_code-x86_64.patch | 161 +++++++++ .../linux-2.6.19-rc1-kexec-xen-x86_64.patch | 219 ++++++++++++ patches/linux-2.6.16.33/series | 3 + xen/arch/x86/x86_64/entry.S | 2 + xen/include/asm-x86/x86_64/elf.h | 72 +++- xen/include/asm-x86/x86_64/kexec.h | 23 +- 13 files changed, 877 insertions(+), 9 deletions(-) create mode 100644 patches/linux-2.6.16.33/git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch create mode 100644 patches/linux-2.6.16.33/linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch create mode 100644 patches/linux-2.6.16.33/linux-2.6.19-rc1-kexec-xen-x86_64.patch diff --git a/buildconfigs/linux-defconfig_xen_x86_64 b/buildconfigs/linux-defconfig_xen_x86_64 index 3f9599adb0..0bca78b24f 100644 --- a/buildconfigs/linux-defconfig_xen_x86_64 +++ b/buildconfigs/linux-defconfig_xen_x86_64 @@ -139,6 +139,7 @@ CONFIG_SWIOTLB=y CONFIG_PHYSICAL_START=0x100000 CONFIG_SECCOMP=y CONFIG_HZ_100=y +CONFIG_KEXEC=y # CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=100 diff --git a/linux-2.6-xen-sparse/arch/x86_64/Kconfig b/linux-2.6-xen-sparse/arch/x86_64/Kconfig index 79bd0044fa..5fcf581612 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig +++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig @@ -435,7 +435,7 @@ config X86_MCE_AMD config KEXEC bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL && !X86_64_XEN + depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c b/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c index 333a9a26f4..37427e6285 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c @@ -260,7 +260,11 @@ void __init e820_reserve_resources(struct e820entry *e820, int nr_map) request_resource(res, &data_resource); #endif #ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); +#ifdef CONFIG_XEN + xen_machine_kexec_register_resources(res); +#endif #endif } } diff --git a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c index 711ce5d198..5c630cac9e 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c @@ -80,6 +80,10 @@ #include #include +#ifdef CONFIG_XEN +#include +#endif + extern unsigned long start_pfn; extern struct edid_info edid_info; @@ -450,6 +454,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) * after a kernel panic. */ else if (!memcmp(from, "crashkernel=", 12)) { +#ifndef CONFIG_XEN unsigned long size, base; size = memparse(from+12, &from); if (*from == '@') { @@ -460,6 +465,10 @@ static __init void parse_cmdline_early (char ** cmdline_p) crashk_res.start = base; crashk_res.end = base + size - 1; } +#else + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); +#endif } #endif @@ -812,10 +821,14 @@ void __init setup_arch(char **cmdline_p) #endif #endif /* !CONFIG_XEN */ #ifdef CONFIG_KEXEC +#ifdef CONFIG_XEN + xen_machine_kexec_setup_resources(); +#else if (crashk_res.start != crashk_res.end) { reserve_bootmem(crashk_res.start, crashk_res.end - crashk_res.start + 1); } +#endif #endif paging_init(); diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h index 956a4c4b0d..5c6c8deff6 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h @@ -396,4 +396,11 @@ HYPERVISOR_xenoprof_op( return _hypercall2(int, xenoprof_op, op, arg); } +static inline int +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + #endif /* __HYPERCALL_H__ */ diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h index 052f05bca8..e6801fa7af 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/ptrace.h @@ -90,6 +90,8 @@ extern unsigned long profile_pc(struct pt_regs *regs); #define profile_pc(regs) instruction_pointer(regs) #endif +#include + void signal_fault(struct pt_regs *regs, void __user *frame, char *where); struct task_struct; diff --git a/patches/linux-2.6.16.33/git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch b/patches/linux-2.6.16.33/git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch new file mode 100644 index 0000000000..6e1f65fe9f --- /dev/null +++ b/patches/linux-2.6.16.33/git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch @@ -0,0 +1,375 @@ +From: Magnus Damm +Date: Tue, 26 Sep 2006 08:52:38 +0000 (+0200) +Subject: [PATCH] Avoid overwriting the current pgd (V4, x86_64) +X-Git-Tag: v2.6.19-rc1 +X-Git-Url: http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f + +[PATCH] Avoid overwriting the current pgd (V4, x86_64) + +kexec: Avoid overwriting the current pgd (V4, x86_64) + +This patch upgrades the x86_64-specific kexec code to avoid overwriting the +current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used +to start a secondary kernel that dumps the memory of the previous kernel. + +The code introduces a new set of page tables. These tables are used to provide +an executable identity mapping without overwriting the current pgd. + +Signed-off-by: Magnus Damm +Signed-off-by: Andi Kleen +--- + +--- a/arch/x86_64/kernel/machine_kexec.c ++++ b/arch/x86_64/kernel/machine_kexec.c +@@ -15,6 +15,15 @@ + #include + #include + ++#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) ++static u64 kexec_pgd[512] PAGE_ALIGNED; ++static u64 kexec_pud0[512] PAGE_ALIGNED; ++static u64 kexec_pmd0[512] PAGE_ALIGNED; ++static u64 kexec_pte0[512] PAGE_ALIGNED; ++static u64 kexec_pud1[512] PAGE_ALIGNED; ++static u64 kexec_pmd1[512] PAGE_ALIGNED; ++static u64 kexec_pte1[512] PAGE_ALIGNED; ++ + static void init_level2_page(pmd_t *level2p, unsigned long addr) + { + unsigned long end_addr; +@@ -144,32 +153,19 @@ static void load_segments(void) + ); + } + +-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, +- unsigned long control_code_buffer, +- unsigned long start_address, +- unsigned long pgtable) ATTRIB_NORET; +- +-extern const unsigned char relocate_new_kernel[]; +-extern const unsigned long relocate_new_kernel_size; +- + int machine_kexec_prepare(struct kimage *image) + { +- unsigned long start_pgtable, control_code_buffer; ++ unsigned long start_pgtable; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; +- control_code_buffer = start_pgtable + PAGE_SIZE; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) + return result; + +- /* Place the code in the reboot code buffer */ +- memcpy(__va(control_code_buffer), relocate_new_kernel, +- relocate_new_kernel_size); +- + return 0; + } + +@@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage + */ + NORET_TYPE void machine_kexec(struct kimage *image) + { +- unsigned long page_list; +- unsigned long control_code_buffer; +- unsigned long start_pgtable; +- relocate_new_kernel_t rnk; ++ unsigned long page_list[PAGES_NR]; ++ void *control_page; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + +- /* Calculate the offsets */ +- page_list = image->head; +- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; +- control_code_buffer = start_pgtable + PAGE_SIZE; ++ control_page = page_address(image->control_code_page) + PAGE_SIZE; ++ memcpy(control_page, relocate_kernel, PAGE_SIZE); + +- /* Set the low half of the page table to my identity mapped +- * page table for kexec. Leave the high half pointing at the +- * kernel pages. Don't bother to flush the global pages +- * as that will happen when I fully switch to my identity mapped +- * page table anyway. +- */ +- memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); +- __flush_tlb(); ++ page_list[PA_CONTROL_PAGE] = __pa(control_page); ++ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; ++ page_list[PA_PGD] = __pa(kexec_pgd); ++ page_list[VA_PGD] = (unsigned long)kexec_pgd; ++ page_list[PA_PUD_0] = __pa(kexec_pud0); ++ page_list[VA_PUD_0] = (unsigned long)kexec_pud0; ++ page_list[PA_PMD_0] = __pa(kexec_pmd0); ++ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; ++ page_list[PA_PTE_0] = __pa(kexec_pte0); ++ page_list[VA_PTE_0] = (unsigned long)kexec_pte0; ++ page_list[PA_PUD_1] = __pa(kexec_pud1); ++ page_list[VA_PUD_1] = (unsigned long)kexec_pud1; ++ page_list[PA_PMD_1] = __pa(kexec_pmd1); ++ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; ++ page_list[PA_PTE_1] = __pa(kexec_pte1); ++ page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + ++ page_list[PA_TABLE_PAGE] = ++ (unsigned long)__pa(page_address(image->control_code_page)); + + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is +@@ -222,9 +224,10 @@ NORET_TYPE void machine_kexec(struct kim + */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); ++ + /* now call it */ +- rnk = (relocate_new_kernel_t) control_code_buffer; +- (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); ++ relocate_kernel((unsigned long)image->head, (unsigned long)page_list, ++ image->start); + } + + /* crashkernel=size@addr specifies the location to reserve for +--- a/arch/x86_64/kernel/relocate_kernel.S ++++ b/arch/x86_64/kernel/relocate_kernel.S +@@ -7,31 +7,169 @@ + */ + + #include ++#include ++#include + +- /* +- * Must be relocatable PIC code callable as a C function, that once +- * it starts can not use the previous processes stack. +- */ +- .globl relocate_new_kernel ++/* ++ * Must be relocatable PIC code callable as a C function ++ */ ++ ++#define PTR(x) (x << 3) ++#define PAGE_ALIGNED (1 << PAGE_SHIFT) ++#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ ++ ++ .text ++ .align PAGE_ALIGNED + .code64 ++ .globl relocate_kernel ++relocate_kernel: ++ /* %rdi indirection_page ++ * %rsi page_list ++ * %rdx start address ++ */ ++ ++ /* map the control page at its virtual address */ ++ ++ movq $0x0000ff8000000000, %r10 /* mask */ ++ mov $(39 - 3), %cl /* bits to shift */ ++ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ ++ ++ movq %r11, %r9 ++ andq %r10, %r9 ++ shrq %cl, %r9 ++ ++ movq PTR(VA_PGD)(%rsi), %r8 ++ addq %r8, %r9 ++ movq PTR(PA_PUD_0)(%rsi), %r8 ++ orq $PAGE_ATTR, %r8 ++ movq %r8, (%r9) ++ ++ shrq $9, %r10 ++ sub $9, %cl ++ ++ movq %r11, %r9 ++ andq %r10, %r9 ++ shrq %cl, %r9 ++ ++ movq PTR(VA_PUD_0)(%rsi), %r8 ++ addq %r8, %r9 ++ movq PTR(PA_PMD_0)(%rsi), %r8 ++ orq $PAGE_ATTR, %r8 ++ movq %r8, (%r9) ++ ++ shrq $9, %r10 ++ sub $9, %cl ++ ++ movq %r11, %r9 ++ andq %r10, %r9 ++ shrq %cl, %r9 ++ ++ movq PTR(VA_PMD_0)(%rsi), %r8 ++ addq %r8, %r9 ++ movq PTR(PA_PTE_0)(%rsi), %r8 ++ orq $PAGE_ATTR, %r8 ++ movq %r8, (%r9) ++ ++ shrq $9, %r10 ++ sub $9, %cl ++ ++ movq %r11, %r9 ++ andq %r10, %r9 ++ shrq %cl, %r9 ++ ++ movq PTR(VA_PTE_0)(%rsi), %r8 ++ addq %r8, %r9 ++ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 ++ orq $PAGE_ATTR, %r8 ++ movq %r8, (%r9) ++ ++ /* identity map the control page at its physical address */ ++ ++ movq $0x0000ff8000000000, %r10 /* mask */ ++ mov $(39 - 3), %cl /* bits to shift */ ++ movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ ++ ++ movq %r11, %r9 ++ andq %r10, %r9 ++ shrq %cl, %r9 ++ ++ movq PTR(VA_PGD)(%rsi), %r8 ++ addq %r8, %r9 ++ movq PTR(PA_PUD_1)(%rsi), %r8 ++ orq $PAGE_ATTR, %r8 ++ movq %r8, (%r9) ++ ++ shrq $9, %r10 ++ sub $9, %cl ++ ++ movq %r11, %r9 ++ andq %r10, %r9 ++ shrq %cl, %r9 ++ ++ movq PTR(VA_PUD_1)(%rsi), %r8 ++ addq %r8, %r9 ++ movq PTR(PA_PMD_1)(%rsi), %r8 ++ orq $PAGE_ATTR, %r8 ++ movq %r8, (%r9) ++ ++ shrq $9, %r10 ++ sub $9, %cl ++ ++ movq %r11, %r9 ++ andq %r10, %r9 ++ shrq %cl, %r9 ++ ++ movq PTR(VA_PMD_1)(%rsi), %r8 ++ addq %r8, %r9 ++ movq PTR(PA_PTE_1)(%rsi), %r8 ++ orq $PAGE_ATTR, %r8 ++ movq %r8, (%r9) ++ ++ shrq $9, %r10 ++ sub $9, %cl ++ ++ movq %r11, %r9 ++ andq %r10, %r9 ++ shrq %cl, %r9 ++ ++ movq PTR(VA_PTE_1)(%rsi), %r8 ++ addq %r8, %r9 ++ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 ++ orq $PAGE_ATTR, %r8 ++ movq %r8, (%r9) ++ + relocate_new_kernel: +- /* %rdi page_list +- * %rsi reboot_code_buffer ++ /* %rdi indirection_page ++ * %rsi page_list + * %rdx start address +- * %rcx page_table +- * %r8 arg5 +- * %r9 arg6 + */ + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + +- /* set a new stack at the bottom of our page... */ +- lea 4096(%rsi), %rsp ++ /* get physical address of control page now */ ++ /* this is impossible after page table switch */ ++ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 ++ ++ /* get physical address of page table now too */ ++ movq PTR(PA_TABLE_PAGE)(%rsi), %rcx ++ ++ /* switch to new set of page tables */ ++ movq PTR(PA_PGD)(%rsi), %r9 ++ movq %r9, %cr3 ++ ++ /* setup a new stack at the end of the physical control page */ ++ lea 4096(%r8), %rsp ++ ++ /* jump to identity mapped page */ ++ addq $(identity_mapped - relocate_kernel), %r8 ++ pushq %r8 ++ ret + +- /* store the parameters back on the stack */ +- pushq %rdx /* store the start address */ ++identity_mapped: ++ /* store the start address on the stack */ ++ pushq %rdx + + /* Set cr0 to a known state: + * 31 1 == Paging enabled +@@ -136,8 +274,3 @@ relocate_new_kernel: + xorq %r15, %r15 + + ret +-relocate_new_kernel_end: +- +- .globl relocate_new_kernel_size +-relocate_new_kernel_size: +- .quad relocate_new_kernel_end - relocate_new_kernel +--- a/include/asm-x86_64/kexec.h ++++ b/include/asm-x86_64/kexec.h +@@ -1,6 +1,27 @@ + #ifndef _X86_64_KEXEC_H + #define _X86_64_KEXEC_H + ++#define PA_CONTROL_PAGE 0 ++#define VA_CONTROL_PAGE 1 ++#define PA_PGD 2 ++#define VA_PGD 3 ++#define PA_PUD_0 4 ++#define VA_PUD_0 5 ++#define PA_PMD_0 6 ++#define VA_PMD_0 7 ++#define PA_PTE_0 8 ++#define VA_PTE_0 9 ++#define PA_PUD_1 10 ++#define VA_PUD_1 11 ++#define PA_PMD_1 12 ++#define VA_PMD_1 13 ++#define PA_PTE_1 14 ++#define VA_PTE_1 15 ++#define PA_TABLE_PAGE 16 ++#define PAGES_NR 17 ++ ++#ifndef __ASSEMBLY__ ++ + #include + + #include +@@ -64,4 +85,12 @@ static inline void crash_setup_regs(stru + newregs->rip = (unsigned long)current_text_addr(); + } + } ++ ++NORET_TYPE void ++relocate_kernel(unsigned long indirection_page, ++ unsigned long page_list, ++ unsigned long start_address) ATTRIB_NORET; ++ ++#endif /* __ASSEMBLY__ */ ++ + #endif /* _X86_64_KEXEC_H */ diff --git a/patches/linux-2.6.16.33/linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch b/patches/linux-2.6.16.33/linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch new file mode 100644 index 0000000000..1cf613284f --- /dev/null +++ b/patches/linux-2.6.16.33/linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch @@ -0,0 +1,161 @@ +kexec: Move asm segment handling code to the assembly file (x86_64) + +This patch moves the idt, gdt, and segment handling code from machine_kexec.c +to relocate_kernel.S. The main reason behind this move is to avoid code +duplication in the Xen hypervisor. With this patch all code required to kexec +is put on the control page. + +On top of that this patch also counts as a cleanup - I think it is much +nicer to write assembly directly in assembly files than wrap inline assembly +in C functions for no apparent reason. + +Signed-off-by: Magnus Damm +--- + + Applies to 2.6.19-rc1. + + machine_kexec.c | 58 ----------------------------------------------------- + relocate_kernel.S | 50 +++++++++++++++++++++++++++++++++++++++++---- + 2 files changed, 45 insertions(+), 63 deletions(-) + +--- 0002/arch/x86_64/kernel/machine_kexec.c ++++ work/arch/x86_64/kernel/machine_kexec.c 2006-10-05 16:15:49.000000000 +0900 +@@ -112,47 +112,6 @@ static int init_pgtable(struct kimage *i + return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); + } + +-static void set_idt(void *newidt, u16 limit) +-{ +- struct desc_ptr curidt; +- +- /* x86-64 supports unaliged loads & stores */ +- curidt.size = limit; +- curidt.address = (unsigned long)newidt; +- +- __asm__ __volatile__ ( +- "lidtq %0\n" +- : : "m" (curidt) +- ); +-}; +- +- +-static void set_gdt(void *newgdt, u16 limit) +-{ +- struct desc_ptr curgdt; +- +- /* x86-64 supports unaligned loads & stores */ +- curgdt.size = limit; +- curgdt.address = (unsigned long)newgdt; +- +- __asm__ __volatile__ ( +- "lgdtq %0\n" +- : : "m" (curgdt) +- ); +-}; +- +-static void load_segments(void) +-{ +- __asm__ __volatile__ ( +- "\tmovl %0,%%ds\n" +- "\tmovl %0,%%es\n" +- "\tmovl %0,%%ss\n" +- "\tmovl %0,%%fs\n" +- "\tmovl %0,%%gs\n" +- : : "a" (__KERNEL_DS) : "memory" +- ); +-} +- + int machine_kexec_prepare(struct kimage *image) + { + unsigned long start_pgtable; +@@ -209,23 +168,6 @@ NORET_TYPE void machine_kexec(struct kim + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); + +- /* The segment registers are funny things, they have both a +- * visible and an invisible part. Whenever the visible part is +- * set to a specific selector, the invisible part is loaded +- * with from a table in memory. At no other time is the +- * descriptor table in memory accessed. +- * +- * I take advantage of this here by force loading the +- * segments, before I zap the gdt with an invalid value. +- */ +- load_segments(); +- /* The gdt & idt are now invalid. +- * If you want to load them you must set up your own idt & gdt. +- */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); +- +- /* now call it */ + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start); + } +--- 0002/arch/x86_64/kernel/relocate_kernel.S ++++ work/arch/x86_64/kernel/relocate_kernel.S 2006-10-05 16:18:07.000000000 +0900 +@@ -159,13 +159,39 @@ relocate_new_kernel: + movq PTR(PA_PGD)(%rsi), %r9 + movq %r9, %cr3 + ++ /* setup idt */ ++ movq %r8, %rax ++ addq $(idt_80 - relocate_kernel), %rax ++ lidtq (%rax) ++ ++ /* setup gdt */ ++ movq %r8, %rax ++ addq $(gdt - relocate_kernel), %rax ++ movq %r8, %r9 ++ addq $((gdt_80 - relocate_kernel) + 2), %r9 ++ movq %rax, (%r9) ++ ++ movq %r8, %rax ++ addq $(gdt_80 - relocate_kernel), %rax ++ lgdtq (%rax) ++ ++ /* setup data segment registers */ ++ xorl %eax, %eax ++ movl %eax, %ds ++ movl %eax, %es ++ movl %eax, %fs ++ movl %eax, %gs ++ movl %eax, %ss ++ + /* setup a new stack at the end of the physical control page */ + lea 4096(%r8), %rsp + +- /* jump to identity mapped page */ +- addq $(identity_mapped - relocate_kernel), %r8 +- pushq %r8 +- ret ++ /* load new code segment and jump to identity mapped page */ ++ movq %r8, %rax ++ addq $(identity_mapped - relocate_kernel), %rax ++ pushq $(gdt_cs - gdt) ++ pushq %rax ++ lretq + + identity_mapped: + /* store the start address on the stack */ +@@ -272,5 +298,19 @@ identity_mapped: + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 +- + ret ++ ++ .align 16 ++gdt: ++ .quad 0x0000000000000000 /* NULL descriptor */ ++gdt_cs: ++ .quad 0x00af9a000000ffff ++gdt_end: ++ ++gdt_80: ++ .word gdt_end - gdt - 1 /* limit */ ++ .quad 0 /* base - filled in by code above */ ++ ++idt_80: ++ .word 0 /* limit */ ++ .quad 0 /* base */ diff --git a/patches/linux-2.6.16.33/linux-2.6.19-rc1-kexec-xen-x86_64.patch b/patches/linux-2.6.16.33/linux-2.6.19-rc1-kexec-xen-x86_64.patch new file mode 100644 index 0000000000..c387519889 --- /dev/null +++ b/patches/linux-2.6.16.33/linux-2.6.19-rc1-kexec-xen-x86_64.patch @@ -0,0 +1,219 @@ +--- 0001/arch/x86_64/kernel/crash.c ++++ work/arch/x86_64/kernel/crash.c +@@ -92,6 +92,7 @@ static void crash_save_self(struct pt_re + crash_save_this_cpu(regs, cpu); + } + ++#ifndef CONFIG_XEN + #ifdef CONFIG_SMP + static atomic_t waiting_for_crash_ipi; + +@@ -156,6 +157,7 @@ static void nmi_shootdown_cpus(void) + /* There are no cpus to shootdown */ + } + #endif ++#endif /* CONFIG_XEN */ + + void machine_crash_shutdown(struct pt_regs *regs) + { +@@ -173,6 +175,8 @@ void machine_crash_shutdown(struct pt_re + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = smp_processor_id(); ++ ++#ifndef CONFIG_XEN + nmi_shootdown_cpus(); + + if(cpu_has_apic) +@@ -181,6 +185,6 @@ void machine_crash_shutdown(struct pt_re + #if defined(CONFIG_X86_IO_APIC) + disable_IO_APIC(); + #endif +- ++#endif /* CONFIG_XEN */ + crash_save_self(regs); + } +--- 0010/arch/x86_64/kernel/machine_kexec.c ++++ work/arch/x86_64/kernel/machine_kexec.c +@@ -24,6 +24,104 @@ static u64 kexec_pud1[512] PAGE_ALIGNED; + static u64 kexec_pmd1[512] PAGE_ALIGNED; + static u64 kexec_pte1[512] PAGE_ALIGNED; + ++#ifdef CONFIG_XEN ++ ++/* In the case of Xen, override hypervisor functions to be able to create ++ * a regular identity mapping page table... ++ */ ++ ++#include ++#include ++ ++#define x__pmd(x) ((pmd_t) { (x) } ) ++#define x__pud(x) ((pud_t) { (x) } ) ++#define x__pgd(x) ((pgd_t) { (x) } ) ++ ++#define x_pmd_val(x) ((x).pmd) ++#define x_pud_val(x) ((x).pud) ++#define x_pgd_val(x) ((x).pgd) ++ ++static inline void x_set_pmd(pmd_t *dst, pmd_t val) ++{ ++ x_pmd_val(*dst) = x_pmd_val(val); ++} ++ ++static inline void x_set_pud(pud_t *dst, pud_t val) ++{ ++ x_pud_val(*dst) = phys_to_machine(x_pud_val(val)); ++} ++ ++static inline void x_pud_clear (pud_t *pud) ++{ ++ x_pud_val(*pud) = 0; ++} ++ ++static inline void x_set_pgd(pgd_t *dst, pgd_t val) ++{ ++ x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); ++} ++ ++static inline void x_pgd_clear (pgd_t * pgd) ++{ ++ x_pgd_val(*pgd) = 0; ++} ++ ++#define X__PAGE_KERNEL_LARGE_EXEC \ ++ _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE ++#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY ++ ++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) ++ ++#if PAGES_NR > KEXEC_XEN_NO_PAGES ++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break ++#endif ++ ++#if PA_CONTROL_PAGE != 0 ++#error PA_CONTROL_PAGE is non zero - Xen support will break ++#endif ++ ++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) ++{ ++ void *control_page; ++ void *table_page; ++ ++ memset(xki->page_list, 0, sizeof(xki->page_list)); ++ ++ control_page = page_address(image->control_code_page) + PAGE_SIZE; ++ memcpy(control_page, relocate_kernel, PAGE_SIZE); ++ ++ table_page = page_address(image->control_code_page); ++ ++ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); ++ xki->page_list[PA_TABLE_PAGE] = __ma(table_page); ++ ++ xki->page_list[PA_PGD] = __ma(kexec_pgd); ++ xki->page_list[PA_PUD_0] = __ma(kexec_pud0); ++ xki->page_list[PA_PUD_1] = __ma(kexec_pud1); ++ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); ++ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); ++ xki->page_list[PA_PTE_0] = __ma(kexec_pte0); ++ xki->page_list[PA_PTE_1] = __ma(kexec_pte1); ++} ++ ++#else /* CONFIG_XEN */ ++ ++#define x__pmd(x) __pmd(x) ++#define x__pud(x) __pud(x) ++#define x__pgd(x) __pgd(x) ++ ++#define x_set_pmd(x, y) set_pmd(x, y) ++#define x_set_pud(x, y) set_pud(x, y) ++#define x_set_pgd(x, y) set_pgd(x, y) ++ ++#define x_pud_clear(x) pud_clear(x) ++#define x_pgd_clear(x) pgd_clear(x) ++ ++#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC ++#define X_KERNPG_TABLE _KERNPG_TABLE ++ ++#endif /* CONFIG_XEN */ ++ + static void init_level2_page(pmd_t *level2p, unsigned long addr) + { + unsigned long end_addr; +@@ -31,7 +129,7 @@ static void init_level2_page(pmd_t *leve + addr &= PAGE_MASK; + end_addr = addr + PUD_SIZE; + while (addr < end_addr) { +- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); ++ x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); + addr += PMD_SIZE; + } + } +@@ -56,12 +154,12 @@ static int init_level3_page(struct kimag + } + level2p = (pmd_t *)page_address(page); + init_level2_page(level2p, addr); +- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); ++ x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE)); + addr += PUD_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { +- pud_clear(level3p++); ++ x_pud_clear(level3p++); + addr += PUD_SIZE; + } + out: +@@ -92,12 +190,12 @@ static int init_level4_page(struct kimag + if (result) { + goto out; + } +- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); ++ x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE)); + addr += PGDIR_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { +- pgd_clear(level4p++); ++ x_pgd_clear(level4p++); + addr += PGDIR_SIZE; + } + out: +@@ -108,8 +206,14 @@ out: + static int init_pgtable(struct kimage *image, unsigned long start_pgtable) + { + pgd_t *level4p; ++ unsigned long x_end_pfn = end_pfn; ++ ++#ifdef CONFIG_XEN ++ x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); ++#endif ++ + level4p = (pgd_t *)__va(start_pgtable); +- return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); ++ return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT); + } + + int machine_kexec_prepare(struct kimage *image) +--- 0009/include/asm-x86_64/kexec.h ++++ work/include/asm-x86_64/kexec.h +@@ -91,6 +91,19 @@ relocate_kernel(unsigned long indirectio + unsigned long page_list, + unsigned long start_address) ATTRIB_NORET; + ++/* Under Xen we need to work with machine addresses. These macros give the ++ * machine address of a certain page to the generic kexec code instead of ++ * the pseudo physical address which would be given by the default macros. ++ */ ++ ++#ifdef CONFIG_XEN ++#define KEXEC_ARCH_HAS_PAGE_MACROS ++#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page)) ++#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn)) ++#define kexec_virt_to_phys(addr) virt_to_machine(addr) ++#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr)) ++#endif ++ + #endif /* __ASSEMBLY__ */ + + #endif /* _X86_64_KEXEC_H */ diff --git a/patches/linux-2.6.16.33/series b/patches/linux-2.6.16.33/series index 780b7a3499..81cc316ad8 100644 --- a/patches/linux-2.6.16.33/series +++ b/patches/linux-2.6.16.33/series @@ -4,6 +4,9 @@ git-2a8a3d5b65e86ec1dfef7d268c64a909eab94af7.patch git-3566561bfadffcb5dbc85d576be80c0dbf2cccc9.patch linux-2.6.19-rc1-kexec-move_segment_code-i386.patch linux-2.6.19-rc1-kexec-xen-i386.patch +git-4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f.patch +linux-2.6.19-rc1-kexec-move_segment_code-x86_64.patch +linux-2.6.19-rc1-kexec-xen-x86_64.patch blktap-aio-16_03_06.patch device_bind.patch fix-hz-suspend.patch diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S index 3f2b8297c7..87602a3d86 100644 --- a/xen/arch/x86/x86_64/entry.S +++ b/xen/arch/x86/x86_64/entry.S @@ -559,6 +559,7 @@ ENTRY(hypercall_table) .quad do_hvm_op .quad do_sysctl /* 35 */ .quad do_domctl + .quad do_kexec_op .rept NR_hypercalls-((.-hypercall_table)/8) .quad do_ni_hypercall .endr @@ -601,6 +602,7 @@ ENTRY(hypercall_args_table) .byte 2 /* do_hvm_op */ .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ + .byte 2 /* do_kexec */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr diff --git a/xen/include/asm-x86/x86_64/elf.h b/xen/include/asm-x86/x86_64/elf.h index 8068c3ab5d..75f191b43c 100644 --- a/xen/include/asm-x86/x86_64/elf.h +++ b/xen/include/asm-x86/x86_64/elf.h @@ -1,16 +1,82 @@ #ifndef __X86_64_ELF_H__ #define __X86_64_ELF_H__ -#include /* for printk() used in stub */ +#include typedef struct { - unsigned long dummy; + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned long r12; + unsigned long rbp; + unsigned long rbx; + unsigned long r11; + unsigned long r10; + unsigned long r9; + unsigned long r8; + unsigned long rax; + unsigned long rcx; + unsigned long rdx; + unsigned long rsi; + unsigned long rdi; + unsigned long orig_rax; + unsigned long rip; + unsigned long cs; + unsigned long eflags; + unsigned long rsp; + unsigned long ss; + unsigned long thread_fs; + unsigned long thread_gs; + unsigned long ds; + unsigned long es; + unsigned long fs; + unsigned long gs; } ELF_Gregset; extern inline void elf_core_save_regs(ELF_Gregset *core_regs, crash_xen_core_t *xen_core_regs) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + unsigned long tmp; + + asm volatile("movq %%r15,%0" : "=m"(core_regs->r15)); + asm volatile("movq %%r14,%0" : "=m"(core_regs->r14)); + asm volatile("movq %%r13,%0" : "=m"(core_regs->r13)); + asm volatile("movq %%r12,%0" : "=m"(core_regs->r12)); + asm volatile("movq %%rbp,%0" : "=m"(core_regs->rbp)); + asm volatile("movq %%rbx,%0" : "=m"(core_regs->rbx)); + asm volatile("movq %%r11,%0" : "=m"(core_regs->r11)); + asm volatile("movq %%r10,%0" : "=m"(core_regs->r10)); + asm volatile("movq %%r9,%0" : "=m"(core_regs->r9)); + asm volatile("movq %%r8,%0" : "=m"(core_regs->r8)); + asm volatile("movq %%rax,%0" : "=m"(core_regs->rax)); + asm volatile("movq %%rcx,%0" : "=m"(core_regs->rcx)); + asm volatile("movq %%rdx,%0" : "=m"(core_regs->rdx)); + asm volatile("movq %%rsi,%0" : "=m"(core_regs->rsi)); + asm volatile("movq %%rdi,%0" : "=m"(core_regs->rdi)); + /* orig_rax not filled in for now */ + core_regs->rip = (unsigned long)current_text_addr(); + asm volatile("movl %%cs, %%eax;" :"=a"(core_regs->cs)); + asm volatile("pushfq; popq %0" :"=m"(core_regs->eflags)); + asm volatile("movq %%rsp,%0" : "=m"(core_regs->rsp)); + asm volatile("movl %%ss, %%eax;" :"=a"(core_regs->ss)); + /* thread_fs not filled in for now */ + /* thread_gs not filled in for now */ + asm volatile("movl %%ds, %%eax;" :"=a"(core_regs->ds)); + asm volatile("movl %%es, %%eax;" :"=a"(core_regs->es)); + asm volatile("movl %%fs, %%eax;" :"=a"(core_regs->fs)); + asm volatile("movl %%gs, %%eax;" :"=a"(core_regs->gs)); + + asm volatile("mov %%cr0, %0" : "=r" (tmp) : ); + xen_core_regs->cr0 = tmp; + + asm volatile("mov %%cr2, %0" : "=r" (tmp) : ); + xen_core_regs->cr2 = tmp; + + asm volatile("mov %%cr3, %0" : "=r" (tmp) : ); + xen_core_regs->cr3 = tmp; + + asm volatile("mov %%cr4, %0" : "=r" (tmp) : ); + xen_core_regs->cr4 = tmp; } #endif /* __X86_64_ELF_H__ */ diff --git a/xen/include/asm-x86/x86_64/kexec.h b/xen/include/asm-x86/x86_64/kexec.h index 3433ba7846..b9779aa9d7 100644 --- a/xen/include/asm-x86/x86_64/kexec.h +++ b/xen/include/asm-x86/x86_64/kexec.h @@ -1,14 +1,29 @@ +/****************************************************************************** + * kexec.h + * + * Based heavily on machine_kexec.c and kexec.h from Linux 2.6.19-rc1 + * + */ + #ifndef __X86_64_KEXEC_H__ #define __X86_64_KEXEC_H__ - -#include /* for printk() used in stub */ + #include -#include #include +#include + +typedef void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long page_list, + unsigned long start_address); static inline void machine_kexec(xen_kexec_image_t *image) { - printk("STUB: " __FILE__ ": %s: not implemented\n", __FUNCTION__); + relocate_new_kernel_t rnk; + + rnk = (relocate_new_kernel_t) image->page_list[1]; + (*rnk)(image->indirection_page, (unsigned long)image->page_list, + image->start_address); } #endif /* __X86_64_KEXEC_H__ */ -- cgit v1.2.3