diff options
28 files changed, 2099 insertions, 1020 deletions
@@ -411,7 +411,6 @@ 3ddb79c34BFiXjBJ_cCKB0aCsV1IDw xen/include/asm-i386/desc.h 3e564149UkU91RX7onzpCAmbj_IFjw xen/include/asm-i386/dma.h 3e20b82fl1jmQiKdLy7fxMcutfpjWA xen/include/asm-i386/domain_page.h -3ddb79c2O729EttZTYu1c8LcsUO_GQ xen/include/asm-i386/elf.h 3ddb79c3NU8Zy40OTrq3D-i30Y3t4A xen/include/asm-i386/fixmap.h 3e2d29944GI24gf7vOP_7x8EyuqxeA xen/include/asm-i386/flushtlb.h 3ddb79c39o75zPP0T1aQQ4mNrCAN2w xen/include/asm-i386/hardirq.h @@ -460,7 +459,6 @@ 404f1b9fl6AQ_a-T1TDK3fuwTPXmHw xen/include/asm-x86_64/desc.h 404f1ba05mjpUREtosjzz3PPL5cTJA xen/include/asm-x86_64/dma.h 404f1ba13mnjeZT2ytPm0DB63703nA xen/include/asm-x86_64/domain_page.h -404f1ba2IXQ7E0x9NlqpR5hgYtC9RQ xen/include/asm-x86_64/elf.h 404f1ba31i0gS-cdqvd0RZX1HVnxsA xen/include/asm-x86_64/fixmap.h 404f1ba4KXQ_V7HOkenF04KRU7Tl7w xen/include/asm-x86_64/flushtlb.h 404f1ba5Sqzc22eXORShvCF9-rpMbA xen/include/asm-x86_64/hardirq.h @@ -534,6 +532,7 @@ 3ddb79c1V44RD26YqCUm-kqIupM37A xen/include/xen/ctype.h 3ddb79c05DdHQ0UxX_jKsXdR4QlMCA xen/include/xen/delay.h 3ddb79c1uaWQZj551j1O0B5z8AnHOg xen/include/xen/elevator.h +3ddb79c2O729EttZTYu1c8LcsUO_GQ xen/include/xen/elf.h 3ddb79c0HIghfBF8zFUdmXhOU8i6hA xen/include/xen/errno.h 3ddb79c0rMjudDKkJku_mkm0J-BZgw xen/include/xen/etherdevice.h 3ddb79c0T3X07lFnM9OSE-W5bqIDSQ xen/include/xen/ethtool.h @@ -663,7 +662,7 @@ 3e5a4e66HdSkvIV6SJ1evG_xmTmXHA xenolinux-2.4.25-sparse/include/asm-xen/desc.h 4048c0e0_P2wUTiT6UqgPhn0s7yFcA xenolinux-2.4.25-sparse/include/asm-xen/evtchn.h 3e5a4e66SYp_UpAVcF8Lc1wa3Qtgzw xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h -3e5a4e67w_DWgjIJ17Tlossu1LGujQ xenolinux-2.4.25-sparse/include/asm-xen/highmem.h +406aeeaaQvl4RNtmd9hDEugBURbFpQ xenolinux-2.4.25-sparse/include/asm-xen/highmem.h 3e5a4e67YtcyDLQsShhCfQwPSELfvA xenolinux-2.4.25-sparse/include/asm-xen/hw_irq.h 3e5a4e677VBavzM1UZIEcH1B-RlXMA xenolinux-2.4.25-sparse/include/asm-xen/hypervisor.h 4060044fVx7-tokvNLKBf_6qBB4lqQ xenolinux-2.4.25-sparse/include/asm-xen/io.h @@ -697,6 
+696,7 @@ 3f9d4b44247udoqWEgFkaHiWv6Uvyg xenolinux-2.4.25-sparse/kernel/time.c 401c059bjLBFYHRD4Py2uM3eA1D4zQ xenolinux-2.4.25-sparse/kernel/timer.c 3e6e7c1efbQe93xCvOpOVCnXTMmQ5w xenolinux-2.4.25-sparse/mkbuildtree +406aeeafkrnCuIVWLFv3kfn4uAD5Eg xenolinux-2.4.25-sparse/mm/highmem.c 3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.25-sparse/mm/memory.c 3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.25-sparse/mm/mprotect.c 3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.25-sparse/mm/mremap.c diff --git a/extras/mini-os/Makefile b/extras/mini-os/Makefile index fc9c2bf733..d2d478ac40 100644 --- a/extras/mini-os/Makefile +++ b/extras/mini-os/Makefile @@ -24,17 +24,8 @@ hypervisor-ifs: ln -sf ../../../xen/include/hypervisor-ifs h/hypervisor-ifs $(TARGET): hypervisor-ifs head.o $(OBJS) - # Image will load at 0xC0000000. First bytes from head.o - #$(LD) -N -Ttext 0xC0000000 head.o $(OBJS) -o $@.elf $(LD) -N -T minios.lds head.o $(OBJS) -o $@.elf - # Guest OS header -- first 8 bytes are identifier 'XenGuest'. - echo -e -n 'XenGuest' >$@ - # Guest OS header -- next 4 bytes are load address (0xC0000000). - echo -e -n '\000\000\000\300' >>$@ - # Create a raw bag of bytes from the ELF image. - objcopy -O binary -R .note -R .comment $@.elf $@.raw - # Guest OS header is immediately followed by raw OS image. - cat $@.raw >>$@ + objcopy -R .note -R .comment $@.elf $@ gzip -f -9 -c $@ >$@.gz clean: diff --git a/extras/mini-os/head.S b/extras/mini-os/head.S index 5844e296c4..52eae8f818 100644 --- a/extras/mini-os/head.S +++ b/extras/mini-os/head.S @@ -1,48 +1,18 @@ #include <os.h> -/* Offsets in start_info structure */ -#define MOD_START 20 -#define MOD_LEN 24 - .globl _start, shared_info _start: cld - lss stack_start,%esp - - /* Copy any module somewhere safe before it's clobbered by BSS. 
*/ - mov MOD_LEN(%esi),%ecx - shr $2,%ecx - jz 2f /* bail from copy loop if no module */ - - mov $_end,%edi - add MOD_LEN(%esi),%edi - mov MOD_START(%esi),%eax - add MOD_LEN(%esi),%eax -1: sub $4,%eax - sub $4,%edi - mov (%eax),%ebx - mov %ebx,(%edi) - loop 1b - mov %edi,MOD_START(%esi) - - /* Clear BSS first so that there are no surprises... */ -2: xorl %eax,%eax - movl $__bss_start,%edi - movl $_end,%ecx - subl %edi,%ecx - rep stosb - push %esi call start_kernel - stack_start: .long stack+8192, __KERNEL_DS - - /* Unpleasant -- we actually use this PTE to map shared_info :-) */ + /* Unpleasant -- the PTE that maps this page is actually overwritten */ + /* to map the real shared-info page! :-) */ .org 0x1000 shared_info: .org 0x2000 diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c index f9f6949348..92fff33a6a 100644 --- a/tools/xc/lib/xc_linux_build.c +++ b/tools/xc/lib/xc_linux_build.c @@ -3,15 +3,24 @@ */ #include "xc_private.h" +#define ELFSIZE 32 +#include "xc_elf.h" #include <zlib.h> -/* This string is written to the head of every guest kernel image. */ -#define GUEST_SIG "XenGuest" -#define SIG_LEN 8 - #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) +#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) +#define round_pgdown(_p) ((_p)&PAGE_MASK) + +static int readelfimage_base_and_size(char *elfbase, + unsigned long elfsize, + unsigned long *pkernstart, + unsigned long *pkernend, + unsigned long *pkernentry); +static int loadelfimage(char *elfbase, int pmh, unsigned long *parray, + unsigned long vstart); + static long get_tot_pages(int xc_handle, u64 domid) { dom0_op_t op; @@ -43,25 +52,6 @@ static int get_pfn_list(int xc_handle, return (ret < 0) ? -1 : op.u.getmemlist.num_pfns; } -/* Read the kernel header, extracting the image size and load address. 
*/ -static int read_kernel_header(gzFile gfd, long dom_size, - unsigned long *load_addr) -{ - char signature[SIG_LEN]; - - gzread(gfd, signature, SIG_LEN); - if ( strncmp(signature, GUEST_SIG, SIG_LEN) ) - { - ERROR("Kernel image does not contain required signature"); - return -1; - } - - /* Read the load address which immediately follows the Xen signature. */ - gzread(gfd, load_addr, sizeof(unsigned long)); - - return 0; -} - static int copy_to_domain_page(int pm_handle, unsigned long dst_pfn, void *src_page) @@ -75,12 +65,11 @@ static int copy_to_domain_page(int pm_handle, } static int setup_guestos(int xc_handle, - u64 dom, - gzFile kernel_gfd, - gzFile initrd_gfd, - unsigned long tot_pages, - unsigned long *virt_startinfo_addr, - unsigned long virt_load_addr, + u64 dom, + char *image, unsigned long image_size, + gzFile initrd_gfd, unsigned long initrd_len, + unsigned long nr_pages, + unsigned long *pvsi, unsigned long *pvke, dom0_builddomain_t *builddomain, const char *cmdline, unsigned long shared_info_frame) @@ -88,140 +77,184 @@ static int setup_guestos(int xc_handle, l1_pgentry_t *vl1tab=NULL, *vl1e=NULL; l2_pgentry_t *vl2tab=NULL, *vl2e=NULL; unsigned long *page_array = NULL; - int alloc_index, num_pt_pages; unsigned long l2tab; unsigned long l1tab; - unsigned long count, pt_start, i, j; - unsigned long initrd_addr = 0, initrd_len = 0; + unsigned long count, i; start_info_t *start_info; shared_info_t *shared_info; - unsigned long ksize; mmu_t *mmu = NULL; - int pm_handle; + int pm_handle=-1, rc; + + unsigned long nr_pt_pages; + unsigned long ppt_alloc; + unsigned long *physmap, *physmap_e, physmap_pfn; + + unsigned long v_start; + unsigned long vkern_start; + unsigned long vkern_entry; + unsigned long vkern_end; + unsigned long vinitrd_start; + unsigned long vinitrd_end; + unsigned long vphysmap_start; + unsigned long vphysmap_end; + unsigned long vstartinfo_start; + unsigned long vstartinfo_end; + unsigned long vstack_start; + unsigned long vstack_end; + 
unsigned long vpt_start; + unsigned long vpt_end; + unsigned long v_end; + + rc = readelfimage_base_and_size(image, image_size, + &vkern_start, &vkern_end, &vkern_entry); + if ( rc != 0 ) + goto error_out; + + /* + * Why do we need this? The number of page-table frames depends on the + * size of the bootstrap address space. But the size of the address space + * depends on the number of page-table frames (since each one is mapped + * read-only). We have a pair of simultaneous equations in two unknowns, + * which we solve by exhaustive search. + */ + for ( nr_pt_pages = 2; ; nr_pt_pages++ ) + { + v_start = vkern_start & ~((1<<22)-1); + vinitrd_start = round_pgup(vkern_end); + vinitrd_end = vinitrd_start + initrd_len; + vphysmap_start = round_pgup(vinitrd_end); + vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long)); + vpt_start = round_pgup(vphysmap_end); + vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); + vstartinfo_start = vpt_end; + vstartinfo_end = vstartinfo_start + PAGE_SIZE; + vstack_start = vstartinfo_end; + vstack_end = vstack_start + PAGE_SIZE; + v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1); + if ( (v_end - vstack_end) < (512 << 10) ) + v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */ + if ( (((v_end - v_start) >> L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages ) + break; + } + + if ( (v_end - v_start) > (nr_pages * PAGE_SIZE) ) + { + printf("Initial guest OS requires too much space\n" + "(%luMB is greater than %luMB limit)\n", + (v_end-v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20); + goto error_out; + } + + printf("VIRTUAL MEMORY ARRANGEMENT:\n" + " Loaded kernel: %08lx->%08lx\n" + " Init. 
ramdisk: %08lx->%08lx\n" + " Phys-Mach map: %08lx->%08lx\n" + " Page tables: %08lx->%08lx\n" + " Start info: %08lx->%08lx\n" + " Boot stack: %08lx->%08lx\n" + " TOTAL: %08lx->%08lx\n", + vkern_start, vkern_end, + vinitrd_start, vinitrd_end, + vphysmap_start, vphysmap_end, + vpt_start, vpt_end, + vstartinfo_start, vstartinfo_end, + vstack_start, vstack_end, + v_start, v_end); + printf(" ENTRY ADDRESS: %08lx\n", vkern_entry); memset(builddomain, 0, sizeof(*builddomain)); if ( (pm_handle = init_pfn_mapper()) < 0 ) goto error_out; - if ( (page_array = malloc(tot_pages * sizeof(unsigned long))) == NULL ) + if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL ) { PERROR("Could not allocate memory"); goto error_out; } - if ( get_pfn_list(xc_handle, dom, page_array, tot_pages) != tot_pages ) + if ( get_pfn_list(xc_handle, dom, page_array, nr_pages) != nr_pages ) { PERROR("Could not get the page frame list"); goto error_out; } - /* Load the guest OS image. Let it take no more than 1/2 memory.*/ - for ( i = 0; i < ((tot_pages/2)*PAGE_SIZE); i += PAGE_SIZE ) - { - char page[PAGE_SIZE]; - int size; - if ( (size = gzread(kernel_gfd, page, PAGE_SIZE)) == -1 ) - { - PERROR("Error reading kernel image, could not" - " read the whole image."); - goto error_out; - } - if ( size == 0 ) - goto kernel_copied; - copy_to_domain_page(pm_handle, page_array[i>>PAGE_SHIFT], page); - } - ERROR("Kernel too big to safely fit in domain memory"); - goto error_out; - - kernel_copied: - /* ksize is kernel-image size rounded up to a page boundary. */ - ksize = i; + loadelfimage(image, pm_handle, page_array, v_start); /* Load the initial ramdisk image. 
*/ - if ( initrd_gfd ) + if ( initrd_len != 0 ) { - int size; - - for ( j=0, i=ksize; i < ((tot_pages/2) * PAGE_SIZE); i += PAGE_SIZE ) + for ( i = (vinitrd_start - v_start); + i < (vinitrd_end - v_start); i += PAGE_SIZE ) { char page[PAGE_SIZE]; - if ( (size = gzread(initrd_gfd, page, PAGE_SIZE)) == -1 ) + if ( gzread(initrd_gfd, page, PAGE_SIZE) == -1 ) { PERROR("Error reading initrd image, could not"); goto error_out; } - j += size; - if ( size > 0 ) - copy_to_domain_page(pm_handle, - page_array[i>>PAGE_SHIFT], page); - if ( size < PAGE_SIZE ) - goto initrd_copied; + copy_to_domain_page(pm_handle, + page_array[i>>PAGE_SHIFT], page); } - ERROR("Kernel/initrd too big to safely fit in domain memory"); - goto error_out; - - initrd_copied: - initrd_addr = virt_load_addr + ksize; - initrd_len = j; } - alloc_index = tot_pages - 1; - - /* Count bottom-level PTs, rounding up. */ - num_pt_pages = (l1_table_offset(virt_load_addr) + tot_pages + 1023) / 1024; - - /* We must also count the page directory. */ - num_pt_pages++; - - /* Index of first PT page. */ - pt_start = tot_pages - num_pt_pages; - - /* - * First allocate page for page dir. Allocation goes backwards from the end - * of the allocated physical address space. - */ - l2tab = page_array[alloc_index] << PAGE_SHIFT; - alloc_index--; - builddomain->ctxt.pt_base = l2tab; - if ( (mmu = init_mmu_updates(xc_handle, dom)) == NULL ) goto error_out; + /* First allocate page for page dir. */ + ppt_alloc = (vpt_start - v_start) >> PAGE_SHIFT; + l2tab = page_array[ppt_alloc++] << PAGE_SHIFT; + builddomain->ctxt.pt_base = l2tab; + /* Initialise the page tables. 
*/ if ( (vl2tab = map_pfn_writeable(pm_handle, l2tab >> PAGE_SHIFT)) == NULL ) goto error_out; memset(vl2tab, 0, PAGE_SIZE); - vl2e = &vl2tab[l2_table_offset(virt_load_addr)]; - for ( count = 0; count < tot_pages; count++ ) + vl2e = &vl2tab[l2_table_offset(v_start)]; + for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ) { if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) { - l1tab = page_array[alloc_index--] << PAGE_SHIFT; + l1tab = page_array[ppt_alloc++] << PAGE_SHIFT; if ( vl1tab != NULL ) unmap_pfn(pm_handle, vl1tab); if ( (vl1tab = map_pfn_writeable(pm_handle, l1tab >> PAGE_SHIFT)) == NULL ) goto error_out; memset(vl1tab, 0, PAGE_SIZE); - vl1e = &vl1tab[l1_table_offset(virt_load_addr + - (count<<PAGE_SHIFT))]; + vl1e = &vl1tab[l1_table_offset(v_start + (count<<PAGE_SHIFT))]; *vl2e++ = l1tab | L2_PROT; } *vl1e = (page_array[count] << PAGE_SHIFT) | L1_PROT; - if ( count >= pt_start ) + if ( (count >= ((vpt_start-v_start)>>PAGE_SHIFT)) && + (count < ((vpt_end -v_start)>>PAGE_SHIFT)) ) *vl1e &= ~_PAGE_RW; vl1e++; + } + unmap_pfn(pm_handle, vl1tab); + unmap_pfn(pm_handle, vl2tab); + /* Write the phys->machine and machine->phys table entries. 
*/ + physmap_pfn = (vphysmap_start - v_start) >> PAGE_SHIFT; + physmap = physmap_e = + map_pfn_writeable(pm_handle, page_array[physmap_pfn++]); + for ( count = 0; count < nr_pages; count++ ) + { if ( add_mmu_update(xc_handle, mmu, (page_array[count] << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, count) ) goto error_out; + *physmap_e++ = page_array[count]; + if ( ((unsigned long)physmap_e & (PAGE_SIZE-1)) == 0 ) + { + unmap_pfn(pm_handle, physmap); + physmap = physmap_e = + map_pfn_writeable(pm_handle, page_array[physmap_pfn++]); + } } - unmap_pfn(pm_handle, vl1tab); - unmap_pfn(pm_handle, vl2tab); - + unmap_pfn(pm_handle, physmap); + /* * Pin down l2tab addr as page dir page - causes hypervisor to provide * correct protection for the page @@ -230,17 +263,20 @@ static int setup_guestos(int xc_handle, l2tab | MMU_EXTENDED_COMMAND, MMUEXT_PIN_L2_TABLE) ) goto error_out; - *virt_startinfo_addr = - virt_load_addr + ((alloc_index-1) << PAGE_SHIFT); - - start_info = map_pfn_writeable(pm_handle, page_array[alloc_index-1]); + start_info = map_pfn_writeable( + pm_handle, page_array[(vstartinfo_start-v_start)>>PAGE_SHIFT]); memset(start_info, 0, sizeof(*start_info)); - start_info->pt_base = virt_load_addr + ((tot_pages-1) << PAGE_SHIFT); - start_info->mod_start = initrd_addr; - start_info->mod_len = initrd_len; - start_info->nr_pages = tot_pages; - start_info->shared_info = shared_info_frame << PAGE_SHIFT; - start_info->flags = 0; + start_info->nr_pages = nr_pages; + start_info->shared_info = shared_info_frame << PAGE_SHIFT; + start_info->flags = 0; + start_info->pt_base = vpt_start; + start_info->nr_pt_frames = nr_pt_pages; + start_info->mfn_list = vphysmap_start; + if ( initrd_len != 0 ) + { + start_info->mod_start = vinitrd_start; + start_info->mod_len = initrd_len; + } strncpy(start_info->cmd_line, cmdline, MAX_CMD_LEN); start_info->cmd_line[MAX_CMD_LEN-1] = '\0'; unmap_pfn(pm_handle, start_info); @@ -258,6 +294,10 @@ static int setup_guestos(int xc_handle, free(mmu); 
(void)close_pfn_mapper(pm_handle); free(page_array); + + *pvsi = vstartinfo_start; + *pvke = vkern_entry; + return 0; error_out: @@ -270,74 +310,109 @@ static int setup_guestos(int xc_handle, return -1; } -int xc_linux_build(int xc_handle, - u64 domid, - const char *image_name, - const char *ramdisk_name, - const char *cmdline) +static unsigned long get_filesz(int fd) { - dom0_op_t launch_op, op; - unsigned long load_addr; - long tot_pages; - int kernel_fd = -1, initrd_fd = -1; - gzFile kernel_gfd = NULL, initrd_gfd = NULL; - int rc, i; - full_execution_context_t *ctxt; - unsigned long virt_startinfo_addr; - - if ( (tot_pages = get_tot_pages(xc_handle, domid)) < 0 ) + u16 sig; + u32 _sz = 0; + unsigned long sz; + + lseek(fd, 0, SEEK_SET); + read(fd, &sig, sizeof(sig)); + sz = lseek(fd, 0, SEEK_END); + if ( sig == 0x8b1f ) /* GZIP signature? */ { - PERROR("Could not find total pages for domain"); - return 1; + lseek(fd, -4, SEEK_END); + read(fd, &_sz, 4); + sz = _sz; } + lseek(fd, 0, SEEK_SET); - kernel_fd = open(image_name, O_RDONLY); - if ( kernel_fd < 0 ) + return sz; +} + +static char *read_kernel_image(const char *filename, unsigned long *size) +{ + int kernel_fd = -1; + gzFile kernel_gfd = NULL; + char *image = NULL; + unsigned int bytes; + + if ( (kernel_fd = open(filename, O_RDONLY)) < 0 ) { PERROR("Could not open kernel image"); - return 1; + goto out; } + *size = get_filesz(kernel_fd); + if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL ) { PERROR("Could not allocate decompression state for state file"); - close(kernel_fd); - return 1; + goto out; } - rc = read_kernel_header(kernel_gfd, - tot_pages << (PAGE_SHIFT - 10), - &load_addr); - if ( rc < 0 ) - goto error_out; - - if ( (load_addr & (PAGE_SIZE-1)) != 0 ) + if ( (image = malloc(*size)) == NULL ) { - ERROR("We can only deal with page-aligned load addresses"); - goto error_out; + PERROR("Could not allocate memory for kernel image"); + goto out; } - if ( (load_addr + (tot_pages << PAGE_SHIFT)) > 
HYPERVISOR_VIRT_START ) + if ( (bytes = gzread(kernel_gfd, image, *size)) != *size ) { - ERROR("Cannot map all domain memory without hitting Xen space"); + PERROR("Error reading kernel image, could not" + " read the whole image (%d != %ld).", bytes, *size); + free(image); + image = NULL; + } + + out: + if ( kernel_gfd != NULL ) + gzclose(kernel_gfd); + else if ( kernel_fd >= 0 ) + close(kernel_fd); + return image; +} + +int xc_linux_build(int xc_handle, + u64 domid, + const char *image_name, + const char *ramdisk_name, + const char *cmdline) +{ + dom0_op_t launch_op, op; + int initrd_fd = -1; + gzFile initrd_gfd = NULL; + int rc, i; + full_execution_context_t *ctxt; + unsigned long nr_pages; + char *image = NULL; + unsigned long image_size, initrd_size=0; + unsigned long vstartinfo_start, vkern_entry; + + if ( (nr_pages = get_tot_pages(xc_handle, domid)) < 0 ) + { + PERROR("Could not find total pages for domain"); goto error_out; } + if ( (image = read_kernel_image(image_name, &image_size)) == NULL ) + goto error_out; + if ( (ramdisk_name != NULL) && (strlen(ramdisk_name) != 0) ) { - initrd_fd = open(ramdisk_name, O_RDONLY); - if ( initrd_fd < 0 ) + if ( (initrd_fd = open(ramdisk_name, O_RDONLY)) < 0 ) { PERROR("Could not open the initial ramdisk image"); goto error_out; } + initrd_size = get_filesz(initrd_fd); + if ( (initrd_gfd = gzdopen(initrd_fd, "rb")) == NULL ) { PERROR("Could not allocate decompression state for initrd"); goto error_out; } - } op.cmd = DOM0_GETDOMAININFO; @@ -355,23 +430,22 @@ int xc_linux_build(int xc_handle, goto error_out; } - if ( setup_guestos(xc_handle, domid, kernel_gfd, initrd_gfd, tot_pages, - &virt_startinfo_addr, - load_addr, &launch_op.u.builddomain, cmdline, + if ( setup_guestos(xc_handle, domid, image, image_size, + initrd_gfd, initrd_size, nr_pages, + &vstartinfo_start, &vkern_entry, + &launch_op.u.builddomain, cmdline, op.u.getdomaininfo.shared_info_frame) < 0 ) { ERROR("Error constructing guest OS"); goto error_out; } - if ( 
kernel_fd >= 0 ) - close(kernel_fd); - if( kernel_gfd ) - gzclose(kernel_gfd); if ( initrd_fd >= 0 ) close(initrd_fd); - if( initrd_gfd ) + if ( initrd_gfd ) gzclose(initrd_gfd); + if ( image != NULL ) + free(image); ctxt = &launch_op.u.builddomain.ctxt; @@ -392,9 +466,9 @@ int xc_linux_build(int xc_handle, ctxt->cpu_ctxt.gs = FLAT_GUESTOS_DS; ctxt->cpu_ctxt.ss = FLAT_GUESTOS_DS; ctxt->cpu_ctxt.cs = FLAT_GUESTOS_CS; - ctxt->cpu_ctxt.eip = load_addr; - ctxt->cpu_ctxt.esp = virt_startinfo_addr; - ctxt->cpu_ctxt.esi = virt_startinfo_addr; + ctxt->cpu_ctxt.eip = vkern_entry; + ctxt->cpu_ctxt.esp = vstartinfo_start; + ctxt->cpu_ctxt.esi = vstartinfo_start; ctxt->cpu_ctxt.eflags = (1<<9) | (1<<2); /* FPU is set up to default initial state. */ @@ -416,7 +490,7 @@ int xc_linux_build(int xc_handle, /* Ring 1 stack is the initial stack. */ ctxt->guestos_ss = FLAT_GUESTOS_DS; - ctxt->guestos_esp = virt_startinfo_addr; + ctxt->guestos_esp = vstartinfo_start; /* No debugging. */ memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg)); @@ -436,14 +510,152 @@ int xc_linux_build(int xc_handle, return rc; error_out: - if ( kernel_fd >= 0 ) - close(kernel_fd); - if( kernel_gfd ) - gzclose(kernel_gfd); - if ( initrd_fd >= 0 ) - close(initrd_fd); - if( initrd_gfd ) + if ( initrd_gfd != NULL ) gzclose(initrd_gfd); + else if ( initrd_fd >= 0 ) + close(initrd_fd); + if ( image != NULL ) + free(image); return -1; } + +static inline int is_loadable_phdr(Elf_Phdr *phdr) +{ + return ((phdr->p_type == PT_LOAD) && + ((phdr->p_flags & (PF_W|PF_X)) != 0)); +} + +static int readelfimage_base_and_size(char *elfbase, + unsigned long elfsize, + unsigned long *pkernstart, + unsigned long *pkernend, + unsigned long *pkernentry) +{ + Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase; + Elf_Phdr *phdr; + Elf_Shdr *shdr; + unsigned long kernstart = ~0UL, kernend=0UL; + char *shstrtab, *guestinfo; + int h; + + if ( !IS_ELF(*ehdr) ) + { + ERROR("Kernel image does not have an ELF header."); + return -EINVAL; + } + + if ( 
(ehdr->e_phoff + (ehdr->e_phnum * ehdr->e_phentsize)) > elfsize ) + { + ERROR("ELF program headers extend beyond end of image."); + return -EINVAL; + } + + if ( (ehdr->e_shoff + (ehdr->e_shnum * ehdr->e_shentsize)) > elfsize ) + { + ERROR("ELF section headers extend beyond end of image."); + return -EINVAL; + } + + /* Find the section-header strings table. */ + if ( ehdr->e_shstrndx == SHN_UNDEF ) + { + ERROR("ELF image has no section-header strings table (shstrtab)."); + return -EINVAL; + } + shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff + + (ehdr->e_shstrndx*ehdr->e_shentsize)); + shstrtab = elfbase + shdr->sh_offset; + + /* Find the special '__xen_guest' section and check its contents. */ + for ( h = 0; h < ehdr->e_shnum; h++ ) + { + shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff + (h*ehdr->e_shentsize)); + if ( strcmp(&shstrtab[shdr->sh_name], "__xen_guest") != 0 ) + continue; + guestinfo = elfbase + shdr->sh_offset; + if ( (strstr(guestinfo, "GUEST_OS=linux") == NULL) || + (strstr(guestinfo, "XEN_VER=1.3") == NULL) ) + { + ERROR("Will only load Linux images built for Xen v1.3"); + ERROR("Actually saw: '%s'", guestinfo); + return -EINVAL; + } + break; + } + if ( h == ehdr->e_shnum ) + { + ERROR("Not a Xen-ELF image: '__xen_guest' section not found."); + return -EINVAL; + } + + for ( h = 0; h < ehdr->e_phnum; h++ ) + { + phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize)); + if ( !is_loadable_phdr(phdr) ) + continue; + if ( phdr->p_vaddr < kernstart ) + kernstart = phdr->p_vaddr; + if ( (phdr->p_vaddr + phdr->p_memsz) > kernend ) + kernend = phdr->p_vaddr + phdr->p_memsz; + } + + if ( (kernstart > kernend) || + (ehdr->e_entry < kernstart) || + (ehdr->e_entry > kernend) ) + { + ERROR("Malformed ELF image."); + return -EINVAL; + } + + *pkernstart = kernstart; + *pkernend = kernend; + *pkernentry = ehdr->e_entry; + + return 0; +} + +static int loadelfimage(char *elfbase, int pmh, unsigned long *parray, + unsigned long vstart) +{ + Elf_Ehdr *ehdr = 
(Elf_Ehdr *)elfbase; + Elf_Phdr *phdr; + int h; + + char *va; + unsigned long pa, done, chunksz; + + for ( h = 0; h < ehdr->e_phnum; h++ ) + { + phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize)); + if ( !is_loadable_phdr(phdr) ) + continue; + + for ( done = 0; done < phdr->p_filesz; done += chunksz ) + { + pa = (phdr->p_vaddr + done) - vstart; + va = map_pfn_writeable(pmh, parray[pa>>PAGE_SHIFT]); + va += pa & (PAGE_SIZE-1); + chunksz = phdr->p_filesz - done; + if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) ) + chunksz = PAGE_SIZE - (pa & (PAGE_SIZE-1)); + memcpy(va, elfbase + phdr->p_offset + done, chunksz); + unmap_pfn(pmh, va); + } + + for ( ; done < phdr->p_memsz; done += chunksz ) + { + pa = (phdr->p_vaddr + done) - vstart; + va = map_pfn_writeable(pmh, parray[pa>>PAGE_SHIFT]); + va += pa & (PAGE_SIZE-1); + chunksz = phdr->p_memsz - done; + if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) ) + chunksz = PAGE_SIZE - (pa & (PAGE_SIZE-1)); + memset(va, 0, chunksz); + unmap_pfn(pmh, va); + } + } + + return 0; +} + diff --git a/tools/xc/lib/xc_private.h b/tools/xc/lib/xc_private.h index 8e3cae3f52..dda04a9f8d 100644 --- a/tools/xc/lib/xc_private.h +++ b/tools/xc/lib/xc_private.h @@ -55,11 +55,12 @@ typedef unsigned long l2_pgentry_t; #define l2_table_offset(_a) \ ((_a) >> L2_PAGETABLE_SHIFT) -#define ERROR(_m) \ - fprintf(stderr, "ERROR: %s\n", (_m)) +#define ERROR(_m, _a...) \ + fprintf(stderr, "ERROR: " _m "\n" , ## _a ) -#define PERROR(_m) \ - fprintf(stderr, "ERROR: %s (%d = %s)\n", (_m), errno, strerror(errno)) +#define PERROR(_m, _a...) 
\ + fprintf(stderr, "ERROR: " _m " (%d = %s)\n" , ## _a , \ + errno, strerror(errno)) static inline int do_privcmd(int xc_handle, unsigned int cmd, diff --git a/xen/arch/i386/mm.c b/xen/arch/i386/mm.c index 2567e39422..9f1eaa465b 100644 --- a/xen/arch/i386/mm.c +++ b/xen/arch/i386/mm.c @@ -81,6 +81,13 @@ void __init paging_init(void) { unsigned long addr; void *ioremap_pt; + int i; + + /* Idle page table 1:1 maps the first part of physical memory. */ + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + idle_pg_table[i] = + mk_l2_pgentry((i << L2_PAGETABLE_SHIFT) | + __PAGE_HYPERVISOR | _PAGE_PSE); /* * Fixed mappings, only the page table structure has to be diff --git a/xen/arch/i386/setup.c b/xen/arch/i386/setup.c index 318d8cff90..862582e5d4 100644 --- a/xen/arch/i386/setup.c +++ b/xen/arch/i386/setup.c @@ -411,8 +411,6 @@ void __init start_of_day(void) check_nmi_watchdog(); - zap_low_mappings(); - #ifdef CONFIG_PCI pci_init(); #endif diff --git a/xen/arch/i386/traps.c b/xen/arch/i386/traps.c index 7a012e147a..d10292f618 100644 --- a/xen/arch/i386/traps.c +++ b/xen/arch/i386/traps.c @@ -339,11 +339,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) return; /* successfully copied the mapping */ } - if ( unlikely( p->mm.shadow_mode ) && addr < PAGE_OFFSET && - shadow_fault( addr, error_code ) ) - { - return; // return true if fault was handled - } + if ( unlikely(p->mm.shadow_mode) && + (addr < PAGE_OFFSET) && shadow_fault(addr, error_code) ) + return; /* Return TRUE if fault was handled. 
*/ if ( unlikely(!(regs->xcs & 3)) ) goto fault_in_hypervisor; @@ -363,7 +361,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { perfc_incrc(copy_user_faults); - //DPRINTK("copy_user fault: %08lx -> %08lx\n", regs->eip, fixup); + if ( !p->mm.shadow_mode ) + DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup); regs->eip = fixup; regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS; return; diff --git a/xen/common/domain.c b/xen/common/domain.c index 5be1be9b06..f83562a903 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -20,6 +20,13 @@ #include <xen/vbd.h> #include <asm/i387.h> +#ifdef CONFIG_X86_64BITMODE +#define ELFSIZE 64 +#else +#define ELFSIZE 32 +#endif +#include <xen/elf.h> + #if !defined(CONFIG_X86_64BITMODE) /* No ring-3 access in initial page tables. */ #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) @@ -31,6 +38,9 @@ #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) +#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) +#define round_pgdown(_p) ((_p)&PAGE_MASK) + /* Both these structures are protected by the tasklist_lock. 
*/ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; struct task_struct *task_hash[TASK_HASH_SIZE]; @@ -459,7 +469,7 @@ unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes) (PAGE_SHIFT-10))) ) { free_all_dom_mem(p); - return -1; + return -ENOMEM; } } @@ -555,39 +565,166 @@ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain) return 0; } -static unsigned long alloc_page_from_domain(unsigned long * cur_addr, - unsigned long * index) +static inline int is_loadable_phdr(Elf_Phdr *phdr) { - unsigned long ret = *cur_addr; - struct list_head *ent = frame_table[ret >> PAGE_SHIFT].list.prev; - *cur_addr = list_entry(ent, struct pfn_info, list) - frame_table; - *cur_addr <<= PAGE_SHIFT; - (*index)--; - return ret; + return ((phdr->p_type == PT_LOAD) && + ((phdr->p_flags & (PF_W|PF_X)) != 0)); } -/* - * setup_guestos is used for building dom0 solely. other domains are built in - * userspace dom0 and final setup is being done by final_setup_guestos. 
- */ -int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, - unsigned int num_vifs, - char *phy_data_start, unsigned long data_len, - char *cmdline, unsigned long initrd_len) +static int readelfimage_base_and_size(char *elfbase, + unsigned long elfsize, + unsigned long *pkernstart, + unsigned long *pkernend, + unsigned long *pkernentry) { - struct list_head *list_ent; - char *src, *vsrc, *dst, *data_start; - int i; + Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase; + Elf_Phdr *phdr; + Elf_Shdr *shdr; + unsigned long kernstart = ~0UL, kernend=0UL; + char *shstrtab, *guestinfo; + int h; + + if ( !IS_ELF(*ehdr) ) + { + printk("Kernel image does not have an ELF header.\n"); + return -EINVAL; + } + + if ( (ehdr->e_phoff + (ehdr->e_phnum * ehdr->e_phentsize)) > elfsize ) + { + printk("ELF program headers extend beyond end of image.\n"); + return -EINVAL; + } + + if ( (ehdr->e_shoff + (ehdr->e_shnum * ehdr->e_shentsize)) > elfsize ) + { + printk("ELF section headers extend beyond end of image.\n"); + return -EINVAL; + } + + /* Find the section-header strings table. */ + if ( ehdr->e_shstrndx == SHN_UNDEF ) + { + printk("ELF image has no section-header strings table (shstrtab).\n"); + return -EINVAL; + } + shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff + + (ehdr->e_shstrndx*ehdr->e_shentsize)); + shstrtab = elfbase + shdr->sh_offset; + + /* Find the special '__xen_guest' section and check its contents. 
*/ + for ( h = 0; h < ehdr->e_shnum; h++ ) + { + shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff + (h*ehdr->e_shentsize)); + if ( strcmp(&shstrtab[shdr->sh_name], "__xen_guest") != 0 ) + continue; + guestinfo = elfbase + shdr->sh_offset; + printk("Xen-ELF header found: '%s'\n", guestinfo); + if ( (strstr(guestinfo, "GUEST_OS=linux") == NULL) || + (strstr(guestinfo, "XEN_VER=1.3") == NULL) ) + { + printk("ERROR: Xen will only load Linux built for Xen v1.3\n"); + return -EINVAL; + } + break; + } + if ( h == ehdr->e_shnum ) + { + printk("Not a Xen-ELF image: '__xen_guest' section not found.\n"); + return -EINVAL; + } + + for ( h = 0; h < ehdr->e_phnum; h++ ) + { + phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize)); + if ( !is_loadable_phdr(phdr) ) + continue; + if ( phdr->p_vaddr < kernstart ) + kernstart = phdr->p_vaddr; + if ( (phdr->p_vaddr + phdr->p_memsz) > kernend ) + kernend = phdr->p_vaddr + phdr->p_memsz; + } + + if ( (kernstart > kernend) || + (ehdr->e_entry < kernstart) || + (ehdr->e_entry > kernend) ) + { + printk("Malformed ELF image.\n"); + return -EINVAL; + } + + *pkernstart = kernstart; + *pkernend = kernend; + *pkernentry = ehdr->e_entry; + + return 0; +} + +static int loadelfimage(char *elfbase) +{ + Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase; + Elf_Phdr *phdr; + int h; + + for ( h = 0; h < ehdr->e_phnum; h++ ) + { + phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize)); + if ( !is_loadable_phdr(phdr) ) + continue; + if ( phdr->p_filesz != 0 ) + memcpy((char *)phdr->p_vaddr, elfbase + phdr->p_offset, + phdr->p_filesz); + if ( phdr->p_memsz > phdr->p_filesz ) + memset((char *)phdr->p_vaddr + phdr->p_filesz, 0, + phdr->p_memsz - phdr->p_filesz); + } + + return 0; +} + +int construct_dom0(struct task_struct *p, + unsigned long alloc_start, + unsigned long alloc_end, + unsigned int num_vifs, + char *image_start, unsigned long image_len, + char *initrd_start, unsigned long initrd_len, + char *cmdline) +{ + char *dst; + int i, rc; 
domid_t dom = p->domain; - unsigned long phys_l1tab, phys_l2tab; - unsigned long cur_address, alloc_address; - unsigned long virt_load_address, virt_stack_address; - start_info_t *virt_startinfo_address; + unsigned long pfn, mfn; + unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT; + unsigned long nr_pt_pages; unsigned long count; - unsigned long alloc_index; l2_pgentry_t *l2tab, *l2start; l1_pgentry_t *l1tab = NULL, *l1start = NULL; struct pfn_info *page = NULL; + start_info_t *si; + + /* + * This fully describes the memory layout of the initial domain. All + * *_start address are page-aligned, except v_start (and v_end) which are + * superpage-aligned. + */ + unsigned long v_start; + unsigned long vkern_start; + unsigned long vkern_entry; + unsigned long vkern_end; + unsigned long vinitrd_start; + unsigned long vinitrd_end; + unsigned long vphysmap_start; + unsigned long vphysmap_end; + unsigned long vstartinfo_start; + unsigned long vstartinfo_end; + unsigned long vstack_start; + unsigned long vstack_end; + unsigned long vpt_start; + unsigned long vpt_end; + unsigned long v_end; + + /* Machine address of next candidate page-table page. */ + unsigned long mpt_alloc; extern void physdev_init_dom0(struct task_struct *); extern void ide_probe_devices(xen_disk_info_t *); @@ -597,67 +734,114 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, xen_disk_t *xd; /* Sanity! */ - if ( p->domain != 0 ) BUG(); - if ( test_bit(PF_CONSTRUCTED, &p->flags) ) BUG(); + if ( p->domain != 0 ) + BUG(); + if ( test_bit(PF_CONSTRUCTED, &p->flags) ) + BUG(); + + printk("*** LOADING DOMAIN 0 ***\n"); /* * This is all a bit grim. We've moved the modules to the "safe" physical * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this - * routeine, we're going to copy it down into the region that's actually + * routine we're going to copy it down into the region that's actually * been allocated to domain 0. 
This is highly likely to be overlapping, so * we use a forward copy. * * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with * 4GB and lots of network/disk cards that allocate loads of buffers. - * We'll have to revist this if we ever support PAE (64GB). + * We'll have to revisit this if we ever support PAE (64GB). */ - data_start = map_domain_mem((unsigned long)phy_data_start); + rc = readelfimage_base_and_size(image_start, image_len, + &vkern_start, &vkern_end, &vkern_entry); + if ( rc != 0 ) + return rc; - if ( strncmp(data_start, "XenGuest", 8) ) + /* + * Why do we need this? The number of page-table frames depends on the + * size of the bootstrap address space. But the size of the address space + * depends on the number of page-table frames (since each one is mapped + * read-only). We have a pair of simultaneous equations in two unknowns, + * which we solve by exhaustive search. + */ + for ( nr_pt_pages = 2; ; nr_pt_pages++ ) { - printk("DOM%llu: Invalid guest OS image - bad signature\n", dom); - unmap_domain_mem(data_start); - return -1; + v_start = vkern_start & ~((1<<22)-1); + vinitrd_start = round_pgup(vkern_end); + vinitrd_end = vinitrd_start + initrd_len; + vphysmap_start = round_pgup(vinitrd_end); + vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long)); + vpt_start = round_pgup(vphysmap_end); + vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); + vstartinfo_start = vpt_end; + vstartinfo_end = vstartinfo_start + PAGE_SIZE; + vstack_start = vstartinfo_end; + vstack_end = vstack_start + PAGE_SIZE; + v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1); + if ( (v_end - vstack_end) < (512 << 10) ) + v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. 
*/ + if ( (((v_end - v_start) >> L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages ) + break; } - virt_load_address = *(unsigned long *)(data_start + 8); - if ( (virt_load_address & (PAGE_SIZE-1)) ) + if ( (v_end - v_start) > (nr_pages * PAGE_SIZE) ) { - printk("DOM%llu: Guest OS load address not page-aligned (%08lx)\n", - dom, virt_load_address); - unmap_domain_mem(data_start); - return -1; + printk("Initial guest OS requires too much space\n" + "(%luMB is greater than %luMB limit)\n", + (v_end-v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20); + return -ENOMEM; } - if ( alloc_new_dom_mem(p, params->memory_kb) ) + printk("PHYSICAL MEMORY ARRANGEMENT:\n" + " Kernel image: %p->%p\n" + " Initrd image: %p->%p\n" + " Dom0 alloc.: %08lx->%08lx\n", + image_start, image_start + image_len, + initrd_start, initrd_start + initrd_len, + alloc_start, alloc_end); + printk("VIRTUAL MEMORY ARRANGEMENT:\n" + " Loaded kernel: %08lx->%08lx\n" + " Init. ramdisk: %08lx->%08lx\n" + " Phys-Mach map: %08lx->%08lx\n" + " Page tables: %08lx->%08lx\n" + " Start info: %08lx->%08lx\n" + " Boot stack: %08lx->%08lx\n" + " TOTAL: %08lx->%08lx\n", + vkern_start, vkern_end, + vinitrd_start, vinitrd_end, + vphysmap_start, vphysmap_end, + vpt_start, vpt_end, + vstartinfo_start, vstartinfo_end, + vstack_start, vstack_end, + v_start, v_end); + printk(" ENTRY ADDRESS: %08lx\n", vkern_entry); + + /* + * Protect the lowest 1GB of memory. We use a temporary mapping there + * from which we copy the kernel and ramdisk images. + */ + if ( v_start < (1<<30) ) { - printk("DOM%llu: Not enough memory --- reduce dom0_mem ??\n", dom); - unmap_domain_mem(data_start); - return -ENOMEM; + printk("Initial loading isn't allowed to lowest 1GB of memory.\n"); + return -EINVAL; } - alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) - - frame_table; - alloc_address <<= PAGE_SHIFT; - alloc_index = p->tot_pages; - - if ( data_len > (params->memory_kb << 9) ) + /* Construct a frame-allocation list for the initial domain. 
*/ + for ( pfn = (alloc_start>>PAGE_SHIFT); + pfn < (alloc_end>>PAGE_SHIFT); + pfn++ ) { - printk("DOM%llu: Guest OS image is too large\n" - " (%luMB is greater than %uMB limit for a\n" - " %uMB address space)\n", - dom, data_len>>20, - (params->memory_kb)>>11, - (params->memory_kb)>>10); - unmap_domain_mem(data_start); - free_all_dom_mem(p); - return -1; + page = &frame_table[pfn]; + page->u.domain = p; + page->type_and_flags = 0; + page->count_and_flags = PGC_allocated | 1; + list_add_tail(&page->list, &p->page_list); + p->tot_pages++; } - printk("DOM%llu: Guest OS virtual load address is %08lx\n", dom, - virt_load_address); - + mpt_alloc = (vpt_start - v_start) + alloc_start; + SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES); SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS); @@ -671,157 +855,137 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, for ( i = 0; i < 256; i++ ) p->thread.traps[i].cs = FLAT_GUESTOS_CS; - /* - * WARNING: The new domain must have its 'processor' field - * filled in by now !! - */ - phys_l2tab = alloc_page_from_domain(&alloc_address, &alloc_index); - l2start = l2tab = map_domain_mem(phys_l2tab); + /* WARNING: The new domain must have its 'processor' field filled in! 
*/ + l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE; memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE); + l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR); l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR); - l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(phys_l2tab | __PAGE_HYPERVISOR); - memset(l2tab, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t)); - p->mm.pagetable = mk_pagetable(phys_l2tab); + p->mm.pagetable = mk_pagetable((unsigned long)l2start); - l2tab += l2_table_offset(virt_load_address); - cur_address = list_entry(p->page_list.next, struct pfn_info, list) - - frame_table; - cur_address <<= PAGE_SHIFT; - for ( count = 0; count < p->tot_pages; count++ ) + l2tab += l2_table_offset(v_start); + mfn = alloc_start >> PAGE_SHIFT; + for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ) { if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) { - if ( l1tab != NULL ) unmap_domain_mem(l1start); - phys_l1tab = alloc_page_from_domain(&alloc_address, &alloc_index); - *l2tab++ = mk_l2_pgentry(phys_l1tab|L2_PROT); - l1start = l1tab = map_domain_mem(phys_l1tab); + l1start = l1tab = (l1_pgentry_t *)mpt_alloc; + mpt_alloc += PAGE_SIZE; + *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT); clear_page(l1tab); - l1tab += l1_table_offset( - virt_load_address + (count << PAGE_SHIFT)); } - *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT); + *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT); - page = &frame_table[cur_address >> PAGE_SHIFT]; + page = &frame_table[mfn]; set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags); if ( !get_page_and_type(page, p, PGT_writeable_page) ) BUG(); - /* Set up the MPT entry. 
*/ - machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count; - list_ent = frame_table[cur_address >> PAGE_SHIFT].list.next; - cur_address = list_entry(list_ent, struct pfn_info, list) - - frame_table; - cur_address <<= PAGE_SHIFT; + mfn++; } - unmap_domain_mem(l1start); - /* pages that are part of page tables must be read only */ - l2tab = l2start + l2_table_offset(virt_load_address + - (alloc_index << PAGE_SHIFT)); - l1start = l1tab = map_domain_mem(l2_pgentry_to_phys(*l2tab)); - l1tab += l1_table_offset(virt_load_address + (alloc_index << PAGE_SHIFT)); + /* Pages that are part of page tables must be read only. */ + l2tab = l2start + l2_table_offset(vpt_start); + l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab); + l1tab += l1_table_offset(vpt_start); l2tab++; - for ( count = alloc_index; count < p->tot_pages; count++ ) + for ( count = 0; count < nr_pt_pages; count++ ) { *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); - page = frame_table + l1_pgentry_to_pagenr(*l1tab); - page->type_and_flags &= ~PGT_type_mask; - page->type_and_flags |= PGT_l1_page_table; - get_page(page, p); /* an extra ref because of readable mapping */ - l1tab++; - if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) ) + page = &frame_table[l1_pgentry_to_pagenr(*l1tab)]; + if ( count == 0 ) + { + page->type_and_flags &= ~PGT_type_mask; + page->type_and_flags |= PGT_l2_page_table; + get_page(page, p); /* an extra ref because of readable mapping */ + /* Get another ref to L2 page so that it can be pinned. 
*/ + if ( !get_page_and_type(page, p, PGT_l2_page_table) ) + BUG(); + set_bit(_PGC_guest_pinned, &page->count_and_flags); + } + else { - unmap_domain_mem(l1start); - l1start = l1tab = map_domain_mem(l2_pgentry_to_phys(*l2tab)); - l2tab++; + page->type_and_flags &= ~PGT_type_mask; + page->type_and_flags |= PGT_l1_page_table; + get_page(page, p); /* an extra ref because of readable mapping */ } + l1tab++; + if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) ) + l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab); } - /* Rewrite last L1 page to be a L2 page. */ - page->type_and_flags &= ~PGT_type_mask; - page->type_and_flags |= PGT_l2_page_table; - /* Get another ref to L2 page so that it can be pinned. */ - if ( !get_page_and_type(page, p, PGT_l2_page_table) ) - BUG(); - set_bit(_PGC_guest_pinned, &page->count_and_flags); - unmap_domain_mem(l1start); - /* Set up shared info area. */ + /* Set up shared-info area. */ update_dom_time(p->shared_info); p->shared_info->domain_time = 0; p->shared_info->evtchn_upcall_mask = ~0UL; /* mask all upcalls */ - virt_startinfo_address = (start_info_t *) - (virt_load_address + ((alloc_index - 1) << PAGE_SHIFT)); - virt_stack_address = (unsigned long)virt_startinfo_address; - - unmap_domain_mem(l2start); - /* Install the new page tables. */ __cli(); write_cr3_counted(pagetable_val(p->mm.pagetable)); - /* Copy the guest OS image. */ - src = (char *)(phy_data_start + 12); - vsrc = (char *)(data_start + 12); /* data_start invalid after first page*/ - dst = (char *)virt_load_address; - while ( src < (phy_data_start+data_len) ) - { - *dst++ = *vsrc++; - src++; - if ( (((unsigned long)src) & (PAGE_SIZE-1)) == 0 ) - { - unmap_domain_mem(vsrc-1); - vsrc = map_domain_mem((unsigned long)src); - } - } - unmap_domain_mem(vsrc); + /* Copy the OS image. */ + (void)loadelfimage(image_start); + + /* Copy the initial ramdisk. */ + if ( initrd_len != 0 ) + memcpy((void *)vinitrd_start, initrd_start, initrd_len); /* Set up start info area. 
*/ - memset(virt_startinfo_address, 0, sizeof(*virt_startinfo_address)); - virt_startinfo_address->nr_pages = p->tot_pages; - virt_startinfo_address->shared_info = virt_to_phys(p->shared_info); - virt_startinfo_address->pt_base = virt_load_address + - ((p->tot_pages - 1) << PAGE_SHIFT); - - virt_startinfo_address->flags = 0; - if ( IS_PRIV(p) ) - virt_startinfo_address->flags |= SIF_PRIVILEGED; - if ( p->domain == 0 ) - virt_startinfo_address->flags |= SIF_INITDOMAIN; + si = (start_info_t *)vstartinfo_start; + memset(si, 0, PAGE_SIZE); + si->nr_pages = p->tot_pages; + si->shared_info = virt_to_phys(p->shared_info); + si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; + si->pt_base = vpt_start; + si->nr_pt_frames = nr_pt_pages; + si->mfn_list = vphysmap_start; + + /* Write the phys->machine and machine->phys table entries. */ + for ( pfn = 0; pfn < p->tot_pages; pfn++ ) + { + mfn = (alloc_start >> PAGE_SHIFT) + pfn; + ((unsigned long *)vphysmap_start)[pfn] = mfn; + machine_to_phys_mapping[mfn] = pfn; + } - if ( initrd_len ) + if ( initrd_len != 0 ) { - virt_startinfo_address->mod_start = (unsigned long)dst-initrd_len; - virt_startinfo_address->mod_len = initrd_len; + si->mod_start = vinitrd_start; + si->mod_len = initrd_len; printk("Initrd len 0x%lx, start at 0x%08lx\n", - virt_startinfo_address->mod_len, - virt_startinfo_address->mod_start); + si->mod_len, si->mod_start); } - /* Add virtual network interfaces and point to them in startinfo. */ - while ( num_vifs-- > 0 ) - (void)create_net_vif(dom); - - dst = virt_startinfo_address->cmd_line; + dst = si->cmd_line; if ( cmdline != NULL ) { for ( i = 0; i < 255; i++ ) { - if ( cmdline[i] == '\0' ) break; + if ( cmdline[i] == '\0' ) + break; *dst++ = cmdline[i]; } } *dst = '\0'; - /* NB: Give up the VGA console if DOM0 is ocnfigured to grab it. */ - console_endboot(strstr(cmdline, "tty0") != NULL); - /* Reinstate the caller's page tables. 
*/ write_cr3_counted(pagetable_val(current->mm.pagetable)); __sti(); + /* Destroy low mappings - they were only for our convenience. */ + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE ) + l2start[i] = mk_l2_pgentry(0); + zap_low_mappings(); /* Do the same for the idle page tables. */ + + /* Give up the VGA console if DOM0 is configured to grab it. */ + console_endboot(strstr(cmdline, "tty0") != NULL); + + /* Add virtual network interfaces. */ + while ( num_vifs-- > 0 ) + (void)create_net_vif(dom); + #ifndef NO_DEVICES_IN_XEN /* DOM0 gets access to all real block devices. */ #define MAX_REAL_DISKS 256 @@ -851,14 +1015,11 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, set_bit(PF_CONSTRUCTED, &p->flags); -#if 0 // XXXXX DO NOT CHECK IN ENBALED !!! (but useful for testing so leave) +#if 0 // XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) shadow_mode_enable(p, SHM_test); #endif - new_thread(p, - (unsigned long)virt_load_address, - (unsigned long)virt_stack_address, - (unsigned long)virt_startinfo_address); + new_thread(p, vkern_entry, vstack_end, vstartinfo_start); return 0; } diff --git a/xen/common/kernel.c b/xen/common/kernel.c index ac44c2407f..4fd4990ac0 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -104,10 +104,12 @@ void cmain(unsigned long magic, multiboot_info_t *mbi) dom0_createdomain_t dom0_params; unsigned long max_page; unsigned char *cmdline; - module_t *mod; + module_t *mod = (module_t *)__va(mbi->mods_addr); void *heap_start; int i; unsigned long max_mem; + unsigned long dom0_memory_start, dom0_memory_end; + unsigned long initial_images_start, initial_images_end; /* Parse the command-line options. */ cmdline = (unsigned char *)(mbi->cmdline ? 
__va(mbi->cmdline) : NULL); @@ -215,6 +217,19 @@ void cmain(unsigned long magic, multiboot_info_t *mbi) max_page >> (20-PAGE_SHIFT), max_mem >> (20-PAGE_SHIFT) ); + initial_images_start = MAX_DIRECTMAP_ADDRESS; + initial_images_end = initial_images_start + + (mod[mbi->mods_count-1].mod_end - mod[0].mod_start); + dom0_memory_start = (initial_images_end + ((4<<20)-1)) & ~((4<<20)-1); + dom0_memory_end = dom0_memory_start + (opt_dom0_mem << 10); + dom0_memory_end = (dom0_memory_end + PAGE_SIZE - 1) & PAGE_MASK; + + /* Cheesy sanity check: enough memory for DOM0 allocation + some slack? */ + if ( (dom0_memory_end + (8<<20)) > (max_page<<PAGE_SHIFT) ) + panic("Not enough memory to create initial domain!\n"); + + add_to_domain_alloc_list(dom0_memory_end, max_page << PAGE_SHIFT); + heap_start = memguard_init(&_end); printk("Xen heap size is %luKB\n", @@ -243,24 +258,30 @@ /* Create initial domain 0. */ dom0_params.memory_kb = opt_dom0_mem; new_dom = do_createdomain(0, 0); - if ( new_dom == NULL ) panic("Error creating domain 0\n"); + if ( new_dom == NULL ) + panic("Error creating domain 0\n"); set_bit(PF_PRIVILEGED, &new_dom->flags); /* * We're going to setup domain0 using the module(s) that we stashed safely - * above our MAX_DIRECTMAP_ADDRESS in boot/Boot.S The second module, if - * present, is an initrd ramdisk + * above our MAX_DIRECTMAP_ADDRESS in boot/boot.S. The second module, if + * present, is an initrd ramdisk. */ - mod = (module_t *)__va(mbi->mods_addr); - if ( setup_guestos(new_dom, - &dom0_params, 1, - (char *)MAX_DIRECTMAP_ADDRESS, - mod[mbi->mods_count-1].mod_end - mod[0].mod_start, - __va(mod[0].string), - (mbi->mods_count == 2) ? - (mod[1].mod_end - mod[1].mod_start):0) - != 0 ) panic("Could not set up DOM0 guest OS\n"); + if ( construct_dom0(new_dom, dom0_memory_start, dom0_memory_end, 1, + (char *)initial_images_start, + mod[0].mod_end-mod[0].mod_start, + (mbi->mods_count == 1) ? 
0 : + (char *)initial_images_start + + (mod[1].mod_start-mod[0].mod_start), + (mbi->mods_count == 1) ? 0 : + mod[mbi->mods_count-1].mod_end - mod[1].mod_start, + __va(mod[0].string)) != 0) + panic("Could not set up DOM0 guest OS\n"); + + /* The stash space for the initial kernel image can now be freed up. */ + add_to_domain_alloc_list(__pa(frame_table) + frame_table_size, + dom0_memory_start); wake_up(new_dom); diff --git a/xen/common/memory.c b/xen/common/memory.c index c8510c514d..9422c5ba86 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -170,7 +170,7 @@ unsigned long frame_table_size; unsigned long max_page; struct list_head free_list; -spinlock_t free_list_lock = SPIN_LOCK_UNLOCKED; +spinlock_t free_list_lock; unsigned int free_pfns; /* Used to defer flushing of memory structures. */ @@ -191,10 +191,6 @@ static struct { */ void __init init_frametable(unsigned long nr_pages) { - struct pfn_info *pf; - unsigned long page_index; - unsigned long flags; - memset(percpu_info, 0, sizeof(percpu_info)); max_page = nr_pages; @@ -203,23 +199,28 @@ void __init init_frametable(unsigned long nr_pages) frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; memset(frame_table, 0, frame_table_size); + spin_lock_init(&free_list_lock); + INIT_LIST_HEAD(&free_list); free_pfns = 0; - /* Put all domain-allocatable memory on a free list. 
*/ +} + +void add_to_domain_alloc_list(unsigned long ps, unsigned long pe) +{ + struct pfn_info *pf; + unsigned long i; + unsigned long flags; + spin_lock_irqsave(&free_list_lock, flags); - INIT_LIST_HEAD(&free_list); - for( page_index = (__pa(frame_table) + frame_table_size) >> PAGE_SHIFT; - page_index < nr_pages; - page_index++ ) + for ( i = ps >> PAGE_SHIFT; i < (pe >> PAGE_SHIFT); i++ ) { - pf = list_entry(&frame_table[page_index].list, struct pfn_info, list); + pf = list_entry(&frame_table[i].list, struct pfn_info, list); list_add_tail(&pf->list, &free_list); free_pfns++; } spin_unlock_irqrestore(&free_list_lock, flags); } - static void __invalidate_shadow_ldt(struct task_struct *p) { int i; diff --git a/xen/include/asm-i386/elf.h b/xen/include/asm-i386/elf.h deleted file mode 100644 index ded22856d0..0000000000 --- a/xen/include/asm-i386/elf.h +++ /dev/null @@ -1,233 +0,0 @@ -/* - * GRUB -- GRand Unified Bootloader - * Copyright (C) 1996 Erich Boleyn <erich@uruk.org> - * Copyright (C) 2001 Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -/* 32-bit data types */ - -typedef unsigned long Elf32_Addr; -typedef unsigned short Elf32_Half; -typedef unsigned long Elf32_Off; -typedef signed long Elf32_Sword; -typedef unsigned long Elf32_Word; -/* "unsigned char" already exists */ - -/* ELF header */ -typedef struct -{ - -#define EI_NIDENT 16 - - /* first four characters are defined below */ -#define EI_MAG0 0 -#define ELFMAG0 0x7f -#define EI_MAG1 1 -#define ELFMAG1 'E' -#define EI_MAG2 2 -#define ELFMAG2 'L' -#define EI_MAG3 3 -#define ELFMAG3 'F' - -#define EI_CLASS 4 /* data sizes */ -#define ELFCLASS32 1 /* i386 -- up to 32-bit data sizes present */ - -#define EI_DATA 5 /* data type and ordering */ -#define ELFDATA2LSB 1 /* i386 -- LSB 2's complement */ - -#define EI_VERSION 6 /* version number. "e_version" must be the same */ -#define EV_CURRENT 1 /* current version number */ - -#define EI_PAD 7 /* from here in is just padding */ - -#define EI_BRAND 8 /* start of OS branding (This is - obviously illegal against the ELF - standard.) 
*/ - - unsigned char e_ident[EI_NIDENT]; /* basic identification block */ - -#define ET_EXEC 2 /* we only care about executable types */ - Elf32_Half e_type; /* file types */ - -#define EM_386 3 /* i386 -- obviously use this one */ - Elf32_Half e_machine; /* machine types */ - Elf32_Word e_version; /* use same as "EI_VERSION" above */ - Elf32_Addr e_entry; /* entry point of the program */ - Elf32_Off e_phoff; /* program header table file offset */ - Elf32_Off e_shoff; /* section header table file offset */ - Elf32_Word e_flags; /* flags */ - Elf32_Half e_ehsize; /* elf header size in bytes */ - Elf32_Half e_phentsize; /* program header entry size */ - Elf32_Half e_phnum; /* number of entries in program header */ - Elf32_Half e_shentsize; /* section header entry size */ - Elf32_Half e_shnum; /* number of entries in section header */ - -#define SHN_UNDEF 0 -#define SHN_LORESERVE 0xff00 -#define SHN_LOPROC 0xff00 -#define SHN_HIPROC 0xff1f -#define SHN_ABS 0xfff1 -#define SHN_COMMON 0xfff2 -#define SHN_HIRESERVE 0xffff - Elf32_Half e_shstrndx; /* section header table index */ -} -Elf32_Ehdr; - - -#define BOOTABLE_I386_ELF(h) \ - ((h.e_ident[EI_MAG0] == ELFMAG0) & (h.e_ident[EI_MAG1] == ELFMAG1) \ - & (h.e_ident[EI_MAG2] == ELFMAG2) & (h.e_ident[EI_MAG3] == ELFMAG3) \ - & (h.e_ident[EI_CLASS] == ELFCLASS32) & (h.e_ident[EI_DATA] == ELFDATA2LSB) \ - & (h.e_ident[EI_VERSION] == EV_CURRENT) & (h.e_type == ET_EXEC) \ - & (h.e_machine == EM_386) & (h.e_version == EV_CURRENT)) - -/* section table - ? 
*/ -typedef struct -{ - Elf32_Word sh_name; /* Section name (string tbl index) */ - Elf32_Word sh_type; /* Section type */ - Elf32_Word sh_flags; /* Section flags */ - Elf32_Addr sh_addr; /* Section virtual addr at execution */ - Elf32_Off sh_offset; /* Section file offset */ - Elf32_Word sh_size; /* Section size in bytes */ - Elf32_Word sh_link; /* Link to another section */ - Elf32_Word sh_info; /* Additional section information */ - Elf32_Word sh_addralign; /* Section alignment */ - Elf32_Word sh_entsize; /* Entry size if section holds table */ -} -Elf32_Shdr; - -/* symbol table - page 4-25, figure 4-15 */ -typedef struct -{ - Elf32_Word st_name; - Elf32_Addr st_value; - Elf32_Word st_size; - unsigned char st_info; - unsigned char st_other; - Elf32_Half st_shndx; -} -Elf32_Sym; - -/* symbol type and binding attributes - page 4-26 */ - -#define ELF32_ST_BIND(i) ((i) >> 4) -#define ELF32_ST_TYPE(i) ((i) & 0xf) -#define ELF32_ST_INFO(b,t) (((b)<<4)+((t)&0xf)) - -/* symbol binding - page 4-26, figure 4-16 */ - -#define STB_LOCAL 0 -#define STB_GLOBAL 1 -#define STB_WEAK 2 -#define STB_LOPROC 13 -#define STB_HIPROC 15 - -/* symbol types - page 4-28, figure 4-17 */ - -#define STT_NOTYPE 0 -#define STT_OBJECT 1 -#define STT_FUNC 2 -#define STT_SECTION 3 -#define STT_FILE 4 -#define STT_LOPROC 13 -#define STT_HIPROC 15 - - -/* Macros to split/combine relocation type and symbol page 4-32 */ - -#define ELF32_R_SYM(__i) ((__i)>>8) -#define ELF32_R_TYPE(__i) ((unsigned char) (__i)) -#define ELF32_R_INFO(__s, __t) (((__s)<<8) + (unsigned char) (__t)) - - -/* program header - page 5-2, figure 5-1 */ - -typedef struct -{ - Elf32_Word p_type; - Elf32_Off p_offset; - Elf32_Addr p_vaddr; - Elf32_Addr p_paddr; - Elf32_Word p_filesz; - Elf32_Word p_memsz; - Elf32_Word p_flags; - Elf32_Word p_align; -} -Elf32_Phdr; - -/* segment types - page 5-3, figure 5-2 */ - -#define PT_NULL 0 -#define PT_LOAD 1 -#define PT_DYNAMIC 2 -#define PT_INTERP 3 -#define PT_NOTE 4 -#define PT_SHLIB 5 
-#define PT_PHDR 6 - -#define PT_LOPROC 0x70000000 -#define PT_HIPROC 0x7fffffff - -/* segment permissions - page 5-6 */ - -#define PF_X 0x1 -#define PF_W 0x2 -#define PF_R 0x4 -#define PF_MASKPROC 0xf0000000 - - -/* dynamic structure - page 5-15, figure 5-9 */ - -typedef struct -{ - Elf32_Sword d_tag; - union - { - Elf32_Word d_val; - Elf32_Addr d_ptr; - } - d_un; -} -Elf32_Dyn; - -/* Dynamic array tags - page 5-16, figure 5-10. */ - -#define DT_NULL 0 -#define DT_NEEDED 1 -#define DT_PLTRELSZ 2 -#define DT_PLTGOT 3 -#define DT_HASH 4 -#define DT_STRTAB 5 -#define DT_SYMTAB 6 -#define DT_RELA 7 -#define DT_RELASZ 8 -#define DT_RELAENT 9 -#define DT_STRSZ 10 -#define DT_SYMENT 11 -#define DT_INIT 12 -#define DT_FINI 13 -#define DT_SONAME 14 -#define DT_RPATH 15 -#define DT_SYMBOLIC 16 -#define DT_REL 17 -#define DT_RELSZ 18 -#define DT_RELENT 19 -#define DT_PLTREL 20 -#define DT_DEBUG 21 -#define DT_TEXTREL 22 -#define DT_JMPREL 23 diff --git a/xen/include/asm-x86_64/elf.h b/xen/include/asm-x86_64/elf.h deleted file mode 100644 index ded22856d0..0000000000 --- a/xen/include/asm-x86_64/elf.h +++ /dev/null @@ -1,233 +0,0 @@ -/* - * GRUB -- GRand Unified Bootloader - * Copyright (C) 1996 Erich Boleyn <erich@uruk.org> - * Copyright (C) 2001 Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* 32-bit data types */ - -typedef unsigned long Elf32_Addr; -typedef unsigned short Elf32_Half; -typedef unsigned long Elf32_Off; -typedef signed long Elf32_Sword; -typedef unsigned long Elf32_Word; -/* "unsigned char" already exists */ - -/* ELF header */ -typedef struct -{ - -#define EI_NIDENT 16 - - /* first four characters are defined below */ -#define EI_MAG0 0 -#define ELFMAG0 0x7f -#define EI_MAG1 1 -#define ELFMAG1 'E' -#define EI_MAG2 2 -#define ELFMAG2 'L' -#define EI_MAG3 3 -#define ELFMAG3 'F' - -#define EI_CLASS 4 /* data sizes */ -#define ELFCLASS32 1 /* i386 -- up to 32-bit data sizes present */ - -#define EI_DATA 5 /* data type and ordering */ -#define ELFDATA2LSB 1 /* i386 -- LSB 2's complement */ - -#define EI_VERSION 6 /* version number. "e_version" must be the same */ -#define EV_CURRENT 1 /* current version number */ - -#define EI_PAD 7 /* from here in is just padding */ - -#define EI_BRAND 8 /* start of OS branding (This is - obviously illegal against the ELF - standard.) 
*/ - - unsigned char e_ident[EI_NIDENT]; /* basic identification block */ - -#define ET_EXEC 2 /* we only care about executable types */ - Elf32_Half e_type; /* file types */ - -#define EM_386 3 /* i386 -- obviously use this one */ - Elf32_Half e_machine; /* machine types */ - Elf32_Word e_version; /* use same as "EI_VERSION" above */ - Elf32_Addr e_entry; /* entry point of the program */ - Elf32_Off e_phoff; /* program header table file offset */ - Elf32_Off e_shoff; /* section header table file offset */ - Elf32_Word e_flags; /* flags */ - Elf32_Half e_ehsize; /* elf header size in bytes */ - Elf32_Half e_phentsize; /* program header entry size */ - Elf32_Half e_phnum; /* number of entries in program header */ - Elf32_Half e_shentsize; /* section header entry size */ - Elf32_Half e_shnum; /* number of entries in section header */ - -#define SHN_UNDEF 0 -#define SHN_LORESERVE 0xff00 -#define SHN_LOPROC 0xff00 -#define SHN_HIPROC 0xff1f -#define SHN_ABS 0xfff1 -#define SHN_COMMON 0xfff2 -#define SHN_HIRESERVE 0xffff - Elf32_Half e_shstrndx; /* section header table index */ -} -Elf32_Ehdr; - - -#define BOOTABLE_I386_ELF(h) \ - ((h.e_ident[EI_MAG0] == ELFMAG0) & (h.e_ident[EI_MAG1] == ELFMAG1) \ - & (h.e_ident[EI_MAG2] == ELFMAG2) & (h.e_ident[EI_MAG3] == ELFMAG3) \ - & (h.e_ident[EI_CLASS] == ELFCLASS32) & (h.e_ident[EI_DATA] == ELFDATA2LSB) \ - & (h.e_ident[EI_VERSION] == EV_CURRENT) & (h.e_type == ET_EXEC) \ - & (h.e_machine == EM_386) & (h.e_version == EV_CURRENT)) - -/* section table - ? 
*/ -typedef struct -{ - Elf32_Word sh_name; /* Section name (string tbl index) */ - Elf32_Word sh_type; /* Section type */ - Elf32_Word sh_flags; /* Section flags */ - Elf32_Addr sh_addr; /* Section virtual addr at execution */ - Elf32_Off sh_offset; /* Section file offset */ - Elf32_Word sh_size; /* Section size in bytes */ - Elf32_Word sh_link; /* Link to another section */ - Elf32_Word sh_info; /* Additional section information */ - Elf32_Word sh_addralign; /* Section alignment */ - Elf32_Word sh_entsize; /* Entry size if section holds table */ -} -Elf32_Shdr; - -/* symbol table - page 4-25, figure 4-15 */ -typedef struct -{ - Elf32_Word st_name; - Elf32_Addr st_value; - Elf32_Word st_size; - unsigned char st_info; - unsigned char st_other; - Elf32_Half st_shndx; -} -Elf32_Sym; - -/* symbol type and binding attributes - page 4-26 */ - -#define ELF32_ST_BIND(i) ((i) >> 4) -#define ELF32_ST_TYPE(i) ((i) & 0xf) -#define ELF32_ST_INFO(b,t) (((b)<<4)+((t)&0xf)) - -/* symbol binding - page 4-26, figure 4-16 */ - -#define STB_LOCAL 0 -#define STB_GLOBAL 1 -#define STB_WEAK 2 -#define STB_LOPROC 13 -#define STB_HIPROC 15 - -/* symbol types - page 4-28, figure 4-17 */ - -#define STT_NOTYPE 0 -#define STT_OBJECT 1 -#define STT_FUNC 2 -#define STT_SECTION 3 -#define STT_FILE 4 -#define STT_LOPROC 13 -#define STT_HIPROC 15 - - -/* Macros to split/combine relocation type and symbol page 4-32 */ - -#define ELF32_R_SYM(__i) ((__i)>>8) -#define ELF32_R_TYPE(__i) ((unsigned char) (__i)) -#define ELF32_R_INFO(__s, __t) (((__s)<<8) + (unsigned char) (__t)) - - -/* program header - page 5-2, figure 5-1 */ - -typedef struct -{ - Elf32_Word p_type; - Elf32_Off p_offset; - Elf32_Addr p_vaddr; - Elf32_Addr p_paddr; - Elf32_Word p_filesz; - Elf32_Word p_memsz; - Elf32_Word p_flags; - Elf32_Word p_align; -} -Elf32_Phdr; - -/* segment types - page 5-3, figure 5-2 */ - -#define PT_NULL 0 -#define PT_LOAD 1 -#define PT_DYNAMIC 2 -#define PT_INTERP 3 -#define PT_NOTE 4 -#define PT_SHLIB 5 
-#define PT_PHDR 6 - -#define PT_LOPROC 0x70000000 -#define PT_HIPROC 0x7fffffff - -/* segment permissions - page 5-6 */ - -#define PF_X 0x1 -#define PF_W 0x2 -#define PF_R 0x4 -#define PF_MASKPROC 0xf0000000 - - -/* dynamic structure - page 5-15, figure 5-9 */ - -typedef struct -{ - Elf32_Sword d_tag; - union - { - Elf32_Word d_val; - Elf32_Addr d_ptr; - } - d_un; -} -Elf32_Dyn; - -/* Dynamic array tags - page 5-16, figure 5-10. */ - -#define DT_NULL 0 -#define DT_NEEDED 1 -#define DT_PLTRELSZ 2 -#define DT_PLTGOT 3 -#define DT_HASH 4 -#define DT_STRTAB 5 -#define DT_SYMTAB 6 -#define DT_RELA 7 -#define DT_RELASZ 8 -#define DT_RELAENT 9 -#define DT_STRSZ 10 -#define DT_SYMENT 11 -#define DT_INIT 12 -#define DT_FINI 13 -#define DT_SONAME 14 -#define DT_RPATH 15 -#define DT_SYMBOLIC 16 -#define DT_REL 17 -#define DT_RELSZ 18 -#define DT_RELENT 19 -#define DT_PLTREL 20 -#define DT_DEBUG 21 -#define DT_TEXTREL 22 -#define DT_JMPREL 23 diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h index 2b27b4b824..2335ed5ad2 100644 --- a/xen/include/hypervisor-ifs/hypervisor-if.h +++ b/xen/include/hypervisor-ifs/hypervisor-if.h @@ -247,18 +247,41 @@ typedef struct shared_info_st } shared_info_t; /* - * NB. We expect that this struct is smaller than a page. + * Start-of-day memory layout for the initial domain (DOM0): + * 1. The domain is started within contiguous virtual-memory region. + * 2. The contiguous region begins and ends on an aligned 4MB boundary. + * 3. The region start corresponds to the load address of the OS image. + * If the load address is not 4MB aligned then the address is rounded down. + * 4. This the order of bootstrap elements in the initial virtual region: + * a. relocated kernel image + * b. initial ram disk [mod_start, mod_len] + * c. list of allocated page frames [mfn_list, nr_pages] + * d. bootstrap page tables [pt_base, CR3 (x86)] + * e. start_info_t structure [register ESI (x86)] + * f. 
bootstrap stack [register ESP (x86)] + * 5. Bootstrap elements are packed together, but each is 4kB-aligned. + * 6. The initial ram disk may be omitted. + * 7. The list of page frames forms a contiguous 'pseudo-physical' memory + * layout for the domain. In particular, the bootstrap virtual-memory + * region is a 1:1 mapping to the first section of the pseudo-physical map. + * 8. All bootstrap elements are mapped read-writeable for the guest OS. The + * only exception is the bootstrap page table, which is mapped read-only. + * 9. There is guaranteed to be at least 512kB padding after the final + * bootstrap element. If necessary, the bootstrap virtual region is + * extended by an extra 4MB to ensure this. */ typedef struct start_info_st { /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */ - unsigned long nr_pages; /* total pages allocated to this domain. */ - unsigned long shared_info; /* MACHINE address of shared info struct.*/ + unsigned long nr_pages; /* total pages allocated to this domain. */ + unsigned long shared_info; /* MACHINE address of shared info struct.*/ unsigned long flags; /* SIF_xxx flags. */ /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */ - unsigned long pt_base; /* VIRTUAL address of page directory. */ - unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ - unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ - unsigned char cmd_line[1]; /* Variable-length options. */ + unsigned long pt_base; /* VIRTUAL address of page directory. */ + unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */ + unsigned long mfn_list; /* VIRTUAL address of page-frame list. */ + unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ + unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ + unsigned char cmd_line[1]; /* Variable-length options. */ } start_info_t; /* These flags are passed in the 'flags' field of start_info_t. 
*/ diff --git a/xen/include/xen/elf.h b/xen/include/xen/elf.h new file mode 100644 index 0000000000..ecf6bbca97 --- /dev/null +++ b/xen/include/xen/elf.h @@ -0,0 +1,523 @@ +/* + * Copyright (c) 1995, 1996 Erik Theisen. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +typedef u8 Elf_Byte; + +typedef u32 Elf32_Addr; /* Unsigned program address */ +typedef u32 Elf32_Off; /* Unsigned file offset */ +typedef s32 Elf32_Sword; /* Signed large integer */ +typedef u32 Elf32_Word; /* Unsigned large integer */ +typedef u16 Elf32_Half; /* Unsigned medium integer */ + +typedef u64 Elf64_Addr; +typedef u64 Elf64_Off; +typedef s32 Elf64_Shalf; + +typedef s32 Elf64_Sword; +typedef u32 Elf64_Word; + +typedef s64 Elf64_Sxword; +typedef u64 Elf64_Xword; + +typedef u32 Elf64_Half; +typedef u16 Elf64_Quarter; + +/* + * e_ident[] identification indexes + * See http://www.caldera.com/developers/gabi/2000-07-17/ch4.eheader.html + */ +#define EI_MAG0 0 /* file ID */ +#define EI_MAG1 1 /* file ID */ +#define EI_MAG2 2 /* file ID */ +#define EI_MAG3 3 /* file ID */ +#define EI_CLASS 4 /* file class */ +#define EI_DATA 5 /* data encoding */ +#define EI_VERSION 6 /* ELF header version */ +#define EI_OSABI 7 /* OS/ABI ID */ +#define EI_ABIVERSION 8 /* ABI version */ +#define EI_PAD 9 /* start of pad bytes */ +#define EI_NIDENT 16 /* Size of e_ident[] */ + +/* e_ident[] magic number */ +#define ELFMAG0 0x7f /* e_ident[EI_MAG0] */ +#define ELFMAG1 'E' /* e_ident[EI_MAG1] */ +#define ELFMAG2 'L' /* e_ident[EI_MAG2] */ +#define ELFMAG3 'F' /* e_ident[EI_MAG3] */ +#define ELFMAG "\177ELF" /* magic */ +#define SELFMAG 4 /* size of magic */ + +/* e_ident[] file class */ +#define ELFCLASSNONE 0 /* invalid */ +#define ELFCLASS32 1 /* 32-bit objs */ +#define ELFCLASS64 2 /* 64-bit objs */ +#define ELFCLASSNUM 3 /* number of classes */ + +/* e_ident[] data encoding */ +#define ELFDATANONE 0 /* invalid */ +#define ELFDATA2LSB 1 /* Little-Endian */ +#define ELFDATA2MSB 2 /* Big-Endian */ +#define ELFDATANUM 3 /* number of data encode defines */ + +/* e_ident[] Operating System/ABI */ +#define ELFOSABI_SYSV 0 /* UNIX System V ABI */ +#define ELFOSABI_HPUX 1 /* HP-UX operating system */ +#define ELFOSABI_NETBSD 2 /* NetBSD */ +#define ELFOSABI_LINUX 3 /* GNU/Linux 
*/ +#define ELFOSABI_HURD 4 /* GNU/Hurd */ +#define ELFOSABI_86OPEN 5 /* 86Open common IA32 ABI */ +#define ELFOSABI_SOLARIS 6 /* Solaris */ +#define ELFOSABI_MONTEREY 7 /* Monterey */ +#define ELFOSABI_IRIX 8 /* IRIX */ +#define ELFOSABI_FREEBSD 9 /* FreeBSD */ +#define ELFOSABI_TRU64 10 /* TRU64 UNIX */ +#define ELFOSABI_MODESTO 11 /* Novell Modesto */ +#define ELFOSABI_OPENBSD 12 /* OpenBSD */ +#define ELFOSABI_ARM 97 /* ARM */ +#define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */ + +/* e_ident */ +#define IS_ELF(ehdr) ((ehdr).e_ident[EI_MAG0] == ELFMAG0 && \ + (ehdr).e_ident[EI_MAG1] == ELFMAG1 && \ + (ehdr).e_ident[EI_MAG2] == ELFMAG2 && \ + (ehdr).e_ident[EI_MAG3] == ELFMAG3) + +/* ELF Header */ +typedef struct elfhdr { + unsigned char e_ident[EI_NIDENT]; /* ELF Identification */ + Elf32_Half e_type; /* object file type */ + Elf32_Half e_machine; /* machine */ + Elf32_Word e_version; /* object file version */ + Elf32_Addr e_entry; /* virtual entry point */ + Elf32_Off e_phoff; /* program header table offset */ + Elf32_Off e_shoff; /* section header table offset */ + Elf32_Word e_flags; /* processor-specific flags */ + Elf32_Half e_ehsize; /* ELF header size */ + Elf32_Half e_phentsize; /* program header entry size */ + Elf32_Half e_phnum; /* number of program header entries */ + Elf32_Half e_shentsize; /* section header entry size */ + Elf32_Half e_shnum; /* number of section header entries */ + Elf32_Half e_shstrndx; /* section header table's "section + header string table" entry offset */ +} Elf32_Ehdr; + +typedef struct { + unsigned char e_ident[EI_NIDENT]; /* Id bytes */ + Elf64_Quarter e_type; /* file type */ + Elf64_Quarter e_machine; /* machine type */ + Elf64_Half e_version; /* version number */ + Elf64_Addr e_entry; /* entry point */ + Elf64_Off e_phoff; /* Program hdr offset */ + Elf64_Off e_shoff; /* Section hdr offset */ + Elf64_Half e_flags; /* Processor flags */ + Elf64_Quarter e_ehsize; /* sizeof ehdr */ + Elf64_Quarter 
e_phentsize; /* Program header entry size */ + Elf64_Quarter e_phnum; /* Number of program headers */ + Elf64_Quarter e_shentsize; /* Section header entry size */ + Elf64_Quarter e_shnum; /* Number of section headers */ + Elf64_Quarter e_shstrndx; /* String table index */ +} Elf64_Ehdr; + +/* e_type */ +#define ET_NONE 0 /* No file type */ +#define ET_REL 1 /* relocatable file */ +#define ET_EXEC 2 /* executable file */ +#define ET_DYN 3 /* shared object file */ +#define ET_CORE 4 /* core file */ +#define ET_NUM 5 /* number of types */ +#define ET_LOPROC 0xff00 /* reserved range for processor */ +#define ET_HIPROC 0xffff /* specific e_type */ + +/* e_machine */ +#define EM_NONE 0 /* No Machine */ +#define EM_M32 1 /* AT&T WE 32100 */ +#define EM_SPARC 2 /* SPARC */ +#define EM_386 3 /* Intel 80386 */ +#define EM_68K 4 /* Motorola 68000 */ +#define EM_88K 5 /* Motorola 88000 */ +#define EM_486 6 /* Intel 80486 - unused? */ +#define EM_860 7 /* Intel 80860 */ +#define EM_MIPS 8 /* MIPS R3000 Big-Endian only */ +/* + * Don't know if EM_MIPS_RS4_BE, + * EM_SPARC64, EM_PARISC, + * or EM_PPC are ABI compliant + */ +#define EM_MIPS_RS4_BE 10 /* MIPS R4000 Big-Endian */ +#define EM_SPARC64 11 /* SPARC v9 64-bit unoffical */ +#define EM_PARISC 15 /* HPPA */ +#define EM_SPARC32PLUS 18 /* Enhanced instruction set SPARC */ +#define EM_PPC 20 /* PowerPC */ +#define EM_ARM 40 /* Advanced RISC Machines ARM */ +#define EM_ALPHA 41 /* DEC ALPHA */ +#define EM_SPARCV9 43 /* SPARC version 9 */ +#define EM_ALPHA_EXP 0x9026 /* DEC ALPHA */ +#define EM_X86_64 62 /* AMD x86-64 architecture */ +#define EM_VAX 75 /* DEC VAX */ +#define EM_NUM 15 /* number of machine types */ + +/* Version */ +#define EV_NONE 0 /* Invalid */ +#define EV_CURRENT 1 /* Current */ +#define EV_NUM 2 /* number of versions */ + +/* Section Header */ +typedef struct { + Elf32_Word sh_name; /* name - index into section header + string table section */ + Elf32_Word sh_type; /* type */ + Elf32_Word sh_flags; /* flags 
*/ + Elf32_Addr sh_addr; /* address */ + Elf32_Off sh_offset; /* file offset */ + Elf32_Word sh_size; /* section size */ + Elf32_Word sh_link; /* section header table index link */ + Elf32_Word sh_info; /* extra information */ + Elf32_Word sh_addralign; /* address alignment */ + Elf32_Word sh_entsize; /* section entry size */ +} Elf32_Shdr; + +typedef struct { + Elf64_Half sh_name; /* section name */ + Elf64_Half sh_type; /* section type */ + Elf64_Xword sh_flags; /* section flags */ + Elf64_Addr sh_addr; /* virtual address */ + Elf64_Off sh_offset; /* file offset */ + Elf64_Xword sh_size; /* section size */ + Elf64_Half sh_link; /* link to another */ + Elf64_Half sh_info; /* misc info */ + Elf64_Xword sh_addralign; /* memory alignment */ + Elf64_Xword sh_entsize; /* table entry size */ +} Elf64_Shdr; + +/* Special Section Indexes */ +#define SHN_UNDEF 0 /* undefined */ +#define SHN_LORESERVE 0xff00 /* lower bounds of reserved indexes */ +#define SHN_LOPROC 0xff00 /* reserved range for processor */ +#define SHN_HIPROC 0xff1f /* specific section indexes */ +#define SHN_ABS 0xfff1 /* absolute value */ +#define SHN_COMMON 0xfff2 /* common symbol */ +#define SHN_HIRESERVE 0xffff /* upper bounds of reserved indexes */ + +/* sh_type */ +#define SHT_NULL 0 /* inactive */ +#define SHT_PROGBITS 1 /* program defined information */ +#define SHT_SYMTAB 2 /* symbol table section */ +#define SHT_STRTAB 3 /* string table section */ +#define SHT_RELA 4 /* relocation section with addends*/ +#define SHT_HASH 5 /* symbol hash table section */ +#define SHT_DYNAMIC 6 /* dynamic section */ +#define SHT_NOTE 7 /* note section */ +#define SHT_NOBITS 8 /* no space section */ +#define SHT_REL 9 /* relation section without addends */ +#define SHT_SHLIB 10 /* reserved - purpose unknown */ +#define SHT_DYNSYM 11 /* dynamic symbol table section */ +#define SHT_NUM 12 /* number of section types */ +#define SHT_LOPROC 0x70000000 /* reserved range for processor */ +#define SHT_HIPROC 0x7fffffff /* 
specific section header types */ +#define SHT_LOUSER 0x80000000 /* reserved range for application */ +#define SHT_HIUSER 0xffffffff /* specific indexes */ + +/* Section names */ +#define ELF_BSS ".bss" /* uninitialized data */ +#define ELF_DATA ".data" /* initialized data */ +#define ELF_DEBUG ".debug" /* debug */ +#define ELF_DYNAMIC ".dynamic" /* dynamic linking information */ +#define ELF_DYNSTR ".dynstr" /* dynamic string table */ +#define ELF_DYNSYM ".dynsym" /* dynamic symbol table */ +#define ELF_FINI ".fini" /* termination code */ +#define ELF_GOT ".got" /* global offset table */ +#define ELF_HASH ".hash" /* symbol hash table */ +#define ELF_INIT ".init" /* initialization code */ +#define ELF_REL_DATA ".rel.data" /* relocation data */ +#define ELF_REL_FINI ".rel.fini" /* relocation termination code */ +#define ELF_REL_INIT ".rel.init" /* relocation initialization code */ +#define ELF_REL_DYN ".rel.dyn" /* relocaltion dynamic link info */ +#define ELF_REL_RODATA ".rel.rodata" /* relocation read-only data */ +#define ELF_REL_TEXT ".rel.text" /* relocation code */ +#define ELF_RODATA ".rodata" /* read-only data */ +#define ELF_SHSTRTAB ".shstrtab" /* section header string table */ +#define ELF_STRTAB ".strtab" /* string table */ +#define ELF_SYMTAB ".symtab" /* symbol table */ +#define ELF_TEXT ".text" /* code */ + + +/* Section Attribute Flags - sh_flags */ +#define SHF_WRITE 0x1 /* Writable */ +#define SHF_ALLOC 0x2 /* occupies memory */ +#define SHF_EXECINSTR 0x4 /* executable */ +#define SHF_MASKPROC 0xf0000000 /* reserved bits for processor */ + /* specific section attributes */ + +/* Symbol Table Entry */ +typedef struct elf32_sym { + Elf32_Word st_name; /* name - index into string table */ + Elf32_Addr st_value; /* symbol value */ + Elf32_Word st_size; /* symbol size */ + unsigned char st_info; /* type and binding */ + unsigned char st_other; /* 0 - no defined meaning */ + Elf32_Half st_shndx; /* section header index */ +} Elf32_Sym; + +typedef struct { 
+ Elf64_Half st_name; /* Symbol name index in str table */ + Elf_Byte st_info; /* type / binding attrs */ + Elf_Byte st_other; /* unused */ + Elf64_Quarter st_shndx; /* section index of symbol */ + Elf64_Xword st_value; /* value of symbol */ + Elf64_Xword st_size; /* size of symbol */ +} Elf64_Sym; + +/* Symbol table index */ +#define STN_UNDEF 0 /* undefined */ + +/* Extract symbol info - st_info */ +#define ELF32_ST_BIND(x) ((x) >> 4) +#define ELF32_ST_TYPE(x) (((unsigned int) x) & 0xf) +#define ELF32_ST_INFO(b,t) (((b) << 4) + ((t) & 0xf)) + +#define ELF64_ST_BIND(x) ((x) >> 4) +#define ELF64_ST_TYPE(x) (((unsigned int) x) & 0xf) +#define ELF64_ST_INFO(b,t) (((b) << 4) + ((t) & 0xf)) + +/* Symbol Binding - ELF32_ST_BIND - st_info */ +#define STB_LOCAL 0 /* Local symbol */ +#define STB_GLOBAL 1 /* Global symbol */ +#define STB_WEAK 2 /* like global - lower precedence */ +#define STB_NUM 3 /* number of symbol bindings */ +#define STB_LOPROC 13 /* reserved range for processor */ +#define STB_HIPROC 15 /* specific symbol bindings */ + +/* Symbol type - ELF32_ST_TYPE - st_info */ +#define STT_NOTYPE 0 /* not specified */ +#define STT_OBJECT 1 /* data object */ +#define STT_FUNC 2 /* function */ +#define STT_SECTION 3 /* section */ +#define STT_FILE 4 /* file */ +#define STT_NUM 5 /* number of symbol types */ +#define STT_LOPROC 13 /* reserved range for processor */ +#define STT_HIPROC 15 /* specific symbol types */ + +/* Relocation entry with implicit addend */ +typedef struct { + Elf32_Addr r_offset; /* offset of relocation */ + Elf32_Word r_info; /* symbol table index and type */ +} Elf32_Rel; + +/* Relocation entry with explicit addend */ +typedef struct { + Elf32_Addr r_offset; /* offset of relocation */ + Elf32_Word r_info; /* symbol table index and type */ + Elf32_Sword r_addend; +} Elf32_Rela; + +/* Extract relocation info - r_info */ +#define ELF32_R_SYM(i) ((i) >> 8) +#define ELF32_R_TYPE(i) ((unsigned char) (i)) +#define ELF32_R_INFO(s,t) (((s) << 8) + 
(unsigned char)(t)) + +typedef struct { + Elf64_Xword r_offset; /* where to do it */ + Elf64_Xword r_info; /* index & type of relocation */ +} Elf64_Rel; + +typedef struct { + Elf64_Xword r_offset; /* where to do it */ + Elf64_Xword r_info; /* index & type of relocation */ + Elf64_Sxword r_addend; /* adjustment value */ +} Elf64_Rela; + +#define ELF64_R_SYM(info) ((info) >> 32) +#define ELF64_R_TYPE(info) ((info) & 0xFFFFFFFF) +#define ELF64_R_INFO(s,t) (((s) << 32) + (u32)(t)) + +/* Program Header */ +typedef struct { + Elf32_Word p_type; /* segment type */ + Elf32_Off p_offset; /* segment offset */ + Elf32_Addr p_vaddr; /* virtual address of segment */ + Elf32_Addr p_paddr; /* physical address - ignored? */ + Elf32_Word p_filesz; /* number of bytes in file for seg. */ + Elf32_Word p_memsz; /* number of bytes in mem. for seg. */ + Elf32_Word p_flags; /* flags */ + Elf32_Word p_align; /* memory alignment */ +} Elf32_Phdr; + +typedef struct { + Elf64_Half p_type; /* entry type */ + Elf64_Half p_flags; /* flags */ + Elf64_Off p_offset; /* offset */ + Elf64_Addr p_vaddr; /* virtual address */ + Elf64_Addr p_paddr; /* physical address */ + Elf64_Xword p_filesz; /* file size */ + Elf64_Xword p_memsz; /* memory size */ + Elf64_Xword p_align; /* memory & file alignment */ +} Elf64_Phdr; + +/* Segment types - p_type */ +#define PT_NULL 0 /* unused */ +#define PT_LOAD 1 /* loadable segment */ +#define PT_DYNAMIC 2 /* dynamic linking section */ +#define PT_INTERP 3 /* the RTLD */ +#define PT_NOTE 4 /* auxiliary information */ +#define PT_SHLIB 5 /* reserved - purpose undefined */ +#define PT_PHDR 6 /* program header */ +#define PT_NUM 7 /* Number of segment types */ +#define PT_LOPROC 0x70000000 /* reserved range for processor */ +#define PT_HIPROC 0x7fffffff /* specific segment types */ + +/* Segment flags - p_flags */ +#define PF_X 0x1 /* Executable */ +#define PF_W 0x2 /* Writable */ +#define PF_R 0x4 /* Readable */ +#define PF_MASKPROC 0xf0000000 /* reserved bits for 
processor */ + /* specific segment flags */ + +/* Dynamic structure */ +typedef struct { + Elf32_Sword d_tag; /* controls meaning of d_val */ + union { + Elf32_Word d_val; /* Multiple meanings - see d_tag */ + Elf32_Addr d_ptr; /* program virtual address */ + } d_un; +} Elf32_Dyn; + +typedef struct { + Elf64_Xword d_tag; /* controls meaning of d_val */ + union { + Elf64_Addr d_ptr; + Elf64_Xword d_val; + } d_un; +} Elf64_Dyn; + +/* Dynamic Array Tags - d_tag */ +#define DT_NULL 0 /* marks end of _DYNAMIC array */ +#define DT_NEEDED 1 /* string table offset of needed lib */ +#define DT_PLTRELSZ 2 /* size of relocation entries in PLT */ +#define DT_PLTGOT 3 /* address PLT/GOT */ +#define DT_HASH 4 /* address of symbol hash table */ +#define DT_STRTAB 5 /* address of string table */ +#define DT_SYMTAB 6 /* address of symbol table */ +#define DT_RELA 7 /* address of relocation table */ +#define DT_RELASZ 8 /* size of relocation table */ +#define DT_RELAENT 9 /* size of relocation entry */ +#define DT_STRSZ 10 /* size of string table */ +#define DT_SYMENT 11 /* size of symbol table entry */ +#define DT_INIT 12 /* address of initialization func. */ +#define DT_FINI 13 /* address of termination function */ +#define DT_SONAME 14 /* string table offset of shared obj */ +#define DT_RPATH 15 /* string table offset of library + search path */ +#define DT_SYMBOLIC 16 /* start sym search in shared obj. */ +#define DT_REL 17 /* address of rel. tbl. w addends */ +#define DT_RELSZ 18 /* size of DT_REL relocation table */ +#define DT_RELENT 19 /* size of DT_REL relocation entry */ +#define DT_PLTREL 20 /* PLT referenced relocation entry */ +#define DT_DEBUG 21 /* bugger */ +#define DT_TEXTREL 22 /* Allow rel. mod. to unwritable seg */ +#define DT_JMPREL 23 /* add. of PLT's relocation entries */ +#define DT_BIND_NOW 24 /* Bind now regardless of env setting */ +#define DT_NUM 25 /* Number used. 
*/ +#define DT_LOPROC 0x70000000 /* reserved range for processor */ +#define DT_HIPROC 0x7fffffff /* specific dynamic array tags */ + +/* Standard ELF hashing function */ +unsigned int elf_hash(const unsigned char *name); + +/* + * Note Definitions + */ +typedef struct { + Elf32_Word namesz; + Elf32_Word descsz; + Elf32_Word type; +} Elf32_Note; + +typedef struct { + Elf64_Half namesz; + Elf64_Half descsz; + Elf64_Half type; +} Elf64_Note; + + +#if defined(ELFSIZE) +#define CONCAT(x,y) __CONCAT(x,y) +#define ELFNAME(x) CONCAT(elf,CONCAT(ELFSIZE,CONCAT(_,x))) +#define ELFNAME2(x,y) CONCAT(x,CONCAT(_elf,CONCAT(ELFSIZE,CONCAT(_,y)))) +#define ELFNAMEEND(x) CONCAT(x,CONCAT(_elf,ELFSIZE)) +#define ELFDEFNNAME(x) CONCAT(ELF,CONCAT(ELFSIZE,CONCAT(_,x))) +#endif + +#if defined(ELFSIZE) && (ELFSIZE == 32) +#define Elf_Ehdr Elf32_Ehdr +#define Elf_Phdr Elf32_Phdr +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Rel Elf32_Rel +#define Elf_RelA Elf32_Rela +#define Elf_Dyn Elf32_Dyn +#define Elf_Word Elf32_Word +#define Elf_Sword Elf32_Sword +#define Elf_Addr Elf32_Addr +#define Elf_Off Elf32_Off +#define Elf_Nhdr Elf32_Nhdr +#define Elf_Note Elf32_Note + +#define ELF_R_SYM ELF32_R_SYM +#define ELF_R_TYPE ELF32_R_TYPE +#define ELF_R_INFO ELF32_R_INFO +#define ELFCLASS ELFCLASS32 + +#define ELF_ST_BIND ELF32_ST_BIND +#define ELF_ST_TYPE ELF32_ST_TYPE +#define ELF_ST_INFO ELF32_ST_INFO + +#define AuxInfo Aux32Info +#elif defined(ELFSIZE) && (ELFSIZE == 64) +#define Elf_Ehdr Elf64_Ehdr +#define Elf_Phdr Elf64_Phdr +#define Elf_Shdr Elf64_Shdr +#define Elf_Sym Elf64_Sym +#define Elf_Rel Elf64_Rel +#define Elf_RelA Elf64_Rela +#define Elf_Dyn Elf64_Dyn +#define Elf_Word Elf64_Word +#define Elf_Sword Elf64_Sword +#define Elf_Addr Elf64_Addr +#define Elf_Off Elf64_Off +#define Elf_Nhdr Elf64_Nhdr +#define Elf_Note Elf64_Note + +#define ELF_R_SYM ELF64_R_SYM +#define ELF_R_TYPE ELF64_R_TYPE +#define ELF_R_INFO ELF64_R_INFO +#define ELFCLASS ELFCLASS64 + +#define 
ELF_ST_BIND ELF64_ST_BIND +#define ELF_ST_TYPE ELF64_ST_TYPE +#define ELF_ST_INFO ELF64_ST_INFO + +#define AuxInfo Aux64Info +#endif + diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h index bc5a6362ea..37705452e2 100644 --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -125,6 +125,7 @@ extern spinlock_t free_list_lock; extern unsigned int free_pfns; extern unsigned long max_page; void init_frametable(unsigned long nr_pages); +void add_to_domain_alloc_list(unsigned long ps, unsigned long pe); struct pfn_info *alloc_domain_page(struct task_struct *p); void free_domain_page(struct pfn_info *page); diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 58ffffa5e2..4f506df04b 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -228,10 +228,13 @@ struct task_struct *alloc_task_struct(); extern struct task_struct *do_createdomain( domid_t dom_id, unsigned int cpu); -extern int setup_guestos( - struct task_struct *p, dom0_createdomain_t *params, unsigned int num_vifs, - char *data_start, unsigned long data_len, - char *cmdline, unsigned long initrd_len); +extern int construct_dom0(struct task_struct *p, + unsigned long alloc_start, + unsigned long alloc_end, + unsigned int num_vifs, + char *image_start, unsigned long image_len, + char *initrd_start, unsigned long initrd_len, + char *cmdline); extern int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *); struct task_struct *find_domain_by_id(domid_t dom); diff --git a/xenolinux-2.4.25-sparse/arch/xen/Makefile b/xenolinux-2.4.25-sparse/arch/xen/Makefile index f52b90632f..214675d8ec 100644 --- a/xenolinux-2.4.25-sparse/arch/xen/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xen/Makefile @@ -19,7 +19,7 @@ override EXTRAVERSION := -xen$(EXTRAVERSION) LD=$(CROSS_COMPILE)ld -m elf_i386 -OBJCOPY=$(CROSS_COMPILE)objcopy -O binary -R .note -R .comment -S +OBJCOPY=$(CROSS_COMPILE)objcopy -R .note -R .comment -S LDFLAGS=-e stext LINKFLAGS =-T $(TOPDIR)/arch/xen/vmlinux.lds 
$(LDFLAGS) diff --git a/xenolinux-2.4.25-sparse/arch/xen/boot/Makefile b/xenolinux-2.4.25-sparse/arch/xen/boot/Makefile index 01fb131959..64b402e833 100644 --- a/xenolinux-2.4.25-sparse/arch/xen/boot/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xen/boot/Makefile @@ -6,17 +6,9 @@ xenolinux.gz: xenolinux gzip -f -9 < $< > $@ xenolinux: $(TOPDIR)/vmlinux - # Guest OS header -- first 8 bytes are identifier 'XenGuest'. - echo -e -n 'XenGuest' >$@ - # Guest OS header -- next 4 bytes are load address (0xC0000000). - echo -e -n '\000\000\000\300' >>$@ - $(OBJCOPY) $< xenolinux.body - # Guest OS header is immediately followed by raw OS image. - # Start address must be at byte 0. - cat xenolinux.body >>$@ - rm -f xenolinux.body + $(OBJCOPY) $< $@ dep: clean: - rm -f xenolinux xenolinux.gz
\ No newline at end of file + rm -f xenolinux* diff --git a/xenolinux-2.4.25-sparse/arch/xen/config.in b/xenolinux-2.4.25-sparse/arch/xen/config.in index dcf812d659..c66383f643 100644 --- a/xenolinux-2.4.25-sparse/arch/xen/config.in +++ b/xenolinux-2.4.25-sparse/arch/xen/config.in @@ -82,9 +82,9 @@ fi # tristate 'BIOS Enhanced Disk Drive calls determine boot disk (EXPERIMENTAL)' CONFIG_EDD #fi -#choice 'High Memory Support' \ -# "off CONFIG_NOHIGHMEM \ -# 4GB CONFIG_HIGHMEM4G \ +choice 'High Memory Support' \ + "off CONFIG_NOHIGHMEM \ + 4GB CONFIG_HIGHMEM4G" off # 64GB CONFIG_HIGHMEM64G" off if [ "$CONFIG_HIGHMEM4G" = "y" ]; then define_bool CONFIG_HIGHMEM y diff --git a/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S b/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S index 8ce06571ef..c744f1bdcb 100644 --- a/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S +++ b/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S @@ -369,21 +369,28 @@ critical_fixup_table: # Hypervisor uses this for application faults while it executes. 
ENTRY(failsafe_callback) + pushal call SYMBOL_NAME(install_safe_pf_handler) -1: pop %ds -2: pop %es -3: pop %fs -4: pop %gs + movl 32(%esp),%ebx +1: movl %ebx,%ds + movl 36(%esp),%ebx +2: movl %ebx,%es + movl 40(%esp),%ebx +3: movl %ebx,%fs + movl 44(%esp),%ebx +4: movl %ebx,%gs call SYMBOL_NAME(install_normal_pf_handler) + popal + addl $16,%esp 5: iret .section .fixup,"ax"; \ -6: movl $0,(%esp); \ +6: xorl %ebx,%ebx; \ jmp 1b; \ -7: movl $0,(%esp); \ +7: xorl %ebx,%ebx; \ jmp 2b; \ -8: movl $0,(%esp); \ +8: xorl %ebx,%ebx; \ jmp 3b; \ -9: movl $0,(%esp); \ +9: xorl %ebx,%ebx; \ jmp 4b; \ 10: pushl %ss; \ popl %ds; \ diff --git a/xenolinux-2.4.25-sparse/arch/xen/kernel/head.S b/xenolinux-2.4.25-sparse/arch/xen/kernel/head.S index 361815a58b..2d9379a15b 100644 --- a/xenolinux-2.4.25-sparse/arch/xen/kernel/head.S +++ b/xenolinux-2.4.25-sparse/arch/xen/kernel/head.S @@ -1,4 +1,7 @@ +.section __xen_guest + .asciz "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=1.3" + .text #include <linux/config.h> #include <linux/threads.h> @@ -8,42 +11,14 @@ #include <asm/pgtable.h> #include <asm/desc.h> -/* Offsets in start_info structure */ -#define MOD_START 16 -#define MOD_LEN 20 - -startup_32: +ENTRY(stext) +ENTRY(_stext) cld - lss stack_start,%esp - - /* Copy initrd somewhere safe before it's clobbered by BSS. */ - mov MOD_LEN(%esi),%ecx - shr $2,%ecx - jz 2f /* bail from copy loop if no initrd */ - mov $SYMBOL_NAME(_end),%edi - add MOD_LEN(%esi),%edi - mov MOD_START(%esi),%eax - add MOD_LEN(%esi),%eax -1: sub $4,%eax - sub $4,%edi - mov (%eax),%ebx - mov %ebx,(%edi) - loop 1b - mov %edi,MOD_START(%esi) - - /* Clear BSS first so that there are no surprises... */ -2: xorl %eax,%eax - movl $SYMBOL_NAME(__bss_start),%edi - movl $SYMBOL_NAME(_end),%ecx - subl %edi,%ecx - rep stosb - /* Copy the necessary stuff from start_info structure. 
*/ - mov $SYMBOL_NAME(start_info_union),%edi - mov $128,%ecx + mov $SYMBOL_NAME(start_info_union),%edi + mov $128,%ecx rep movsl - jmp SYMBOL_NAME(start_kernel) ENTRY(stack_start) @@ -62,5 +37,3 @@ ENTRY(cpu0_pte_quicklist) ENTRY(cpu0_pgd_quicklist) .org 0x3800 -ENTRY(stext) -ENTRY(_stext) diff --git a/xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c b/xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c index bce8d39497..bd65655c48 100644 --- a/xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c +++ b/xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c @@ -69,7 +69,6 @@ char ignore_irq13; /* set if exception 16 works */ struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; unsigned long mmu_cr4_features; -//EXPORT_SYMBOL(mmu_cr4_features); unsigned char * vgacon_mmap; @@ -106,6 +105,8 @@ unsigned char aux_device_present; extern int root_mountflags; extern char _text, _etext, _edata, _end; +extern int blk_nohighio; + int enable_acpi_smp_table; /* Raw start-of-day parameters from the hypervisor. */ @@ -160,7 +161,6 @@ static void __init parse_mem_cmdline (char ** cmdline_p) void __init setup_arch(char **cmdline_p) { unsigned long bootmap_size, start_pfn, max_low_pfn; - unsigned long i; extern void hypervisor_callback(void); extern void failsafe_callback(void); @@ -168,6 +168,10 @@ void __init setup_arch(char **cmdline_p) extern unsigned long cpu0_pte_quicklist[]; extern unsigned long cpu0_pgd_quicklist[]; +#ifndef CONFIG_HIGHIO + blk_nohighio = 1; +#endif + HYPERVISOR_set_callbacks( __KERNEL_CS, (unsigned long)hypervisor_callback, __KERNEL_CS, (unsigned long)failsafe_callback); @@ -208,7 +212,7 @@ void __init setup_arch(char **cmdline_p) #define PFN_PHYS(x) ((x) << PAGE_SHIFT) /* - * 128MB for vmalloc and initrd + * 128MB for vmalloc(), iomap(), kmap(), and fixaddr mappings. 
*/ #define VMALLOC_RESERVE (unsigned long)(128 << 20) #define MAXMEM (unsigned long)(HYPERVISOR_VIRT_START-PAGE_OFFSET-VMALLOC_RESERVE) @@ -216,21 +220,9 @@ void __init setup_arch(char **cmdline_p) #define MAX_NONPAE_PFN (1 << 20) /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ -#ifdef CONFIG_BLK_DEV_INITRD - if ( start_info.mod_start ) - start_pfn = PFN_UP(__pa(start_info.mod_start + start_info.mod_len)); - else -#endif - start_pfn = PFN_UP(__pa(&_end)); - max_pfn = start_info.nr_pages; - - /* * Determine low and high memory ranges: */ - max_low_pfn = max_pfn; + max_low_pfn = max_pfn = start_info.nr_pages; if (max_low_pfn > MAXMEM_PFN) { max_low_pfn = MAXMEM_PFN; #ifndef CONFIG_HIGHMEM @@ -261,51 +253,36 @@ void __init setup_arch(char **cmdline_p) } #endif + phys_to_machine_mapping = (unsigned long *)start_info.mfn_list; + cur_pgd = init_mm.pgd = (pgd_t *)start_info.pt_base; + + start_pfn = (__pa(start_info.pt_base) >> PAGE_SHIFT) + + start_info.nr_pt_frames; + /* - * Initialize the boot-time allocator, and free up all RAM. - * Then reserve space for OS image, and the bootmem bitmap. + * Initialize the boot-time allocator, and free up all RAM. Then reserve + * space for OS image, initrd, phys->machine table, bootstrap page table, + * and the bootmem bitmap. + * NB. There is definitely enough room for the bootmem bitmap in the + * bootstrap page table. We are guaranteed to get >=512kB unused 'padding' + * for our own use after all bootstrap elements (see hypervisor-if.h). */ bootmap_size = init_bootmem(start_pfn, max_low_pfn); free_bootmem(0, PFN_PHYS(max_low_pfn)); reserve_bootmem(0, PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1); - /* Now reserve space for the hypervisor-provided page tables. 
*/ +#ifdef CONFIG_BLK_DEV_INITRD + if ( start_info.mod_start != 0 ) { - unsigned long *pgd = (unsigned long *)start_info.pt_base; - unsigned long pte; - int i; - reserve_bootmem(__pa(pgd), PAGE_SIZE); - for ( i = 0; i < (HYPERVISOR_VIRT_START>>22); i++ ) + if ( (__pa(start_info.mod_start) + start_info.mod_len) <= + (max_low_pfn << PAGE_SHIFT) ) { - unsigned long pgde = *pgd++; - if ( !(pgde & 1) ) continue; - pte = machine_to_phys(pgde & PAGE_MASK); - reserve_bootmem(pte, PAGE_SIZE); - } - } - cur_pgd = init_mm.pgd = (pgd_t *)start_info.pt_base; - - /* Now initialise the physical->machine mapping table. */ - phys_to_machine_mapping = alloc_bootmem(max_pfn * sizeof(unsigned long)); - for ( i = 0; i < max_pfn; i++ ) - { - unsigned long pgde, *ppte; - unsigned long pfn = i + (PAGE_OFFSET >> PAGE_SHIFT); - pgde = *((unsigned long *)start_info.pt_base + (pfn >> 10)); - ppte = (unsigned long *)machine_to_phys(pgde & PAGE_MASK) + (pfn&1023); - phys_to_machine_mapping[i] = - (*(unsigned long *)__va(ppte)) >> PAGE_SHIFT; - } - -#ifdef CONFIG_BLK_DEV_INITRD - if (start_info.mod_start) { - if ((__pa(start_info.mod_start) + start_info.mod_len) <= - (max_low_pfn << PAGE_SHIFT)) { initrd_start = start_info.mod_start; initrd_end = initrd_start + start_info.mod_len; initrd_below_start_ok = 1; } - else { + else + { printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", __pa(start_info.mod_start) + start_info.mod_len, @@ -317,7 +294,7 @@ void __init setup_arch(char **cmdline_p) paging_init(); - /* We are privileged guest os - should have IO privileges. */ + /* If we are a privileged guest OS then we should request IO privileges. 
*/ if ( start_info.flags & SIF_PRIVILEGED ) { dom0_op_t op; @@ -352,6 +329,13 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); +static int __init highio_setup(char *str) +{ + printk("i386: disabling HIGHMEM block I/O\n"); + blk_nohighio = 1; + return 1; +} +__setup("nohighio", highio_setup); static int __init get_model_name(struct cpuinfo_x86 *c) { diff --git a/xenolinux-2.4.25-sparse/arch/xen/mm/init.c b/xenolinux-2.4.25-sparse/arch/xen/mm/init.c index 0bb2d173e4..6bfdd3ae9f 100644 --- a/xenolinux-2.4.25-sparse/arch/xen/mm/init.c +++ b/xenolinux-2.4.25-sparse/arch/xen/mm/init.c @@ -58,6 +58,31 @@ int do_check_pgt_cache(int low, int high) } return freed; } + +/* + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the + * physical space so we can cache the place of the first one and move + * around without checking the pgd every time. + */ + +#if CONFIG_HIGHMEM +pte_t *kmap_pte; +pgprot_t kmap_prot; + +#define kmap_get_fixmap_pte(vaddr) \ + pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + +void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* cache the first kmap pte */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + + kmap_prot = PAGE_KERNEL; +} +#endif /* CONFIG_HIGHMEM */ void show_mem(void) { @@ -186,6 +211,77 @@ static void __init fixrange_init (unsigned long start, } +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *kpgd, *pgd, *pgd_base; + int i, j, k; + pmd_t *kpmd, *pmd; + pte_t *kpte, *pte, *pte_base; + + /* + * This can be zero as well - no problem, in that case we exit + * the loops anyway due to the PTRS_PER_* conditions. 
+ */ + end = (unsigned long)__va(max_low_pfn*PAGE_SIZE); + + pgd_base = init_mm.pgd; + i = __pgd_offset(PAGE_OFFSET); + pgd = pgd_base + i; + + for (; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = (pmd_t *)pgd; + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + + /* Filled in for us already? */ + if ( pmd_val(*pmd) & _PAGE_PRESENT ) + continue; + + pte_base = pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + + for (k = 0; k < PTRS_PER_PTE; pte++, k++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE; + if (end && (vaddr >= end)) + break; + *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL); + } + kpgd = pgd_offset_k((unsigned long)pte_base); + kpmd = pmd_offset(kpgd, (unsigned long)pte_base); + kpte = pte_offset(kpmd, (unsigned long)pte_base); + queue_l1_entry_update(kpte, + (*(unsigned long *)kpte)&~_PAGE_RW); + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); + XEN_flush_page_update_queue(); + } + } + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + fixrange_init(vaddr, HYPERVISOR_VIRT_START, init_mm.pgd); + +#if CONFIG_HIGHMEM + /* + * Permanent kmaps: + */ + vaddr = PKMAP_BASE; + fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, init_mm.pgd); + + pgd = init_mm.pgd + __pgd_offset(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset(pmd, vaddr); + pkmap_page_table = pte; +#endif +} + static void __init zone_sizes_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; @@ -207,33 +303,18 @@ static void __init zone_sizes_init(void) free_area_init(zones_size); } -/* - * paging_init() sets up the page tables - note that the first 8MB are - * already mapped by head.S. 
- * - * This routines also unmaps the page at virtual kernel address 0, so - * that we can trap those pesky NULL-reference errors in the kernel. - */ void __init paging_init(void) { - unsigned long vaddr; + pagetable_init(); zone_sizes_init(); - /* - * Fixed mappings, only the page table structure has to be created - - * mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, HYPERVISOR_VIRT_START, init_mm.pgd); - /* Switch to the real shared_info page, and clear the dummy page. */ set_fixmap(FIX_SHARED_INFO, start_info.shared_info); HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); memset(empty_zero_page, 0, sizeof(empty_zero_page)); #ifdef CONFIG_HIGHMEM -#error kmap_init(); #endif } @@ -243,6 +324,11 @@ static inline int page_is_ram (unsigned long pagenr) return 1; } +static inline int page_kills_ppro(unsigned long pagenr) +{ + return 0; +} + #ifdef CONFIG_HIGHMEM void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) { @@ -278,8 +364,7 @@ static void __init set_max_mapnr_init(void) static int __init free_pages_init(void) { #ifdef CONFIG_HIGHMEM -#error Where is this supposed to be initialised? 
- int bad_ppro; + int bad_ppro = 0; #endif int reservedpages, pfn; diff --git a/xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h b/xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h index 2441b01d4e..338bd4ba2c 100644 --- a/xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h +++ b/xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h @@ -17,6 +17,10 @@ #include <linux/kernel.h> #include <asm/apicdef.h> #include <asm/page.h> +#ifdef CONFIG_HIGHMEM +#include <linux/threads.h> +#include <asm/kmap_types.h> +#endif /* * Here we define all the compile-time 'special' virtual @@ -38,7 +42,7 @@ */ enum fixed_addresses { -#ifdef CONFIG_HIGHMEM_XXX +#ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, #endif diff --git a/xenolinux-2.4.25-sparse/include/asm-xen/highmem.h b/xenolinux-2.4.25-sparse/include/asm-xen/highmem.h index 7e56b1b32d..25ef32882c 100644 --- a/xenolinux-2.4.25-sparse/include/asm-xen/highmem.h +++ b/xenolinux-2.4.25-sparse/include/asm-xen/highmem.h @@ -1,2 +1,132 @@ -#error "Highmem unsupported!" +/* + * highmem.h: virtual kernel memory mappings for high memory + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terabyte physical memory. With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. 
+ * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ +#ifndef _ASM_HIGHMEM_H +#define _ASM_HIGHMEM_H + +#ifdef __KERNEL__ + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <asm/kmap_types.h> +#include <asm/pgtable.h> + +#ifdef CONFIG_DEBUG_HIGHMEM +#define HIGHMEM_DEBUG 1 +#else +#define HIGHMEM_DEBUG 0 +#endif + +/* declarations for highmem.c */ +extern unsigned long highstart_pfn, highend_pfn; + +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; +extern pte_t *pkmap_page_table; + +extern void kmap_init(void) __init; + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +#define PKMAP_BASE (HYPERVISOR_VIRT_START - (1<<23)) +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 512 +#else +#define LAST_PKMAP 1024 +#endif +#define LAST_PKMAP_MASK (LAST_PKMAP-1) +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +extern void * FASTCALL(kmap_high(struct page *page, int nonblocking)); +extern void FASTCALL(kunmap_high(struct page *page)); + +#define kmap(page) __kmap(page, 0) +#define kmap_nonblock(page) __kmap(page, 1) + +static inline void *__kmap(struct page *page, int nonblocking) +{ + if (in_interrupt()) + out_of_line_bug(); + if (page < highmem_start_page) + return page_address(page); + return kmap_high(page, nonblocking); +} + +static inline void kunmap(struct page *page) +{ + if (in_interrupt()) + out_of_line_bug(); + if (page < highmem_start_page) + return; + kunmap_high(page); +} + +/* + * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap + * gives a more generic (and caching) interface. But kmap_atomic can + * be used in IRQ contexts, so in some (very limited) cases we need + * it. 
+ */ +static inline void *kmap_atomic(struct page *page, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + if (page < highmem_start_page) + return page_address(page); + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +#if HIGHMEM_DEBUG + if (!pte_none(*(kmap_pte-idx))) + out_of_line_bug(); +#endif + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + __flush_tlb_one(vaddr); + + return (void*) vaddr; +} + +static inline void kunmap_atomic(void *kvaddr, enum km_type type) +{ +#if HIGHMEM_DEBUG + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + if (vaddr < FIXADDR_START) // FIXME + return; + + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) + out_of_line_bug(); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(kmap_pte-idx); + __flush_tlb_one(vaddr); +#endif +} + +#endif /* __KERNEL__ */ + +#endif /* _ASM_HIGHMEM_H */ diff --git a/xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h b/xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h index 36655e63e5..38721e4cff 100644 --- a/xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h +++ b/xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h @@ -101,7 +101,11 @@ extern void * high_memory; #define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \ ~(VMALLOC_OFFSET-1)) #define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#define VMALLOC_END (FIXADDR_START - 2*PAGE_SIZE) +#if CONFIG_HIGHMEM +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) +#else +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) +#endif #define _PAGE_BIT_PRESENT 0 #define _PAGE_BIT_RW 1 diff --git a/xenolinux-2.4.25-sparse/mm/highmem.c b/xenolinux-2.4.25-sparse/mm/highmem.c new file mode 100644 index 0000000000..a68937452c --- /dev/null +++ b/xenolinux-2.4.25-sparse/mm/highmem.c @@ -0,0 +1,455 @@ +/* + * High memory handling common code and 
variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * 64-bit physical space. With current x86 CPUs this + * means up to 64 Gigabytes physical RAM. + * + * Rewrote high memory support to move the page cache into + * high memory. Implemented permanent (schedulable) kmaps + * based on Linus' idea. + * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/swap.h> +#include <linux/slab.h> + +/* + * Virtual_count is not a pure "count". + * 0 means that it is not mapped, and has not been mapped + * since a TLB flush - it is usable. + * 1 means that there are no users, but it has been mapped + * since the last TLB flush - so we can't use it. + * n means that there are (n-1) current users of it. + */ +static int pkmap_count[LAST_PKMAP]; +static unsigned int last_pkmap_nr; +static spinlock_cacheline_t kmap_lock_cacheline = {SPIN_LOCK_UNLOCKED}; +#define kmap_lock kmap_lock_cacheline.lock + +pte_t * pkmap_page_table; + +static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + +static void flush_all_zero_pkmaps(void) +{ + int i; + + flush_cache_all(); + + for (i = 0; i < LAST_PKMAP; i++) { + struct page *page; + + /* + * zero means we don't have anything to do, + * >1 means that it is still in use. Only + * a count of 1 means that it is free but + * needs to be unmapped + */ + if (pkmap_count[i] != 1) + continue; + pkmap_count[i] = 0; + + /* sanity check */ + if (pte_none(pkmap_page_table[i])) + BUG(); + + /* + * Don't need an atomic fetch-and-clear op here; + * no-one has the page mapped, and cannot get at + * its virtual address (and hence PTE) without first + * getting the kmap_lock (which is held here). + * So no dangers, even with speculative execution. 
+ */ + page = pte_page(pkmap_page_table[i]); + pte_clear(&pkmap_page_table[i]); + + page->virtual = NULL; + } + flush_tlb_all(); +} + +static inline unsigned long map_new_virtual(struct page *page, int nonblocking) +{ + unsigned long vaddr; + int count; + +start: + count = LAST_PKMAP; + /* Find an empty entry */ + for (;;) { + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + if (!last_pkmap_nr) { + flush_all_zero_pkmaps(); + count = LAST_PKMAP; + } + if (!pkmap_count[last_pkmap_nr]) + break; /* Found a usable entry */ + if (--count) + continue; + + if (nonblocking) + return 0; + + /* + * Sleep for somebody else to unmap their entries + */ + { + DECLARE_WAITQUEUE(wait, current); + + current->state = TASK_UNINTERRUPTIBLE; + add_wait_queue(&pkmap_map_wait, &wait); + spin_unlock(&kmap_lock); + schedule(); + remove_wait_queue(&pkmap_map_wait, &wait); + spin_lock(&kmap_lock); + + /* Somebody else might have mapped it while we slept */ + if (page->virtual) + return (unsigned long) page->virtual; + + /* Re-start */ + goto start; + } + } + vaddr = PKMAP_ADDR(last_pkmap_nr); + set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + XEN_flush_page_update_queue(); + + pkmap_count[last_pkmap_nr] = 1; + page->virtual = (void *) vaddr; + + return vaddr; +} + +void *kmap_high(struct page *page, int nonblocking) +{ + unsigned long vaddr; + + /* + * For highmem pages, we can't trust "virtual" until + * after we have the lock. 
+ * + * We cannot call this from interrupts, as it may block + */ + spin_lock(&kmap_lock); + vaddr = (unsigned long) page->virtual; + if (!vaddr) { + vaddr = map_new_virtual(page, nonblocking); + if (!vaddr) + goto out; + } + pkmap_count[PKMAP_NR(vaddr)]++; + if (pkmap_count[PKMAP_NR(vaddr)] < 2) + BUG(); + out: + spin_unlock(&kmap_lock); + return (void*) vaddr; +} + +void kunmap_high(struct page *page) +{ + unsigned long vaddr; + unsigned long nr; + int need_wakeup; + + spin_lock(&kmap_lock); + vaddr = (unsigned long) page->virtual; + if (!vaddr) + BUG(); + nr = PKMAP_NR(vaddr); + + /* + * A count must never go down to zero + * without a TLB flush! + */ + need_wakeup = 0; + switch (--pkmap_count[nr]) { + case 0: + BUG(); + case 1: + /* + * Avoid an unnecessary wake_up() function call. + * The common case is pkmap_count[] == 1, but + * no waiters. + * The tasks queued in the wait-queue are guarded + * by both the lock in the wait-queue-head and by + * the kmap_lock. As the kmap_lock is held here, + * no need for the wait-queue-head's lock. Simply + * test if the queue is empty. + */ + need_wakeup = waitqueue_active(&pkmap_map_wait); + } + spin_unlock(&kmap_lock); + + /* do wake-up, if needed, race-free outside of the spin lock */ + if (need_wakeup) + wake_up(&pkmap_map_wait); +} + +#define POOL_SIZE 32 + +/* + * This lock gets no contention at all, normally. + */ +static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED; + +int nr_emergency_pages; +static LIST_HEAD(emergency_pages); + +int nr_emergency_bhs; +static LIST_HEAD(emergency_bhs); + +/* + * Simple bounce buffer support for highmem pages. + * This will be moved to the block layer in 2.5. 
+ */ + +static inline void copy_from_high_bh (struct buffer_head *to, + struct buffer_head *from) +{ + struct page *p_from; + char *vfrom; + + p_from = from->b_page; + + vfrom = kmap_atomic(p_from, KM_USER0); + memcpy(to->b_data, vfrom + bh_offset(from), to->b_size); + kunmap_atomic(vfrom, KM_USER0); +} + +static inline void copy_to_high_bh_irq (struct buffer_head *to, + struct buffer_head *from) +{ + struct page *p_to; + char *vto; + unsigned long flags; + + p_to = to->b_page; + __save_flags(flags); + __cli(); + vto = kmap_atomic(p_to, KM_BOUNCE_READ); + memcpy(vto + bh_offset(to), from->b_data, to->b_size); + kunmap_atomic(vto, KM_BOUNCE_READ); + __restore_flags(flags); +} + +static inline void bounce_end_io (struct buffer_head *bh, int uptodate) +{ + struct page *page; + struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private); + unsigned long flags; + + bh_orig->b_end_io(bh_orig, uptodate); + + page = bh->b_page; + + spin_lock_irqsave(&emergency_lock, flags); + if (nr_emergency_pages >= POOL_SIZE) + __free_page(page); + else { + /* + * We are abusing page->list to manage + * the highmem emergency pool: + */ + list_add(&page->list, &emergency_pages); + nr_emergency_pages++; + } + + if (nr_emergency_bhs >= POOL_SIZE) { +#ifdef HIGHMEM_DEBUG + /* Don't clobber the constructed slab cache */ + init_waitqueue_head(&bh->b_wait); +#endif + kmem_cache_free(bh_cachep, bh); + } else { + /* + * Ditto in the bh case, here we abuse b_inode_buffers: + */ + list_add(&bh->b_inode_buffers, &emergency_bhs); + nr_emergency_bhs++; + } + spin_unlock_irqrestore(&emergency_lock, flags); +} + +static __init int init_emergency_pool(void) +{ + struct sysinfo i; + si_meminfo(&i); + si_swapinfo(&i); + + if (!i.totalhigh) + return 0; + + spin_lock_irq(&emergency_lock); + while (nr_emergency_pages < POOL_SIZE) { + struct page * page = alloc_page(GFP_ATOMIC); + if (!page) { + printk("couldn't refill highmem emergency pages"); + break; + } + list_add(&page->list, &emergency_pages); 
+ nr_emergency_pages++; + } + while (nr_emergency_bhs < POOL_SIZE) { + struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC); + if (!bh) { + printk("couldn't refill highmem emergency bhs"); + break; + } + list_add(&bh->b_inode_buffers, &emergency_bhs); + nr_emergency_bhs++; + } + spin_unlock_irq(&emergency_lock); + printk("allocated %d pages and %d bhs reserved for the highmem bounces\n", + nr_emergency_pages, nr_emergency_bhs); + + return 0; +} + +__initcall(init_emergency_pool); + +static void bounce_end_io_write (struct buffer_head *bh, int uptodate) +{ + bounce_end_io(bh, uptodate); +} + +static void bounce_end_io_read (struct buffer_head *bh, int uptodate) +{ + struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private); + + if (uptodate) + copy_to_high_bh_irq(bh_orig, bh); + bounce_end_io(bh, uptodate); +} + +struct page *alloc_bounce_page (void) +{ + struct list_head *tmp; + struct page *page; + + page = alloc_page(GFP_NOHIGHIO); + if (page) + return page; + /* + * No luck. First, kick the VM so it doesn't idle around while + * we are using up our emergency rations. + */ + wakeup_bdflush(); + +repeat_alloc: + /* + * Try to allocate from the emergency pool. + */ + tmp = &emergency_pages; + spin_lock_irq(&emergency_lock); + if (!list_empty(tmp)) { + page = list_entry(tmp->next, struct page, list); + list_del(tmp->next); + nr_emergency_pages--; + } + spin_unlock_irq(&emergency_lock); + if (page) + return page; + + /* we need to wait I/O completion */ + run_task_queue(&tq_disk); + + yield(); + goto repeat_alloc; +} + +struct buffer_head *alloc_bounce_bh (void) +{ + struct list_head *tmp; + struct buffer_head *bh; + + bh = kmem_cache_alloc(bh_cachep, SLAB_NOHIGHIO); + if (bh) + return bh; + /* + * No luck. First, kick the VM so it doesn't idle around while + * we are using up our emergency rations. + */ + wakeup_bdflush(); + +repeat_alloc: + /* + * Try to allocate from the emergency pool. 
+ */ + tmp = &emergency_bhs; + spin_lock_irq(&emergency_lock); + if (!list_empty(tmp)) { + bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers); + list_del(tmp->next); + nr_emergency_bhs--; + } + spin_unlock_irq(&emergency_lock); + if (bh) + return bh; + + /* we need to wait I/O completion */ + run_task_queue(&tq_disk); + + yield(); + goto repeat_alloc; +} + +struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig) +{ + struct page *page; + struct buffer_head *bh; + + if (!PageHighMem(bh_orig->b_page)) + return bh_orig; + + bh = alloc_bounce_bh(); + /* + * This is wasteful for 1k buffers, but this is a stopgap measure + * and we are being ineffective anyway. This approach simplifies + * things immensly. On boxes with more than 4GB RAM this should + * not be an issue anyway. + */ + page = alloc_bounce_page(); + + set_bh_page(bh, page, 0); + + bh->b_next = NULL; + bh->b_blocknr = bh_orig->b_blocknr; + bh->b_size = bh_orig->b_size; + bh->b_list = -1; + bh->b_dev = bh_orig->b_dev; + bh->b_count = bh_orig->b_count; + bh->b_rdev = bh_orig->b_rdev; + bh->b_state = bh_orig->b_state; +#ifdef HIGHMEM_DEBUG + bh->b_flushtime = jiffies; + bh->b_next_free = NULL; + bh->b_prev_free = NULL; + /* bh->b_this_page */ + bh->b_reqnext = NULL; + bh->b_pprev = NULL; +#endif + /* bh->b_page */ + if (rw == WRITE) { + bh->b_end_io = bounce_end_io_write; + copy_from_high_bh(bh, bh_orig); + } else + bh->b_end_io = bounce_end_io_read; + bh->b_private = (void *)bh_orig; + bh->b_rsector = bh_orig->b_rsector; +#ifdef HIGHMEM_DEBUG + memset(&bh->b_wait, -1, sizeof(bh->b_wait)); +#endif + + return bh; +} + |