author     kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>  2004-03-31 16:15:50 +0000
committer  kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>  2004-03-31 16:15:50 +0000
commit     3072fef54df8b25abb624a52d0a13bdef9b74b97 (patch)
tree       ced584f168231f438bd5bc24e5860a86932a613f
parent     a6ba94a3071f2454379db3c84aee7298ca63ea61 (diff)
download   xen-3072fef54df8b25abb624a52d0a13bdef9b74b97.tar.gz
           xen-3072fef54df8b25abb624a52d0a13bdef9b74b97.tar.bz2
           xen-3072fef54df8b25abb624a52d0a13bdef9b74b97.zip
bitkeeper revision 1.825.3.18 (406aeeb6EYYfLTekemoWDRaWO8SuEQ)
highmem.c, highmem.h:
  new file
Many files:
  New bootstrap layout for DOM0 and for all Linux domains. Xenolinux now
  supports CONFIG_HIGHMEM for up to 4GB allocated to a Xenolinux domain.
.del-highmem.h~c3cf3f6856a90f6:
  Delete: xenolinux-2.4.25-sparse/include/asm-xen/highmem.h
elf.h:
  Rename: xen/include/asm-i386/elf.h -> xen/include/xen/elf.h
.del-elf.h~57347596c85127ef:
  Delete: xen/include/asm-x86_64/elf.h
-rw-r--r--  .rootkeys                                           |   6
-rw-r--r--  extras/mini-os/Makefile                             |  11
-rw-r--r--  extras/mini-os/head.S                               |  34
-rw-r--r--  tools/xc/lib/xc_linux_build.c                       | 546
-rw-r--r--  tools/xc/lib/xc_private.h                           |   9
-rw-r--r--  xen/arch/i386/mm.c                                  |   7
-rw-r--r--  xen/arch/i386/setup.c                               |   2
-rw-r--r--  xen/arch/i386/traps.c                               |  11
-rw-r--r--  xen/common/domain.c                                 | 495
-rw-r--r--  xen/common/kernel.c                                 |  47
-rw-r--r--  xen/common/memory.c                                 |  25
-rw-r--r--  xen/include/asm-i386/elf.h                          | 233
-rw-r--r--  xen/include/asm-x86_64/elf.h                        | 233
-rw-r--r--  xen/include/hypervisor-ifs/hypervisor-if.h          |  37
-rw-r--r--  xen/include/xen/elf.h                               | 523
-rw-r--r--  xen/include/xen/mm.h                                |   1
-rw-r--r--  xen/include/xen/sched.h                             |  11
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/Makefile           |   2
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/boot/Makefile      |  12
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/config.in          |   6
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S     |  23
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/kernel/head.S      |  41
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c     |  84
-rw-r--r--  xenolinux-2.4.25-sparse/arch/xen/mm/init.c          | 121
-rw-r--r--  xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h    |   6
-rw-r--r--  xenolinux-2.4.25-sparse/include/asm-xen/highmem.h   | 132
-rw-r--r--  xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h   |   6
-rw-r--r--  xenolinux-2.4.25-sparse/mm/highmem.c                | 455
28 files changed, 2099 insertions(+), 1020 deletions(-)
diff --git a/.rootkeys b/.rootkeys
index 7a74d6af0b..57ac0c7f50 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -411,7 +411,6 @@
3ddb79c34BFiXjBJ_cCKB0aCsV1IDw xen/include/asm-i386/desc.h
3e564149UkU91RX7onzpCAmbj_IFjw xen/include/asm-i386/dma.h
3e20b82fl1jmQiKdLy7fxMcutfpjWA xen/include/asm-i386/domain_page.h
-3ddb79c2O729EttZTYu1c8LcsUO_GQ xen/include/asm-i386/elf.h
3ddb79c3NU8Zy40OTrq3D-i30Y3t4A xen/include/asm-i386/fixmap.h
3e2d29944GI24gf7vOP_7x8EyuqxeA xen/include/asm-i386/flushtlb.h
3ddb79c39o75zPP0T1aQQ4mNrCAN2w xen/include/asm-i386/hardirq.h
@@ -460,7 +459,6 @@
404f1b9fl6AQ_a-T1TDK3fuwTPXmHw xen/include/asm-x86_64/desc.h
404f1ba05mjpUREtosjzz3PPL5cTJA xen/include/asm-x86_64/dma.h
404f1ba13mnjeZT2ytPm0DB63703nA xen/include/asm-x86_64/domain_page.h
-404f1ba2IXQ7E0x9NlqpR5hgYtC9RQ xen/include/asm-x86_64/elf.h
404f1ba31i0gS-cdqvd0RZX1HVnxsA xen/include/asm-x86_64/fixmap.h
404f1ba4KXQ_V7HOkenF04KRU7Tl7w xen/include/asm-x86_64/flushtlb.h
404f1ba5Sqzc22eXORShvCF9-rpMbA xen/include/asm-x86_64/hardirq.h
@@ -534,6 +532,7 @@
3ddb79c1V44RD26YqCUm-kqIupM37A xen/include/xen/ctype.h
3ddb79c05DdHQ0UxX_jKsXdR4QlMCA xen/include/xen/delay.h
3ddb79c1uaWQZj551j1O0B5z8AnHOg xen/include/xen/elevator.h
+3ddb79c2O729EttZTYu1c8LcsUO_GQ xen/include/xen/elf.h
3ddb79c0HIghfBF8zFUdmXhOU8i6hA xen/include/xen/errno.h
3ddb79c0rMjudDKkJku_mkm0J-BZgw xen/include/xen/etherdevice.h
3ddb79c0T3X07lFnM9OSE-W5bqIDSQ xen/include/xen/ethtool.h
@@ -663,7 +662,7 @@
3e5a4e66HdSkvIV6SJ1evG_xmTmXHA xenolinux-2.4.25-sparse/include/asm-xen/desc.h
4048c0e0_P2wUTiT6UqgPhn0s7yFcA xenolinux-2.4.25-sparse/include/asm-xen/evtchn.h
3e5a4e66SYp_UpAVcF8Lc1wa3Qtgzw xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h
-3e5a4e67w_DWgjIJ17Tlossu1LGujQ xenolinux-2.4.25-sparse/include/asm-xen/highmem.h
+406aeeaaQvl4RNtmd9hDEugBURbFpQ xenolinux-2.4.25-sparse/include/asm-xen/highmem.h
3e5a4e67YtcyDLQsShhCfQwPSELfvA xenolinux-2.4.25-sparse/include/asm-xen/hw_irq.h
3e5a4e677VBavzM1UZIEcH1B-RlXMA xenolinux-2.4.25-sparse/include/asm-xen/hypervisor.h
4060044fVx7-tokvNLKBf_6qBB4lqQ xenolinux-2.4.25-sparse/include/asm-xen/io.h
@@ -697,6 +696,7 @@
3f9d4b44247udoqWEgFkaHiWv6Uvyg xenolinux-2.4.25-sparse/kernel/time.c
401c059bjLBFYHRD4Py2uM3eA1D4zQ xenolinux-2.4.25-sparse/kernel/timer.c
3e6e7c1efbQe93xCvOpOVCnXTMmQ5w xenolinux-2.4.25-sparse/mkbuildtree
+406aeeafkrnCuIVWLFv3kfn4uAD5Eg xenolinux-2.4.25-sparse/mm/highmem.c
3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.25-sparse/mm/memory.c
3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.25-sparse/mm/mprotect.c
3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.25-sparse/mm/mremap.c
diff --git a/extras/mini-os/Makefile b/extras/mini-os/Makefile
index fc9c2bf733..d2d478ac40 100644
--- a/extras/mini-os/Makefile
+++ b/extras/mini-os/Makefile
@@ -24,17 +24,8 @@ hypervisor-ifs:
ln -sf ../../../xen/include/hypervisor-ifs h/hypervisor-ifs
$(TARGET): hypervisor-ifs head.o $(OBJS)
- # Image will load at 0xC0000000. First bytes from head.o
- #$(LD) -N -Ttext 0xC0000000 head.o $(OBJS) -o $@.elf
$(LD) -N -T minios.lds head.o $(OBJS) -o $@.elf
- # Guest OS header -- first 8 bytes are identifier 'XenGuest'.
- echo -e -n 'XenGuest' >$@
- # Guest OS header -- next 4 bytes are load address (0xC0000000).
- echo -e -n '\000\000\000\300' >>$@
- # Create a raw bag of bytes from the ELF image.
- objcopy -O binary -R .note -R .comment $@.elf $@.raw
- # Guest OS header is immediately followed by raw OS image.
- cat $@.raw >>$@
+ objcopy -R .note -R .comment $@.elf $@
gzip -f -9 -c $@ >$@.gz
clean:
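The rules deleted above wrapped the kernel in a fixed 12-byte prefix: the 8-byte signature 'XenGuest' followed by a little-endian 32-bit load address (0xC0000000, i.e. '\000\000\000\300' on disk); the new rule emits a plain ELF image instead. A minimal standalone sketch of the retired header format, for illustration only:

/* Illustrative reconstruction of the old 12-byte "XenGuest" header;
 * not part of this patch. Assumes a little-endian host, as on i386. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
    unsigned char hdr[12];
    uint32_t load_addr = 0xC0000000;    /* '\000\000\000\300' on disk */

    memcpy(hdr, "XenGuest", 8);         /* 8-byte signature           */
    memcpy(hdr + 8, &load_addr, 4);     /* 4-byte load address        */

    for (int i = 0; i < 12; i++)
        printf("%02x ", hdr[i]);        /* 58 65 6e 47 ... 00 00 00 c0 */
    printf("\n");
    return 0;
}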
diff --git a/extras/mini-os/head.S b/extras/mini-os/head.S
index 5844e296c4..52eae8f818 100644
--- a/extras/mini-os/head.S
+++ b/extras/mini-os/head.S
@@ -1,48 +1,18 @@
#include <os.h>
-/* Offsets in start_info structure */
-#define MOD_START 20
-#define MOD_LEN 24
-
.globl _start, shared_info
_start:
cld
-
lss stack_start,%esp
-
- /* Copy any module somewhere safe before it's clobbered by BSS. */
- mov MOD_LEN(%esi),%ecx
- shr $2,%ecx
- jz 2f /* bail from copy loop if no module */
-
- mov $_end,%edi
- add MOD_LEN(%esi),%edi
- mov MOD_START(%esi),%eax
- add MOD_LEN(%esi),%eax
-1: sub $4,%eax
- sub $4,%edi
- mov (%eax),%ebx
- mov %ebx,(%edi)
- loop 1b
- mov %edi,MOD_START(%esi)
-
- /* Clear BSS first so that there are no surprises... */
-2: xorl %eax,%eax
- movl $__bss_start,%edi
- movl $_end,%ecx
- subl %edi,%ecx
- rep stosb
-
push %esi
call start_kernel
-
stack_start:
.long stack+8192, __KERNEL_DS
-
- /* Unpleasant -- we actually use this PTE to map shared_info :-) */
+ /* Unpleasant -- the PTE that maps this page is actually overwritten */
+ /* to map the real shared-info page! :-) */
.org 0x1000
shared_info:
.org 0x2000
diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c
index f9f6949348..92fff33a6a 100644
--- a/tools/xc/lib/xc_linux_build.c
+++ b/tools/xc/lib/xc_linux_build.c
@@ -3,15 +3,24 @@
*/
#include "xc_private.h"
+#define ELFSIZE 32
+#include "xc_elf.h"
#include <zlib.h>
-/* This string is written to the head of every guest kernel image. */
-#define GUEST_SIG "XenGuest"
-#define SIG_LEN 8
-
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
+#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
+#define round_pgdown(_p) ((_p)&PAGE_MASK)
+
+static int readelfimage_base_and_size(char *elfbase,
+ unsigned long elfsize,
+ unsigned long *pkernstart,
+ unsigned long *pkernend,
+ unsigned long *pkernentry);
+static int loadelfimage(char *elfbase, int pmh, unsigned long *parray,
+ unsigned long vstart);
+
static long get_tot_pages(int xc_handle, u64 domid)
{
dom0_op_t op;
@@ -43,25 +52,6 @@ static int get_pfn_list(int xc_handle,
return (ret < 0) ? -1 : op.u.getmemlist.num_pfns;
}
-/* Read the kernel header, extracting the image size and load address. */
-static int read_kernel_header(gzFile gfd, long dom_size,
- unsigned long *load_addr)
-{
- char signature[SIG_LEN];
-
- gzread(gfd, signature, SIG_LEN);
- if ( strncmp(signature, GUEST_SIG, SIG_LEN) )
- {
- ERROR("Kernel image does not contain required signature");
- return -1;
- }
-
- /* Read the load address which immediately follows the Xen signature. */
- gzread(gfd, load_addr, sizeof(unsigned long));
-
- return 0;
-}
-
static int copy_to_domain_page(int pm_handle,
unsigned long dst_pfn,
void *src_page)
@@ -75,12 +65,11 @@ static int copy_to_domain_page(int pm_handle,
}
static int setup_guestos(int xc_handle,
- u64 dom,
- gzFile kernel_gfd,
- gzFile initrd_gfd,
- unsigned long tot_pages,
- unsigned long *virt_startinfo_addr,
- unsigned long virt_load_addr,
+ u64 dom,
+ char *image, unsigned long image_size,
+ gzFile initrd_gfd, unsigned long initrd_len,
+ unsigned long nr_pages,
+ unsigned long *pvsi, unsigned long *pvke,
dom0_builddomain_t *builddomain,
const char *cmdline,
unsigned long shared_info_frame)
@@ -88,140 +77,184 @@ static int setup_guestos(int xc_handle,
l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
unsigned long *page_array = NULL;
- int alloc_index, num_pt_pages;
unsigned long l2tab;
unsigned long l1tab;
- unsigned long count, pt_start, i, j;
- unsigned long initrd_addr = 0, initrd_len = 0;
+ unsigned long count, i;
start_info_t *start_info;
shared_info_t *shared_info;
- unsigned long ksize;
mmu_t *mmu = NULL;
- int pm_handle;
+ int pm_handle=-1, rc;
+
+ unsigned long nr_pt_pages;
+ unsigned long ppt_alloc;
+ unsigned long *physmap, *physmap_e, physmap_pfn;
+
+ unsigned long v_start;
+ unsigned long vkern_start;
+ unsigned long vkern_entry;
+ unsigned long vkern_end;
+ unsigned long vinitrd_start;
+ unsigned long vinitrd_end;
+ unsigned long vphysmap_start;
+ unsigned long vphysmap_end;
+ unsigned long vstartinfo_start;
+ unsigned long vstartinfo_end;
+ unsigned long vstack_start;
+ unsigned long vstack_end;
+ unsigned long vpt_start;
+ unsigned long vpt_end;
+ unsigned long v_end;
+
+ rc = readelfimage_base_and_size(image, image_size,
+ &vkern_start, &vkern_end, &vkern_entry);
+ if ( rc != 0 )
+ goto error_out;
+
+ /*
+ * Why do we need this? The number of page-table frames depends on the
+ * size of the bootstrap address space. But the size of the address space
+ * depends on the number of page-table frames (since each one is mapped
+ * read-only). We have a pair of simultaneous equations in two unknowns,
+ * which we solve by exhaustive search.
+ */
+ for ( nr_pt_pages = 2; ; nr_pt_pages++ )
+ {
+ v_start = vkern_start & ~((1<<22)-1);
+ vinitrd_start = round_pgup(vkern_end);
+ vinitrd_end = vinitrd_start + initrd_len;
+ vphysmap_start = round_pgup(vinitrd_end);
+ vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
+ vpt_start = round_pgup(vphysmap_end);
+ vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
+ vstartinfo_start = vpt_end;
+ vstartinfo_end = vstartinfo_start + PAGE_SIZE;
+ vstack_start = vstartinfo_end;
+ vstack_end = vstack_start + PAGE_SIZE;
+ v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
+ if ( (v_end - vstack_end) < (512 << 10) )
+ v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
+ if ( (((v_end - v_start) >> L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
+ break;
+ }
+
+ if ( (v_end - v_start) > (nr_pages * PAGE_SIZE) )
+ {
+ printf("Initial guest OS requires too much space\n"
+ "(%luMB is greater than %luMB limit)\n",
+ (v_end-v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
+ goto error_out;
+ }
+
+ printf("VIRTUAL MEMORY ARRANGEMENT:\n"
+ " Loaded kernel: %08lx->%08lx\n"
+ " Init. ramdisk: %08lx->%08lx\n"
+ " Phys-Mach map: %08lx->%08lx\n"
+ " Page tables: %08lx->%08lx\n"
+ " Start info: %08lx->%08lx\n"
+ " Boot stack: %08lx->%08lx\n"
+ " TOTAL: %08lx->%08lx\n",
+ vkern_start, vkern_end,
+ vinitrd_start, vinitrd_end,
+ vphysmap_start, vphysmap_end,
+ vpt_start, vpt_end,
+ vstartinfo_start, vstartinfo_end,
+ vstack_start, vstack_end,
+ v_start, v_end);
+ printf(" ENTRY ADDRESS: %08lx\n", vkern_entry);
memset(builddomain, 0, sizeof(*builddomain));
if ( (pm_handle = init_pfn_mapper()) < 0 )
goto error_out;
- if ( (page_array = malloc(tot_pages * sizeof(unsigned long))) == NULL )
+ if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL )
{
PERROR("Could not allocate memory");
goto error_out;
}
- if ( get_pfn_list(xc_handle, dom, page_array, tot_pages) != tot_pages )
+ if ( get_pfn_list(xc_handle, dom, page_array, nr_pages) != nr_pages )
{
PERROR("Could not get the page frame list");
goto error_out;
}
- /* Load the guest OS image. Let it take no more than 1/2 memory.*/
- for ( i = 0; i < ((tot_pages/2)*PAGE_SIZE); i += PAGE_SIZE )
- {
- char page[PAGE_SIZE];
- int size;
- if ( (size = gzread(kernel_gfd, page, PAGE_SIZE)) == -1 )
- {
- PERROR("Error reading kernel image, could not"
- " read the whole image.");
- goto error_out;
- }
- if ( size == 0 )
- goto kernel_copied;
- copy_to_domain_page(pm_handle, page_array[i>>PAGE_SHIFT], page);
- }
- ERROR("Kernel too big to safely fit in domain memory");
- goto error_out;
-
- kernel_copied:
- /* ksize is kernel-image size rounded up to a page boundary. */
- ksize = i;
+ loadelfimage(image, pm_handle, page_array, v_start);
/* Load the initial ramdisk image. */
- if ( initrd_gfd )
+ if ( initrd_len != 0 )
{
- int size;
-
- for ( j=0, i=ksize; i < ((tot_pages/2) * PAGE_SIZE); i += PAGE_SIZE )
+ for ( i = (vinitrd_start - v_start);
+ i < (vinitrd_end - v_start); i += PAGE_SIZE )
{
char page[PAGE_SIZE];
- if ( (size = gzread(initrd_gfd, page, PAGE_SIZE)) == -1 )
+ if ( gzread(initrd_gfd, page, PAGE_SIZE) == -1 )
{
PERROR("Error reading initrd image, could not");
goto error_out;
}
- j += size;
- if ( size > 0 )
- copy_to_domain_page(pm_handle,
- page_array[i>>PAGE_SHIFT], page);
- if ( size < PAGE_SIZE )
- goto initrd_copied;
+ copy_to_domain_page(pm_handle,
+ page_array[i>>PAGE_SHIFT], page);
}
- ERROR("Kernel/initrd too big to safely fit in domain memory");
- goto error_out;
-
- initrd_copied:
- initrd_addr = virt_load_addr + ksize;
- initrd_len = j;
}
- alloc_index = tot_pages - 1;
-
- /* Count bottom-level PTs, rounding up. */
- num_pt_pages = (l1_table_offset(virt_load_addr) + tot_pages + 1023) / 1024;
-
- /* We must also count the page directory. */
- num_pt_pages++;
-
- /* Index of first PT page. */
- pt_start = tot_pages - num_pt_pages;
-
- /*
- * First allocate page for page dir. Allocation goes backwards from the end
- * of the allocated physical address space.
- */
- l2tab = page_array[alloc_index] << PAGE_SHIFT;
- alloc_index--;
- builddomain->ctxt.pt_base = l2tab;
-
if ( (mmu = init_mmu_updates(xc_handle, dom)) == NULL )
goto error_out;
+ /* First allocate page for page dir. */
+ ppt_alloc = (vpt_start - v_start) >> PAGE_SHIFT;
+ l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
+ builddomain->ctxt.pt_base = l2tab;
+
/* Initialise the page tables. */
if ( (vl2tab = map_pfn_writeable(pm_handle, l2tab >> PAGE_SHIFT)) == NULL )
goto error_out;
memset(vl2tab, 0, PAGE_SIZE);
- vl2e = &vl2tab[l2_table_offset(virt_load_addr)];
- for ( count = 0; count < tot_pages; count++ )
+ vl2e = &vl2tab[l2_table_offset(v_start)];
+ for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
{
if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
{
- l1tab = page_array[alloc_index--] << PAGE_SHIFT;
+ l1tab = page_array[ppt_alloc++] << PAGE_SHIFT;
if ( vl1tab != NULL )
unmap_pfn(pm_handle, vl1tab);
if ( (vl1tab = map_pfn_writeable(pm_handle,
l1tab >> PAGE_SHIFT)) == NULL )
goto error_out;
memset(vl1tab, 0, PAGE_SIZE);
- vl1e = &vl1tab[l1_table_offset(virt_load_addr +
- (count<<PAGE_SHIFT))];
+ vl1e = &vl1tab[l1_table_offset(v_start + (count<<PAGE_SHIFT))];
*vl2e++ = l1tab | L2_PROT;
}
*vl1e = (page_array[count] << PAGE_SHIFT) | L1_PROT;
- if ( count >= pt_start )
+ if ( (count >= ((vpt_start-v_start)>>PAGE_SHIFT)) &&
+ (count < ((vpt_end -v_start)>>PAGE_SHIFT)) )
*vl1e &= ~_PAGE_RW;
vl1e++;
+ }
+ unmap_pfn(pm_handle, vl1tab);
+ unmap_pfn(pm_handle, vl2tab);
+ /* Write the phys->machine and machine->phys table entries. */
+ physmap_pfn = (vphysmap_start - v_start) >> PAGE_SHIFT;
+ physmap = physmap_e =
+ map_pfn_writeable(pm_handle, page_array[physmap_pfn++]);
+ for ( count = 0; count < nr_pages; count++ )
+ {
if ( add_mmu_update(xc_handle, mmu,
(page_array[count] << PAGE_SHIFT) |
MMU_MACHPHYS_UPDATE, count) )
goto error_out;
+ *physmap_e++ = page_array[count];
+ if ( ((unsigned long)physmap_e & (PAGE_SIZE-1)) == 0 )
+ {
+ unmap_pfn(pm_handle, physmap);
+ physmap = physmap_e =
+ map_pfn_writeable(pm_handle, page_array[physmap_pfn++]);
+ }
}
- unmap_pfn(pm_handle, vl1tab);
- unmap_pfn(pm_handle, vl2tab);
-
+ unmap_pfn(pm_handle, physmap);
+
/*
* Pin down l2tab addr as page dir page - causes hypervisor to provide
* correct protection for the page
@@ -230,17 +263,20 @@ static int setup_guestos(int xc_handle,
l2tab | MMU_EXTENDED_COMMAND, MMUEXT_PIN_L2_TABLE) )
goto error_out;
- *virt_startinfo_addr =
- virt_load_addr + ((alloc_index-1) << PAGE_SHIFT);
-
- start_info = map_pfn_writeable(pm_handle, page_array[alloc_index-1]);
+ start_info = map_pfn_writeable(
+ pm_handle, page_array[(vstartinfo_start-v_start)>>PAGE_SHIFT]);
memset(start_info, 0, sizeof(*start_info));
- start_info->pt_base = virt_load_addr + ((tot_pages-1) << PAGE_SHIFT);
- start_info->mod_start = initrd_addr;
- start_info->mod_len = initrd_len;
- start_info->nr_pages = tot_pages;
- start_info->shared_info = shared_info_frame << PAGE_SHIFT;
- start_info->flags = 0;
+ start_info->nr_pages = nr_pages;
+ start_info->shared_info = shared_info_frame << PAGE_SHIFT;
+ start_info->flags = 0;
+ start_info->pt_base = vpt_start;
+ start_info->nr_pt_frames = nr_pt_pages;
+ start_info->mfn_list = vphysmap_start;
+ if ( initrd_len != 0 )
+ {
+ start_info->mod_start = vinitrd_start;
+ start_info->mod_len = initrd_len;
+ }
strncpy(start_info->cmd_line, cmdline, MAX_CMD_LEN);
start_info->cmd_line[MAX_CMD_LEN-1] = '\0';
unmap_pfn(pm_handle, start_info);
@@ -258,6 +294,10 @@ static int setup_guestos(int xc_handle,
free(mmu);
(void)close_pfn_mapper(pm_handle);
free(page_array);
+
+ *pvsi = vstartinfo_start;
+ *pvke = vkern_entry;
+
return 0;
error_out:
@@ -270,74 +310,109 @@ static int setup_guestos(int xc_handle,
return -1;
}
-int xc_linux_build(int xc_handle,
- u64 domid,
- const char *image_name,
- const char *ramdisk_name,
- const char *cmdline)
+static unsigned long get_filesz(int fd)
{
- dom0_op_t launch_op, op;
- unsigned long load_addr;
- long tot_pages;
- int kernel_fd = -1, initrd_fd = -1;
- gzFile kernel_gfd = NULL, initrd_gfd = NULL;
- int rc, i;
- full_execution_context_t *ctxt;
- unsigned long virt_startinfo_addr;
-
- if ( (tot_pages = get_tot_pages(xc_handle, domid)) < 0 )
+ u16 sig;
+ u32 _sz = 0;
+ unsigned long sz;
+
+ lseek(fd, 0, SEEK_SET);
+ read(fd, &sig, sizeof(sig));
+ sz = lseek(fd, 0, SEEK_END);
+ if ( sig == 0x8b1f ) /* GZIP signature? */
{
- PERROR("Could not find total pages for domain");
- return 1;
+ lseek(fd, -4, SEEK_END);
+ read(fd, &_sz, 4);
+ sz = _sz;
}
+ lseek(fd, 0, SEEK_SET);
- kernel_fd = open(image_name, O_RDONLY);
- if ( kernel_fd < 0 )
+ return sz;
+}
+
+static char *read_kernel_image(const char *filename, unsigned long *size)
+{
+ int kernel_fd = -1;
+ gzFile kernel_gfd = NULL;
+ char *image = NULL;
+ unsigned int bytes;
+
+ if ( (kernel_fd = open(filename, O_RDONLY)) < 0 )
{
PERROR("Could not open kernel image");
- return 1;
+ goto out;
}
+ *size = get_filesz(kernel_fd);
+
if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL )
{
PERROR("Could not allocate decompression state for state file");
- close(kernel_fd);
- return 1;
+ goto out;
}
- rc = read_kernel_header(kernel_gfd,
- tot_pages << (PAGE_SHIFT - 10),
- &load_addr);
- if ( rc < 0 )
- goto error_out;
-
- if ( (load_addr & (PAGE_SIZE-1)) != 0 )
+ if ( (image = malloc(*size)) == NULL )
{
- ERROR("We can only deal with page-aligned load addresses");
- goto error_out;
+ PERROR("Could not allocate memory for kernel image");
+ goto out;
}
- if ( (load_addr + (tot_pages << PAGE_SHIFT)) > HYPERVISOR_VIRT_START )
+ if ( (bytes = gzread(kernel_gfd, image, *size)) != *size )
{
- ERROR("Cannot map all domain memory without hitting Xen space");
+ PERROR("Error reading kernel image, could not"
+ " read the whole image (%d != %ld).", bytes, *size);
+ free(image);
+ image = NULL;
+ }
+
+ out:
+ if ( kernel_gfd != NULL )
+ gzclose(kernel_gfd);
+ else if ( kernel_fd >= 0 )
+ close(kernel_fd);
+ return image;
+}
+
+int xc_linux_build(int xc_handle,
+ u64 domid,
+ const char *image_name,
+ const char *ramdisk_name,
+ const char *cmdline)
+{
+ dom0_op_t launch_op, op;
+ int initrd_fd = -1;
+ gzFile initrd_gfd = NULL;
+ int rc, i;
+ full_execution_context_t *ctxt;
+ unsigned long nr_pages;
+ char *image = NULL;
+ unsigned long image_size, initrd_size=0;
+ unsigned long vstartinfo_start, vkern_entry;
+
+ if ( (nr_pages = get_tot_pages(xc_handle, domid)) < 0 )
+ {
+ PERROR("Could not find total pages for domain");
goto error_out;
}
+ if ( (image = read_kernel_image(image_name, &image_size)) == NULL )
+ goto error_out;
+
if ( (ramdisk_name != NULL) && (strlen(ramdisk_name) != 0) )
{
- initrd_fd = open(ramdisk_name, O_RDONLY);
- if ( initrd_fd < 0 )
+ if ( (initrd_fd = open(ramdisk_name, O_RDONLY)) < 0 )
{
PERROR("Could not open the initial ramdisk image");
goto error_out;
}
+ initrd_size = get_filesz(initrd_fd);
+
if ( (initrd_gfd = gzdopen(initrd_fd, "rb")) == NULL )
{
PERROR("Could not allocate decompression state for initrd");
goto error_out;
}
-
}
op.cmd = DOM0_GETDOMAININFO;
@@ -355,23 +430,22 @@ int xc_linux_build(int xc_handle,
goto error_out;
}
- if ( setup_guestos(xc_handle, domid, kernel_gfd, initrd_gfd, tot_pages,
- &virt_startinfo_addr,
- load_addr, &launch_op.u.builddomain, cmdline,
+ if ( setup_guestos(xc_handle, domid, image, image_size,
+ initrd_gfd, initrd_size, nr_pages,
+ &vstartinfo_start, &vkern_entry,
+ &launch_op.u.builddomain, cmdline,
op.u.getdomaininfo.shared_info_frame) < 0 )
{
ERROR("Error constructing guest OS");
goto error_out;
}
- if ( kernel_fd >= 0 )
- close(kernel_fd);
- if( kernel_gfd )
- gzclose(kernel_gfd);
if ( initrd_fd >= 0 )
close(initrd_fd);
- if( initrd_gfd )
+ if ( initrd_gfd )
gzclose(initrd_gfd);
+ if ( image != NULL )
+ free(image);
ctxt = &launch_op.u.builddomain.ctxt;
@@ -392,9 +466,9 @@ int xc_linux_build(int xc_handle,
ctxt->cpu_ctxt.gs = FLAT_GUESTOS_DS;
ctxt->cpu_ctxt.ss = FLAT_GUESTOS_DS;
ctxt->cpu_ctxt.cs = FLAT_GUESTOS_CS;
- ctxt->cpu_ctxt.eip = load_addr;
- ctxt->cpu_ctxt.esp = virt_startinfo_addr;
- ctxt->cpu_ctxt.esi = virt_startinfo_addr;
+ ctxt->cpu_ctxt.eip = vkern_entry;
+ ctxt->cpu_ctxt.esp = vstartinfo_start;
+ ctxt->cpu_ctxt.esi = vstartinfo_start;
ctxt->cpu_ctxt.eflags = (1<<9) | (1<<2);
/* FPU is set up to default initial state. */
@@ -416,7 +490,7 @@ int xc_linux_build(int xc_handle,
/* Ring 1 stack is the initial stack. */
ctxt->guestos_ss = FLAT_GUESTOS_DS;
- ctxt->guestos_esp = virt_startinfo_addr;
+ ctxt->guestos_esp = vstartinfo_start;
/* No debugging. */
memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg));
@@ -436,14 +510,152 @@ int xc_linux_build(int xc_handle,
return rc;
error_out:
- if ( kernel_fd >= 0 )
- close(kernel_fd);
- if( kernel_gfd )
- gzclose(kernel_gfd);
- if ( initrd_fd >= 0 )
- close(initrd_fd);
- if( initrd_gfd )
+ if ( initrd_gfd != NULL )
gzclose(initrd_gfd);
+ else if ( initrd_fd >= 0 )
+ close(initrd_fd);
+ if ( image != NULL )
+ free(image);
return -1;
}
+
+static inline int is_loadable_phdr(Elf_Phdr *phdr)
+{
+ return ((phdr->p_type == PT_LOAD) &&
+ ((phdr->p_flags & (PF_W|PF_X)) != 0));
+}
+
+static int readelfimage_base_and_size(char *elfbase,
+ unsigned long elfsize,
+ unsigned long *pkernstart,
+ unsigned long *pkernend,
+ unsigned long *pkernentry)
+{
+ Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase;
+ Elf_Phdr *phdr;
+ Elf_Shdr *shdr;
+ unsigned long kernstart = ~0UL, kernend=0UL;
+ char *shstrtab, *guestinfo;
+ int h;
+
+ if ( !IS_ELF(*ehdr) )
+ {
+ ERROR("Kernel image does not have an ELF header.");
+ return -EINVAL;
+ }
+
+ if ( (ehdr->e_phoff + (ehdr->e_phnum * ehdr->e_phentsize)) > elfsize )
+ {
+ ERROR("ELF program headers extend beyond end of image.");
+ return -EINVAL;
+ }
+
+ if ( (ehdr->e_shoff + (ehdr->e_shnum * ehdr->e_shentsize)) > elfsize )
+ {
+ ERROR("ELF section headers extend beyond end of image.");
+ return -EINVAL;
+ }
+
+ /* Find the section-header strings table. */
+ if ( ehdr->e_shstrndx == SHN_UNDEF )
+ {
+ ERROR("ELF image has no section-header strings table (shstrtab).");
+ return -EINVAL;
+ }
+ shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff +
+ (ehdr->e_shstrndx*ehdr->e_shentsize));
+ shstrtab = elfbase + shdr->sh_offset;
+
+ /* Find the special '__xen_guest' section and check its contents. */
+ for ( h = 0; h < ehdr->e_shnum; h++ )
+ {
+ shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff + (h*ehdr->e_shentsize));
+ if ( strcmp(&shstrtab[shdr->sh_name], "__xen_guest") != 0 )
+ continue;
+ guestinfo = elfbase + shdr->sh_offset;
+ if ( (strstr(guestinfo, "GUEST_OS=linux") == NULL) ||
+ (strstr(guestinfo, "XEN_VER=1.3") == NULL) )
+ {
+ ERROR("Will only load Linux images built for Xen v1.3");
+ ERROR("Actually saw: '%s'", guestinfo);
+ return -EINVAL;
+ }
+ break;
+ }
+ if ( h == ehdr->e_shnum )
+ {
+ ERROR("Not a Xen-ELF image: '__xen_guest' section not found.");
+ return -EINVAL;
+ }
+
+ for ( h = 0; h < ehdr->e_phnum; h++ )
+ {
+ phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
+ if ( !is_loadable_phdr(phdr) )
+ continue;
+ if ( phdr->p_vaddr < kernstart )
+ kernstart = phdr->p_vaddr;
+ if ( (phdr->p_vaddr + phdr->p_memsz) > kernend )
+ kernend = phdr->p_vaddr + phdr->p_memsz;
+ }
+
+ if ( (kernstart > kernend) ||
+ (ehdr->e_entry < kernstart) ||
+ (ehdr->e_entry > kernend) )
+ {
+ ERROR("Malformed ELF image.");
+ return -EINVAL;
+ }
+
+ *pkernstart = kernstart;
+ *pkernend = kernend;
+ *pkernentry = ehdr->e_entry;
+
+ return 0;
+}
+
+static int loadelfimage(char *elfbase, int pmh, unsigned long *parray,
+ unsigned long vstart)
+{
+ Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase;
+ Elf_Phdr *phdr;
+ int h;
+
+ char *va;
+ unsigned long pa, done, chunksz;
+
+ for ( h = 0; h < ehdr->e_phnum; h++ )
+ {
+ phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
+ if ( !is_loadable_phdr(phdr) )
+ continue;
+
+ for ( done = 0; done < phdr->p_filesz; done += chunksz )
+ {
+ pa = (phdr->p_vaddr + done) - vstart;
+ va = map_pfn_writeable(pmh, parray[pa>>PAGE_SHIFT]);
+ va += pa & (PAGE_SIZE-1);
+ chunksz = phdr->p_filesz - done;
+ if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) )
+ chunksz = PAGE_SIZE - (pa & (PAGE_SIZE-1));
+ memcpy(va, elfbase + phdr->p_offset + done, chunksz);
+ unmap_pfn(pmh, va);
+ }
+
+ for ( ; done < phdr->p_memsz; done += chunksz )
+ {
+ pa = (phdr->p_vaddr + done) - vstart;
+ va = map_pfn_writeable(pmh, parray[pa>>PAGE_SHIFT]);
+ va += pa & (PAGE_SIZE-1);
+ chunksz = phdr->p_memsz - done;
+ if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) )
+ chunksz = PAGE_SIZE - (pa & (PAGE_SIZE-1));
+ memset(va, 0, chunksz);
+ unmap_pfn(pmh, va);
+ }
+ }
+
+ return 0;
+}
+
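The "simultaneous equations" comment in setup_guestos() deserves a worked example: the number of page-table frames depends on the size of the bootstrap address space, which itself grows with nr_pt_pages, so the builder searches upward for the smallest consistent value. A standalone sketch of that loop with hypothetical inputs (PAGE_SIZE and the 4MB L2 granularity as on i386, 4-byte phys-to-machine entries):

/* Standalone sketch of the page-table sizing search; sample inputs are
 * hypothetical, not taken from this patch. */
#include <stdio.h>

#define PAGE_SHIFT         12
#define PAGE_SIZE          (1UL << PAGE_SHIFT)
#define PAGE_MASK          (~(PAGE_SIZE-1))
#define L2_PAGETABLE_SHIFT 22
#define round_pgup(_p)     (((_p)+(PAGE_SIZE-1))&PAGE_MASK)

int main(void)
{
    unsigned long vkern_start = 0xC0000000UL;   /* hypothetical */
    unsigned long vkern_end   = 0xC0300000UL;   /* 3MB kernel   */
    unsigned long initrd_len  = 1UL << 20;      /* 1MB initrd   */
    unsigned long nr_pages    = 16384;          /* 64MB domain  */
    unsigned long nr_pt_pages, v_start, v_end;
    unsigned long vinitrd_end, vphysmap_end, vpt_start, vstack_end;

    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        v_start      = vkern_start & ~((1UL<<22)-1);
        vinitrd_end  = round_pgup(vkern_end) + initrd_len;
        /* 4-byte phys-to-machine entries, one per domain page. */
        vphysmap_end = round_pgup(vinitrd_end) + (nr_pages * 4);
        vpt_start    = round_pgup(vphysmap_end);
        /* Page tables, then one start-info page, then one stack page. */
        vstack_end   = vpt_start + (nr_pt_pages + 2) * PAGE_SIZE;
        v_end        = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22;   /* extra 4MB for >= 512kB of padding */
        /* One L1 page covers 4MB; the +1 accounts for the L2 page. */
        if ( (((v_end - v_start) >> L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
            break;
    }

    printf("nr_pt_pages = %lu, v_start = %08lx, v_end = %08lx\n",
           nr_pt_pages, v_start, v_end);
    return 0;
}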
diff --git a/tools/xc/lib/xc_private.h b/tools/xc/lib/xc_private.h
index 8e3cae3f52..dda04a9f8d 100644
--- a/tools/xc/lib/xc_private.h
+++ b/tools/xc/lib/xc_private.h
@@ -55,11 +55,12 @@ typedef unsigned long l2_pgentry_t;
#define l2_table_offset(_a) \
((_a) >> L2_PAGETABLE_SHIFT)
-#define ERROR(_m) \
- fprintf(stderr, "ERROR: %s\n", (_m))
+#define ERROR(_m, _a...) \
+ fprintf(stderr, "ERROR: " _m "\n" , ## _a )
-#define PERROR(_m) \
- fprintf(stderr, "ERROR: %s (%d = %s)\n", (_m), errno, strerror(errno))
+#define PERROR(_m, _a...) \
+ fprintf(stderr, "ERROR: " _m " (%d = %s)\n" , ## _a , \
+ errno, strerror(errno))
static inline int do_privcmd(int xc_handle,
unsigned int cmd,
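The reworked ERROR()/PERROR() macros take printf-style arguments via gcc's named variadic-macro extension; the ', ## _a' form swallows the trailing comma when a message has no arguments. A minimal compilable sketch of both in use:

/* Demo of the variadic error macros above; requires gcc (GNU named
 * variadic macros). */
#include <stdio.h>
#include <string.h>
#include <errno.h>

#define ERROR(_m, _a...)  \
    fprintf(stderr, "ERROR: " _m "\n" , ## _a )

#define PERROR(_m, _a...) \
    fprintf(stderr, "ERROR: " _m " (%d = %s)\n" , ## _a , \
            errno, strerror(errno))

int main(void)
{
    ERROR("Malformed ELF image.");                 /* no varargs        */
    ERROR("Actually saw: '%s'", "GUEST_OS=linux"); /* formatted varargs */
    errno = ENOMEM;
    PERROR("Could not allocate memory");
    return 0;
}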
diff --git a/xen/arch/i386/mm.c b/xen/arch/i386/mm.c
index 2567e39422..9f1eaa465b 100644
--- a/xen/arch/i386/mm.c
+++ b/xen/arch/i386/mm.c
@@ -81,6 +81,13 @@ void __init paging_init(void)
{
unsigned long addr;
void *ioremap_pt;
+ int i;
+
+ /* Idle page table 1:1 maps the first part of physical memory. */
+ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+ idle_pg_table[i] =
+ mk_l2_pgentry((i << L2_PAGETABLE_SHIFT) |
+ __PAGE_HYPERVISOR | _PAGE_PSE);
/*
* Fixed mappings, only the page table structure has to be
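Each of the new idle-table entries is a 4MB PSE superpage mapping virtual (i << 22) to the identical physical address. A userspace sketch of the resulting entry values; the PTE bit values are the usual i386 ones, and the __PAGE_HYPERVISOR composition and entry count below are assumptions, not taken from this patch:

/* Sketch of the 1:1 PSE mapping; flag values and entry count assumed. */
#include <stdio.h>

#define L2_PAGETABLE_SHIFT 22
#define _PAGE_PRESENT  0x001
#define _PAGE_RW       0x002
#define _PAGE_ACCESSED 0x020
#define _PAGE_DIRTY    0x040
#define _PAGE_PSE      0x080
#define __PAGE_HYPERVISOR (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY)

int main(void)
{
    int entries = 4;   /* hypothetical; the real loop covers all
                          DOMAIN_ENTRIES_PER_L2_PAGETABLE entries */
    for ( int i = 0; i < entries; i++ )
    {
        unsigned long pde =
            ((unsigned long)i << L2_PAGETABLE_SHIFT) |
            __PAGE_HYPERVISOR | _PAGE_PSE;
        printf("idle_pg_table[%d] = %08lx  (virt %08lx -> phys %08lx, 4MB)\n",
               i, pde, (unsigned long)i << 22, (unsigned long)i << 22);
    }
    return 0;
}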
diff --git a/xen/arch/i386/setup.c b/xen/arch/i386/setup.c
index 318d8cff90..862582e5d4 100644
--- a/xen/arch/i386/setup.c
+++ b/xen/arch/i386/setup.c
@@ -411,8 +411,6 @@ void __init start_of_day(void)
check_nmi_watchdog();
- zap_low_mappings();
-
#ifdef CONFIG_PCI
pci_init();
#endif
diff --git a/xen/arch/i386/traps.c b/xen/arch/i386/traps.c
index 7a012e147a..d10292f618 100644
--- a/xen/arch/i386/traps.c
+++ b/xen/arch/i386/traps.c
@@ -339,11 +339,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code)
return; /* successfully copied the mapping */
}
- if ( unlikely( p->mm.shadow_mode ) && addr < PAGE_OFFSET &&
- shadow_fault( addr, error_code ) )
- {
- return; // return true if fault was handled
- }
+ if ( unlikely(p->mm.shadow_mode) &&
+ (addr < PAGE_OFFSET) && shadow_fault(addr, error_code) )
+ return; /* Return TRUE if fault was handled. */
if ( unlikely(!(regs->xcs & 3)) )
goto fault_in_hypervisor;
@@ -363,7 +361,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code)
if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
{
perfc_incrc(copy_user_faults);
- //DPRINTK("copy_user fault: %08lx -> %08lx\n", regs->eip, fixup);
+ if ( !p->mm.shadow_mode )
+ DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup);
regs->eip = fixup;
regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
return;
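search_exception_table() translates a faulting EIP into the fixup address registered by copy_user-style accessors at build time; the handler then restarts execution at the fixup, as the new DPRINTK above reports. A toy userspace sketch of that lookup, with hypothetical table contents:

/* Toy exception-fixup lookup; addresses are made up for illustration. */
#include <stdio.h>

struct exception_table_entry { unsigned long insn, fixup; };

static const struct exception_table_entry extable[] = {
    { 0xfc501234UL, 0xfc50f000UL },   /* hypothetical pairs */
    { 0xfc505678UL, 0xfc50f010UL },
};

static unsigned long search_exception_table(unsigned long eip)
{
    for ( unsigned i = 0; i < sizeof(extable)/sizeof(extable[0]); i++ )
        if ( extable[i].insn == eip )
            return extable[i].fixup;
    return 0;
}

int main(void)
{
    unsigned long eip = 0xfc501234UL, fixup;
    if ( (fixup = search_exception_table(eip)) != 0 )
        printf("Page fault: %08lx -> %08lx\n", eip, fixup);
    return 0;
}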
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 5be1be9b06..f83562a903 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -20,6 +20,13 @@
#include <xen/vbd.h>
#include <asm/i387.h>
+#ifdef CONFIG_X86_64BITMODE
+#define ELFSIZE 64
+#else
+#define ELFSIZE 32
+#endif
+#include <xen/elf.h>
+
#if !defined(CONFIG_X86_64BITMODE)
/* No ring-3 access in initial page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
@@ -31,6 +38,9 @@
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
+#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
+#define round_pgdown(_p) ((_p)&PAGE_MASK)
+
/* Both these structures are protected by the tasklist_lock. */
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;
struct task_struct *task_hash[TASK_HASH_SIZE];
@@ -459,7 +469,7 @@ unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
(PAGE_SHIFT-10))) )
{
free_all_dom_mem(p);
- return -1;
+ return -ENOMEM;
}
}
@@ -555,39 +565,166 @@ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain)
return 0;
}
-static unsigned long alloc_page_from_domain(unsigned long * cur_addr,
- unsigned long * index)
+static inline int is_loadable_phdr(Elf_Phdr *phdr)
{
- unsigned long ret = *cur_addr;
- struct list_head *ent = frame_table[ret >> PAGE_SHIFT].list.prev;
- *cur_addr = list_entry(ent, struct pfn_info, list) - frame_table;
- *cur_addr <<= PAGE_SHIFT;
- (*index)--;
- return ret;
+ return ((phdr->p_type == PT_LOAD) &&
+ ((phdr->p_flags & (PF_W|PF_X)) != 0));
}
-/*
- * setup_guestos is used for building dom0 solely. other domains are built in
- * userspace dom0 and final setup is being done by final_setup_guestos.
- */
-int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
- unsigned int num_vifs,
- char *phy_data_start, unsigned long data_len,
- char *cmdline, unsigned long initrd_len)
+static int readelfimage_base_and_size(char *elfbase,
+ unsigned long elfsize,
+ unsigned long *pkernstart,
+ unsigned long *pkernend,
+ unsigned long *pkernentry)
{
- struct list_head *list_ent;
- char *src, *vsrc, *dst, *data_start;
- int i;
+ Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase;
+ Elf_Phdr *phdr;
+ Elf_Shdr *shdr;
+ unsigned long kernstart = ~0UL, kernend=0UL;
+ char *shstrtab, *guestinfo;
+ int h;
+
+ if ( !IS_ELF(*ehdr) )
+ {
+ printk("Kernel image does not have an ELF header.\n");
+ return -EINVAL;
+ }
+
+ if ( (ehdr->e_phoff + (ehdr->e_phnum * ehdr->e_phentsize)) > elfsize )
+ {
+ printk("ELF program headers extend beyond end of image.\n");
+ return -EINVAL;
+ }
+
+ if ( (ehdr->e_shoff + (ehdr->e_shnum * ehdr->e_shentsize)) > elfsize )
+ {
+ printk("ELF section headers extend beyond end of image.\n");
+ return -EINVAL;
+ }
+
+ /* Find the section-header strings table. */
+ if ( ehdr->e_shstrndx == SHN_UNDEF )
+ {
+ printk("ELF image has no section-header strings table (shstrtab).\n");
+ return -EINVAL;
+ }
+ shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff +
+ (ehdr->e_shstrndx*ehdr->e_shentsize));
+ shstrtab = elfbase + shdr->sh_offset;
+
+ /* Find the special '__xen_guest' section and check its contents. */
+ for ( h = 0; h < ehdr->e_shnum; h++ )
+ {
+ shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff + (h*ehdr->e_shentsize));
+ if ( strcmp(&shstrtab[shdr->sh_name], "__xen_guest") != 0 )
+ continue;
+ guestinfo = elfbase + shdr->sh_offset;
+ printk("Xen-ELF header found: '%s'\n", guestinfo);
+ if ( (strstr(guestinfo, "GUEST_OS=linux") == NULL) ||
+ (strstr(guestinfo, "XEN_VER=1.3") == NULL) )
+ {
+ printk("ERROR: Xen will only load Linux built for Xen v1.3\n");
+ return -EINVAL;
+ }
+ break;
+ }
+ if ( h == ehdr->e_shnum )
+ {
+ printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
+ return -EINVAL;
+ }
+
+ for ( h = 0; h < ehdr->e_phnum; h++ )
+ {
+ phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
+ if ( !is_loadable_phdr(phdr) )
+ continue;
+ if ( phdr->p_vaddr < kernstart )
+ kernstart = phdr->p_vaddr;
+ if ( (phdr->p_vaddr + phdr->p_memsz) > kernend )
+ kernend = phdr->p_vaddr + phdr->p_memsz;
+ }
+
+ if ( (kernstart > kernend) ||
+ (ehdr->e_entry < kernstart) ||
+ (ehdr->e_entry > kernend) )
+ {
+ printk("Malformed ELF image.\n");
+ return -EINVAL;
+ }
+
+ *pkernstart = kernstart;
+ *pkernend = kernend;
+ *pkernentry = ehdr->e_entry;
+
+ return 0;
+}
+
+static int loadelfimage(char *elfbase)
+{
+ Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase;
+ Elf_Phdr *phdr;
+ int h;
+
+ for ( h = 0; h < ehdr->e_phnum; h++ )
+ {
+ phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
+ if ( !is_loadable_phdr(phdr) )
+ continue;
+ if ( phdr->p_filesz != 0 )
+ memcpy((char *)phdr->p_vaddr, elfbase + phdr->p_offset,
+ phdr->p_filesz);
+ if ( phdr->p_memsz > phdr->p_filesz )
+ memset((char *)phdr->p_vaddr + phdr->p_filesz, 0,
+ phdr->p_memsz - phdr->p_filesz);
+ }
+
+ return 0;
+}
+
+int construct_dom0(struct task_struct *p,
+ unsigned long alloc_start,
+ unsigned long alloc_end,
+ unsigned int num_vifs,
+ char *image_start, unsigned long image_len,
+ char *initrd_start, unsigned long initrd_len,
+ char *cmdline)
+{
+ char *dst;
+ int i, rc;
domid_t dom = p->domain;
- unsigned long phys_l1tab, phys_l2tab;
- unsigned long cur_address, alloc_address;
- unsigned long virt_load_address, virt_stack_address;
- start_info_t *virt_startinfo_address;
+ unsigned long pfn, mfn;
+ unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
+ unsigned long nr_pt_pages;
unsigned long count;
- unsigned long alloc_index;
l2_pgentry_t *l2tab, *l2start;
l1_pgentry_t *l1tab = NULL, *l1start = NULL;
struct pfn_info *page = NULL;
+ start_info_t *si;
+
+ /*
+ * This fully describes the memory layout of the initial domain. All
+ * *_start address are page-aligned, except v_start (and v_end) which are
+ * superpage-aligned.
+ */
+ unsigned long v_start;
+ unsigned long vkern_start;
+ unsigned long vkern_entry;
+ unsigned long vkern_end;
+ unsigned long vinitrd_start;
+ unsigned long vinitrd_end;
+ unsigned long vphysmap_start;
+ unsigned long vphysmap_end;
+ unsigned long vstartinfo_start;
+ unsigned long vstartinfo_end;
+ unsigned long vstack_start;
+ unsigned long vstack_end;
+ unsigned long vpt_start;
+ unsigned long vpt_end;
+ unsigned long v_end;
+
+ /* Machine address of next candidate page-table page. */
+ unsigned long mpt_alloc;
extern void physdev_init_dom0(struct task_struct *);
extern void ide_probe_devices(xen_disk_info_t *);
@@ -597,67 +734,114 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
xen_disk_t *xd;
/* Sanity! */
- if ( p->domain != 0 ) BUG();
- if ( test_bit(PF_CONSTRUCTED, &p->flags) ) BUG();
+ if ( p->domain != 0 )
+ BUG();
+ if ( test_bit(PF_CONSTRUCTED, &p->flags) )
+ BUG();
+
+ printk("*** LOADING DOMAIN 0 ***\n");
/*
* This is all a bit grim. We've moved the modules to the "safe" physical
* memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
- * routeine, we're going to copy it down into the region that's actually
+ * routine we're going to copy it down into the region that's actually
* been allocated to domain 0. This is highly likely to be overlapping, so
* we use a forward copy.
*
* MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
* 4GB and lots of network/disk cards that allocate loads of buffers.
- * We'll have to revist this if we ever support PAE (64GB).
+ * We'll have to revisit this if we ever support PAE (64GB).
*/
- data_start = map_domain_mem((unsigned long)phy_data_start);
+ rc = readelfimage_base_and_size(image_start, image_len,
+ &vkern_start, &vkern_end, &vkern_entry);
+ if ( rc != 0 )
+ return rc;
- if ( strncmp(data_start, "XenGuest", 8) )
+ /*
+ * Why do we need this? The number of page-table frames depends on the
+ * size of the bootstrap address space. But the size of the address space
+ * depends on the number of page-table frames (since each one is mapped
+ * read-only). We have a pair of simultaneous equations in two unknowns,
+ * which we solve by exhaustive search.
+ */
+ for ( nr_pt_pages = 2; ; nr_pt_pages++ )
{
- printk("DOM%llu: Invalid guest OS image - bad signature\n", dom);
- unmap_domain_mem(data_start);
- return -1;
+ v_start = vkern_start & ~((1<<22)-1);
+ vinitrd_start = round_pgup(vkern_end);
+ vinitrd_end = vinitrd_start + initrd_len;
+ vphysmap_start = round_pgup(vinitrd_end);
+ vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
+ vpt_start = round_pgup(vphysmap_end);
+ vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
+ vstartinfo_start = vpt_end;
+ vstartinfo_end = vstartinfo_start + PAGE_SIZE;
+ vstack_start = vstartinfo_end;
+ vstack_end = vstack_start + PAGE_SIZE;
+ v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
+ if ( (v_end - vstack_end) < (512 << 10) )
+ v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
+ if ( (((v_end - v_start) >> L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
+ break;
}
- virt_load_address = *(unsigned long *)(data_start + 8);
- if ( (virt_load_address & (PAGE_SIZE-1)) )
+ if ( (v_end - v_start) > (nr_pages * PAGE_SIZE) )
{
- printk("DOM%llu: Guest OS load address not page-aligned (%08lx)\n",
- dom, virt_load_address);
- unmap_domain_mem(data_start);
- return -1;
+ printk("Initial guest OS requires too much space\n"
+ "(%luMB is greater than %luMB limit)\n",
+ (v_end-v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
+ return -ENOMEM;
}
- if ( alloc_new_dom_mem(p, params->memory_kb) )
+ printk("PHYSICAL MEMORY ARRANGEMENT:\n"
+ " Kernel image: %p->%p\n"
+ " Initrd image: %p->%p\n"
+ " Dom0 alloc.: %08lx->%08lx\n",
+ image_start, image_start + image_len,
+ initrd_start, initrd_start + initrd_len,
+ alloc_start, alloc_end);
+ printk("VIRTUAL MEMORY ARRANGEMENT:\n"
+ " Loaded kernel: %08lx->%08lx\n"
+ " Init. ramdisk: %08lx->%08lx\n"
+ " Phys-Mach map: %08lx->%08lx\n"
+ " Page tables: %08lx->%08lx\n"
+ " Start info: %08lx->%08lx\n"
+ " Boot stack: %08lx->%08lx\n"
+ " TOTAL: %08lx->%08lx\n",
+ vkern_start, vkern_end,
+ vinitrd_start, vinitrd_end,
+ vphysmap_start, vphysmap_end,
+ vpt_start, vpt_end,
+ vstartinfo_start, vstartinfo_end,
+ vstack_start, vstack_end,
+ v_start, v_end);
+ printk(" ENTRY ADDRESS: %08lx\n", vkern_entry);
+
+ /*
+ * Protect the lowest 1GB of memory. We use a temporary mapping there
+ * from which we copy the kernel and ramdisk images.
+ */
+ if ( v_start < (1<<30) )
{
- printk("DOM%llu: Not enough memory --- reduce dom0_mem ??\n", dom);
- unmap_domain_mem(data_start);
- return -ENOMEM;
+ printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
+ return -EINVAL;
}
- alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) -
- frame_table;
- alloc_address <<= PAGE_SHIFT;
- alloc_index = p->tot_pages;
-
- if ( data_len > (params->memory_kb << 9) )
+ /* Construct a frame-allocation list for the initial domain. */
+ for ( pfn = (alloc_start>>PAGE_SHIFT);
+ pfn < (alloc_end>>PAGE_SHIFT);
+ pfn++ )
{
- printk("DOM%llu: Guest OS image is too large\n"
- " (%luMB is greater than %uMB limit for a\n"
- " %uMB address space)\n",
- dom, data_len>>20,
- (params->memory_kb)>>11,
- (params->memory_kb)>>10);
- unmap_domain_mem(data_start);
- free_all_dom_mem(p);
- return -1;
+ page = &frame_table[pfn];
+ page->u.domain = p;
+ page->type_and_flags = 0;
+ page->count_and_flags = PGC_allocated | 1;
+ list_add_tail(&page->list, &p->page_list);
+ p->tot_pages++;
}
- printk("DOM%llu: Guest OS virtual load address is %08lx\n", dom,
- virt_load_address);
-
+ mpt_alloc = (vpt_start - v_start) + alloc_start;
+
SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
@@ -671,157 +855,137 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
for ( i = 0; i < 256; i++ )
p->thread.traps[i].cs = FLAT_GUESTOS_CS;
- /*
- * WARNING: The new domain must have its 'processor' field
- * filled in by now !!
- */
- phys_l2tab = alloc_page_from_domain(&alloc_address, &alloc_index);
- l2start = l2tab = map_domain_mem(phys_l2tab);
+ /* WARNING: The new domain must have its 'processor' field filled in! */
+ l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
+ l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
- l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
- mk_l2_pgentry(phys_l2tab | __PAGE_HYPERVISOR);
- memset(l2tab, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
- p->mm.pagetable = mk_pagetable(phys_l2tab);
+ p->mm.pagetable = mk_pagetable((unsigned long)l2start);
- l2tab += l2_table_offset(virt_load_address);
- cur_address = list_entry(p->page_list.next, struct pfn_info, list) -
- frame_table;
- cur_address <<= PAGE_SHIFT;
- for ( count = 0; count < p->tot_pages; count++ )
+ l2tab += l2_table_offset(v_start);
+ mfn = alloc_start >> PAGE_SHIFT;
+ for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
{
if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
{
- if ( l1tab != NULL ) unmap_domain_mem(l1start);
- phys_l1tab = alloc_page_from_domain(&alloc_address, &alloc_index);
- *l2tab++ = mk_l2_pgentry(phys_l1tab|L2_PROT);
- l1start = l1tab = map_domain_mem(phys_l1tab);
+ l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
+ mpt_alloc += PAGE_SIZE;
+ *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
clear_page(l1tab);
- l1tab += l1_table_offset(
- virt_load_address + (count << PAGE_SHIFT));
}
- *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT);
+ *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
- page = &frame_table[cur_address >> PAGE_SHIFT];
+ page = &frame_table[mfn];
set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
if ( !get_page_and_type(page, p, PGT_writeable_page) )
BUG();
- /* Set up the MPT entry. */
- machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count;
- list_ent = frame_table[cur_address >> PAGE_SHIFT].list.next;
- cur_address = list_entry(list_ent, struct pfn_info, list) -
- frame_table;
- cur_address <<= PAGE_SHIFT;
+ mfn++;
}
- unmap_domain_mem(l1start);
- /* pages that are part of page tables must be read only */
- l2tab = l2start + l2_table_offset(virt_load_address +
- (alloc_index << PAGE_SHIFT));
- l1start = l1tab = map_domain_mem(l2_pgentry_to_phys(*l2tab));
- l1tab += l1_table_offset(virt_load_address + (alloc_index << PAGE_SHIFT));
+ /* Pages that are part of page tables must be read only. */
+ l2tab = l2start + l2_table_offset(vpt_start);
+ l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
+ l1tab += l1_table_offset(vpt_start);
l2tab++;
- for ( count = alloc_index; count < p->tot_pages; count++ )
+ for ( count = 0; count < nr_pt_pages; count++ )
{
*l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
- page = frame_table + l1_pgentry_to_pagenr(*l1tab);
- page->type_and_flags &= ~PGT_type_mask;
- page->type_and_flags |= PGT_l1_page_table;
- get_page(page, p); /* an extra ref because of readable mapping */
- l1tab++;
- if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
+ page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
+ if ( count == 0 )
+ {
+ page->type_and_flags &= ~PGT_type_mask;
+ page->type_and_flags |= PGT_l2_page_table;
+ get_page(page, p); /* an extra ref because of readable mapping */
+ /* Get another ref to L2 page so that it can be pinned. */
+ if ( !get_page_and_type(page, p, PGT_l2_page_table) )
+ BUG();
+ set_bit(_PGC_guest_pinned, &page->count_and_flags);
+ }
+ else
{
- unmap_domain_mem(l1start);
- l1start = l1tab = map_domain_mem(l2_pgentry_to_phys(*l2tab));
- l2tab++;
+ page->type_and_flags &= ~PGT_type_mask;
+ page->type_and_flags |= PGT_l1_page_table;
+ get_page(page, p); /* an extra ref because of readable mapping */
}
+ l1tab++;
+ if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
+ l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
}
- /* Rewrite last L1 page to be a L2 page. */
- page->type_and_flags &= ~PGT_type_mask;
- page->type_and_flags |= PGT_l2_page_table;
- /* Get another ref to L2 page so that it can be pinned. */
- if ( !get_page_and_type(page, p, PGT_l2_page_table) )
- BUG();
- set_bit(_PGC_guest_pinned, &page->count_and_flags);
- unmap_domain_mem(l1start);
- /* Set up shared info area. */
+ /* Set up shared-info area. */
update_dom_time(p->shared_info);
p->shared_info->domain_time = 0;
p->shared_info->evtchn_upcall_mask = ~0UL; /* mask all upcalls */
- virt_startinfo_address = (start_info_t *)
- (virt_load_address + ((alloc_index - 1) << PAGE_SHIFT));
- virt_stack_address = (unsigned long)virt_startinfo_address;
-
- unmap_domain_mem(l2start);
-
/* Install the new page tables. */
__cli();
write_cr3_counted(pagetable_val(p->mm.pagetable));
- /* Copy the guest OS image. */
- src = (char *)(phy_data_start + 12);
- vsrc = (char *)(data_start + 12); /* data_start invalid after first page*/
- dst = (char *)virt_load_address;
- while ( src < (phy_data_start+data_len) )
- {
- *dst++ = *vsrc++;
- src++;
- if ( (((unsigned long)src) & (PAGE_SIZE-1)) == 0 )
- {
- unmap_domain_mem(vsrc-1);
- vsrc = map_domain_mem((unsigned long)src);
- }
- }
- unmap_domain_mem(vsrc);
+ /* Copy the OS image. */
+ (void)loadelfimage(image_start);
+
+ /* Copy the initial ramdisk. */
+ if ( initrd_len != 0 )
+ memcpy((void *)vinitrd_start, initrd_start, initrd_len);
/* Set up start info area. */
- memset(virt_startinfo_address, 0, sizeof(*virt_startinfo_address));
- virt_startinfo_address->nr_pages = p->tot_pages;
- virt_startinfo_address->shared_info = virt_to_phys(p->shared_info);
- virt_startinfo_address->pt_base = virt_load_address +
- ((p->tot_pages - 1) << PAGE_SHIFT);
-
- virt_startinfo_address->flags = 0;
- if ( IS_PRIV(p) )
- virt_startinfo_address->flags |= SIF_PRIVILEGED;
- if ( p->domain == 0 )
- virt_startinfo_address->flags |= SIF_INITDOMAIN;
+ si = (start_info_t *)vstartinfo_start;
+ memset(si, 0, PAGE_SIZE);
+ si->nr_pages = p->tot_pages;
+ si->shared_info = virt_to_phys(p->shared_info);
+ si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
+ si->pt_base = vpt_start;
+ si->nr_pt_frames = nr_pt_pages;
+ si->mfn_list = vphysmap_start;
+
+ /* Write the phys->machine and machine->phys table entries. */
+ for ( pfn = 0; pfn < p->tot_pages; pfn++ )
+ {
+ mfn = (alloc_start >> PAGE_SHIFT) + pfn;
+ ((unsigned long *)vphysmap_start)[pfn] = mfn;
+ machine_to_phys_mapping[mfn] = pfn;
+ }
- if ( initrd_len )
+ if ( initrd_len != 0 )
{
- virt_startinfo_address->mod_start = (unsigned long)dst-initrd_len;
- virt_startinfo_address->mod_len = initrd_len;
+ si->mod_start = vinitrd_start;
+ si->mod_len = initrd_len;
printk("Initrd len 0x%lx, start at 0x%08lx\n",
- virt_startinfo_address->mod_len,
- virt_startinfo_address->mod_start);
+ si->mod_len, si->mod_start);
}
- /* Add virtual network interfaces and point to them in startinfo. */
- while ( num_vifs-- > 0 )
- (void)create_net_vif(dom);
-
- dst = virt_startinfo_address->cmd_line;
+ dst = si->cmd_line;
if ( cmdline != NULL )
{
for ( i = 0; i < 255; i++ )
{
- if ( cmdline[i] == '\0' ) break;
+ if ( cmdline[i] == '\0' )
+ break;
*dst++ = cmdline[i];
}
}
*dst = '\0';
- /* NB: Give up the VGA console if DOM0 is ocnfigured to grab it. */
- console_endboot(strstr(cmdline, "tty0") != NULL);
-
/* Reinstate the caller's page tables. */
write_cr3_counted(pagetable_val(current->mm.pagetable));
__sti();
+ /* Destroy low mappings - they were only for our convenience. */
+ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+ if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
+ l2start[i] = mk_l2_pgentry(0);
+ zap_low_mappings(); /* Do the same for the idle page tables. */
+
+ /* Give up the VGA console if DOM0 is configured to grab it. */
+ console_endboot(strstr(cmdline, "tty0") != NULL);
+
+ /* Add virtual network interfaces. */
+ while ( num_vifs-- > 0 )
+ (void)create_net_vif(dom);
+
#ifndef NO_DEVICES_IN_XEN
/* DOM0 gets access to all real block devices. */
#define MAX_REAL_DISKS 256
@@ -851,14 +1015,11 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
set_bit(PF_CONSTRUCTED, &p->flags);
-#if 0 // XXXXX DO NOT CHECK IN ENBALED !!! (but useful for testing so leave)
+#if 0 // XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave)
shadow_mode_enable(p, SHM_test);
#endif
- new_thread(p,
- (unsigned long)virt_load_address,
- (unsigned long)virt_stack_address,
- (unsigned long)virt_startinfo_address);
+ new_thread(p, vkern_entry, vstack_end, vstartinfo_start);
return 0;
}
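The table-writing loop in construct_dom0() fills two structures at once: the array at vphysmap_start is the guest's pfn-to-mfn view (later advertised through si->mfn_list), and machine_to_phys_mapping is Xen's global reverse map. A toy round-trip with made-up frame numbers:

/* Toy pfn<->mfn round trip; frame numbers are hypothetical. */
#include <stdio.h>

#define NR_PAGES 4

int main(void)
{
    unsigned long mfn_list[NR_PAGES];           /* pfn -> mfn (per domain) */
    unsigned long machine_to_phys[1024] = {0};  /* mfn -> pfn (global)     */
    unsigned long alloc_start_mfn = 0x300;      /* hypothetical            */

    for ( unsigned long pfn = 0; pfn < NR_PAGES; pfn++ )
    {
        unsigned long mfn = alloc_start_mfn + pfn;
        mfn_list[pfn] = mfn;            /* what dom0 reads via mfn_list */
        machine_to_phys[mfn] = pfn;     /* what Xen keeps for reverse   */
    }

    /* Round trip: guest pfn 2 -> machine frame -> back to pfn 2. */
    unsigned long mfn = mfn_list[2];
    printf("pfn 2 -> mfn %#lx -> pfn %lu\n", mfn, machine_to_phys[mfn]);
    return 0;
}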
diff --git a/xen/common/kernel.c b/xen/common/kernel.c
index ac44c2407f..4fd4990ac0 100644
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -104,10 +104,12 @@ void cmain(unsigned long magic, multiboot_info_t *mbi)
dom0_createdomain_t dom0_params;
unsigned long max_page;
unsigned char *cmdline;
- module_t *mod;
+ module_t *mod = (module_t *)__va(mbi->mods_addr);
void *heap_start;
int i;
unsigned long max_mem;
+ unsigned long dom0_memory_start, dom0_memory_end;
+ unsigned long initial_images_start, initial_images_end;
/* Parse the command-line options. */
cmdline = (unsigned char *)(mbi->cmdline ? __va(mbi->cmdline) : NULL);
@@ -215,6 +217,19 @@ void cmain(unsigned long magic, multiboot_info_t *mbi)
max_page >> (20-PAGE_SHIFT),
max_mem >> (20-PAGE_SHIFT) );
+ initial_images_start = MAX_DIRECTMAP_ADDRESS;
+ initial_images_end = initial_images_start +
+ (mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
+ dom0_memory_start = (initial_images_end + ((4<<20)-1)) & ~((4<<20)-1);
+ dom0_memory_end = dom0_memory_start + (opt_dom0_mem << 10);
+ dom0_memory_end = (dom0_memory_end + PAGE_SIZE - 1) & PAGE_MASK;
+
+ /* Cheesy sanity check: enough memory for DOM0 allocation + some slack? */
+ if ( (dom0_memory_end + (8<<20)) > (max_page<<PAGE_SHIFT) )
+ panic("Not enough memory to craete initial domain!\n");
+
+ add_to_domain_alloc_list(dom0_memory_end, max_page << PAGE_SHIFT);
+
heap_start = memguard_init(&_end);
printk("Xen heap size is %luKB\n",
@@ -243,24 +258,30 @@ void cmain(unsigned long magic, multiboot_info_t *mbi)
/* Create initial domain 0. */
dom0_params.memory_kb = opt_dom0_mem;
new_dom = do_createdomain(0, 0);
- if ( new_dom == NULL ) panic("Error creating domain 0\n");
+ if ( new_dom == NULL )
+ panic("Error creating domain 0\n");
set_bit(PF_PRIVILEGED, &new_dom->flags);
/*
* We're going to setup domain0 using the module(s) that we stashed safely
- * above our MAX_DIRECTMAP_ADDRESS in boot/Boot.S The second module, if
- * present, is an initrd ramdisk
+ * above our MAX_DIRECTMAP_ADDRESS in boot/boot.S. The second module, if
+ * present, is an initrd ramdisk.
*/
- mod = (module_t *)__va(mbi->mods_addr);
- if ( setup_guestos(new_dom,
- &dom0_params, 1,
- (char *)MAX_DIRECTMAP_ADDRESS,
- mod[mbi->mods_count-1].mod_end - mod[0].mod_start,
- __va(mod[0].string),
- (mbi->mods_count == 2) ?
- (mod[1].mod_end - mod[1].mod_start):0)
- != 0 ) panic("Could not set up DOM0 guest OS\n");
+ if ( construct_dom0(new_dom, dom0_memory_start, dom0_memory_end, 1,
+ (char *)initial_images_start,
+ mod[0].mod_end-mod[0].mod_start,
+ (mbi->mods_count == 1) ? 0 :
+ (char *)initial_images_start +
+ (mod[1].mod_start-mod[0].mod_start),
+ (mbi->mods_count == 1) ? 0 :
+ mod[mbi->mods_count-1].mod_end - mod[1].mod_start,
+ __va(mod[0].string)) != 0)
+ panic("Could not set up DOM0 guest OS\n");
+
+ /* The stash space for the initial kernel image can now be freed up. */
+ add_to_domain_alloc_list(__pa(frame_table) + frame_table_size,
+ dom0_memory_start);
wake_up(new_dom);
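The placement arithmetic above stashes the boot modules at MAX_DIRECTMAP_ADDRESS, starts dom0's allocation at the next 4MB boundary past them, and rounds the allocation end up to a page. A quick standalone check with hypothetical values:

/* Standalone check of the dom0 placement arithmetic; inputs are
 * hypothetical. */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE-1))

int main(void)
{
    unsigned long initial_images_end = 0x03123456UL; /* hypothetical     */
    unsigned long opt_dom0_mem       = 10001UL;      /* kB, hypothetical */

    /* Round up to the next 4MB boundary. */
    unsigned long dom0_memory_start =
        (initial_images_end + ((4UL<<20)-1)) & ~((4UL<<20)-1);

    /* Add the requested size, then round up to a page boundary. */
    unsigned long dom0_memory_end = dom0_memory_start + (opt_dom0_mem << 10);
    dom0_memory_end = (dom0_memory_end + PAGE_SIZE - 1) & PAGE_MASK;

    printf("start = %08lx (4MB aligned), end = %08lx (page aligned)\n",
           dom0_memory_start, dom0_memory_end);
    return 0;
}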
diff --git a/xen/common/memory.c b/xen/common/memory.c
index c8510c514d..9422c5ba86 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -170,7 +170,7 @@ unsigned long frame_table_size;
unsigned long max_page;
struct list_head free_list;
-spinlock_t free_list_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t free_list_lock;
unsigned int free_pfns;
/* Used to defer flushing of memory structures. */
@@ -191,10 +191,6 @@ static struct {
*/
void __init init_frametable(unsigned long nr_pages)
{
- struct pfn_info *pf;
- unsigned long page_index;
- unsigned long flags;
-
memset(percpu_info, 0, sizeof(percpu_info));
max_page = nr_pages;
@@ -203,23 +199,28 @@ void __init init_frametable(unsigned long nr_pages)
frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
memset(frame_table, 0, frame_table_size);
+ spin_lock_init(&free_list_lock);
+ INIT_LIST_HEAD(&free_list);
free_pfns = 0;
- /* Put all domain-allocatable memory on a free list. */
+}
+
+void add_to_domain_alloc_list(unsigned long ps, unsigned long pe)
+{
+ struct pfn_info *pf;
+ unsigned long i;
+ unsigned long flags;
+
spin_lock_irqsave(&free_list_lock, flags);
- INIT_LIST_HEAD(&free_list);
- for( page_index = (__pa(frame_table) + frame_table_size) >> PAGE_SHIFT;
- page_index < nr_pages;
- page_index++ )
+ for ( i = ps >> PAGE_SHIFT; i < (pe >> PAGE_SHIFT); i++ )
{
- pf = list_entry(&frame_table[page_index].list, struct pfn_info, list);
+ pf = list_entry(&frame_table[i].list, struct pfn_info, list);
list_add_tail(&pf->list, &free_list);
free_pfns++;
}
spin_unlock_irqrestore(&free_list_lock, flags);
}
-
static void __invalidate_shadow_ldt(struct task_struct *p)
{
int i;
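add_to_domain_alloc_list() threads each frame's embedded list node onto the global free list, so cmain() can now hand memory regions to the allocator piecemeal. A userspace sketch of that bookkeeping, with a stand-in for Xen's list primitives and taking pfns directly rather than byte addresses:

/* Userspace sketch of the free-list bookkeeping; the minimal list
 * implementation stands in for Xen's <xen/list.h>. */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_add_tail(struct list_head *n, struct list_head *head)
{
    n->prev = head->prev; n->next = head;
    head->prev->next = n; head->prev = n;
}

struct pfn_info { struct list_head list; };

#define NR_FRAMES 16
static struct pfn_info frame_table[NR_FRAMES];
static struct list_head free_list = { &free_list, &free_list };
static unsigned int free_pfns;

static void add_range(unsigned long ps_pfn, unsigned long pe_pfn)
{
    for ( unsigned long i = ps_pfn; i < pe_pfn; i++ )
    {
        list_add_tail(&frame_table[i].list, &free_list);
        free_pfns++;
    }
}

int main(void)
{
    add_range(4, 12);   /* hypothetical range of allocatable frames */
    printf("free_pfns = %u\n", free_pfns);   /* 8 */
    return 0;
}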
diff --git a/xen/include/asm-i386/elf.h b/xen/include/asm-i386/elf.h
deleted file mode 100644
index ded22856d0..0000000000
--- a/xen/include/asm-i386/elf.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * GRUB -- GRand Unified Bootloader
- * Copyright (C) 1996 Erich Boleyn <erich@uruk.org>
- * Copyright (C) 2001 Free Software Foundation, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* 32-bit data types */
-
-typedef unsigned long Elf32_Addr;
-typedef unsigned short Elf32_Half;
-typedef unsigned long Elf32_Off;
-typedef signed long Elf32_Sword;
-typedef unsigned long Elf32_Word;
-/* "unsigned char" already exists */
-
-/* ELF header */
-typedef struct
-{
-
-#define EI_NIDENT 16
-
- /* first four characters are defined below */
-#define EI_MAG0 0
-#define ELFMAG0 0x7f
-#define EI_MAG1 1
-#define ELFMAG1 'E'
-#define EI_MAG2 2
-#define ELFMAG2 'L'
-#define EI_MAG3 3
-#define ELFMAG3 'F'
-
-#define EI_CLASS 4 /* data sizes */
-#define ELFCLASS32 1 /* i386 -- up to 32-bit data sizes present */
-
-#define EI_DATA 5 /* data type and ordering */
-#define ELFDATA2LSB 1 /* i386 -- LSB 2's complement */
-
-#define EI_VERSION 6 /* version number. "e_version" must be the same */
-#define EV_CURRENT 1 /* current version number */
-
-#define EI_PAD 7 /* from here in is just padding */
-
-#define EI_BRAND 8 /* start of OS branding (This is
- obviously illegal against the ELF
- standard.) */
-
- unsigned char e_ident[EI_NIDENT]; /* basic identification block */
-
-#define ET_EXEC 2 /* we only care about executable types */
- Elf32_Half e_type; /* file types */
-
-#define EM_386 3 /* i386 -- obviously use this one */
- Elf32_Half e_machine; /* machine types */
- Elf32_Word e_version; /* use same as "EI_VERSION" above */
- Elf32_Addr e_entry; /* entry point of the program */
- Elf32_Off e_phoff; /* program header table file offset */
- Elf32_Off e_shoff; /* section header table file offset */
- Elf32_Word e_flags; /* flags */
- Elf32_Half e_ehsize; /* elf header size in bytes */
- Elf32_Half e_phentsize; /* program header entry size */
- Elf32_Half e_phnum; /* number of entries in program header */
- Elf32_Half e_shentsize; /* section header entry size */
- Elf32_Half e_shnum; /* number of entries in section header */
-
-#define SHN_UNDEF 0
-#define SHN_LORESERVE 0xff00
-#define SHN_LOPROC 0xff00
-#define SHN_HIPROC 0xff1f
-#define SHN_ABS 0xfff1
-#define SHN_COMMON 0xfff2
-#define SHN_HIRESERVE 0xffff
- Elf32_Half e_shstrndx; /* section header table index */
-}
-Elf32_Ehdr;
-
-
-#define BOOTABLE_I386_ELF(h) \
- ((h.e_ident[EI_MAG0] == ELFMAG0) & (h.e_ident[EI_MAG1] == ELFMAG1) \
- & (h.e_ident[EI_MAG2] == ELFMAG2) & (h.e_ident[EI_MAG3] == ELFMAG3) \
- & (h.e_ident[EI_CLASS] == ELFCLASS32) & (h.e_ident[EI_DATA] == ELFDATA2LSB) \
- & (h.e_ident[EI_VERSION] == EV_CURRENT) & (h.e_type == ET_EXEC) \
- & (h.e_machine == EM_386) & (h.e_version == EV_CURRENT))
-
-/* section table - ? */
-typedef struct
-{
- Elf32_Word sh_name; /* Section name (string tbl index) */
- Elf32_Word sh_type; /* Section type */
- Elf32_Word sh_flags; /* Section flags */
- Elf32_Addr sh_addr; /* Section virtual addr at execution */
- Elf32_Off sh_offset; /* Section file offset */
- Elf32_Word sh_size; /* Section size in bytes */
- Elf32_Word sh_link; /* Link to another section */
- Elf32_Word sh_info; /* Additional section information */
- Elf32_Word sh_addralign; /* Section alignment */
- Elf32_Word sh_entsize; /* Entry size if section holds table */
-}
-Elf32_Shdr;
-
-/* symbol table - page 4-25, figure 4-15 */
-typedef struct
-{
- Elf32_Word st_name;
- Elf32_Addr st_value;
- Elf32_Word st_size;
- unsigned char st_info;
- unsigned char st_other;
- Elf32_Half st_shndx;
-}
-Elf32_Sym;
-
-/* symbol type and binding attributes - page 4-26 */
-
-#define ELF32_ST_BIND(i) ((i) >> 4)
-#define ELF32_ST_TYPE(i) ((i) & 0xf)
-#define ELF32_ST_INFO(b,t) (((b)<<4)+((t)&0xf))
-
-/* symbol binding - page 4-26, figure 4-16 */
-
-#define STB_LOCAL 0
-#define STB_GLOBAL 1
-#define STB_WEAK 2
-#define STB_LOPROC 13
-#define STB_HIPROC 15
-
-/* symbol types - page 4-28, figure 4-17 */
-
-#define STT_NOTYPE 0
-#define STT_OBJECT 1
-#define STT_FUNC 2
-#define STT_SECTION 3
-#define STT_FILE 4
-#define STT_LOPROC 13
-#define STT_HIPROC 15
-
-
-/* Macros to split/combine relocation type and symbol page 4-32 */
-
-#define ELF32_R_SYM(__i) ((__i)>>8)
-#define ELF32_R_TYPE(__i) ((unsigned char) (__i))
-#define ELF32_R_INFO(__s, __t) (((__s)<<8) + (unsigned char) (__t))
-
-
-/* program header - page 5-2, figure 5-1 */
-
-typedef struct
-{
- Elf32_Word p_type;
- Elf32_Off p_offset;
- Elf32_Addr p_vaddr;
- Elf32_Addr p_paddr;
- Elf32_Word p_filesz;
- Elf32_Word p_memsz;
- Elf32_Word p_flags;
- Elf32_Word p_align;
-}
-Elf32_Phdr;
-
-/* segment types - page 5-3, figure 5-2 */
-
-#define PT_NULL 0
-#define PT_LOAD 1
-#define PT_DYNAMIC 2
-#define PT_INTERP 3
-#define PT_NOTE 4
-#define PT_SHLIB 5
-#define PT_PHDR 6
-
-#define PT_LOPROC 0x70000000
-#define PT_HIPROC 0x7fffffff
-
-/* segment permissions - page 5-6 */
-
-#define PF_X 0x1
-#define PF_W 0x2
-#define PF_R 0x4
-#define PF_MASKPROC 0xf0000000
-
-
-/* dynamic structure - page 5-15, figure 5-9 */
-
-typedef struct
-{
- Elf32_Sword d_tag;
- union
- {
- Elf32_Word d_val;
- Elf32_Addr d_ptr;
- }
- d_un;
-}
-Elf32_Dyn;
-
-/* Dynamic array tags - page 5-16, figure 5-10. */
-
-#define DT_NULL 0
-#define DT_NEEDED 1
-#define DT_PLTRELSZ 2
-#define DT_PLTGOT 3
-#define DT_HASH 4
-#define DT_STRTAB 5
-#define DT_SYMTAB 6
-#define DT_RELA 7
-#define DT_RELASZ 8
-#define DT_RELAENT 9
-#define DT_STRSZ 10
-#define DT_SYMENT 11
-#define DT_INIT 12
-#define DT_FINI 13
-#define DT_SONAME 14
-#define DT_RPATH 15
-#define DT_SYMBOLIC 16
-#define DT_REL 17
-#define DT_RELSZ 18
-#define DT_RELENT 19
-#define DT_PLTREL 20
-#define DT_DEBUG 21
-#define DT_TEXTREL 22
-#define DT_JMPREL 23
diff --git a/xen/include/asm-x86_64/elf.h b/xen/include/asm-x86_64/elf.h
deleted file mode 100644
index ded22856d0..0000000000
--- a/xen/include/asm-x86_64/elf.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * GRUB -- GRand Unified Bootloader
- * Copyright (C) 1996 Erich Boleyn <erich@uruk.org>
- * Copyright (C) 2001 Free Software Foundation, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* 32-bit data types */
-
-typedef unsigned long Elf32_Addr;
-typedef unsigned short Elf32_Half;
-typedef unsigned long Elf32_Off;
-typedef signed long Elf32_Sword;
-typedef unsigned long Elf32_Word;
-/* "unsigned char" already exists */
-
-/* ELF header */
-typedef struct
-{
-
-#define EI_NIDENT 16
-
- /* first four characters are defined below */
-#define EI_MAG0 0
-#define ELFMAG0 0x7f
-#define EI_MAG1 1
-#define ELFMAG1 'E'
-#define EI_MAG2 2
-#define ELFMAG2 'L'
-#define EI_MAG3 3
-#define ELFMAG3 'F'
-
-#define EI_CLASS 4 /* data sizes */
-#define ELFCLASS32 1 /* i386 -- up to 32-bit data sizes present */
-
-#define EI_DATA 5 /* data type and ordering */
-#define ELFDATA2LSB 1 /* i386 -- LSB 2's complement */
-
-#define EI_VERSION 6 /* version number. "e_version" must be the same */
-#define EV_CURRENT 1 /* current version number */
-
-#define EI_PAD 7 /* from here in is just padding */
-
-#define EI_BRAND 8 /* start of OS branding (This is
- obviously illegal against the ELF
- standard.) */
-
- unsigned char e_ident[EI_NIDENT]; /* basic identification block */
-
-#define ET_EXEC 2 /* we only care about executable types */
- Elf32_Half e_type; /* file types */
-
-#define EM_386 3 /* i386 -- obviously use this one */
- Elf32_Half e_machine; /* machine types */
- Elf32_Word e_version; /* use same as "EI_VERSION" above */
- Elf32_Addr e_entry; /* entry point of the program */
- Elf32_Off e_phoff; /* program header table file offset */
- Elf32_Off e_shoff; /* section header table file offset */
- Elf32_Word e_flags; /* flags */
- Elf32_Half e_ehsize; /* elf header size in bytes */
- Elf32_Half e_phentsize; /* program header entry size */
- Elf32_Half e_phnum; /* number of entries in program header */
- Elf32_Half e_shentsize; /* section header entry size */
- Elf32_Half e_shnum; /* number of entries in section header */
-
-#define SHN_UNDEF 0
-#define SHN_LORESERVE 0xff00
-#define SHN_LOPROC 0xff00
-#define SHN_HIPROC 0xff1f
-#define SHN_ABS 0xfff1
-#define SHN_COMMON 0xfff2
-#define SHN_HIRESERVE 0xffff
- Elf32_Half e_shstrndx; /* section header table index */
-}
-Elf32_Ehdr;
-
-
-#define BOOTABLE_I386_ELF(h) \
- ((h.e_ident[EI_MAG0] == ELFMAG0) & (h.e_ident[EI_MAG1] == ELFMAG1) \
- & (h.e_ident[EI_MAG2] == ELFMAG2) & (h.e_ident[EI_MAG3] == ELFMAG3) \
- & (h.e_ident[EI_CLASS] == ELFCLASS32) & (h.e_ident[EI_DATA] == ELFDATA2LSB) \
- & (h.e_ident[EI_VERSION] == EV_CURRENT) & (h.e_type == ET_EXEC) \
- & (h.e_machine == EM_386) & (h.e_version == EV_CURRENT))
-
-/* section table - ? */
-typedef struct
-{
- Elf32_Word sh_name; /* Section name (string tbl index) */
- Elf32_Word sh_type; /* Section type */
- Elf32_Word sh_flags; /* Section flags */
- Elf32_Addr sh_addr; /* Section virtual addr at execution */
- Elf32_Off sh_offset; /* Section file offset */
- Elf32_Word sh_size; /* Section size in bytes */
- Elf32_Word sh_link; /* Link to another section */
- Elf32_Word sh_info; /* Additional section information */
- Elf32_Word sh_addralign; /* Section alignment */
- Elf32_Word sh_entsize; /* Entry size if section holds table */
-}
-Elf32_Shdr;
-
-/* symbol table - page 4-25, figure 4-15 */
-typedef struct
-{
- Elf32_Word st_name;
- Elf32_Addr st_value;
- Elf32_Word st_size;
- unsigned char st_info;
- unsigned char st_other;
- Elf32_Half st_shndx;
-}
-Elf32_Sym;
-
-/* symbol type and binding attributes - page 4-26 */
-
-#define ELF32_ST_BIND(i) ((i) >> 4)
-#define ELF32_ST_TYPE(i) ((i) & 0xf)
-#define ELF32_ST_INFO(b,t) (((b)<<4)+((t)&0xf))
-
-/* symbol binding - page 4-26, figure 4-16 */
-
-#define STB_LOCAL 0
-#define STB_GLOBAL 1
-#define STB_WEAK 2
-#define STB_LOPROC 13
-#define STB_HIPROC 15
-
-/* symbol types - page 4-28, figure 4-17 */
-
-#define STT_NOTYPE 0
-#define STT_OBJECT 1
-#define STT_FUNC 2
-#define STT_SECTION 3
-#define STT_FILE 4
-#define STT_LOPROC 13
-#define STT_HIPROC 15
-
-
-/* Macros to split/combine relocation type and symbol page 4-32 */
-
-#define ELF32_R_SYM(__i) ((__i)>>8)
-#define ELF32_R_TYPE(__i) ((unsigned char) (__i))
-#define ELF32_R_INFO(__s, __t) (((__s)<<8) + (unsigned char) (__t))
-
-
-/* program header - page 5-2, figure 5-1 */
-
-typedef struct
-{
- Elf32_Word p_type;
- Elf32_Off p_offset;
- Elf32_Addr p_vaddr;
- Elf32_Addr p_paddr;
- Elf32_Word p_filesz;
- Elf32_Word p_memsz;
- Elf32_Word p_flags;
- Elf32_Word p_align;
-}
-Elf32_Phdr;
-
-/* segment types - page 5-3, figure 5-2 */
-
-#define PT_NULL 0
-#define PT_LOAD 1
-#define PT_DYNAMIC 2
-#define PT_INTERP 3
-#define PT_NOTE 4
-#define PT_SHLIB 5
-#define PT_PHDR 6
-
-#define PT_LOPROC 0x70000000
-#define PT_HIPROC 0x7fffffff
-
-/* segment permissions - page 5-6 */
-
-#define PF_X 0x1
-#define PF_W 0x2
-#define PF_R 0x4
-#define PF_MASKPROC 0xf0000000
-
-
-/* dynamic structure - page 5-15, figure 5-9 */
-
-typedef struct
-{
- Elf32_Sword d_tag;
- union
- {
- Elf32_Word d_val;
- Elf32_Addr d_ptr;
- }
- d_un;
-}
-Elf32_Dyn;
-
-/* Dynamic array tags - page 5-16, figure 5-10. */
-
-#define DT_NULL 0
-#define DT_NEEDED 1
-#define DT_PLTRELSZ 2
-#define DT_PLTGOT 3
-#define DT_HASH 4
-#define DT_STRTAB 5
-#define DT_SYMTAB 6
-#define DT_RELA 7
-#define DT_RELASZ 8
-#define DT_RELAENT 9
-#define DT_STRSZ 10
-#define DT_SYMENT 11
-#define DT_INIT 12
-#define DT_FINI 13
-#define DT_SONAME 14
-#define DT_RPATH 15
-#define DT_SYMBOLIC 16
-#define DT_REL 17
-#define DT_RELSZ 18
-#define DT_RELENT 19
-#define DT_PLTREL 20
-#define DT_DEBUG 21
-#define DT_TEXTREL 22
-#define DT_JMPREL 23
diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h
index 2b27b4b824..2335ed5ad2 100644
--- a/xen/include/hypervisor-ifs/hypervisor-if.h
+++ b/xen/include/hypervisor-ifs/hypervisor-if.h
@@ -247,18 +247,41 @@ typedef struct shared_info_st
} shared_info_t;
/*
- * NB. We expect that this struct is smaller than a page.
+ * Start-of-day memory layout for the initial domain (DOM0):
+ * 1. The domain is started within a contiguous virtual-memory region.
+ * 2. The contiguous region begins and ends on an aligned 4MB boundary.
+ * 3. The region start corresponds to the load address of the OS image.
+ * If the load address is not 4MB aligned then the address is rounded down.
+ * 4. This is the order of bootstrap elements in the initial virtual region:
+ * a. relocated kernel image
+ * b. initial ram disk [mod_start, mod_len]
+ * c. list of allocated page frames [mfn_list, nr_pages]
+ * d. bootstrap page tables [pt_base, CR3 (x86)]
+ * e. start_info_t structure [register ESI (x86)]
+ * f. bootstrap stack [register ESP (x86)]
+ * 5. Bootstrap elements are packed together, but each is 4kB-aligned.
+ * 6. The initial ram disk may be omitted.
+ * 7. The list of page frames forms a contiguous 'pseudo-physical' memory
+ * layout for the domain. In particular, the bootstrap virtual-memory
+ * region is a 1:1 mapping to the first section of the pseudo-physical map.
+ * 8. All bootstrap elements are mapped read-writable for the guest OS. The
+ * only exception is the bootstrap page table, which is mapped read-only.
+ * 9. There is guaranteed to be at least 512kB padding after the final
+ * bootstrap element. If necessary, the bootstrap virtual region is
+ * extended by an extra 4MB to ensure this.
*/
typedef struct start_info_st {
/* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */
- unsigned long nr_pages; /* total pages allocated to this domain. */
- unsigned long shared_info; /* MACHINE address of shared info struct.*/
+ unsigned long nr_pages; /* total pages allocated to this domain. */
+ unsigned long shared_info; /* MACHINE address of shared info struct.*/
unsigned long flags; /* SIF_xxx flags. */
/* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */
- unsigned long pt_base; /* VIRTUAL address of page directory. */
- unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
- unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
- unsigned char cmd_line[1]; /* Variable-length options. */
+ unsigned long pt_base; /* VIRTUAL address of page directory. */
+ unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */
+ unsigned long mfn_list; /* VIRTUAL address of page-frame list. */
+ unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
+ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
+ unsigned char cmd_line[1]; /* Variable-length options. */
} start_info_t;
/* These flags are passed in the 'flags' field of start_info_t. */
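
Given the layout rules above, a guest can derive everything it needs from the new start_info_t fields; a sketch of the start-of-day computation (cf. the setup.c hunk below; 'si' and the local names are illustrative):

    /* Sketch: locate the pseudo-phys -> machine table and the first frame
     * that is free for the guest's own use. All bootstrap elements lie
     * below pt_base plus the nr_pt_frames page-table frames. */
    unsigned long *m2p = (unsigned long *)si->mfn_list;
    pgd_t *boot_pgd    = (pgd_t *)si->pt_base;   /* mapped read-only */
    unsigned long first_free_pfn =
        (__pa(si->pt_base) >> PAGE_SHIFT) + si->nr_pt_frames;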
diff --git a/xen/include/xen/elf.h b/xen/include/xen/elf.h
new file mode 100644
index 0000000000..ecf6bbca97
--- /dev/null
+++ b/xen/include/xen/elf.h
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 1995, 1996 Erik Theisen. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+typedef u8 Elf_Byte;
+
+typedef u32 Elf32_Addr; /* Unsigned program address */
+typedef u32 Elf32_Off; /* Unsigned file offset */
+typedef s32 Elf32_Sword; /* Signed large integer */
+typedef u32 Elf32_Word; /* Unsigned large integer */
+typedef u16 Elf32_Half; /* Unsigned medium integer */
+
+typedef u64 Elf64_Addr;
+typedef u64 Elf64_Off;
+typedef s32 Elf64_Shalf;
+
+typedef s32 Elf64_Sword;
+typedef u32 Elf64_Word;
+
+typedef s64 Elf64_Sxword;
+typedef u64 Elf64_Xword;
+
+typedef u32 Elf64_Half;
+typedef u16 Elf64_Quarter;
+
+/*
+ * e_ident[] identification indexes
+ * See http://www.caldera.com/developers/gabi/2000-07-17/ch4.eheader.html
+ */
+#define EI_MAG0 0 /* file ID */
+#define EI_MAG1 1 /* file ID */
+#define EI_MAG2 2 /* file ID */
+#define EI_MAG3 3 /* file ID */
+#define EI_CLASS 4 /* file class */
+#define EI_DATA 5 /* data encoding */
+#define EI_VERSION 6 /* ELF header version */
+#define EI_OSABI 7 /* OS/ABI ID */
+#define EI_ABIVERSION 8 /* ABI version */
+#define EI_PAD 9 /* start of pad bytes */
+#define EI_NIDENT 16 /* Size of e_ident[] */
+
+/* e_ident[] magic number */
+#define ELFMAG0 0x7f /* e_ident[EI_MAG0] */
+#define ELFMAG1 'E' /* e_ident[EI_MAG1] */
+#define ELFMAG2 'L' /* e_ident[EI_MAG2] */
+#define ELFMAG3 'F' /* e_ident[EI_MAG3] */
+#define ELFMAG "\177ELF" /* magic */
+#define SELFMAG 4 /* size of magic */
+
+/* e_ident[] file class */
+#define ELFCLASSNONE 0 /* invalid */
+#define ELFCLASS32 1 /* 32-bit objs */
+#define ELFCLASS64 2 /* 64-bit objs */
+#define ELFCLASSNUM 3 /* number of classes */
+
+/* e_ident[] data encoding */
+#define ELFDATANONE 0 /* invalid */
+#define ELFDATA2LSB 1 /* Little-Endian */
+#define ELFDATA2MSB 2 /* Big-Endian */
+#define ELFDATANUM 3 /* number of data encode defines */
+
+/* e_ident[] Operating System/ABI */
+#define ELFOSABI_SYSV 0 /* UNIX System V ABI */
+#define ELFOSABI_HPUX 1 /* HP-UX operating system */
+#define ELFOSABI_NETBSD 2 /* NetBSD */
+#define ELFOSABI_LINUX 3 /* GNU/Linux */
+#define ELFOSABI_HURD 4 /* GNU/Hurd */
+#define ELFOSABI_86OPEN 5 /* 86Open common IA32 ABI */
+#define ELFOSABI_SOLARIS 6 /* Solaris */
+#define ELFOSABI_MONTEREY 7 /* Monterey */
+#define ELFOSABI_IRIX 8 /* IRIX */
+#define ELFOSABI_FREEBSD 9 /* FreeBSD */
+#define ELFOSABI_TRU64 10 /* TRU64 UNIX */
+#define ELFOSABI_MODESTO 11 /* Novell Modesto */
+#define ELFOSABI_OPENBSD 12 /* OpenBSD */
+#define ELFOSABI_ARM 97 /* ARM */
+#define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */
+
+/* e_ident */
+#define IS_ELF(ehdr) ((ehdr).e_ident[EI_MAG0] == ELFMAG0 && \
+ (ehdr).e_ident[EI_MAG1] == ELFMAG1 && \
+ (ehdr).e_ident[EI_MAG2] == ELFMAG2 && \
+ (ehdr).e_ident[EI_MAG3] == ELFMAG3)
+
+/* ELF Header */
+typedef struct elfhdr {
+ unsigned char e_ident[EI_NIDENT]; /* ELF Identification */
+ Elf32_Half e_type; /* object file type */
+ Elf32_Half e_machine; /* machine */
+ Elf32_Word e_version; /* object file version */
+ Elf32_Addr e_entry; /* virtual entry point */
+ Elf32_Off e_phoff; /* program header table offset */
+ Elf32_Off e_shoff; /* section header table offset */
+ Elf32_Word e_flags; /* processor-specific flags */
+ Elf32_Half e_ehsize; /* ELF header size */
+ Elf32_Half e_phentsize; /* program header entry size */
+ Elf32_Half e_phnum; /* number of program header entries */
+ Elf32_Half e_shentsize; /* section header entry size */
+ Elf32_Half e_shnum; /* number of section header entries */
+ Elf32_Half e_shstrndx; /* section header table's "section
+ header string table" entry offset */
+} Elf32_Ehdr;
+
+typedef struct {
+ unsigned char e_ident[EI_NIDENT]; /* Id bytes */
+ Elf64_Quarter e_type; /* file type */
+ Elf64_Quarter e_machine; /* machine type */
+ Elf64_Half e_version; /* version number */
+ Elf64_Addr e_entry; /* entry point */
+ Elf64_Off e_phoff; /* Program hdr offset */
+ Elf64_Off e_shoff; /* Section hdr offset */
+ Elf64_Half e_flags; /* Processor flags */
+ Elf64_Quarter e_ehsize; /* sizeof ehdr */
+ Elf64_Quarter e_phentsize; /* Program header entry size */
+ Elf64_Quarter e_phnum; /* Number of program headers */
+ Elf64_Quarter e_shentsize; /* Section header entry size */
+ Elf64_Quarter e_shnum; /* Number of section headers */
+ Elf64_Quarter e_shstrndx; /* String table index */
+} Elf64_Ehdr;
+
+/* e_type */
+#define ET_NONE 0 /* No file type */
+#define ET_REL 1 /* relocatable file */
+#define ET_EXEC 2 /* executable file */
+#define ET_DYN 3 /* shared object file */
+#define ET_CORE 4 /* core file */
+#define ET_NUM 5 /* number of types */
+#define ET_LOPROC 0xff00 /* reserved range for processor */
+#define ET_HIPROC 0xffff /* specific e_type */
+
+/* e_machine */
+#define EM_NONE 0 /* No Machine */
+#define EM_M32 1 /* AT&T WE 32100 */
+#define EM_SPARC 2 /* SPARC */
+#define EM_386 3 /* Intel 80386 */
+#define EM_68K 4 /* Motorola 68000 */
+#define EM_88K 5 /* Motorola 88000 */
+#define EM_486 6 /* Intel 80486 - unused? */
+#define EM_860 7 /* Intel 80860 */
+#define EM_MIPS 8 /* MIPS R3000 Big-Endian only */
+/*
+ * Don't know if EM_MIPS_RS4_BE,
+ * EM_SPARC64, EM_PARISC,
+ * or EM_PPC are ABI compliant
+ */
+#define EM_MIPS_RS4_BE 10 /* MIPS R4000 Big-Endian */
+#define EM_SPARC64 11 /* SPARC v9 64-bit unofficial */
+#define EM_PARISC 15 /* HPPA */
+#define EM_SPARC32PLUS 18 /* Enhanced instruction set SPARC */
+#define EM_PPC 20 /* PowerPC */
+#define EM_ARM 40 /* Advanced RISC Machines ARM */
+#define EM_ALPHA 41 /* DEC ALPHA */
+#define EM_SPARCV9 43 /* SPARC version 9 */
+#define EM_ALPHA_EXP 0x9026 /* DEC ALPHA */
+#define EM_X86_64 62 /* AMD x86-64 architecture */
+#define EM_VAX 75 /* DEC VAX */
+#define EM_NUM 15 /* number of machine types */
+
+/* Version */
+#define EV_NONE 0 /* Invalid */
+#define EV_CURRENT 1 /* Current */
+#define EV_NUM 2 /* number of versions */
+
+/* Section Header */
+typedef struct {
+ Elf32_Word sh_name; /* name - index into section header
+ string table section */
+ Elf32_Word sh_type; /* type */
+ Elf32_Word sh_flags; /* flags */
+ Elf32_Addr sh_addr; /* address */
+ Elf32_Off sh_offset; /* file offset */
+ Elf32_Word sh_size; /* section size */
+ Elf32_Word sh_link; /* section header table index link */
+ Elf32_Word sh_info; /* extra information */
+ Elf32_Word sh_addralign; /* address alignment */
+ Elf32_Word sh_entsize; /* section entry size */
+} Elf32_Shdr;
+
+typedef struct {
+ Elf64_Half sh_name; /* section name */
+ Elf64_Half sh_type; /* section type */
+ Elf64_Xword sh_flags; /* section flags */
+ Elf64_Addr sh_addr; /* virtual address */
+ Elf64_Off sh_offset; /* file offset */
+ Elf64_Xword sh_size; /* section size */
+ Elf64_Half sh_link; /* link to another */
+ Elf64_Half sh_info; /* misc info */
+ Elf64_Xword sh_addralign; /* memory alignment */
+ Elf64_Xword sh_entsize; /* table entry size */
+} Elf64_Shdr;
+
+/* Special Section Indexes */
+#define SHN_UNDEF 0 /* undefined */
+#define SHN_LORESERVE 0xff00 /* lower bounds of reserved indexes */
+#define SHN_LOPROC 0xff00 /* reserved range for processor */
+#define SHN_HIPROC 0xff1f /* specific section indexes */
+#define SHN_ABS 0xfff1 /* absolute value */
+#define SHN_COMMON 0xfff2 /* common symbol */
+#define SHN_HIRESERVE 0xffff /* upper bounds of reserved indexes */
+
+/* sh_type */
+#define SHT_NULL 0 /* inactive */
+#define SHT_PROGBITS 1 /* program defined information */
+#define SHT_SYMTAB 2 /* symbol table section */
+#define SHT_STRTAB 3 /* string table section */
+#define SHT_RELA 4 /* relocation section with addends */
+#define SHT_HASH 5 /* symbol hash table section */
+#define SHT_DYNAMIC 6 /* dynamic section */
+#define SHT_NOTE 7 /* note section */
+#define SHT_NOBITS 8 /* no space section */
+#define SHT_REL 9 /* relocation section without addends */
+#define SHT_SHLIB 10 /* reserved - purpose unknown */
+#define SHT_DYNSYM 11 /* dynamic symbol table section */
+#define SHT_NUM 12 /* number of section types */
+#define SHT_LOPROC 0x70000000 /* reserved range for processor */
+#define SHT_HIPROC 0x7fffffff /* specific section header types */
+#define SHT_LOUSER 0x80000000 /* reserved range for application */
+#define SHT_HIUSER 0xffffffff /* specific indexes */
+
+/* Section names */
+#define ELF_BSS ".bss" /* uninitialized data */
+#define ELF_DATA ".data" /* initialized data */
+#define ELF_DEBUG ".debug" /* debug */
+#define ELF_DYNAMIC ".dynamic" /* dynamic linking information */
+#define ELF_DYNSTR ".dynstr" /* dynamic string table */
+#define ELF_DYNSYM ".dynsym" /* dynamic symbol table */
+#define ELF_FINI ".fini" /* termination code */
+#define ELF_GOT ".got" /* global offset table */
+#define ELF_HASH ".hash" /* symbol hash table */
+#define ELF_INIT ".init" /* initialization code */
+#define ELF_REL_DATA ".rel.data" /* relocation data */
+#define ELF_REL_FINI ".rel.fini" /* relocation termination code */
+#define ELF_REL_INIT ".rel.init" /* relocation initialization code */
+#define ELF_REL_DYN ".rel.dyn" /* relocation dynamic link info */
+#define ELF_REL_RODATA ".rel.rodata" /* relocation read-only data */
+#define ELF_REL_TEXT ".rel.text" /* relocation code */
+#define ELF_RODATA ".rodata" /* read-only data */
+#define ELF_SHSTRTAB ".shstrtab" /* section header string table */
+#define ELF_STRTAB ".strtab" /* string table */
+#define ELF_SYMTAB ".symtab" /* symbol table */
+#define ELF_TEXT ".text" /* code */
+
+
+/* Section Attribute Flags - sh_flags */
+#define SHF_WRITE 0x1 /* Writable */
+#define SHF_ALLOC 0x2 /* occupies memory */
+#define SHF_EXECINSTR 0x4 /* executable */
+#define SHF_MASKPROC 0xf0000000 /* reserved bits for processor */
+ /* specific section attributes */
+
+/* Symbol Table Entry */
+typedef struct elf32_sym {
+ Elf32_Word st_name; /* name - index into string table */
+ Elf32_Addr st_value; /* symbol value */
+ Elf32_Word st_size; /* symbol size */
+ unsigned char st_info; /* type and binding */
+ unsigned char st_other; /* 0 - no defined meaning */
+ Elf32_Half st_shndx; /* section header index */
+} Elf32_Sym;
+
+typedef struct {
+ Elf64_Half st_name; /* Symbol name index in str table */
+ Elf_Byte st_info; /* type / binding attrs */
+ Elf_Byte st_other; /* unused */
+ Elf64_Quarter st_shndx; /* section index of symbol */
+ Elf64_Xword st_value; /* value of symbol */
+ Elf64_Xword st_size; /* size of symbol */
+} Elf64_Sym;
+
+/* Symbol table index */
+#define STN_UNDEF 0 /* undefined */
+
+/* Extract symbol info - st_info */
+#define ELF32_ST_BIND(x) ((x) >> 4)
+#define ELF32_ST_TYPE(x) (((unsigned int)(x)) & 0xf)
+#define ELF32_ST_INFO(b,t) (((b) << 4) + ((t) & 0xf))
+
+#define ELF64_ST_BIND(x) ((x) >> 4)
+#define ELF64_ST_TYPE(x) (((unsigned int)(x)) & 0xf)
+#define ELF64_ST_INFO(b,t) (((b) << 4) + ((t) & 0xf))
+
+/* Symbol Binding - ELF32_ST_BIND - st_info */
+#define STB_LOCAL 0 /* Local symbol */
+#define STB_GLOBAL 1 /* Global symbol */
+#define STB_WEAK 2 /* like global - lower precedence */
+#define STB_NUM 3 /* number of symbol bindings */
+#define STB_LOPROC 13 /* reserved range for processor */
+#define STB_HIPROC 15 /* specific symbol bindings */
+
+/* Symbol type - ELF32_ST_TYPE - st_info */
+#define STT_NOTYPE 0 /* not specified */
+#define STT_OBJECT 1 /* data object */
+#define STT_FUNC 2 /* function */
+#define STT_SECTION 3 /* section */
+#define STT_FILE 4 /* file */
+#define STT_NUM 5 /* number of symbol types */
+#define STT_LOPROC 13 /* reserved range for processor */
+#define STT_HIPROC 15 /* specific symbol types */
+
+/* Relocation entry with implicit addend */
+typedef struct {
+ Elf32_Addr r_offset; /* offset of relocation */
+ Elf32_Word r_info; /* symbol table index and type */
+} Elf32_Rel;
+
+/* Relocation entry with explicit addend */
+typedef struct {
+ Elf32_Addr r_offset; /* offset of relocation */
+ Elf32_Word r_info; /* symbol table index and type */
+ Elf32_Sword r_addend;
+} Elf32_Rela;
+
+/* Extract relocation info - r_info */
+#define ELF32_R_SYM(i) ((i) >> 8)
+#define ELF32_R_TYPE(i) ((unsigned char) (i))
+#define ELF32_R_INFO(s,t) (((s) << 8) + (unsigned char)(t))
+
+typedef struct {
+ Elf64_Xword r_offset; /* where to do it */
+ Elf64_Xword r_info; /* index & type of relocation */
+} Elf64_Rel;
+
+typedef struct {
+ Elf64_Xword r_offset; /* where to do it */
+ Elf64_Xword r_info; /* index & type of relocation */
+ Elf64_Sxword r_addend; /* adjustment value */
+} Elf64_Rela;
+
+#define ELF64_R_SYM(info) ((info) >> 32)
+#define ELF64_R_TYPE(info) ((info) & 0xFFFFFFFF)
+#define ELF64_R_INFO(s,t) (((s) << 32) + (u32)(t))
+
+/* Program Header */
+typedef struct {
+ Elf32_Word p_type; /* segment type */
+ Elf32_Off p_offset; /* segment offset */
+ Elf32_Addr p_vaddr; /* virtual address of segment */
+ Elf32_Addr p_paddr; /* physical address - ignored? */
+ Elf32_Word p_filesz; /* number of bytes in file for seg. */
+ Elf32_Word p_memsz; /* number of bytes in mem. for seg. */
+ Elf32_Word p_flags; /* flags */
+ Elf32_Word p_align; /* memory alignment */
+} Elf32_Phdr;
+
+typedef struct {
+ Elf64_Half p_type; /* entry type */
+ Elf64_Half p_flags; /* flags */
+ Elf64_Off p_offset; /* offset */
+ Elf64_Addr p_vaddr; /* virtual address */
+ Elf64_Addr p_paddr; /* physical address */
+ Elf64_Xword p_filesz; /* file size */
+ Elf64_Xword p_memsz; /* memory size */
+ Elf64_Xword p_align; /* memory & file alignment */
+} Elf64_Phdr;
+
+/* Segment types - p_type */
+#define PT_NULL 0 /* unused */
+#define PT_LOAD 1 /* loadable segment */
+#define PT_DYNAMIC 2 /* dynamic linking section */
+#define PT_INTERP 3 /* the RTLD */
+#define PT_NOTE 4 /* auxiliary information */
+#define PT_SHLIB 5 /* reserved - purpose undefined */
+#define PT_PHDR 6 /* program header */
+#define PT_NUM 7 /* Number of segment types */
+#define PT_LOPROC 0x70000000 /* reserved range for processor */
+#define PT_HIPROC 0x7fffffff /* specific segment types */
+
+/* Segment flags - p_flags */
+#define PF_X 0x1 /* Executable */
+#define PF_W 0x2 /* Writable */
+#define PF_R 0x4 /* Readable */
+#define PF_MASKPROC 0xf0000000 /* reserved bits for processor */
+ /* specific segment flags */
+
+/* Dynamic structure */
+typedef struct {
+ Elf32_Sword d_tag; /* controls meaning of d_val */
+ union {
+ Elf32_Word d_val; /* Multiple meanings - see d_tag */
+ Elf32_Addr d_ptr; /* program virtual address */
+ } d_un;
+} Elf32_Dyn;
+
+typedef struct {
+ Elf64_Xword d_tag; /* controls meaning of d_val */
+ union {
+ Elf64_Addr d_ptr;
+ Elf64_Xword d_val;
+ } d_un;
+} Elf64_Dyn;
+
+/* Dynamic Array Tags - d_tag */
+#define DT_NULL 0 /* marks end of _DYNAMIC array */
+#define DT_NEEDED 1 /* string table offset of needed lib */
+#define DT_PLTRELSZ 2 /* size of relocation entries in PLT */
+#define DT_PLTGOT 3 /* address PLT/GOT */
+#define DT_HASH 4 /* address of symbol hash table */
+#define DT_STRTAB 5 /* address of string table */
+#define DT_SYMTAB 6 /* address of symbol table */
+#define DT_RELA 7 /* address of relocation table */
+#define DT_RELASZ 8 /* size of relocation table */
+#define DT_RELAENT 9 /* size of relocation entry */
+#define DT_STRSZ 10 /* size of string table */
+#define DT_SYMENT 11 /* size of symbol table entry */
+#define DT_INIT 12 /* address of initialization func. */
+#define DT_FINI 13 /* address of termination function */
+#define DT_SONAME 14 /* string table offset of shared obj */
+#define DT_RPATH 15 /* string table offset of library
+ search path */
+#define DT_SYMBOLIC 16 /* start sym search in shared obj. */
+#define DT_REL 17 /* address of rel. tbl. w addends */
+#define DT_RELSZ 18 /* size of DT_REL relocation table */
+#define DT_RELENT 19 /* size of DT_REL relocation entry */
+#define DT_PLTREL 20 /* PLT referenced relocation entry */
+#define DT_DEBUG 21 /* debugger */
+#define DT_TEXTREL 22 /* Allow rel. mod. to unwritable seg */
+#define DT_JMPREL 23 /* add. of PLT's relocation entries */
+#define DT_BIND_NOW 24 /* Bind now regardless of env setting */
+#define DT_NUM 25 /* Number used. */
+#define DT_LOPROC 0x70000000 /* reserved range for processor */
+#define DT_HIPROC 0x7fffffff /* specific dynamic array tags */
+
+/* Standard ELF hashing function */
+unsigned int elf_hash(const unsigned char *name);
+
+/*
+ * Note Definitions
+ */
+typedef struct {
+ Elf32_Word namesz;
+ Elf32_Word descsz;
+ Elf32_Word type;
+} Elf32_Note;
+
+typedef struct {
+ Elf64_Half namesz;
+ Elf64_Half descsz;
+ Elf64_Half type;
+} Elf64_Note;
+
+
+#if defined(ELFSIZE)
+#define CONCAT(x,y) __CONCAT(x,y)
+#define ELFNAME(x) CONCAT(elf,CONCAT(ELFSIZE,CONCAT(_,x)))
+#define ELFNAME2(x,y) CONCAT(x,CONCAT(_elf,CONCAT(ELFSIZE,CONCAT(_,y))))
+#define ELFNAMEEND(x) CONCAT(x,CONCAT(_elf,ELFSIZE))
+#define ELFDEFNNAME(x) CONCAT(ELF,CONCAT(ELFSIZE,CONCAT(_,x)))
+#endif
+
+#if defined(ELFSIZE) && (ELFSIZE == 32)
+#define Elf_Ehdr Elf32_Ehdr
+#define Elf_Phdr Elf32_Phdr
+#define Elf_Shdr Elf32_Shdr
+#define Elf_Sym Elf32_Sym
+#define Elf_Rel Elf32_Rel
+#define Elf_RelA Elf32_Rela
+#define Elf_Dyn Elf32_Dyn
+#define Elf_Word Elf32_Word
+#define Elf_Sword Elf32_Sword
+#define Elf_Addr Elf32_Addr
+#define Elf_Off Elf32_Off
+#define Elf_Nhdr Elf32_Nhdr
+#define Elf_Note Elf32_Note
+
+#define ELF_R_SYM ELF32_R_SYM
+#define ELF_R_TYPE ELF32_R_TYPE
+#define ELF_R_INFO ELF32_R_INFO
+#define ELFCLASS ELFCLASS32
+
+#define ELF_ST_BIND ELF32_ST_BIND
+#define ELF_ST_TYPE ELF32_ST_TYPE
+#define ELF_ST_INFO ELF32_ST_INFO
+
+#define AuxInfo Aux32Info
+#elif defined(ELFSIZE) && (ELFSIZE == 64)
+#define Elf_Ehdr Elf64_Ehdr
+#define Elf_Phdr Elf64_Phdr
+#define Elf_Shdr Elf64_Shdr
+#define Elf_Sym Elf64_Sym
+#define Elf_Rel Elf64_Rel
+#define Elf_RelA Elf64_Rela
+#define Elf_Dyn Elf64_Dyn
+#define Elf_Word Elf64_Word
+#define Elf_Sword Elf64_Sword
+#define Elf_Addr Elf64_Addr
+#define Elf_Off Elf64_Off
+#define Elf_Nhdr Elf64_Nhdr
+#define Elf_Note Elf64_Note
+
+#define ELF_R_SYM ELF64_R_SYM
+#define ELF_R_TYPE ELF64_R_TYPE
+#define ELF_R_INFO ELF64_R_INFO
+#define ELFCLASS ELFCLASS64
+
+#define ELF_ST_BIND ELF64_ST_BIND
+#define ELF_ST_TYPE ELF64_ST_TYPE
+#define ELF_ST_INFO ELF64_ST_INFO
+
+#define AuxInfo Aux64Info
+#endif
+
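The GRUB-derived BOOTABLE_I386_ELF check deleted above is subsumed by IS_ELF() plus explicit field tests, and the generic Elf_* names are selected by defining ELFSIZE before inclusion. A loader-style sketch under those assumptions (function name hypothetical):

    #define ELFSIZE 32
    #include <xen/elf.h>

    /* Sketch: validate a 32-bit i386 executable image. */
    static int image_ok(Elf_Ehdr *ehdr)
    {
        return IS_ELF(*ehdr) &&
               (ehdr->e_ident[EI_CLASS] == ELFCLASS) &&
               (ehdr->e_type == ET_EXEC) &&
               (ehdr->e_machine == EM_386);
    }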
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index bc5a6362ea..37705452e2 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -125,6 +125,7 @@ extern spinlock_t free_list_lock;
extern unsigned int free_pfns;
extern unsigned long max_page;
void init_frametable(unsigned long nr_pages);
+void add_to_domain_alloc_list(unsigned long ps, unsigned long pe);
struct pfn_info *alloc_domain_page(struct task_struct *p);
void free_domain_page(struct pfn_info *page);
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 58ffffa5e2..4f506df04b 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -228,10 +228,13 @@ struct task_struct *alloc_task_struct();
extern struct task_struct *do_createdomain(
domid_t dom_id, unsigned int cpu);
-extern int setup_guestos(
- struct task_struct *p, dom0_createdomain_t *params, unsigned int num_vifs,
- char *data_start, unsigned long data_len,
- char *cmdline, unsigned long initrd_len);
+extern int construct_dom0(struct task_struct *p,
+ unsigned long alloc_start,
+ unsigned long alloc_end,
+ unsigned int num_vifs,
+ char *image_start, unsigned long image_len,
+ char *initrd_start, unsigned long initrd_len,
+ char *cmdline);
extern int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *);
struct task_struct *find_domain_by_id(domid_t dom);
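
Unlike setup_guestos(), which located DOM0's memory implicitly, construct_dom0() is handed its physical arena as an explicit [alloc_start, alloc_end) range. A sketch of a call site (all argument values illustrative):

    /* Sketch: build DOM0 from a pre-reserved physical range. */
    rc = construct_dom0(new_dom, dom0_memory_start, dom0_memory_end,
                        num_vifs, image_start, image_len,
                        initrd_start, initrd_len, cmdline);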
diff --git a/xenolinux-2.4.25-sparse/arch/xen/Makefile b/xenolinux-2.4.25-sparse/arch/xen/Makefile
index f52b90632f..214675d8ec 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/Makefile
+++ b/xenolinux-2.4.25-sparse/arch/xen/Makefile
@@ -19,7 +19,7 @@
override EXTRAVERSION := -xen$(EXTRAVERSION)
LD=$(CROSS_COMPILE)ld -m elf_i386
-OBJCOPY=$(CROSS_COMPILE)objcopy -O binary -R .note -R .comment -S
+OBJCOPY=$(CROSS_COMPILE)objcopy -R .note -R .comment -S
LDFLAGS=-e stext
LINKFLAGS =-T $(TOPDIR)/arch/xen/vmlinux.lds $(LDFLAGS)
diff --git a/xenolinux-2.4.25-sparse/arch/xen/boot/Makefile b/xenolinux-2.4.25-sparse/arch/xen/boot/Makefile
index 01fb131959..64b402e833 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/boot/Makefile
+++ b/xenolinux-2.4.25-sparse/arch/xen/boot/Makefile
@@ -6,17 +6,9 @@ xenolinux.gz: xenolinux
gzip -f -9 < $< > $@
xenolinux: $(TOPDIR)/vmlinux
- # Guest OS header -- first 8 bytes are identifier 'XenGuest'.
- echo -e -n 'XenGuest' >$@
- # Guest OS header -- next 4 bytes are load address (0xC0000000).
- echo -e -n '\000\000\000\300' >>$@
- $(OBJCOPY) $< xenolinux.body
- # Guest OS header is immediately followed by raw OS image.
- # Start address must be at byte 0.
- cat xenolinux.body >>$@
- rm -f xenolinux.body
+ $(OBJCOPY) $< $@
dep:
clean:
- rm -f xenolinux xenolinux.gz
\ No newline at end of file
+ rm -f xenolinux*
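
The hand-rolled 'XenGuest' binary header is retired in favour of the __xen_guest ELF section added to head.S below, which the new ELF domain builder inspects. For reference, the same section could be emitted from C roughly like this (purely illustrative):

    /* Sketch: C equivalent of head.S's new guest-identification section. */
    asm(".section __xen_guest\n"
        ".asciz \"GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=1.3\"\n"
        ".previous");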
diff --git a/xenolinux-2.4.25-sparse/arch/xen/config.in b/xenolinux-2.4.25-sparse/arch/xen/config.in
index dcf812d659..c66383f643 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/config.in
+++ b/xenolinux-2.4.25-sparse/arch/xen/config.in
@@ -82,9 +82,9 @@ fi
# tristate 'BIOS Enhanced Disk Drive calls determine boot disk (EXPERIMENTAL)' CONFIG_EDD
#fi
-#choice 'High Memory Support' \
-# "off CONFIG_NOHIGHMEM \
-# 4GB CONFIG_HIGHMEM4G \
+choice 'High Memory Support' \
+ "off CONFIG_NOHIGHMEM \
+ 4GB CONFIG_HIGHMEM4G" off
# 64GB CONFIG_HIGHMEM64G" off
if [ "$CONFIG_HIGHMEM4G" = "y" ]; then
define_bool CONFIG_HIGHMEM y
diff --git a/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S b/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S
index 8ce06571ef..c744f1bdcb 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S
+++ b/xenolinux-2.4.25-sparse/arch/xen/kernel/entry.S
@@ -369,21 +369,28 @@ critical_fixup_table:
# Hypervisor uses this for application faults while it executes.
ENTRY(failsafe_callback)
+ pushal
call SYMBOL_NAME(install_safe_pf_handler)
-1: pop %ds
-2: pop %es
-3: pop %fs
-4: pop %gs
+ movl 32(%esp),%ebx
+1: movl %ebx,%ds
+ movl 36(%esp),%ebx
+2: movl %ebx,%es
+ movl 40(%esp),%ebx
+3: movl %ebx,%fs
+ movl 44(%esp),%ebx
+4: movl %ebx,%gs
call SYMBOL_NAME(install_normal_pf_handler)
+ popal
+ addl $16,%esp
5: iret
.section .fixup,"ax"; \
-6: movl $0,(%esp); \
+6: xorl %ebx,%ebx; \
jmp 1b; \
-7: movl $0,(%esp); \
+7: xorl %ebx,%ebx; \
jmp 2b; \
-8: movl $0,(%esp); \
+8: xorl %ebx,%ebx; \
jmp 3b; \
-9: movl $0,(%esp); \
+9: xorl %ebx,%ebx; \
jmp 4b; \
10: pushl %ss; \
popl %ds; \
diff --git a/xenolinux-2.4.25-sparse/arch/xen/kernel/head.S b/xenolinux-2.4.25-sparse/arch/xen/kernel/head.S
index 361815a58b..2d9379a15b 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/kernel/head.S
+++ b/xenolinux-2.4.25-sparse/arch/xen/kernel/head.S
@@ -1,4 +1,7 @@
+.section __xen_guest
+ .asciz "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=1.3"
+
.text
#include <linux/config.h>
#include <linux/threads.h>
@@ -8,42 +11,14 @@
#include <asm/pgtable.h>
#include <asm/desc.h>
-/* Offsets in start_info structure */
-#define MOD_START 16
-#define MOD_LEN 20
-
-startup_32:
+ENTRY(stext)
+ENTRY(_stext)
cld
-
lss stack_start,%esp
-
- /* Copy initrd somewhere safe before it's clobbered by BSS. */
- mov MOD_LEN(%esi),%ecx
- shr $2,%ecx
- jz 2f /* bail from copy loop if no initrd */
- mov $SYMBOL_NAME(_end),%edi
- add MOD_LEN(%esi),%edi
- mov MOD_START(%esi),%eax
- add MOD_LEN(%esi),%eax
-1: sub $4,%eax
- sub $4,%edi
- mov (%eax),%ebx
- mov %ebx,(%edi)
- loop 1b
- mov %edi,MOD_START(%esi)
-
- /* Clear BSS first so that there are no surprises... */
-2: xorl %eax,%eax
- movl $SYMBOL_NAME(__bss_start),%edi
- movl $SYMBOL_NAME(_end),%ecx
- subl %edi,%ecx
- rep stosb
-
/* Copy the necessary stuff from start_info structure. */
- mov $SYMBOL_NAME(start_info_union),%edi
- mov $128,%ecx
+ mov $SYMBOL_NAME(start_info_union),%edi
+ mov $128,%ecx
rep movsl
-
jmp SYMBOL_NAME(start_kernel)
ENTRY(stack_start)
@@ -62,5 +37,3 @@ ENTRY(cpu0_pte_quicklist)
ENTRY(cpu0_pgd_quicklist)
.org 0x3800
-ENTRY(stext)
-ENTRY(_stext)
diff --git a/xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c b/xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c
index bce8d39497..bd65655c48 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c
+++ b/xenolinux-2.4.25-sparse/arch/xen/kernel/setup.c
@@ -69,7 +69,6 @@ char ignore_irq13; /* set if exception 16 works */
struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
unsigned long mmu_cr4_features;
-//EXPORT_SYMBOL(mmu_cr4_features);
unsigned char * vgacon_mmap;
@@ -106,6 +105,8 @@ unsigned char aux_device_present;
extern int root_mountflags;
extern char _text, _etext, _edata, _end;
+extern int blk_nohighio;
+
int enable_acpi_smp_table;
/* Raw start-of-day parameters from the hypervisor. */
@@ -160,7 +161,6 @@ static void __init parse_mem_cmdline (char ** cmdline_p)
void __init setup_arch(char **cmdline_p)
{
unsigned long bootmap_size, start_pfn, max_low_pfn;
- unsigned long i;
extern void hypervisor_callback(void);
extern void failsafe_callback(void);
@@ -168,6 +168,10 @@ void __init setup_arch(char **cmdline_p)
extern unsigned long cpu0_pte_quicklist[];
extern unsigned long cpu0_pgd_quicklist[];
+#ifndef CONFIG_HIGHIO
+ blk_nohighio = 1;
+#endif
+
HYPERVISOR_set_callbacks(
__KERNEL_CS, (unsigned long)hypervisor_callback,
__KERNEL_CS, (unsigned long)failsafe_callback);
@@ -208,7 +212,7 @@ void __init setup_arch(char **cmdline_p)
#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
/*
- * 128MB for vmalloc and initrd
+ * 128MB for vmalloc(), iomap(), kmap(), and fixaddr mappings.
*/
#define VMALLOC_RESERVE (unsigned long)(128 << 20)
#define MAXMEM (unsigned long)(HYPERVISOR_VIRT_START-PAGE_OFFSET-VMALLOC_RESERVE)
@@ -216,21 +220,9 @@ void __init setup_arch(char **cmdline_p)
#define MAX_NONPAE_PFN (1 << 20)
/*
- * partially used pages are not usable - thus
- * we are rounding upwards:
- */
-#ifdef CONFIG_BLK_DEV_INITRD
- if ( start_info.mod_start )
- start_pfn = PFN_UP(__pa(start_info.mod_start + start_info.mod_len));
- else
-#endif
- start_pfn = PFN_UP(__pa(&_end));
- max_pfn = start_info.nr_pages;
-
- /*
* Determine low and high memory ranges:
*/
- max_low_pfn = max_pfn;
+ max_low_pfn = max_pfn = start_info.nr_pages;
if (max_low_pfn > MAXMEM_PFN) {
max_low_pfn = MAXMEM_PFN;
#ifndef CONFIG_HIGHMEM
@@ -261,51 +253,36 @@ void __init setup_arch(char **cmdline_p)
}
#endif
+ phys_to_machine_mapping = (unsigned long *)start_info.mfn_list;
+ cur_pgd = init_mm.pgd = (pgd_t *)start_info.pt_base;
+
+ start_pfn = (__pa(start_info.pt_base) >> PAGE_SHIFT) +
+ start_info.nr_pt_frames;
+
/*
- * Initialize the boot-time allocator, and free up all RAM.
- * Then reserve space for OS image, and the bootmem bitmap.
+ * Initialize the boot-time allocator, and free up all RAM. Then reserve
+ * space for OS image, initrd, phys->machine table, bootstrap page table,
+ * and the bootmem bitmap.
+ * NB. There is definitely enough room for the bootmem bitmap in the
+ * bootstrap page table. We are guaranteed to get >=512kB unused 'padding'
+ * for our own use after all bootstrap elements (see hypervisor-if.h).
*/
bootmap_size = init_bootmem(start_pfn, max_low_pfn);
free_bootmem(0, PFN_PHYS(max_low_pfn));
reserve_bootmem(0, PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1);
- /* Now reserve space for the hypervisor-provided page tables. */
+#ifdef CONFIG_BLK_DEV_INITRD
+ if ( start_info.mod_start != 0 )
{
- unsigned long *pgd = (unsigned long *)start_info.pt_base;
- unsigned long pte;
- int i;
- reserve_bootmem(__pa(pgd), PAGE_SIZE);
- for ( i = 0; i < (HYPERVISOR_VIRT_START>>22); i++ )
+ if ( (__pa(start_info.mod_start) + start_info.mod_len) <=
+ (max_low_pfn << PAGE_SHIFT) )
{
- unsigned long pgde = *pgd++;
- if ( !(pgde & 1) ) continue;
- pte = machine_to_phys(pgde & PAGE_MASK);
- reserve_bootmem(pte, PAGE_SIZE);
- }
- }
- cur_pgd = init_mm.pgd = (pgd_t *)start_info.pt_base;
-
- /* Now initialise the physical->machine mapping table. */
- phys_to_machine_mapping = alloc_bootmem(max_pfn * sizeof(unsigned long));
- for ( i = 0; i < max_pfn; i++ )
- {
- unsigned long pgde, *ppte;
- unsigned long pfn = i + (PAGE_OFFSET >> PAGE_SHIFT);
- pgde = *((unsigned long *)start_info.pt_base + (pfn >> 10));
- ppte = (unsigned long *)machine_to_phys(pgde & PAGE_MASK) + (pfn&1023);
- phys_to_machine_mapping[i] =
- (*(unsigned long *)__va(ppte)) >> PAGE_SHIFT;
- }
-
-#ifdef CONFIG_BLK_DEV_INITRD
- if (start_info.mod_start) {
- if ((__pa(start_info.mod_start) + start_info.mod_len) <=
- (max_low_pfn << PAGE_SHIFT)) {
initrd_start = start_info.mod_start;
initrd_end = initrd_start + start_info.mod_len;
initrd_below_start_ok = 1;
}
- else {
+ else
+ {
printk(KERN_ERR "initrd extends beyond end of memory "
"(0x%08lx > 0x%08lx)\ndisabling initrd\n",
__pa(start_info.mod_start) + start_info.mod_len,
@@ -317,7 +294,7 @@ void __init setup_arch(char **cmdline_p)
paging_init();
- /* We are privileged guest os - should have IO privileges. */
+ /* If we are a privileged guest OS then we should request IO privileges. */
if ( start_info.flags & SIF_PRIVILEGED )
{
dom0_op_t op;
@@ -352,6 +329,13 @@ static int __init cachesize_setup(char *str)
}
__setup("cachesize=", cachesize_setup);
+static int __init highio_setup(char *str)
+{
+ printk("i386: disabling HIGHMEM block I/O\n");
+ blk_nohighio = 1;
+ return 1;
+}
+__setup("nohighio", highio_setup);
static int __init get_model_name(struct cpuinfo_x86 *c)
{
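
With the phys-to-machine table now handed over ready-made via start_info.mfn_list, translating a pseudo-physical frame to a machine frame is a plain array lookup; a helper sketch (name hypothetical):

    /* Sketch: pseudo-physical -> machine frame translation, using the
     * table supplied by the domain builder. */
    static inline unsigned long pfn_to_mfn(unsigned long pfn)
    {
        return phys_to_machine_mapping[pfn];
    }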
diff --git a/xenolinux-2.4.25-sparse/arch/xen/mm/init.c b/xenolinux-2.4.25-sparse/arch/xen/mm/init.c
index 0bb2d173e4..6bfdd3ae9f 100644
--- a/xenolinux-2.4.25-sparse/arch/xen/mm/init.c
+++ b/xenolinux-2.4.25-sparse/arch/xen/mm/init.c
@@ -58,6 +58,31 @@ int do_check_pgt_cache(int low, int high)
}
return freed;
}
+
+/*
+ * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
+ * in physical space, so we can cache the location of the first one and
+ * move around without checking the pgd every time.
+ */
+
+#if CONFIG_HIGHMEM
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+#define kmap_get_fixmap_pte(vaddr) \
+ pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr))
+
+void __init kmap_init(void)
+{
+ unsigned long kmap_vstart;
+
+ /* cache the first kmap pte */
+ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
+ kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+
+ kmap_prot = PAGE_KERNEL;
+}
+#endif /* CONFIG_HIGHMEM */
void show_mem(void)
{
@@ -186,6 +211,77 @@ static void __init fixrange_init (unsigned long start,
}
+static void __init pagetable_init (void)
+{
+ unsigned long vaddr, end;
+ pgd_t *kpgd, *pgd, *pgd_base;
+ int i, j, k;
+ pmd_t *kpmd, *pmd;
+ pte_t *kpte, *pte, *pte_base;
+
+ /*
+ * This can be zero as well - no problem, in that case we exit
+ * the loops anyway due to the PTRS_PER_* conditions.
+ */
+ end = (unsigned long)__va(max_low_pfn*PAGE_SIZE);
+
+ pgd_base = init_mm.pgd;
+ i = __pgd_offset(PAGE_OFFSET);
+ pgd = pgd_base + i;
+
+ for (; i < PTRS_PER_PGD; pgd++, i++) {
+ vaddr = i*PGDIR_SIZE;
+ if (end && (vaddr >= end))
+ break;
+ pmd = (pmd_t *)pgd;
+ for (j = 0; j < PTRS_PER_PMD; pmd++, j++) {
+ vaddr = i*PGDIR_SIZE + j*PMD_SIZE;
+ if (end && (vaddr >= end))
+ break;
+
+ /* Filled in for us already? */
+ if ( pmd_val(*pmd) & _PAGE_PRESENT )
+ continue;
+
+ pte_base = pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+ for (k = 0; k < PTRS_PER_PTE; pte++, k++) {
+ vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE;
+ if (end && (vaddr >= end))
+ break;
+ *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL);
+ }
+ kpgd = pgd_offset_k((unsigned long)pte_base);
+ kpmd = pmd_offset(kpgd, (unsigned long)pte_base);
+ kpte = pte_offset(kpmd, (unsigned long)pte_base);
+ queue_l1_entry_update(kpte,
+ (*(unsigned long *)kpte)&~_PAGE_RW);
+ set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
+ XEN_flush_page_update_queue();
+ }
+ }
+
+ /*
+ * Fixed mappings, only the page table structure has to be
+ * created - mappings will be set by set_fixmap():
+ */
+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+ fixrange_init(vaddr, HYPERVISOR_VIRT_START, init_mm.pgd);
+
+#if CONFIG_HIGHMEM
+ /*
+ * Permanent kmaps:
+ */
+ vaddr = PKMAP_BASE;
+ fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, init_mm.pgd);
+
+ pgd = init_mm.pgd + __pgd_offset(vaddr);
+ pmd = pmd_offset(pgd, vaddr);
+ pte = pte_offset(pmd, vaddr);
+ pkmap_page_table = pte;
+#endif
+}
+
static void __init zone_sizes_init(void)
{
unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
@@ -207,33 +303,18 @@ static void __init zone_sizes_init(void)
free_area_init(zones_size);
}
-/*
- * paging_init() sets up the page tables - note that the first 8MB are
- * already mapped by head.S.
- *
- * This routines also unmaps the page at virtual kernel address 0, so
- * that we can trap those pesky NULL-reference errors in the kernel.
- */
void __init paging_init(void)
{
- unsigned long vaddr;
+ pagetable_init();
zone_sizes_init();
- /*
- * Fixed mappings, only the page table structure has to be created -
- * mappings will be set by set_fixmap():
- */
- vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
- fixrange_init(vaddr, HYPERVISOR_VIRT_START, init_mm.pgd);
-
/* Switch to the real shared_info page, and clear the dummy page. */
set_fixmap(FIX_SHARED_INFO, start_info.shared_info);
HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
memset(empty_zero_page, 0, sizeof(empty_zero_page));
#ifdef CONFIG_HIGHMEM
-#error
kmap_init();
#endif
}
@@ -243,6 +324,11 @@ static inline int page_is_ram (unsigned long pagenr)
return 1;
}
+static inline int page_kills_ppro(unsigned long pagenr)
+{
+ return 0;
+}
+
#ifdef CONFIG_HIGHMEM
void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
{
@@ -278,8 +364,7 @@ static void __init set_max_mapnr_init(void)
static int __init free_pages_init(void)
{
#ifdef CONFIG_HIGHMEM
-#error Where is this supposed to be initialised?
- int bad_ppro;
+ int bad_ppro = 0;
#endif
int reservedpages, pfn;
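
The Xen-specific twist in pagetable_init() above is that a freshly allocated pte page must be demoted to read-only before it is hooked into the pgd, since Xen validates and write-protects live page tables. The pattern, condensed from the hunk (declarations as in the original function):

    /* Sketch: make the new pte page read-only, then install it. */
    pte_base = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
    kpte = pte_offset(kpmd, (unsigned long)pte_base);
    queue_l1_entry_update(kpte, (*(unsigned long *)kpte) & ~_PAGE_RW);
    set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
    XEN_flush_page_update_queue();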
diff --git a/xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h b/xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h
index 2441b01d4e..338bd4ba2c 100644
--- a/xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h
+++ b/xenolinux-2.4.25-sparse/include/asm-xen/fixmap.h
@@ -17,6 +17,10 @@
#include <linux/kernel.h>
#include <asm/apicdef.h>
#include <asm/page.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#endif
/*
* Here we define all the compile-time 'special' virtual
@@ -38,7 +42,7 @@
*/
enum fixed_addresses {
-#ifdef CONFIG_HIGHMEM_XXX
+#ifdef CONFIG_HIGHMEM
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
#endif
diff --git a/xenolinux-2.4.25-sparse/include/asm-xen/highmem.h b/xenolinux-2.4.25-sparse/include/asm-xen/highmem.h
index 7e56b1b32d..25ef32882c 100644
--- a/xenolinux-2.4.25-sparse/include/asm-xen/highmem.h
+++ b/xenolinux-2.4.25-sparse/include/asm-xen/highmem.h
@@ -1,2 +1,132 @@
-#error "Highmem unsupported!"
+/*
+ * highmem.h: virtual kernel memory mappings for high memory
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ * Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+#ifndef _ASM_HIGHMEM_H
+#define _ASM_HIGHMEM_H
+
+#ifdef __KERNEL__
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <asm/kmap_types.h>
+#include <asm/pgtable.h>
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+#define HIGHMEM_DEBUG 1
+#else
+#define HIGHMEM_DEBUG 0
+#endif
+
+/* declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+extern pte_t *kmap_pte;
+extern pgprot_t kmap_prot;
+extern pte_t *pkmap_page_table;
+
+extern void kmap_init(void) __init;
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+#define PKMAP_BASE (HYPERVISOR_VIRT_START - (1<<23))
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+extern void * FASTCALL(kmap_high(struct page *page, int nonblocking));
+extern void FASTCALL(kunmap_high(struct page *page));
+
+#define kmap(page) __kmap(page, 0)
+#define kmap_nonblock(page) __kmap(page, 1)
+
+static inline void *__kmap(struct page *page, int nonblocking)
+{
+ if (in_interrupt())
+ out_of_line_bug();
+ if (page < highmem_start_page)
+ return page_address(page);
+ return kmap_high(page, nonblocking);
+}
+
+static inline void kunmap(struct page *page)
+{
+ if (in_interrupt())
+ out_of_line_bug();
+ if (page < highmem_start_page)
+ return;
+ kunmap_high(page);
+}
+
+/*
+ * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
+ * gives a more generic (and caching) interface. But kmap_atomic can
+ * be used in IRQ contexts, so in some (very limited) cases we need
+ * it.
+ */
+static inline void *kmap_atomic(struct page *page, enum km_type type)
+{
+ enum fixed_addresses idx;
+ unsigned long vaddr;
+
+ if (page < highmem_start_page)
+ return page_address(page);
+
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+#if HIGHMEM_DEBUG
+ if (!pte_none(*(kmap_pte-idx)))
+ out_of_line_bug();
+#endif
+ set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
+ __flush_tlb_one(vaddr);
+
+ return (void*) vaddr;
+}
+
+static inline void kunmap_atomic(void *kvaddr, enum km_type type)
+{
+#if HIGHMEM_DEBUG
+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+ enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
+
+ if (vaddr < FIXADDR_START) // FIXME
+ return;
+
+ if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
+ out_of_line_bug();
+
+ /*
+ * force other mappings to Oops if they try to access
+ * this pte without first remapping it
+ */
+ pte_clear(kmap_pte-idx);
+ __flush_tlb_one(vaddr);
+#endif
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_HIGHMEM_H */
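
As the comment above notes, kmap()/kunmap() is preferred outside IRQ context, with kmap_atomic()/kunmap_atomic() reserved for the few cases that need it. Both patterns in sketch form (buffer names illustrative; cf. copy_from_high_bh in mm/highmem.c below):

    /* Sketch: schedulable mapping, process context only. */
    char *v = kmap(page);
    memcpy(buf, v + off, len);
    kunmap(page);

    /* Sketch: IRQ-safe per-CPU slot mapping. */
    char *va = kmap_atomic(page, KM_USER0);
    memcpy(buf, va + off, len);
    kunmap_atomic(va, KM_USER0);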
diff --git a/xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h b/xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h
index 36655e63e5..38721e4cff 100644
--- a/xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h
+++ b/xenolinux-2.4.25-sparse/include/asm-xen/pgtable.h
@@ -101,7 +101,11 @@ extern void * high_memory;
#define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \
~(VMALLOC_OFFSET-1))
#define VMALLOC_VMADDR(x) ((unsigned long)(x))
-#define VMALLOC_END (FIXADDR_START - 2*PAGE_SIZE)
+#if CONFIG_HIGHMEM
+# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
+#else
+# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
+#endif
#define _PAGE_BIT_PRESENT 0
#define _PAGE_BIT_RW 1
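
With CONFIG_HIGHMEM the vmalloc arena now ends below the pkmap window rather than below the fixmaps. Assuming HYPERVISOR_VIRT_START is 0xFC000000 (an assumption; the real value comes from the hypervisor interface headers), the arithmetic is:

    /*
     * PKMAP_BASE  = 0xFC000000 - (1 << 23)    = 0xFB800000
     * VMALLOC_END = PKMAP_BASE - 2*PAGE_SIZE  = 0xFB7FE000
     *
     * i.e. the pkmap area starts 8MB below the hypervisor hole (the
     * 1024-entry pkmap table maps 4MB of it), and vmalloc space ends
     * two guard pages beneath that.
     */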
diff --git a/xenolinux-2.4.25-sparse/mm/highmem.c b/xenolinux-2.4.25-sparse/mm/highmem.c
new file mode 100644
index 0000000000..a68937452c
--- /dev/null
+++ b/xenolinux-2.4.25-sparse/mm/highmem.c
@@ -0,0 +1,455 @@
+/*
+ * High memory handling common code and variables.
+ *
+ * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
+ * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * 64-bit physical space. With current x86 CPUs this
+ * means up to 64 Gigabytes physical RAM.
+ *
+ * Rewrote high memory support to move the page cache into
+ * high memory. Implemented permanent (schedulable) kmaps
+ * based on Linus' idea.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+
+/*
+ * Virtual_count is not a pure "count".
+ * 0 means that it is not mapped, and has not been mapped
+ * since a TLB flush - it is usable.
+ * 1 means that there are no users, but it has been mapped
+ * since the last TLB flush - so we can't use it.
+ * n means that there are (n-1) current users of it.
+ */
+static int pkmap_count[LAST_PKMAP];
+static unsigned int last_pkmap_nr;
+static spinlock_cacheline_t kmap_lock_cacheline = {SPIN_LOCK_UNLOCKED};
+#define kmap_lock kmap_lock_cacheline.lock
+
+pte_t * pkmap_page_table;
+
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+
+static void flush_all_zero_pkmaps(void)
+{
+ int i;
+
+ flush_cache_all();
+
+ for (i = 0; i < LAST_PKMAP; i++) {
+ struct page *page;
+
+ /*
+ * zero means we don't have anything to do,
+ * >1 means that it is still in use. Only
+ * a count of 1 means that it is free but
+ * needs to be unmapped
+ */
+ if (pkmap_count[i] != 1)
+ continue;
+ pkmap_count[i] = 0;
+
+ /* sanity check */
+ if (pte_none(pkmap_page_table[i]))
+ BUG();
+
+ /*
+ * Don't need an atomic fetch-and-clear op here;
+ * no-one has the page mapped, and cannot get at
+ * its virtual address (and hence PTE) without first
+ * getting the kmap_lock (which is held here).
+ * So no dangers, even with speculative execution.
+ */
+ page = pte_page(pkmap_page_table[i]);
+ pte_clear(&pkmap_page_table[i]);
+
+ page->virtual = NULL;
+ }
+ flush_tlb_all();
+}
+
+static inline unsigned long map_new_virtual(struct page *page, int nonblocking)
+{
+ unsigned long vaddr;
+ int count;
+
+start:
+ count = LAST_PKMAP;
+ /* Find an empty entry */
+ for (;;) {
+ last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
+ if (!last_pkmap_nr) {
+ flush_all_zero_pkmaps();
+ count = LAST_PKMAP;
+ }
+ if (!pkmap_count[last_pkmap_nr])
+ break; /* Found a usable entry */
+ if (--count)
+ continue;
+
+ if (nonblocking)
+ return 0;
+
+ /*
+ * Sleep until somebody else unmaps one of their entries.
+ */
+ {
+ DECLARE_WAITQUEUE(wait, current);
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ add_wait_queue(&pkmap_map_wait, &wait);
+ spin_unlock(&kmap_lock);
+ schedule();
+ remove_wait_queue(&pkmap_map_wait, &wait);
+ spin_lock(&kmap_lock);
+
+ /* Somebody else might have mapped it while we slept */
+ if (page->virtual)
+ return (unsigned long) page->virtual;
+
+ /* Re-start */
+ goto start;
+ }
+ }
+ vaddr = PKMAP_ADDR(last_pkmap_nr);
+ set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
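+ /*
+ * XenoLinux queues pte updates instead of writing the page table
+ * directly; flush the queue so the mapping is live before anyone
+ * dereferences vaddr.
+ */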
+ XEN_flush_page_update_queue();
+
+ pkmap_count[last_pkmap_nr] = 1;
+ page->virtual = (void *) vaddr;
+
+ return vaddr;
+}
+
+void *kmap_high(struct page *page, int nonblocking)
+{
+ unsigned long vaddr;
+
+ /*
+ * For highmem pages, we can't trust "virtual" until
+ * after we have the lock.
+ *
+ * We cannot call this from interrupts, as it may block.
+ */
+ spin_lock(&kmap_lock);
+ vaddr = (unsigned long) page->virtual;
+ if (!vaddr) {
+ vaddr = map_new_virtual(page, nonblocking);
+ if (!vaddr)
+ goto out;
+ }
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ if (pkmap_count[PKMAP_NR(vaddr)] < 2)
+ BUG();
+ out:
+ spin_unlock(&kmap_lock);
+ return (void*) vaddr;
+}
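+
+/*
+ * Minimal usage sketch (hypothetical caller): the kmap()/kunmap()
+ * wrappers call kmap_high()/kunmap_high() only for highmem pages.
+ * The mapping may sleep, so this must not run in interrupt context:
+ *
+ *	char *v = kmap(page);
+ *	memset(v, 0, PAGE_SIZE);
+ *	kunmap(page);
+ */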
+
+void kunmap_high(struct page *page)
+{
+ unsigned long vaddr;
+ unsigned long nr;
+ int need_wakeup;
+
+ spin_lock(&kmap_lock);
+ vaddr = (unsigned long) page->virtual;
+ if (!vaddr)
+ BUG();
+ nr = PKMAP_NR(vaddr);
+
+ /*
+ * A count must never go down to zero
+ * without a TLB flush!
+ */
+ need_wakeup = 0;
+ switch (--pkmap_count[nr]) {
+ case 0:
+ BUG();
+ case 1:
+ /*
+ * Avoid an unnecessary wake_up() function call.
+ * The common case is pkmap_count[] == 1, but
+ * no waiters.
+ * The tasks queued in the wait-queue are guarded
+ * by both the lock in the wait-queue-head and by
+ * the kmap_lock. As the kmap_lock is held here,
+ * no need for the wait-queue-head's lock. Simply
+ * test if the queue is empty.
+ */
+ need_wakeup = waitqueue_active(&pkmap_map_wait);
+ }
+ spin_unlock(&kmap_lock);
+
+ /* Do the wake-up, if needed, outside the spinlock; this is race-free. */
+ if (need_wakeup)
+ wake_up(&pkmap_map_wait);
+}
+
+#define POOL_SIZE 32
+
+/*
+ * This lock gets no contention at all, normally.
+ */
+static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED;
+
+int nr_emergency_pages;
+static LIST_HEAD(emergency_pages);
+
+int nr_emergency_bhs;
+static LIST_HEAD(emergency_bhs);
+
+/*
+ * Simple bounce buffer support for highmem pages.
+ * This will be moved to the block layer in 2.5.
+ */
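+
+/*
+ * Data flow (sketch): for a WRITE the highmem data is copied into the
+ * bounce page up front (copy_from_high_bh); for a READ the device
+ * fills the bounce page and the completion handler copies it back
+ * into the highmem page (copy_to_high_bh_irq, interrupts disabled).
+ */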
+
+static inline void copy_from_high_bh (struct buffer_head *to,
+ struct buffer_head *from)
+{
+ struct page *p_from;
+ char *vfrom;
+
+ p_from = from->b_page;
+
+ vfrom = kmap_atomic(p_from, KM_USER0);
+ memcpy(to->b_data, vfrom + bh_offset(from), to->b_size);
+ kunmap_atomic(vfrom, KM_USER0);
+}
+
+static inline void copy_to_high_bh_irq (struct buffer_head *to,
+ struct buffer_head *from)
+{
+ struct page *p_to;
+ char *vto;
+ unsigned long flags;
+
+ p_to = to->b_page;
+ __save_flags(flags);
+ __cli();
+ vto = kmap_atomic(p_to, KM_BOUNCE_READ);
+ memcpy(vto + bh_offset(to), from->b_data, to->b_size);
+ kunmap_atomic(vto, KM_BOUNCE_READ);
+ __restore_flags(flags);
+}
+
+static inline void bounce_end_io (struct buffer_head *bh, int uptodate)
+{
+ struct page *page;
+ struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
+ unsigned long flags;
+
+ bh_orig->b_end_io(bh_orig, uptodate);
+
+ page = bh->b_page;
+
+ spin_lock_irqsave(&emergency_lock, flags);
+ if (nr_emergency_pages >= POOL_SIZE)
+ __free_page(page);
+ else {
+ /*
+ * We are abusing page->list to manage
+ * the highmem emergency pool:
+ */
+ list_add(&page->list, &emergency_pages);
+ nr_emergency_pages++;
+ }
+
+ if (nr_emergency_bhs >= POOL_SIZE) {
+#ifdef HIGHMEM_DEBUG
+ /* Don't clobber the constructed slab cache */
+ init_waitqueue_head(&bh->b_wait);
+#endif
+ kmem_cache_free(bh_cachep, bh);
+ } else {
+ /*
+ * Ditto in the bh case, here we abuse b_inode_buffers:
+ */
+ list_add(&bh->b_inode_buffers, &emergency_bhs);
+ nr_emergency_bhs++;
+ }
+ spin_unlock_irqrestore(&emergency_lock, flags);
+}
+
+static __init int init_emergency_pool(void)
+{
+ struct sysinfo i;
+ si_meminfo(&i);
+ si_swapinfo(&i);
+
+ if (!i.totalhigh)
+ return 0;
+
+ spin_lock_irq(&emergency_lock);
+ while (nr_emergency_pages < POOL_SIZE) {
+ struct page * page = alloc_page(GFP_ATOMIC);
+ if (!page) {
+ printk("couldn't refill highmem emergency pages");
+ break;
+ }
+ list_add(&page->list, &emergency_pages);
+ nr_emergency_pages++;
+ }
+ while (nr_emergency_bhs < POOL_SIZE) {
+ struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
+ if (!bh) {
+ printk("couldn't refill highmem emergency bhs");
+ break;
+ }
+ list_add(&bh->b_inode_buffers, &emergency_bhs);
+ nr_emergency_bhs++;
+ }
+ spin_unlock_irq(&emergency_lock);
+ printk("allocated %d pages and %d bhs reserved for the highmem bounces\n",
+ nr_emergency_pages, nr_emergency_bhs);
+
+ return 0;
+}
+
+__initcall(init_emergency_pool);
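+
+/*
+ * Back-of-the-envelope sizing (assuming 4k pages): POOL_SIZE == 32
+ * reserves 128k of bounce pages plus 32 buffer heads; bounce_end_io()
+ * recycles them back into the pools as bounce I/O completes.
+ */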
+
+static void bounce_end_io_write (struct buffer_head *bh, int uptodate)
+{
+ bounce_end_io(bh, uptodate);
+}
+
+static void bounce_end_io_read (struct buffer_head *bh, int uptodate)
+{
+ struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
+
+ if (uptodate)
+ copy_to_high_bh_irq(bh_orig, bh);
+ bounce_end_io(bh, uptodate);
+}
+
+struct page *alloc_bounce_page (void)
+{
+ struct list_head *tmp;
+ struct page *page;
+
+ page = alloc_page(GFP_NOHIGHIO);
+ if (page)
+ return page;
+ /*
+ * No luck. First, kick the VM so it doesn't idle around while
+ * we are using up our emergency rations.
+ */
+ wakeup_bdflush();
+
+repeat_alloc:
+ /*
+ * Try to allocate from the emergency pool.
+ */
+ tmp = &emergency_pages;
+ spin_lock_irq(&emergency_lock);
+ if (!list_empty(tmp)) {
+ page = list_entry(tmp->next, struct page, list);
+ list_del(tmp->next);
+ nr_emergency_pages--;
+ }
+ spin_unlock_irq(&emergency_lock);
+ if (page)
+ return page;
+
+ /* we need to wait for I/O completion */
+ run_task_queue(&tq_disk);
+
+ yield();
+ goto repeat_alloc;
+}
+
+struct buffer_head *alloc_bounce_bh (void)
+{
+ struct list_head *tmp;
+ struct buffer_head *bh;
+
+ bh = kmem_cache_alloc(bh_cachep, SLAB_NOHIGHIO);
+ if (bh)
+ return bh;
+ /*
+ * No luck. First, kick the VM so it doesn't idle around while
+ * we are using up our emergency rations.
+ */
+ wakeup_bdflush();
+
+repeat_alloc:
+ /*
+ * Try to allocate from the emergency pool.
+ */
+ tmp = &emergency_bhs;
+ spin_lock_irq(&emergency_lock);
+ if (!list_empty(tmp)) {
+ bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
+ list_del(tmp->next);
+ nr_emergency_bhs--;
+ }
+ spin_unlock_irq(&emergency_lock);
+ if (bh)
+ return bh;
+
+ /* we need to wait for I/O completion */
+ run_task_queue(&tq_disk);
+
+ yield();
+ goto repeat_alloc;
+}
+
+struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
+{
+ struct page *page;
+ struct buffer_head *bh;
+
+ if (!PageHighMem(bh_orig->b_page))
+ return bh_orig;
+
+ bh = alloc_bounce_bh();
+ /*
+ * This is wasteful for 1k buffers, but this is a stopgap measure
+ * and we are being inefficient anyway. This approach simplifies
+ * things immensely. On boxes with more than 4GB RAM this should
+ * not be an issue.
+ */
+ page = alloc_bounce_page();
+
+ set_bh_page(bh, page, 0);
+
+ bh->b_next = NULL;
+ bh->b_blocknr = bh_orig->b_blocknr;
+ bh->b_size = bh_orig->b_size;
+ bh->b_list = -1;
+ bh->b_dev = bh_orig->b_dev;
+ bh->b_count = bh_orig->b_count;
+ bh->b_rdev = bh_orig->b_rdev;
+ bh->b_state = bh_orig->b_state;
+#ifdef HIGHMEM_DEBUG
+ bh->b_flushtime = jiffies;
+ bh->b_next_free = NULL;
+ bh->b_prev_free = NULL;
+ /* bh->b_this_page */
+ bh->b_reqnext = NULL;
+ bh->b_pprev = NULL;
+#endif
+ /* bh->b_page */
+ if (rw == WRITE) {
+ bh->b_end_io = bounce_end_io_write;
+ copy_from_high_bh(bh, bh_orig);
+ } else
+ bh->b_end_io = bounce_end_io_read;
+ bh->b_private = (void *)bh_orig;
+ bh->b_rsector = bh_orig->b_rsector;
+#ifdef HIGHMEM_DEBUG
+ memset(&bh->b_wait, -1, sizeof(bh->b_wait));
+#endif
+
+ return bh;
+}
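+
+/*
+ * Illustrative caller (sketch, not part of this file): block I/O code
+ * substitutes the bounce buffer just before queueing the request, and
+ * the b_end_io hooks installed above copy back (reads) and recycle:
+ *
+ *	struct buffer_head *bh = create_bounce(rw, bh_orig);
+ *	generic_make_request(rw, bh);
+ */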
+