| author | kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk> | 2003-12-20 12:44:11 +0000 |
| --- | --- | --- |
| committer | kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk> | 2003-12-20 12:44:11 +0000 |
| commit | 24fa12d9d6bc4f6cf3740475e8403d911e84b53c (patch) | |
| tree | c7487438b7035915ba974897f29788d0c4389ab4 | |
| parent | ce423b5dfe323c86c1db29cfe17d1779c4da1829 (diff) | |
| download | xen-24fa12d9d6bc4f6cf3740475e8403d911e84b53c.tar.gz, xen-24fa12d9d6bc4f6cf3740475e8403d911e84b53c.tar.bz2, xen-24fa12d9d6bc4f6cf3740475e8403d911e84b53c.zip | |
bitkeeper revision 1.652.1.1 (3fe4441bD7Ytc0dpv4nkQCX5YO2A8w)
Many files:
Many fixes and a complete rewrite of page management in Xen.
flushtlb.c:
new file
.del-TODO~9e3f87ffe4e9e1f1:
Delete: xen/TODO
.del-GUEST_CHANGES~b67e924f1504662d:
Delete: xen/GUEST_CHANGES
51 files changed, 1601 insertions, 1498 deletions
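
The core of this changeset is the rewritten page management: per-page reference counts with type tracking, and a TLB "flush clock" (the new xen/arch/i386/flushtlb.c in the diff below) that timestamps flushes against a global virtual clock so stale mappings can be detected instead of flushed eagerly. A minimal standalone sketch of the clock test, reusing the tlbflush_clock/tlbflush_time names from the patch; need_flush() here is a hypothetical stand-in for the patch's NEED_FLUSH() macro, not its real definition:

```c
/* Sketch of the TLB flush clock: every flush ticks a global clock and
 * records the new time against the flushing CPU. A page freed at clock
 * value 'page_stamp' may still be cached in the TLB of any CPU whose
 * last flush predates that stamp. Names mirror the patch; need_flush()
 * is illustrative only. */
#include <stdio.h>

#define NR_CPUS 4

static unsigned long tlbflush_clock = 10;                /* global time */
static unsigned long tlbflush_time[NR_CPUS] = {10, 7, 9, 3};

static int need_flush(unsigned int cpu, unsigned long page_stamp)
{
    /* CPU flushed before the page was freed => may hold a stale entry. */
    return tlbflush_time[cpu] < page_stamp;
}

int main(void)
{
    unsigned long page_stamp = tlbflush_clock - 2; /* freed two ticks ago */
    unsigned int cpu;

    for ( cpu = 0; cpu < NR_CPUS; cpu++ )
        printf("cpu%u: %s\n", cpu,
               need_flush(cpu, page_stamp) ? "flush needed" : "safe");
    return 0;
}
```

This is the test that alloc_domain_page() in xen/common/domain.c (below) relies on: it compares each candidate CPU's tlbflush_time against the page's tlbflush_timestamp and sends the invalidate IPI only to the CPUs that actually need it.
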
@@ -80,10 +80,8 @@ 3fbd4bd6GtGwZGxYUJPOheYIR7bPaA tools/xc/py/XenoUtil.py 3fbd0a40yT6G3M9hMpaz5xTUdl0E4g tools/xc/py/setup.py 3f72f1bdJPsV3JCnBqs9ddL9tr6D2g xen/COPYING -3f841450eJvqAD1Dldc0_aOweGiglQ xen/GUEST_CHANGES 3ddb79bcbOVHh38VJzc97-JEGD4dJQ xen/Makefile 3ddb79bcWnTwYsQRWl_PaneJfa6p0w xen/Rules.mk -3e74d2be6ELqhaY1sW0yyHRKhpOvDQ xen/TODO 3ddb79bcZbRBzT3elFWSX7u6NtMagQ xen/arch/i386/Makefile 3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/i386/Rules.mk 3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/i386/acpitable.c @@ -93,6 +91,7 @@ 3ddb79bcUrk2EIaM5VsT6wUudH1kkg xen/arch/i386/delay.c 3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/i386/entry.S 3ddb79bcY5zW7KhvI9gvfuPi3ZumEg xen/arch/i386/extable.c +3fe443fdDDb0Sw6NQBCk4GQapayfTA xen/arch/i386/flushtlb.c 3ddb79bcesE5E-lS4QhRhlqXxqj9cA xen/arch/i386/i387.c 3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/i386/i8259.c 3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/i386/idle0_task.c diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c index a0176edfc1..7e5c57bb0d 100644 --- a/tools/xc/lib/xc_linux_build.c +++ b/tools/xc/lib/xc_linux_build.c @@ -106,12 +106,12 @@ static int setup_guestos(int xc_handle, const char *cmdline, unsigned long shared_info_frame) { - l1_pgentry_t *vl1tab = NULL, *vl1e = NULL; - l2_pgentry_t *vl2tab = NULL, *vl2e = NULL; + l1_pgentry_t *vl1tab; + l2_pgentry_t *vl2tab; unsigned long *page_array = NULL; mmu_update_t *pgt_update_arr = NULL, *pgt_updates = NULL; int alloc_index, num_pt_pages; - unsigned long l2tab; + unsigned long l2tab, l2e, l1e=0; unsigned long l1tab = 0; unsigned long num_pgt_updates = 0; unsigned long count, pt_start, i, j; @@ -230,44 +230,46 @@ static int setup_guestos(int xc_handle, if ( (vl2tab = map_pfn(pm_handle, l2tab >> PAGE_SHIFT)) == NULL ) goto error_out; memset(vl2tab, 0, PAGE_SIZE); - vl2e = vl2tab + l2_table_offset(virt_load_addr); + unmap_pfn(pm_handle, vl2tab); + l2e = l2tab + (l2_table_offset(virt_load_addr)*sizeof(l2_pgentry_t)); for ( count = 0; count < tot_pages; count++ ) { - if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) + if ( (l1e & (PAGE_SIZE-1)) == 0 ) { l1tab = page_array[alloc_index] << PAGE_SHIFT; if ( (vl1tab = map_pfn(pm_handle, l1tab >> PAGE_SHIFT)) == NULL ) goto error_out; memset(vl1tab, 0, PAGE_SIZE); + unmap_pfn(pm_handle, vl1tab); alloc_index--; - vl1e = vl1tab + l1_table_offset(virt_load_addr + - (count << PAGE_SHIFT)); + l1e = l1tab + (l1_table_offset(virt_load_addr+(count<<PAGE_SHIFT))* + sizeof(l1_pgentry_t)); /* make apropriate entry in the page directory */ - pgt_updates->ptr = (unsigned long)vl2e; + pgt_updates->ptr = l2e; pgt_updates->val = l1tab | L2_PROT; pgt_updates++; num_pgt_updates++; - vl2e++; + l2e += sizeof(l2_pgentry_t); } if ( count < pt_start ) { - pgt_updates->ptr = (unsigned long)vl1e; + pgt_updates->ptr = l1e; pgt_updates->val = (page_array[count] << PAGE_SHIFT) | L1_PROT; pgt_updates++; num_pgt_updates++; - vl1e++; + l1e += sizeof(l1_pgentry_t); } else { - pgt_updates->ptr = (unsigned long)vl1e; + pgt_updates->ptr = l1e; pgt_updates->val = ((page_array[count] << PAGE_SHIFT) | L1_PROT) & ~_PAGE_RW; pgt_updates++; num_pgt_updates++; - vl1e++; + l1e += sizeof(l1_pgentry_t); } pgt_updates->ptr = diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c index 2418d97219..44ebe3c940 100644 --- a/tools/xc/lib/xc_linux_restore.c +++ b/tools/xc/lib/xc_linux_restore.c @@ -301,7 +301,8 @@ int xc_linux_restore(int xc_handle, page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT; } if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx, - 
(unsigned long)&ppage[j], page[j]) ) + (mfn<<PAGE_SHIFT)+(j*sizeof(l1_pgentry_t)), + page[j]) ) goto out; } break; @@ -337,7 +338,8 @@ int xc_linux_restore(int xc_handle, page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT; } if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx, - (unsigned long)&ppage[j], page[j]) ) + (mfn<<PAGE_SHIFT)+(j*sizeof(l2_pgentry_t)), + page[j]) ) goto out; } break; @@ -345,9 +347,6 @@ int xc_linux_restore(int xc_handle, memcpy(ppage, page, PAGE_SIZE); break; } - /* NB. Must flush before unmapping page, as pass VAs to Xen. */ - if ( flush_mmu_updates(xc_handle, mmu_updates, &mmu_update_idx) ) - goto out; unmap_pfn(pm_handle, ppage); if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx, diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c index 463efb7acb..e5f5934cff 100644 --- a/tools/xc/lib/xc_linux_save.c +++ b/tools/xc/lib/xc_linux_save.c @@ -44,19 +44,20 @@ static int check_pfn_ownership(int xc_handle, { dom0_op_t op; op.cmd = DOM0_GETPAGEFRAMEINFO; - op.u.getpageframeinfo.pfn = mfn; - if ( (do_dom0_op(xc_handle, &op) < 0) || - (op.u.getpageframeinfo.domain != dom) ) - return 0; - return 1; + op.u.getpageframeinfo.pfn = mfn; + op.u.getpageframeinfo.domain = dom; + return (do_dom0_op(xc_handle, &op) >= 0); } #define GETPFN_ERR (~0U) -static unsigned int get_pfn_type(int xc_handle, unsigned long mfn) +static unsigned int get_pfn_type(int xc_handle, + unsigned long mfn, + unsigned int dom) { dom0_op_t op; op.cmd = DOM0_GETPAGEFRAMEINFO; - op.u.getpageframeinfo.pfn = mfn; + op.u.getpageframeinfo.pfn = mfn; + op.u.getpageframeinfo.domain = dom; if ( do_dom0_op(xc_handle, &op) < 0 ) { PERROR("Unexpected failure when getting page frame info!"); @@ -259,7 +260,8 @@ int xc_linux_save(int xc_handle, mfn_to_pfn_table[mfn] = i; /* Query page type by MFN, but store it by PFN. */ - if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn)) == GETPFN_ERR ) + if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn, domid)) == + GETPFN_ERR ) goto out; } diff --git a/xen/GUEST_CHANGES b/xen/GUEST_CHANGES deleted file mode 100644 index b9f25d49cd..0000000000 --- a/xen/GUEST_CHANGES +++ /dev/null @@ -1,26 +0,0 @@ - -The interface between Xen and overlying guest OSes has changed in the -following ways since version 1.0: - -Modified hypercall 'pt_update' ------------------------------- -Page-table updates passed to the 'pt_update' hypercall must now -specify a virtual address that maps the PTE to be modified. Previously -a physical address was used, requiring Xen to temporarily map the PTE -into its own private region so that it could be read and written. -This affects only commands of type PGREQ_NORMAL_UPDATE and -PGREQ_UNCHECKED_UPDATE. - -New hypercall 'update_va_mapping' ---------------------------------- -A new high-speed page-table update method has been introduced, which -may be of particular benefit when fixing up application page faults. -Invoked as 'update_va_mapping(page_number, new_pte_value, flags)': - <page_number>: The virtual page number in the current address space - whose PTE is to be modified. - <new_pte_value>: The new value to write into the PTE. - <flags>: An ORed combination of - UVMF_INVLPG: Flush stale TLB entry of the updated page mapping - UVMF_FLUSH_TLB: Flush all TLB entries -You can see this new call in use in Xenolinux (common/memory.c). - diff --git a/xen/TODO b/xen/TODO deleted file mode 100644 index 5eead81b89..0000000000 --- a/xen/TODO +++ /dev/null @@ -1,54 +0,0 @@ - -This is stuff we probably want to implement in the near future. 
- - -- Keir (16/3/03) - - -1. DOMAIN-0 MANAGEMENT DAEMON ------------------------------ -A better control daemon is required for domain 0, which keeps proper -track of machine resources and can make sensible policy choices. This -may require support in Xen; for example, notifications (eg. DOMn is -killed), and requests (eg. can DOMn allocate x frames of memory?). - -2. ASSIGNING DOMAINS TO PROCESSORS ----------------------------------- -More intelligent assignment of domains to processors. In -particular, we don't play well with hyperthreading: we will assign -domains to virtual processors on the same package, rather then -spreading them across processor packages. - -What we need to do is port code from Linux which stores information on -relationships between processors in the system (eg. which ones are -siblings in the same package). We then use this to balance domains -across packages, and across virtual processors within a package. - -3. SANE NETWORK ROUTING ------------------------ -The current virtual firewall/router is completely broken. Needs a new -design and implementation! - - - -Graveyard -********* - -The hypervisor page cache -------------------------- -This will allow guest OSes to make use of spare pages in the system, but -allow them to be immediately used for any new domains or memory requests. -The idea is that, when a page is laundered and falls off Linux's clean_LRU -list, rather than freeing it it becomes a candidate for passing down into -the hypervisor. In return, xeno-linux may ask for one of its previously- -cached pages back: - (page, new_id) = cache_query(page, old_id); -If the requested page couldn't be kept, a blank page is returned. -When would Linux make the query? Whenever it wants a page back without -the delay or going to disc. Also, whenever a page would otherwise be -flushed to disc. - -To try and add to the cache: (blank_page, new_id) = cache_query(page, NULL); - [NULL means "give me a blank page"]. -To try and retrieve from the cache: (page, new_id) = cache_query(x_page, id) - [we may request that x_page just be discarded, and therefore not impinge - on this domain's cache quota]. 
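
The GUEST_CHANGES file removed above was the only prose description of the 'update_va_mapping' hypercall. A hedged guest-side usage sketch follows: the wrapper name HYPERVISOR_update_va_mapping and the flag bit values are assumptions for illustration, while the three-argument shape and the UVMF_INVLPG/UVMF_FLUSH_TLB flags come from the deleted text (the documented real user is Xenolinux common/memory.c):

```c
/* Illustrative guest-side use of update_va_mapping(page_number,
 * new_pte_value, flags) as documented in the deleted GUEST_CHANGES.
 * The stub below merely logs the call; in a real guest it would trap
 * into Xen. Flag values are assumed, not taken from Xen headers. */
#include <stdio.h>

#define PAGE_SHIFT     12
#define UVMF_INVLPG    (1UL << 0)  /* flush only the updated mapping */
#define UVMF_FLUSH_TLB (1UL << 1)  /* flush the entire TLB */

static long HYPERVISOR_update_va_mapping(unsigned long page_nr,
                                         unsigned long new_pte,
                                         unsigned long flags)
{
    printf("update_va_mapping(page %#lx, pte %#lx, flags %#lx)\n",
           page_nr, new_pte, flags);
    return 0;
}

int main(void)
{
    unsigned long va = 0xc0100000UL;      /* faulting virtual address   */
    unsigned long new_pte = 0x00234067UL; /* frame | present/rw/... bits */

    /* Fix up a single PTE and invalidate just that stale TLB entry. */
    HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, new_pte, UVMF_INVLPG);
    return 0;
}
```
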
diff --git a/xen/arch/i386/Rules.mk b/xen/arch/i386/Rules.mk index e137a1abd3..4d00a727ec 100644 --- a/xen/arch/i386/Rules.mk +++ b/xen/arch/i386/Rules.mk @@ -8,8 +8,8 @@ MONITOR_BASE := 0xFC500000 # Bootloader should load monitor to this real address LOAD_BASE := 0x00100000 CFLAGS := -nostdinc -fno-builtin -O3 -Wall -DMONITOR_BASE=$(MONITOR_BASE) -CFLAGS += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG -#CFLAGS += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ +#CFLAGS += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG +CFLAGS += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ LDFLAGS := -T xeno.lds -N diff --git a/xen/arch/i386/apic.c b/xen/arch/i386/apic.c index 8a3a6b5cf8..b3cd649c9c 100644 --- a/xen/arch/i386/apic.c +++ b/xen/arch/i386/apic.c @@ -47,7 +47,7 @@ #include <asm/hardirq.h> #include <asm/apic.h> #include <xeno/mm.h> - +#include <asm/io_apic.h> #include <asm/timex.h> #include <xeno/ac_timer.h> #include <xeno/perfc.h> diff --git a/xen/arch/i386/entry.S b/xen/arch/i386/entry.S index e06c565de7..dc55e35041 100644 --- a/xen/arch/i386/entry.S +++ b/xen/arch/i386/entry.S @@ -82,7 +82,6 @@ #include <xeno/config.h> #include <xeno/errno.h> #include <hypervisor-ifs/hypervisor-if.h> -#include <asm/smp.h> EBX = 0x00 ECX = 0x04 diff --git a/xen/arch/i386/flushtlb.c b/xen/arch/i386/flushtlb.c new file mode 100644 index 0000000000..fc543ebce7 --- /dev/null +++ b/xen/arch/i386/flushtlb.c @@ -0,0 +1,64 @@ +/****************************************************************************** + * flushtlb.c + * + * TLB flushes are timestamped using a global virtual 'clock' which ticks + * on any TLB flush on any processor. + * + * Copyright (c) 2003, K A Fraser + */ + +#include <xeno/config.h> +#include <xeno/sched.h> +#include <asm/flushtlb.h> + +unsigned long tlbflush_mask; +unsigned long tlbflush_clock; +unsigned long tlbflush_time[NR_CPUS]; + +static inline void tlb_clocktick(unsigned int cpu) +{ + unsigned long x, nx, y, ny; + + clear_bit(cpu, &tlbflush_mask); + + /* Tick the clock. 'y' contains the current time after the tick. */ + ny = tlbflush_clock; + do { +#ifdef CONFIG_SMP + if ( unlikely(((y = ny+1) & (GLOBAL_FLUSH_PERIOD - 1)) == 0) ) + { + new_tlbflush_clock_period(); + y = tlbflush_clock; + break; + } +#else + y = ny+1; +#endif + } + while ( unlikely((ny = cmpxchg(&tlbflush_clock, y-1, y)) != y-1) ); + + /* Update cpu's timestamp to current time, unless someone else beats us. 
*/ + nx = tlbflush_time[cpu]; + do { + if ( unlikely((x = nx) >= y) ) + break; + } + while ( unlikely((nx = cmpxchg(&tlbflush_time[cpu], x, y)) != x) ); +} + +void write_cr3_counted(unsigned long pa) +{ + __asm__ __volatile__ ( + "movl %0, %%cr3" + : : "r" (pa) : "memory" ); + tlb_clocktick(smp_processor_id()); +} + +void flush_tlb_counted(void) +{ + __asm__ __volatile__ ( + "movl %%cr3, %%eax; movl %%eax, %%cr3" + : : : "memory", "eax" ); + tlb_clocktick(smp_processor_id()); +} + diff --git a/xen/arch/i386/io_apic.c b/xen/arch/i386/io_apic.c index 951763a053..7369966dd8 100644 --- a/xen/arch/i386/io_apic.c +++ b/xen/arch/i386/io_apic.c @@ -28,6 +28,8 @@ #include <xeno/config.h> #include <asm/mc146818rtc.h> #include <asm/io.h> +#include <asm/mpspec.h> +#include <asm/io_apic.h> #include <asm/smp.h> #include <asm/desc.h> #include <asm/smpboot.h> diff --git a/xen/arch/i386/ioremap.c b/xen/arch/i386/ioremap.c index 06c09f8520..c650d0b5d8 100644 --- a/xen/arch/i386/ioremap.c +++ b/xen/arch/i386/ioremap.c @@ -15,92 +15,50 @@ #include <asm/pgalloc.h> #include <asm/page.h> -static unsigned long remap_base = 0; +static unsigned long remap_base = IOREMAP_VIRT_START; #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) -static void new_l2e(l2_pgentry_t *pl2e) -{ - l1_pgentry_t *pl1e = (l1_pgentry_t *)get_free_page(GFP_KERNEL); - if ( !pl1e ) BUG(); - clear_page(pl1e); - *pl2e = mk_l2_pgentry(__pa(pl1e)|__PAGE_HYPERVISOR); -} - - -void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) +void * __ioremap(unsigned long phys_addr, + unsigned long size, + unsigned long flags) { unsigned long vaddr; unsigned long offset, cur=0, last_addr; l2_pgentry_t *pl2e; l1_pgentry_t *pl1e; - /* First time through, start allocating from far end of virtual memory. */ - if ( !remap_base ) remap_base = IOREMAP_VIRT_START; - /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) + if ( (size == 0) || (last_addr < phys_addr) ) return NULL; - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= 0xA0000 && last_addr < 0x100000) + /* Don't remap the low PCI/ISA area: it's always mapped. */ + if ( (phys_addr >= 0xA0000) && (last_addr < 0x100000) ) return phys_to_virt(phys_addr); - if(remap_base + size > IOREMAP_VIRT_END-1) { - printk("ioremap: going past end of reserved space!\n"); - return NULL; - } -#if 0 - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (phys_addr < virt_to_phys(high_memory)) { - char *t_addr, *t_end; - struct pfn_info *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; + if ( (remap_base + size) > (IOREMAP_VIRT_END - 1) ) + { + printk("ioremap: going past end of reserved space!\n"); + return NULL; } -#endif - /* - * Mappings have to be page-aligned - */ + /* Mappings have to be page-aligned. */ offset = phys_addr & ~PAGE_MASK; phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr) - phys_addr; - /* - * Ok, go for it.. - */ + /* Ok, go for it. 
*/ vaddr = remap_base; remap_base += size; pl2e = &idle_pg_table[l2_table_offset(vaddr)]; - if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e); pl1e = l2_pgentry_to_l1(*pl2e++) + l1_table_offset(vaddr); - for ( ; ; ) - { - if ( !l1_pgentry_empty(*pl1e) ) BUG(); + do { *pl1e++ = mk_l1_pgentry((phys_addr+cur)|PAGE_HYPERVISOR|flags); - cur += PAGE_SIZE; - if ( cur == size ) break; - if ( !((unsigned long)pl1e & (PAGE_SIZE-1)) ) - { - if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e); - pl1e = l2_pgentry_to_l1(*pl2e++); - } } + while ( (cur += PAGE_SIZE) != size ); - flush_tlb_all(); - - return (void *) (offset + (char *)vaddr); + return (void *)(offset + (char *)vaddr); } void iounmap(void *addr) diff --git a/xen/arch/i386/irq.c b/xen/arch/i386/irq.c index 2793eba3d7..cd1bcc6b3c 100644 --- a/xen/arch/i386/irq.c +++ b/xen/arch/i386/irq.c @@ -24,7 +24,8 @@ #include <xeno/interrupt.h> #include <xeno/irq.h> #include <xeno/slab.h> - +#include <asm/mpspec.h> +#include <asm/io_apic.h> #include <asm/msr.h> #include <asm/hardirq.h> #include <asm/ptrace.h> diff --git a/xen/arch/i386/mm.c b/xen/arch/i386/mm.c index 5df703de7a..84ef14cf8f 100644 --- a/xen/arch/i386/mm.c +++ b/xen/arch/i386/mm.c @@ -27,8 +27,8 @@ #include <asm/fixmap.h> #include <asm/domain_page.h> -static inline void set_pte_phys (unsigned long vaddr, - l1_pgentry_t entry) +static inline void set_pte_phys(unsigned long vaddr, + l1_pgentry_t entry) { l2_pgentry_t *l2ent; l1_pgentry_t *l1ent; @@ -41,20 +41,22 @@ static inline void set_pte_phys (unsigned long vaddr, __flush_tlb_one(vaddr); } -void __set_fixmap (enum fixed_addresses idx, - l1_pgentry_t entry) + +void __set_fixmap(enum fixed_addresses idx, + l1_pgentry_t entry) { unsigned long address = __fix_to_virt(idx); - if (idx >= __end_of_fixed_addresses) { + if ( likely(idx < __end_of_fixed_addresses) ) + set_pte_phys(address, entry); + else printk("Invalid __set_fixmap\n"); - return; - } - set_pte_phys(address, entry); } -static void __init fixrange_init (unsigned long start, - unsigned long end, l2_pgentry_t *pg_base) + +static void __init fixrange_init(unsigned long start, + unsigned long end, + l2_pgentry_t *pg_base) { l2_pgentry_t *l2e; int i; @@ -66,7 +68,8 @@ static void __init fixrange_init (unsigned long start, for ( ; (i < ENTRIES_PER_L2_PAGETABLE) && (vaddr != end); l2e++, i++ ) { - if ( !l2_pgentry_empty(*l2e) ) continue; + if ( !l2_pgentry_empty(*l2e) ) + continue; page = (unsigned long)get_free_page(GFP_KERNEL); clear_page(page); *l2e = mk_l2_pgentry(__pa(page) | __PAGE_HYPERVISOR); @@ -79,11 +82,6 @@ void __init paging_init(void) unsigned long addr; void *ioremap_pt; - /* XXX initialised in boot.S */ - /*if ( cpu_has_pge ) set_in_cr4(X86_CR4_PGE);*/ - /*if ( cpu_has_pse ) set_in_cr4(X86_CR4_PSE);*/ - /*if ( cpu_has_pae ) set_in_cr4(X86_CR4_PAE);*/ - /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): @@ -115,12 +113,12 @@ void __init paging_init(void) } -void __init zap_low_mappings (void) +void __init zap_low_mappings(void) { int i; for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) idle_pg_table[i] = mk_l2_pgentry(0); - flush_tlb_all(); + flush_tlb_all_pge(); } @@ -212,86 +210,54 @@ long set_gdt(struct task_struct *p, unsigned int entries) { /* NB. There are 512 8-byte entries per GDT page. 
*/ - unsigned int i, j, nr_pages = (entries + 511) / 512; - unsigned long pfn, *gdt_page; - long ret = -EINVAL; - struct pfn_info *page; + int i, nr_pages = (entries + 511) / 512; + unsigned long pfn; struct desc_struct *vgdt; - spin_lock(&p->page_lock); - /* Check the new GDT. */ for ( i = 0; i < nr_pages; i++ ) { - if ( frames[i] >= max_page ) - goto out; - - page = frame_table + frames[i]; - if ( (page->flags & PG_domain_mask) != p->domain ) - goto out; - - if ( (page->flags & PG_type_mask) != PGT_gdt_page ) - { - if ( page_type_count(page) != 0 ) - goto out; - - /* Check all potential GDT entries in the page. */ - gdt_page = map_domain_mem(frames[0] << PAGE_SHIFT); - for ( j = 0; j < 512; j++ ) - if ( !check_descriptor(gdt_page[j*2], gdt_page[j*2+1]) ) - goto out; - unmap_domain_mem(gdt_page); - } + if ( unlikely(frames[i] >= max_page) || + unlikely(!get_page_and_type(&frame_table[frames[i]], + p, PGT_gdt_page)) ) + goto fail; } + /* Copy reserved GDT entries to the new GDT. */ + vgdt = map_domain_mem(frames[0] << PAGE_SHIFT); + memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, + gdt_table + FIRST_RESERVED_GDT_ENTRY, + NR_RESERVED_GDT_ENTRIES*8); + unmap_domain_mem(vgdt); + /* Tear down the old GDT. */ for ( i = 0; i < 16; i++ ) { - pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i]); + if ( (pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i])) != 0 ) + put_page_and_type(&frame_table[pfn]); p->mm.perdomain_pt[i] = mk_l1_pgentry(0); - if ( pfn == 0 ) continue; - page = frame_table + pfn; - ASSERT((page->flags & PG_type_mask) == PGT_gdt_page); - ASSERT((page->flags & PG_domain_mask) == p->domain); - ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0)); - put_page_type(page); - put_page_tot(page); } /* Install the new GDT. */ for ( i = 0; i < nr_pages; i++ ) - { p->mm.perdomain_pt[i] = mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR); - - page = frame_table + frames[i]; - page->flags &= ~(PG_type_mask | PG_need_flush); - page->flags |= PGT_gdt_page; - get_page_type(page); - get_page_tot(page); - } - - /* Copy reserved GDT entries to the new GDT. */ - vgdt = map_domain_mem(frames[i] << PAGE_SHIFT); - memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, - gdt_table + FIRST_RESERVED_GDT_ENTRY, - NR_RESERVED_GDT_ENTRIES*8); - unmap_domain_mem(vgdt); SET_GDT_ADDRESS(p, GDT_VIRT_START); SET_GDT_ENTRIES(p, (entries*8)-1); - ret = 0; /* success */ + return 0; - out: - spin_unlock(&p->page_lock); - return ret; + fail: + while ( i-- > 0 ) + put_page_and_type(&frame_table[frames[i]]); + return -EINVAL; } long do_set_gdt(unsigned long *frame_list, unsigned int entries) { - unsigned int nr_pages = (entries + 511) / 512; + int nr_pages = (entries + 511) / 512; unsigned long frames[16]; long ret; @@ -321,14 +287,12 @@ long do_update_descriptor( if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) ) return -EINVAL; - spin_lock(¤t->page_lock); - - page = frame_table + pfn; - if ( (page->flags & PG_domain_mask) != current->domain ) + page = &frame_table[pfn]; + if ( unlikely(!get_page(page, current)) ) goto out; /* Check if the given frame is in use in an unsafe context. */ - switch ( (page->flags & PG_type_mask) ) + switch ( page->type_and_flags & PGT_type_mask ) { case PGT_gdt_page: /* Disallow updates of Xen-reserved descriptors in the current GDT. 
*/ @@ -336,12 +300,17 @@ long do_update_descriptor( (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) goto out; + if ( unlikely(!get_page_type(page, PGT_gdt_page)) ) + goto out; + break; case PGT_ldt_page: - case PGT_writeable_page: + if ( unlikely(!get_page_type(page, PGT_ldt_page)) ) + goto out; break; default: - if ( page_type_count(page) != 0 ) + if ( unlikely(!get_page_type(page, PGT_writeable_page)) ) goto out; + break; } /* All is good so make the update. */ @@ -350,9 +319,11 @@ long do_update_descriptor( gdt_pent[1] = word2; unmap_domain_mem(gdt_pent); + put_page_type(page); + ret = 0; /* success */ out: - spin_unlock(¤t->page_lock); + put_page(page); return ret; } diff --git a/xen/arch/i386/pci-irq.c b/xen/arch/i386/pci-irq.c index b7a212b014..2c68d9d3b3 100644 --- a/xen/arch/i386/pci-irq.c +++ b/xen/arch/i386/pci-irq.c @@ -6,16 +6,15 @@ #include <linux/config.h> #include <linux/types.h> -/*#include <linux/kernel.h>*/ #include <linux/pci.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/sched.h> - #include <asm/io.h> #include <asm/smp.h> +#include <asm/mpspec.h> #include <asm/io_apic.h> #include "pci-i386.h" diff --git a/xen/arch/i386/process.c b/xen/arch/i386/process.c index 4f7d16d761..e75ee1e050 100644 --- a/xen/arch/i386/process.c +++ b/xen/arch/i386/process.c @@ -27,6 +27,7 @@ #include <asm/processor.h> #include <asm/desc.h> #include <asm/i387.h> +#include <asm/mpspec.h> #include <xeno/irq.h> #include <xeno/event.h> @@ -263,7 +264,7 @@ void switch_to(struct task_struct *prev_p, struct task_struct *next_p) tss->ss1 = next->ss1; /* Switch page tables. */ - __write_cr3_counted(pagetable_val(next_p->mm.pagetable)); + write_cr3_counted(pagetable_val(next_p->mm.pagetable)); set_current(next_p); diff --git a/xen/arch/i386/smp.c b/xen/arch/i386/smp.c index b1dfe64d4f..4ec5176194 100644 --- a/xen/arch/i386/smp.c +++ b/xen/arch/i386/smp.c @@ -16,6 +16,7 @@ #include <asm/mc146818rtc.h> #include <asm/pgalloc.h> #include <asm/smpboot.h> +#include <asm/hardirq.h> #ifdef CONFIG_SMP @@ -264,34 +265,67 @@ static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED; asmlinkage void smp_invalidate_interrupt(void) { ack_APIC_irq(); - if (test_and_clear_bit(smp_processor_id(), &flush_cpumask)) - local_flush_tlb(); + clear_bit(smp_processor_id(), &flush_cpumask); + local_flush_tlb(); } -void flush_tlb_others(unsigned long cpumask) +void flush_tlb_mask(unsigned long mask) { - spin_lock(&tlbstate_lock); - atomic_set_mask(cpumask, &flush_cpumask); - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); - while (flush_cpumask) continue; - spin_unlock(&tlbstate_lock); + if ( unlikely(in_irq()) ) + BUG(); + + if ( mask & (1 << smp_processor_id()) ) + { + local_flush_tlb(); + mask &= ~(1 << smp_processor_id()); + } + + if ( mask != 0 ) + { + spin_lock(&tlbstate_lock); + flush_cpumask = mask; + send_IPI_mask(mask, INVALIDATE_TLB_VECTOR); + while ( flush_cpumask != 0 ) + { + rep_nop(); + barrier(); + } + spin_unlock(&tlbstate_lock); + } } - -static inline void do_flush_tlb_all_local(void) + +void new_tlbflush_clock_period(void) { - __flush_tlb_all(); + if ( unlikely(!spin_trylock(&tlbstate_lock)) ) + return; + + if ( unlikely((flush_cpumask = tlbflush_mask) != 0) ) + { + send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR); + while ( flush_cpumask != 0 ) + { + rep_nop(); + barrier(); + } + } + + /* No need for cmpxchg updates here: we are protected by tlbstate lock. 
*/ + tlbflush_mask = (1 << smp_num_cpus) - 1; + wmb(); /* Reset the mask before allowing the clock to continue ticking. */ + tlbflush_clock++; + + spin_unlock(&tlbstate_lock); } -static void flush_tlb_all_ipi(void* info) +static void flush_tlb_all_pge_ipi(void* info) { - do_flush_tlb_all_local(); + __flush_tlb_pge(); } -void flush_tlb_all(void) +void flush_tlb_all_pge(void) { - smp_call_function (flush_tlb_all_ipi,0,1,1); - - do_flush_tlb_all_local(); + smp_call_function (flush_tlb_all_pge_ipi,0,1,1); + __flush_tlb_pge(); } void smp_send_event_check_mask(unsigned long cpu_mask) diff --git a/xen/arch/i386/smpboot.c b/xen/arch/i386/smpboot.c index 506ec09cb9..b5a4249003 100644 --- a/xen/arch/i386/smpboot.c +++ b/xen/arch/i386/smpboot.c @@ -44,6 +44,8 @@ #include <xeno/smp.h> #include <asm/msr.h> #include <asm/system.h> +#include <asm/mpspec.h> +#include <asm/io_apic.h> #include <xeno/sched.h> #include <xeno/delay.h> #include <xeno/lib.h> diff --git a/xen/arch/i386/traps.c b/xen/arch/i386/traps.c index 330defe3a8..78c26c37cc 100644 --- a/xen/arch/i386/traps.c +++ b/xen/arch/i386/traps.c @@ -211,6 +211,7 @@ static inline void do_trap(int trapnr, char *str, if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { + DPRINTK("Trap %d: %08lx -> %08lx\n", trapnr, regs->eip, fixup); regs->eip = fixup; regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS; return; @@ -328,6 +329,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { + DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup); regs->eip = fixup; regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS; return; @@ -411,6 +413,7 @@ asmlinkage void do_general_protection(struct pt_regs *regs, long error_code) if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { + DPRINTK("GPF (%04lx): %08lx -> %08lx\n", error_code, regs->eip, fixup); regs->eip = fixup; regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS; return; diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index 2f3073a1c4..5b24d7b5c9 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -38,31 +38,6 @@ static unsigned int get_domnr(void) return 0; } -static void build_page_list(struct task_struct *p) -{ - unsigned long *list; - unsigned long curr; - struct list_head *list_ent; - - curr = list_entry(p->pg_head.next, struct pfn_info, list) - frame_table; - list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT); - - list_for_each(list_ent, &p->pg_head) - { - *list++ = list_entry(list_ent, struct pfn_info, list) - frame_table; - - if( ((unsigned long)list & ~PAGE_MASK) == 0 ) - { - struct list_head *ent = frame_table[curr].list.next; - curr = list_entry(ent, struct pfn_info, list) - frame_table; - unmap_domain_mem(list-1); - list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT); - } - } - - unmap_domain_mem(list); -} - static int msr_cpu_mask; static unsigned long msr_addr; static unsigned long msr_lo; @@ -164,8 +139,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op) goto exit_create; } - build_page_list(p); - ret = p->domain; op.u.createdomain.domain = ret; @@ -245,7 +218,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op) case DOM0_GETMEMLIST: { int i; - struct task_struct * p = find_domain_by_id(op.u.getmemlist.domain); + struct task_struct *p = find_domain_by_id(op.u.getmemlist.domain); unsigned long max_pfns = op.u.getmemlist.max_pfns; unsigned long pfn; unsigned long *buffer = op.u.getmemlist.buffer; @@ -254,28 +227,27 @@ long 
do_dom0_op(dom0_op_t *u_dom0_op) ret = -EINVAL; if ( p != NULL ) { - list_ent = p->pg_head.next; - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - - for ( i = 0; (i < max_pfns) && (list_ent != &p->pg_head); i++ ) + ret = 0; + + spin_lock(&p->page_list_lock); + list_ent = p->page_list.next; + for ( i = 0; (i < max_pfns) && (list_ent != &p->page_list); i++ ) { + pfn = list_entry(list_ent, struct pfn_info, list) - + frame_table; if ( put_user(pfn, buffer) ) { ret = -EFAULT; - goto out_getmemlist; + break; } buffer++; list_ent = frame_table[pfn].list.next; - pfn = list_entry(list_ent, struct pfn_info, list) - - frame_table; } + spin_unlock(&p->page_list_lock); op.u.getmemlist.num_pfns = i; copy_to_user(u_dom0_op, &op, sizeof(op)); - - ret = 0; - - out_getmemlist: + put_task_struct(p); } } @@ -368,21 +340,24 @@ long do_dom0_op(dom0_op_t *u_dom0_op) { struct pfn_info *page; unsigned long pfn = op.u.getpageframeinfo.pfn; - - if ( pfn >= max_page ) - { - ret = -EINVAL; - } - else + unsigned int dom = op.u.getpageframeinfo.domain; + struct task_struct *p; + + ret = -EINVAL; + + if ( unlikely(pfn >= max_page) || + unlikely((p = find_domain_by_id(dom)) == NULL) ) + break; + + page = &frame_table[pfn]; + + if ( likely(get_page(page, p)) ) { - page = frame_table + pfn; - - op.u.getpageframeinfo.domain = page->flags & PG_domain_mask; - op.u.getpageframeinfo.type = NONE; + op.u.getpageframeinfo.type = NONE; - if ( page_type_count(page) != 0 ) + if ( (page->type_and_flags & PGT_count_mask) != 0 ) { - switch ( page->flags & PG_type_mask ) + switch ( page->type_and_flags & PGT_type_mask ) { case PGT_l1_page_table: op.u.getpageframeinfo.type = L1TAB; @@ -392,9 +367,13 @@ long do_dom0_op(dom0_op_t *u_dom0_op) break; } } - - copy_to_user(u_dom0_op, &op, sizeof(op)); + + put_page(page); } + + put_task_struct(p); + + copy_to_user(u_dom0_op, &op, sizeof(op)); } break; diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c index c8869882ae..53facf8477 100644 --- a/xen/common/dom_mem_ops.c +++ b/xen/common/dom_mem_ops.c @@ -16,58 +16,26 @@ #include <xeno/event.h> #include <asm/domain_page.h> -#if 0 -#define DPRINTK(_f, _a...) printk( _f , ## _a ) -#else -#define DPRINTK(_f, _a...) ((void)0) -#endif - static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op) { - struct list_head *temp; - struct pfn_info *pf; /* pfn_info of current page */ + struct pfn_info *page; unsigned long mpfn; /* machine frame number of current page */ void *va; /* Xen-usable mapping of current page */ unsigned long i; - unsigned long flags; - - /* - * POLICY DECISION: Each domain has a page limit. - * NB. The first part of test is because op.size could be so big that - * tot_pages + op.size overflows a u_long. - */ - if( (op.size > p->max_pages) || - ((p->tot_pages + op.size) > p->max_pages) ) - return -ENOMEM; - - spin_lock_irqsave(&free_list_lock, flags); - - if ( free_pfns < (op.size + (SLACK_DOMAIN_MEM_KILOBYTES >> - (PAGE_SHIFT-10))) ) - { - spin_unlock_irqrestore(&free_list_lock, flags); - return -ENOMEM; - } - spin_lock(&p->page_lock); - - temp = free_list.next; for ( i = 0; i < op.size; i++ ) { - /* Get a free page and add it to the domain's page list. */ - pf = list_entry(temp, struct pfn_info, list); - pf->flags |= p->domain; - set_page_type_count(pf, 0); - set_page_tot_count(pf, 0); - temp = temp->next; - list_del(&pf->list); - list_add_tail(&pf->list, &p->pg_head); - free_pfns--; - - p->tot_pages++; - + /* Leave some slack pages; e.g., for the network. 
*/ + if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> + (PAGE_SHIFT-10))) ) + break; + + /* NB. 'alloc_domain_page' does limit checking on pages per domain. */ + if ( unlikely((page = alloc_domain_page(p)) == NULL) ) + break; + /* Inform the domain of the new page's machine address. */ - mpfn = (unsigned long)(pf - frame_table); + mpfn = (unsigned long)(page - frame_table); copy_to_user(op.pages, &mpfn, sizeof(mpfn)); op.pages++; @@ -77,26 +45,17 @@ static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op) unmap_domain_mem(va); } - spin_unlock(&p->page_lock); - spin_unlock_irqrestore(&free_list_lock, flags); - - return op.size; + return i; } static long free_dom_mem(struct task_struct *p, reservation_decrease_t op) { - struct list_head *temp; - struct pfn_info *pf; /* pfn_info of current page */ + struct pfn_info *page; unsigned long mpfn; /* machine frame number of current page */ unsigned long i; - unsigned long flags; long rc = 0; int need_flush = 0; - spin_lock_irqsave(&free_list_lock, flags); - spin_lock(&p->page_lock); - - temp = free_list.next; for ( i = 0; i < op.size; i++ ) { copy_from_user(&mpfn, op.pages, sizeof(mpfn)); @@ -109,37 +68,28 @@ static long free_dom_mem(struct task_struct *p, reservation_decrease_t op) goto out; } - pf = &frame_table[mpfn]; - if ( (page_type_count(pf) != 0) || - (page_tot_count(pf) != 0) || - ((pf->flags & PG_domain_mask) != p->domain) ) + page = &frame_table[mpfn]; + if ( unlikely(!get_page(page, p)) ) { - DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n", - p->domain, page_type_count(pf), - page_tot_count(pf), pf->flags); + DPRINTK("Bad page free for domain %d\n", p->domain); rc = -EINVAL; goto out; } - need_flush |= pf->flags & PG_need_flush; - - pf->flags = 0; + if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) ) + put_page_and_type(page); - list_del(&pf->list); - list_add(&pf->list, &free_list); - free_pfns++; + if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) ) + put_page(page); - p->tot_pages--; + put_page(page); } out: - spin_unlock(&p->page_lock); - spin_unlock_irqrestore(&free_list_lock, flags); - if ( need_flush ) { __flush_tlb(); - perfc_incrc(need_flush_tlb_flush); + perfc_incr(need_flush_tlb_flush); } return rc ? 
rc : op.size; diff --git a/xen/common/domain.c b/xen/common/domain.c index eae232206b..4f23778e46 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -51,12 +51,11 @@ struct task_struct *do_createdomain(unsigned int dom_id, unsigned int cpu) sprintf(p->name, "Domain-%d", dom_id); spin_lock_init(&p->blk_ring_lock); - spin_lock_init(&p->page_lock); spin_lock_init(&p->event_channel_lock); p->shared_info = (void *)get_free_page(GFP_KERNEL); memset(p->shared_info, 0, PAGE_SIZE); - SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), dom_id); + SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p); p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL); memset(p->mm.perdomain_pt, 0, PAGE_SIZE); @@ -67,8 +66,10 @@ struct task_struct *do_createdomain(unsigned int dom_id, unsigned int cpu) sched_add_domain(p); - INIT_LIST_HEAD(&p->pg_head); + spin_lock_init(&p->page_list_lock); + INIT_LIST_HEAD(&p->page_list); p->max_pages = p->tot_pages = 0; + write_lock_irqsave(&tasklist_lock, flags); SET_LINKS(p); p->next_hash = task_hash[TASK_HASH(dom_id)]; @@ -218,77 +219,203 @@ long stop_other_domain(unsigned int dom) return 0; } -unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes) +struct pfn_info *alloc_domain_page(struct task_struct *p) { - struct list_head *temp; - struct pfn_info *pf; - unsigned int alloc_pfns; - unsigned int req_pages; - unsigned long flags; - - /* how many pages do we need to alloc? */ - req_pages = kbytes >> (PAGE_SHIFT - 10); + struct pfn_info *page = NULL; + unsigned long flags, mask, pfn_stamp, cpu_stamp; + int i; spin_lock_irqsave(&free_list_lock, flags); - - /* is there enough mem to serve the request? */ - if ( (req_pages + (SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10))) > - free_pfns ) + if ( likely(!list_empty(&free_list)) ) { - spin_unlock_irqrestore(&free_list_lock, flags); - return -1; + page = list_entry(free_list.next, struct pfn_info, list); + list_del(&page->list); + free_pfns--; } + spin_unlock_irqrestore(&free_list_lock, flags); + + if ( unlikely(page == NULL) ) + return NULL; - /* allocate pages and build a thread through frame_table */ - temp = free_list.next; - for ( alloc_pfns = 0; alloc_pfns < req_pages; alloc_pfns++ ) + if ( unlikely((mask = page->u.cpu_mask) != 0) ) { - pf = list_entry(temp, struct pfn_info, list); - pf->flags = p->domain; - set_page_type_count(pf, 0); - set_page_tot_count(pf, 0); - temp = temp->next; - list_del(&pf->list); - list_add_tail(&pf->list, &p->pg_head); - free_pfns--; - ASSERT(free_pfns != 0); + pfn_stamp = page->tlbflush_timestamp; + for ( i = 0; mask != 0; i++ ) + { + if ( unlikely(mask & (1<<i)) ) + { + cpu_stamp = tlbflush_time[i]; + if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) ) + mask &= ~(1<<i); + } + } + + if ( unlikely(mask != 0) ) + { + if ( unlikely(in_irq()) ) + { + DPRINTK("Returning NULL from alloc_domain_page: in_irq\n"); + goto free_and_exit; + } + perfc_incrc(need_flush_tlb_flush); + flush_tlb_mask(mask); + } } - - spin_unlock_irqrestore(&free_list_lock, flags); - - p->tot_pages = req_pages; - /* TEMPORARY: max_pages should be explicitly specified. */ - p->max_pages = p->tot_pages; + page->u.domain = p; + page->type_and_flags = 0; + if ( p != NULL ) + { + if ( unlikely(in_irq()) ) + BUG(); + wmb(); /* Domain pointer must be visible before updating refcnt. 
*/ + spin_lock(&p->page_list_lock); + if ( unlikely(p->tot_pages >= p->max_pages) ) + { + spin_unlock(&p->page_list_lock); + goto free_and_exit; + } + list_add_tail(&page->list, &p->page_list); + p->tot_pages++; + page->count_and_flags = PGC_allocated | 1; + spin_unlock(&p->page_list_lock); + } - return 0; + return page; + + free_and_exit: + spin_lock_irqsave(&free_list_lock, flags); + list_add(&page->list, &free_list); + free_pfns++; + spin_unlock_irqrestore(&free_list_lock, flags); + return NULL; } - -void free_all_dom_mem(struct task_struct *p) +void free_domain_page(struct pfn_info *page) { - struct list_head *ent; unsigned long flags; + struct task_struct *p = page->u.domain; - spin_lock_irqsave(&free_list_lock, flags); - while ( (ent = p->pg_head.next) != &p->pg_head ) + if ( unlikely(in_irq()) ) + BUG(); + + if ( likely(!IS_XEN_HEAP_FRAME(page)) ) { - struct pfn_info *pf = list_entry(ent, struct pfn_info, list); - set_page_type_count(pf, 0); - set_page_tot_count(pf, 0); - pf->flags = 0; - ASSERT(ent->next->prev == ent); - ASSERT(ent->prev->next == ent); - list_del(ent); - list_add(ent, &free_list); + /* + * No race with setting of zombie bit. If it wasn't set before the + * last reference was dropped, then it can't be set now. + */ + page->u.cpu_mask = 0; + if ( !(page->count_and_flags & PGC_zombie) ) + { + page->tlbflush_timestamp = tlbflush_clock; + page->u.cpu_mask = 1 << p->processor; + + spin_lock(&p->page_list_lock); + list_del(&page->list); + p->tot_pages--; + spin_unlock(&p->page_list_lock); + } + + page->count_and_flags = 0; + + spin_lock_irqsave(&free_list_lock, flags); + list_add(&page->list, &free_list); free_pfns++; + spin_unlock_irqrestore(&free_list_lock, flags); } - spin_unlock_irqrestore(&free_list_lock, flags); + else + { + /* + * No need for a TLB flush. Non-domain pages are always co-held by Xen, + * and the Xen reference is not dropped until the domain is dead. + * DOM0 may hold references, but it's trusted so no need to flush. + */ + page->u.cpu_mask = 0; + page->count_and_flags = 0; + free_page((unsigned long)page_to_virt(page)); + } +} + + +void free_all_dom_mem(struct task_struct *p) +{ + struct list_head *ent, zombies; + struct pfn_info *page; + + INIT_LIST_HEAD(&zombies); + + spin_lock(&p->page_list_lock); + while ( (ent = p->page_list.next) != &p->page_list ) + { + page = list_entry(ent, struct pfn_info, list); + + if ( unlikely(!get_page(page, p)) ) + { + /* + * Another CPU has dropped the last reference and is responsible + * for removing the page from this list. Wait for them to do so. + */ + spin_unlock(&p->page_list_lock); + while ( p->page_list.next == ent ) + barrier(); + spin_lock(&p->page_list_lock); + continue; + } + + set_bit(_PGC_zombie, &page->count_and_flags); + + list_del(&page->list); + p->tot_pages--; + + list_add(&page->list, &zombies); + } + spin_unlock(&p->page_list_lock); + + /* We do the potentially complex 'put' operations with no lock held. */ + while ( (ent = zombies.next) != &zombies ) + { + page = list_entry(ent, struct pfn_info, list); - p->tot_pages = 0; + list_del(&page->list); + + if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) ) + put_page_and_type(page); + + if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) ) + put_page(page); + + put_page(page); + } } +unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes) +{ + unsigned int alloc_pfns, nr_pages; + + nr_pages = kbytes >> (PAGE_SHIFT - 10); + + /* TEMPORARY: max_pages should be explicitly specified. 
*/ + p->max_pages = nr_pages; + + for ( alloc_pfns = 0; alloc_pfns < nr_pages; alloc_pfns++ ) + { + if ( unlikely(alloc_domain_page(p) == NULL) || + unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> + (PAGE_SHIFT-10))) ) + { + free_all_dom_mem(p); + return -1; + } + } + + p->tot_pages = nr_pages; + + return 0; +} + + /* Release resources belonging to task @p. */ void release_task(struct task_struct *p) { @@ -309,7 +436,6 @@ void release_task(struct task_struct *p) destroy_event_channels(p); free_page((unsigned long)p->mm.perdomain_pt); UNSHARE_PFN(virt_to_page(p->shared_info)); - free_page((unsigned long)p->shared_info); free_all_dom_mem(p); kmem_cache_free(task_struct_cachep, p); @@ -360,11 +486,10 @@ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain) p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs; p->failsafe_address = builddomain->ctxt.failsafe_callback_eip; - /* NB. Page base must already be pinned! */ phys_l2tab = builddomain->ctxt.pt_base; p->mm.pagetable = mk_pagetable(phys_l2tab); - get_page_type(&frame_table[phys_l2tab>>PAGE_SHIFT]); - get_page_tot(&frame_table[phys_l2tab>>PAGE_SHIFT]); + get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, + PGT_l2_page_table); /* Set up the shared info structure. */ update_dom_time(p->shared_info); @@ -449,7 +574,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, return -ENOMEM; } - alloc_address = list_entry(p->pg_head.prev, struct pfn_info, list) - + alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) - frame_table; alloc_address <<= PAGE_SHIFT; alloc_index = p->tot_pages; @@ -497,7 +622,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, p->mm.pagetable = mk_pagetable(phys_l2tab); l2tab += l2_table_offset(virt_load_address); - cur_address = list_entry(p->pg_head.next, struct pfn_info, list) - + cur_address = list_entry(p->page_list.next, struct pfn_info, list) - frame_table; cur_address <<= PAGE_SHIFT; for ( count = 0; count < p->tot_pages; count++ ) @@ -514,10 +639,10 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, } *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT); - page = frame_table + (cur_address >> PAGE_SHIFT); - page->flags = dom | PGT_writeable_page | PG_need_flush; - set_page_type_count(page, 1); - set_page_tot_count(page, 1); + page = &frame_table[cur_address >> PAGE_SHIFT]; + set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags); + if ( !get_page_and_type(page, p, PGT_writeable_page) ) + BUG(); /* Set up the MPT entry. */ machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count; @@ -538,8 +663,9 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, { *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); page = frame_table + l1_pgentry_to_pagenr(*l1tab); - page->flags = dom | PGT_l1_page_table; - get_page_tot(page); + page->type_and_flags &= ~PGT_type_mask; + page->type_and_flags |= PGT_l1_page_table; + get_page(page, p); /* an extra ref because of readable mapping */ l1tab++; if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) ) { @@ -548,9 +674,13 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, l2tab++; } } - get_page_type(page); /* guest_pinned */ - get_page_tot(page); /* guest_pinned */ - page->flags = dom | PG_guest_pinned | PGT_l2_page_table; + /* Rewrite last L1 page to be a L2 page. */ + page->type_and_flags &= ~PGT_type_mask; + page->type_and_flags |= PGT_l2_page_table; + /* Get another ref to L2 page so that it can be pinned. 
*/ + if ( !get_page_and_type(page, p, PGT_l2_page_table) ) + BUG(); + set_bit(_PGC_guest_pinned, &page->count_and_flags); unmap_domain_mem(l1start); /* Set up shared info area. */ @@ -565,7 +695,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, /* Install the new page tables. */ __cli(); - __write_cr3_counted(pagetable_val(p->mm.pagetable)); + write_cr3_counted(pagetable_val(p->mm.pagetable)); /* Copy the guest OS image. */ src = (char *)(phy_data_start + 12); @@ -632,7 +762,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, /* Reinstate the caller's page tables. */ - __write_cr3_counted(pagetable_val(current->mm.pagetable)); + write_cr3_counted(pagetable_val(current->mm.pagetable)); __sti(); p->flags |= PF_CONSTRUCTED; diff --git a/xen/common/kernel.c b/xen/common/kernel.c index 9f6fb74556..1bba43d7be 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -181,6 +181,13 @@ void cmain (unsigned long magic, multiboot_info_t *mbi) for ( ; ; ) ; } + /* The array of pfn_info structures must fit into the reserved area. */ + if ( sizeof(struct pfn_info) > 24 ) + { + printk("'struct pfn_info' too large to fit in Xen address space!\n"); + for ( ; ; ) ; + } + set_current(&idle0_task); max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10); diff --git a/xen/common/memory.c b/xen/common/memory.c index 8cbb503cf3..c2b4ee9f7a 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -139,34 +139,28 @@ #include <asm/uaccess.h> #include <asm/domain_page.h> -#if 0 -#define MEM_LOG(_f, _a...) +#ifndef NDEBUG +#define MEM_LOG(_f, _a...) \ printk("DOM%d: (file=memory.c, line=%d) " _f "\n", \ current->domain, __LINE__, ## _a ) #else #define MEM_LOG(_f, _a...) ((void)0) #endif -/* Domain 0 is allowed to submit requests on behalf of others. */ -#define DOMAIN_OKAY(_f) \ - ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0)) - -/* 'get' checks parameter for validity before inc'ing refcnt. */ -static int get_l2_table(unsigned long page_nr); -static int get_l1_table(unsigned long page_nr); -static int get_page(unsigned long page_nr, int writeable); -static int inc_page_refcnt(unsigned long page_nr, unsigned int type); -/* 'put' does no checking because if refcnt not zero, entity must be valid. */ -static void put_l2_table(unsigned long page_nr); -static void put_l1_table(unsigned long page_nr); -static void put_page(unsigned long page_nr, int writeable); -static int dec_page_refcnt(unsigned long page_nr, unsigned int type); - -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t); +static int alloc_l2_table(struct pfn_info *page); +static int alloc_l1_table(struct pfn_info *page); +static int get_page_from_pagenr(unsigned long page_nr); +static int get_page_and_type_from_pagenr(unsigned long page_nr, + unsigned int type); + +static void free_l2_table(struct pfn_info *page); +static void free_l1_table(struct pfn_info *page); + +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); /* frame table size and its size in pages */ -frame_table_t * frame_table; +struct pfn_info *frame_table; unsigned long frame_table_size; unsigned long max_page; @@ -176,8 +170,11 @@ unsigned int free_pfns; /* Used to defer flushing of memory structures. */ static struct { - int flush_tlb; - int refresh_ldt; +#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */ +#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */ +#define DOP_RESTORE_CR0 (1<<2) /* Set the WP bit in CR0. 
*/ + unsigned long flags; + unsigned long cr0; } deferred_op[NR_CPUS] __cacheline_aligned; /* @@ -196,7 +193,7 @@ void __init init_frametable(unsigned long nr_pages) max_page = nr_pages; frame_table_size = nr_pages * sizeof(struct pfn_info); frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; - frame_table = (frame_table_t *)FRAMETABLE_VIRT_START; + frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START; memset(frame_table, 0, frame_table_size); free_pfns = 0; @@ -218,7 +215,7 @@ void __init init_frametable(unsigned long nr_pages) static void __invalidate_shadow_ldt(struct task_struct *p) { - int i, cpu = p->processor; + int i; unsigned long pfn; struct pfn_info *page; @@ -230,16 +227,13 @@ static void __invalidate_shadow_ldt(struct task_struct *p) if ( pfn == 0 ) continue; p->mm.perdomain_pt[i] = mk_l1_pgentry(0); page = frame_table + pfn; - ASSERT((page->flags & PG_type_mask) == PGT_ldt_page); - ASSERT((page->flags & PG_domain_mask) == p->domain); - ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0)); - put_page_type(page); - put_page_tot(page); + ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page); + ASSERT_PAGE_IS_DOMAIN(page, p); + put_page_and_type(page); } /* Dispose of the (now possibly invalid) mappings from the TLB. */ - deferred_op[cpu].flush_tlb = 1; - deferred_op[cpu].refresh_ldt = 1; + deferred_op[p->processor].flags |= DOP_FLUSH_TLB | DOP_RELOAD_LDT; } @@ -251,556 +245,614 @@ static inline void invalidate_shadow_ldt(void) } +int alloc_segdesc_page(struct pfn_info *page) +{ + unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT); + int i; + + for ( i = 0; i < 512; i++ ) + if ( unlikely(!check_descriptor(descs[i*2], descs[i*2+1])) ) + goto fail; + + unmap_domain_mem(descs); + return 1; + + fail: + unmap_domain_mem(descs); + return 0; +} + + /* Map shadow page at offset @off. Returns 0 on success. */ int map_ldt_shadow_page(unsigned int off) { struct task_struct *p = current; - unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT); - unsigned long l1e, *ldt_page; - struct pfn_info *page; - int i, ret = -1; + unsigned long l1e; - /* We cannot take a page_lock in interrupt context. */ - if ( in_interrupt() ) + if ( unlikely(in_interrupt()) ) BUG(); - spin_lock(&p->page_lock); + __get_user(l1e, (unsigned long *)&linear_pg_table[(p->mm.ldt_base >> + PAGE_SHIFT) + off]); - __get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT))); - if ( unlikely(!(l1e & _PAGE_PRESENT)) ) - goto out; + if ( unlikely(!(l1e & _PAGE_PRESENT)) || + unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], + p, PGT_ldt_page)) ) + return 0; - page = frame_table + (l1e >> PAGE_SHIFT); - if ( unlikely((page->flags & PG_type_mask) != PGT_ldt_page) ) - { - if ( unlikely(page_type_count(page) != 0) ) - goto out; - - /* Check all potential LDT entries in the page. */ - ldt_page = (unsigned long *)addr; - for ( i = 0; i < 512; i++ ) - if ( unlikely(!check_descriptor(ldt_page[i*2], ldt_page[i*2+1])) ) - goto out; - if ( unlikely(page->flags & PG_need_flush) ) - { - perfc_incrc(need_flush_tlb_flush); - __write_cr3_counted(pagetable_val(p->mm.pagetable)); - page->flags &= ~PG_need_flush; - } + p->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); + p->mm.shadow_ldt_mapcnt++; - page->flags &= ~PG_type_mask; - page->flags |= PGT_ldt_page; - } + return 1; +} - /* Success! 
*/ - get_page_type(page); - get_page_tot(page); - p->mm.perdomain_pt[off+16] = mk_l1_pgentry(l1e|_PAGE_RW); - p->mm.shadow_ldt_mapcnt++; - ret = 0; +/* Domain 0 is allowed to build page tables on others' behalf. */ +static inline int dom0_get_page(struct pfn_info *page) +{ + unsigned long x, nx, y = page->count_and_flags; + + do { + x = y; + nx = x + 1; + if ( unlikely((x & PGC_count_mask) == 0) || + unlikely((nx & PGC_count_mask) == 0) ) + return 0; + } + while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) ); - out: - spin_unlock(&p->page_lock); - return ret; + return 1; } -/* Return original refcnt, or -1 on error. */ -static int inc_page_refcnt(unsigned long page_nr, unsigned int type) +static int get_page_from_pagenr(unsigned long page_nr) { - struct pfn_info *page; - unsigned long flags; + struct pfn_info *page = &frame_table[page_nr]; if ( unlikely(page_nr >= max_page) ) { MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page); - return -1; + return 0; } - page = frame_table + page_nr; - flags = page->flags; - if ( unlikely(!DOMAIN_OKAY(flags)) ) + + if ( unlikely(!get_page(page, current)) && + ((current->domain != 0) || !dom0_get_page(page)) ) { - MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask); - return -1; + MEM_LOG("Could not get page reference for pfn %08lx\n", page_nr); + return 0; } - if ( (flags & PG_type_mask) != type ) + + return 1; +} + + +static int get_page_and_type_from_pagenr(unsigned long page_nr, + unsigned int type) +{ + struct pfn_info *page = &frame_table[page_nr]; + + if ( unlikely(!get_page_from_pagenr(page_nr)) ) + return 0; + + if ( unlikely(!get_page_type(page, type)) ) { - if ( page_type_count(page) != 0 ) - { - MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld", - page_nr << PAGE_SHIFT, - flags & PG_type_mask, type, page_type_count(page)); - return -1; - } + MEM_LOG("Bad page type for pfn %08lx (%08lx)", + page_nr, page->type_and_flags); + put_page(page); + return 0; + } - if ( unlikely(flags & PG_need_flush) ) - { - deferred_op[smp_processor_id()].flush_tlb = 1; - page->flags &= ~PG_need_flush; - perfc_incrc(need_flush_tlb_flush); - } + return 1; +} + + +/* + * We allow an L2 table to map itself, to achieve a linear p.t. Note that this + * does not raise any reference counts. + */ +static int check_linear_pagetable(l2_pgentry_t l2e, unsigned long pfn) +{ + if ( (l2_pgentry_val(l2e) & _PAGE_RW) ) + { + MEM_LOG("Attempt to create linear p.t. with write perms"); + return 0; + } - page->flags &= ~PG_type_mask; - page->flags |= type; + if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) + { + MEM_LOG("L2 tables may not map _other_ L2 tables!\n"); + return 0; } - get_page_tot(page); - return get_page_type(page); + return 1; } -/* Return new refcnt, or -1 on error. 
*/ -static int dec_page_refcnt(unsigned long page_nr, unsigned int type) +static int get_page_from_l1e(l1_pgentry_t l1e) { - struct pfn_info *page; + ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT); - if ( unlikely(page_nr >= max_page) ) + if ( unlikely((l1_pgentry_val(l1e) & + (_PAGE_GLOBAL|_PAGE_PAT))) ) { - MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page); - return -1; + MEM_LOG("Bad L1 page type settings %04lx", + l1_pgentry_val(l1e) & + (_PAGE_GLOBAL|_PAGE_PAT)); + return 0; } - page = frame_table + page_nr; - if ( unlikely(!DOMAIN_OKAY(page->flags)) || - unlikely(((page->flags & PG_type_mask) != type)) ) + + if ( l1_pgentry_val(l1e) & _PAGE_RW ) { - MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)", - page->flags & PG_domain_mask, page->flags & PG_type_mask, - type); - return -1; + if ( unlikely(!get_page_and_type_from_pagenr( + l1_pgentry_to_pagenr(l1e), PGT_writeable_page)) ) + return 0; + set_bit(_PGC_tlb_flush_on_type_change, + &frame_table[l1_pgentry_to_pagenr(l1e)].count_and_flags); } - ASSERT(page_type_count(page) != 0); - put_page_tot(page); - return put_page_type(page); + else + { + if ( unlikely(!get_page_from_pagenr(l1_pgentry_to_pagenr(l1e))) ) + return 0; + } + + return 1; } -/* We allow a L2 table to map itself, to achieve a linear pagetable. */ -/* NB. There's no need for a put_twisted_l2_table() function!! */ -static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e) +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */ +static int get_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) { - unsigned long l2v = l2_pgentry_val(l2e); + ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT); - /* Clearly the mapping must be read-only :-) */ - if ( (l2v & _PAGE_RW) ) + if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) ) { - MEM_LOG("Attempt to install twisted L2 entry with write permissions"); - return -1; + MEM_LOG("Bad L2 page type settings %04lx", + l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE)); + return 0; } - /* This is a sufficient final check. */ - if ( (l2v >> PAGE_SHIFT) != entry_pfn ) + if ( unlikely(!get_page_and_type_from_pagenr( + l2_pgentry_to_pagenr(l2e), PGT_l1_page_table)) && + unlikely(!check_linear_pagetable(l2e, pfn)) ) + return 0; + + return 1; +} + + +static void put_page_from_l1e(l1_pgentry_t l1e) +{ + struct pfn_info *page; + + ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT); + + page = &frame_table[l1_pgentry_to_pagenr(l1e)]; + + if ( l1_pgentry_val(l1e) & _PAGE_RW ) { - MEM_LOG("L2 tables may not map _other_ L2 tables!\n"); - return -1; + put_page_and_type(page); + } + else + { + /* We expect this is rare so we blow the entire shadow LDT. */ + if ( unlikely(((page->type_and_flags & PGT_type_mask) == + PGT_ldt_page)) && + unlikely(((page->type_and_flags & PGT_count_mask) != 0)) ) + invalidate_shadow_ldt(); + put_page(page); } - - /* We don't bump the reference counts. */ - return 0; } -static int get_l2_table(unsigned long page_nr) +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 
*/ +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) { - struct pfn_info *page; - struct task_struct *p; - l2_pgentry_t *p_l2_entry, l2_entry; - int i, ret=0; + ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT); + + if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && + ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) ) + put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); +} + + +static int alloc_l2_table(struct pfn_info *page) +{ + unsigned long page_nr = page - frame_table; + l2_pgentry_t *pl2e, l2e; + int i; - ret = inc_page_refcnt(page_nr, PGT_l2_page_table); - if ( likely(ret != 0) ) return (ret < 0) ? ret : 0; - - /* NEW level-2 page table! Deal with every PDE in the table. */ - p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT); + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); + for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) { - l2_entry = *p_l2_entry++; - if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue; - if ( unlikely((l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE))) ) - { - MEM_LOG("Bad L2 page type settings %04lx", - l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE)); - ret = -1; + l2e = pl2e[i]; + + if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) + continue; + + if ( unlikely(!get_page_from_l2e(l2e, page_nr)) ) goto fail; - } - /* Assume we're mapping an L1 table, falling back to twisted L2. */ - ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry)); - if ( unlikely(ret) ) ret = get_twisted_l2_table(page_nr, l2_entry); - if ( unlikely(ret) ) goto fail; } - /* Now we simply slap in our high mapping. */ - memcpy(p_l2_entry, + /* Now we add our private high mappings. */ + memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - p_l2_entry[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) - - DOMAIN_ENTRIES_PER_L2_PAGETABLE] = + pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR); + pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = + mk_l2_pgentry(__pa(page->u.domain->mm.perdomain_pt) | + __PAGE_HYPERVISOR); - /* - * The per-domain PGD is slightly tricky, as we may not be executing - * in the context of the correct domain (DOM0 builds pt's for others). - */ - page = frame_table + page_nr; - if ( (p = find_domain_by_id(page->flags & PG_domain_mask)) != NULL ) - { - p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) - - DOMAIN_ENTRIES_PER_L2_PAGETABLE] = - mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR); - put_task_struct(p); - } - - out: - unmap_domain_mem(p_l2_entry); - return ret; + unmap_domain_mem(pl2e); + return 1; fail: - p_l2_entry--; while ( i-- > 0 ) { - l2_entry = *--p_l2_entry; - if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) - put_l1_table(l2_pgentry_to_pagenr(l2_entry)); + l2e = pl2e[i]; + if ( l2_pgentry_val(l2e) & _PAGE_PRESENT ) + put_page_from_l2e(l2e, page_nr); } - if ( dec_page_refcnt(page_nr, PGT_l2_page_table) != 0 ) - BUG(); - goto out; + + unmap_domain_mem(pl2e); + return 0; } -static int get_l1_table(unsigned long page_nr) +static int alloc_l1_table(struct pfn_info *page) { - l1_pgentry_t *p_l1_entry, l1_entry; - int i, ret; + unsigned long page_nr = page - frame_table; + l1_pgentry_t *pl1e, l1e; + int i; - /* Update ref count for page pointed at by PDE. */ - ret = inc_page_refcnt(page_nr, PGT_l1_page_table); - if ( likely(ret != 0) ) return (ret < 0) ? ret : 0; + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); - /* NEW level-1 page table! 
Deal with every PTE in the table. */ - p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT); for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) { - l1_entry = *p_l1_entry++; - if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue; - if ( unlikely((l1_pgentry_val(l1_entry) & - (_PAGE_GLOBAL|_PAGE_PAT))) ) - { - MEM_LOG("Bad L1 page type settings %04lx", - l1_pgentry_val(l1_entry) & - (_PAGE_GLOBAL|_PAGE_PAT)); - ret = -1; + l1e = pl1e[i]; + + if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) + continue; + + if ( unlikely(!get_page_from_l1e(l1e)) ) goto fail; - } - ret = get_page(l1_pgentry_to_pagenr(l1_entry), - l1_pgentry_val(l1_entry) & _PAGE_RW); - if ( unlikely(ret) ) goto fail; } /* Make sure we unmap the right page! */ - unmap_domain_mem(p_l1_entry-1); - return ret; + unmap_domain_mem(pl1e); + return 1; fail: - p_l1_entry--; while ( i-- > 0 ) { - l1_entry = *--p_l1_entry; - if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) - put_page(l1_pgentry_to_pagenr(l1_entry), - l1_pgentry_val(l1_entry) & _PAGE_RW); - } - if ( dec_page_refcnt(page_nr, PGT_l1_page_table) != 0 ) - BUG(); - unmap_domain_mem(p_l1_entry); - return ret; -} - - -static int get_page(unsigned long page_nr, int writeable) -{ - struct pfn_info *page; - unsigned long flags; - - /* Update ref count for page pointed at by PTE. */ - if ( unlikely(page_nr >= max_page) ) - { - MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page); - return(-1); - } - page = frame_table + page_nr; - flags = page->flags; - if ( unlikely(!DOMAIN_OKAY(flags)) ) - { - MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask); - return(-1); + l1e = pl1e[i]; + if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) + continue; + put_page_from_l1e(l1e); } - if ( writeable ) - { - if ( (flags & PG_type_mask) != PGT_writeable_page ) - { - if ( page_type_count(page) != 0 ) - { - MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld", - flags & PG_type_mask, PGT_writeable_page, - page_type_count(page)); - return(-1); - } - page->flags &= ~PG_type_mask; - page->flags |= PGT_writeable_page; - } - page->flags |= PG_need_flush; - get_page_type(page); - } - - get_page_tot(page); - - return(0); + unmap_domain_mem(pl1e); + return 0; } -static void put_l2_table(unsigned long page_nr) +static void free_l2_table(struct pfn_info *page) { - l2_pgentry_t *p_l2_entry, l2_entry; + unsigned long page_nr = page - frame_table; + l2_pgentry_t *pl2e, l2e; int i; - if ( likely(dec_page_refcnt(page_nr, PGT_l2_page_table)) ) return; + pl2e = map_domain_mem(page_nr << PAGE_SHIFT); - /* We had last reference to level-2 page table. Free the PDEs. */ - p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT); for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) { - l2_entry = *p_l2_entry++; - if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) - put_l1_table(l2_pgentry_to_pagenr(l2_entry)); + l2e = pl2e[i]; + if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && + unlikely((l2_pgentry_val(l2e) >> PAGE_SHIFT) != page_nr) ) + put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]); } - unmap_domain_mem(p_l2_entry); + unmap_domain_mem(pl2e); } -static void put_l1_table(unsigned long page_nr) +static void free_l1_table(struct pfn_info *page) { - l1_pgentry_t *p_l1_entry, l1_entry; + unsigned long page_nr = page - frame_table; + l1_pgentry_t *pl1e, l1e; int i; - if ( likely(dec_page_refcnt(page_nr, PGT_l1_page_table)) ) return; + pl1e = map_domain_mem(page_nr << PAGE_SHIFT); - /* We had last reference to level-1 page table. Free the PTEs. 
*/ - p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT); for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ ) { - l1_entry = *p_l1_entry++; - if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) - put_page(l1_pgentry_to_pagenr(l1_entry), - l1_pgentry_val(l1_entry) & _PAGE_RW); + l1e = pl1e[i]; + if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) + continue; + put_page_from_l1e(l1e); } - /* Make sure we unmap the right page! */ - unmap_domain_mem(p_l1_entry-1); + unmap_domain_mem(pl1e); } -static void put_page(unsigned long page_nr, int writeable) +static inline int update_l2e(l2_pgentry_t *pl2e, + l2_pgentry_t ol2e, + l2_pgentry_t nl2e) { - struct pfn_info *page; - ASSERT(page_nr < max_page); - page = frame_table + page_nr; - ASSERT(DOMAIN_OKAY(page->flags)); - ASSERT((!writeable) || - ((page_type_count(page) != 0) && - ((page->flags & PG_type_mask) == PGT_writeable_page) && - ((page->flags & PG_need_flush) == PG_need_flush))); - if ( writeable ) - { - put_page_type(page); - } - else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) && - (page_type_count(page) != 0)) ) - { - /* We expect this is rare so we just blow the entire shadow LDT. */ - invalidate_shadow_ldt(); - } - put_page_tot(page); + unsigned long o = cmpxchg((unsigned long *)pl2e, + l2_pgentry_val(ol2e), + l2_pgentry_val(nl2e)); + if ( o != l2_pgentry_val(ol2e) ) + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", + l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o); + return (o == l2_pgentry_val(ol2e)); } -static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry) +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ +static int mod_l2_entry(l2_pgentry_t *pl2e, + l2_pgentry_t nl2e, + unsigned long pfn) { - l2_pgentry_t old_l2_entry = *p_l2_entry; + l2_pgentry_t ol2e; + unsigned long _ol2e; - if ( unlikely((((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >= + if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >= DOMAIN_ENTRIES_PER_L2_PAGETABLE) ) { - MEM_LOG("Illegal L2 update attempt in hypervisor area %p", - p_l2_entry); - goto fail; + MEM_LOG("Illegal L2 update attempt in hypervisor area %p", pl2e); + return 0; } - if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) ) + if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) ) + return 0; + ol2e = mk_l2_pgentry(_ol2e); + + if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) { - if ( unlikely((l2_pgentry_val(new_l2_entry) & - (_PAGE_GLOBAL|_PAGE_PSE))) ) - { - MEM_LOG("Bad L2 entry val %04lx", - l2_pgentry_val(new_l2_entry) & - (_PAGE_GLOBAL|_PAGE_PSE)); - goto fail; - } /* Differ in mapping (bits 12-31) or presence (bit 0)? */ - if ( ((l2_pgentry_val(old_l2_entry) ^ - l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 ) + if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) != 0 ) { - /* Assume we're mapping an L1 table, falling back to twisted L2. */ - if ( unlikely(get_l1_table(l2_pgentry_to_pagenr(new_l2_entry))) ) + if ( unlikely(!get_page_from_l2e(nl2e, pfn)) ) + return 0; + + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) { - /* NB. No need to sanity-check the VA: done already. 
*/ - unsigned long l1e = l1_pgentry_val( - linear_pg_table[(unsigned long)p_l2_entry >> PAGE_SHIFT]); - if ( get_twisted_l2_table(l1e >> PAGE_SHIFT, new_l2_entry) ) - goto fail; + put_page_from_l2e(nl2e, pfn); + return 0; } - if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) ) - put_l1_table(l2_pgentry_to_pagenr(old_l2_entry)); - } + if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT ) + put_page_from_l2e(ol2e, pfn); + } + else if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) + { + return 0; + } } - else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) ) + else { - put_l1_table(l2_pgentry_to_pagenr(old_l2_entry)); + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) + return 0; + + if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT ) + put_page_from_l2e(ol2e, pfn); } - *p_l2_entry = new_l2_entry; - return 0; - - fail: - return -1; + return 1; } -static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry) +static inline int update_l1e(l1_pgentry_t *pl1e, + l1_pgentry_t ol1e, + l1_pgentry_t nl1e) { - l1_pgentry_t old_l1_entry = *p_l1_entry; + unsigned long o = l1_pgentry_val(ol1e); + unsigned long n = l1_pgentry_val(nl1e); - if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) ) + while ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ) { - if ( unlikely((l1_pgentry_val(new_l1_entry) & - (_PAGE_GLOBAL|_PAGE_PAT))) ) + unsigned int cpu = smp_processor_id(); + /* The CMPXCHG faulted -- maybe we need to clear the WP bit. */ + if ( deferred_op[cpu].flags & DOP_RESTORE_CR0 ) { - MEM_LOG("Bad L1 entry val %04lx", - l1_pgentry_val(new_l1_entry) & - (_PAGE_GLOBAL|_PAGE_PAT)); - goto fail; + MEM_LOG("cmpxchg fault despite WP bit cleared\n"); + return 0; } + deferred_op[cpu].cr0 = read_cr0(); + write_cr0(deferred_op[cpu].cr0 & ~X86_CR0_WP); + deferred_op[cpu].flags |= DOP_RESTORE_CR0; + } + + if ( o != l1_pgentry_val(ol1e)) + MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n", + l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); + + /* The swap was successful if the old value we saw is equal to ol1e. */ + return (o == l1_pgentry_val(ol1e)); +} + + +/* Update the L1 entry at pl1e to new value nl1e. */ +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) +{ + l1_pgentry_t ol1e; + unsigned long _ol1e; + + if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) + { + MEM_LOG("Bad get_user\n"); + return 0; + } + + ol1e = mk_l1_pgentry(_ol1e); + + if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT ) + { /* * Differ in mapping (bits 12-31), writeable (bit 1), or * presence (bit 0)? 
*/ - if ( ((l1_pgentry_val(old_l1_entry) ^ - l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 ) + if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) != 0 ) { - if ( get_page(l1_pgentry_to_pagenr(new_l1_entry), - l1_pgentry_val(new_l1_entry) & _PAGE_RW) ) - goto fail; - - if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) ) - put_page(l1_pgentry_to_pagenr(old_l1_entry), - l1_pgentry_val(old_l1_entry) & _PAGE_RW); - } + if ( unlikely(!get_page_from_l1e(nl1e)) ) + return 0; + + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + { + put_page_from_l1e(nl1e); + return 0; + } + + if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT ) + put_page_from_l1e(ol1e); + } + else if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + { + return 0; + } + } + else + { + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + return 0; + + if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT ) + put_page_from_l1e(ol1e); + } + + return 1; +} + + +int alloc_page_type(struct pfn_info *page, unsigned int type) +{ + if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change, + &page->count_and_flags)) ) + { + struct task_struct *p = page->u.domain; + mb(); /* Check zombie status before using domain ptr. */ + /* + * NB. 'p' may no longer be valid by time we dereference it, so + * p->processor might be garbage. We clamp it, just in case. + */ + if ( !test_bit(_PGC_zombie, &page->count_and_flags) && + unlikely(NEED_FLUSH(tlbflush_time[(p->processor)&(NR_CPUS-1)], + page->tlbflush_timestamp)) ) + { + perfc_incr(need_flush_tlb_flush); + flush_tlb_cpu(p->processor); + } } - else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) ) + + switch ( type ) { - put_page(l1_pgentry_to_pagenr(old_l1_entry), - l1_pgentry_val(old_l1_entry) & _PAGE_RW); + case PGT_l1_page_table: + return alloc_l1_table(page); + case PGT_l2_page_table: + return alloc_l2_table(page); + case PGT_gdt_page: + case PGT_ldt_page: + return alloc_segdesc_page(page); + default: + BUG(); } - *p_l1_entry = new_l1_entry; return 0; +} - fail: - return -1; + +void free_page_type(struct pfn_info *page, unsigned int type) +{ + switch ( type ) + { + case PGT_l1_page_table: + return free_l1_table(page); + case PGT_l2_page_table: + return free_l2_table(page); + default: + BUG(); + } } static int do_extended_command(unsigned long ptr, unsigned long val) { - int err = 0, cpu = smp_processor_id(); + int okay = 1, cpu = smp_processor_id(); unsigned int cmd = val & MMUEXT_CMD_MASK; unsigned long pfn = ptr >> PAGE_SHIFT; - struct pfn_info *page = frame_table + pfn; + struct pfn_info *page = &frame_table[pfn]; /* 'ptr' must be in range except where it isn't a machine address. */ if ( (pfn >= max_page) && (cmd != MMUEXT_SET_LDT) ) + { + MEM_LOG("Ptr out of range for extended MMU command"); return 1; + } switch ( cmd ) { case MMUEXT_PIN_L1_TABLE: - if ( unlikely(page->flags & PG_guest_pinned) ) + case MMUEXT_PIN_L2_TABLE: + okay = get_page_and_type_from_pagenr(pfn, + (cmd == MMUEXT_PIN_L2_TABLE) ? 
+ PGT_l2_page_table : + PGT_l1_page_table); + if ( unlikely(!okay) ) { - MEM_LOG("Pfn %08lx already pinned", pfn); - err = 1; + MEM_LOG("Error while pinning pfn %08lx", pfn); break; } - err = get_l1_table(pfn); - goto mark_as_pinned; - case MMUEXT_PIN_L2_TABLE: - if ( unlikely(page->flags & PG_guest_pinned) ) + if ( unlikely(test_and_set_bit(_PGC_guest_pinned, + &page->count_and_flags)) ) { MEM_LOG("Pfn %08lx already pinned", pfn); - err = 1; + put_page_and_type(page); + okay = 0; break; } - err = get_l2_table(pfn); - mark_as_pinned: - if ( unlikely(err) ) - { - MEM_LOG("Error while pinning pfn %08lx", pfn); - break; - } - page->flags |= PG_guest_pinned; break; case MMUEXT_UNPIN_TABLE: - if ( unlikely(!DOMAIN_OKAY(page->flags)) ) + if ( unlikely(!(okay = get_page_from_pagenr(pfn))) ) { - err = 1; - MEM_LOG("Page %08lx bad domain (dom=%ld)", - ptr, page->flags & PG_domain_mask); + MEM_LOG("Page %08lx bad domain (dom=%p)", + ptr, page->u.domain); } - else if ( likely(page->flags & PG_guest_pinned) ) + else if ( likely(test_and_clear_bit(_PGC_guest_pinned, + &page->count_and_flags)) ) { - page->flags &= ~PG_guest_pinned; - ((page->flags & PG_type_mask) == PGT_l1_page_table) ? - put_l1_table(pfn) : put_l2_table(pfn); + put_page_and_type(page); } else { - err = 1; + okay = 0; MEM_LOG("Pfn %08lx not pinned", pfn); } break; case MMUEXT_NEW_BASEPTR: - err = get_l2_table(pfn); - if ( !err ) + okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table); + if ( likely(okay) ) { - put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT); + put_page_and_type(&frame_table[pagetable_val(current->mm.pagetable) + >> PAGE_SHIFT]); current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT); invalidate_shadow_ldt(); - deferred_op[cpu].flush_tlb = 1; + deferred_op[cpu].flags |= DOP_FLUSH_TLB; } else { - MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err); + MEM_LOG("Error while installing new baseptr %08lx", ptr); } break; case MMUEXT_TLB_FLUSH: - deferred_op[cpu].flush_tlb = 1; + deferred_op[cpu].flags |= DOP_FLUSH_TLB; break; case MMUEXT_INVLPG: @@ -815,7 +867,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) ((ptr+ents*LDT_ENTRY_SIZE) < ptr) || ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) ) { - err = 1; + okay = 0; MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents); } else if ( (current->mm.ldt_ents != ents) || @@ -825,37 +877,39 @@ static int do_extended_command(unsigned long ptr, unsigned long val) current->mm.ldt_base = ptr; current->mm.ldt_ents = ents; load_LDT(current); - deferred_op[cpu].refresh_ldt = (ents != 0); + deferred_op[cpu].flags &= ~DOP_RELOAD_LDT; + if ( ents != 0 ) + deferred_op[cpu].flags |= DOP_RELOAD_LDT; } break; } default: MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK); - err = 1; + okay = 0; break; } - return err; + return okay; } int do_mmu_update(mmu_update_t *ureqs, int count) { mmu_update_t req; - unsigned long flags, pfn, l1e; + unsigned long va = 0, flags, pfn, prev_pfn = 0; struct pfn_info *page; - int rc = 0, err = 0, i, cpu = smp_processor_id(); + int rc = 0, okay = 1, i, cpu = smp_processor_id(); unsigned int cmd; - unsigned long cr0 = 0; - perfc_incrc( calls_to_mmu_update ); - perfc_addc( num_page_updates, count ); + perfc_incrc(calls_to_mmu_update); + perfc_addc(num_page_updates, count); for ( i = 0; i < count; i++ ) { if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) ) { + MEM_LOG("Bad copy_from_user"); rc = -EFAULT; break; } @@ -863,77 +917,85 @@ int do_mmu_update(mmu_update_t 
*ureqs, int count) cmd = req.ptr & (sizeof(l1_pgentry_t)-1); pfn = req.ptr >> PAGE_SHIFT; - err = 1; - - spin_lock(¤t->page_lock); - - /* Get the page-frame number that a non-extended command references. */ - if ( (cmd == MMU_NORMAL_PT_UPDATE) || - (cmd == MMU_UNCHECKED_PT_UPDATE) ) - { - if ( cr0 == 0 ) - { - cr0 = read_cr0(); - write_cr0(cr0 & ~X86_CR0_WP); - } - /* Need to use 'get_user' since the VA's PGD may be absent. */ - __get_user(l1e, (unsigned long *)(linear_pg_table+pfn)); - /* Now check that the VA's PTE isn't absent. */ - if ( unlikely(!(l1e & _PAGE_PRESENT)) ) - { - MEM_LOG("L1E n.p. at VA %08lx (%08lx)", req.ptr&~3, l1e); - goto unlock; - } - /* Finally, get the underlying machine address. */ - pfn = l1e >> PAGE_SHIFT; - } + okay = 0; - /* Least significant bits of 'ptr' demux the operation type. */ switch ( cmd ) { /* * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. */ case MMU_NORMAL_PT_UPDATE: - page = frame_table + pfn; - flags = page->flags; + page = &frame_table[pfn]; - if ( likely(DOMAIN_OKAY(flags)) ) + if ( unlikely(!get_page(page, current)) && + ((current->domain != 0) || !dom0_get_page(page)) ) { - switch ( (flags & PG_type_mask) ) - { - case PGT_l1_page_table: - err = mod_l1_entry((l1_pgentry_t *)req.ptr, - mk_l1_pgentry(req.val)); - break; - case PGT_l2_page_table: - err = mod_l2_entry((l2_pgentry_t *)req.ptr, - mk_l2_pgentry(req.val)); - break; - default: - if ( page_type_count(page) == 0 ) - { - *(unsigned long *)req.ptr = req.val; - err = 0; - } - else - MEM_LOG("Update to bad page %08lx", req.ptr); - break; - } + MEM_LOG("Could not get page for normal update"); + break; + } + + if ( likely(prev_pfn == pfn) ) + { + va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); } else { - MEM_LOG("Bad domain normal update (dom %d, pfn %ld)", - current->domain, pfn); + if ( prev_pfn != 0 ) + unmap_domain_mem((void *)va); + va = (unsigned long)map_domain_mem(req.ptr); + prev_pfn = pfn; + } + + switch ( (page->type_and_flags & PGT_type_mask) ) + { + case PGT_l1_page_table: + if ( likely(get_page_type(page, PGT_l1_page_table)) ) + { + okay = mod_l1_entry((l1_pgentry_t *)va, + mk_l1_pgentry(req.val)); + put_page_type(page); + } + break; + case PGT_l2_page_table: + if ( likely(get_page_type(page, PGT_l2_page_table)) ) + { + okay = mod_l2_entry((l2_pgentry_t *)va, + mk_l2_pgentry(req.val), + pfn); + put_page_type(page); + } + break; + default: + if ( likely(get_page_type(page, PGT_writeable_page)) ) + { + *(unsigned long *)va = req.val; + okay = 1; + put_page_type(page); + } + break; } + + put_page(page); + break; case MMU_UNCHECKED_PT_UPDATE: req.ptr &= ~(sizeof(l1_pgentry_t) - 1); if ( likely(IS_PRIV(current)) ) { - *(unsigned long *)req.ptr = req.val; - err = 0; + if ( likely(prev_pfn == pfn) ) + { + va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); + } + else + { + if ( prev_pfn != 0 ) + unmap_domain_mem((void *)va); + va = (unsigned long)map_domain_mem(req.ptr); + prev_pfn = pfn; + } + *(unsigned long *)va = req.val; + okay = 1; } else { @@ -942,21 +1004,18 @@ int do_mmu_update(mmu_update_t *ureqs, int count) break; case MMU_MACHPHYS_UPDATE: - page = frame_table + pfn; + page = &frame_table[pfn]; if ( unlikely(pfn >= max_page) ) { MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page); } - else if ( likely(DOMAIN_OKAY(page->flags)) ) + else if ( likely(get_page(page, current)) || + ((current->domain == 0) && dom0_get_page(page)) ) { machine_to_phys_mapping[pfn] = req.val; - err = 0; + okay = 1; + put_page(page); } - else - { - MEM_LOG("Bad domain MPT 
update (dom %d, pfn %ld)", - current->domain, pfn); - } break; /* @@ -965,7 +1024,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count) */ case MMU_EXTENDED_COMMAND: req.ptr &= ~(sizeof(l1_pgentry_t) - 1); - err = do_extended_command(req.ptr, req.val); + okay = do_extended_command(req.ptr, req.val); break; default: @@ -973,10 +1032,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count) break; } - unlock: - spin_unlock(¤t->page_lock); - - if ( unlikely(err) ) + if ( unlikely(!okay) ) { rc = -EINVAL; break; @@ -985,20 +1041,20 @@ int do_mmu_update(mmu_update_t *ureqs, int count) ureqs++; } - if ( deferred_op[cpu].flush_tlb ) - { - deferred_op[cpu].flush_tlb = 0; - __write_cr3_counted(pagetable_val(current->mm.pagetable)); - } + if ( prev_pfn != 0 ) + unmap_domain_mem((void *)va); - if ( deferred_op[cpu].refresh_ldt ) - { - deferred_op[cpu].refresh_ldt = 0; + flags = deferred_op[cpu].flags; + deferred_op[cpu].flags = 0; + + if ( flags & DOP_FLUSH_TLB ) + write_cr3_counted(pagetable_val(current->mm.pagetable)); + + if ( flags & DOP_RELOAD_LDT ) (void)map_ldt_shadow_page(0); - } - if ( cr0 != 0 ) - write_cr0(cr0); + if ( unlikely(flags & DOP_RESTORE_CR0) ) + write_cr0(deferred_op[cpu].cr0); return rc; } @@ -1006,48 +1062,34 @@ int do_mmu_update(mmu_update_t *ureqs, int count) int do_update_va_mapping(unsigned long page_nr, unsigned long val, - unsigned long flags) + unsigned long caller_flags) { - unsigned long _x, cr0 = 0; struct task_struct *p = current; - int err = -EINVAL; + int err = 0; + unsigned int cpu = p->processor; + unsigned long defer_flags; if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) ) - goto out; - - spin_lock(&p->page_lock); + return -EINVAL; - /* Check that the VA's page-directory entry is present.. */ - if ( unlikely((err = __get_user(_x, (unsigned long *) - (&linear_pg_table[page_nr]))) != 0) ) - goto unlock_and_out; - - /* If the VA's page-directory entry is read-only, we frob the WP bit. */ - if ( unlikely(__put_user(_x, (unsigned long *) - (&linear_pg_table[page_nr]))) ) - { - cr0 = read_cr0(); - write_cr0(cr0 & ~X86_CR0_WP); - } - - if ( unlikely(mod_l1_entry(&linear_pg_table[page_nr], - mk_l1_pgentry(val)) != 0) ) - { + if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], + mk_l1_pgentry(val))) ) err = -EINVAL; - goto check_cr0_unlock_and_out; - } - if ( unlikely(flags & UVMF_INVLPG) ) + defer_flags = deferred_op[cpu].flags; + deferred_op[cpu].flags = 0; + + if ( unlikely(defer_flags & DOP_FLUSH_TLB) || + unlikely(caller_flags & UVMF_FLUSH_TLB) ) + write_cr3_counted(pagetable_val(p->mm.pagetable)); + else if ( unlikely(caller_flags & UVMF_INVLPG) ) __flush_tlb_one(page_nr << PAGE_SHIFT); - if ( unlikely(flags & UVMF_FLUSH_TLB) ) - __write_cr3_counted(pagetable_val(p->mm.pagetable)); + if ( unlikely(defer_flags & DOP_RELOAD_LDT) ) + (void)map_ldt_shadow_page(0); + + if ( unlikely(defer_flags & DOP_RESTORE_CR0) ) + write_cr0(deferred_op[cpu].cr0); - check_cr0_unlock_and_out: - if ( unlikely(cr0 != 0) ) - write_cr0(cr0); - unlock_and_out: - spin_unlock(&p->page_lock); - out: return err; } diff --git a/xen/common/network.c b/xen/common/network.c index 02b6f57580..14bfa8dac5 100644 --- a/xen/common/network.c +++ b/xen/common/network.c @@ -90,7 +90,7 @@ net_vif_t *create_net_vif(int domain) if ( sizeof(net_ring_t) > PAGE_SIZE ) BUG(); new_ring = (net_ring_t *)get_free_page(GFP_KERNEL); clear_page(new_ring); - SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), domain); + SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p); /* * Fill in the new vif struct. 
Note that, while the vif's refcnt is diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 9e227f574a..ca609438e0 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -188,12 +188,12 @@ void __init init_page_allocator(unsigned long min, unsigned long max) /* Release a PHYSICAL address range to the allocator. */ void release_bytes_to_allocator(unsigned long min, unsigned long max) { - min = round_pgup (min) + PAGE_OFFSET; - max = round_pgdown(max) + PAGE_OFFSET; + min = round_pgup (min); + max = round_pgdown(max); while ( min < max ) { - __free_pages(min, 0); + __free_pages(min+PAGE_OFFSET, 0); min += PAGE_SIZE; } } @@ -210,7 +210,6 @@ unsigned long __get_free_pages(int mask, int order) retry: spin_lock_irqsave(&alloc_lock, flags); - /* Find smallest order which can satisfy the request. */ for ( i = order; i < FREELIST_SIZE; i++ ) { if ( !FREELIST_EMPTY(free_head[i]) ) diff --git a/xen/drivers/block/ll_rw_blk.c b/xen/drivers/block/ll_rw_blk.c index 55fbdf3e79..9e1b0de266 100644 --- a/xen/drivers/block/ll_rw_blk.c +++ b/xen/drivers/block/ll_rw_blk.c @@ -14,31 +14,15 @@ #include <xeno/types.h> #include <xeno/lib.h> #include <xeno/sched.h> -/*#include <xeno/kernel_stat.h>*/ #include <xeno/errno.h> -/*#include <xeno/locks.h>*/ #include <xeno/mm.h> -/*#include <xeno/swap.h>*/ #include <xeno/init.h> -/*#include <xeno/smp_lock.h>*/ -/*#include <xeno/completion.h>*/ - #include <asm/system.h> #include <asm/io.h> #include <xeno/blk.h> -/*#include <xeno/highmem.h>*/ #include <xeno/slab.h> #include <xeno/module.h> -/* - * KAF: We can turn off noise relating to barking guest-OS requests. - */ -#if 0 -#define DPRINTK(_f, _a...) printk(_f , ## _a) -#else -#define DPRINTK(_f, _a...) ((void)0) -#endif - /* This will die as all synchronous stuff is coming to an end */ #if 0 #define complete(_r) panic("completion.h stuff may be needed...") @@ -47,8 +31,6 @@ #define complete(_r) (*(int *)(_r) = 0) #endif - - /* * MAC Floppy IWM hooks */ diff --git a/xen/drivers/block/xen_block.c b/xen/drivers/block/xen_block.c index 5103d85ffd..8b1cb119e6 100644 --- a/xen/drivers/block/xen_block.c +++ b/xen/drivers/block/xen_block.c @@ -20,12 +20,6 @@ #include <xeno/vbd.h> #include <xeno/slab.h> -#if 0 -#define DPRINTK(_f, _a...) printk( _f , ## _a ) -#else -#define DPRINTK(_f, _a...) ((void)0) -#endif - /* * These are rather arbitrary. They are fairly large because adjacent * requests pulled from a communication ring are quite likely to end @@ -60,15 +54,11 @@ static atomic_t nr_pending; static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned; -static int __buffer_is_valid(struct task_struct *p, - unsigned long buffer, - unsigned short size, - int writeable_buffer); -static void __lock_buffer(unsigned long buffer, - unsigned short size, - int writeable_buffer); -static void unlock_buffer(struct task_struct *p, - unsigned long buffer, +static int lock_buffer(struct task_struct *p, + unsigned long buffer, + unsigned short size, + int writeable_buffer); +static void unlock_buffer(unsigned long buffer, unsigned short size, int writeable_buffer); @@ -185,8 +175,7 @@ static void end_block_io_op_softirq(struct softirq_action *h) { pending_req = bh->pending_req; - unlock_buffer(pending_req->domain, - virt_to_phys(bh->b_data), + unlock_buffer(virt_to_phys(bh->b_data), bh->b_size, (pending_req->operation==READ)); @@ -321,97 +310,60 @@ long do_block_io_op(block_io_op_t *u_block_io_op) * DOWNWARD CALLS -- These interface with the block-device layer proper. 
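The rewritten pinning below replaces the old check-then-lock pair (which needed p->page_lock) with a single lock_buffer() that takes real references per frame and unwinds them on failure. A minimal usage sketch, assuming only the declarations above — issue_io() is a hypothetical stand-in for the real request path, and in the patch itself the matching unlock actually happens later, in end_block_io_op_softirq():

    /* Pin a guest buffer for block I/O; writeable iff the device will
     * write into it (i.e. a READ request). issue_io() is hypothetical. */
    if ( lock_buffer(p, buffer, size, (operation == READ)) )
    {
        issue_io(buffer, size);
        unlock_buffer(buffer, size, (operation == READ));
    }

Note that unlock_buffer() no longer needs the task pointer: the owner check is only required when taking a reference, and put_page() finds the owner via the frame itself.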
*/ -static int __buffer_is_valid(struct task_struct *p, - unsigned long buffer, - unsigned short size, - int writeable_buffer) +static int lock_buffer(struct task_struct *p, + unsigned long buffer, + unsigned short size, + int writeable_buffer) { unsigned long pfn; struct pfn_info *page; - int rc = 0; - /* A request may span multiple page frames. Each must be checked. */ for ( pfn = buffer >> PAGE_SHIFT; pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); pfn++ ) { - /* Each frame must be within bounds of machine memory. */ - if ( pfn >= max_page ) - { - DPRINTK("pfn out of range: %08lx\n", pfn); - goto out; - } + if ( unlikely(pfn >= max_page) ) + goto fail; - page = frame_table + pfn; + page = &frame_table[pfn]; - /* Each frame must belong to the requesting domain. */ - if ( (page->flags & PG_domain_mask) != p->domain ) - { - DPRINTK("bad domain: expected %d, got %ld\n", - p->domain, page->flags & PG_domain_mask); - goto out; - } + if ( unlikely(!get_page(page, p)) ) + goto fail; - /* If reading into the frame, the frame must be writeable. */ - if ( writeable_buffer && - ((page->flags & PG_type_mask) != PGT_writeable_page) && - (page_type_count(page) != 0) ) + if ( writeable_buffer && + unlikely(!get_page_type(page, PGT_writeable_page)) ) { - DPRINTK("non-writeable page passed for block read\n"); - goto out; + put_page(page); + goto fail; } - } - - rc = 1; - out: - return rc; -} + } -static void __lock_buffer(unsigned long buffer, - unsigned short size, - int writeable_buffer) -{ - unsigned long pfn; - struct pfn_info *page; + return 1; - for ( pfn = buffer >> PAGE_SHIFT; - pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); - pfn++ ) - { - page = frame_table + pfn; + fail: + while ( pfn-- > (buffer >> PAGE_SHIFT) ) + { if ( writeable_buffer ) - { - if ( page_type_count(page) == 0 ) - { - page->flags &= ~PG_type_mask; - /* No need for PG_need_flush here. */ - page->flags |= PGT_writeable_page; - } - get_page_type(page); - } - get_page_tot(page); + put_page_type(&frame_table[pfn]); + put_page(&frame_table[pfn]); } + return 0; } -static void unlock_buffer(struct task_struct *p, - unsigned long buffer, +static void unlock_buffer(unsigned long buffer, unsigned short size, int writeable_buffer) { - unsigned long pfn; - struct pfn_info *page; + unsigned long pfn; - spin_lock(&p->page_lock); for ( pfn = buffer >> PAGE_SHIFT; pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); pfn++ ) { - page = frame_table + pfn; if ( writeable_buffer ) - put_page_type(page); - put_page_tot(page); + put_page_type(&frame_table[pfn]); + put_page(&frame_table[pfn]); } - spin_unlock(&p->page_lock); } static int do_block_io_op_domain(struct task_struct *p, int max_to_do) @@ -480,8 +432,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index) int new_segs, nr_psegs = 0; phys_seg_t phys_seg[MAX_BLK_SEGS * 2]; - spin_lock(&p->page_lock); - /* Check that number of segments is sane. 
*/ if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) ) { @@ -506,7 +456,7 @@ static void dispatch_rw_block_io(struct task_struct *p, int index) goto bad_descriptor; } - if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) ) + if ( !lock_buffer(p, buffer, nr_sects<<9, (operation==READ)) ) { DPRINTK("invalid buffer\n"); goto bad_descriptor; @@ -530,6 +480,7 @@ static void dispatch_rw_block_io(struct task_struct *p, int index) req->sector_number + tot_sects, req->sector_number + tot_sects + nr_sects, req->device); + unlock_buffer(buffer, nr_sects<<9, (operation==READ)); goto bad_descriptor; } @@ -546,12 +497,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index) if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG(); } - /* Lock pages associated with each buffer head. */ - for ( i = 0; i < nr_psegs; i++ ) - __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9, - (operation==READ)); - spin_unlock(&p->page_lock); - atomic_inc(&nr_pending); pending_req = pending_reqs + pending_ring[pending_cons]; PENDREQ_IDX_INC(pending_cons); @@ -594,7 +539,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index) return; bad_descriptor: - spin_unlock(&p->page_lock); make_response(p, req->id, req->operation, 1); } @@ -670,7 +614,7 @@ void init_blkdev_info(struct task_struct *p) if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG(); p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL); clear_page(p->blk_ring_base); - SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain); + SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p); p->blkdev_list.next = NULL; spin_lock_init(&p->vbd_lock); } @@ -680,7 +624,6 @@ void destroy_blkdev_info(struct task_struct *p) { ASSERT(!__on_blkdev_list(p)); UNSHARE_PFN(virt_to_page(p->blk_ring_base)); - free_page((unsigned long)p->blk_ring_base); destroy_all_vbds(p); } diff --git a/xen/drivers/block/xen_vbd.c b/xen/drivers/block/xen_vbd.c index f16adb6795..13da02d03c 100644 --- a/xen/drivers/block/xen_vbd.c +++ b/xen/drivers/block/xen_vbd.c @@ -23,13 +23,6 @@ extern int ide_probe_devices(xen_disk_info_t *xdi); extern int scsi_probe_devices(xen_disk_info_t *xdi); - -#if 0 -#define DPRINTK(_f, _a...) printk( _f , ## _a ) -#else -#define DPRINTK(_f, _a...) ((void)0) -#endif - /* XXX SMH: crappy 'hash function' .. fix when care. */ #define HSH(_x) ((_x) & (VBD_HTAB_SZ - 1)) @@ -447,16 +440,9 @@ long vbd_probe(vbd_probe_t *probe) if ( (probe->domain == VBD_PROBE_ALL) || IS_PRIV(p) ) { /* Privileged domains always get access to the 'real' devices. 
*/ - if ( (ret = ide_probe_devices(&probe->xdi)) != 0 ) - { - DPRINTK("vbd_probe: error %d in probing ide devices\n", ret); + if ( ((ret = ide_probe_devices(&probe->xdi)) != 0) || + ((ret = scsi_probe_devices(&probe->xdi)) != 0) ) goto out; - } - if ( (ret = scsi_probe_devices(&probe->xdi)) != 0 ) - { - DPRINTK("vbd_probe: error %d in probing scsi devices\n", ret); - goto out; - } } if ( probe->domain == VBD_PROBE_ALL ) @@ -469,8 +455,6 @@ long vbd_probe(vbd_probe_t *probe) { if( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 ) { - DPRINTK("vbd_probe: error %d in probing virtual devices\n", - ret); read_unlock_irqrestore(&tasklist_lock, flags); goto out; } @@ -478,17 +462,12 @@ long vbd_probe(vbd_probe_t *probe) } read_unlock_irqrestore(&tasklist_lock, flags); } - else - { - if ( (ret = vbd_probe_devices(&probe->xdi, p)) ) - { - DPRINTK("vbd_probe: error %d in probing virtual devices\n", ret); - goto out; - } - - } + else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 ) + goto out; out: + if ( ret != 0 ) + DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); if ( p != NULL ) put_task_struct(p); return ret; diff --git a/xen/drivers/net/e1000/e1000_main.c b/xen/drivers/net/e1000/e1000_main.c index 4d88a61465..f6f5bb7aa8 100644 --- a/xen/drivers/net/e1000/e1000_main.c +++ b/xen/drivers/net/e1000/e1000_main.c @@ -1816,10 +1816,12 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) static void e1000_tx_timeout(struct net_device *netdev) { +#if 0 struct e1000_adapter *adapter = netdev->priv; /* Do the reset outside of interrupt context */ - //schedule_work(&adapter->tx_timeout_task); + schedule_work(&adapter->tx_timeout_task); +#endif e1000_tx_timeout_task(netdev); // XXXX HACK!!! XEN } diff --git a/xen/include/asm-i386/atomic.h b/xen/include/asm-i386/atomic.h index 70a1212ed6..9dcdca93f7 100644 --- a/xen/include/asm-i386/atomic.h +++ b/xen/include/asm-i386/atomic.h @@ -186,15 +186,6 @@ static __inline__ int atomic_add_negative(int i, atomic_t *v) return c; } -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ -__asm__ __volatile__(LOCK "andl %0,%1" \ -: : "r" (~(mask)),"m" (*addr) : "memory") - -#define atomic_set_mask(mask, addr) \ -__asm__ __volatile__(LOCK "orl %0,%1" \ -: : "r" (mask),"m" (*addr) : "memory") - /* Atomic operations are already serializing on x86 */ #define smp_mb__before_atomic_dec() barrier() #define smp_mb__after_atomic_dec() barrier() diff --git a/xen/include/asm-i386/flushtlb.h b/xen/include/asm-i386/flushtlb.h index 3a063fc0cb..e6f61cb521 100644 --- a/xen/include/asm-i386/flushtlb.h +++ b/xen/include/asm-i386/flushtlb.h @@ -1,40 +1,39 @@ /****************************************************************************** * flushtlb.h * - * TLB flush macros that count flushes. Counting is used to enforce - * zero-copy safety, particularily for the network code. - * - * akw - Jan 21, 2003 + * TLB flushes are timestamped using a global virtual 'clock' which ticks + * on any TLB flush on any processor. 
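The externs declared further down (tlbflush_clock, tlbflush_time[], write_cr3_counted) are implemented in the new flushtlb.c, which this hunk does not show. A rough sketch of the bookkeeping involved, under the assumption that the clock tick is made properly atomic in the real code (it is shown bare here for brevity):

    /* Sketch only: reload CR3 (flushing this TLB) and stamp this CPU. */
    void write_cr3_counted(unsigned long pa)
    {
        unsigned long t = ++tlbflush_clock;          /* tick the global clock */
        if ( (t & (GLOBAL_FLUSH_PERIOD - 1)) == 0 )
            new_tlbflush_clock_period();             /* epoch rollover */
        __asm__ __volatile__ ( "movl %0,%%cr3" : : "r" (pa) : "memory" );
        tlbflush_time[smp_processor_id()] = t;       /* TLB clean as of t */
    }

A frame records tlbflush_clock into its tlbflush_timestamp when it may have left stale entries behind; NEED_FLUSH() below then compares that stamp against tlbflush_time[cpu] to decide whether a safety flush is still required.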
+ * + * Copyright (c) 2003, K A Fraser */ -#ifndef __FLUSHTLB_H -#define __FLUSHTLB_H +#ifndef __FLUSHTLB_H__ +#define __FLUSHTLB_H__ #include <xeno/smp.h> -#include <asm/atomic.h> - -atomic_t tlb_flush_count[NR_CPUS]; - -#define __write_cr3_counted(__pa) \ - do { \ - __asm__ __volatile__ ( \ - "movl %0, %%cr3;" \ - :: "r" (__pa) \ - : "memory"); \ - atomic_inc(&tlb_flush_count[smp_processor_id()]); \ - } while (0) - -#define __flush_tlb_counted() \ - do { \ - unsigned int tmpreg; \ - \ - __asm__ __volatile__( \ - "movl %%cr3, %0; # flush TLB \n" \ - "movl %0, %%cr3; " \ - : "=r" (tmpreg) \ - :: "memory"); \ - atomic_inc(&tlb_flush_count[smp_processor_id()]); \ - } while (0) - -#endif - + +/* + * Every GLOBAL_FLUSH_PERIOD ticks of the tlbflush clock, every TLB in the + * system is guaranteed to have been flushed. + */ +#define GLOBAL_FLUSH_PERIOD (1<<16) + +/* + * '_cpu_stamp' is the current timestamp for the CPU we are testing. + * '_lastuse_stamp' is a timestamp taken when the PFN we are testing was last + * used for a purpose that may have caused the CPU's TLB to become tainted. + */ +#define NEED_FLUSH(_cpu_stamp, _lastuse_stamp) \ + (((_cpu_stamp) > (_lastuse_stamp)) || \ + (((_lastuse_stamp) - (_cpu_stamp)) > (2*GLOBAL_FLUSH_PERIOD))) + +extern unsigned long tlbflush_mask; +extern unsigned long tlbflush_clock; +extern unsigned long tlbflush_time[NR_CPUS]; + +extern void new_tlbflush_clock_period(void); + +extern void write_cr3_counted(unsigned long pa); +extern void flush_tlb_counted(void); + +#endif /* __FLUSHTLB_H__ */ diff --git a/xen/include/asm-i386/io.h b/xen/include/asm-i386/io.h index 9b54ae278d..1bae91a1e2 100644 --- a/xen/include/asm-i386/io.h +++ b/xen/include/asm-i386/io.h @@ -36,10 +36,9 @@ static inline void * phys_to_virt(unsigned long address) return __va(address); } -/* - * Change "struct page" to physical address. - */ -#define page_to_phys(page) ((page - frame_table) << PAGE_SHIFT) +#define page_to_pfn(_page) ((unsigned long)((_page) - frame_table)) +#define page_to_phys(_page) (page_to_pfn(_page) << PAGE_SHIFT) +#define page_to_virt(_page) phys_to_virt(page_to_phys(_page)) extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); diff --git a/xen/include/asm-i386/page.h b/xen/include/asm-i386/page.h index c9191c43eb..2fc1c43ef0 100644 --- a/xen/include/asm-i386/page.h +++ b/xen/include/asm-i386/page.h @@ -92,7 +92,7 @@ typedef struct { unsigned long pt_lo; } pagetable_t; extern l2_pgentry_t idle_pg_table[ENTRIES_PER_L2_PAGETABLE]; extern void paging_init(void); -#define __flush_tlb() __flush_tlb_counted() +#define __flush_tlb() flush_tlb_counted() /* Flush global pages as well. 
*/ @@ -111,10 +111,10 @@ extern void paging_init(void); } while (0) -#define __flush_tlb_all() \ +#define __flush_tlb_pge() \ do { \ __pge_off(); \ - __flush_tlb_counted(); \ + flush_tlb_counted(); \ __pge_on(); \ } while (0) diff --git a/xen/include/asm-i386/pgalloc.h b/xen/include/asm-i386/pgalloc.h index 841e5fd4a1..88e9064641 100644 --- a/xen/include/asm-i386/pgalloc.h +++ b/xen/include/asm-i386/pgalloc.h @@ -47,28 +47,24 @@ #ifndef CONFIG_SMP -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() -#define flush_tlb_cpu(_cpu) __flush_tlb() +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb() +#define flush_tlb_all_pge() __flush_tlb_pge() +#define local_flush_tlb() __flush_tlb() +#define flush_tlb_cpu(_cpu) __flush_tlb() +#define flush_tlb_mask(_mask) __flush_tlb() #else #include <xeno/smp.h> +extern void flush_tlb_mask(unsigned long mask); +extern void flush_tlb_all_pge(void); + #define flush_tlb() __flush_tlb() +#define flush_tlb_all() flush_tlb_mask((1 << smp_num_cpus) - 1) #define local_flush_tlb() __flush_tlb() - -extern void flush_tlb_all(void); - -extern void flush_tlb_others(unsigned long cpumask); -static inline void flush_tlb_cpu(unsigned int cpu) -{ - if ( cpu == smp_processor_id() ) - __flush_tlb(); - else - flush_tlb_others(1<<cpu); -} +#define flush_tlb_cpu(_cpu) flush_tlb_mask(1 << (_cpu)) #endif diff --git a/xen/include/asm-i386/smp.h b/xen/include/asm-i386/smp.h index cfec568c43..08eef3c8bd 100644 --- a/xen/include/asm-i386/smp.h +++ b/xen/include/asm-i386/smp.h @@ -1,15 +1,8 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H -#ifndef __ASSEMBLY__ #include <xeno/config.h> #include <asm/ptrace.h> -#include <asm/fixmap.h> -#include <asm/bitops.h> -#include <asm/mpspec.h> -#include <asm/io_apic.h> -#include <asm/apic.h> -#endif #ifdef CONFIG_SMP #define TARGET_CPUS cpu_online_map @@ -18,8 +11,6 @@ #endif #ifdef CONFIG_SMP -#ifndef __ASSEMBLY__ - /* * Private routines/data */ @@ -74,6 +65,9 @@ extern void smp_store_cpu_info(int id); /* Store per CPU info (like the initial #define smp_processor_id() (current->processor) +#include <asm/fixmap.h> +#include <asm/apic.h> + static __inline int hard_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ @@ -86,7 +80,5 @@ static __inline int logical_smp_processor_id(void) return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); } -#endif /* !__ASSEMBLY__ */ - #endif #endif diff --git a/xen/include/asm-i386/spinlock.h b/xen/include/asm-i386/spinlock.h index 59dc7b209f..9a4fc8573d 100644 --- a/xen/include/asm-i386/spinlock.h +++ b/xen/include/asm-i386/spinlock.h @@ -1,11 +1,10 @@ #ifndef __ASM_SPINLOCK_H #define __ASM_SPINLOCK_H -#include <asm/atomic.h> -#include <asm/rwlock.h> -#include <asm/page.h> #include <xeno/config.h> #include <xeno/lib.h> +#include <asm/atomic.h> +#include <asm/rwlock.h> #if 0 #define SPINLOCK_DEBUG 1 diff --git a/xen/include/asm-i386/system.h b/xen/include/asm-i386/system.h index dc4ac3398b..3e85277d6c 100644 --- a/xen/include/asm-i386/system.h +++ b/xen/include/asm-i386/system.h @@ -93,7 +93,34 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, #define cmpxchg(ptr,o,n)\ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ (unsigned long)(n),sizeof(*(ptr)))) - + + +/* + * This function causes longword _o to be changed to _n at location _p. + * If this access causes a fault then we return 1, otherwise we return 0. 
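The macro defined just below implements this. A hedged usage sketch, mirroring what update_l1e() earlier in this patch does (minus its WP-bit retry loop):

    unsigned long o = l1_pgentry_val(ol1e);           /* expected old value */
    if ( cmpxchg_user(pl1e, o, l1_pgentry_val(nl1e)) != 0 )
        return 0;                                     /* faulted on the PTE */
    /* o now holds what we actually saw; a mismatch means we lost a race. */
    return (o == l1_pgentry_val(ol1e));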
+ * If no fault occurs then _o is updated to the value we saw at _p. If this + * is the same as the initial value of _o then _n is written to location _p. + */ +#define cmpxchg_user(_p,_o,_n) \ +({ \ + int _rc; \ + __asm__ __volatile__ ( \ + "1: " LOCK_PREFIX "cmpxchgl %2,%3\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $1,%1\n" \ + " jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,3b\n" \ + ".previous" \ + : "=a" (_o), "=r" (_rc) \ + : "q" (_n), "m" (*__xg((volatile void *)_p)), "0" (_o), "1" (0) \ + : "memory"); \ + _rc; \ +}) + /* * Force strict CPU ordering. * And yes, this is required on UP too when we're talking
diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h index 045e4ad70e..e1d20ff2c7 100644 --- a/xen/include/hypervisor-ifs/dom0_ops.h +++ b/xen/include/hypervisor-ifs/dom0_ops.h @@ -141,8 +141,8 @@ typedef struct dom0_getpageframeinfo_st { /* IN variables. */ unsigned long pfn; /* Machine page frame number to query. */ - /* OUT variables. */ unsigned int domain; /* To which domain does the frame belong? */ + /* OUT variables. */ enum { NONE, L1TAB, L2TAB } type; /* Is the page PINNED to a type? */ } dom0_getpageframeinfo_t;
diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h index 5bd13dba9b..145b1a0aac 100644 --- a/xen/include/hypervisor-ifs/hypervisor-if.h +++ b/xen/include/hypervisor-ifs/hypervisor-if.h @@ -125,9 +125,9 @@ * which shifts the least bits out. */ /* A normal page-table update request. */ -#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is VA. */ +#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ /* DOM0 can make entirely unchecked updates which do not affect refcnts. */ -#define MMU_UNCHECKED_PT_UPDATE 1 /* unchecked '*ptr = val'. ptr is VA. */ +#define MMU_UNCHECKED_PT_UPDATE 1 /* unchecked '*ptr = val'. ptr is MA. */ /* Update an entry in the machine->physical mapping table. */ #define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */ /* An extended command. */
diff --git a/xen/include/xeno/config.h b/xen/include/xeno/config.h index 64a99f66ce..c88e41d15b 100644 --- a/xen/include/xeno/config.h +++ b/xen/include/xeno/config.h @@ -145,6 +145,13 @@ #define capable(_c) 0 +#ifndef NDEBUG +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ + __FILE__, __LINE__, ## _a) +#else +#define DPRINTK(_f, _a...) ((void)0) +#endif + #ifndef __ASSEMBLY__ #include <xeno/compiler.h>
diff --git a/xen/include/xeno/mm.h b/xen/include/xeno/mm.h index 8f0c032367..d5c3c5d6cb 100644 --- a/xen/include/xeno/mm.h +++ b/xen/include/xeno/mm.h @@ -3,34 +3,35 @@ #define __XENO_MM_H__ #include <xeno/config.h> +#include <xeno/list.h> +#include <xeno/spinlock.h> +#include <xeno/perfc.h> +#include <xeno/sched.h> + +#include <asm/pgalloc.h> #include <asm/atomic.h> #include <asm/desc.h> -#include <xeno/list.h> +#include <asm/flushtlb.h> +#include <asm/io.h> + #include <hypervisor-ifs/hypervisor-if.h> -#include <xeno/spinlock.h> -/* XXX KAF: These may die eventually, but so many refs in slab.c :((( */ +/* + * These are for compatibility with calls to the Linux memory allocators. + */ -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */ #define __GFP_DMA 0x01 - -/* Action modifiers - doesn't change the zoning */ +#define GFP_DMA __GFP_DMA #define __GFP_WAIT 0x10 /* Can wait and reschedule? */ #define __GFP_HIGH 0x20 /* Should access emergency pools?
*/ #define __GFP_IO 0x40 /* Can start low memory physical IO? */ #define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */ #define __GFP_FS 0x100 /* Can call down to low-level FS? */ - #define GFP_ATOMIC (__GFP_HIGH) -#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) - -/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some - platforms, used as appropriate on others */ +#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | \ + __GFP_HIGHIO | __GFP_FS) -#define GFP_DMA __GFP_DMA - - -/****************************************************************************** +/* * The following is for page_alloc.c. */ @@ -44,95 +45,80 @@ void __free_pages(unsigned long p, int order); #define free_page(_p) (__free_pages(_p,0)) -/****************************************************************************** - * The following is the array of page info. One entry per page owned - * by the hypervisor, indexed from `mem_map', just like Linux. - * - * 12.11.02. We no longer use struct page or mem_map, these are replaced - * with struct pfn_info and frame_table respectively. Boris Dragovic - */ - -typedef struct pfn_info { - struct list_head list; /* ->mapping has some page lists. */ - unsigned long flags; /* atomic flags. */ - unsigned long tot_count; /* Total domain usage count. */ - unsigned long type_count; /* pagetable/dir, or domain-writeable refs. */ -} frame_table_t; - -#define get_page_tot(p) ((p)->tot_count++) -#define put_page_tot(p) \ - ({ ASSERT((p)->tot_count != 0); --(p)->tot_count; }) -#define page_tot_count(p) ((p)->tot_count) -#define set_page_tot_count(p,v) ((p)->tot_count = v) - -#define get_page_type(p) ((p)->type_count++) -#define put_page_type(p) \ - ({ ASSERT((p)->type_count != 0); --(p)->type_count; }) -#define page_type_count(p) ((p)->type_count) -#define set_page_type_count(p,v) ((p)->type_count = v) - -#define PG_domain_mask MAX_DOMAIN_ID /* owning domain (16 bits) */ -/* hypervisor flags (domain == 0) */ -#define PG_slab 24 -/* domain flags (domain != 0) */ /* - * NB. The following page types are MUTUALLY EXCLUSIVE. - * At most one can be true at any point, and 'type_count' counts how many - * references exist of the current type. A change in type can only occur - * when type_count == 0. + * Per-page-frame information. */ -#define PG_type_mask (15<<24) /* bits 24-27 */ -#define PGT_none (0<<24) /* no special uses of this page */ -#define PGT_l1_page_table (1<<24) /* using this page as an L1 page table? */ -#define PGT_l2_page_table (2<<24) /* using this page as an L2 page table? */ -#define PGT_l3_page_table (3<<24) /* using this page as an L3 page table? */ -#define PGT_l4_page_table (4<<24) /* using this page as an L4 page table? */ -#define PGT_gdt_page (5<<24) /* using this page in a GDT? */ -#define PGT_ldt_page (6<<24) /* using this page in an LDT? */ -#define PGT_writeable_page (7<<24) /* has writable mappings of this page? */ -/* - * This bit indicates that the TLB must be flushed when the type count of this - * frame drops to zero. This is needed on current x86 processors only for - * frames which have guestos-accessible writeable mappings. In this case we - * must prevent stale TLB entries allowing the frame to be written if it used - * for a page table, for example. - * - * We have this bit because the writeable type is actually also used to pin a - * page when it is used as a disk read buffer. This doesn't require a TLB flush - * because the frame never has a mapping in the TLB. 
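In the rewrite, this eager per-frame flag gives way to PGC_tlb_flush_on_type_change plus a per-frame tlbflush_timestamp (see struct pfn_info below): the flush is deferred until the frame changes type, and is skipped entirely when the TLB clock shows the owning CPU has flushed since the frame was last writeably mapped. Condensed from alloc_page_type() earlier in this patch:

    /* Flush only if some TLB may still hold a stale writeable mapping. */
    if ( test_and_clear_bit(_PGC_tlb_flush_on_type_change,
                            &page->count_and_flags) &&
         NEED_FLUSH(tlbflush_time[cpu], page->tlbflush_timestamp) )
        flush_tlb_cpu(cpu);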
- */ -#define PG_need_flush (1<<28) +struct pfn_info +{ + /* Each frame can be threaded onto a doubly-linked list. */ + struct list_head list; + /* The following possible uses are context-dependent. */ + union { + /* Page is in use and not a zombie: we keep a pointer to its owner. */ + struct task_struct *domain; + /* Page is not currently allocated: mask of possibly-tainted TLBs. */ + unsigned long cpu_mask; + /* Page is a zombie: this word currently has no use. */ + unsigned long _unused; + } u; + /* Reference count and various PGC_xxx flags and fields. */ + unsigned long count_and_flags; + /* Type reference count and various PGT_xxx flags and fields. */ + unsigned long type_and_flags; + /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */ + unsigned long tlbflush_timestamp; +}; -/* - * This bit indicates that the guest OS has pinned the page to its current - * type. For page tables this can avoid the frame scanning and reference-count - * updates that occur when the type count falls to zero. - */ -#define PG_guest_pinned (1<<29) + /* The following page types are MUTUALLY EXCLUSIVE. */ +#define PGT_none (0<<29) /* no special uses of this page */ +#define PGT_l1_page_table (1<<29) /* using this page as an L1 page table? */ +#define PGT_l2_page_table (2<<29) /* using this page as an L2 page table? */ +#define PGT_l3_page_table (3<<29) /* using this page as an L3 page table? */ +#define PGT_l4_page_table (4<<29) /* using this page as an L4 page table? */ +#define PGT_gdt_page (5<<29) /* using this page in a GDT? */ +#define PGT_ldt_page (6<<29) /* using this page in an LDT? */ +#define PGT_writeable_page (7<<29) /* has writable mappings of this page? */ +#define PGT_type_mask (7<<29) /* Bits 29-31. */ + /* Has this page been validated for use as its current type? */ +#define _PGT_validated 28 +#define PGT_validated (1<<_PGT_validated) + /* 28-bit count of uses of this frame as its current type. */ +#define PGT_count_mask ((1<<28)-1) -#define PageSlab(page) test_bit(PG_slab, &(page)->flags) -#define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) -#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) + /* The owner of this page is dead: 'u.domain' is no longer valid. */ +#define _PGC_zombie 31 +#define PGC_zombie (1<<_PGC_zombie) + /* For safety, force a TLB flush when this page's type changes. */ +#define _PGC_tlb_flush_on_type_change 30 +#define PGC_tlb_flush_on_type_change (1<<_PGC_tlb_flush_on_type_change) + /* Owning guest has pinned this page to its current type? */ +#define _PGC_guest_pinned 29 +#define PGC_guest_pinned (1<<_PGC_guest_pinned) + /* Cleared when the owning guest 'frees' this page. */ +#define _PGC_allocated 28 +#define PGC_allocated (1<<_PGC_allocated) + /* 28-bit count of references to this frame. */ +#define PGC_count_mask ((1<<28)-1) -#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom) \ - do { \ - (_pfn)->flags = (_dom) | PGT_writeable_page | PG_need_flush; \ - set_page_tot_count((_pfn), 2); \ - set_page_type_count((_pfn), 2); \ - } while ( 0 ) +/* We trust the slab allocator in slab.c, and our use of it. 
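Both new per-frame words pack a 28-bit count beneath four flag bits, which is what lets the accessors further below update count and flags in a single cmpxchg. An illustrative decode using only the masks defined above (not code from the patch):

    unsigned long caf  = page->count_and_flags;
    unsigned long refs = caf & PGC_count_mask;                  /* bits 0-27  */
    int pinned         = ((caf & PGC_guest_pinned) != 0);       /* bit 29     */
    unsigned long type = page->type_and_flags & PGT_type_mask;  /* bits 29-31 */
    unsigned long uses = page->type_and_flags & PGT_count_mask; /* bits 0-27  */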
*/ +#define PageSlab(page) (1) +#define PageSetSlab(page) ((void)0) +#define PageClearSlab(page) ((void)0) + +#define IS_XEN_HEAP_FRAME(_pfn) (page_to_phys(_pfn) < MAX_MONITOR_ADDRESS) -#define UNSHARE_PFN(_pfn) \ - do { \ - (_pfn)->flags = 0; \ - set_page_tot_count((_pfn), 0); \ - set_page_type_count((_pfn), 0); \ +#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom) \ + do { \ + (_pfn)->u.domain = (_dom); \ + wmb(); /* install valid domain ptr before updating refcnt. */ \ + (_pfn)->count_and_flags = 1; /* Xen holds a writeable reference */ \ + (_pfn)->type_and_flags = PGT_writeable_page | PGT_validated | 1; \ } while ( 0 ) -/* The array of struct pfn_info, - * free pfn list and number of free pfns in the free list - */ -extern frame_table_t * frame_table; +#define UNSHARE_PFN(_pfn) put_page_and_type(_pfn) + +extern struct pfn_info *frame_table; extern unsigned long frame_table_size; extern struct list_head free_list; extern spinlock_t free_list_lock; @@ -140,6 +126,180 @@ extern unsigned int free_pfns; extern unsigned long max_page; void init_frametable(unsigned long nr_pages); +struct pfn_info *alloc_domain_page(struct task_struct *p); +void free_domain_page(struct pfn_info *page); + +int alloc_page_type(struct pfn_info *page, unsigned int type); +void free_page_type(struct pfn_info *page, unsigned int type); + +static inline void put_page(struct pfn_info *page) +{ + unsigned long nx, x, y = page->count_and_flags; + + do { + x = y; + nx = x - 1; + } + while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) ); + + if ( unlikely((nx & PGC_count_mask) == 0) ) + free_domain_page(page); +} + + +static inline int get_page(struct pfn_info *page, + struct task_struct *domain) +{ + unsigned long x, nx, y = page->count_and_flags; + struct task_struct *p, *np = page->u.domain; + + do { + x = y; + nx = x + 1; + p = np; + if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */ + unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ + unlikely(x & PGC_zombie) || /* Zombie? */ + unlikely(p != domain) ) /* Wrong owner? */ + { + DPRINTK("Error pfn %08lx: ed=%p,sd=%p,caf=%08lx\n", + page_to_pfn(page), domain, p, x); + return 0; + } + __asm__ __volatile__( + LOCK_PREFIX "cmpxchg8b %3" + : "=a" (np), "=d" (y), "=b" (p), + "=m" (*(volatile unsigned long long *)(&page->u.domain)) + : "0" (p), "1" (x), "b" (p), "c" (nx) ); + } + while ( unlikely(np != p) || unlikely(y != x) ); + + return 1; +} + + +static inline void put_page_type(struct pfn_info *page) +{ + unsigned long nx, x, y = page->type_and_flags; + + again: + do { + x = y; + nx = x - 1; + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + page->tlbflush_timestamp = tlbflush_clock; + if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && + likely(nx & PGT_validated) ) + { + /* + * Page-table pages must be unvalidated when count is zero. The + * 'free' is safe because the refcnt is non-zero and the + * validated bit is clear => other ops will spin or fail. + */ + if ( unlikely((y = cmpxchg(&page->type_and_flags, x, + x & ~PGT_validated)) != x) ) + goto again; + /* We cleared the 'valid bit' so we must do the cleanup. */ + free_page_type(page, x & PGT_type_mask); + /* Carry on as we were, but with the 'valid bit' now clear.
*/ + x &= ~PGT_validated; + nx &= ~PGT_validated; + } + } + } + while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) ); +} + + +static inline int get_page_type(struct pfn_info *page, unsigned long type) +{ + unsigned long nx, x, y = page->type_and_flags; + again: + do { + x = y; + nx = x + 1; + if ( unlikely((nx & PGT_count_mask) == 0) ) + { + DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page)); + return 0; + } + else if ( unlikely((x & PGT_count_mask) == 0) ) + { + if ( (x & PGT_type_mask) != type ) + { + nx &= ~(PGT_type_mask | PGT_validated); + nx |= type; + /* No extra validation needed for writeable pages. */ + if ( type == PGT_writeable_page ) + nx |= PGT_validated; + } + } + else if ( unlikely((x & PGT_type_mask) != type) ) + { + DPRINTK("Unexpected type (saw %08lx != exp %08lx) for pfn %08lx\n", + x & PGT_type_mask, type, page_to_pfn(page)); + return 0; + } + else if ( unlikely(!(x & PGT_validated)) ) + { + /* Someone else is updating validation of this page. Wait... */ + while ( (y = page->type_and_flags) != x ) + { + rep_nop(); + barrier(); + } + goto again; + } + } + while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) ); + + if ( unlikely(!(nx & PGT_validated)) ) + { + /* Try to validate page type; drop the new reference on failure. */ + if ( unlikely(!alloc_page_type(page, type)) ) + { + DPRINTK("Error while validating pfn %08lx for type %08lx\n", + page_to_pfn(page), type); + put_page_type(page); + return 0; + } + set_bit(_PGT_validated, &page->type_and_flags); + } + + return 1; +} + + +static inline void put_page_and_type(struct pfn_info *page) +{ + put_page_type(page); + put_page(page); +} + + +static inline int get_page_and_type(struct pfn_info *page, + struct task_struct *domain, + unsigned int type) +{ + int rc = get_page(page, domain); + + if ( likely(rc) && unlikely(!get_page_type(page, type)) ) + { + put_page(page); + rc = 0; + } + + return rc; +} + +#define ASSERT_PAGE_IS_TYPE(_p, _t) \ + ASSERT(((_p)->type_and_flags & PGT_type_mask) == (_t)); \ + ASSERT(((_p)->type_and_flags & PGT_count_mask) != 0) +#define ASSERT_PAGE_IS_DOMAIN(_p, _d) \ + ASSERT(((_p)->count_and_flags & PGC_count_mask) != 0); \ + ASSERT((_p)->u.domain == (_d)) + int check_descriptor(unsigned long a, unsigned long b); /* diff --git a/xen/include/xeno/perfc.h b/xen/include/xeno/perfc.h index 4048000790..9ea244b3b8 100644 --- a/xen/include/xeno/perfc.h +++ b/xen/include/xeno/perfc.h @@ -1,6 +1,6 @@ -/* - * xen performance counters - */ + +#ifndef __XENO_PERFC_H__ +#define __XENO_PERFC_H__ #include <asm/atomic.h> @@ -53,3 +53,4 @@ extern struct perfcounter_t perfcounters; #define perfc_addc(x,y) atomic_add((y), &perfcounters.x[smp_processor_id()]) #define perfc_adda(x,y,z) atomic_add((z), &perfcounters.x[y]) +#endif /* __XENO_PERFC_H__ */ diff --git a/xen/include/xeno/perfc_defn.h b/xen/include/xeno/perfc_defn.h index 033f12c8c9..f81b5bcba1 100644 --- a/xen/include/xeno/perfc_defn.h +++ b/xen/include/xeno/perfc_defn.h @@ -12,7 +12,6 @@ PERFCOUNTER( net_hypercalls, "network hypercalls" ) PERFCOUNTER( net_rx_congestion_drop, "net rx congestion drops" ) PERFCOUNTER( net_rx_capacity_drop, "net rx capacity drops" ) PERFCOUNTER( net_rx_delivered, "net rx delivered" ) -PERFCOUNTER( net_rx_tlbflush, "net rx tlb flushes" ) PERFCOUNTER( net_tx_transmitted, "net tx transmitted" ) PERFCOUNTER_CPU( domain_page_tlb_flush, "domain page tlb flushes" ) diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h index 736201446a..6c1984d795 100644 --- 
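Most callers are expected to use the combined get_page_and_type()/put_page_and_type() pair rather than the individual operations. A hypothetical usage sketch (the function below is not in the patch; map_domain_mem() and unmap_domain_mem() are used as elsewhere in this changeset):

    /* Usage sketch: pin a guest frame as an L1 table around an access. */
    static int inspect_l1_table(struct pfn_info *page, struct task_struct *p)
    {
        unsigned long *va;

        if ( unlikely(!get_page_and_type(page, p, PGT_l1_page_table)) )
            return 0;   /* wrong owner, type conflict, or zombie page */

        va = map_domain_mem((page - frame_table) << PAGE_SHIFT);
        /* Table is validated and type-stable while the refs are held. */
        unmap_domain_mem(va);

        put_page_and_type(page);   /* may invoke free_page_type() */
        return 1;
    }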
a/xen/include/xeno/sched.h +++ b/xen/include/xeno/sched.h @@ -4,7 +4,6 @@ #include <xeno/config.h> #include <xeno/types.h> #include <xeno/spinlock.h> -#include <asm/page.h> #include <asm/ptrace.h> #include <xeno/smp.h> #include <asm/processor.h> @@ -16,7 +15,6 @@ #include <xeno/time.h> #include <xeno/ac_timer.h> #include <xeno/delay.h> -#include <xeno/slab.h> #define MAX_DOMAIN_NAME 16 @@ -94,9 +92,10 @@ struct task_struct unsigned int domain; /* domain id */ - struct list_head pg_head; - unsigned int tot_pages; /* number of pages currently possesed */ - unsigned int max_pages; /* max number of pages that can be possesed */ + spinlock_t page_list_lock; + struct list_head page_list; + unsigned int tot_pages; /* number of pages currently possesed */ + unsigned int max_pages; /* max number of pages that can be possesed */ /* scheduling */ struct list_head run_list; @@ -132,8 +131,6 @@ struct task_struct /* VM */ struct mm_struct mm; - /* We need this lock to check page types and frob reference counts. */ - spinlock_t page_lock; mm_segment_t addr_limit; @@ -194,6 +191,8 @@ extern struct task_struct *idle_task[NR_CPUS]; #define STACK_SIZE PAGE_SIZE +#include <xeno/slab.h> + extern kmem_cache_t *task_struct_cachep; #define alloc_task_struct() \ ((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL)) diff --git a/xen/include/xeno/vif.h b/xen/include/xeno/vif.h index f3ee9fa616..a557cb3802 100644 --- a/xen/include/xeno/vif.h +++ b/xen/include/xeno/vif.h @@ -34,7 +34,7 @@ extern struct net_device *the_dev; typedef struct rx_shadow_entry_st { unsigned short id; - unsigned short flush_count; /* 16 bits should be enough */ + unsigned short _pad; unsigned long pte_ptr; unsigned long buf_pfn; } rx_shadow_entry_t; diff --git a/xen/net/dev.c b/xen/net/dev.c index 280db4def1..91d6a4e0cf 100644 --- a/xen/net/dev.c +++ b/xen/net/dev.c @@ -39,12 +39,6 @@ #define rtnl_lock() ((void)0) #define rtnl_unlock() ((void)0) -#if 0 -#define DPRINTK(_f, _a...) printk(_f , ## _a) -#else -#define DPRINTK(_f, _a...) ((void)0) -#endif - #define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1)) #define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1)) #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1)) @@ -54,9 +48,9 @@ struct skb_completion_queues skb_queue[NR_CPUS] __cacheline_aligned; static int get_tx_bufs(net_vif_t *vif); -static void __make_tx_response(net_vif_t *vif, - unsigned short id, - unsigned char st); +static void make_tx_response(net_vif_t *vif, + unsigned short id, + unsigned char st); static void make_rx_response(net_vif_t *vif, unsigned short id, unsigned short size, @@ -499,89 +493,69 @@ struct netif_rx_stats netdev_rx_stat[NR_CPUS]; void deliver_packet(struct sk_buff *skb, net_vif_t *vif) { rx_shadow_entry_t *rx; - unsigned long *ptep; + unsigned long *ptep, pte; struct pfn_info *old_page, *new_page, *pte_page; unsigned int i; unsigned short size; unsigned char offset, status = RING_STATUS_OK; + struct task_struct *p = vif->domain; memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN); if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP ) memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN); - /* - * Slightly gross: we need the page_lock so that we can do PTE checking. - * However, we take it slightly early so that it can protect the update - * of rx_cons. This saves us from grabbing two locks. 
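With the coarse per-domain page_lock gone, reference counting is lock-free and page_list_lock guards only list membership. A sketch of a walker under the new scheme (hypothetical function, using the standard Linux-derived list_for_each iterator):

    /* Sketch: count a domain's frames under the new, narrower lock. */
    static unsigned int count_domain_pages(struct task_struct *p)
    {
        struct list_head *ent;
        unsigned int n = 0;

        spin_lock(&p->page_list_lock);
        list_for_each ( ent, &p->page_list )
            n++;
        spin_unlock(&p->page_list_lock);

        return n;   /* should agree with p->tot_pages */
    }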
- */ - spin_lock(&vif->domain->page_lock); + spin_lock(&vif->rx_lock); if ( (i = vif->rx_cons) == vif->rx_prod ) { - spin_unlock(&vif->domain->page_lock); + spin_unlock(&vif->rx_lock); perfc_incr(net_rx_capacity_drop); return; } - rx = vif->rx_shadow_ring + i; + rx = &vif->rx_shadow_ring[i]; vif->rx_cons = RX_RING_INC(i); size = (unsigned short)skb->len; offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK); - /* Release the page-table page. */ - pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT); - put_page_type(pte_page); - put_page_tot(pte_page); - - old_page = frame_table + rx->buf_pfn; + pte_page = &frame_table[rx->pte_ptr >> PAGE_SHIFT]; + old_page = &frame_table[rx->buf_pfn]; new_page = skb->pf; ptep = map_domain_mem(rx->pte_ptr); - if ( (*ptep & _PAGE_PRESENT) ) + new_page->u.domain = p; + wmb(); /* make dom ptr visible before updating refcnt. */ + spin_lock(&p->page_list_lock); + list_add(&new_page->list, &p->page_list); + new_page->count_and_flags = PGC_allocated | 2; + spin_unlock(&p->page_list_lock); + get_page_type(new_page, PGT_writeable_page); + set_bit(_PGC_tlb_flush_on_type_change, &new_page->count_and_flags); + wmb(); /* Get type count and set flush bit before updating PTE. */ + + pte = *ptep; + if ( unlikely(pte & _PAGE_PRESENT) || + unlikely(cmpxchg(ptep, pte, + (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT | + ((new_page - frame_table) << PAGE_SHIFT))) != pte ) { - /* Bail out if the PTE has been reused under our feet. */ - list_add(&old_page->list, &vif->domain->pg_head); - old_page->flags = vif->domain->domain; unmap_domain_mem(ptep); - spin_unlock(&vif->domain->page_lock); status = RING_STATUS_BAD_PAGE; goto out; } - /* Give the new page to the domain, marking it writeable. */ - set_page_type_count(new_page, 1); - set_page_tot_count(new_page, 1); - new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush; - list_add(&new_page->list, &vif->domain->pg_head); - - /* Patch the PTE to map the new page as writeable. */ machine_to_phys_mapping[new_page - frame_table] - = machine_to_phys_mapping[old_page - frame_table]; - *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT | - (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK); + = machine_to_phys_mapping[old_page - frame_table]; unmap_domain_mem(ptep); - spin_unlock(&vif->domain->page_lock); - /* Our skbuff now points at the guest's old frame. */ skb->pf = old_page; /* Updates must happen before releasing the descriptor. */ smp_wmb(); - /* - * NB. The remote flush here should be safe, as we hold no locks. The - * network driver that called us should also have no nasty locks. 
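The delivery path now publishes a fresh frame to the guest in a strict order: owner first, then reference count and type, then the live PTE, so any CPU that can see the mapping also sees a fully initialised pfn_info. A hypothetical helper condensing the sequence above, all names as in the patch:

    /* Sketch of the hand-off sequence in deliver_packet(); the write
     * barriers enforce owner-before-refcount and refcount-before-PTE. */
    static int give_frame_to_domain(struct task_struct *p,
                                    struct pfn_info *page,
                                    unsigned long *ptep, unsigned long pte)
    {
        page->u.domain = p;
        wmb();                       /* owner visible before refcount */

        spin_lock(&p->page_list_lock);
        list_add(&page->list, &p->page_list);
        page->count_and_flags = PGC_allocated | 2;
        spin_unlock(&p->page_list_lock);

        get_page_type(page, PGT_writeable_page);
        set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
        wmb();                       /* type and flags settled before PTE */

        /* PTE goes live last; fail if the guest remapped it meanwhile. */
        return !(pte & _PAGE_PRESENT) &&
               cmpxchg(ptep, pte,
                       (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
                       ((page - frame_table) << PAGE_SHIFT)) == pte;
    }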
- */ - if ( rx->flush_count == (unsigned short) - atomic_read(&tlb_flush_count[vif->domain->processor]) ) - { - perfc_incr(net_rx_tlbflush); - flush_tlb_cpu(vif->domain->processor); - } - perfc_incr(net_rx_delivered); /* record this so they can be billed */ @@ -589,7 +563,9 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) vif->total_bytes_received += size; out: + put_page_and_type(pte_page); make_rx_response(vif, rx->id, size, status, offset); + spin_unlock(&vif->rx_lock); } /** @@ -785,8 +761,8 @@ static void net_tx_action(unsigned long unused) skb->mac.raw = skb->data; skb->guest_id = tx->id; - skb_shinfo(skb)->frags[0].page = frame_table + - (tx->payload >> PAGE_SHIFT); + skb_shinfo(skb)->frags[0].page = + &frame_table[tx->payload >> PAGE_SHIFT]; skb_shinfo(skb)->frags[0].size = tx->size - PKT_PROT_LEN; skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK; skb_shinfo(skb)->nr_frags = 1; @@ -856,10 +832,8 @@ static void tx_skb_release(struct sk_buff *skb) vif = skb->src_vif; - spin_lock(&vif->domain->page_lock); for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ ) - put_page_tot(skb_shinfo(skb)->frags[i].page); - spin_unlock(&vif->domain->page_lock); + put_page(skb_shinfo(skb)->frags[i].page); if ( skb->skb_type == SKB_NODATA ) kmem_cache_free(net_header_cachep, skb->head); @@ -867,7 +841,7 @@ static void tx_skb_release(struct sk_buff *skb) skb_shinfo(skb)->nr_frags = 0; spin_lock(&vif->tx_lock); - __make_tx_response(vif, skb->guest_id, RING_STATUS_OK); + make_tx_response(vif, skb->guest_id, RING_STATUS_OK); spin_unlock(&vif->tx_lock); /* @@ -1904,7 +1878,7 @@ static int get_tx_bufs(net_vif_t *vif) if ( (tx.size <= PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) ) { DPRINTK("Bad packet size: %d\n", tx.size); - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); continue; } @@ -1932,23 +1906,21 @@ static int get_tx_bufs(net_vif_t *vif) vif->remaining_credit -= tx.size; /* No crossing a page boundary as the payload mustn't fragment. 
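The deleted flush_count test is subsumed by the per-frame tlbflush_timestamp stamped in put_page_type(): instead of flushing on every delivery, Xen can defer the flush until a frame is actually reused with a new type. A speculative sketch of the staleness test, assuming the new flushtlb.c (not shown in this hunk) keeps a global flush clock plus a per-CPU record of each processor's last flush:

    /* Speculative sketch only: flushtlb.c is new in this changeset and
     * its contents do not appear in these hunks. Assumed: a global
     * clock advanced on every flush, and a per-CPU copy of it taken at
     * flush time. */
    extern unsigned long tlbflush_clock;
    extern unsigned long tlbflush_time[NR_CPUS];   /* assumed structure */

    static inline int stale_mapping_possible(struct pfn_info *page, int cpu)
    {
        /* The CPU last flushed at or before the moment the final type
         * ref was dropped, so a mapping from the frame's previous use
         * may still be cached there. */
        return page->tlbflush_timestamp >= tlbflush_time[cpu];
    }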
*/ - if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE ) + if ( unlikely(((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE) ) { DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size); - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); continue; } buf_pfn = tx.addr >> PAGE_SHIFT; buf_page = frame_table + buf_pfn; - spin_lock(&p->page_lock); - if ( (buf_pfn >= max_page) || - ((buf_page->flags & PG_domain_mask) != p->domain) ) + if ( unlikely(buf_pfn >= max_page) || + unlikely(!get_page(buf_page, p)) ) { DPRINTK("Bad page frame\n"); - spin_unlock(&p->page_lock); - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); continue; } @@ -1958,8 +1930,8 @@ static int get_tx_bufs(net_vif_t *vif) init_tx_header(vif, g_data, tx.size, the_dev)); if ( protocol == 0 ) { - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); - goto tx_unmap_and_continue; + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); + goto cleanup_and_continue; } target = net_get_target_vif(g_data, tx.size, vif); @@ -1969,9 +1941,9 @@ static int get_tx_bufs(net_vif_t *vif) /* Local delivery */ if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL ) { - __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); + make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE); put_vif(target); - goto tx_unmap_and_continue; + goto cleanup_and_continue; } skb->src_vif = vif; @@ -1995,7 +1967,7 @@ static int get_tx_bufs(net_vif_t *vif) if ( netif_rx(skb) == NET_RX_DROP ) kfree_skb(skb); - __make_tx_response(vif, tx.id, RING_STATUS_OK); + make_tx_response(vif, tx.id, RING_STATUS_OK); } else if ( (target == VIF_PHYS) || IS_PRIV(p) ) { @@ -2005,23 +1977,24 @@ static int get_tx_bufs(net_vif_t *vif) kmem_cache_alloc(net_header_cachep, GFP_KERNEL); if ( vif->tx_shadow_ring[j].header == NULL ) { - __make_tx_response(vif, tx.id, RING_STATUS_OK); - goto tx_unmap_and_continue; + make_tx_response(vif, tx.id, RING_STATUS_OK); + goto cleanup_and_continue; } memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN); vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN; - get_page_tot(buf_page); + buf_page = NULL; /* hand off our page reference */ j = TX_RING_INC(j); } else { - __make_tx_response(vif, tx.id, RING_STATUS_DROPPED); + make_tx_response(vif, tx.id, RING_STATUS_DROPPED); } - tx_unmap_and_continue: + cleanup_and_continue: + if ( buf_page != NULL ) + put_page(buf_page); unmap_domain_mem(g_data); - spin_unlock(&p->page_lock); } /* @@ -2044,33 +2017,18 @@ static int get_tx_bufs(net_vif_t *vif) } -static long get_bufs_from_vif(net_vif_t *vif) +static void get_rx_bufs(net_vif_t *vif) { - net_ring_t *shared_rings; - net_idx_t *shared_idxs; + struct task_struct *p = vif->domain; + net_ring_t *shared_rings = vif->shared_rings; + net_idx_t *shared_idxs = vif->shared_idxs; unsigned int i, j; rx_req_entry_t rx; unsigned long pte_pfn, buf_pfn; struct pfn_info *pte_page, *buf_page; - struct task_struct *p = vif->domain; - unsigned long *ptep; - - shared_idxs = vif->shared_idxs; - shared_rings = vif->shared_rings; - - /* - * PHASE 1 -- TRANSMIT RING - */ - - if ( get_tx_bufs(vif) ) - { - add_to_net_schedule_list_tail(vif); - maybe_schedule_tx_action(); - } + unsigned long *ptep, pte; - /* - * PHASE 2 -- RECEIVE RING - */ + spin_lock(&vif->rx_lock); /* * Collect up new receive buffers. 
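Note the hand-off idiom in the physical-delivery case above: once the shadow ring owns the page reference, the local pointer is cleared so that success and failure can share the cleanup_and_continue exit. The shape of the idiom, with a hypothetical queue_for_transmit() standing in for the shadow-ring bookkeeping:

    /* Sketch of the reference hand-off idiom in get_tx_bufs(). */
    extern int queue_for_transmit(struct pfn_info *page); /* hypothetical */

    static void tx_one_buffer(struct task_struct *p, unsigned long pfn)
    {
        struct pfn_info *buf_page = &frame_table[pfn];

        /* One general ref pins the frame for the life of the transmit. */
        if ( unlikely(pfn >= max_page) || unlikely(!get_page(buf_page, p)) )
            return;

        if ( queue_for_transmit(buf_page) == 0 )
            buf_page = NULL;       /* completion path now owns the ref */

        if ( buf_page != NULL )
            put_page(buf_page);    /* error path: give the ref back */
    }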
We collect up to the guest OS's new @@ -2085,66 +2043,83 @@ static long get_bufs_from_vif(net_vif_t *vif) { rx = shared_rings->rx_ring[i].req; - pte_pfn = rx.addr >> PAGE_SHIFT; - pte_page = frame_table + pte_pfn; + pte_pfn = rx.addr >> PAGE_SHIFT; + pte_page = &frame_table[pte_pfn]; - spin_lock(&p->page_lock); - if ( (pte_pfn >= max_page) || - ((pte_page->flags & (PG_type_mask | PG_domain_mask)) != - (PGT_l1_page_table | p->domain)) ) + /* The address passed down must be to a valid PTE. */ + if ( unlikely(pte_pfn >= max_page) || + unlikely(!get_page_and_type(pte_page, p, PGT_l1_page_table)) ) { DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n", - p->domain, pte_pfn, max_page, pte_page->flags); - spin_unlock(&p->page_lock); + p->domain, pte_pfn, max_page, pte_page->type_and_flags); make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); continue; } - + ptep = map_domain_mem(rx.addr); - - if ( !(*ptep & _PAGE_PRESENT) ) + pte = *ptep; + + /* We must be passed a valid writeable mapping to swizzle. */ + if ( unlikely((pte & (_PAGE_PRESENT|_PAGE_RW)) != + (_PAGE_PRESENT|_PAGE_RW)) || + unlikely(cmpxchg(ptep, pte, pte & ~_PAGE_PRESENT) != pte) ) { - DPRINTK("Invalid PTE passed down (not present)\n"); + DPRINTK("Invalid PTE passed down (not present or changing)\n"); + put_page_and_type(pte_page); make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); goto rx_unmap_and_continue; } - - buf_pfn = *ptep >> PAGE_SHIFT; - buf_page = frame_table + buf_pfn; + + buf_pfn = pte >> PAGE_SHIFT; + buf_page = &frame_table[buf_pfn]; - if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) != - (PGT_writeable_page | p->domain)) || - (page_tot_count(buf_page) != 1) ) + /* + * The page must belong to the correct domain, and must be mapped + * just once as a writeable page. + */ + if ( unlikely(buf_page->u.domain != p) || + unlikely(!test_and_clear_bit(_PGC_allocated, + &buf_page->count_and_flags)) || + unlikely(cmpxchg(&buf_page->type_and_flags, + PGT_writeable_page|PGT_validated|1, + 0) != (PGT_writeable_page|PGT_validated|1)) ) { - DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n", - page_type_count(buf_page), page_tot_count(buf_page), - buf_page->flags); + DPRINTK("Bad domain or page mapped writeable more than once.\n"); + if ( buf_page->u.domain == p ) + set_bit(_PGC_allocated, &buf_page->count_and_flags); + if ( unlikely(cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) != + (pte & ~_PAGE_PRESENT)) ) + put_page_and_type(buf_page); make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); goto rx_unmap_and_continue; } - + /* - * The pte they passed was good, so take it away from them. We also - * lock down the page-table page, so it doesn't go away. + * Now ensure that we can take the last references to this page. + * The final count should be 2, because of PGC_allocated. */ - get_page_type(pte_page); - get_page_tot(pte_page); - *ptep &= ~_PAGE_PRESENT; - buf_page->flags = 0; - set_page_type_count(buf_page, 0); - set_page_tot_count(buf_page, 0); + if ( unlikely(cmpxchg(&buf_page->count_and_flags, + PGC_tlb_flush_on_type_change | 2, 0) != + (PGC_tlb_flush_on_type_change | 2)) ) + { + DPRINTK("Page held more than once\n"); + /* Leave the page unmapped at 'ptep'. Stoopid domain! */ + make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); + goto rx_unmap_and_continue; + } + + /* Remove from the domain's allocation list. 
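The two exact-value compare-and-exchanges above do double duty: they verify the "mapped exactly once, otherwise unreferenced" invariant and simultaneously strip the frame of its type and count. Spelled out as a sketch (the real code additionally rolls back PGC_allocated and the guest PTE on failure):

    /* Sketch: claim exclusive ownership of a posted receive buffer. */
    static int claim_rx_buffer(struct pfn_info *page)
    {
        /* Exactly one writeable, validated mapping and nothing else. */
        if ( cmpxchg(&page->type_and_flags,
                     PGT_writeable_page | PGT_validated | 1, 0) !=
             (PGT_writeable_page | PGT_validated | 1) )
            return 0;

        /* Count must be exactly 2: the mapping ref plus the allocation
         * ref (the PGC_allocated bit itself was cleared just before). */
        if ( cmpxchg(&page->count_and_flags,
                     PGC_tlb_flush_on_type_change | 2, 0) !=
             (PGC_tlb_flush_on_type_change | 2) )
            return 0;

        return 1;   /* frame is now ours, invisible to refcounting */
    }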
*/ + spin_lock(&p->page_list_lock); list_del(&buf_page->list); + spin_unlock(&p->page_list_lock); - vif->rx_shadow_ring[j].id = rx.id; - vif->rx_shadow_ring[j].pte_ptr = rx.addr; - vif->rx_shadow_ring[j].buf_pfn = buf_pfn; - vif->rx_shadow_ring[j].flush_count = (unsigned short) - atomic_read(&tlb_flush_count[smp_processor_id()]); + vif->rx_shadow_ring[j].id = rx.id; + vif->rx_shadow_ring[j].pte_ptr = rx.addr; + vif->rx_shadow_ring[j].buf_pfn = buf_pfn; j = RX_RING_INC(j); rx_unmap_and_continue: unmap_domain_mem(ptep); - spin_unlock(&p->page_lock); } vif->rx_req_cons = i; @@ -2155,6 +2130,20 @@ static long get_bufs_from_vif(net_vif_t *vif) vif->rx_prod = j; } + spin_unlock(&vif->rx_lock); +} + + +static long get_bufs_from_vif(net_vif_t *vif) +{ + if ( get_tx_bufs(vif) ) + { + add_to_net_schedule_list_tail(vif); + maybe_schedule_tx_action(); + } + + get_rx_bufs(vif); + return 0; } @@ -2162,7 +2151,7 @@ static long get_bufs_from_vif(net_vif_t *vif) long flush_bufs_for_vif(net_vif_t *vif) { int i; - unsigned long *pte; + unsigned long *ptep, pte; struct pfn_info *page; struct task_struct *p = vif->domain; rx_shadow_entry_t *rx; @@ -2170,7 +2159,7 @@ long flush_bufs_for_vif(net_vif_t *vif) net_idx_t *shared_idxs = vif->shared_idxs; /* Return any outstanding receive buffers to the guest OS. */ - spin_lock(&p->page_lock); + spin_lock(&vif->rx_lock); for ( i = vif->rx_req_cons; (i != shared_idxs->rx_req_prod) && (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1); @@ -2184,32 +2173,32 @@ long flush_bufs_for_vif(net_vif_t *vif) { rx = &vif->rx_shadow_ring[i]; - /* Release the page-table page. */ - page = frame_table + (rx->pte_ptr >> PAGE_SHIFT); - put_page_type(page); - put_page_tot(page); - /* Give the buffer page back to the domain. */ - page = frame_table + rx->buf_pfn; - list_add(&page->list, &p->pg_head); - page->flags = vif->domain->domain; + page = &frame_table[rx->buf_pfn]; + spin_lock(&p->page_list_lock); + list_add(&page->list, &p->page_list); + page->count_and_flags = PGC_allocated | 2; + spin_unlock(&p->page_list_lock); + get_page_type(page, PGT_writeable_page); + set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags); + wmb(); /* Patch up the PTE if it hasn't changed under our feet. */ - pte = map_domain_mem(rx->pte_ptr); - if ( !(*pte & _PAGE_PRESENT) ) - { - *pte = (rx->buf_pfn<<PAGE_SHIFT) | (*pte & ~PAGE_MASK) | - _PAGE_RW | _PAGE_PRESENT; - page->flags |= PGT_writeable_page | PG_need_flush; - set_page_type_count(page, 1); - set_page_tot_count(page, 1); - } - unmap_domain_mem(pte); + ptep = map_domain_mem(rx->pte_ptr); + pte = *ptep; + if ( unlikely(pte & _PAGE_PRESENT) || + unlikely(cmpxchg(ptep, pte, (rx->buf_pfn<<PAGE_SHIFT) | + (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT) + != pte) ) + put_page_and_type(page); + unmap_domain_mem(ptep); + + put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]); make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0); } vif->rx_cons = i; - spin_unlock(&p->page_lock); + spin_unlock(&vif->rx_lock); /* * Flush pending transmit buffers. 
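Returning a buffer follows the same recipe as delivery, including tolerance for a guest that changed the PTE underneath: if the compare-and-exchange fails, the references just granted are revoked rather than leaving the frame mapped twice. Condensed into a hypothetical helper:

    /* Sketch of the give-back path in flush_bufs_for_vif(). */
    static void return_frame_to_guest(struct pfn_info *page,
                                      unsigned long *ptep, unsigned long pfn)
    {
        unsigned long pte = *ptep;

        if ( unlikely(pte & _PAGE_PRESENT) ||
             unlikely(cmpxchg(ptep, pte,
                              (pfn << PAGE_SHIFT) | (pte & ~PAGE_MASK) |
                              _PAGE_RW | _PAGE_PRESENT) != pte) )
            put_page_and_type(page);   /* PTE reused: revoke our grant */
    }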
The guest may still have to wait for @@ -2221,7 +2210,7 @@ long flush_bufs_for_vif(net_vif_t *vif) (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1); i = TX_RING_INC(i) ) { - __make_tx_response(vif, shared_rings->tx_ring[i].req.id, + make_tx_response(vif, shared_rings->tx_ring[i].req.id, RING_STATUS_DROPPED); } vif->tx_req_cons = i; @@ -2296,9 +2285,9 @@ long do_net_io_op(netop_t *uop) } -static void __make_tx_response(net_vif_t *vif, - unsigned short id, - unsigned char st) +static void make_tx_response(net_vif_t *vif, + unsigned short id, + unsigned char st) { unsigned int pos; tx_resp_entry_t *resp; @@ -2329,7 +2318,6 @@ static void make_rx_response(net_vif_t *vif, rx_resp_entry_t *resp; /* Place on the response ring for the relevant domain. */ - spin_lock(&vif->rx_lock); pos = vif->rx_resp_prod; resp = &vif->shared_rings->rx_ring[pos].resp; resp->id = id; @@ -2344,7 +2332,6 @@ static void make_rx_response(net_vif_t *vif, unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET); guest_event_notify(cpu_mask); } - spin_unlock(&vif->rx_lock); } diff --git a/xen/net/skbuff.c b/xen/net/skbuff.c index d8950633b9..5fcc044c5e 100644 --- a/xen/net/skbuff.c +++ b/xen/net/skbuff.c @@ -133,41 +133,20 @@ static __inline__ void skb_head_to_pool(struct sk_buff *skb) static inline u8 *alloc_skb_data_page(struct sk_buff *skb) { - struct list_head *list_ptr; - struct pfn_info *pf; - unsigned long flags; - - spin_lock_irqsave(&free_list_lock, flags); - - if (!free_pfns) return NULL; - - list_ptr = free_list.next; - pf = list_entry(list_ptr, struct pfn_info, list); - pf->flags = 0; - list_del(&pf->list); - free_pfns--; - - spin_unlock_irqrestore(&free_list_lock, flags); - + struct pfn_info *pf; + if ( unlikely((pf = alloc_domain_page(NULL)) == NULL) ) + return NULL; skb->pf = pf; return (u8 *)((pf - frame_table) << PAGE_SHIFT); } static inline void dealloc_skb_data_page(struct sk_buff *skb) { - struct pfn_info *pf; + struct pfn_info *pf = skb->pf; unsigned long flags; - - pf = skb->pf; - spin_lock_irqsave(&free_list_lock, flags); - - pf->flags = 0; - set_page_type_count(pf, 0); - set_page_tot_count(pf, 0); list_add(&pf->list, &free_list); free_pfns++; - spin_unlock_irqrestore(&free_list_lock, flags); } diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c index b4784ccc02..6bc8baa47a 100644 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c @@ -40,7 +40,7 @@ static void DEBUG_allow_pt_reads(void) pte = update_debug_queue[i].ptep; if ( pte == NULL ) continue; update_debug_queue[i].ptep = NULL; - update.ptr = pte; + update.ptr = virt_to_machine(pte); update.val = update_debug_queue[i].pteval; HYPERVISOR_mmu_update(&update, 1); } @@ -59,7 +59,7 @@ static void DEBUG_disallow_pt_read(unsigned long va) pgd = pgd_offset_k(va); pmd = pmd_offset(pgd, va); pte = pte_offset(pmd, va); - update.ptr = pte; + update.ptr = virt_to_machine(pte); pteval = *(unsigned long *)pte; update.val = pteval & ~_PAGE_PRESENT; HYPERVISOR_mmu_update(&update, 1); @@ -95,7 +95,9 @@ void MULTICALL_flush_page_update_queue(void) #if MMU_UPDATE_DEBUG > 0 DEBUG_allow_pt_reads(); #endif - queue_multicall2(__HYPERVISOR_mmu_update, (unsigned long)update_queue, idx); + queue_multicall2(__HYPERVISOR_mmu_update, + (unsigned long)update_queue, + idx); idx = 0; } spin_unlock_irqrestore(&update_lock, flags); @@ -134,7 +136,7 @@ void queue_l1_entry_update(pte_t *ptr, unsigned long val) #if MMU_UPDATE_DEBUG > 0 
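On the guest side, the key change is that mmu_update_t.ptr now carries the machine address of the PTE, obtained with virt_to_machine(), since the hypervisor validates the frame the update lands in. A minimal sketch of issuing one update directly, bypassing the batching queue:

    /* Sketch: one synchronous PTE update under the new convention. */
    static void example_set_pte(pte_t *ptep, unsigned long new_val)
    {
        mmu_update_t req;

        req.ptr = virt_to_machine(ptep);   /* machine, not virtual */
        req.val = new_val;
        (void)HYPERVISOR_mmu_update(&req, 1);
    }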
DEBUG_disallow_pt_read((unsigned long)ptr); #endif - update_queue[idx].ptr = (unsigned long)ptr; + update_queue[idx].ptr = virt_to_machine(ptr); update_queue[idx].val = val; increment_index(); spin_unlock_irqrestore(&update_lock, flags); @@ -144,7 +146,7 @@ void queue_l2_entry_update(pmd_t *ptr, unsigned long val) { unsigned long flags; spin_lock_irqsave(&update_lock, flags); - update_queue[idx].ptr = (unsigned long)ptr; + update_queue[idx].ptr = virt_to_machine(ptr); update_queue[idx].val = val; increment_index(); spin_unlock_irqrestore(&update_lock, flags); diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c index 883cd03b37..b1f8019ef9 100644 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c @@ -113,13 +113,10 @@ static inline void set_pte_phys (unsigned long vaddr, } pte = pte_offset(pmd, vaddr); -#if 0 /* Not in Xen, since this breaks clear_fixmap. */ - if (pte_val(*pte)) - pte_ERROR(*pte); -#endif - - /* We queue directly, avoiding hidden phys->machine translation. */ - queue_l1_entry_update(pte, phys | pgprot_val(prot)); + if ( pte_io(*pte) || (pgprot_val(prot) & _PAGE_IO) ) + queue_unchecked_mmu_update(pte, phys | pgprot_val(prot)); + else + queue_l1_entry_update(pte, phys | pgprot_val(prot)); /* * It's enough to flush this one mapping. @@ -137,8 +134,7 @@ void __set_fixmap(enum fixed_addresses idx, unsigned long phys, printk("Invalid __set_fixmap\n"); return; } - set_pte_phys(address, phys, - __pgprot(pgprot_val(PAGE_KERNEL)|pgprot_val(flags))); + set_pte_phys(address, phys, flags); } void clear_fixmap(enum fixed_addresses idx) diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c index eac5c6a63c..078fede144 100644 --- a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c +++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c @@ -202,14 +202,15 @@ void __init *bt_ioremap(unsigned long machine_addr, unsigned long size) */ nrpages = size >> PAGE_SHIFT; if (nrpages > NR_FIX_BTMAPS) - return NULL; + return NULL; /* * Ok, go for it.. */ idx = FIX_BTMAP_BEGIN; while (nrpages > 0) { - set_fixmap(idx, machine_addr); + __set_fixmap(idx, machine_addr, + __pgprot(__PAGE_KERNEL|_PAGE_IO)); machine_addr += PAGE_SIZE; --idx; --nrpages; |
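Finally, set_pte_phys() above now distinguishes two kinds of mapping: RAM frames go through the checked update queue, which Xen can validate against its frame table, while IO frames have no frame-table entry and must take the unchecked path. The dispatch, as a hypothetical wrapper around the two queue calls from the patch:

    /* Sketch of the checked/unchecked split introduced above. */
    static void map_one_frame(pte_t *ptep, unsigned long phys, pgprot_t prot)
    {
        if ( pte_io(*ptep) || (pgprot_val(prot) & _PAGE_IO) )
            queue_unchecked_mmu_update(ptep, phys | pgprot_val(prot));
        else
            queue_l1_entry_update(ptep, phys | pgprot_val(prot));
    }

This is also why bt_ioremap() switches from set_fixmap() to __set_fixmap() with _PAGE_IO: the flag is what routes the mapping onto the unchecked path.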