author     kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>  2003-12-20 12:44:11 +0000
committer  kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>  2003-12-20 12:44:11 +0000
commit     24fa12d9d6bc4f6cf3740475e8403d911e84b53c (patch)
tree       c7487438b7035915ba974897f29788d0c4389ab4
parent     ce423b5dfe323c86c1db29cfe17d1779c4da1829 (diff)
bitkeeper revision 1.652.1.1 (3fe4441bD7Ytc0dpv4nkQCX5YO2A8w)
Many files:
  Many fixes and a complete rewrite of page management in Xen.
flushtlb.c:
  new file
.del-TODO~9e3f87ffe4e9e1f1:
  Delete: xen/TODO
.del-GUEST_CHANGES~b67e924f1504662d:
  Delete: xen/GUEST_CHANGES
-rw-r--r--  .rootkeys  3
-rw-r--r--  tools/xc/lib/xc_linux_build.c  28
-rw-r--r--  tools/xc/lib/xc_linux_restore.c  9
-rw-r--r--  tools/xc/lib/xc_linux_save.c  18
-rw-r--r--  xen/GUEST_CHANGES  26
-rw-r--r--  xen/TODO  54
-rw-r--r--  xen/arch/i386/Rules.mk  4
-rw-r--r--  xen/arch/i386/apic.c  2
-rw-r--r--  xen/arch/i386/entry.S  1
-rw-r--r--  xen/arch/i386/flushtlb.c  64
-rw-r--r--  xen/arch/i386/io_apic.c  2
-rw-r--r--  xen/arch/i386/ioremap.c  74
-rw-r--r--  xen/arch/i386/irq.c  3
-rw-r--r--  xen/arch/i386/mm.c  129
-rw-r--r--  xen/arch/i386/pci-irq.c  3
-rw-r--r--  xen/arch/i386/process.c  3
-rw-r--r--  xen/arch/i386/smp.c  68
-rw-r--r--  xen/arch/i386/smpboot.c  2
-rw-r--r--  xen/arch/i386/traps.c  3
-rw-r--r--  xen/common/dom0_ops.c  85
-rw-r--r--  xen/common/dom_mem_ops.c  94
-rw-r--r--  xen/common/domain.c  268
-rw-r--r--  xen/common/kernel.c  7
-rw-r--r--  xen/common/memory.c  1044
-rw-r--r--  xen/common/network.c  2
-rw-r--r--  xen/common/page_alloc.c  7
-rw-r--r--  xen/drivers/block/ll_rw_blk.c  18
-rw-r--r--  xen/drivers/block/xen_block.c  125
-rw-r--r--  xen/drivers/block/xen_vbd.c  33
-rw-r--r--  xen/drivers/net/e1000/e1000_main.c  4
-rw-r--r--  xen/include/asm-i386/atomic.h  9
-rw-r--r--  xen/include/asm-i386/flushtlb.h  65
-rw-r--r--  xen/include/asm-i386/io.h  7
-rw-r--r--  xen/include/asm-i386/page.h  6
-rw-r--r--  xen/include/asm-i386/pgalloc.h  26
-rw-r--r--  xen/include/asm-i386/smp.h  14
-rw-r--r--  xen/include/asm-i386/spinlock.h  5
-rw-r--r--  xen/include/asm-i386/system.h  29
-rw-r--r--  xen/include/hypervisor-ifs/dom0_ops.h  2
-rw-r--r--  xen/include/hypervisor-ifs/hypervisor-if.h  4
-rw-r--r--  xen/include/xeno/config.h  7
-rw-r--r--  xen/include/xeno/mm.h  350
-rw-r--r--  xen/include/xeno/perfc.h  7
-rw-r--r--  xen/include/xeno/perfc_defn.h  1
-rw-r--r--  xen/include/xeno/sched.h  13
-rw-r--r--  xen/include/xeno/vif.h  2
-rw-r--r--  xen/net/dev.c  309
-rw-r--r--  xen/net/skbuff.c  29
-rw-r--r--  xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c  12
-rw-r--r--  xenolinux-2.4.23-sparse/arch/xeno/mm/init.c  14
-rw-r--r--  xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c  5
51 files changed, 1601 insertions, 1498 deletions
diff --git a/.rootkeys b/.rootkeys
index 8506a8fff8..a21a1a92f4 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -80,10 +80,8 @@
3fbd4bd6GtGwZGxYUJPOheYIR7bPaA tools/xc/py/XenoUtil.py
3fbd0a40yT6G3M9hMpaz5xTUdl0E4g tools/xc/py/setup.py
3f72f1bdJPsV3JCnBqs9ddL9tr6D2g xen/COPYING
-3f841450eJvqAD1Dldc0_aOweGiglQ xen/GUEST_CHANGES
3ddb79bcbOVHh38VJzc97-JEGD4dJQ xen/Makefile
3ddb79bcWnTwYsQRWl_PaneJfa6p0w xen/Rules.mk
-3e74d2be6ELqhaY1sW0yyHRKhpOvDQ xen/TODO
3ddb79bcZbRBzT3elFWSX7u6NtMagQ xen/arch/i386/Makefile
3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/i386/Rules.mk
3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/i386/acpitable.c
@@ -93,6 +91,7 @@
3ddb79bcUrk2EIaM5VsT6wUudH1kkg xen/arch/i386/delay.c
3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/i386/entry.S
3ddb79bcY5zW7KhvI9gvfuPi3ZumEg xen/arch/i386/extable.c
+3fe443fdDDb0Sw6NQBCk4GQapayfTA xen/arch/i386/flushtlb.c
3ddb79bcesE5E-lS4QhRhlqXxqj9cA xen/arch/i386/i387.c
3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/i386/i8259.c
3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/i386/idle0_task.c
diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c
index a0176edfc1..7e5c57bb0d 100644
--- a/tools/xc/lib/xc_linux_build.c
+++ b/tools/xc/lib/xc_linux_build.c
@@ -106,12 +106,12 @@ static int setup_guestos(int xc_handle,
const char *cmdline,
unsigned long shared_info_frame)
{
- l1_pgentry_t *vl1tab = NULL, *vl1e = NULL;
- l2_pgentry_t *vl2tab = NULL, *vl2e = NULL;
+ l1_pgentry_t *vl1tab;
+ l2_pgentry_t *vl2tab;
unsigned long *page_array = NULL;
mmu_update_t *pgt_update_arr = NULL, *pgt_updates = NULL;
int alloc_index, num_pt_pages;
- unsigned long l2tab;
+ unsigned long l2tab, l2e, l1e=0;
unsigned long l1tab = 0;
unsigned long num_pgt_updates = 0;
unsigned long count, pt_start, i, j;
@@ -230,44 +230,46 @@ static int setup_guestos(int xc_handle,
if ( (vl2tab = map_pfn(pm_handle, l2tab >> PAGE_SHIFT)) == NULL )
goto error_out;
memset(vl2tab, 0, PAGE_SIZE);
- vl2e = vl2tab + l2_table_offset(virt_load_addr);
+ unmap_pfn(pm_handle, vl2tab);
+ l2e = l2tab + (l2_table_offset(virt_load_addr)*sizeof(l2_pgentry_t));
for ( count = 0; count < tot_pages; count++ )
{
- if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
+ if ( (l1e & (PAGE_SIZE-1)) == 0 )
{
l1tab = page_array[alloc_index] << PAGE_SHIFT;
if ( (vl1tab = map_pfn(pm_handle, l1tab >> PAGE_SHIFT)) == NULL )
goto error_out;
memset(vl1tab, 0, PAGE_SIZE);
+ unmap_pfn(pm_handle, vl1tab);
alloc_index--;
- vl1e = vl1tab + l1_table_offset(virt_load_addr +
- (count << PAGE_SHIFT));
+ l1e = l1tab + (l1_table_offset(virt_load_addr+(count<<PAGE_SHIFT))*
+ sizeof(l1_pgentry_t));
/* make apropriate entry in the page directory */
- pgt_updates->ptr = (unsigned long)vl2e;
+ pgt_updates->ptr = l2e;
pgt_updates->val = l1tab | L2_PROT;
pgt_updates++;
num_pgt_updates++;
- vl2e++;
+ l2e += sizeof(l2_pgentry_t);
}
if ( count < pt_start )
{
- pgt_updates->ptr = (unsigned long)vl1e;
+ pgt_updates->ptr = l1e;
pgt_updates->val = (page_array[count] << PAGE_SHIFT) | L1_PROT;
pgt_updates++;
num_pgt_updates++;
- vl1e++;
+ l1e += sizeof(l1_pgentry_t);
}
else
{
- pgt_updates->ptr = (unsigned long)vl1e;
+ pgt_updates->ptr = l1e;
pgt_updates->val =
((page_array[count] << PAGE_SHIFT) | L1_PROT) & ~_PAGE_RW;
pgt_updates++;
num_pgt_updates++;
- vl1e++;
+ l1e += sizeof(l1_pgentry_t);
}
pgt_updates->ptr =
diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c
index 2418d97219..44ebe3c940 100644
--- a/tools/xc/lib/xc_linux_restore.c
+++ b/tools/xc/lib/xc_linux_restore.c
@@ -301,7 +301,8 @@ int xc_linux_restore(int xc_handle,
page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
}
if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
- (unsigned long)&ppage[j], page[j]) )
+ (mfn<<PAGE_SHIFT)+(j*sizeof(l1_pgentry_t)),
+ page[j]) )
goto out;
}
break;
@@ -337,7 +338,8 @@ int xc_linux_restore(int xc_handle,
page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
}
if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
- (unsigned long)&ppage[j], page[j]) )
+ (mfn<<PAGE_SHIFT)+(j*sizeof(l2_pgentry_t)),
+ page[j]) )
goto out;
}
break;
@@ -345,9 +347,6 @@ int xc_linux_restore(int xc_handle,
memcpy(ppage, page, PAGE_SIZE);
break;
}
- /* NB. Must flush before unmapping page, as pass VAs to Xen. */
- if ( flush_mmu_updates(xc_handle, mmu_updates, &mmu_update_idx) )
- goto out;
unmap_pfn(pm_handle, ppage);
if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c
index 463efb7acb..e5f5934cff 100644
--- a/tools/xc/lib/xc_linux_save.c
+++ b/tools/xc/lib/xc_linux_save.c
@@ -44,19 +44,20 @@ static int check_pfn_ownership(int xc_handle,
{
dom0_op_t op;
op.cmd = DOM0_GETPAGEFRAMEINFO;
- op.u.getpageframeinfo.pfn = mfn;
- if ( (do_dom0_op(xc_handle, &op) < 0) ||
- (op.u.getpageframeinfo.domain != dom) )
- return 0;
- return 1;
+ op.u.getpageframeinfo.pfn = mfn;
+ op.u.getpageframeinfo.domain = dom;
+ return (do_dom0_op(xc_handle, &op) >= 0);
}
#define GETPFN_ERR (~0U)
-static unsigned int get_pfn_type(int xc_handle, unsigned long mfn)
+static unsigned int get_pfn_type(int xc_handle,
+ unsigned long mfn,
+ unsigned int dom)
{
dom0_op_t op;
op.cmd = DOM0_GETPAGEFRAMEINFO;
- op.u.getpageframeinfo.pfn = mfn;
+ op.u.getpageframeinfo.pfn = mfn;
+ op.u.getpageframeinfo.domain = dom;
if ( do_dom0_op(xc_handle, &op) < 0 )
{
PERROR("Unexpected failure when getting page frame info!");
@@ -259,7 +260,8 @@ int xc_linux_save(int xc_handle,
mfn_to_pfn_table[mfn] = i;
/* Query page type by MFN, but store it by PFN. */
- if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn)) == GETPFN_ERR )
+ if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn, domid)) ==
+ GETPFN_ERR )
goto out;
}
diff --git a/xen/GUEST_CHANGES b/xen/GUEST_CHANGES
deleted file mode 100644
index b9f25d49cd..0000000000
--- a/xen/GUEST_CHANGES
+++ /dev/null
@@ -1,26 +0,0 @@
-
-The interface between Xen and overlying guest OSes has changed in the
-following ways since version 1.0:
-
-Modified hypercall 'pt_update'
-------------------------------
-Page-table updates passed to the 'pt_update' hypercall must now
-specify a virtual address that maps the PTE to be modified. Previously
-a physical address was used, requiring Xen to temporarily map the PTE
-into its own private region so that it could be read and written.
-This affects only commands of type PGREQ_NORMAL_UPDATE and
-PGREQ_UNCHECKED_UPDATE.
-
-New hypercall 'update_va_mapping'
----------------------------------
-A new high-speed page-table update method has been introduced, which
-may be of particular benefit when fixing up application page faults.
-Invoked as 'update_va_mapping(page_number, new_pte_value, flags)':
- <page_number>: The virtual page number in the current address space
- whose PTE is to be modified.
- <new_pte_value>: The new value to write into the PTE.
- <flags>: An ORed combination of
- UVMF_INVLPG: Flush stale TLB entry of the updated page mapping
- UVMF_FLUSH_TLB: Flush all TLB entries
-You can see this new call in use in Xenolinux (common/memory.c).
-
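The deleted note above documents 'update_va_mapping' as an interface change. Purely as an
illustrative sketch, and not part of this patch, a guest fault handler might invoke the call
roughly as follows; the wrapper name HYPERVISOR_update_va_mapping() and the flag values are
assumptions for illustration, the real definitions being those in hypervisor-if.h and the
guest's hypercall stubs:

    /* Sketch only: fix up a copy-on-write fault via the fast-path hypercall. */
    #define UVMF_INVLPG     1UL   /* assumed value: flush the one stale TLB entry */
    #define UVMF_FLUSH_TLB  2UL   /* assumed value: flush the entire TLB          */

    /* Assumed prototype for the guest OS's hypercall wrapper. */
    extern int HYPERVISOR_update_va_mapping(unsigned long page_nr,
                                            unsigned long new_pte,
                                            unsigned long flags);

    static int fixup_cow_fault(unsigned long fault_va, unsigned long new_frame_pa)
    {
        unsigned long page_nr = fault_va >> 12;        /* virtual page number (PAGE_SHIFT=12) */
        unsigned long new_pte = new_frame_pa | 0x67;   /* present|rw|user|accessed|dirty      */

        /* Only a single mapping went stale, so UVMF_INVLPG is sufficient. */
        return HYPERVISOR_update_va_mapping(page_nr, new_pte, UVMF_INVLPG);
    }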
diff --git a/xen/TODO b/xen/TODO
deleted file mode 100644
index 5eead81b89..0000000000
--- a/xen/TODO
+++ /dev/null
@@ -1,54 +0,0 @@
-
-This is stuff we probably want to implement in the near future.
-
- -- Keir (16/3/03)
-
-
-1. DOMAIN-0 MANAGEMENT DAEMON
------------------------------
-A better control daemon is required for domain 0, which keeps proper
-track of machine resources and can make sensible policy choices. This
-may require support in Xen; for example, notifications (eg. DOMn is
-killed), and requests (eg. can DOMn allocate x frames of memory?).
-
-2. ASSIGNING DOMAINS TO PROCESSORS
-----------------------------------
-More intelligent assignment of domains to processors. In
-particular, we don't play well with hyperthreading: we will assign
-domains to virtual processors on the same package, rather then
-spreading them across processor packages.
-
-What we need to do is port code from Linux which stores information on
-relationships between processors in the system (eg. which ones are
-siblings in the same package). We then use this to balance domains
-across packages, and across virtual processors within a package.
-
-3. SANE NETWORK ROUTING
------------------------
-The current virtual firewall/router is completely broken. Needs a new
-design and implementation!
-
-
-
-Graveyard
-*********
-
-The hypervisor page cache
--------------------------
-This will allow guest OSes to make use of spare pages in the system, but
-allow them to be immediately used for any new domains or memory requests.
-The idea is that, when a page is laundered and falls off Linux's clean_LRU
-list, rather than freeing it it becomes a candidate for passing down into
-the hypervisor. In return, xeno-linux may ask for one of its previously-
-cached pages back:
- (page, new_id) = cache_query(page, old_id);
-If the requested page couldn't be kept, a blank page is returned.
-When would Linux make the query? Whenever it wants a page back without
-the delay or going to disc. Also, whenever a page would otherwise be
-flushed to disc.
-
-To try and add to the cache: (blank_page, new_id) = cache_query(page, NULL);
- [NULL means "give me a blank page"].
-To try and retrieve from the cache: (page, new_id) = cache_query(x_page, id)
- [we may request that x_page just be discarded, and therefore not impinge
- on this domain's cache quota].
diff --git a/xen/arch/i386/Rules.mk b/xen/arch/i386/Rules.mk
index e137a1abd3..4d00a727ec 100644
--- a/xen/arch/i386/Rules.mk
+++ b/xen/arch/i386/Rules.mk
@@ -8,8 +8,8 @@ MONITOR_BASE := 0xFC500000
# Bootloader should load monitor to this real address
LOAD_BASE := 0x00100000
CFLAGS := -nostdinc -fno-builtin -O3 -Wall -DMONITOR_BASE=$(MONITOR_BASE)
-CFLAGS += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG
-#CFLAGS += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__
+#CFLAGS += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG
+CFLAGS += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__
LDFLAGS := -T xeno.lds -N
diff --git a/xen/arch/i386/apic.c b/xen/arch/i386/apic.c
index 8a3a6b5cf8..b3cd649c9c 100644
--- a/xen/arch/i386/apic.c
+++ b/xen/arch/i386/apic.c
@@ -47,7 +47,7 @@
#include <asm/hardirq.h>
#include <asm/apic.h>
#include <xeno/mm.h>
-
+#include <asm/io_apic.h>
#include <asm/timex.h>
#include <xeno/ac_timer.h>
#include <xeno/perfc.h>
diff --git a/xen/arch/i386/entry.S b/xen/arch/i386/entry.S
index e06c565de7..dc55e35041 100644
--- a/xen/arch/i386/entry.S
+++ b/xen/arch/i386/entry.S
@@ -82,7 +82,6 @@
#include <xeno/config.h>
#include <xeno/errno.h>
#include <hypervisor-ifs/hypervisor-if.h>
-#include <asm/smp.h>
EBX = 0x00
ECX = 0x04
diff --git a/xen/arch/i386/flushtlb.c b/xen/arch/i386/flushtlb.c
new file mode 100644
index 0000000000..fc543ebce7
--- /dev/null
+++ b/xen/arch/i386/flushtlb.c
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * flushtlb.c
+ *
+ * TLB flushes are timestamped using a global virtual 'clock' which ticks
+ * on any TLB flush on any processor.
+ *
+ * Copyright (c) 2003, K A Fraser
+ */
+
+#include <xeno/config.h>
+#include <xeno/sched.h>
+#include <asm/flushtlb.h>
+
+unsigned long tlbflush_mask;
+unsigned long tlbflush_clock;
+unsigned long tlbflush_time[NR_CPUS];
+
+static inline void tlb_clocktick(unsigned int cpu)
+{
+ unsigned long x, nx, y, ny;
+
+ clear_bit(cpu, &tlbflush_mask);
+
+ /* Tick the clock. 'y' contains the current time after the tick. */
+ ny = tlbflush_clock;
+ do {
+#ifdef CONFIG_SMP
+ if ( unlikely(((y = ny+1) & (GLOBAL_FLUSH_PERIOD - 1)) == 0) )
+ {
+ new_tlbflush_clock_period();
+ y = tlbflush_clock;
+ break;
+ }
+#else
+ y = ny+1;
+#endif
+ }
+ while ( unlikely((ny = cmpxchg(&tlbflush_clock, y-1, y)) != y-1) );
+
+ /* Update cpu's timestamp to current time, unless someone else beats us. */
+ nx = tlbflush_time[cpu];
+ do {
+ if ( unlikely((x = nx) >= y) )
+ break;
+ }
+ while ( unlikely((nx = cmpxchg(&tlbflush_time[cpu], x, y)) != x) );
+}
+
+void write_cr3_counted(unsigned long pa)
+{
+ __asm__ __volatile__ (
+ "movl %0, %%cr3"
+ : : "r" (pa) : "memory" );
+ tlb_clocktick(smp_processor_id());
+}
+
+void flush_tlb_counted(void)
+{
+ __asm__ __volatile__ (
+ "movl %%cr3, %%eax; movl %%eax, %%cr3"
+ : : : "memory", "eax" );
+ tlb_clocktick(smp_processor_id());
+}
+
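The comment at the top of flushtlb.c describes the virtual flush clock; its consumer side
appears later in this patch, where alloc_domain_page tests NEED_FLUSH against tlbflush_time[].
The following sketch, with an assumed NEED_FLUSH definition rather than the one in
asm-i386/flushtlb.h, shows the intended use: a CPU only needs flushing if it has not flushed
its TLB since the page's timestamp was taken.

    /* Sketch only: assumed definitions, not taken from this patch. */
    #define NR_CPUS 32
    #define NEED_FLUSH(cpu_stamp, page_stamp) ((cpu_stamp) <= (page_stamp))

    /* Drop CPUs from a page's stale-mapping mask if they have flushed since the
     * page was last stamped; whatever remains needs flush_tlb_mask(). */
    static unsigned long prune_flush_mask(unsigned long mask,
                                          unsigned long page_stamp,
                                          const unsigned long tlbflush_time[NR_CPUS])
    {
        int i;
        for ( i = 0; i < NR_CPUS; i++ )
            if ( (mask & (1UL << i)) && !NEED_FLUSH(tlbflush_time[i], page_stamp) )
                mask &= ~(1UL << i);
        return mask;
    }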
diff --git a/xen/arch/i386/io_apic.c b/xen/arch/i386/io_apic.c
index 951763a053..7369966dd8 100644
--- a/xen/arch/i386/io_apic.c
+++ b/xen/arch/i386/io_apic.c
@@ -28,6 +28,8 @@
#include <xeno/config.h>
#include <asm/mc146818rtc.h>
#include <asm/io.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
#include <asm/smp.h>
#include <asm/desc.h>
#include <asm/smpboot.h>
diff --git a/xen/arch/i386/ioremap.c b/xen/arch/i386/ioremap.c
index 06c09f8520..c650d0b5d8 100644
--- a/xen/arch/i386/ioremap.c
+++ b/xen/arch/i386/ioremap.c
@@ -15,92 +15,50 @@
#include <asm/pgalloc.h>
#include <asm/page.h>
-static unsigned long remap_base = 0;
+static unsigned long remap_base = IOREMAP_VIRT_START;
#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
-static void new_l2e(l2_pgentry_t *pl2e)
-{
- l1_pgentry_t *pl1e = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
- if ( !pl1e ) BUG();
- clear_page(pl1e);
- *pl2e = mk_l2_pgentry(__pa(pl1e)|__PAGE_HYPERVISOR);
-}
-
-
-void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
+void * __ioremap(unsigned long phys_addr,
+ unsigned long size,
+ unsigned long flags)
{
unsigned long vaddr;
unsigned long offset, cur=0, last_addr;
l2_pgentry_t *pl2e;
l1_pgentry_t *pl1e;
- /* First time through, start allocating from far end of virtual memory. */
- if ( !remap_base ) remap_base = IOREMAP_VIRT_START;
-
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr)
+ if ( (size == 0) || (last_addr < phys_addr) )
return NULL;
- /*
- * Don't remap the low PCI/ISA area, it's always mapped..
- */
- if (phys_addr >= 0xA0000 && last_addr < 0x100000)
+ /* Don't remap the low PCI/ISA area: it's always mapped. */
+ if ( (phys_addr >= 0xA0000) && (last_addr < 0x100000) )
return phys_to_virt(phys_addr);
- if(remap_base + size > IOREMAP_VIRT_END-1) {
- printk("ioremap: going past end of reserved space!\n");
- return NULL;
- }
-#if 0
- /*
- * Don't allow anybody to remap normal RAM that we're using..
- */
- if (phys_addr < virt_to_phys(high_memory)) {
- char *t_addr, *t_end;
- struct pfn_info *page;
-
- t_addr = __va(phys_addr);
- t_end = t_addr + (size - 1);
-
- for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
- if(!PageReserved(page))
- return NULL;
+ if ( (remap_base + size) > (IOREMAP_VIRT_END - 1) )
+ {
+ printk("ioremap: going past end of reserved space!\n");
+ return NULL;
}
-#endif
- /*
- * Mappings have to be page-aligned
- */
+ /* Mappings have to be page-aligned. */
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr) - phys_addr;
- /*
- * Ok, go for it..
- */
+ /* Ok, go for it. */
vaddr = remap_base;
remap_base += size;
pl2e = &idle_pg_table[l2_table_offset(vaddr)];
- if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e);
pl1e = l2_pgentry_to_l1(*pl2e++) + l1_table_offset(vaddr);
- for ( ; ; )
- {
- if ( !l1_pgentry_empty(*pl1e) ) BUG();
+ do {
*pl1e++ = mk_l1_pgentry((phys_addr+cur)|PAGE_HYPERVISOR|flags);
- cur += PAGE_SIZE;
- if ( cur == size ) break;
- if ( !((unsigned long)pl1e & (PAGE_SIZE-1)) )
- {
- if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e);
- pl1e = l2_pgentry_to_l1(*pl2e++);
- }
}
+ while ( (cur += PAGE_SIZE) != size );
- flush_tlb_all();
-
- return (void *) (offset + (char *)vaddr);
+ return (void *)(offset + (char *)vaddr);
}
void iounmap(void *addr)
diff --git a/xen/arch/i386/irq.c b/xen/arch/i386/irq.c
index 2793eba3d7..cd1bcc6b3c 100644
--- a/xen/arch/i386/irq.c
+++ b/xen/arch/i386/irq.c
@@ -24,7 +24,8 @@
#include <xeno/interrupt.h>
#include <xeno/irq.h>
#include <xeno/slab.h>
-
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
#include <asm/msr.h>
#include <asm/hardirq.h>
#include <asm/ptrace.h>
diff --git a/xen/arch/i386/mm.c b/xen/arch/i386/mm.c
index 5df703de7a..84ef14cf8f 100644
--- a/xen/arch/i386/mm.c
+++ b/xen/arch/i386/mm.c
@@ -27,8 +27,8 @@
#include <asm/fixmap.h>
#include <asm/domain_page.h>
-static inline void set_pte_phys (unsigned long vaddr,
- l1_pgentry_t entry)
+static inline void set_pte_phys(unsigned long vaddr,
+ l1_pgentry_t entry)
{
l2_pgentry_t *l2ent;
l1_pgentry_t *l1ent;
@@ -41,20 +41,22 @@ static inline void set_pte_phys (unsigned long vaddr,
__flush_tlb_one(vaddr);
}
-void __set_fixmap (enum fixed_addresses idx,
- l1_pgentry_t entry)
+
+void __set_fixmap(enum fixed_addresses idx,
+ l1_pgentry_t entry)
{
unsigned long address = __fix_to_virt(idx);
- if (idx >= __end_of_fixed_addresses) {
+ if ( likely(idx < __end_of_fixed_addresses) )
+ set_pte_phys(address, entry);
+ else
printk("Invalid __set_fixmap\n");
- return;
- }
- set_pte_phys(address, entry);
}
-static void __init fixrange_init (unsigned long start,
- unsigned long end, l2_pgentry_t *pg_base)
+
+static void __init fixrange_init(unsigned long start,
+ unsigned long end,
+ l2_pgentry_t *pg_base)
{
l2_pgentry_t *l2e;
int i;
@@ -66,7 +68,8 @@ static void __init fixrange_init (unsigned long start,
for ( ; (i < ENTRIES_PER_L2_PAGETABLE) && (vaddr != end); l2e++, i++ )
{
- if ( !l2_pgentry_empty(*l2e) ) continue;
+ if ( !l2_pgentry_empty(*l2e) )
+ continue;
page = (unsigned long)get_free_page(GFP_KERNEL);
clear_page(page);
*l2e = mk_l2_pgentry(__pa(page) | __PAGE_HYPERVISOR);
@@ -79,11 +82,6 @@ void __init paging_init(void)
unsigned long addr;
void *ioremap_pt;
- /* XXX initialised in boot.S */
- /*if ( cpu_has_pge ) set_in_cr4(X86_CR4_PGE);*/
- /*if ( cpu_has_pse ) set_in_cr4(X86_CR4_PSE);*/
- /*if ( cpu_has_pae ) set_in_cr4(X86_CR4_PAE);*/
-
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
@@ -115,12 +113,12 @@ void __init paging_init(void)
}
-void __init zap_low_mappings (void)
+void __init zap_low_mappings(void)
{
int i;
for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
idle_pg_table[i] = mk_l2_pgentry(0);
- flush_tlb_all();
+ flush_tlb_all_pge();
}
@@ -212,86 +210,54 @@ long set_gdt(struct task_struct *p,
unsigned int entries)
{
/* NB. There are 512 8-byte entries per GDT page. */
- unsigned int i, j, nr_pages = (entries + 511) / 512;
- unsigned long pfn, *gdt_page;
- long ret = -EINVAL;
- struct pfn_info *page;
+ int i, nr_pages = (entries + 511) / 512;
+ unsigned long pfn;
struct desc_struct *vgdt;
- spin_lock(&p->page_lock);
-
/* Check the new GDT. */
for ( i = 0; i < nr_pages; i++ )
{
- if ( frames[i] >= max_page )
- goto out;
-
- page = frame_table + frames[i];
- if ( (page->flags & PG_domain_mask) != p->domain )
- goto out;
-
- if ( (page->flags & PG_type_mask) != PGT_gdt_page )
- {
- if ( page_type_count(page) != 0 )
- goto out;
-
- /* Check all potential GDT entries in the page. */
- gdt_page = map_domain_mem(frames[0] << PAGE_SHIFT);
- for ( j = 0; j < 512; j++ )
- if ( !check_descriptor(gdt_page[j*2], gdt_page[j*2+1]) )
- goto out;
- unmap_domain_mem(gdt_page);
- }
+ if ( unlikely(frames[i] >= max_page) ||
+ unlikely(!get_page_and_type(&frame_table[frames[i]],
+ p, PGT_gdt_page)) )
+ goto fail;
}
+ /* Copy reserved GDT entries to the new GDT. */
+ vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
+ memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
+ gdt_table + FIRST_RESERVED_GDT_ENTRY,
+ NR_RESERVED_GDT_ENTRIES*8);
+ unmap_domain_mem(vgdt);
+
/* Tear down the old GDT. */
for ( i = 0; i < 16; i++ )
{
- pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i]);
+ if ( (pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i])) != 0 )
+ put_page_and_type(&frame_table[pfn]);
p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
- if ( pfn == 0 ) continue;
- page = frame_table + pfn;
- ASSERT((page->flags & PG_type_mask) == PGT_gdt_page);
- ASSERT((page->flags & PG_domain_mask) == p->domain);
- ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0));
- put_page_type(page);
- put_page_tot(page);
}
/* Install the new GDT. */
for ( i = 0; i < nr_pages; i++ )
- {
p->mm.perdomain_pt[i] =
mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-
- page = frame_table + frames[i];
- page->flags &= ~(PG_type_mask | PG_need_flush);
- page->flags |= PGT_gdt_page;
- get_page_type(page);
- get_page_tot(page);
- }
-
- /* Copy reserved GDT entries to the new GDT. */
- vgdt = map_domain_mem(frames[i] << PAGE_SHIFT);
- memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY,
- gdt_table + FIRST_RESERVED_GDT_ENTRY,
- NR_RESERVED_GDT_ENTRIES*8);
- unmap_domain_mem(vgdt);
SET_GDT_ADDRESS(p, GDT_VIRT_START);
SET_GDT_ENTRIES(p, (entries*8)-1);
- ret = 0; /* success */
+ return 0;
- out:
- spin_unlock(&p->page_lock);
- return ret;
+ fail:
+ while ( i-- > 0 )
+ put_page_and_type(&frame_table[frames[i]]);
+ return -EINVAL;
}
long do_set_gdt(unsigned long *frame_list, unsigned int entries)
{
- unsigned int nr_pages = (entries + 511) / 512;
+ int nr_pages = (entries + 511) / 512;
unsigned long frames[16];
long ret;
@@ -321,14 +287,12 @@ long do_update_descriptor(
if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) )
return -EINVAL;
- spin_lock(&current->page_lock);
-
- page = frame_table + pfn;
- if ( (page->flags & PG_domain_mask) != current->domain )
+ page = &frame_table[pfn];
+ if ( unlikely(!get_page(page, current)) )
goto out;
/* Check if the given frame is in use in an unsafe context. */
- switch ( (page->flags & PG_type_mask) )
+ switch ( page->type_and_flags & PGT_type_mask )
{
case PGT_gdt_page:
/* Disallow updates of Xen-reserved descriptors in the current GDT. */
@@ -336,12 +300,17 @@ long do_update_descriptor(
(((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
(((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
goto out;
+ if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
+ goto out;
+ break;
case PGT_ldt_page:
- case PGT_writeable_page:
+ if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
+ goto out;
break;
default:
- if ( page_type_count(page) != 0 )
+ if ( unlikely(!get_page_type(page, PGT_writeable_page)) )
goto out;
+ break;
}
/* All is good so make the update. */
@@ -350,9 +319,11 @@ long do_update_descriptor(
gdt_pent[1] = word2;
unmap_domain_mem(gdt_pent);
+ put_page_type(page);
+
ret = 0; /* success */
out:
- spin_unlock(&current->page_lock);
+ put_page(page);
return ret;
}
diff --git a/xen/arch/i386/pci-irq.c b/xen/arch/i386/pci-irq.c
index b7a212b014..2c68d9d3b3 100644
--- a/xen/arch/i386/pci-irq.c
+++ b/xen/arch/i386/pci-irq.c
@@ -6,16 +6,15 @@
#include <linux/config.h>
#include <linux/types.h>
-/*#include <linux/kernel.h>*/
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/sched.h>
-
#include <asm/io.h>
#include <asm/smp.h>
+#include <asm/mpspec.h>
#include <asm/io_apic.h>
#include "pci-i386.h"
diff --git a/xen/arch/i386/process.c b/xen/arch/i386/process.c
index 4f7d16d761..e75ee1e050 100644
--- a/xen/arch/i386/process.c
+++ b/xen/arch/i386/process.c
@@ -27,6 +27,7 @@
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/i387.h>
+#include <asm/mpspec.h>
#include <xeno/irq.h>
#include <xeno/event.h>
@@ -263,7 +264,7 @@ void switch_to(struct task_struct *prev_p, struct task_struct *next_p)
tss->ss1 = next->ss1;
/* Switch page tables. */
- __write_cr3_counted(pagetable_val(next_p->mm.pagetable));
+ write_cr3_counted(pagetable_val(next_p->mm.pagetable));
set_current(next_p);
diff --git a/xen/arch/i386/smp.c b/xen/arch/i386/smp.c
index b1dfe64d4f..4ec5176194 100644
--- a/xen/arch/i386/smp.c
+++ b/xen/arch/i386/smp.c
@@ -16,6 +16,7 @@
#include <asm/mc146818rtc.h>
#include <asm/pgalloc.h>
#include <asm/smpboot.h>
+#include <asm/hardirq.h>
#ifdef CONFIG_SMP
@@ -264,34 +265,67 @@ static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED;
asmlinkage void smp_invalidate_interrupt(void)
{
ack_APIC_irq();
- if (test_and_clear_bit(smp_processor_id(), &flush_cpumask))
- local_flush_tlb();
+ clear_bit(smp_processor_id(), &flush_cpumask);
+ local_flush_tlb();
}
-void flush_tlb_others(unsigned long cpumask)
+void flush_tlb_mask(unsigned long mask)
{
- spin_lock(&tlbstate_lock);
- atomic_set_mask(cpumask, &flush_cpumask);
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
- while (flush_cpumask) continue;
- spin_unlock(&tlbstate_lock);
+ if ( unlikely(in_irq()) )
+ BUG();
+
+ if ( mask & (1 << smp_processor_id()) )
+ {
+ local_flush_tlb();
+ mask &= ~(1 << smp_processor_id());
+ }
+
+ if ( mask != 0 )
+ {
+ spin_lock(&tlbstate_lock);
+ flush_cpumask = mask;
+ send_IPI_mask(mask, INVALIDATE_TLB_VECTOR);
+ while ( flush_cpumask != 0 )
+ {
+ rep_nop();
+ barrier();
+ }
+ spin_unlock(&tlbstate_lock);
+ }
}
-
-static inline void do_flush_tlb_all_local(void)
+
+void new_tlbflush_clock_period(void)
{
- __flush_tlb_all();
+ if ( unlikely(!spin_trylock(&tlbstate_lock)) )
+ return;
+
+ if ( unlikely((flush_cpumask = tlbflush_mask) != 0) )
+ {
+ send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR);
+ while ( flush_cpumask != 0 )
+ {
+ rep_nop();
+ barrier();
+ }
+ }
+
+ /* No need for cmpxchg updates here: we are protected by tlbstate lock. */
+ tlbflush_mask = (1 << smp_num_cpus) - 1;
+ wmb(); /* Reset the mask before allowing the clock to continue ticking. */
+ tlbflush_clock++;
+
+ spin_unlock(&tlbstate_lock);
}
-static void flush_tlb_all_ipi(void* info)
+static void flush_tlb_all_pge_ipi(void* info)
{
- do_flush_tlb_all_local();
+ __flush_tlb_pge();
}
-void flush_tlb_all(void)
+void flush_tlb_all_pge(void)
{
- smp_call_function (flush_tlb_all_ipi,0,1,1);
-
- do_flush_tlb_all_local();
+ smp_call_function (flush_tlb_all_pge_ipi,0,1,1);
+ __flush_tlb_pge();
}
void smp_send_event_check_mask(unsigned long cpu_mask)
diff --git a/xen/arch/i386/smpboot.c b/xen/arch/i386/smpboot.c
index 506ec09cb9..b5a4249003 100644
--- a/xen/arch/i386/smpboot.c
+++ b/xen/arch/i386/smpboot.c
@@ -44,6 +44,8 @@
#include <xeno/smp.h>
#include <asm/msr.h>
#include <asm/system.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
#include <xeno/sched.h>
#include <xeno/delay.h>
#include <xeno/lib.h>
diff --git a/xen/arch/i386/traps.c b/xen/arch/i386/traps.c
index 330defe3a8..78c26c37cc 100644
--- a/xen/arch/i386/traps.c
+++ b/xen/arch/i386/traps.c
@@ -211,6 +211,7 @@ static inline void do_trap(int trapnr, char *str,
if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
{
+ DPRINTK("Trap %d: %08lx -> %08lx\n", trapnr, regs->eip, fixup);
regs->eip = fixup;
regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
return;
@@ -328,6 +329,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code)
if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
{
+ DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup);
regs->eip = fixup;
regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
return;
@@ -411,6 +413,7 @@ asmlinkage void do_general_protection(struct pt_regs *regs, long error_code)
if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
{
+ DPRINTK("GPF (%04lx): %08lx -> %08lx\n", error_code, regs->eip, fixup);
regs->eip = fixup;
regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
return;
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c
index 2f3073a1c4..5b24d7b5c9 100644
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -38,31 +38,6 @@ static unsigned int get_domnr(void)
return 0;
}
-static void build_page_list(struct task_struct *p)
-{
- unsigned long *list;
- unsigned long curr;
- struct list_head *list_ent;
-
- curr = list_entry(p->pg_head.next, struct pfn_info, list) - frame_table;
- list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT);
-
- list_for_each(list_ent, &p->pg_head)
- {
- *list++ = list_entry(list_ent, struct pfn_info, list) - frame_table;
-
- if( ((unsigned long)list & ~PAGE_MASK) == 0 )
- {
- struct list_head *ent = frame_table[curr].list.next;
- curr = list_entry(ent, struct pfn_info, list) - frame_table;
- unmap_domain_mem(list-1);
- list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT);
- }
- }
-
- unmap_domain_mem(list);
-}
-
static int msr_cpu_mask;
static unsigned long msr_addr;
static unsigned long msr_lo;
@@ -164,8 +139,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
goto exit_create;
}
- build_page_list(p);
-
ret = p->domain;
op.u.createdomain.domain = ret;
@@ -245,7 +218,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
case DOM0_GETMEMLIST:
{
int i;
- struct task_struct * p = find_domain_by_id(op.u.getmemlist.domain);
+ struct task_struct *p = find_domain_by_id(op.u.getmemlist.domain);
unsigned long max_pfns = op.u.getmemlist.max_pfns;
unsigned long pfn;
unsigned long *buffer = op.u.getmemlist.buffer;
@@ -254,28 +227,27 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
ret = -EINVAL;
if ( p != NULL )
{
- list_ent = p->pg_head.next;
- pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
-
- for ( i = 0; (i < max_pfns) && (list_ent != &p->pg_head); i++ )
+ ret = 0;
+
+ spin_lock(&p->page_list_lock);
+ list_ent = p->page_list.next;
+ for ( i = 0; (i < max_pfns) && (list_ent != &p->page_list); i++ )
{
+ pfn = list_entry(list_ent, struct pfn_info, list) -
+ frame_table;
if ( put_user(pfn, buffer) )
{
ret = -EFAULT;
- goto out_getmemlist;
+ break;
}
buffer++;
list_ent = frame_table[pfn].list.next;
- pfn = list_entry(list_ent, struct pfn_info, list) -
- frame_table;
}
+ spin_unlock(&p->page_list_lock);
op.u.getmemlist.num_pfns = i;
copy_to_user(u_dom0_op, &op, sizeof(op));
-
- ret = 0;
-
- out_getmemlist:
+
put_task_struct(p);
}
}
@@ -368,21 +340,24 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
{
struct pfn_info *page;
unsigned long pfn = op.u.getpageframeinfo.pfn;
-
- if ( pfn >= max_page )
- {
- ret = -EINVAL;
- }
- else
+ unsigned int dom = op.u.getpageframeinfo.domain;
+ struct task_struct *p;
+
+ ret = -EINVAL;
+
+ if ( unlikely(pfn >= max_page) ||
+ unlikely((p = find_domain_by_id(dom)) == NULL) )
+ break;
+
+ page = &frame_table[pfn];
+
+ if ( likely(get_page(page, p)) )
{
- page = frame_table + pfn;
-
- op.u.getpageframeinfo.domain = page->flags & PG_domain_mask;
- op.u.getpageframeinfo.type = NONE;
+ op.u.getpageframeinfo.type = NONE;
- if ( page_type_count(page) != 0 )
+ if ( (page->type_and_flags & PGT_count_mask) != 0 )
{
- switch ( page->flags & PG_type_mask )
+ switch ( page->type_and_flags & PGT_type_mask )
{
case PGT_l1_page_table:
op.u.getpageframeinfo.type = L1TAB;
@@ -392,9 +367,13 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
break;
}
}
-
- copy_to_user(u_dom0_op, &op, sizeof(op));
+
+ put_page(page);
}
+
+ put_task_struct(p);
+
+ copy_to_user(u_dom0_op, &op, sizeof(op));
}
break;
diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c
index c8869882ae..53facf8477 100644
--- a/xen/common/dom_mem_ops.c
+++ b/xen/common/dom_mem_ops.c
@@ -16,58 +16,26 @@
#include <xeno/event.h>
#include <asm/domain_page.h>
-#if 0
-#define DPRINTK(_f, _a...) printk( _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op)
{
- struct list_head *temp;
- struct pfn_info *pf; /* pfn_info of current page */
+ struct pfn_info *page;
unsigned long mpfn; /* machine frame number of current page */
void *va; /* Xen-usable mapping of current page */
unsigned long i;
- unsigned long flags;
-
- /*
- * POLICY DECISION: Each domain has a page limit.
- * NB. The first part of test is because op.size could be so big that
- * tot_pages + op.size overflows a u_long.
- */
- if( (op.size > p->max_pages) ||
- ((p->tot_pages + op.size) > p->max_pages) )
- return -ENOMEM;
-
- spin_lock_irqsave(&free_list_lock, flags);
-
- if ( free_pfns < (op.size + (SLACK_DOMAIN_MEM_KILOBYTES >>
- (PAGE_SHIFT-10))) )
- {
- spin_unlock_irqrestore(&free_list_lock, flags);
- return -ENOMEM;
- }
- spin_lock(&p->page_lock);
-
- temp = free_list.next;
for ( i = 0; i < op.size; i++ )
{
- /* Get a free page and add it to the domain's page list. */
- pf = list_entry(temp, struct pfn_info, list);
- pf->flags |= p->domain;
- set_page_type_count(pf, 0);
- set_page_tot_count(pf, 0);
- temp = temp->next;
- list_del(&pf->list);
- list_add_tail(&pf->list, &p->pg_head);
- free_pfns--;
-
- p->tot_pages++;
-
+ /* Leave some slack pages; e.g., for the network. */
+ if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >>
+ (PAGE_SHIFT-10))) )
+ break;
+
+ /* NB. 'alloc_domain_page' does limit checking on pages per domain. */
+ if ( unlikely((page = alloc_domain_page(p)) == NULL) )
+ break;
+
/* Inform the domain of the new page's machine address. */
- mpfn = (unsigned long)(pf - frame_table);
+ mpfn = (unsigned long)(page - frame_table);
copy_to_user(op.pages, &mpfn, sizeof(mpfn));
op.pages++;
@@ -77,26 +45,17 @@ static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op)
unmap_domain_mem(va);
}
- spin_unlock(&p->page_lock);
- spin_unlock_irqrestore(&free_list_lock, flags);
-
- return op.size;
+ return i;
}
static long free_dom_mem(struct task_struct *p, reservation_decrease_t op)
{
- struct list_head *temp;
- struct pfn_info *pf; /* pfn_info of current page */
+ struct pfn_info *page;
unsigned long mpfn; /* machine frame number of current page */
unsigned long i;
- unsigned long flags;
long rc = 0;
int need_flush = 0;
- spin_lock_irqsave(&free_list_lock, flags);
- spin_lock(&p->page_lock);
-
- temp = free_list.next;
for ( i = 0; i < op.size; i++ )
{
copy_from_user(&mpfn, op.pages, sizeof(mpfn));
@@ -109,37 +68,28 @@ static long free_dom_mem(struct task_struct *p, reservation_decrease_t op)
goto out;
}
- pf = &frame_table[mpfn];
- if ( (page_type_count(pf) != 0) ||
- (page_tot_count(pf) != 0) ||
- ((pf->flags & PG_domain_mask) != p->domain) )
+ page = &frame_table[mpfn];
+ if ( unlikely(!get_page(page, p)) )
{
- DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n",
- p->domain, page_type_count(pf),
- page_tot_count(pf), pf->flags);
+ DPRINTK("Bad page free for domain %d\n", p->domain);
rc = -EINVAL;
goto out;
}
- need_flush |= pf->flags & PG_need_flush;
-
- pf->flags = 0;
+ if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
+ put_page_and_type(page);
- list_del(&pf->list);
- list_add(&pf->list, &free_list);
- free_pfns++;
+ if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
+ put_page(page);
- p->tot_pages--;
+ put_page(page);
}
out:
- spin_unlock(&p->page_lock);
- spin_unlock_irqrestore(&free_list_lock, flags);
-
if ( need_flush )
{
__flush_tlb();
- perfc_incrc(need_flush_tlb_flush);
+ perfc_incr(need_flush_tlb_flush);
}
return rc ? rc : op.size;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index eae232206b..4f23778e46 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -51,12 +51,11 @@ struct task_struct *do_createdomain(unsigned int dom_id, unsigned int cpu)
sprintf(p->name, "Domain-%d", dom_id);
spin_lock_init(&p->blk_ring_lock);
- spin_lock_init(&p->page_lock);
spin_lock_init(&p->event_channel_lock);
p->shared_info = (void *)get_free_page(GFP_KERNEL);
memset(p->shared_info, 0, PAGE_SIZE);
- SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), dom_id);
+ SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p);
p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
memset(p->mm.perdomain_pt, 0, PAGE_SIZE);
@@ -67,8 +66,10 @@ struct task_struct *do_createdomain(unsigned int dom_id, unsigned int cpu)
sched_add_domain(p);
- INIT_LIST_HEAD(&p->pg_head);
+ spin_lock_init(&p->page_list_lock);
+ INIT_LIST_HEAD(&p->page_list);
p->max_pages = p->tot_pages = 0;
+
write_lock_irqsave(&tasklist_lock, flags);
SET_LINKS(p);
p->next_hash = task_hash[TASK_HASH(dom_id)];
@@ -218,77 +219,203 @@ long stop_other_domain(unsigned int dom)
return 0;
}
-unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
+struct pfn_info *alloc_domain_page(struct task_struct *p)
{
- struct list_head *temp;
- struct pfn_info *pf;
- unsigned int alloc_pfns;
- unsigned int req_pages;
- unsigned long flags;
-
- /* how many pages do we need to alloc? */
- req_pages = kbytes >> (PAGE_SHIFT - 10);
+ struct pfn_info *page = NULL;
+ unsigned long flags, mask, pfn_stamp, cpu_stamp;
+ int i;
spin_lock_irqsave(&free_list_lock, flags);
-
- /* is there enough mem to serve the request? */
- if ( (req_pages + (SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10))) >
- free_pfns )
+ if ( likely(!list_empty(&free_list)) )
{
- spin_unlock_irqrestore(&free_list_lock, flags);
- return -1;
+ page = list_entry(free_list.next, struct pfn_info, list);
+ list_del(&page->list);
+ free_pfns--;
}
+ spin_unlock_irqrestore(&free_list_lock, flags);
+
+ if ( unlikely(page == NULL) )
+ return NULL;
- /* allocate pages and build a thread through frame_table */
- temp = free_list.next;
- for ( alloc_pfns = 0; alloc_pfns < req_pages; alloc_pfns++ )
+ if ( unlikely((mask = page->u.cpu_mask) != 0) )
{
- pf = list_entry(temp, struct pfn_info, list);
- pf->flags = p->domain;
- set_page_type_count(pf, 0);
- set_page_tot_count(pf, 0);
- temp = temp->next;
- list_del(&pf->list);
- list_add_tail(&pf->list, &p->pg_head);
- free_pfns--;
- ASSERT(free_pfns != 0);
+ pfn_stamp = page->tlbflush_timestamp;
+ for ( i = 0; mask != 0; i++ )
+ {
+ if ( unlikely(mask & (1<<i)) )
+ {
+ cpu_stamp = tlbflush_time[i];
+ if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) )
+ mask &= ~(1<<i);
+ }
+ }
+
+ if ( unlikely(mask != 0) )
+ {
+ if ( unlikely(in_irq()) )
+ {
+ DPRINTK("Returning NULL from alloc_domain_page: in_irq\n");
+ goto free_and_exit;
+ }
+ perfc_incrc(need_flush_tlb_flush);
+ flush_tlb_mask(mask);
+ }
}
-
- spin_unlock_irqrestore(&free_list_lock, flags);
-
- p->tot_pages = req_pages;
- /* TEMPORARY: max_pages should be explicitly specified. */
- p->max_pages = p->tot_pages;
+ page->u.domain = p;
+ page->type_and_flags = 0;
+ if ( p != NULL )
+ {
+ if ( unlikely(in_irq()) )
+ BUG();
+ wmb(); /* Domain pointer must be visible before updating refcnt. */
+ spin_lock(&p->page_list_lock);
+ if ( unlikely(p->tot_pages >= p->max_pages) )
+ {
+ spin_unlock(&p->page_list_lock);
+ goto free_and_exit;
+ }
+ list_add_tail(&page->list, &p->page_list);
+ p->tot_pages++;
+ page->count_and_flags = PGC_allocated | 1;
+ spin_unlock(&p->page_list_lock);
+ }
- return 0;
+ return page;
+
+ free_and_exit:
+ spin_lock_irqsave(&free_list_lock, flags);
+ list_add(&page->list, &free_list);
+ free_pfns++;
+ spin_unlock_irqrestore(&free_list_lock, flags);
+ return NULL;
}
-
-void free_all_dom_mem(struct task_struct *p)
+void free_domain_page(struct pfn_info *page)
{
- struct list_head *ent;
unsigned long flags;
+ struct task_struct *p = page->u.domain;
- spin_lock_irqsave(&free_list_lock, flags);
- while ( (ent = p->pg_head.next) != &p->pg_head )
+ if ( unlikely(in_irq()) )
+ BUG();
+
+ if ( likely(!IS_XEN_HEAP_FRAME(page)) )
{
- struct pfn_info *pf = list_entry(ent, struct pfn_info, list);
- set_page_type_count(pf, 0);
- set_page_tot_count(pf, 0);
- pf->flags = 0;
- ASSERT(ent->next->prev == ent);
- ASSERT(ent->prev->next == ent);
- list_del(ent);
- list_add(ent, &free_list);
+ /*
+ * No race with setting of zombie bit. If it wasn't set before the
+ * last reference was dropped, then it can't be set now.
+ */
+ page->u.cpu_mask = 0;
+ if ( !(page->count_and_flags & PGC_zombie) )
+ {
+ page->tlbflush_timestamp = tlbflush_clock;
+ page->u.cpu_mask = 1 << p->processor;
+
+ spin_lock(&p->page_list_lock);
+ list_del(&page->list);
+ p->tot_pages--;
+ spin_unlock(&p->page_list_lock);
+ }
+
+ page->count_and_flags = 0;
+
+ spin_lock_irqsave(&free_list_lock, flags);
+ list_add(&page->list, &free_list);
free_pfns++;
+ spin_unlock_irqrestore(&free_list_lock, flags);
}
- spin_unlock_irqrestore(&free_list_lock, flags);
+ else
+ {
+ /*
+ * No need for a TLB flush. Non-domain pages are always co-held by Xen,
+ * and the Xen reference is not dropped until the domain is dead.
+ * DOM0 may hold references, but it's trusted so no need to flush.
+ */
+ page->u.cpu_mask = 0;
+ page->count_and_flags = 0;
+ free_page((unsigned long)page_to_virt(page));
+ }
+}
+
+
+void free_all_dom_mem(struct task_struct *p)
+{
+ struct list_head *ent, zombies;
+ struct pfn_info *page;
+
+ INIT_LIST_HEAD(&zombies);
+
+ spin_lock(&p->page_list_lock);
+ while ( (ent = p->page_list.next) != &p->page_list )
+ {
+ page = list_entry(ent, struct pfn_info, list);
+
+ if ( unlikely(!get_page(page, p)) )
+ {
+ /*
+ * Another CPU has dropped the last reference and is responsible
+ * for removing the page from this list. Wait for them to do so.
+ */
+ spin_unlock(&p->page_list_lock);
+ while ( p->page_list.next == ent )
+ barrier();
+ spin_lock(&p->page_list_lock);
+ continue;
+ }
+
+ set_bit(_PGC_zombie, &page->count_and_flags);
+
+ list_del(&page->list);
+ p->tot_pages--;
+
+ list_add(&page->list, &zombies);
+ }
+ spin_unlock(&p->page_list_lock);
+
+ /* We do the potentially complex 'put' operations with no lock held. */
+ while ( (ent = zombies.next) != &zombies )
+ {
+ page = list_entry(ent, struct pfn_info, list);
- p->tot_pages = 0;
+ list_del(&page->list);
+
+ if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
+ put_page_and_type(page);
+
+ if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
+ put_page(page);
+
+ put_page(page);
+ }
}
+unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
+{
+ unsigned int alloc_pfns, nr_pages;
+
+ nr_pages = kbytes >> (PAGE_SHIFT - 10);
+
+ /* TEMPORARY: max_pages should be explicitly specified. */
+ p->max_pages = nr_pages;
+
+ for ( alloc_pfns = 0; alloc_pfns < nr_pages; alloc_pfns++ )
+ {
+ if ( unlikely(alloc_domain_page(p) == NULL) ||
+ unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >>
+ (PAGE_SHIFT-10))) )
+ {
+ free_all_dom_mem(p);
+ return -1;
+ }
+ }
+
+ p->tot_pages = nr_pages;
+
+ return 0;
+}
+
+
/* Release resources belonging to task @p. */
void release_task(struct task_struct *p)
{
@@ -309,7 +436,6 @@ void release_task(struct task_struct *p)
destroy_event_channels(p);
free_page((unsigned long)p->mm.perdomain_pt);
UNSHARE_PFN(virt_to_page(p->shared_info));
- free_page((unsigned long)p->shared_info);
free_all_dom_mem(p);
kmem_cache_free(task_struct_cachep, p);
@@ -360,11 +486,10 @@ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain)
p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs;
p->failsafe_address = builddomain->ctxt.failsafe_callback_eip;
- /* NB. Page base must already be pinned! */
phys_l2tab = builddomain->ctxt.pt_base;
p->mm.pagetable = mk_pagetable(phys_l2tab);
- get_page_type(&frame_table[phys_l2tab>>PAGE_SHIFT]);
- get_page_tot(&frame_table[phys_l2tab>>PAGE_SHIFT]);
+ get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p,
+ PGT_l2_page_table);
/* Set up the shared info structure. */
update_dom_time(p->shared_info);
@@ -449,7 +574,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
return -ENOMEM;
}
- alloc_address = list_entry(p->pg_head.prev, struct pfn_info, list) -
+ alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) -
frame_table;
alloc_address <<= PAGE_SHIFT;
alloc_index = p->tot_pages;
@@ -497,7 +622,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
p->mm.pagetable = mk_pagetable(phys_l2tab);
l2tab += l2_table_offset(virt_load_address);
- cur_address = list_entry(p->pg_head.next, struct pfn_info, list) -
+ cur_address = list_entry(p->page_list.next, struct pfn_info, list) -
frame_table;
cur_address <<= PAGE_SHIFT;
for ( count = 0; count < p->tot_pages; count++ )
@@ -514,10 +639,10 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
}
*l1tab++ = mk_l1_pgentry(cur_address|L1_PROT);
- page = frame_table + (cur_address >> PAGE_SHIFT);
- page->flags = dom | PGT_writeable_page | PG_need_flush;
- set_page_type_count(page, 1);
- set_page_tot_count(page, 1);
+ page = &frame_table[cur_address >> PAGE_SHIFT];
+ set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
+ if ( !get_page_and_type(page, p, PGT_writeable_page) )
+ BUG();
/* Set up the MPT entry. */
machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count;
@@ -538,8 +663,9 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
{
*l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
page = frame_table + l1_pgentry_to_pagenr(*l1tab);
- page->flags = dom | PGT_l1_page_table;
- get_page_tot(page);
+ page->type_and_flags &= ~PGT_type_mask;
+ page->type_and_flags |= PGT_l1_page_table;
+ get_page(page, p); /* an extra ref because of readable mapping */
l1tab++;
if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
{
@@ -548,9 +674,13 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
l2tab++;
}
}
- get_page_type(page); /* guest_pinned */
- get_page_tot(page); /* guest_pinned */
- page->flags = dom | PG_guest_pinned | PGT_l2_page_table;
+ /* Rewrite last L1 page to be a L2 page. */
+ page->type_and_flags &= ~PGT_type_mask;
+ page->type_and_flags |= PGT_l2_page_table;
+ /* Get another ref to L2 page so that it can be pinned. */
+ if ( !get_page_and_type(page, p, PGT_l2_page_table) )
+ BUG();
+ set_bit(_PGC_guest_pinned, &page->count_and_flags);
unmap_domain_mem(l1start);
/* Set up shared info area. */
@@ -565,7 +695,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
/* Install the new page tables. */
__cli();
- __write_cr3_counted(pagetable_val(p->mm.pagetable));
+ write_cr3_counted(pagetable_val(p->mm.pagetable));
/* Copy the guest OS image. */
src = (char *)(phy_data_start + 12);
@@ -632,7 +762,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
/* Reinstate the caller's page tables. */
- __write_cr3_counted(pagetable_val(current->mm.pagetable));
+ write_cr3_counted(pagetable_val(current->mm.pagetable));
__sti();
p->flags |= PF_CONSTRUCTED;
diff --git a/xen/common/kernel.c b/xen/common/kernel.c
index 9f6fb74556..1bba43d7be 100644
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -181,6 +181,13 @@ void cmain (unsigned long magic, multiboot_info_t *mbi)
for ( ; ; ) ;
}
+ /* The array of pfn_info structures must fit into the reserved area. */
+ if ( sizeof(struct pfn_info) > 24 )
+ {
+ printk("'struct pfn_info' too large to fit in Xen address space!\n");
+ for ( ; ; ) ;
+ }
+
set_current(&idle0_task);
max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10);
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 8cbb503cf3..c2b4ee9f7a 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -139,34 +139,28 @@
#include <asm/uaccess.h>
#include <asm/domain_page.h>
-#if 0
-#define MEM_LOG(_f, _a...)
+#ifndef NDEBUG
+#define MEM_LOG(_f, _a...) \
printk("DOM%d: (file=memory.c, line=%d) " _f "\n", \
current->domain, __LINE__, ## _a )
#else
#define MEM_LOG(_f, _a...) ((void)0)
#endif
-/* Domain 0 is allowed to submit requests on behalf of others. */
-#define DOMAIN_OKAY(_f) \
- ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0))
-
-/* 'get' checks parameter for validity before inc'ing refcnt. */
-static int get_l2_table(unsigned long page_nr);
-static int get_l1_table(unsigned long page_nr);
-static int get_page(unsigned long page_nr, int writeable);
-static int inc_page_refcnt(unsigned long page_nr, unsigned int type);
-/* 'put' does no checking because if refcnt not zero, entity must be valid. */
-static void put_l2_table(unsigned long page_nr);
-static void put_l1_table(unsigned long page_nr);
-static void put_page(unsigned long page_nr, int writeable);
-static int dec_page_refcnt(unsigned long page_nr, unsigned int type);
-
-static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t);
+static int alloc_l2_table(struct pfn_info *page);
+static int alloc_l1_table(struct pfn_info *page);
+static int get_page_from_pagenr(unsigned long page_nr);
+static int get_page_and_type_from_pagenr(unsigned long page_nr,
+ unsigned int type);
+
+static void free_l2_table(struct pfn_info *page);
+static void free_l1_table(struct pfn_info *page);
+
+static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
/* frame table size and its size in pages */
-frame_table_t * frame_table;
+struct pfn_info *frame_table;
unsigned long frame_table_size;
unsigned long max_page;
@@ -176,8 +170,11 @@ unsigned int free_pfns;
/* Used to defer flushing of memory structures. */
static struct {
- int flush_tlb;
- int refresh_ldt;
+#define DOP_FLUSH_TLB (1<<0) /* Flush the TLB. */
+#define DOP_RELOAD_LDT (1<<1) /* Reload the LDT shadow mapping. */
+#define DOP_RESTORE_CR0 (1<<2) /* Set the WP bit in CR0. */
+ unsigned long flags;
+ unsigned long cr0;
} deferred_op[NR_CPUS] __cacheline_aligned;
/*
@@ -196,7 +193,7 @@ void __init init_frametable(unsigned long nr_pages)
max_page = nr_pages;
frame_table_size = nr_pages * sizeof(struct pfn_info);
frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
- frame_table = (frame_table_t *)FRAMETABLE_VIRT_START;
+ frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
memset(frame_table, 0, frame_table_size);
free_pfns = 0;
@@ -218,7 +215,7 @@ void __init init_frametable(unsigned long nr_pages)
static void __invalidate_shadow_ldt(struct task_struct *p)
{
- int i, cpu = p->processor;
+ int i;
unsigned long pfn;
struct pfn_info *page;
@@ -230,16 +227,13 @@ static void __invalidate_shadow_ldt(struct task_struct *p)
if ( pfn == 0 ) continue;
p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
page = frame_table + pfn;
- ASSERT((page->flags & PG_type_mask) == PGT_ldt_page);
- ASSERT((page->flags & PG_domain_mask) == p->domain);
- ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0));
- put_page_type(page);
- put_page_tot(page);
+ ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
+ ASSERT_PAGE_IS_DOMAIN(page, p);
+ put_page_and_type(page);
}
/* Dispose of the (now possibly invalid) mappings from the TLB. */
- deferred_op[cpu].flush_tlb = 1;
- deferred_op[cpu].refresh_ldt = 1;
+ deferred_op[p->processor].flags |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
}
@@ -251,556 +245,614 @@ static inline void invalidate_shadow_ldt(void)
}
+int alloc_segdesc_page(struct pfn_info *page)
+{
+ unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
+ int i;
+
+ for ( i = 0; i < 512; i++ )
+ if ( unlikely(!check_descriptor(descs[i*2], descs[i*2+1])) )
+ goto fail;
+
+ unmap_domain_mem(descs);
+ return 1;
+
+ fail:
+ unmap_domain_mem(descs);
+ return 0;
+}
+
+
/* Map shadow page at offset @off. Returns 0 on success. */
int map_ldt_shadow_page(unsigned int off)
{
struct task_struct *p = current;
- unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT);
- unsigned long l1e, *ldt_page;
- struct pfn_info *page;
- int i, ret = -1;
+ unsigned long l1e;
- /* We cannot take a page_lock in interrupt context. */
- if ( in_interrupt() )
+ if ( unlikely(in_interrupt()) )
BUG();
- spin_lock(&p->page_lock);
+ __get_user(l1e, (unsigned long *)&linear_pg_table[(p->mm.ldt_base >>
+ PAGE_SHIFT) + off]);
- __get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT)));
- if ( unlikely(!(l1e & _PAGE_PRESENT)) )
- goto out;
+ if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
+ unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT],
+ p, PGT_ldt_page)) )
+ return 0;
- page = frame_table + (l1e >> PAGE_SHIFT);
- if ( unlikely((page->flags & PG_type_mask) != PGT_ldt_page) )
- {
- if ( unlikely(page_type_count(page) != 0) )
- goto out;
-
- /* Check all potential LDT entries in the page. */
- ldt_page = (unsigned long *)addr;
- for ( i = 0; i < 512; i++ )
- if ( unlikely(!check_descriptor(ldt_page[i*2], ldt_page[i*2+1])) )
- goto out;
- if ( unlikely(page->flags & PG_need_flush) )
- {
- perfc_incrc(need_flush_tlb_flush);
- __write_cr3_counted(pagetable_val(p->mm.pagetable));
- page->flags &= ~PG_need_flush;
- }
+ p->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
+ p->mm.shadow_ldt_mapcnt++;
- page->flags &= ~PG_type_mask;
- page->flags |= PGT_ldt_page;
- }
+ return 1;
+}
- /* Success! */
- get_page_type(page);
- get_page_tot(page);
- p->mm.perdomain_pt[off+16] = mk_l1_pgentry(l1e|_PAGE_RW);
- p->mm.shadow_ldt_mapcnt++;
- ret = 0;
+/* Domain 0 is allowed to build page tables on others' behalf. */
+static inline int dom0_get_page(struct pfn_info *page)
+{
+ unsigned long x, nx, y = page->count_and_flags;
+
+ do {
+ x = y;
+ nx = x + 1;
+ if ( unlikely((x & PGC_count_mask) == 0) ||
+ unlikely((nx & PGC_count_mask) == 0) )
+ return 0;
+ }
+ while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
- out:
- spin_unlock(&p->page_lock);
- return ret;
+ return 1;
}
-/* Return original refcnt, or -1 on error. */
-static int inc_page_refcnt(unsigned long page_nr, unsigned int type)
+static int get_page_from_pagenr(unsigned long page_nr)
{
- struct pfn_info *page;
- unsigned long flags;
+ struct pfn_info *page = &frame_table[page_nr];
if ( unlikely(page_nr >= max_page) )
{
MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
- return -1;
+ return 0;
}
- page = frame_table + page_nr;
- flags = page->flags;
- if ( unlikely(!DOMAIN_OKAY(flags)) )
+
+ if ( unlikely(!get_page(page, current)) &&
+ ((current->domain != 0) || !dom0_get_page(page)) )
{
- MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
- return -1;
+ MEM_LOG("Could not get page reference for pfn %08lx\n", page_nr);
+ return 0;
}
- if ( (flags & PG_type_mask) != type )
+
+ return 1;
+}
+
+
+static int get_page_and_type_from_pagenr(unsigned long page_nr,
+ unsigned int type)
+{
+ struct pfn_info *page = &frame_table[page_nr];
+
+ if ( unlikely(!get_page_from_pagenr(page_nr)) )
+ return 0;
+
+ if ( unlikely(!get_page_type(page, type)) )
{
- if ( page_type_count(page) != 0 )
- {
- MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld",
- page_nr << PAGE_SHIFT,
- flags & PG_type_mask, type, page_type_count(page));
- return -1;
- }
+ MEM_LOG("Bad page type for pfn %08lx (%08lx)",
+ page_nr, page->type_and_flags);
+ put_page(page);
+ return 0;
+ }
- if ( unlikely(flags & PG_need_flush) )
- {
- deferred_op[smp_processor_id()].flush_tlb = 1;
- page->flags &= ~PG_need_flush;
- perfc_incrc(need_flush_tlb_flush);
- }
+ return 1;
+}
+
+
+/*
+ * We allow an L2 table to map itself, to achieve a linear p.t. Note that this
+ * does not raise any reference counts.
+ */
+static int check_linear_pagetable(l2_pgentry_t l2e, unsigned long pfn)
+{
+ if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
+ {
+ MEM_LOG("Attempt to create linear p.t. with write perms");
+ return 0;
+ }
- page->flags &= ~PG_type_mask;
- page->flags |= type;
+ if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
+ {
+ MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
+ return 0;
}
- get_page_tot(page);
- return get_page_type(page);
+ return 1;
}
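For orientation: once alloc_l2_table() below installs this self-referential slot at LINEAR_PT_VIRT_START, the page tables become readable as ordinary memory, which is how the rest of this file looks up PTEs. A rough sketch, reusing the linear_pg_table declaration assumed elsewhere in this file (the helper name is made up):

    static unsigned long example_read_pte(unsigned long va)
    {
        unsigned long pte = 0;
        /* May fault if the covering L2 entry is absent, hence __get_user. */
        (void)__get_user(pte,
                         (unsigned long *)&linear_pg_table[va >> PAGE_SHIFT]);
        return pte;
    }
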
-/* Return new refcnt, or -1 on error. */
-static int dec_page_refcnt(unsigned long page_nr, unsigned int type)
+static int get_page_from_l1e(l1_pgentry_t l1e)
{
- struct pfn_info *page;
+ ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT);
- if ( unlikely(page_nr >= max_page) )
+ if ( unlikely((l1_pgentry_val(l1e) &
+ (_PAGE_GLOBAL|_PAGE_PAT))) )
{
- MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
- return -1;
+ MEM_LOG("Bad L1 page type settings %04lx",
+ l1_pgentry_val(l1e) &
+ (_PAGE_GLOBAL|_PAGE_PAT));
+ return 0;
}
- page = frame_table + page_nr;
- if ( unlikely(!DOMAIN_OKAY(page->flags)) ||
- unlikely(((page->flags & PG_type_mask) != type)) )
+
+ if ( l1_pgentry_val(l1e) & _PAGE_RW )
{
- MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)",
- page->flags & PG_domain_mask, page->flags & PG_type_mask,
- type);
- return -1;
+ if ( unlikely(!get_page_and_type_from_pagenr(
+ l1_pgentry_to_pagenr(l1e), PGT_writeable_page)) )
+ return 0;
+ set_bit(_PGC_tlb_flush_on_type_change,
+ &frame_table[l1_pgentry_to_pagenr(l1e)].count_and_flags);
}
- ASSERT(page_type_count(page) != 0);
- put_page_tot(page);
- return put_page_type(page);
+ else
+ {
+ if ( unlikely(!get_page_from_pagenr(l1_pgentry_to_pagenr(l1e))) )
+ return 0;
+ }
+
+ return 1;
}
-/* We allow a L2 table to map itself, to achieve a linear pagetable. */
-/* NB. There's no need for a put_twisted_l2_table() function!! */
-static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e)
+/* NB. 'l2e' is an entry within the L2 page table that occupies frame 'pfn'. */
+static int get_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
{
- unsigned long l2v = l2_pgentry_val(l2e);
+ ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT);
- /* Clearly the mapping must be read-only :-) */
- if ( (l2v & _PAGE_RW) )
+ if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
{
- MEM_LOG("Attempt to install twisted L2 entry with write permissions");
- return -1;
+ MEM_LOG("Bad L2 page type settings %04lx",
+ l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
+ return 0;
}
- /* This is a sufficient final check. */
- if ( (l2v >> PAGE_SHIFT) != entry_pfn )
+ if ( unlikely(!get_page_and_type_from_pagenr(
+ l2_pgentry_to_pagenr(l2e), PGT_l1_page_table)) &&
+ unlikely(!check_linear_pagetable(l2e, pfn)) )
+ return 0;
+
+ return 1;
+}
+
+
+static void put_page_from_l1e(l1_pgentry_t l1e)
+{
+ struct pfn_info *page;
+
+ ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT);
+
+ page = &frame_table[l1_pgentry_to_pagenr(l1e)];
+
+ if ( l1_pgentry_val(l1e) & _PAGE_RW )
{
- MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
- return -1;
+ put_page_and_type(page);
+ }
+ else
+ {
+ /* We expect this is rare so we blow the entire shadow LDT. */
+ if ( unlikely(((page->type_and_flags & PGT_type_mask) ==
+ PGT_ldt_page)) &&
+ unlikely(((page->type_and_flags & PGT_count_mask) != 0)) )
+ invalidate_shadow_ldt();
+ put_page(page);
}
-
- /* We don't bump the reference counts. */
- return 0;
}
-static int get_l2_table(unsigned long page_nr)
+/* NB. 'l2e' is an entry within the L2 page table that occupies frame 'pfn'. */
+static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
{
- struct pfn_info *page;
- struct task_struct *p;
- l2_pgentry_t *p_l2_entry, l2_entry;
- int i, ret=0;
+ ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT);
+
+ if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
+ ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
+ put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
+}
+
+
+static int alloc_l2_table(struct pfn_info *page)
+{
+ unsigned long page_nr = page - frame_table;
+ l2_pgentry_t *pl2e, l2e;
+ int i;
- ret = inc_page_refcnt(page_nr, PGT_l2_page_table);
- if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
-
- /* NEW level-2 page table! Deal with every PDE in the table. */
- p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
+ pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
+
for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
{
- l2_entry = *p_l2_entry++;
- if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue;
- if ( unlikely((l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE))) )
- {
- MEM_LOG("Bad L2 page type settings %04lx",
- l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE));
- ret = -1;
+ l2e = pl2e[i];
+
+ if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
+ continue;
+
+ if ( unlikely(!get_page_from_l2e(l2e, page_nr)) )
goto fail;
- }
- /* Assume we're mapping an L1 table, falling back to twisted L2. */
- ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry));
- if ( unlikely(ret) ) ret = get_twisted_l2_table(page_nr, l2_entry);
- if ( unlikely(ret) ) goto fail;
}
- /* Now we simply slap in our high mapping. */
- memcpy(p_l2_entry,
+ /* Now we add our private high mappings. */
+ memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
&idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
- p_l2_entry[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
- DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
+ pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry(__pa(page->u.domain->mm.perdomain_pt) |
+ __PAGE_HYPERVISOR);
- /*
- * The per-domain PGD is slightly tricky, as we may not be executing
- * in the context of the correct domain (DOM0 builds pt's for others).
- */
- page = frame_table + page_nr;
- if ( (p = find_domain_by_id(page->flags & PG_domain_mask)) != NULL )
- {
- p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) -
- DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
- mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
- put_task_struct(p);
- }
-
- out:
- unmap_domain_mem(p_l2_entry);
- return ret;
+ unmap_domain_mem(pl2e);
+ return 1;
fail:
- p_l2_entry--;
while ( i-- > 0 )
{
- l2_entry = *--p_l2_entry;
- if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
- put_l1_table(l2_pgentry_to_pagenr(l2_entry));
+ l2e = pl2e[i];
+ if ( l2_pgentry_val(l2e) & _PAGE_PRESENT )
+ put_page_from_l2e(l2e, page_nr);
}
- if ( dec_page_refcnt(page_nr, PGT_l2_page_table) != 0 )
- BUG();
- goto out;
+
+ unmap_domain_mem(pl2e);
+ return 0;
}
-static int get_l1_table(unsigned long page_nr)
+static int alloc_l1_table(struct pfn_info *page)
{
- l1_pgentry_t *p_l1_entry, l1_entry;
- int i, ret;
+ unsigned long page_nr = page - frame_table;
+ l1_pgentry_t *pl1e, l1e;
+ int i;
- /* Update ref count for page pointed at by PDE. */
- ret = inc_page_refcnt(page_nr, PGT_l1_page_table);
- if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
+ pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
- /* NEW level-1 page table! Deal with every PTE in the table. */
- p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
{
- l1_entry = *p_l1_entry++;
- if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue;
- if ( unlikely((l1_pgentry_val(l1_entry) &
- (_PAGE_GLOBAL|_PAGE_PAT))) )
- {
- MEM_LOG("Bad L1 page type settings %04lx",
- l1_pgentry_val(l1_entry) &
- (_PAGE_GLOBAL|_PAGE_PAT));
- ret = -1;
+ l1e = pl1e[i];
+
+ if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) )
+ continue;
+
+ if ( unlikely(!get_page_from_l1e(l1e)) )
goto fail;
- }
- ret = get_page(l1_pgentry_to_pagenr(l1_entry),
- l1_pgentry_val(l1_entry) & _PAGE_RW);
- if ( unlikely(ret) ) goto fail;
}
/* Make sure we unmap the right page! */
- unmap_domain_mem(p_l1_entry-1);
- return ret;
+ unmap_domain_mem(pl1e);
+ return 1;
fail:
- p_l1_entry--;
while ( i-- > 0 )
{
- l1_entry = *--p_l1_entry;
- if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) )
- put_page(l1_pgentry_to_pagenr(l1_entry),
- l1_pgentry_val(l1_entry) & _PAGE_RW);
- }
- if ( dec_page_refcnt(page_nr, PGT_l1_page_table) != 0 )
- BUG();
- unmap_domain_mem(p_l1_entry);
- return ret;
-}
-
-
-static int get_page(unsigned long page_nr, int writeable)
-{
- struct pfn_info *page;
- unsigned long flags;
-
- /* Update ref count for page pointed at by PTE. */
- if ( unlikely(page_nr >= max_page) )
- {
- MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
- return(-1);
- }
- page = frame_table + page_nr;
- flags = page->flags;
- if ( unlikely(!DOMAIN_OKAY(flags)) )
- {
- MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
- return(-1);
+ l1e = pl1e[i];
+ if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) )
+ continue;
+ put_page_from_l1e(l1e);
}
- if ( writeable )
- {
- if ( (flags & PG_type_mask) != PGT_writeable_page )
- {
- if ( page_type_count(page) != 0 )
- {
- MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld",
- flags & PG_type_mask, PGT_writeable_page,
- page_type_count(page));
- return(-1);
- }
- page->flags &= ~PG_type_mask;
- page->flags |= PGT_writeable_page;
- }
- page->flags |= PG_need_flush;
- get_page_type(page);
- }
-
- get_page_tot(page);
-
- return(0);
+ unmap_domain_mem(pl1e);
+ return 0;
}
-static void put_l2_table(unsigned long page_nr)
+static void free_l2_table(struct pfn_info *page)
{
- l2_pgentry_t *p_l2_entry, l2_entry;
+ unsigned long page_nr = page - frame_table;
+ l2_pgentry_t *pl2e, l2e;
int i;
- if ( likely(dec_page_refcnt(page_nr, PGT_l2_page_table)) ) return;
+ pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
- /* We had last reference to level-2 page table. Free the PDEs. */
- p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
{
- l2_entry = *p_l2_entry++;
- if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
- put_l1_table(l2_pgentry_to_pagenr(l2_entry));
+ l2e = pl2e[i];
+ if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
+ unlikely((l2_pgentry_val(l2e) >> PAGE_SHIFT) != page_nr) )
+ put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
}
- unmap_domain_mem(p_l2_entry);
+ unmap_domain_mem(pl2e);
}
-static void put_l1_table(unsigned long page_nr)
+static void free_l1_table(struct pfn_info *page)
{
- l1_pgentry_t *p_l1_entry, l1_entry;
+ unsigned long page_nr = page - frame_table;
+ l1_pgentry_t *pl1e, l1e;
int i;
- if ( likely(dec_page_refcnt(page_nr, PGT_l1_page_table)) ) return;
+ pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
- /* We had last reference to level-1 page table. Free the PTEs. */
- p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
{
- l1_entry = *p_l1_entry++;
- if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) )
- put_page(l1_pgentry_to_pagenr(l1_entry),
- l1_pgentry_val(l1_entry) & _PAGE_RW);
+ l1e = pl1e[i];
+ if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) )
+ continue;
+ put_page_from_l1e(l1e);
}
- /* Make sure we unmap the right page! */
- unmap_domain_mem(p_l1_entry-1);
+ unmap_domain_mem(pl1e);
}
-static void put_page(unsigned long page_nr, int writeable)
+static inline int update_l2e(l2_pgentry_t *pl2e,
+ l2_pgentry_t ol2e,
+ l2_pgentry_t nl2e)
{
- struct pfn_info *page;
- ASSERT(page_nr < max_page);
- page = frame_table + page_nr;
- ASSERT(DOMAIN_OKAY(page->flags));
- ASSERT((!writeable) ||
- ((page_type_count(page) != 0) &&
- ((page->flags & PG_type_mask) == PGT_writeable_page) &&
- ((page->flags & PG_need_flush) == PG_need_flush)));
- if ( writeable )
- {
- put_page_type(page);
- }
- else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) &&
- (page_type_count(page) != 0)) )
- {
- /* We expect this is rare so we just blow the entire shadow LDT. */
- invalidate_shadow_ldt();
- }
- put_page_tot(page);
+ unsigned long o = cmpxchg((unsigned long *)pl2e,
+ l2_pgentry_val(ol2e),
+ l2_pgentry_val(nl2e));
+ if ( o != l2_pgentry_val(ol2e) )
+ MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
+ l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
+ return (o == l2_pgentry_val(ol2e));
}
-static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry)
+/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
+static int mod_l2_entry(l2_pgentry_t *pl2e,
+ l2_pgentry_t nl2e,
+ unsigned long pfn)
{
- l2_pgentry_t old_l2_entry = *p_l2_entry;
+ l2_pgentry_t ol2e;
+ unsigned long _ol2e;
- if ( unlikely((((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >=
+ if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
{
- MEM_LOG("Illegal L2 update attempt in hypervisor area %p",
- p_l2_entry);
- goto fail;
+ MEM_LOG("Illegal L2 update attempt in hypervisor area %p", pl2e);
+ return 0;
}
- if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) )
+ if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
+ return 0;
+ ol2e = mk_l2_pgentry(_ol2e);
+
+ if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
{
- if ( unlikely((l2_pgentry_val(new_l2_entry) &
- (_PAGE_GLOBAL|_PAGE_PSE))) )
- {
- MEM_LOG("Bad L2 entry val %04lx",
- l2_pgentry_val(new_l2_entry) &
- (_PAGE_GLOBAL|_PAGE_PSE));
- goto fail;
- }
/* Differ in mapping (bits 12-31) or presence (bit 0)? */
- if ( ((l2_pgentry_val(old_l2_entry) ^
- l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 )
+ if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) != 0 )
{
- /* Assume we're mapping an L1 table, falling back to twisted L2. */
- if ( unlikely(get_l1_table(l2_pgentry_to_pagenr(new_l2_entry))) )
+ if ( unlikely(!get_page_from_l2e(nl2e, pfn)) )
+ return 0;
+
+ if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
{
- /* NB. No need to sanity-check the VA: done already. */
- unsigned long l1e = l1_pgentry_val(
- linear_pg_table[(unsigned long)p_l2_entry >> PAGE_SHIFT]);
- if ( get_twisted_l2_table(l1e >> PAGE_SHIFT, new_l2_entry) )
- goto fail;
+ put_page_from_l2e(nl2e, pfn);
+ return 0;
}
- if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
- put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
- }
+ if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT )
+ put_page_from_l2e(ol2e, pfn);
+ }
+ else if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
+ {
+ return 0;
+ }
}
- else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
+ else
{
- put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
+ if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
+ return 0;
+
+ if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT )
+ put_page_from_l2e(ol2e, pfn);
}
- *p_l2_entry = new_l2_entry;
- return 0;
-
- fail:
- return -1;
+ return 1;
}
-static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry)
+static inline int update_l1e(l1_pgentry_t *pl1e,
+ l1_pgentry_t ol1e,
+ l1_pgentry_t nl1e)
{
- l1_pgentry_t old_l1_entry = *p_l1_entry;
+ unsigned long o = l1_pgentry_val(ol1e);
+ unsigned long n = l1_pgentry_val(nl1e);
- if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) )
+ while ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
{
- if ( unlikely((l1_pgentry_val(new_l1_entry) &
- (_PAGE_GLOBAL|_PAGE_PAT))) )
+ unsigned int cpu = smp_processor_id();
+ /* The CMPXCHG faulted -- maybe we need to clear the WP bit. */
+ if ( deferred_op[cpu].flags & DOP_RESTORE_CR0 )
{
- MEM_LOG("Bad L1 entry val %04lx",
- l1_pgentry_val(new_l1_entry) &
- (_PAGE_GLOBAL|_PAGE_PAT));
- goto fail;
+ MEM_LOG("cmpxchg fault despite WP bit cleared\n");
+ return 0;
}
+ deferred_op[cpu].cr0 = read_cr0();
+ write_cr0(deferred_op[cpu].cr0 & ~X86_CR0_WP);
+ deferred_op[cpu].flags |= DOP_RESTORE_CR0;
+ }
+
+ if ( o != l1_pgentry_val(ol1e) )
+ MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
+ l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
+
+ /* The swap was successful if the old value we saw is equal to ol1e. */
+ return (o == l1_pgentry_val(ol1e));
+}
+
+
+/* Update the L1 entry at pl1e to new value nl1e. */
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+{
+ l1_pgentry_t ol1e;
+ unsigned long _ol1e;
+
+ if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
+ {
+ MEM_LOG("Bad get_user\n");
+ return 0;
+ }
+
+ ol1e = mk_l1_pgentry(_ol1e);
+
+ if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
+ {
/*
* Differ in mapping (bits 12-31), writeable (bit 1), or
* presence (bit 0)?
*/
- if ( ((l1_pgentry_val(old_l1_entry) ^
- l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 )
+ if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) != 0 )
{
- if ( get_page(l1_pgentry_to_pagenr(new_l1_entry),
- l1_pgentry_val(new_l1_entry) & _PAGE_RW) )
- goto fail;
-
- if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
- put_page(l1_pgentry_to_pagenr(old_l1_entry),
- l1_pgentry_val(old_l1_entry) & _PAGE_RW);
- }
+ if ( unlikely(!get_page_from_l1e(nl1e)) )
+ return 0;
+
+ if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ {
+ put_page_from_l1e(nl1e);
+ return 0;
+ }
+
+ if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT )
+ put_page_from_l1e(ol1e);
+ }
+ else if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ {
+ return 0;
+ }
+ }
+ else
+ {
+ if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+ return 0;
+
+ if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT )
+ put_page_from_l1e(ol1e);
+ }
+
+ return 1;
+}
+
+
+int alloc_page_type(struct pfn_info *page, unsigned int type)
+{
+ if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change,
+ &page->count_and_flags)) )
+ {
+ struct task_struct *p = page->u.domain;
+ mb(); /* Check zombie status before using domain ptr. */
+ /*
+ * NB. 'p' may no longer be valid by the time we dereference it, so
+ * p->processor might be garbage. We clamp it, just in case.
+ */
+ if ( !test_bit(_PGC_zombie, &page->count_and_flags) &&
+ unlikely(NEED_FLUSH(tlbflush_time[(p->processor)&(NR_CPUS-1)],
+ page->tlbflush_timestamp)) )
+ {
+ perfc_incr(need_flush_tlb_flush);
+ flush_tlb_cpu(p->processor);
+ }
}
- else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
+
+ switch ( type )
{
- put_page(l1_pgentry_to_pagenr(old_l1_entry),
- l1_pgentry_val(old_l1_entry) & _PAGE_RW);
+ case PGT_l1_page_table:
+ return alloc_l1_table(page);
+ case PGT_l2_page_table:
+ return alloc_l2_table(page);
+ case PGT_gdt_page:
+ case PGT_ldt_page:
+ return alloc_segdesc_page(page);
+ default:
+ BUG();
}
- *p_l1_entry = new_l1_entry;
return 0;
+}
- fail:
- return -1;
+
+void free_page_type(struct pfn_info *page, unsigned int type)
+{
+ switch ( type )
+ {
+ case PGT_l1_page_table:
+ return free_l1_table(page);
+ case PGT_l2_page_table:
+ return free_l2_table(page);
+ default:
+ BUG();
+ }
}
static int do_extended_command(unsigned long ptr, unsigned long val)
{
- int err = 0, cpu = smp_processor_id();
+ int okay = 1, cpu = smp_processor_id();
unsigned int cmd = val & MMUEXT_CMD_MASK;
unsigned long pfn = ptr >> PAGE_SHIFT;
- struct pfn_info *page = frame_table + pfn;
+ struct pfn_info *page = &frame_table[pfn];
/* 'ptr' must be in range except where it isn't a machine address. */
if ( (pfn >= max_page) && (cmd != MMUEXT_SET_LDT) )
+ {
+ MEM_LOG("Ptr out of range for extended MMU command");
return 1;
+ }
switch ( cmd )
{
case MMUEXT_PIN_L1_TABLE:
- if ( unlikely(page->flags & PG_guest_pinned) )
+ case MMUEXT_PIN_L2_TABLE:
+ okay = get_page_and_type_from_pagenr(pfn,
+ (cmd == MMUEXT_PIN_L2_TABLE) ?
+ PGT_l2_page_table :
+ PGT_l1_page_table);
+ if ( unlikely(!okay) )
{
- MEM_LOG("Pfn %08lx already pinned", pfn);
- err = 1;
+ MEM_LOG("Error while pinning pfn %08lx", pfn);
break;
}
- err = get_l1_table(pfn);
- goto mark_as_pinned;
- case MMUEXT_PIN_L2_TABLE:
- if ( unlikely(page->flags & PG_guest_pinned) )
+ if ( unlikely(test_and_set_bit(_PGC_guest_pinned,
+ &page->count_and_flags)) )
{
MEM_LOG("Pfn %08lx already pinned", pfn);
- err = 1;
+ put_page_and_type(page);
+ okay = 0;
break;
}
- err = get_l2_table(pfn);
- mark_as_pinned:
- if ( unlikely(err) )
- {
- MEM_LOG("Error while pinning pfn %08lx", pfn);
- break;
- }
- page->flags |= PG_guest_pinned;
break;
case MMUEXT_UNPIN_TABLE:
- if ( unlikely(!DOMAIN_OKAY(page->flags)) )
+ if ( unlikely(!(okay = get_page_from_pagenr(pfn))) )
{
- err = 1;
- MEM_LOG("Page %08lx bad domain (dom=%ld)",
- ptr, page->flags & PG_domain_mask);
+ MEM_LOG("Page %08lx bad domain (dom=%p)",
+ ptr, page->u.domain);
}
- else if ( likely(page->flags & PG_guest_pinned) )
+ else if ( likely(test_and_clear_bit(_PGC_guest_pinned,
+ &page->count_and_flags)) )
{
- page->flags &= ~PG_guest_pinned;
- ((page->flags & PG_type_mask) == PGT_l1_page_table) ?
- put_l1_table(pfn) : put_l2_table(pfn);
+ put_page_and_type(page);
}
else
{
- err = 1;
+ okay = 0;
MEM_LOG("Pfn %08lx not pinned", pfn);
}
break;
case MMUEXT_NEW_BASEPTR:
- err = get_l2_table(pfn);
- if ( !err )
+ okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table);
+ if ( likely(okay) )
{
- put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT);
+ put_page_and_type(&frame_table[pagetable_val(current->mm.pagetable)
+ >> PAGE_SHIFT]);
current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
invalidate_shadow_ldt();
- deferred_op[cpu].flush_tlb = 1;
+ deferred_op[cpu].flags |= DOP_FLUSH_TLB;
}
else
{
- MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err);
+ MEM_LOG("Error while installing new baseptr %08lx", ptr);
}
break;
case MMUEXT_TLB_FLUSH:
- deferred_op[cpu].flush_tlb = 1;
+ deferred_op[cpu].flags |= DOP_FLUSH_TLB;
break;
case MMUEXT_INVLPG:
@@ -815,7 +867,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
{
- err = 1;
+ okay = 0;
MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
}
else if ( (current->mm.ldt_ents != ents) ||
@@ -825,37 +877,39 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
current->mm.ldt_base = ptr;
current->mm.ldt_ents = ents;
load_LDT(current);
- deferred_op[cpu].refresh_ldt = (ents != 0);
+ deferred_op[cpu].flags &= ~DOP_RELOAD_LDT;
+ if ( ents != 0 )
+ deferred_op[cpu].flags |= DOP_RELOAD_LDT;
}
break;
}
default:
MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
- err = 1;
+ okay = 0;
break;
}
- return err;
+ return okay;
}
int do_mmu_update(mmu_update_t *ureqs, int count)
{
mmu_update_t req;
- unsigned long flags, pfn, l1e;
+ unsigned long va = 0, flags, pfn, prev_pfn = 0;
struct pfn_info *page;
- int rc = 0, err = 0, i, cpu = smp_processor_id();
+ int rc = 0, okay = 1, i, cpu = smp_processor_id();
unsigned int cmd;
- unsigned long cr0 = 0;
- perfc_incrc( calls_to_mmu_update );
- perfc_addc( num_page_updates, count );
+ perfc_incrc(calls_to_mmu_update);
+ perfc_addc(num_page_updates, count);
for ( i = 0; i < count; i++ )
{
if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) )
{
+ MEM_LOG("Bad copy_from_user");
rc = -EFAULT;
break;
}
@@ -863,77 +917,85 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
pfn = req.ptr >> PAGE_SHIFT;
- err = 1;
-
- spin_lock(&current->page_lock);
-
- /* Get the page-frame number that a non-extended command references. */
- if ( (cmd == MMU_NORMAL_PT_UPDATE) ||
- (cmd == MMU_UNCHECKED_PT_UPDATE) )
- {
- if ( cr0 == 0 )
- {
- cr0 = read_cr0();
- write_cr0(cr0 & ~X86_CR0_WP);
- }
- /* Need to use 'get_user' since the VA's PGD may be absent. */
- __get_user(l1e, (unsigned long *)(linear_pg_table+pfn));
- /* Now check that the VA's PTE isn't absent. */
- if ( unlikely(!(l1e & _PAGE_PRESENT)) )
- {
- MEM_LOG("L1E n.p. at VA %08lx (%08lx)", req.ptr&~3, l1e);
- goto unlock;
- }
- /* Finally, get the underlying machine address. */
- pfn = l1e >> PAGE_SHIFT;
- }
+ okay = 0;
- /* Least significant bits of 'ptr' demux the operation type. */
switch ( cmd )
{
/*
* MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
*/
case MMU_NORMAL_PT_UPDATE:
- page = frame_table + pfn;
- flags = page->flags;
+ page = &frame_table[pfn];
- if ( likely(DOMAIN_OKAY(flags)) )
+ if ( unlikely(!get_page(page, current)) &&
+ ((current->domain != 0) || !dom0_get_page(page)) )
{
- switch ( (flags & PG_type_mask) )
- {
- case PGT_l1_page_table:
- err = mod_l1_entry((l1_pgentry_t *)req.ptr,
- mk_l1_pgentry(req.val));
- break;
- case PGT_l2_page_table:
- err = mod_l2_entry((l2_pgentry_t *)req.ptr,
- mk_l2_pgentry(req.val));
- break;
- default:
- if ( page_type_count(page) == 0 )
- {
- *(unsigned long *)req.ptr = req.val;
- err = 0;
- }
- else
- MEM_LOG("Update to bad page %08lx", req.ptr);
- break;
- }
+ MEM_LOG("Could not get page for normal update");
+ break;
+ }
+
+ if ( likely(prev_pfn == pfn) )
+ {
+ va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
}
else
{
- MEM_LOG("Bad domain normal update (dom %d, pfn %ld)",
- current->domain, pfn);
+ if ( prev_pfn != 0 )
+ unmap_domain_mem((void *)va);
+ va = (unsigned long)map_domain_mem(req.ptr);
+ prev_pfn = pfn;
+ }
+
+ switch ( (page->type_and_flags & PGT_type_mask) )
+ {
+ case PGT_l1_page_table:
+ if ( likely(get_page_type(page, PGT_l1_page_table)) )
+ {
+ okay = mod_l1_entry((l1_pgentry_t *)va,
+ mk_l1_pgentry(req.val));
+ put_page_type(page);
+ }
+ break;
+ case PGT_l2_page_table:
+ if ( likely(get_page_type(page, PGT_l2_page_table)) )
+ {
+ okay = mod_l2_entry((l2_pgentry_t *)va,
+ mk_l2_pgentry(req.val),
+ pfn);
+ put_page_type(page);
+ }
+ break;
+ default:
+ if ( likely(get_page_type(page, PGT_writeable_page)) )
+ {
+ *(unsigned long *)va = req.val;
+ okay = 1;
+ put_page_type(page);
+ }
+ break;
}
+
+ put_page(page);
+
break;
case MMU_UNCHECKED_PT_UPDATE:
req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
if ( likely(IS_PRIV(current)) )
{
- *(unsigned long *)req.ptr = req.val;
- err = 0;
+ if ( likely(prev_pfn == pfn) )
+ {
+ va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
+ }
+ else
+ {
+ if ( prev_pfn != 0 )
+ unmap_domain_mem((void *)va);
+ va = (unsigned long)map_domain_mem(req.ptr);
+ prev_pfn = pfn;
+ }
+ *(unsigned long *)va = req.val;
+ okay = 1;
}
else
{
@@ -942,21 +1004,18 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
break;
case MMU_MACHPHYS_UPDATE:
- page = frame_table + pfn;
+ page = &frame_table[pfn];
if ( unlikely(pfn >= max_page) )
{
MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page);
}
- else if ( likely(DOMAIN_OKAY(page->flags)) )
+ else if ( likely(get_page(page, current)) ||
+ ((current->domain == 0) && dom0_get_page(page)) )
{
machine_to_phys_mapping[pfn] = req.val;
- err = 0;
+ okay = 1;
+ put_page(page);
}
- else
- {
- MEM_LOG("Bad domain MPT update (dom %d, pfn %ld)",
- current->domain, pfn);
- }
break;
/*
@@ -965,7 +1024,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
*/
case MMU_EXTENDED_COMMAND:
req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
- err = do_extended_command(req.ptr, req.val);
+ okay = do_extended_command(req.ptr, req.val);
break;
default:
@@ -973,10 +1032,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
break;
}
- unlock:
- spin_unlock(&current->page_lock);
-
- if ( unlikely(err) )
+ if ( unlikely(!okay) )
{
rc = -EINVAL;
break;
@@ -985,20 +1041,20 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
ureqs++;
}
- if ( deferred_op[cpu].flush_tlb )
- {
- deferred_op[cpu].flush_tlb = 0;
- __write_cr3_counted(pagetable_val(current->mm.pagetable));
- }
+ if ( prev_pfn != 0 )
+ unmap_domain_mem((void *)va);
- if ( deferred_op[cpu].refresh_ldt )
- {
- deferred_op[cpu].refresh_ldt = 0;
+ flags = deferred_op[cpu].flags;
+ deferred_op[cpu].flags = 0;
+
+ if ( flags & DOP_FLUSH_TLB )
+ write_cr3_counted(pagetable_val(current->mm.pagetable));
+
+ if ( flags & DOP_RELOAD_LDT )
(void)map_ldt_shadow_page(0);
- }
- if ( cr0 != 0 )
- write_cr0(cr0);
+ if ( unlikely(flags & DOP_RESTORE_CR0) )
+ write_cr0(deferred_op[cpu].cr0);
return rc;
}
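From the guest side these requests are batched into an array of mmu_update_t and submitted in one hypercall. A sketch, under the assumption that the xenolinux wrapper HYPERVISOR_mmu_update() mirrors the (req, count) signature above; the machine addresses are placeholders:

    void example_batch(unsigned long l2_ma, unsigned long pte_ma,
                       unsigned long new_pte)
    {
        mmu_update_t req[2];

        /* Pin the frame at machine address l2_ma as an L2 page table. */
        req[0].ptr = l2_ma | MMU_EXTENDED_COMMAND;
        req[0].val = MMUEXT_PIN_L2_TABLE;

        /* Checked write of new_pte into the entry at machine address pte_ma. */
        req[1].ptr = pte_ma | MMU_NORMAL_PT_UPDATE;
        req[1].val = new_pte;

        (void)HYPERVISOR_mmu_update(req, 2);
    }
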
@@ -1006,48 +1062,34 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
int do_update_va_mapping(unsigned long page_nr,
unsigned long val,
- unsigned long flags)
+ unsigned long caller_flags)
{
- unsigned long _x, cr0 = 0;
struct task_struct *p = current;
- int err = -EINVAL;
+ int err = 0;
+ unsigned int cpu = p->processor;
+ unsigned long defer_flags;
if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
- goto out;
-
- spin_lock(&p->page_lock);
+ return -EINVAL;
- /* Check that the VA's page-directory entry is present.. */
- if ( unlikely((err = __get_user(_x, (unsigned long *)
- (&linear_pg_table[page_nr]))) != 0) )
- goto unlock_and_out;
-
- /* If the VA's page-directory entry is read-only, we frob the WP bit. */
- if ( unlikely(__put_user(_x, (unsigned long *)
- (&linear_pg_table[page_nr]))) )
- {
- cr0 = read_cr0();
- write_cr0(cr0 & ~X86_CR0_WP);
- }
-
- if ( unlikely(mod_l1_entry(&linear_pg_table[page_nr],
- mk_l1_pgentry(val)) != 0) )
- {
+ if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr],
+ mk_l1_pgentry(val))) )
err = -EINVAL;
- goto check_cr0_unlock_and_out;
- }
- if ( unlikely(flags & UVMF_INVLPG) )
+ defer_flags = deferred_op[cpu].flags;
+ deferred_op[cpu].flags = 0;
+
+ if ( unlikely(defer_flags & DOP_FLUSH_TLB) ||
+ unlikely(caller_flags & UVMF_FLUSH_TLB) )
+ write_cr3_counted(pagetable_val(p->mm.pagetable));
+ else if ( unlikely(caller_flags & UVMF_INVLPG) )
__flush_tlb_one(page_nr << PAGE_SHIFT);
- if ( unlikely(flags & UVMF_FLUSH_TLB) )
- __write_cr3_counted(pagetable_val(p->mm.pagetable));
+ if ( unlikely(defer_flags & DOP_RELOAD_LDT) )
+ (void)map_ldt_shadow_page(0);
+
+ if ( unlikely(defer_flags & DOP_RESTORE_CR0) )
+ write_cr0(deferred_op[cpu].cr0);
- check_cr0_unlock_and_out:
- if ( unlikely(cr0 != 0) )
- write_cr0(cr0);
- unlock_and_out:
- spin_unlock(&p->page_lock);
- out:
return err;
}
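This is the single-entry companion to the batched interface above. Under the same assumption about the xenolinux wrappers, a guest remapping one of its own virtual addresses would issue something like the following (the helper name and exact wrapper prototype are illustrative):

    int example_remap_va(unsigned long va, unsigned long new_pte)
    {
        /* Index of the PTE covering 'va'; ask for a local INVLPG afterwards. */
        return HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, new_pte,
                                            UVMF_INVLPG);
    }
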
diff --git a/xen/common/network.c b/xen/common/network.c
index 02b6f57580..14bfa8dac5 100644
--- a/xen/common/network.c
+++ b/xen/common/network.c
@@ -90,7 +90,7 @@ net_vif_t *create_net_vif(int domain)
if ( sizeof(net_ring_t) > PAGE_SIZE ) BUG();
new_ring = (net_ring_t *)get_free_page(GFP_KERNEL);
clear_page(new_ring);
- SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), domain);
+ SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p);
/*
* Fill in the new vif struct. Note that, while the vif's refcnt is
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 9e227f574a..ca609438e0 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -188,12 +188,12 @@ void __init init_page_allocator(unsigned long min, unsigned long max)
/* Release a PHYSICAL address range to the allocator. */
void release_bytes_to_allocator(unsigned long min, unsigned long max)
{
- min = round_pgup (min) + PAGE_OFFSET;
- max = round_pgdown(max) + PAGE_OFFSET;
+ min = round_pgup (min);
+ max = round_pgdown(max);
while ( min < max )
{
- __free_pages(min, 0);
+ __free_pages(min+PAGE_OFFSET, 0);
min += PAGE_SIZE;
}
}
@@ -210,7 +210,6 @@ unsigned long __get_free_pages(int mask, int order)
retry:
spin_lock_irqsave(&alloc_lock, flags);
-
/* Find smallest order which can satisfy the request. */
for ( i = order; i < FREELIST_SIZE; i++ ) {
if ( !FREELIST_EMPTY(free_head[i]) )
diff --git a/xen/drivers/block/ll_rw_blk.c b/xen/drivers/block/ll_rw_blk.c
index 55fbdf3e79..9e1b0de266 100644
--- a/xen/drivers/block/ll_rw_blk.c
+++ b/xen/drivers/block/ll_rw_blk.c
@@ -14,31 +14,15 @@
#include <xeno/types.h>
#include <xeno/lib.h>
#include <xeno/sched.h>
-/*#include <xeno/kernel_stat.h>*/
#include <xeno/errno.h>
-/*#include <xeno/locks.h>*/
#include <xeno/mm.h>
-/*#include <xeno/swap.h>*/
#include <xeno/init.h>
-/*#include <xeno/smp_lock.h>*/
-/*#include <xeno/completion.h>*/
-
#include <asm/system.h>
#include <asm/io.h>
#include <xeno/blk.h>
-/*#include <xeno/highmem.h>*/
#include <xeno/slab.h>
#include <xeno/module.h>
-/*
- * KAF: We can turn off noise relating to barking guest-OS requests.
- */
-#if 0
-#define DPRINTK(_f, _a...) printk(_f , ## _a)
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
/* This will die as all synchronous stuff is coming to an end */
#if 0
#define complete(_r) panic("completion.h stuff may be needed...")
@@ -47,8 +31,6 @@
#define complete(_r) (*(int *)(_r) = 0)
#endif
-
-
/*
* MAC Floppy IWM hooks
*/
diff --git a/xen/drivers/block/xen_block.c b/xen/drivers/block/xen_block.c
index 5103d85ffd..8b1cb119e6 100644
--- a/xen/drivers/block/xen_block.c
+++ b/xen/drivers/block/xen_block.c
@@ -20,12 +20,6 @@
#include <xeno/vbd.h>
#include <xeno/slab.h>
-#if 0
-#define DPRINTK(_f, _a...) printk( _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
/*
* These are rather arbitrary. They are fairly large because adjacent
* requests pulled from a communication ring are quite likely to end
@@ -60,15 +54,11 @@ static atomic_t nr_pending;
static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
-static int __buffer_is_valid(struct task_struct *p,
- unsigned long buffer,
- unsigned short size,
- int writeable_buffer);
-static void __lock_buffer(unsigned long buffer,
- unsigned short size,
- int writeable_buffer);
-static void unlock_buffer(struct task_struct *p,
- unsigned long buffer,
+static int lock_buffer(struct task_struct *p,
+ unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer);
+static void unlock_buffer(unsigned long buffer,
unsigned short size,
int writeable_buffer);
@@ -185,8 +175,7 @@ static void end_block_io_op_softirq(struct softirq_action *h)
{
pending_req = bh->pending_req;
- unlock_buffer(pending_req->domain,
- virt_to_phys(bh->b_data),
+ unlock_buffer(virt_to_phys(bh->b_data),
bh->b_size,
(pending_req->operation==READ));
@@ -321,97 +310,60 @@ long do_block_io_op(block_io_op_t *u_block_io_op)
* DOWNWARD CALLS -- These interface with the block-device layer proper.
*/
-static int __buffer_is_valid(struct task_struct *p,
- unsigned long buffer,
- unsigned short size,
- int writeable_buffer)
+static int lock_buffer(struct task_struct *p,
+ unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer)
{
unsigned long pfn;
struct pfn_info *page;
- int rc = 0;
- /* A request may span multiple page frames. Each must be checked. */
for ( pfn = buffer >> PAGE_SHIFT;
pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
pfn++ )
{
- /* Each frame must be within bounds of machine memory. */
- if ( pfn >= max_page )
- {
- DPRINTK("pfn out of range: %08lx\n", pfn);
- goto out;
- }
+ if ( unlikely(pfn >= max_page) )
+ goto fail;
- page = frame_table + pfn;
+ page = &frame_table[pfn];
- /* Each frame must belong to the requesting domain. */
- if ( (page->flags & PG_domain_mask) != p->domain )
- {
- DPRINTK("bad domain: expected %d, got %ld\n",
- p->domain, page->flags & PG_domain_mask);
- goto out;
- }
+ if ( unlikely(!get_page(page, p)) )
+ goto fail;
- /* If reading into the frame, the frame must be writeable. */
- if ( writeable_buffer &&
- ((page->flags & PG_type_mask) != PGT_writeable_page) &&
- (page_type_count(page) != 0) )
+ if ( writeable_buffer &&
+ unlikely(!get_page_type(page, PGT_writeable_page)) )
{
- DPRINTK("non-writeable page passed for block read\n");
- goto out;
+ put_page(page);
+ goto fail;
}
- }
-
- rc = 1;
- out:
- return rc;
-}
+ }
-static void __lock_buffer(unsigned long buffer,
- unsigned short size,
- int writeable_buffer)
-{
- unsigned long pfn;
- struct pfn_info *page;
+ return 1;
- for ( pfn = buffer >> PAGE_SHIFT;
- pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
- pfn++ )
- {
- page = frame_table + pfn;
+ fail:
+ while ( pfn-- > (buffer >> PAGE_SHIFT) )
+ {
if ( writeable_buffer )
- {
- if ( page_type_count(page) == 0 )
- {
- page->flags &= ~PG_type_mask;
- /* No need for PG_need_flush here. */
- page->flags |= PGT_writeable_page;
- }
- get_page_type(page);
- }
- get_page_tot(page);
+ put_page_type(&frame_table[pfn]);
+ put_page(&frame_table[pfn]);
}
+ return 0;
}
-static void unlock_buffer(struct task_struct *p,
- unsigned long buffer,
+static void unlock_buffer(unsigned long buffer,
unsigned short size,
int writeable_buffer)
{
- unsigned long pfn;
- struct pfn_info *page;
+ unsigned long pfn;
- spin_lock(&p->page_lock);
for ( pfn = buffer >> PAGE_SHIFT;
pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
pfn++ )
{
- page = frame_table + pfn;
if ( writeable_buffer )
- put_page_type(page);
- put_page_tot(page);
+ put_page_type(&frame_table[pfn]);
+ put_page(&frame_table[pfn]);
}
- spin_unlock(&p->page_lock);
}
static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
@@ -480,8 +432,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
int new_segs, nr_psegs = 0;
phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
- spin_lock(&p->page_lock);
-
/* Check that number of segments is sane. */
if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
{
@@ -506,7 +456,7 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
goto bad_descriptor;
}
- if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
+ if ( !lock_buffer(p, buffer, nr_sects<<9, (operation==READ)) )
{
DPRINTK("invalid buffer\n");
goto bad_descriptor;
@@ -530,6 +480,7 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
req->sector_number + tot_sects,
req->sector_number + tot_sects + nr_sects,
req->device);
+ unlock_buffer(buffer, nr_sects<<9, (operation==READ));
goto bad_descriptor;
}
@@ -546,12 +497,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
}
- /* Lock pages associated with each buffer head. */
- for ( i = 0; i < nr_psegs; i++ )
- __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9,
- (operation==READ));
- spin_unlock(&p->page_lock);
-
atomic_inc(&nr_pending);
pending_req = pending_reqs + pending_ring[pending_cons];
PENDREQ_IDX_INC(pending_cons);
@@ -594,7 +539,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
return;
bad_descriptor:
- spin_unlock(&p->page_lock);
make_response(p, req->id, req->operation, 1);
}
@@ -670,7 +614,7 @@ void init_blkdev_info(struct task_struct *p)
if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG();
p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
clear_page(p->blk_ring_base);
- SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain);
+ SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p);
p->blkdev_list.next = NULL;
spin_lock_init(&p->vbd_lock);
}
@@ -680,7 +624,6 @@ void destroy_blkdev_info(struct task_struct *p)
{
ASSERT(!__on_blkdev_list(p));
UNSHARE_PFN(virt_to_page(p->blk_ring_base));
- free_page((unsigned long)p->blk_ring_base);
destroy_all_vbds(p);
}
diff --git a/xen/drivers/block/xen_vbd.c b/xen/drivers/block/xen_vbd.c
index f16adb6795..13da02d03c 100644
--- a/xen/drivers/block/xen_vbd.c
+++ b/xen/drivers/block/xen_vbd.c
@@ -23,13 +23,6 @@
extern int ide_probe_devices(xen_disk_info_t *xdi);
extern int scsi_probe_devices(xen_disk_info_t *xdi);
-
-#if 0
-#define DPRINTK(_f, _a...) printk( _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
/* XXX SMH: crappy 'hash function' .. fix when care. */
#define HSH(_x) ((_x) & (VBD_HTAB_SZ - 1))
@@ -447,16 +440,9 @@ long vbd_probe(vbd_probe_t *probe)
if ( (probe->domain == VBD_PROBE_ALL) || IS_PRIV(p) )
{
/* Privileged domains always get access to the 'real' devices. */
- if ( (ret = ide_probe_devices(&probe->xdi)) != 0 )
- {
- DPRINTK("vbd_probe: error %d in probing ide devices\n", ret);
+ if ( ((ret = ide_probe_devices(&probe->xdi)) != 0) ||
+ ((ret = scsi_probe_devices(&probe->xdi)) != 0) )
goto out;
- }
- if ( (ret = scsi_probe_devices(&probe->xdi)) != 0 )
- {
- DPRINTK("vbd_probe: error %d in probing scsi devices\n", ret);
- goto out;
- }
}
if ( probe->domain == VBD_PROBE_ALL )
@@ -469,8 +455,6 @@ long vbd_probe(vbd_probe_t *probe)
{
if( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
{
- DPRINTK("vbd_probe: error %d in probing virtual devices\n",
- ret);
read_unlock_irqrestore(&tasklist_lock, flags);
goto out;
}
@@ -478,17 +462,12 @@ long vbd_probe(vbd_probe_t *probe)
}
read_unlock_irqrestore(&tasklist_lock, flags);
}
- else
- {
- if ( (ret = vbd_probe_devices(&probe->xdi, p)) )
- {
- DPRINTK("vbd_probe: error %d in probing virtual devices\n", ret);
- goto out;
- }
-
- }
+ else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
+ goto out;
out:
+ if ( ret != 0 )
+ DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret);
if ( p != NULL )
put_task_struct(p);
return ret;
diff --git a/xen/drivers/net/e1000/e1000_main.c b/xen/drivers/net/e1000/e1000_main.c
index 4d88a61465..f6f5bb7aa8 100644
--- a/xen/drivers/net/e1000/e1000_main.c
+++ b/xen/drivers/net/e1000/e1000_main.c
@@ -1816,10 +1816,12 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
static void
e1000_tx_timeout(struct net_device *netdev)
{
+#if 0
struct e1000_adapter *adapter = netdev->priv;
/* Do the reset outside of interrupt context */
- //schedule_work(&adapter->tx_timeout_task);
+ schedule_work(&adapter->tx_timeout_task);
+#endif
e1000_tx_timeout_task(netdev); // XXXX HACK!!! XEN
}
diff --git a/xen/include/asm-i386/atomic.h b/xen/include/asm-i386/atomic.h
index 70a1212ed6..9dcdca93f7 100644
--- a/xen/include/asm-i386/atomic.h
+++ b/xen/include/asm-i386/atomic.h
@@ -186,15 +186,6 @@ static __inline__ int atomic_add_negative(int i, atomic_t *v)
return c;
}
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr) \
-__asm__ __volatile__(LOCK "andl %0,%1" \
-: : "r" (~(mask)),"m" (*addr) : "memory")
-
-#define atomic_set_mask(mask, addr) \
-__asm__ __volatile__(LOCK "orl %0,%1" \
-: : "r" (mask),"m" (*addr) : "memory")
-
/* Atomic operations are already serializing on x86 */
#define smp_mb__before_atomic_dec() barrier()
#define smp_mb__after_atomic_dec() barrier()
diff --git a/xen/include/asm-i386/flushtlb.h b/xen/include/asm-i386/flushtlb.h
index 3a063fc0cb..e6f61cb521 100644
--- a/xen/include/asm-i386/flushtlb.h
+++ b/xen/include/asm-i386/flushtlb.h
@@ -1,40 +1,39 @@
/******************************************************************************
* flushtlb.h
*
- * TLB flush macros that count flushes. Counting is used to enforce
- * zero-copy safety, particularily for the network code.
- *
- * akw - Jan 21, 2003
+ * TLB flushes are timestamped using a global virtual 'clock' which ticks
+ * on any TLB flush on any processor.
+ *
+ * Copyright (c) 2003, K A Fraser
*/
-#ifndef __FLUSHTLB_H
-#define __FLUSHTLB_H
+#ifndef __FLUSHTLB_H__
+#define __FLUSHTLB_H__
#include <xeno/smp.h>
-#include <asm/atomic.h>
-
-atomic_t tlb_flush_count[NR_CPUS];
-
-#define __write_cr3_counted(__pa) \
- do { \
- __asm__ __volatile__ ( \
- "movl %0, %%cr3;" \
- :: "r" (__pa) \
- : "memory"); \
- atomic_inc(&tlb_flush_count[smp_processor_id()]); \
- } while (0)
-
-#define __flush_tlb_counted() \
- do { \
- unsigned int tmpreg; \
- \
- __asm__ __volatile__( \
- "movl %%cr3, %0; # flush TLB \n" \
- "movl %0, %%cr3; " \
- : "=r" (tmpreg) \
- :: "memory"); \
- atomic_inc(&tlb_flush_count[smp_processor_id()]); \
- } while (0)
-
-#endif
-
+
+/*
+ * Every GLOBAL_FLUSH_PERIOD ticks of the tlbflush clock, every TLB in the
+ * system is guaranteed to have been flushed.
+ */
+#define GLOBAL_FLUSH_PERIOD (1<<16)
+
+/*
+ * '_cpu_stamp' is the current timestamp for the CPU we are testing.
+ * '_lastuse_stamp' is a timestamp taken when the PFN we are testing was last
+ * used for a purpose that may have caused the CPU's TLB to become tainted.
+ */
+#define NEED_FLUSH(_cpu_stamp, _lastuse_stamp) \
+ (((_cpu_stamp) > (_lastuse_stamp)) || \
+ (((_lastuse_stamp) - (_cpu_stamp)) > (2*GLOBAL_FLUSH_PERIOD)))
+
+extern unsigned long tlbflush_mask;
+extern unsigned long tlbflush_clock;
+extern unsigned long tlbflush_time[NR_CPUS];
+
+extern void new_tlbflush_clock_period(void);
+
+extern void write_cr3_counted(unsigned long pa);
+extern void flush_tlb_counted(void);
+
+#endif /* __FLUSHTLB_H__ */
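A sketch of how the clock is consumed, modelled on the way alloc_page_type() in xen/common/memory.c uses it (the helper is hypothetical): a CPU is flushed only if its last flush may predate the frame's last tainting use.

    /* Flush 'cpu' only if its TLB may still hold stale entries for 'page'. */
    static void example_flush_if_needed(struct pfn_info *page, int cpu)
    {
        if ( NEED_FLUSH(tlbflush_time[cpu], page->tlbflush_timestamp) )
            flush_tlb_cpu(cpu);
    }
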
diff --git a/xen/include/asm-i386/io.h b/xen/include/asm-i386/io.h
index 9b54ae278d..1bae91a1e2 100644
--- a/xen/include/asm-i386/io.h
+++ b/xen/include/asm-i386/io.h
@@ -36,10 +36,9 @@ static inline void * phys_to_virt(unsigned long address)
return __va(address);
}
-/*
- * Change "struct page" to physical address.
- */
-#define page_to_phys(page) ((page - frame_table) << PAGE_SHIFT)
+#define page_to_pfn(_page) ((unsigned long)((_page) - frame_table))
+#define page_to_phys(_page) (page_to_pfn(_page) << PAGE_SHIFT)
+#define page_to_virt(_page) phys_to_virt(page_to_phys(_page))
extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
diff --git a/xen/include/asm-i386/page.h b/xen/include/asm-i386/page.h
index c9191c43eb..2fc1c43ef0 100644
--- a/xen/include/asm-i386/page.h
+++ b/xen/include/asm-i386/page.h
@@ -92,7 +92,7 @@ typedef struct { unsigned long pt_lo; } pagetable_t;
extern l2_pgentry_t idle_pg_table[ENTRIES_PER_L2_PAGETABLE];
extern void paging_init(void);
-#define __flush_tlb() __flush_tlb_counted()
+#define __flush_tlb() flush_tlb_counted()
/* Flush global pages as well. */
@@ -111,10 +111,10 @@ extern void paging_init(void);
} while (0)
-#define __flush_tlb_all() \
+#define __flush_tlb_pge() \
do { \
__pge_off(); \
- __flush_tlb_counted(); \
+ flush_tlb_counted(); \
__pge_on(); \
} while (0)
diff --git a/xen/include/asm-i386/pgalloc.h b/xen/include/asm-i386/pgalloc.h
index 841e5fd4a1..88e9064641 100644
--- a/xen/include/asm-i386/pgalloc.h
+++ b/xen/include/asm-i386/pgalloc.h
@@ -47,28 +47,24 @@
#ifndef CONFIG_SMP
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
-#define flush_tlb_cpu(_cpu) __flush_tlb()
+#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() __flush_tlb()
+#define flush_tlb_all_pge() __flush_tlb_pge()
+#define local_flush_tlb() __flush_tlb()
+#define flush_tlb_cpu(_cpu) __flush_tlb()
+#define flush_tlb_mask(_mask) __flush_tlb()
#else
#include <xeno/smp.h>
+extern void flush_tlb_mask(unsigned long mask);
+extern void flush_tlb_all_pge(void);
+
#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() flush_tlb_mask((1 << smp_num_cpus) - 1)
#define local_flush_tlb() __flush_tlb()
-
-extern void flush_tlb_all(void);
-
-extern void flush_tlb_others(unsigned long cpumask);
-static inline void flush_tlb_cpu(unsigned int cpu)
-{
- if ( cpu == smp_processor_id() )
- __flush_tlb();
- else
- flush_tlb_others(1<<cpu);
-}
+#define flush_tlb_cpu(_cpu) flush_tlb_mask(1 << (_cpu))
#endif
diff --git a/xen/include/asm-i386/smp.h b/xen/include/asm-i386/smp.h
index cfec568c43..08eef3c8bd 100644
--- a/xen/include/asm-i386/smp.h
+++ b/xen/include/asm-i386/smp.h
@@ -1,15 +1,8 @@
#ifndef __ASM_SMP_H
#define __ASM_SMP_H
-#ifndef __ASSEMBLY__
#include <xeno/config.h>
#include <asm/ptrace.h>
-#include <asm/fixmap.h>
-#include <asm/bitops.h>
-#include <asm/mpspec.h>
-#include <asm/io_apic.h>
-#include <asm/apic.h>
-#endif
#ifdef CONFIG_SMP
#define TARGET_CPUS cpu_online_map
@@ -18,8 +11,6 @@
#endif
#ifdef CONFIG_SMP
-#ifndef __ASSEMBLY__
-
/*
* Private routines/data
*/
@@ -74,6 +65,9 @@ extern void smp_store_cpu_info(int id); /* Store per CPU info (like the initial
#define smp_processor_id() (current->processor)
+#include <asm/fixmap.h>
+#include <asm/apic.h>
+
static __inline int hard_smp_processor_id(void)
{
/* we don't want to mark this access volatile - bad code generation */
@@ -86,7 +80,5 @@ static __inline int logical_smp_processor_id(void)
return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
}
-#endif /* !__ASSEMBLY__ */
-
#endif
#endif
diff --git a/xen/include/asm-i386/spinlock.h b/xen/include/asm-i386/spinlock.h
index 59dc7b209f..9a4fc8573d 100644
--- a/xen/include/asm-i386/spinlock.h
+++ b/xen/include/asm-i386/spinlock.h
@@ -1,11 +1,10 @@
#ifndef __ASM_SPINLOCK_H
#define __ASM_SPINLOCK_H
-#include <asm/atomic.h>
-#include <asm/rwlock.h>
-#include <asm/page.h>
#include <xeno/config.h>
#include <xeno/lib.h>
+#include <asm/atomic.h>
+#include <asm/rwlock.h>
#if 0
#define SPINLOCK_DEBUG 1
diff --git a/xen/include/asm-i386/system.h b/xen/include/asm-i386/system.h
index dc4ac3398b..3e85277d6c 100644
--- a/xen/include/asm-i386/system.h
+++ b/xen/include/asm-i386/system.h
@@ -93,7 +93,34 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
#define cmpxchg(ptr,o,n)\
((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
(unsigned long)(n),sizeof(*(ptr))))
-
+
+
+/*
+ * This macro atomically changes the longword at location _p from _o to _n.
+ * If this access causes a fault then we return 1, otherwise we return 0.
+ * If no fault occurs then _o is updated to the value we saw at _p. If this
+ * is the same as the initial value of _o then _n is written to location _p.
+ */
+#define cmpxchg_user(_p,_o,_n) \
+({ \
+ int _rc; \
+ __asm__ __volatile__ ( \
+ "1: " LOCK_PREFIX "cmpxchgl %2,%3\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: movl $1,%1\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 1b,3b\n" \
+ ".previous" \
+ : "=a" (_o), "=r" (_rc) \
+ : "q" (_n), "m" (*__xg((volatile void *)_p)), "0" (_o), "1" (0) \
+ : "memory"); \
+ _rc; \
+})
+
/*
* Force strict CPU ordering.
* And yes, this is required on UP too when we're talking
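A sketch of the intended use, modelled on update_l1e() in xen/common/memory.c: the expected value is passed in a local that the macro overwrites with what was actually observed, a non-zero result means the write faulted, and the caller decides whether to clear CR0.WP and retry. The helper below is illustrative only:

    static int example_swap_word(unsigned long *p, unsigned long expected,
                                 unsigned long newval)
    {
        unsigned long seen = expected;

        /* Non-zero return: the write faulted (e.g. read-only PTE with CR0.WP set). */
        if ( cmpxchg_user(p, seen, newval) != 0 )
            return -1;

        /* The swap happened iff the observed value matched what we expected. */
        return (seen == expected);
    }
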
diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h
index 045e4ad70e..e1d20ff2c7 100644
--- a/xen/include/hypervisor-ifs/dom0_ops.h
+++ b/xen/include/hypervisor-ifs/dom0_ops.h
@@ -141,8 +141,8 @@ typedef struct dom0_getpageframeinfo_st
{
/* IN variables. */
unsigned long pfn; /* Machine page frame number to query. */
- /* OUT variables. */
unsigned int domain; /* To which domain does the frame belong? */
+ /* OUT variables. */
enum { NONE, L1TAB, L2TAB } type; /* Is the page PINNED to a type? */
} dom0_getpageframeinfo_t;
diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h
index 5bd13dba9b..145b1a0aac 100644
--- a/xen/include/hypervisor-ifs/hypervisor-if.h
+++ b/xen/include/hypervisor-ifs/hypervisor-if.h
@@ -125,9 +125,9 @@
* which shifts the least bits out.
*/
/* A normal page-table update request. */
-#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is VA. */
+#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
/* DOM0 can make entirely unchecked updates which do not affect refcnts. */
-#define MMU_UNCHECKED_PT_UPDATE 1 /* unchecked '*ptr = val'. ptr is VA. */
+#define MMU_UNCHECKED_PT_UPDATE 1 /* unchecked '*ptr = val'. ptr is MA. */
/* Update an entry in the machine->physical mapping table. */
#define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */
/* An extended command. */
diff --git a/xen/include/xeno/config.h b/xen/include/xeno/config.h
index 64a99f66ce..c88e41d15b 100644
--- a/xen/include/xeno/config.h
+++ b/xen/include/xeno/config.h
@@ -145,6 +145,13 @@
#define capable(_c) 0
+#ifndef NDEBUG
+#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \
+ __FILE__, __LINE__, ## _a)
+#else
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
#ifndef __ASSEMBLY__
#include <xeno/compiler.h>
diff --git a/xen/include/xeno/mm.h b/xen/include/xeno/mm.h
index 8f0c032367..d5c3c5d6cb 100644
--- a/xen/include/xeno/mm.h
+++ b/xen/include/xeno/mm.h
@@ -3,34 +3,35 @@
#define __XENO_MM_H__
#include <xeno/config.h>
+#include <xeno/list.h>
+#include <xeno/spinlock.h>
+#include <xeno/perfc.h>
+#include <xeno/sched.h>
+
+#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/desc.h>
-#include <xeno/list.h>
+#include <asm/flushtlb.h>
+#include <asm/io.h>
+
#include <hypervisor-ifs/hypervisor-if.h>
-#include <xeno/spinlock.h>
-/* XXX KAF: These may die eventually, but so many refs in slab.c :((( */
+/*
+ * These are for compatibility with calls to the Linux memory allocators.
+ */
-/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */
#define __GFP_DMA 0x01
-
-/* Action modifiers - doesn't change the zoning */
+#define GFP_DMA __GFP_DMA
#define __GFP_WAIT 0x10 /* Can wait and reschedule? */
#define __GFP_HIGH 0x20 /* Should access emergency pools? */
#define __GFP_IO 0x40 /* Can start low memory physical IO? */
#define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */
#define __GFP_FS 0x100 /* Can call down to low-level FS? */
-
#define GFP_ATOMIC (__GFP_HIGH)
-#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
-
-/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
- platforms, used as appropriate on others */
+#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | \
+ __GFP_HIGHIO | __GFP_FS)
-#define GFP_DMA __GFP_DMA
-
-
-/******************************************************************************
+/*
* The following is for page_alloc.c.
*/
@@ -44,95 +45,80 @@ void __free_pages(unsigned long p, int order);
#define free_page(_p) (__free_pages(_p,0))
-/******************************************************************************
- * The following is the array of page info. One entry per page owned
- * by the hypervisor, indexed from `mem_map', just like Linux.
- *
- * 12.11.02. We no longer use struct page or mem_map, these are replaced
- * with struct pfn_info and frame_table respectively. Boris Dragovic
- */
-
-typedef struct pfn_info {
- struct list_head list; /* ->mapping has some page lists. */
- unsigned long flags; /* atomic flags. */
- unsigned long tot_count; /* Total domain usage count. */
- unsigned long type_count; /* pagetable/dir, or domain-writeable refs. */
-} frame_table_t;
-
-#define get_page_tot(p) ((p)->tot_count++)
-#define put_page_tot(p) \
- ({ ASSERT((p)->tot_count != 0); --(p)->tot_count; })
-#define page_tot_count(p) ((p)->tot_count)
-#define set_page_tot_count(p,v) ((p)->tot_count = v)
-
-#define get_page_type(p) ((p)->type_count++)
-#define put_page_type(p) \
- ({ ASSERT((p)->type_count != 0); --(p)->type_count; })
-#define page_type_count(p) ((p)->type_count)
-#define set_page_type_count(p,v) ((p)->type_count = v)
-
-#define PG_domain_mask MAX_DOMAIN_ID /* owning domain (16 bits) */
-/* hypervisor flags (domain == 0) */
-#define PG_slab 24
-/* domain flags (domain != 0) */
/*
- * NB. The following page types are MUTUALLY EXCLUSIVE.
- * At most one can be true at any point, and 'type_count' counts how many
- * references exist of the current type. A change in type can only occur
- * when type_count == 0.
+ * Per-page-frame information.
*/
-#define PG_type_mask (15<<24) /* bits 24-27 */
-#define PGT_none (0<<24) /* no special uses of this page */
-#define PGT_l1_page_table (1<<24) /* using this page as an L1 page table? */
-#define PGT_l2_page_table (2<<24) /* using this page as an L2 page table? */
-#define PGT_l3_page_table (3<<24) /* using this page as an L3 page table? */
-#define PGT_l4_page_table (4<<24) /* using this page as an L4 page table? */
-#define PGT_gdt_page (5<<24) /* using this page in a GDT? */
-#define PGT_ldt_page (6<<24) /* using this page in an LDT? */
-#define PGT_writeable_page (7<<24) /* has writable mappings of this page? */
-/*
- * This bit indicates that the TLB must be flushed when the type count of this
- * frame drops to zero. This is needed on current x86 processors only for
- * frames which have guestos-accessible writeable mappings. In this case we
- * must prevent stale TLB entries allowing the frame to be written if it used
- * for a page table, for example.
- *
- * We have this bit because the writeable type is actually also used to pin a
- * page when it is used as a disk read buffer. This doesn't require a TLB flush
- * because the frame never has a mapping in the TLB.
- */
-#define PG_need_flush (1<<28)
+struct pfn_info
+{
+ /* Each frame can be threaded onto a doubly-linked list. */
+ struct list_head list;
+ /* The following possible uses are context-dependent. */
+ union {
+ /* Page is in use and not a zombie: we keep a pointer to its owner. */
+ struct task_struct *domain;
+ /* Page is not currently allocated: mask of possibly-tainted TLBs. */
+ unsigned long cpu_mask;
+ /* Page is a zombie: this word currently has no use. */
+ unsigned long _unused;
+ } u;
+ /* Reference count and various PGC_xxx flags and fields. */
+ unsigned long count_and_flags;
+ /* Type reference count and various PGT_xxx flags and fields. */
+ unsigned long type_and_flags;
+ /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
+ unsigned long tlbflush_timestamp;
+};
-/*
- * This bit indicates that the guest OS has pinned the page to its current
- * type. For page tables this can avoid the frame scanning and reference-count
- * updates that occur when the type count falls to zero.
- */
-#define PG_guest_pinned (1<<29)
+ /* The following page types are MUTUALLY EXCLUSIVE. */
+#define PGT_none (0<<29) /* no special uses of this page */
+#define PGT_l1_page_table (1<<29) /* using this page as an L1 page table? */
+#define PGT_l2_page_table (2<<29) /* using this page as an L2 page table? */
+#define PGT_l3_page_table (3<<29) /* using this page as an L3 page table? */
+#define PGT_l4_page_table (4<<29) /* using this page as an L4 page table? */
+#define PGT_gdt_page (5<<29) /* using this page in a GDT? */
+#define PGT_ldt_page (6<<29) /* using this page in an LDT? */
+#define PGT_writeable_page (7<<29) /* has writable mappings of this page? */
+#define PGT_type_mask (7<<29) /* Bits 29-31. */
+ /* Has this page been validated for use as its current type? */
+#define _PGT_validated 28
+#define PGT_validated (1<<_PGT_validated)
+ /* 28-bit count of uses of this frame as its current type. */
+#define PGT_count_mask ((1<<28)-1)
-#define PageSlab(page) test_bit(PG_slab, &(page)->flags)
-#define PageSetSlab(page) set_bit(PG_slab, &(page)->flags)
-#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags)
+ /* The owner of this page is dead: 'u.domain' is no longer valid. */
+#define _PGC_zombie 31
+#define PGC_zombie (1<<_PGC_zombie)
+ /* For safety, force a TLB flush when this page's type changes. */
+#define _PGC_tlb_flush_on_type_change 30
+#define PGC_tlb_flush_on_type_change (1<<_PGC_tlb_flush_on_type_change)
+ /* Owning guest has pinned this page to its current type? */
+#define _PGC_guest_pinned 29
+#define PGC_guest_pinned (1<<_PGC_guest_pinned)
+ /* Cleared when the owning guest 'frees' this page. */
+#define _PGC_allocated 28
+#define PGC_allocated (1<<_PGC_allocated)
+ /* 28-bit count of references to this frame. */
+#define PGC_count_mask ((1<<28)-1)
-#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom) \
- do { \
- (_pfn)->flags = (_dom) | PGT_writeable_page | PG_need_flush; \
- set_page_tot_count((_pfn), 2); \
- set_page_type_count((_pfn), 2); \
- } while ( 0 )
+/* We trust the slab allocator in slab.c, and our use of it. */
+#define PageSlab(page) (1)
+#define PageSetSlab(page) ((void)0)
+#define PageClearSlab(page) ((void)0)
+
+#define IS_XEN_HEAP_FRAME(_pfn) (page_to_phys(_pfn) < MAX_MONITOR_ADDRESS)
-#define UNSHARE_PFN(_pfn) \
- do { \
- (_pfn)->flags = 0; \
- set_page_tot_count((_pfn), 0); \
- set_page_type_count((_pfn), 0); \
+#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom) \
+ do { \
+ (_pfn)->u.domain = (_dom); \
+ wmb(); /* install valid domain ptr before updating refcnt. */ \
+ (_pfn)->count_and_flags = 1; /* Xen holds a writeable reference */ \
+ (_pfn)->type_and_flags = PGT_writeable_page | PGT_validated | 1; \
} while ( 0 )
-/* The array of struct pfn_info,
- * free pfn list and number of free pfns in the free list
- */
-extern frame_table_t * frame_table;
+#define UNSHARE_PFN(_pfn) put_page_and_type(_pfn)
+
+extern struct pfn_info *frame_table;
extern unsigned long frame_table_size;
extern struct list_head free_list;
extern spinlock_t free_list_lock;
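
The new count_and_flags word packs a 28-bit reference count (PGC_count_mask) in its low bits with the PGC_* status flags above it; type_and_flags is laid out the same way for the PGT_* bits. Below is a minimal standalone sketch of that packing, assuming the 32-bit unsigned long of the i386 target; everything around the PGC_* macros is illustrative scaffolding, not part of the patch.

/* User-space model of the count_and_flags layout defined above. */
#include <assert.h>
#include <stdio.h>

#define PGC_zombie                   (1UL << 31)
#define PGC_tlb_flush_on_type_change (1UL << 30)
#define PGC_guest_pinned             (1UL << 29)
#define PGC_allocated                (1UL << 28)
#define PGC_count_mask               ((1UL << 28) - 1)

int main(void)
{
    /* An in-use frame: allocated flag set, two references held. */
    unsigned long caf = PGC_allocated | 2;

    assert((caf & PGC_count_mask) == 2);   /* reference count           */
    assert(caf & PGC_allocated);           /* guest has not freed it    */
    assert(!(caf & PGC_zombie));           /* owning domain still alive */

    /* Dropping a reference only touches the low 28 bits. */
    caf--;
    printf("count=%lu flags=%#lx\n",
           caf & PGC_count_mask, caf & ~PGC_count_mask);
    return 0;
}
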
@@ -140,6 +126,180 @@ extern unsigned int free_pfns;
extern unsigned long max_page;
void init_frametable(unsigned long nr_pages);
+struct pfn_info *alloc_domain_page(struct task_struct *p);
+void free_domain_page(struct pfn_info *page);
+
+int alloc_page_type(struct pfn_info *page, unsigned int type);
+void free_page_type(struct pfn_info *page, unsigned int type);
+
+static inline void put_page(struct pfn_info *page)
+{
+ unsigned long nx, x, y = page->count_and_flags;
+
+ do {
+ x = y;
+ nx = x - 1;
+ }
+ while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
+
+ if ( unlikely((nx & PGC_count_mask) == 0) )
+ free_domain_page(page);
+}
+
+
+static inline int get_page(struct pfn_info *page,
+ struct task_struct *domain)
+{
+ unsigned long x, nx, y = page->count_and_flags;
+ struct task_struct *p, *np = page->u.domain;
+
+ do {
+ x = y;
+ nx = x + 1;
+ p = np;
+ if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */
+ unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
+ unlikely(x & PGC_zombie) || /* Zombie? */
+ unlikely(p != domain) ) /* Wrong owner? */
+ {
+ DPRINTK("Error pfn %08lx: ed=%p,sd=%p,caf=%08lx\n",
+ page_to_pfn(page), domain, p, x);
+ return 0;
+ }
+ __asm__ __volatile__(
+ LOCK_PREFIX "cmpxchg8b %3"
+ : "=a" (np), "=d" (y), "=b" (p),
+ "=m" (*(volatile unsigned long long *)(&page->u.domain))
+ : "0" (p), "1" (x), "b" (p), "c" (nx) );
+ }
+ while ( unlikely(np != p) || unlikely(y != x) );
+
+ return 1;
+}
+
+
+static inline void put_page_type(struct pfn_info *page)
+{
+ unsigned long nx, x, y = page->type_and_flags;
+
+ again:
+ do {
+ x = y;
+ nx = x - 1;
+ if ( unlikely((nx & PGT_count_mask) == 0) )
+ {
+ page->tlbflush_timestamp = tlbflush_clock;
+ if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
+ likely(nx & PGT_validated) )
+ {
+ /*
+ * Page-table pages must be unvalidated when count is zero. The
+ * 'free' is safe because the refcnt is non-zero and the
+ * validated bit is clear => other ops will spin or fail.
+ */
+ if ( unlikely((y = cmpxchg(&page->type_and_flags, x,
+ x & ~PGT_validated)) != x) )
+ goto again;
+ /* We cleared the 'valid bit' so we must do the clean-up. */
+ free_page_type(page, x & PGT_type_mask);
+ /* Carry on as we were, but with the 'valid bit' now clear. */
+ x &= ~PGT_validated;
+ nx &= ~PGT_validated;
+ }
+ }
+ }
+ while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) );
+}
+
+
+static inline int get_page_type(struct pfn_info *page, unsigned long type)
+{
+ unsigned long nx, x, y = page->type_and_flags;
+ again:
+ do {
+ x = y;
+ nx = x + 1;
+ if ( unlikely((nx & PGT_count_mask) == 0) )
+ {
+ DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
+ return 0;
+ }
+ else if ( unlikely((x & PGT_count_mask) == 0) )
+ {
+ if ( (x & PGT_type_mask) != type )
+ {
+ nx &= ~(PGT_type_mask | PGT_validated);
+ nx |= type;
+ /* No extra validation needed for writeable pages. */
+ if ( type == PGT_writeable_page )
+ nx |= PGT_validated;
+ }
+ }
+ else if ( unlikely((x & PGT_type_mask) != type) )
+ {
+ DPRINTK("Unexpected type (saw %08lx != exp %08lx) for pfn %08lx\n",
+ x & PGT_type_mask, type, page_to_pfn(page));
+ return 0;
+ }
+ else if ( unlikely(!(x & PGT_validated)) )
+ {
+ /* Someone else is updating validation of this page. Wait... */
+ while ( (y = page->type_and_flags) != x )
+ {
+ rep_nop();
+ barrier();
+ }
+ goto again;
+ }
+ }
+ while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) );
+
+ if ( unlikely(!(nx & PGT_validated)) )
+ {
+ /* Try to validate page type; drop the new reference on failure. */
+ if ( unlikely(!alloc_page_type(page, type)) )
+ {
+ DPRINTK("Error while validating pfn %08lx for type %08lx\n",
+ page_to_pfn(page), type);
+ put_page_type(page);
+ return 0;
+ }
+ set_bit(_PGT_validated, &page->type_and_flags);
+ }
+
+ return 1;
+}
+
+
+static inline void put_page_and_type(struct pfn_info *page)
+{
+ put_page_type(page);
+ put_page(page);
+}
+
+
+static inline int get_page_and_type(struct pfn_info *page,
+ struct task_struct *domain,
+ unsigned int type)
+{
+ int rc = get_page(page, domain);
+
+ if ( likely(rc) && unlikely(!get_page_type(page, type)) )
+ {
+ put_page(page);
+ rc = 0;
+ }
+
+ return rc;
+}
+
+#define ASSERT_PAGE_IS_TYPE(_p, _t) \
+ ASSERT(((_p)->type_and_flags & PGT_type_mask) == (_t)); \
+ ASSERT(((_p)->type_and_flags & PGT_count_mask) != 0)
+#define ASSERT_PAGE_IS_DOMAIN(_p, _d) \
+ ASSERT(((_p)->count_and_flags & PGC_count_mask) != 0); \
+ ASSERT((_p)->u.domain == (_d))
+
int check_descriptor(unsigned long a, unsigned long b);
/*
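
put_page(), put_page_type() and get_page_type() above all share the same lock-free update pattern: read the packed word, compute the successor value, then retry with cmpxchg until no other CPU has changed the word in between. The following is a minimal user-space sketch of that retry loop, with GCC's __sync_val_compare_and_swap standing in for Xen's cmpxchg; the model_* names and the hard-coded initial count are illustrative only.

#include <stdio.h>

#define PGC_count_mask ((1UL << 28) - 1)

static unsigned long count_and_flags = 3;   /* pretend three references exist */

static void model_put_page(void)
{
    unsigned long x, nx, y = count_and_flags;

    do {
        x  = y;
        nx = x - 1;
        /* Retry if another CPU changed the word between the read and the swap. */
    } while ( (y = __sync_val_compare_and_swap(&count_and_flags, x, nx)) != x );

    if ( (nx & PGC_count_mask) == 0 )
        printf("last reference dropped -> frame would be freed\n");
}

int main(void)
{
    model_put_page();
    model_put_page();
    model_put_page();                       /* prints the free message */
    printf("final count=%lu\n", count_and_flags & PGC_count_mask);
    return 0;
}
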
diff --git a/xen/include/xeno/perfc.h b/xen/include/xeno/perfc.h
index 4048000790..9ea244b3b8 100644
--- a/xen/include/xeno/perfc.h
+++ b/xen/include/xeno/perfc.h
@@ -1,6 +1,6 @@
-/*
- * xen performance counters
- */
+
+#ifndef __XENO_PERFC_H__
+#define __XENO_PERFC_H__
#include <asm/atomic.h>
@@ -53,3 +53,4 @@ extern struct perfcounter_t perfcounters;
#define perfc_addc(x,y) atomic_add((y), &perfcounters.x[smp_processor_id()])
#define perfc_adda(x,y,z) atomic_add((z), &perfcounters.x[y])
+#endif /* __XENO_PERFC_H__ */
diff --git a/xen/include/xeno/perfc_defn.h b/xen/include/xeno/perfc_defn.h
index 033f12c8c9..f81b5bcba1 100644
--- a/xen/include/xeno/perfc_defn.h
+++ b/xen/include/xeno/perfc_defn.h
@@ -12,7 +12,6 @@ PERFCOUNTER( net_hypercalls, "network hypercalls" )
PERFCOUNTER( net_rx_congestion_drop, "net rx congestion drops" )
PERFCOUNTER( net_rx_capacity_drop, "net rx capacity drops" )
PERFCOUNTER( net_rx_delivered, "net rx delivered" )
-PERFCOUNTER( net_rx_tlbflush, "net rx tlb flushes" )
PERFCOUNTER( net_tx_transmitted, "net tx transmitted" )
PERFCOUNTER_CPU( domain_page_tlb_flush, "domain page tlb flushes" )
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h
index 736201446a..6c1984d795 100644
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -4,7 +4,6 @@
#include <xeno/config.h>
#include <xeno/types.h>
#include <xeno/spinlock.h>
-#include <asm/page.h>
#include <asm/ptrace.h>
#include <xeno/smp.h>
#include <asm/processor.h>
@@ -16,7 +15,6 @@
#include <xeno/time.h>
#include <xeno/ac_timer.h>
#include <xeno/delay.h>
-#include <xeno/slab.h>
#define MAX_DOMAIN_NAME 16
@@ -94,9 +92,10 @@ struct task_struct
unsigned int domain; /* domain id */
- struct list_head pg_head;
- unsigned int tot_pages; /* number of pages currently possesed */
- unsigned int max_pages; /* max number of pages that can be possesed */
+ spinlock_t page_list_lock;
+ struct list_head page_list;
+ unsigned int tot_pages; /* number of pages currently possessed */
+ unsigned int max_pages; /* max number of pages that can be possessed */
/* scheduling */
struct list_head run_list;
@@ -132,8 +131,6 @@ struct task_struct
/* VM */
struct mm_struct mm;
- /* We need this lock to check page types and frob reference counts. */
- spinlock_t page_lock;
mm_segment_t addr_limit;
@@ -194,6 +191,8 @@ extern struct task_struct *idle_task[NR_CPUS];
#define STACK_SIZE PAGE_SIZE
+#include <xeno/slab.h>
+
extern kmem_cache_t *task_struct_cachep;
#define alloc_task_struct() \
((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL))
diff --git a/xen/include/xeno/vif.h b/xen/include/xeno/vif.h
index f3ee9fa616..a557cb3802 100644
--- a/xen/include/xeno/vif.h
+++ b/xen/include/xeno/vif.h
@@ -34,7 +34,7 @@ extern struct net_device *the_dev;
typedef struct rx_shadow_entry_st
{
unsigned short id;
- unsigned short flush_count; /* 16 bits should be enough */
+ unsigned short _pad;
unsigned long pte_ptr;
unsigned long buf_pfn;
} rx_shadow_entry_t;
diff --git a/xen/net/dev.c b/xen/net/dev.c
index 280db4def1..91d6a4e0cf 100644
--- a/xen/net/dev.c
+++ b/xen/net/dev.c
@@ -39,12 +39,6 @@
#define rtnl_lock() ((void)0)
#define rtnl_unlock() ((void)0)
-#if 0
-#define DPRINTK(_f, _a...) printk(_f , ## _a)
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
#define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1))
#define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1))
#define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
@@ -54,9 +48,9 @@ struct skb_completion_queues skb_queue[NR_CPUS] __cacheline_aligned;
static int get_tx_bufs(net_vif_t *vif);
-static void __make_tx_response(net_vif_t *vif,
- unsigned short id,
- unsigned char st);
+static void make_tx_response(net_vif_t *vif,
+ unsigned short id,
+ unsigned char st);
static void make_rx_response(net_vif_t *vif,
unsigned short id,
unsigned short size,
@@ -499,89 +493,69 @@ struct netif_rx_stats netdev_rx_stat[NR_CPUS];
void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
{
rx_shadow_entry_t *rx;
- unsigned long *ptep;
+ unsigned long *ptep, pte;
struct pfn_info *old_page, *new_page, *pte_page;
unsigned int i;
unsigned short size;
unsigned char offset, status = RING_STATUS_OK;
+ struct task_struct *p = vif->domain;
memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN);
if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN);
- /*
- * Slightly gross: we need the page_lock so that we can do PTE checking.
- * However, we take it slightly early so that it can protect the update
- * of rx_cons. This saves us from grabbing two locks.
- */
- spin_lock(&vif->domain->page_lock);
+ spin_lock(&vif->rx_lock);
if ( (i = vif->rx_cons) == vif->rx_prod )
{
- spin_unlock(&vif->domain->page_lock);
+ spin_unlock(&vif->rx_lock);
perfc_incr(net_rx_capacity_drop);
return;
}
- rx = vif->rx_shadow_ring + i;
+ rx = &vif->rx_shadow_ring[i];
vif->rx_cons = RX_RING_INC(i);
size = (unsigned short)skb->len;
offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
- /* Release the page-table page. */
- pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
- put_page_type(pte_page);
- put_page_tot(pte_page);
-
- old_page = frame_table + rx->buf_pfn;
+ pte_page = &frame_table[rx->pte_ptr >> PAGE_SHIFT];
+ old_page = &frame_table[rx->buf_pfn];
new_page = skb->pf;
ptep = map_domain_mem(rx->pte_ptr);
- if ( (*ptep & _PAGE_PRESENT) )
+ new_page->u.domain = p;
+ wmb(); /* make dom ptr visible before updating refcnt. */
+ spin_lock(&p->page_list_lock);
+ list_add(&new_page->list, &p->page_list);
+ new_page->count_and_flags = PGC_allocated | 2;
+ spin_unlock(&p->page_list_lock);
+ get_page_type(new_page, PGT_writeable_page);
+ set_bit(_PGC_tlb_flush_on_type_change, &new_page->count_and_flags);
+ wmb(); /* Get type count and set flush bit before updating PTE. */
+
+ pte = *ptep;
+ if ( unlikely(pte & _PAGE_PRESENT) ||
+ unlikely(cmpxchg(ptep, pte,
+ (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
+ ((new_page - frame_table) << PAGE_SHIFT))) != pte )
{
- /* Bail out if the PTE has been reused under our feet. */
- list_add(&old_page->list, &vif->domain->pg_head);
- old_page->flags = vif->domain->domain;
unmap_domain_mem(ptep);
- spin_unlock(&vif->domain->page_lock);
status = RING_STATUS_BAD_PAGE;
goto out;
}
- /* Give the new page to the domain, marking it writeable. */
- set_page_type_count(new_page, 1);
- set_page_tot_count(new_page, 1);
- new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush;
- list_add(&new_page->list, &vif->domain->pg_head);
-
- /* Patch the PTE to map the new page as writeable. */
machine_to_phys_mapping[new_page - frame_table]
- = machine_to_phys_mapping[old_page - frame_table];
- *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
- (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK);
+ = machine_to_phys_mapping[old_page - frame_table];
unmap_domain_mem(ptep);
- spin_unlock(&vif->domain->page_lock);
-
/* Our skbuff now points at the guest's old frame. */
skb->pf = old_page;
/* Updates must happen before releasing the descriptor. */
smp_wmb();
- /*
- * NB. The remote flush here should be safe, as we hold no locks. The
- * network driver that called us should also have no nasty locks.
- */
- if ( rx->flush_count == (unsigned short)
- atomic_read(&tlb_flush_count[vif->domain->processor]) )
- {
- perfc_incr(net_rx_tlbflush);
- flush_tlb_cpu(vif->domain->processor);
- }
-
perfc_incr(net_rx_delivered);
/* record this so they can be billed */
@@ -589,7 +563,9 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
vif->total_bytes_received += size;
out:
+ put_page_and_type(pte_page);
make_rx_response(vif, rx->id, size, status, offset);
+ spin_unlock(&vif->rx_lock);
}
/**
@@ -785,8 +761,8 @@ static void net_tx_action(unsigned long unused)
skb->mac.raw = skb->data;
skb->guest_id = tx->id;
- skb_shinfo(skb)->frags[0].page = frame_table +
- (tx->payload >> PAGE_SHIFT);
+ skb_shinfo(skb)->frags[0].page =
+ &frame_table[tx->payload >> PAGE_SHIFT];
skb_shinfo(skb)->frags[0].size = tx->size - PKT_PROT_LEN;
skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
skb_shinfo(skb)->nr_frags = 1;
@@ -856,10 +832,8 @@ static void tx_skb_release(struct sk_buff *skb)
vif = skb->src_vif;
- spin_lock(&vif->domain->page_lock);
for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
- put_page_tot(skb_shinfo(skb)->frags[i].page);
- spin_unlock(&vif->domain->page_lock);
+ put_page(skb_shinfo(skb)->frags[i].page);
if ( skb->skb_type == SKB_NODATA )
kmem_cache_free(net_header_cachep, skb->head);
@@ -867,7 +841,7 @@ static void tx_skb_release(struct sk_buff *skb)
skb_shinfo(skb)->nr_frags = 0;
spin_lock(&vif->tx_lock);
- __make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
+ make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
spin_unlock(&vif->tx_lock);
/*
@@ -1904,7 +1878,7 @@ static int get_tx_bufs(net_vif_t *vif)
if ( (tx.size <= PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
{
DPRINTK("Bad packet size: %d\n", tx.size);
- __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+ make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
continue;
}
@@ -1932,23 +1906,21 @@ static int get_tx_bufs(net_vif_t *vif)
vif->remaining_credit -= tx.size;
/* No crossing a page boundary as the payload mustn't fragment. */
- if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE )
+ if ( unlikely(((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE) )
{
DPRINTK("tx.addr: %lx, size: %u, end: %lu\n",
tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
- __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+ make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
continue;
}
buf_pfn = tx.addr >> PAGE_SHIFT;
buf_page = frame_table + buf_pfn;
- spin_lock(&p->page_lock);
- if ( (buf_pfn >= max_page) ||
- ((buf_page->flags & PG_domain_mask) != p->domain) )
+ if ( unlikely(buf_pfn >= max_page) ||
+ unlikely(!get_page(buf_page, p)) )
{
DPRINTK("Bad page frame\n");
- spin_unlock(&p->page_lock);
- __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+ make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
continue;
}
@@ -1958,8 +1930,8 @@ static int get_tx_bufs(net_vif_t *vif)
init_tx_header(vif, g_data, tx.size, the_dev));
if ( protocol == 0 )
{
- __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
- goto tx_unmap_and_continue;
+ make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+ goto cleanup_and_continue;
}
target = net_get_target_vif(g_data, tx.size, vif);
@@ -1969,9 +1941,9 @@ static int get_tx_bufs(net_vif_t *vif)
/* Local delivery */
if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
{
- __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+ make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
put_vif(target);
- goto tx_unmap_and_continue;
+ goto cleanup_and_continue;
}
skb->src_vif = vif;
@@ -1995,7 +1967,7 @@ static int get_tx_bufs(net_vif_t *vif)
if ( netif_rx(skb) == NET_RX_DROP )
kfree_skb(skb);
- __make_tx_response(vif, tx.id, RING_STATUS_OK);
+ make_tx_response(vif, tx.id, RING_STATUS_OK);
}
else if ( (target == VIF_PHYS) || IS_PRIV(p) )
{
@@ -2005,23 +1977,24 @@ static int get_tx_bufs(net_vif_t *vif)
kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
if ( vif->tx_shadow_ring[j].header == NULL )
{
- __make_tx_response(vif, tx.id, RING_STATUS_OK);
- goto tx_unmap_and_continue;
+ make_tx_response(vif, tx.id, RING_STATUS_OK);
+ goto cleanup_and_continue;
}
memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
- get_page_tot(buf_page);
+ buf_page = NULL; /* hand off our page reference */
j = TX_RING_INC(j);
}
else
{
- __make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
+ make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
}
- tx_unmap_and_continue:
+ cleanup_and_continue:
+ if ( buf_page != NULL )
+ put_page(buf_page);
unmap_domain_mem(g_data);
- spin_unlock(&p->page_lock);
}
/*
@@ -2044,33 +2017,18 @@ static int get_tx_bufs(net_vif_t *vif)
}
-static long get_bufs_from_vif(net_vif_t *vif)
+static void get_rx_bufs(net_vif_t *vif)
{
- net_ring_t *shared_rings;
- net_idx_t *shared_idxs;
+ struct task_struct *p = vif->domain;
+ net_ring_t *shared_rings = vif->shared_rings;
+ net_idx_t *shared_idxs = vif->shared_idxs;
unsigned int i, j;
rx_req_entry_t rx;
unsigned long pte_pfn, buf_pfn;
struct pfn_info *pte_page, *buf_page;
- struct task_struct *p = vif->domain;
- unsigned long *ptep;
-
- shared_idxs = vif->shared_idxs;
- shared_rings = vif->shared_rings;
-
- /*
- * PHASE 1 -- TRANSMIT RING
- */
-
- if ( get_tx_bufs(vif) )
- {
- add_to_net_schedule_list_tail(vif);
- maybe_schedule_tx_action();
- }
+ unsigned long *ptep, pte;
- /*
- * PHASE 2 -- RECEIVE RING
- */
+ spin_lock(&vif->rx_lock);
/*
* Collect up new receive buffers. We collect up to the guest OS's new
@@ -2085,66 +2043,83 @@ static long get_bufs_from_vif(net_vif_t *vif)
{
rx = shared_rings->rx_ring[i].req;
- pte_pfn = rx.addr >> PAGE_SHIFT;
- pte_page = frame_table + pte_pfn;
+ pte_pfn = rx.addr >> PAGE_SHIFT;
+ pte_page = &frame_table[pte_pfn];
- spin_lock(&p->page_lock);
- if ( (pte_pfn >= max_page) ||
- ((pte_page->flags & (PG_type_mask | PG_domain_mask)) !=
- (PGT_l1_page_table | p->domain)) )
+ /* The address passed down must be to a valid PTE. */
+ if ( unlikely(pte_pfn >= max_page) ||
+ unlikely(!get_page_and_type(pte_page, p, PGT_l1_page_table)) )
{
DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
- p->domain, pte_pfn, max_page, pte_page->flags);
- spin_unlock(&p->page_lock);
+ p->domain, pte_pfn, max_page, pte_page->type_and_flags);
make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
continue;
}
-
+
ptep = map_domain_mem(rx.addr);
-
- if ( !(*ptep & _PAGE_PRESENT) )
+ pte = *ptep;
+
+ /* We must be passed a valid writeable mapping to swizzle. */
+ if ( unlikely((pte & (_PAGE_PRESENT|_PAGE_RW)) !=
+ (_PAGE_PRESENT|_PAGE_RW)) ||
+ unlikely(cmpxchg(ptep, pte, pte & ~_PAGE_PRESENT) != pte) )
{
- DPRINTK("Invalid PTE passed down (not present)\n");
+ DPRINTK("Invalid PTE passed down (not present or changing)\n");
+ put_page_and_type(pte_page);
make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
goto rx_unmap_and_continue;
}
-
- buf_pfn = *ptep >> PAGE_SHIFT;
- buf_page = frame_table + buf_pfn;
+
+ buf_pfn = pte >> PAGE_SHIFT;
+ buf_page = &frame_table[buf_pfn];
- if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) !=
- (PGT_writeable_page | p->domain)) ||
- (page_tot_count(buf_page) != 1) )
+ /*
+ * The page must belong to the correct domain, and must be mapped
+ * just once as a writeable page.
+ */
+ if ( unlikely(buf_page->u.domain != p) ||
+ unlikely(!test_and_clear_bit(_PGC_allocated,
+ &buf_page->count_and_flags)) ||
+ unlikely(cmpxchg(&buf_page->type_and_flags,
+ PGT_writeable_page|PGT_validated|1,
+ 0) != (PGT_writeable_page|PGT_validated|1)) )
{
- DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n",
- page_type_count(buf_page), page_tot_count(buf_page),
- buf_page->flags);
+ DPRINTK("Bad domain or page mapped writeable more than once.\n");
+ if ( buf_page->u.domain == p )
+ set_bit(_PGC_allocated, &buf_page->count_and_flags);
+ if ( unlikely(cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) !=
+ (pte & ~_PAGE_PRESENT)) )
+ put_page_and_type(buf_page);
make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
goto rx_unmap_and_continue;
}
-
+
/*
- * The pte they passed was good, so take it away from them. We also
- * lock down the page-table page, so it doesn't go away.
+ * Now ensure that we can take the last references to this page.
+ * The final count should be 2, because of PGC_allocated.
*/
- get_page_type(pte_page);
- get_page_tot(pte_page);
- *ptep &= ~_PAGE_PRESENT;
- buf_page->flags = 0;
- set_page_type_count(buf_page, 0);
- set_page_tot_count(buf_page, 0);
+ if ( unlikely(cmpxchg(&buf_page->count_and_flags,
+ PGC_tlb_flush_on_type_change | 2, 0) !=
+ (PGC_tlb_flush_on_type_change | 2)) )
+ {
+ DPRINTK("Page held more than once\n");
+ /* Leave the page unmapped at 'ptep'. Stoopid domain! */
+ make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
+ goto rx_unmap_and_continue;
+ }
+
+ /* Remove from the domain's allocation list. */
+ spin_lock(&p->page_list_lock);
list_del(&buf_page->list);
+ spin_unlock(&p->page_list_lock);
- vif->rx_shadow_ring[j].id = rx.id;
- vif->rx_shadow_ring[j].pte_ptr = rx.addr;
- vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
- vif->rx_shadow_ring[j].flush_count = (unsigned short)
- atomic_read(&tlb_flush_count[smp_processor_id()]);
+ vif->rx_shadow_ring[j].id = rx.id;
+ vif->rx_shadow_ring[j].pte_ptr = rx.addr;
+ vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
j = RX_RING_INC(j);
rx_unmap_and_continue:
unmap_domain_mem(ptep);
- spin_unlock(&p->page_lock);
}
vif->rx_req_cons = i;
@@ -2155,6 +2130,20 @@ static long get_bufs_from_vif(net_vif_t *vif)
vif->rx_prod = j;
}
+ spin_unlock(&vif->rx_lock);
+}
+
+
+static long get_bufs_from_vif(net_vif_t *vif)
+{
+ if ( get_tx_bufs(vif) )
+ {
+ add_to_net_schedule_list_tail(vif);
+ maybe_schedule_tx_action();
+ }
+
+ get_rx_bufs(vif);
+
return 0;
}
@@ -2162,7 +2151,7 @@ static long get_bufs_from_vif(net_vif_t *vif)
long flush_bufs_for_vif(net_vif_t *vif)
{
int i;
- unsigned long *pte;
+ unsigned long *ptep, pte;
struct pfn_info *page;
struct task_struct *p = vif->domain;
rx_shadow_entry_t *rx;
@@ -2170,7 +2159,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
net_idx_t *shared_idxs = vif->shared_idxs;
/* Return any outstanding receive buffers to the guest OS. */
- spin_lock(&p->page_lock);
+ spin_lock(&vif->rx_lock);
for ( i = vif->rx_req_cons;
(i != shared_idxs->rx_req_prod) &&
(((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1);
@@ -2184,32 +2173,32 @@ long flush_bufs_for_vif(net_vif_t *vif)
{
rx = &vif->rx_shadow_ring[i];
- /* Release the page-table page. */
- page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
- put_page_type(page);
- put_page_tot(page);
-
/* Give the buffer page back to the domain. */
- page = frame_table + rx->buf_pfn;
- list_add(&page->list, &p->pg_head);
- page->flags = vif->domain->domain;
+ page = &frame_table[rx->buf_pfn];
+ spin_lock(&p->page_list_lock);
+ list_add(&page->list, &p->page_list);
+ page->count_and_flags = PGC_allocated | 2;
+ spin_unlock(&p->page_list_lock);
+ get_page_type(page, PGT_writeable_page);
+ set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
+ wmb();
/* Patch up the PTE if it hasn't changed under our feet. */
- pte = map_domain_mem(rx->pte_ptr);
- if ( !(*pte & _PAGE_PRESENT) )
- {
- *pte = (rx->buf_pfn<<PAGE_SHIFT) | (*pte & ~PAGE_MASK) |
- _PAGE_RW | _PAGE_PRESENT;
- page->flags |= PGT_writeable_page | PG_need_flush;
- set_page_type_count(page, 1);
- set_page_tot_count(page, 1);
- }
- unmap_domain_mem(pte);
+ ptep = map_domain_mem(rx->pte_ptr);
+ pte = *ptep;
+ if ( unlikely(pte & _PAGE_PRESENT) ||
+ unlikely(cmpxchg(ptep, pte, (rx->buf_pfn<<PAGE_SHIFT) |
+ (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT)
+ != pte) )
+ put_page_and_type(page);
+ unmap_domain_mem(ptep);
+
+ put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]);
make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0);
}
vif->rx_cons = i;
- spin_unlock(&p->page_lock);
+ spin_unlock(&vif->rx_lock);
/*
* Flush pending transmit buffers. The guest may still have to wait for
@@ -2221,7 +2210,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
(((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1);
i = TX_RING_INC(i) )
{
- __make_tx_response(vif, shared_rings->tx_ring[i].req.id,
+ make_tx_response(vif, shared_rings->tx_ring[i].req.id,
RING_STATUS_DROPPED);
}
vif->tx_req_cons = i;
@@ -2296,9 +2285,9 @@ long do_net_io_op(netop_t *uop)
}
-static void __make_tx_response(net_vif_t *vif,
- unsigned short id,
- unsigned char st)
+static void make_tx_response(net_vif_t *vif,
+ unsigned short id,
+ unsigned char st)
{
unsigned int pos;
tx_resp_entry_t *resp;
@@ -2329,7 +2318,6 @@ static void make_rx_response(net_vif_t *vif,
rx_resp_entry_t *resp;
/* Place on the response ring for the relevant domain. */
- spin_lock(&vif->rx_lock);
pos = vif->rx_resp_prod;
resp = &vif->shared_rings->rx_ring[pos].resp;
resp->id = id;
@@ -2344,7 +2332,6 @@ static void make_rx_response(net_vif_t *vif,
unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
guest_event_notify(cpu_mask);
}
- spin_unlock(&vif->rx_lock);
}
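
deliver_packet() and flush_bufs_for_vif() above now hand a buffer frame to the guest by swizzling its PTE with cmpxchg: the new mapping is installed only if the PTE is still not-present and unchanged since it was read; otherwise the reference is dropped and the request fails with RING_STATUS_BAD_PAGE. A standalone sketch of that check-then-swap step follows; the flag values and __sync_val_compare_and_swap are illustrative stand-ins for the asm-i386 definitions and cmpxchg.

#include <stdio.h>

#define _PAGE_PRESENT 0x001UL
#define _PAGE_RW      0x002UL
#define PAGE_MASK     (~0xfffUL)

static unsigned long guest_pte = 0x026UL;        /* not present, attribute bits only */

int main(void)
{
    unsigned long pte      = guest_pte;          /* snapshot, as *ptep is read once  */
    unsigned long newframe = 0x12345UL << 12;    /* pfn of the replacement buffer    */

    if ( (pte & _PAGE_PRESENT) ||
         __sync_val_compare_and_swap(&guest_pte, pte,
             (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT | newframe) != pte )
        printf("PTE reused under our feet -> RING_STATUS_BAD_PAGE\n");
    else
        printf("installed pte=%#lx\n", guest_pte);
    return 0;
}
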
diff --git a/xen/net/skbuff.c b/xen/net/skbuff.c
index d8950633b9..5fcc044c5e 100644
--- a/xen/net/skbuff.c
+++ b/xen/net/skbuff.c
@@ -133,41 +133,20 @@ static __inline__ void skb_head_to_pool(struct sk_buff *skb)
static inline u8 *alloc_skb_data_page(struct sk_buff *skb)
{
- struct list_head *list_ptr;
- struct pfn_info *pf;
- unsigned long flags;
-
- spin_lock_irqsave(&free_list_lock, flags);
-
- if (!free_pfns) return NULL;
-
- list_ptr = free_list.next;
- pf = list_entry(list_ptr, struct pfn_info, list);
- pf->flags = 0;
- list_del(&pf->list);
- free_pfns--;
-
- spin_unlock_irqrestore(&free_list_lock, flags);
-
+ struct pfn_info *pf;
+ if ( unlikely((pf = alloc_domain_page(NULL)) == NULL) )
+ return NULL;
skb->pf = pf;
return (u8 *)((pf - frame_table) << PAGE_SHIFT);
}
static inline void dealloc_skb_data_page(struct sk_buff *skb)
{
- struct pfn_info *pf;
+ struct pfn_info *pf = skb->pf;
unsigned long flags;
-
- pf = skb->pf;
-
spin_lock_irqsave(&free_list_lock, flags);
-
- pf->flags = 0;
- set_page_type_count(pf, 0);
- set_page_tot_count(pf, 0);
list_add(&pf->list, &free_list);
free_pfns++;
-
spin_unlock_irqrestore(&free_list_lock, flags);
}
diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c
index b4784ccc02..6bc8baa47a 100644
--- a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c
+++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c
@@ -40,7 +40,7 @@ static void DEBUG_allow_pt_reads(void)
pte = update_debug_queue[i].ptep;
if ( pte == NULL ) continue;
update_debug_queue[i].ptep = NULL;
- update.ptr = pte;
+ update.ptr = virt_to_machine(pte);
update.val = update_debug_queue[i].pteval;
HYPERVISOR_mmu_update(&update, 1);
}
@@ -59,7 +59,7 @@ static void DEBUG_disallow_pt_read(unsigned long va)
pgd = pgd_offset_k(va);
pmd = pmd_offset(pgd, va);
pte = pte_offset(pmd, va);
- update.ptr = pte;
+ update.ptr = virt_to_machine(pte);
pteval = *(unsigned long *)pte;
update.val = pteval & ~_PAGE_PRESENT;
HYPERVISOR_mmu_update(&update, 1);
@@ -95,7 +95,9 @@ void MULTICALL_flush_page_update_queue(void)
#if MMU_UPDATE_DEBUG > 0
DEBUG_allow_pt_reads();
#endif
- queue_multicall2(__HYPERVISOR_mmu_update, (unsigned long)update_queue, idx);
+ queue_multicall2(__HYPERVISOR_mmu_update,
+ (unsigned long)update_queue,
+ idx);
idx = 0;
}
spin_unlock_irqrestore(&update_lock, flags);
@@ -134,7 +136,7 @@ void queue_l1_entry_update(pte_t *ptr, unsigned long val)
#if MMU_UPDATE_DEBUG > 0
DEBUG_disallow_pt_read((unsigned long)ptr);
#endif
- update_queue[idx].ptr = (unsigned long)ptr;
+ update_queue[idx].ptr = virt_to_machine(ptr);
update_queue[idx].val = val;
increment_index();
spin_unlock_irqrestore(&update_lock, flags);
@@ -144,7 +146,7 @@ void queue_l2_entry_update(pmd_t *ptr, unsigned long val)
{
unsigned long flags;
spin_lock_irqsave(&update_lock, flags);
- update_queue[idx].ptr = (unsigned long)ptr;
+ update_queue[idx].ptr = virt_to_machine(ptr);
update_queue[idx].val = val;
increment_index();
spin_unlock_irqrestore(&update_lock, flags);
diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c
index 883cd03b37..b1f8019ef9 100644
--- a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c
+++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c
@@ -113,13 +113,10 @@ static inline void set_pte_phys (unsigned long vaddr,
}
pte = pte_offset(pmd, vaddr);
-#if 0 /* Not in Xen, since this breaks clear_fixmap. */
- if (pte_val(*pte))
- pte_ERROR(*pte);
-#endif
-
- /* We queue directly, avoiding hidden phys->machine translation. */
- queue_l1_entry_update(pte, phys | pgprot_val(prot));
+ if ( pte_io(*pte) || (pgprot_val(prot) & _PAGE_IO) )
+ queue_unchecked_mmu_update(pte, phys | pgprot_val(prot));
+ else
+ queue_l1_entry_update(pte, phys | pgprot_val(prot));
/*
* It's enough to flush this one mapping.
@@ -137,8 +134,7 @@ void __set_fixmap(enum fixed_addresses idx, unsigned long phys,
printk("Invalid __set_fixmap\n");
return;
}
- set_pte_phys(address, phys,
- __pgprot(pgprot_val(PAGE_KERNEL)|pgprot_val(flags)));
+ set_pte_phys(address, phys, flags);
}
void clear_fixmap(enum fixed_addresses idx)
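
The set_pte_phys() change above routes PTEs that map I/O (machine) frames through queue_unchecked_mmu_update(), because Xen's new reference counting cannot validate frames the domain does not own; ordinary RAM mappings keep using the checked queue_l1_entry_update() path. A small user-space model of that dispatch follows, where the _PAGE_IO value and the stub queue functions are illustrative stand-ins for the real xenolinux definitions.

#include <stdio.h>

#define _PAGE_IO 0x200UL    /* illustrative value for the I/O-mapping bit */

static void queue_l1_entry_update(unsigned long *pte, unsigned long val)
{ printf("checked update:   %#lx\n", val); *pte = val; }

static void queue_unchecked_mmu_update(unsigned long *pte, unsigned long val)
{ printf("unchecked update: %#lx\n", val); *pte = val; }

static void model_set_pte_phys(unsigned long *pte, unsigned long phys,
                               unsigned long prot)
{
    /* I/O mappings bypass Xen's ownership checks; RAM mappings do not. */
    if ( (*pte & _PAGE_IO) || (prot & _PAGE_IO) )
        queue_unchecked_mmu_update(pte, phys | prot);
    else
        queue_l1_entry_update(pte, phys | prot);
}

int main(void)
{
    unsigned long io_pte = 0, ram_pte = 0;

    /* 0x63 = present|rw|accessed|dirty, purely illustrative. */
    model_set_pte_phys(&io_pte,  0xfee00000UL, _PAGE_IO | 0x63);  /* device frame */
    model_set_pte_phys(&ram_pte, 0x00100000UL, 0x63);             /* ordinary RAM */
    return 0;
}
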
diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c
index eac5c6a63c..078fede144 100644
--- a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c
+++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c
@@ -202,14 +202,15 @@ void __init *bt_ioremap(unsigned long machine_addr, unsigned long size)
*/
nrpages = size >> PAGE_SHIFT;
if (nrpages > NR_FIX_BTMAPS)
- return NULL;
+ return NULL;
/*
* Ok, go for it..
*/
idx = FIX_BTMAP_BEGIN;
while (nrpages > 0) {
- set_fixmap(idx, machine_addr);
+ __set_fixmap(idx, machine_addr,
+ __pgprot(__PAGE_KERNEL|_PAGE_IO));
machine_addr += PAGE_SIZE;
--idx;
--nrpages;