aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorcl349@freefall.cl.cam.ac.uk <cl349@freefall.cl.cam.ac.uk>2004-07-19 10:28:08 +0000
committercl349@freefall.cl.cam.ac.uk <cl349@freefall.cl.cam.ac.uk>2004-07-19 10:28:08 +0000
commit5ecc8cea0e3eb724831d100c501b0e3282933987 (patch)
treedefede138331fb30fdcab68262f8c00a646eb219
parent7e6280698eb45e80c147913e4f52646dac6f6a1d (diff)
downloadxen-5ecc8cea0e3eb724831d100c501b0e3282933987.tar.gz
xen-5ecc8cea0e3eb724831d100c501b0e3282933987.tar.bz2
xen-5ecc8cea0e3eb724831d100c501b0e3282933987.zip
bitkeeper revision 1.1102.1.1 (40fba238l_-yBFQR6TV9GfynDkYi9A)
first go at writable pagetables
-rw-r--r--linux-2.4.26-xen-sparse/include/asm-xen/page.h3
-rw-r--r--linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h4
-rw-r--r--linux-2.4.26-xen-sparse/mm/memory.c29
-rw-r--r--xen/arch/x86/memory.c40
-rw-r--r--xen/arch/x86/traps.c150
-rw-r--r--xen/include/asm-x86/mm.h7
6 files changed, 226 insertions, 7 deletions
diff --git a/linux-2.4.26-xen-sparse/include/asm-xen/page.h b/linux-2.4.26-xen-sparse/include/asm-xen/page.h
index 6826f65cc0..582992f1a2 100644
--- a/linux-2.4.26-xen-sparse/include/asm-xen/page.h
+++ b/linux-2.4.26-xen-sparse/include/asm-xen/page.h
@@ -78,7 +78,8 @@ typedef struct { unsigned long pgprot; } pgprot_t;
static inline unsigned long pmd_val(pmd_t x)
{
unsigned long ret = x.pmd;
- if ( (ret & 1) ) ret = machine_to_phys(ret);
+ if (!(ret & 0x801) && ret) printk("pmd_val really invalid!!!\n");
+ if (ret) ret = machine_to_phys(ret);
return ret;
}
#define pgd_val(x) ({ BUG(); (unsigned long)0; })
diff --git a/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h b/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h
index 143beeeef5..c6ce165432 100644
--- a/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h
+++ b/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h
@@ -135,7 +135,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
clear_page(pte);
__make_page_readonly(pte);
- queue_pte_pin(__pa(pte));
+ // queue_pte_pin(__pa(pte));
}
return pte;
@@ -154,7 +154,7 @@ static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm,
static __inline__ void pte_free_slow(pte_t *pte)
{
- queue_pte_unpin(__pa(pte));
+ // queue_pte_unpin(__pa(pte));
__make_page_writeable(pte);
free_page((unsigned long)pte);
}
diff --git a/linux-2.4.26-xen-sparse/mm/memory.c b/linux-2.4.26-xen-sparse/mm/memory.c
index 6bdb08afe7..a60d07042f 100644
--- a/linux-2.4.26-xen-sparse/mm/memory.c
+++ b/linux-2.4.26-xen-sparse/mm/memory.c
@@ -163,6 +163,18 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))
+#undef set_pte
+#define set_pte(pteptr, pteval) do { \
+ (*(pteptr) = pteval); \
+ /* printk("set_pte %p -> %08lx\n", pteptr, pteval); */ \
+} while (0)
+//void queue_l1_entry_update_queued(pte_t *ptr, unsigned long val);
+//#define set_pte(pteptr, pteval) queue_l1_entry_update_queued(pteptr, (pteval).pte_low)
+// #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0))
+//#undef pte_unmap
+//#define pte_unmap(pte) xen_flush_page_update_queue()
+#undef pmd_bad
+#define pmd_bad(x) (((x).pmd & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT & ~0x800)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
@@ -184,6 +196,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
src_pgd = pgd_offset(src, address)-1;
dst_pgd = pgd_offset(dst, address)-1;
+ /* printk("copy_page_range src %p dst %p src_pgd %p dst_pgd %p %08lx-%08lx\n", */
+/* src, dst, src_pgd, dst_pgd, address, end); */
for (;;) {
pmd_t * src_pmd, * dst_pmd;
@@ -205,6 +219,7 @@ skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
src_pmd = pmd_offset(src_pgd, address);
dst_pmd = pmd_alloc(dst, dst_pgd, address);
+ /* printk("src_pmd %p dst_pmd %p\n", src_pmd, dst_pmd); */
if (!dst_pmd)
goto nomem;
@@ -226,6 +241,8 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
src_pte = pte_offset(src_pmd, address);
dst_pte = pte_alloc(dst, dst_pmd, address);
+ /* printk("src_pte %p(%p,%08lx,%08lx, %08lx) dst_pte %p\n", */
+/* src_pte, src_pmd, *src_pmd, pmd_page(*src_pmd), address, dst_pte); */
if (!dst_pte)
goto nomem;
@@ -239,6 +256,8 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
if (pte_none(pte))
goto cont_copy_pte_range_noset;
if (!pte_present(pte)) {
+ printk("swap_dup call %p:%08lx\n",
+ src_pte, pte.pte_low);
swap_duplicate(pte_to_swp_entry(pte));
goto cont_copy_pte_range;
}
@@ -249,10 +268,17 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow && pte_write(pte)) {
+ /* printk("ptep_set_wrprotect %p was %08lx\n", src_pte, *src_pte); */
+#if 0
/* XEN modification: modified ordering here to avoid RaW hazard. */
pte = *src_pte;
pte = pte_wrprotect(pte);
ptep_set_wrprotect(src_pte);
+#else
+ clear_bit(_PAGE_BIT_RW, src_pte); //ptep_set_wrprotect(src_pte);
+ pte = *src_pte;
+ /* printk("ptep_set_wrprotect %p now %08lx\n", src_pte, *src_pte); */
+#endif
}
/* If it's a shared mapping, mark it clean in the child */
@@ -278,10 +304,13 @@ cont_copy_pmd_range: src_pmd++;
out_unlock:
spin_unlock(&src->page_table_lock);
out:
+ /* printk("out\n"); */
return 0;
nomem:
return -ENOMEM;
}
+#undef set_pte
+#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
/*
* Return indicates whether a page was freed so caller can adjust rss
diff --git a/xen/arch/x86/memory.c b/xen/arch/x86/memory.c
index a2cef38ed2..211f309033 100644
--- a/xen/arch/x86/memory.c
+++ b/xen/arch/x86/memory.c
@@ -1,3 +1,7 @@
+extern unsigned long disconnected;
+extern void ptwr_reconnect(unsigned long);
+extern int writable_idx;
+extern void ptwr_flush(void);
/******************************************************************************
* arch/x86/memory.c
*
@@ -117,7 +121,7 @@ static int get_page_and_type_from_pagenr(unsigned long page_nr,
static void free_l2_table(struct pfn_info *page);
static void free_l1_table(struct pfn_info *page);
-static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
+int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
/* Used to defer flushing of memory structures. */
@@ -509,8 +513,20 @@ static inline int update_l2e(l2_pgentry_t *pl2e,
}
+static inline void set_l1_page_va(unsigned long pfn,
+ unsigned long va_idx)
+{
+ struct pfn_info *page;
+
+ page = &frame_table[pfn];
+ page->type_and_flags &= ~PGT_va_mask;
+ page->type_and_flags |= va_idx << PGT_va_shift;
+}
+
+
+#define NPRINTK if (0) printk
/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
-static int mod_l2_entry(l2_pgentry_t *pl2e,
+int mod_l2_entry(l2_pgentry_t *pl2e,
l2_pgentry_t nl2e,
unsigned long pfn)
{
@@ -528,6 +544,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
return 0;
ol2e = mk_l2_pgentry(_ol2e);
+ NPRINTK("mod_l2_entry pl2e %p ol2e %08lx nl2e %08lx pfn %08lx\n",
+ pl2e, l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), pfn);
if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
{
/* Differ in mapping (bits 12-31) or presence (bit 0)? */
@@ -537,6 +555,9 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
if ( unlikely(!get_page_from_l2e(nl2e, pfn)) )
return 0;
+ set_l1_page_va(l2_pgentry_val(nl2e) >> PAGE_SHIFT,
+ ((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2);
+
if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
{
put_page_from_l2e(nl2e, pfn);
@@ -698,6 +719,11 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
u32 x, y;
domid_t domid;
+ if (disconnected != ENTRIES_PER_L2_PAGETABLE)
+ ptwr_reconnect(0L);
+ if (writable_idx)
+ ptwr_flush();
+
switch ( cmd )
{
case MMUEXT_PIN_L1_TABLE:
@@ -946,6 +972,11 @@ int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count)
perfc_incrc(calls_to_mmu_update);
perfc_addc(num_page_updates, count);
+ if (disconnected != ENTRIES_PER_L2_PAGETABLE)
+ ptwr_reconnect(0L);
+ if (writable_idx)
+ ptwr_flush();
+
for ( i = 0; i < count; i++ )
{
if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) )
@@ -1119,6 +1150,11 @@ int do_update_va_mapping(unsigned long page_nr,
perfc_incrc(calls_to_update_va);
+ if (disconnected != ENTRIES_PER_L2_PAGETABLE)
+ ptwr_reconnect(0L);
+ if (writable_idx)
+ ptwr_flush();
+
if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
return -EINVAL;
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index aa74ae4dfe..dcac9268c7 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -310,6 +310,82 @@ asmlinkage void do_double_fault(void)
for ( ; ; ) ;
}
+extern int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
+unsigned long disconnected = ENTRIES_PER_L2_PAGETABLE;
+static unsigned long *writable_l1;
+#define NR_WRITABLES 4
+static unsigned long *writables[NR_WRITABLES];
+int writable_idx = 0;
+#define PRINTK if (0) printk
+#define NPRINTK if (0) printk
+
+void ptwr_reconnect(unsigned long addr)
+{
+ unsigned long pte;
+ unsigned long pfn;
+ struct pfn_info *page;
+ l2_pgentry_t *pl2e;
+ PRINTK("page fault in disconnected space: addr %08lx space %08lx\n",
+ addr, disconnected << L2_PAGETABLE_SHIFT);
+ pl2e = &linear_l2_table[disconnected];
+
+ if (__get_user(pte, writable_l1))
+ BUG();
+ pfn = pte >> PAGE_SHIFT;
+ page = &frame_table[pfn];
+
+ /* reconnect l1 page */
+ PRINTK(" pl2e %p l2e %08lx pfn %08lx taf %08x/%08x\n", pl2e,
+ l2_pgentry_val(*pl2e),
+ l1_pgentry_val(linear_pg_table[(unsigned long)pl2e >>
+ PAGE_SHIFT]) >> PAGE_SHIFT,
+ frame_table[l2_pgentry_to_pagenr(*pl2e)].type_and_flags,
+ frame_table[pfn].type_and_flags);
+ mod_l2_entry(pl2e, mk_l2_pgentry((l2_pgentry_val(*pl2e) & ~0x800) |
+ _PAGE_PRESENT),
+ l1_pgentry_val(linear_pg_table[(unsigned long)pl2e >>
+ PAGE_SHIFT]) >> PAGE_SHIFT);
+ PRINTK("now pl2e %p l2e %08lx taf %08x/%08x\n", pl2e,
+ l2_pgentry_val(*pl2e),
+ frame_table[l2_pgentry_to_pagenr(*pl2e)].type_and_flags,
+ frame_table[pfn].type_and_flags);
+ disconnected = ENTRIES_PER_L2_PAGETABLE;
+ /* make pt page write protected */
+ if (__get_user(pte, writable_l1))
+ BUG();
+ PRINTK("writable_l1 at %p is %08lx\n", writable_l1, pte);
+ pte &= ~_PAGE_RW;
+ if (__put_user(pte, writable_l1))
+ BUG();
+ PRINTK("writable_l1 at %p now %08lx\n", writable_l1, pte);
+ /* and try again */
+ return;
+}
+
+void ptwr_flush(void)
+{
+ unsigned long pte, pfn;
+ struct pfn_info *page;
+ int i;
+
+ for (i = 0; i < writable_idx; i++) {
+ if (__get_user(pte, writables[i]))
+ BUG();
+ pfn = pte >> PAGE_SHIFT;
+ page = &frame_table[pfn];
+ PRINTK("alloc l1 page %p\n", page);
+ if (!get_page_type(page, PGT_l1_page_table))
+ BUG();
+ /* make pt page read-only again (clears _PAGE_RW in its pte) */
+ PRINTK("writable_l1 at %p is %08lx\n", writables[i], pte);
+ pte &= ~_PAGE_RW;
+ if (__put_user(pte, writables[i]))
+ BUG();
+ PRINTK("writable_l1 at %p now %08lx\n", writables[i], pte);
+ }
+ writable_idx = 0;
+}
+
asmlinkage void do_page_fault(struct pt_regs *regs, long error_code)
{
struct guest_trap_bounce *gtb = guest_trap_bounce+smp_processor_id();
@@ -335,6 +411,80 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code)
return; /* successfully copied the mapping */
}
+ if ((addr >> L2_PAGETABLE_SHIFT) == disconnected) {
+ ptwr_reconnect(addr);
+ return;
+ }
+
+ if (addr < PAGE_OFFSET && error_code & 2) {
+ /* write page fault, check if we're trying to modify an l1
+ page table */
+ unsigned long pte, pfn;
+ struct pfn_info *page;
+ l2_pgentry_t *pl2e;
+ NPRINTK("get user %p for va %08lx\n",
+ &linear_pg_table[addr>>PAGE_SHIFT], addr);
+ if (l2_pgentry_val(linear_l2_table[addr >> L2_PAGETABLE_SHIFT]) &
+ _PAGE_PRESENT &&
+ __get_user(pte, (unsigned long *)
+ &linear_pg_table[addr >> PAGE_SHIFT]) == 0) {
+ pfn = pte >> PAGE_SHIFT;
+ NPRINTK("check pte %08lx = pfn %08lx for va %08lx\n", pte, pfn, addr);
+ page = &frame_table[pfn];
+ if ((page->type_and_flags & PGT_type_mask) == PGT_l1_page_table) {
+ pl2e = &linear_l2_table[(page->type_and_flags &
+ PGT_va_mask) >> PGT_va_shift];
+ PRINTK("page_fault on l1 pt at va %08lx, pt for %08x, pfn %08lx\n",
+ addr, ((page->type_and_flags & PGT_va_mask) >>
+ PGT_va_shift) << L2_PAGETABLE_SHIFT, pfn);
+ if (l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn) {
+ PRINTK("freeing l1 page %p\n", page);
+ if (writable_idx == NR_WRITABLES)
+ ptwr_flush();
+ writables[writable_idx++] = (unsigned long *)
+ &linear_pg_table[addr>>PAGE_SHIFT];
+ if ((page->type_and_flags & PGT_count_mask) != 1)
+ BUG();
+ put_page_type(page);
+ } else {
+ if (disconnected != ENTRIES_PER_L2_PAGETABLE)
+ ptwr_reconnect(addr);
+ PRINTK(" pl2e %p l2e %08lx pfn %08lx taf %08x/%08x\n",
+ pl2e, l2_pgentry_val(*pl2e),
+ l1_pgentry_val(linear_pg_table[(unsigned long)pl2e
+ >> PAGE_SHIFT]) >>
+ PAGE_SHIFT,
+ frame_table[l2_pgentry_to_pagenr(*pl2e)].
+ type_and_flags, frame_table[pfn].type_and_flags);
+ /* disconnect l1 page */
+ mod_l2_entry(pl2e, mk_l2_pgentry((l2_pgentry_val(*pl2e) &
+ ~_PAGE_PRESENT) | 0x800),
+ l1_pgentry_val(linear_pg_table
+ [(unsigned long)pl2e
+ >> PAGE_SHIFT]) >>
+ PAGE_SHIFT);
+ disconnected = (page->type_and_flags & PGT_va_mask) >>
+ PGT_va_shift;
+ PRINTK("now pl2e %p l2e %08lx taf %08x/%08x\n",
+ pl2e, l2_pgentry_val(*pl2e),
+ frame_table[l2_pgentry_to_pagenr(*pl2e)].
+ type_and_flags,
+ frame_table[pfn].type_and_flags);
+ writable_l1 = (unsigned long *)
+ &linear_pg_table[addr>>PAGE_SHIFT];
+ }
+ /* make pt page writable */
+ pte |= _PAGE_RW;
+ PRINTK("update %p pte to %08lx\n",
+ &linear_pg_table[addr>>PAGE_SHIFT], pte);
+ if (__put_user(pte, (unsigned long *)
+ &linear_pg_table[addr>>PAGE_SHIFT]))
+ BUG();
+ return;
+ }
+ }
+ }
+
if ( unlikely(p->mm.shadow_mode) &&
(addr < PAGE_OFFSET) && shadow_fault(addr, error_code) )
return; /* Returns TRUE if fault was handled. */
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index b8a4c5e496..ce7667c7c2 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -52,8 +52,11 @@ struct pfn_info
/* Has this page been validated for use as its current type? */
#define _PGT_validated 28
#define PGT_validated (1<<_PGT_validated)
- /* 28-bit count of uses of this frame as its current type. */
-#define PGT_count_mask ((1<<28)-1)
+ /* Most significant 10 bits of the va mapped by this frame when it is used as an l1 page table. */
+#define PGT_va_shift 18
+#define PGT_va_mask (((1<<10)-1)<<PGT_va_shift)
+ /* 18-bit count of uses of this frame as its current type. */
+#define PGT_count_mask ((1<<18)-1)
/* For safety, force a TLB flush when this page's type changes. */
#define _PGC_tlb_flush_on_type_change 31