commit 5ecc8cea0e3eb724831d100c501b0e3282933987 (patch)
author:    cl349@freefall.cl.cam.ac.uk <cl349@freefall.cl.cam.ac.uk>  2004-07-19 10:28:08 +0000
committer: cl349@freefall.cl.cam.ac.uk <cl349@freefall.cl.cam.ac.uk>  2004-07-19 10:28:08 +0000
tree:      defede138331fb30fdcab68262f8c00a646eb219
parent:    7e6280698eb45e80c147913e4f52646dac6f6a1d (diff)
download:  xen-5ecc8cea0e3eb724831d100c501b0e3282933987.tar.gz
           xen-5ecc8cea0e3eb724831d100c501b0e3282933987.tar.bz2
           xen-5ecc8cea0e3eb724831d100c501b0e3282933987.zip
bitkeeper revision 1.1102.1.1 (40fba238l_-yBFQR6TV9GfynDkYi9A)
first go at writable pagetables
 linux-2.4.26-xen-sparse/include/asm-xen/page.h    |   3
 linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h |   4
 linux-2.4.26-xen-sparse/mm/memory.c               |  29
 xen/arch/x86/memory.c                             |  40
 xen/arch/x86/traps.c                              | 150
 xen/include/asm-x86/mm.h                          |   7
 6 files changed, 226 insertions(+), 7 deletions(-)
diff --git a/linux-2.4.26-xen-sparse/include/asm-xen/page.h b/linux-2.4.26-xen-sparse/include/asm-xen/page.h index 6826f65cc0..582992f1a2 100644 --- a/linux-2.4.26-xen-sparse/include/asm-xen/page.h +++ b/linux-2.4.26-xen-sparse/include/asm-xen/page.h @@ -78,7 +78,8 @@ typedef struct { unsigned long pgprot; } pgprot_t; static inline unsigned long pmd_val(pmd_t x) { unsigned long ret = x.pmd; - if ( (ret & 1) ) ret = machine_to_phys(ret); + if (!(ret & 0x801) && ret) printk("pmd_val really invalid!!!\n"); + if (ret) ret = machine_to_phys(ret); return ret; } #define pgd_val(x) ({ BUG(); (unsigned long)0; }) diff --git a/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h b/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h index 143beeeef5..c6ce165432 100644 --- a/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h +++ b/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h @@ -135,7 +135,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) { clear_page(pte); __make_page_readonly(pte); - queue_pte_pin(__pa(pte)); + // queue_pte_pin(__pa(pte)); } return pte; @@ -154,7 +154,7 @@ static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, static __inline__ void pte_free_slow(pte_t *pte) { - queue_pte_unpin(__pa(pte)); + // queue_pte_unpin(__pa(pte)); __make_page_writeable(pte); free_page((unsigned long)pte); } diff --git a/linux-2.4.26-xen-sparse/mm/memory.c b/linux-2.4.26-xen-sparse/mm/memory.c index 6bdb08afe7..a60d07042f 100644 --- a/linux-2.4.26-xen-sparse/mm/memory.c +++ b/linux-2.4.26-xen-sparse/mm/memory.c @@ -163,6 +163,18 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) #define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) #define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) +#undef set_pte +#define set_pte(pteptr, pteval) do { \ + (*(pteptr) = pteval); \ + /* printk("set_pte %p -> %08lx\n", pteptr, pteval); */ \ +} while (0) +//void queue_l1_entry_update_queued(pte_t *ptr, unsigned long val); 
+//#define set_pte(pteptr, pteval) queue_l1_entry_update_queued(pteptr, (pteval).pte_low) +// #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) +//#undef pte_unmap +//#define pte_unmap(pte) xen_flush_page_update_queue() +#undef pmd_bad +#define pmd_bad(x) (((x).pmd & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT & ~0x800)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range @@ -184,6 +196,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; + /* printk("copy_page_range src %p dst %p src_pgd %p dst_pgd %p %08lx-%08lx\n", */ +/* src, dst, src_pgd, dst_pgd, address, end); */ for (;;) { pmd_t * src_pmd, * dst_pmd; @@ -205,6 +219,7 @@ skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; src_pmd = pmd_offset(src_pgd, address); dst_pmd = pmd_alloc(dst, dst_pgd, address); + /* printk("src_pmd %p dst_pmd %p\n", src_pmd, dst_pmd); */ if (!dst_pmd) goto nomem; @@ -226,6 +241,8 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; src_pte = pte_offset(src_pmd, address); dst_pte = pte_alloc(dst, dst_pmd, address); + /* printk("src_pte %p(%p,%08lx,%08lx, %08lx) dst_pte %p\n", */ +/* src_pte, src_pmd, *src_pmd, pmd_page(*src_pmd), address, dst_pte); */ if (!dst_pte) goto nomem; @@ -239,6 +256,8 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; if (pte_none(pte)) goto cont_copy_pte_range_noset; if (!pte_present(pte)) { + printk("swap_dup call %p:%08lx\n", + src_pte, pte.pte_low); swap_duplicate(pte_to_swp_entry(pte)); goto cont_copy_pte_range; } @@ -249,10 +268,17 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; /* If it's a COW mapping, write protect it both in the parent and the child */ if (cow && pte_write(pte)) { + /* printk("ptep_set_wrprotect %p was %08lx\n", src_pte, *src_pte); */ +#if 
0 /* XEN modification: modified ordering here to avoid RaW hazard. */ pte = *src_pte; pte = pte_wrprotect(pte); ptep_set_wrprotect(src_pte); +#else + clear_bit(_PAGE_BIT_RW, src_pte); //ptep_set_wrprotect(src_pte); + pte = *src_pte; + /* printk("ptep_set_wrprotect %p now %08lx\n", src_pte, *src_pte); */ +#endif } /* If it's a shared mapping, mark it clean in the child */ @@ -278,10 +304,13 @@ cont_copy_pmd_range: src_pmd++; out_unlock: spin_unlock(&src->page_table_lock); out: + /* printk("out\n"); */ return 0; nomem: return -ENOMEM; } +#undef set_pte +#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low) /* * Return indicates whether a page was freed so caller can adjust rss diff --git a/xen/arch/x86/memory.c b/xen/arch/x86/memory.c index a2cef38ed2..211f309033 100644 --- a/xen/arch/x86/memory.c +++ b/xen/arch/x86/memory.c @@ -1,3 +1,7 @@ +extern unsigned long disconnected; +extern void ptwr_reconnect(unsigned long); +extern int writable_idx; +extern void ptwr_flush(void); /****************************************************************************** * arch/x86/memory.c * @@ -117,7 +121,7 @@ static int get_page_and_type_from_pagenr(unsigned long page_nr, static void free_l2_table(struct pfn_info *page); static void free_l1_table(struct pfn_info *page); -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); +int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); /* Used to defer flushing of memory structures. */ @@ -509,8 +513,20 @@ static inline int update_l2e(l2_pgentry_t *pl2e, } +static inline void set_l1_page_va(unsigned long pfn, + unsigned long va_idx) +{ + struct pfn_info *page; + + page = &frame_table[pfn]; + page->type_and_flags &= ~PGT_va_mask; + page->type_and_flags |= va_idx << PGT_va_shift; +} + + +#define NPRINTK if (0) printk /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. 
*/ -static int mod_l2_entry(l2_pgentry_t *pl2e, +int mod_l2_entry(l2_pgentry_t *pl2e, l2_pgentry_t nl2e, unsigned long pfn) { @@ -528,6 +544,8 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, return 0; ol2e = mk_l2_pgentry(_ol2e); + NPRINTK("mod_l2_entry pl2e %p ol2e %08lx nl2e %08lx pfn %08lx\n", + pl2e, l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), pfn); if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT ) { /* Differ in mapping (bits 12-31) or presence (bit 0)? */ @@ -537,6 +555,9 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, if ( unlikely(!get_page_from_l2e(nl2e, pfn)) ) return 0; + set_l1_page_va(l2_pgentry_val(nl2e) >> PAGE_SHIFT, + ((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2); + if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) ) { put_page_from_l2e(nl2e, pfn); @@ -698,6 +719,11 @@ static int do_extended_command(unsigned long ptr, unsigned long val) u32 x, y; domid_t domid; + if (disconnected != ENTRIES_PER_L2_PAGETABLE) + ptwr_reconnect(0L); + if (writable_idx) + ptwr_flush(); + switch ( cmd ) { case MMUEXT_PIN_L1_TABLE: @@ -946,6 +972,11 @@ int do_mmu_update(mmu_update_t *ureqs, int count, int *success_count) perfc_incrc(calls_to_mmu_update); perfc_addc(num_page_updates, count); + if (disconnected != ENTRIES_PER_L2_PAGETABLE) + ptwr_reconnect(0L); + if (writable_idx) + ptwr_flush(); + for ( i = 0; i < count; i++ ) { if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) ) @@ -1119,6 +1150,11 @@ int do_update_va_mapping(unsigned long page_nr, perfc_incrc(calls_to_update_va); + if (disconnected != ENTRIES_PER_L2_PAGETABLE) + ptwr_reconnect(0L); + if (writable_idx) + ptwr_flush(); + if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) ) return -EINVAL; diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index aa74ae4dfe..dcac9268c7 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -310,6 +310,82 @@ asmlinkage void do_double_fault(void) for ( ; ; ) ; } +extern int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long); +unsigned long 
disconnected = ENTRIES_PER_L2_PAGETABLE; +static unsigned long *writable_l1; +#define NR_WRITABLES 4 +static unsigned long *writables[NR_WRITABLES]; +int writable_idx = 0; +#define PRINTK if (0) printk +#define NPRINTK if (0) printk + +void ptwr_reconnect(unsigned long addr) +{ + unsigned long pte; + unsigned long pfn; + struct pfn_info *page; + l2_pgentry_t *pl2e; + PRINTK("page fault in disconnected space: addr %08lx space %08lx\n", + addr, disconnected << L2_PAGETABLE_SHIFT); + pl2e = &linear_l2_table[disconnected]; + + if (__get_user(pte, writable_l1)) + BUG(); + pfn = pte >> PAGE_SHIFT; + page = &frame_table[pfn]; + + /* reconnect l1 page */ + PRINTK(" pl2e %p l2e %08lx pfn %08lx taf %08x/%08x\n", pl2e, + l2_pgentry_val(*pl2e), + l1_pgentry_val(linear_pg_table[(unsigned long)pl2e >> + PAGE_SHIFT]) >> PAGE_SHIFT, + frame_table[l2_pgentry_to_pagenr(*pl2e)].type_and_flags, + frame_table[pfn].type_and_flags); + mod_l2_entry(pl2e, mk_l2_pgentry((l2_pgentry_val(*pl2e) & ~0x800) | + _PAGE_PRESENT), + l1_pgentry_val(linear_pg_table[(unsigned long)pl2e >> + PAGE_SHIFT]) >> PAGE_SHIFT); + PRINTK("now pl2e %p l2e %08lx taf %08x/%08x\n", pl2e, + l2_pgentry_val(*pl2e), + frame_table[l2_pgentry_to_pagenr(*pl2e)].type_and_flags, + frame_table[pfn].type_and_flags); + disconnected = ENTRIES_PER_L2_PAGETABLE; + /* make pt page write protected */ + if (__get_user(pte, writable_l1)) + BUG(); + PRINTK("writable_l1 at %p is %08lx\n", writable_l1, pte); + pte &= ~_PAGE_RW; + if (__put_user(pte, writable_l1)) + BUG(); + PRINTK("writable_l1 at %p now %08lx\n", writable_l1, pte); + /* and try again */ + return; +} + +void ptwr_flush(void) +{ + unsigned long pte, pfn; + struct pfn_info *page; + int i; + + for (i = 0; i < writable_idx; i++) { + if (__get_user(pte, writables[i])) + BUG(); + pfn = pte >> PAGE_SHIFT; + page = &frame_table[pfn]; + PRINTK("alloc l1 page %p\n", page); + if (!get_page_type(page, PGT_l1_page_table)) + BUG(); + /* make pt page writable */ + PRINTK("writable_l1 at 
%p is %08lx\n", writables[i], pte); + pte &= ~_PAGE_RW; + if (__put_user(pte, writables[i])) + BUG(); + PRINTK("writable_l1 at %p now %08lx\n", writables[i], pte); + } + writable_idx = 0; +} + asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) { struct guest_trap_bounce *gtb = guest_trap_bounce+smp_processor_id(); @@ -335,6 +411,80 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) return; /* successfully copied the mapping */ } + if ((addr >> L2_PAGETABLE_SHIFT) == disconnected) { + ptwr_reconnect(addr); + return; + } + + if (addr < PAGE_OFFSET && error_code & 2) { + /* write page fault, check if we're trying to modify an l1 + page table */ + unsigned long pte, pfn; + struct pfn_info *page; + l2_pgentry_t *pl2e; + NPRINTK("get user %p for va %08lx\n", + &linear_pg_table[addr>>PAGE_SHIFT], addr); + if (l2_pgentry_val(linear_l2_table[addr >> L2_PAGETABLE_SHIFT]) & + _PAGE_PRESENT && + __get_user(pte, (unsigned long *) + &linear_pg_table[addr >> PAGE_SHIFT]) == 0) { + pfn = pte >> PAGE_SHIFT; + NPRINTK("check pte %08lx = pfn %08lx for va %08lx\n", pte, pfn, addr); + page = &frame_table[pfn]; + if ((page->type_and_flags & PGT_type_mask) == PGT_l1_page_table) { + pl2e = &linear_l2_table[(page->type_and_flags & + PGT_va_mask) >> PGT_va_shift]; + PRINTK("page_fault on l1 pt at va %08lx, pt for %08x, pfn %08lx\n", + addr, ((page->type_and_flags & PGT_va_mask) >> + PGT_va_shift) << L2_PAGETABLE_SHIFT, pfn); + if (l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn) { + PRINTK("freeing l1 page %p\n", page); + if (writable_idx == NR_WRITABLES) + ptwr_flush(); + writables[writable_idx++] = (unsigned long *) + &linear_pg_table[addr>>PAGE_SHIFT]; + if ((page->type_and_flags & PGT_count_mask) != 1) + BUG(); + put_page_type(page); + } else { + if (disconnected != ENTRIES_PER_L2_PAGETABLE) + ptwr_reconnect(addr); + PRINTK(" pl2e %p l2e %08lx pfn %08lx taf %08x/%08x\n", + pl2e, l2_pgentry_val(*pl2e), + l1_pgentry_val(linear_pg_table[(unsigned long)pl2e 
+ >> PAGE_SHIFT]) >> + PAGE_SHIFT, + frame_table[l2_pgentry_to_pagenr(*pl2e)]. + type_and_flags, frame_table[pfn].type_and_flags); + /* disconnect l1 page */ + mod_l2_entry(pl2e, mk_l2_pgentry((l2_pgentry_val(*pl2e) & + ~_PAGE_PRESENT) | 0x800), + l1_pgentry_val(linear_pg_table + [(unsigned long)pl2e + >> PAGE_SHIFT]) >> + PAGE_SHIFT); + disconnected = (page->type_and_flags & PGT_va_mask) >> + PGT_va_shift; + PRINTK("now pl2e %p l2e %08lx taf %08x/%08x\n", + pl2e, l2_pgentry_val(*pl2e), + frame_table[l2_pgentry_to_pagenr(*pl2e)]. + type_and_flags, + frame_table[pfn].type_and_flags); + writable_l1 = (unsigned long *) + &linear_pg_table[addr>>PAGE_SHIFT]; + } + /* make pt page writable */ + pte |= _PAGE_RW; + PRINTK("update %p pte to %08lx\n", + &linear_pg_table[addr>>PAGE_SHIFT], pte); + if (__put_user(pte, (unsigned long *) + &linear_pg_table[addr>>PAGE_SHIFT])) + BUG(); + return; + } + } + } + if ( unlikely(p->mm.shadow_mode) && (addr < PAGE_OFFSET) && shadow_fault(addr, error_code) ) return; /* Returns TRUE if fault was handled. */ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index b8a4c5e496..ce7667c7c2 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -52,8 +52,11 @@ struct pfn_info /* Has this page been validated for use as its current type? */ #define _PGT_validated 28 #define PGT_validated (1<<_PGT_validated) - /* 28-bit count of uses of this frame as its current type. */ -#define PGT_count_mask ((1<<28)-1) + /* 10-bit most significant bits of va address if used as l1 page table */ +#define PGT_va_shift 18 +#define PGT_va_mask (((1<<10)-1)<<PGT_va_shift) + /* 18-bit count of uses of this frame as its current type. */ +#define PGT_count_mask ((1<<18)-1) /* For safety, force a TLB flush when this page's type changes. */ #define _PGC_tlb_flush_on_type_change 31 |