diff options
author | kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk> | 2004-08-13 16:17:33 +0000 |
---|---|---|
committer | kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk> | 2004-08-13 16:17:33 +0000 |
commit | c589b05239e1596266dbb37d2c811812a4c89ddd (patch) | |
tree | f166d428787cd9f040a28d405fa54c5a845868a3 | |
parent | a5d95ebe5d0445c2330a54304495a58e1c49d4d0 (diff) | |
download | xen-c589b05239e1596266dbb37d2c811812a4c89ddd.tar.gz xen-c589b05239e1596266dbb37d2c811812a4c89ddd.tar.bz2 xen-c589b05239e1596266dbb37d2c811812a4c89ddd.zip |
bitkeeper revision 1.1159.17.26 (411ce99dzAxJPoMN9ygmUMItgtICCA)
Merged binary-rewrite fixup back into 2.4. Emulation and rewriting
still both seem to have issues with Fedora Core 3 Test 1, however.
Works fine with 2.6, but not with 2.4. Probably due to different
code paths being exercised in glibc?
-rw-r--r-- | .rootkeys | 1 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c | 5 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/arch/xen/kernel/Makefile | 2 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/arch/xen/kernel/entry.S | 5 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/arch/xen/kernel/setup.c | 3 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/arch/xen/kernel/traps.c | 9 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/arch/xen/mm/Makefile | 2 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/arch/xen/mm/fault.c | 9 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h | 26 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/include/asm-xen/pgtable.h | 11 | ||||
-rwxr-xr-x | linux-2.4.26-xen-sparse/mkbuildtree | 2 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/mm/mmap.c | 1219 | ||||
-rw-r--r-- | linux-2.4.26-xen-sparse/mm/vmalloc.c | 2 | ||||
-rw-r--r-- | linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c | 7 | ||||
-rw-r--r-- | linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c | 152 |
15 files changed, 1372 insertions, 83 deletions
@@ -119,6 +119,7 @@ 3e6e7c1efbQe93xCvOpOVCnXTMmQ5w linux-2.4.26-xen-sparse/mkbuildtree 406aeeafkrnCuIVWLFv3kfn4uAD5Eg linux-2.4.26-xen-sparse/mm/highmem.c 3e5a4e68GxCIaFH4sy01v1wjapetaA linux-2.4.26-xen-sparse/mm/memory.c +411ce99d_uOUTK61pkqbdIAi1CIaSA linux-2.4.26-xen-sparse/mm/mmap.c 3f108af5VxPkLv13tXpXgoRKALQtXQ linux-2.4.26-xen-sparse/mm/mprotect.c 3e5a4e681xMPdF9xCMwpyfuYMySU5g linux-2.4.26-xen-sparse/mm/mremap.c 409ba2e7akOFqQUg6Qyg2s28xcXiMg linux-2.4.26-xen-sparse/mm/page_alloc.c diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c index f5c447e9cf..b72d0efe11 100644 --- a/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c @@ -65,7 +65,6 @@ static unsigned long inflate_balloon(unsigned long num_pages) unsigned long *currp; unsigned long curraddr; unsigned long ret = 0; - unsigned long vaddr; unsigned long i, j; parray = (unsigned long *)vmalloc(num_pages * sizeof(unsigned long)); @@ -102,7 +101,7 @@ static unsigned long inflate_balloon(unsigned long num_pages) for ( i = 0, currp = parray; i < num_pages; i++, currp++ ) { unsigned long mfn = phys_to_machine_mapping[*currp]; - curraddr = page_address(mem_map + *currp); + curraddr = (unsigned long)page_address(mem_map + *currp); if (curraddr) queue_l1_entry_update(get_ptep(curraddr), 0); @@ -178,7 +177,7 @@ unsigned long deflate_balloon(unsigned long num_pages) if ( num_pages > credit ) { - printk(KERN_ERR "deflate_balloon: %d pages > %d credit.\n", + printk(KERN_ERR "deflate_balloon: %lu pages > %lu credit.\n", num_pages, credit); return -EAGAIN; } diff --git a/linux-2.4.26-xen-sparse/arch/xen/kernel/Makefile b/linux-2.4.26-xen-sparse/arch/xen/kernel/Makefile index 2d669c8f9e..02634ec4c2 100644 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/Makefile +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/Makefile @@ -11,7 +11,7 @@ export-objs := i386_ksyms.o obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ ptrace.o ioport.o ldt.o setup.o time.o sys_i386.o \ i386_ksyms.o i387.o evtchn.o ctrl_if.o pci-dma.o \ - reboot.o + reboot.o fixup.o ifdef CONFIG_PCI obj-y += pci-i386.o pci-pc.o diff --git a/linux-2.4.26-xen-sparse/arch/xen/kernel/entry.S b/linux-2.4.26-xen-sparse/arch/xen/kernel/entry.S index 305bd42c70..7bfe1f5994 100644 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/entry.S +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/entry.S @@ -518,9 +518,8 @@ ENTRY(machine_check) pushl $ SYMBOL_NAME(do_machine_check) jmp error_code -ENTRY(spurious_interrupt_bug) - pushl $0 - pushl $ SYMBOL_NAME(do_spurious_interrupt_bug) +ENTRY(fixup_4gb_segment) + pushl $ SYMBOL_NAME(do_fixup_4gb_segment) jmp error_code .data diff --git a/linux-2.4.26-xen-sparse/arch/xen/kernel/setup.c b/linux-2.4.26-xen-sparse/arch/xen/kernel/setup.c index 4922d661b4..e530df79d3 100644 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/setup.c +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/setup.c @@ -230,6 +230,9 @@ void __init setup_arch(char **cmdline_p) blk_nohighio = 1; #endif + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments); + HYPERVISOR_set_callbacks( __KERNEL_CS, (unsigned long)hypervisor_callback, __KERNEL_CS, (unsigned long)failsafe_callback); diff --git a/linux-2.4.26-xen-sparse/arch/xen/kernel/traps.c b/linux-2.4.26-xen-sparse/arch/xen/kernel/traps.c index 548941bc58..aea85ffca6 100644 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/traps.c +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/traps.c @@ -63,7 +63,7 @@ asmlinkage void safe_page_fault(void); asmlinkage void coprocessor_error(void); asmlinkage void simd_coprocessor_error(void); asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); +asmlinkage void fixup_4gb_segment(void); asmlinkage void machine_check(void); int kstack_depth_to_print = 24; @@ -539,11 +539,6 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs * regs, } } -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs, - long error_code) -{ -} - /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -605,7 +600,7 @@ static trap_info_t trap_table[] = { { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, { 14, 0, __KERNEL_CS, (unsigned long)page_fault }, - { 15, 0, __KERNEL_CS, (unsigned long)spurious_interrupt_bug }, + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, diff --git a/linux-2.4.26-xen-sparse/arch/xen/mm/Makefile b/linux-2.4.26-xen-sparse/arch/xen/mm/Makefile index d0d16114b6..45e189775d 100644 --- a/linux-2.4.26-xen-sparse/arch/xen/mm/Makefile +++ b/linux-2.4.26-xen-sparse/arch/xen/mm/Makefile @@ -9,7 +9,7 @@ O_TARGET := mm.o -obj-y := init.o fault.o extable.o pageattr.o hypervisor.o ioremap.o +obj-y := init.o fault.o extable.o pageattr.o hypervisor.o ioremap.o mmap.o export-objs := pageattr.o diff --git a/linux-2.4.26-xen-sparse/arch/xen/mm/fault.c b/linux-2.4.26-xen-sparse/arch/xen/mm/fault.c index 496e974487..94f8cf95a6 100644 --- a/linux-2.4.26-xen-sparse/arch/xen/mm/fault.c +++ b/linux-2.4.26-xen-sparse/arch/xen/mm/fault.c @@ -105,7 +105,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, } #endif - if ( flush_page_update_queue() != 0 ) return; + if ( flush_page_update_queue() != 0 ) + return; /* * We fault-in kernel-space virtual memory on-demand. The @@ -120,8 +121,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 1) == 0. */ - if (address >= TASK_SIZE && !(error_code & 5)) - goto vmalloc_fault; + if (unlikely(address >= TASK_SIZE) || + unlikely(address < (FIRST_USER_PGD_NR<<PGDIR_SHIFT))) + if (!(error_code & 5)) + goto vmalloc_fault; mm = tsk->mm; info.si_code = SEGV_MAPERR; diff --git a/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h b/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h index 143beeeef5..6de5a0c137 100644 --- a/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h +++ b/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h @@ -54,11 +54,15 @@ static inline pgd_t *get_pgd_slow(void) if (!pmd) goto out_oom; clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); + set_pgd(pgd + FIRST_USER_PGD_NR, __pgd(1 + __pa(pmd))); } - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + memcpy(pgd, + swapper_pg_dir, + FIRST_USER_PGD_NR * sizeof(pgd_t)); + memcpy(pgd + FIRST_USER_PGD_NR + USER_PTRS_PER_PGD, + swapper_pg_dir + FIRST_USER_PGD_NR + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD - + FIRST_USER_PGD_NR) * sizeof(pgd_t)); } return pgd; out_oom: @@ -75,13 +79,17 @@ static inline pgd_t *get_pgd_slow(void) pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - init_mm.pgd + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + memset(pgd + FIRST_USER_PGD_NR, + 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + memcpy(pgd, + init_mm.pgd, + FIRST_USER_PGD_NR * sizeof(pgd_t)); + memcpy(pgd + FIRST_USER_PGD_NR + USER_PTRS_PER_PGD, + init_mm.pgd + FIRST_USER_PGD_NR + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD - + FIRST_USER_PGD_NR) * sizeof(pgd_t)); __make_page_readonly(pgd); queue_pgd_pin(__pa(pgd)); - } return pgd; } diff --git a/linux-2.4.26-xen-sparse/include/asm-xen/pgtable.h b/linux-2.4.26-xen-sparse/include/asm-xen/pgtable.h index dc25864d2c..5faf5350fa 100644 --- a/linux-2.4.26-xen-sparse/include/asm-xen/pgtable.h +++ b/linux-2.4.26-xen-sparse/include/asm-xen/pgtable.h @@ -83,16 +83,16 @@ extern void pgtable_cache_init(void); #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_PGD_NR 0 +#define FIRST_USER_PGD_NR (1) +#define USER_PTRS_PER_PGD ((TASK_SIZE/PGDIR_SIZE)-FIRST_USER_PGD_NR) +#if 0 /* XEN */ #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) - #define TWOLEVEL_PGDIR_SHIFT 22 #define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT) #define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS) - +#endif #ifndef __ASSEMBLY__ /* 4MB is just a nice "safety zone". Also, we align to a fresh pde. */ @@ -367,4 +367,7 @@ static inline unsigned long arbitrary_virt_to_phys(void *va) #define io_remap_page_range remap_page_range +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_CHECK_FIXED_MAPPING + #endif /* _I386_PGTABLE_H */ diff --git a/linux-2.4.26-xen-sparse/mkbuildtree b/linux-2.4.26-xen-sparse/mkbuildtree index 870df2cfbf..103fead012 100755 --- a/linux-2.4.26-xen-sparse/mkbuildtree +++ b/linux-2.4.26-xen-sparse/mkbuildtree @@ -222,6 +222,7 @@ ln -sf ../../i386/kernel/semaphore.c ln -sf ../../i386/kernel/sys_i386.c ln -sf ../../../${LINUX_26}/arch/xen/kernel/ctrl_if.c ln -sf ../../../${LINUX_26}/arch/xen/kernel/evtchn.c +ln -sf ../../../${LINUX_26}/arch/xen/kernel/fixup.c ln -sf ../../../${LINUX_26}/arch/xen/kernel/reboot.c ln -sf ../../../${LINUX_26}/arch/xen/i386/kernel/ioport.c ln -sf ../../../${LINUX_26}/arch/xen/i386/kernel/pci-dma.c @@ -242,6 +243,7 @@ cd ${AD}/arch/xen/mm ln -sf ../../i386/mm/extable.c ln -sf ../../i386/mm/pageattr.c ln -sf ../../../${LINUX_26}/arch/xen/i386/mm/hypervisor.c +ln -sf ../../../${LINUX_26}/arch/xen/i386/mm/mmap.c cd ${AD}/arch/xen/drivers/console ln -sf ../../../../${LINUX_26}/drivers/xen/console/console.c diff --git a/linux-2.4.26-xen-sparse/mm/mmap.c b/linux-2.4.26-xen-sparse/mm/mmap.c new file mode 100644 index 0000000000..ed7b11c063 --- /dev/null +++ b/linux-2.4.26-xen-sparse/mm/mmap.c @@ -0,0 +1,1219 @@ +/* + * linux/mm/mmap.c + * + * Written by obz. + */ +#include <linux/slab.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/personality.h> +#include <linux/mount.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> + +/* + * WARNING: the debugging will use recursive algorithms so never enable this + * unless you know what you are doing. + */ +#undef DEBUG_MM_RB + +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +int sysctl_overcommit_memory; +int max_map_count = DEFAULT_MAX_MAP_COUNT; + +/* Check that a process has enough memory to allocate a + * new virtual mapping. + */ +int vm_enough_memory(long pages) +{ + /* Stupid algorithm to decide if we have enough memory: while + * simple, it hopefully works in most obvious cases.. Easy to + * fool it, but this should catch most mistakes. + */ + /* 23/11/98 NJC: Somewhat less stupid version of algorithm, + * which tries to do "TheRightThing". Instead of using half of + * (buffers+cache), use the minimum values. Allow an extra 2% + * of num_physpages for safety margin. + */ + + unsigned long free; + + /* Sometimes we want to use more memory than we have. */ + if (sysctl_overcommit_memory) + return 1; + + /* The page cache contains buffer pages these days.. */ + free = page_cache_size; + free += nr_free_pages(); + free += nr_swap_pages; + + /* + * This double-counts: the nrpages are both in the page-cache + * and in the swapper space. At the same time, this compensates + * for the swap-space over-allocation (ie "nr_swap_pages" being + * too small. + */ + free += swapper_space.nrpages; + + /* + * The code below doesn't account for free space in the inode + * and dentry slab cache, slab cache fragmentation, inodes and + * dentries which will become freeable under VM load, etc. + * Lets just hope all these (complex) factors balance out... + */ + free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; + free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; + + return free > pages; +} + +/* Remove one vm structure from the inode's i_mapping address space. */ +static inline void __remove_shared_vm_struct(struct vm_area_struct *vma) +{ + struct file * file = vma->vm_file; + + if (file) { + struct inode *inode = file->f_dentry->d_inode; + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&inode->i_writecount); + if(vma->vm_next_share) + vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share; + *vma->vm_pprev_share = vma->vm_next_share; + } +} + +static inline void remove_shared_vm_struct(struct vm_area_struct *vma) +{ + lock_vma_mappings(vma); + __remove_shared_vm_struct(vma); + unlock_vma_mappings(vma); +} + +void lock_vma_mappings(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + mapping = NULL; + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping) + spin_lock(&mapping->i_shared_lock); +} + +void unlock_vma_mappings(struct vm_area_struct *vma) +{ + struct address_space *mapping; + + mapping = NULL; + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + if (mapping) + spin_unlock(&mapping->i_shared_lock); +} + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +asmlinkage unsigned long sys_brk(unsigned long brk) +{ + unsigned long rlim, retval; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + + if (brk < mm->end_code) + goto out; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against rlimit.. */ + rlim = current->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Check if we have enough memory.. */ + if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + return retval; +} + +/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used + * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits + * into "VM_xxx". + */ +static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags) +{ +#define _trans(x,bit1,bit2) \ +((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0) + + unsigned long prot_bits, flag_bits; + prot_bits = + _trans(prot, PROT_READ, VM_READ) | + _trans(prot, PROT_WRITE, VM_WRITE) | + _trans(prot, PROT_EXEC, VM_EXEC); + flag_bits = + _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) | + _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) | + _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE); + return prot_bits | flag_bits; +#undef _trans +} + +#ifdef DEBUG_MM_RB +static int browse_rb(rb_node_t * rb_node) { + int i = 0; + if (rb_node) { + i++; + i += browse_rb(rb_node->rb_left); + i += browse_rb(rb_node->rb_right); + } + return i; +} + +static void validate_mm(struct mm_struct * mm) { + int bug = 0; + int i = 0; + struct vm_area_struct * tmp = mm->mmap; + while (tmp) { + tmp = tmp->vm_next; + i++; + } + if (i != mm->map_count) + printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + i = browse_rb(mm->mm_rb.rb_node); + if (i != mm->map_count) + printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (bug) + BUG(); +} +#else +#define validate_mm(mm) do { } while (0) +#endif + +static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct ** pprev, + rb_node_t *** rb_link, rb_node_t ** rb_parent) +{ + struct vm_area_struct * vma; + rb_node_t ** __rb_link, * __rb_parent, * rb_prev; + + __rb_link = &mm->mm_rb.rb_node; + rb_prev = __rb_parent = NULL; + vma = NULL; + + while (*__rb_link) { + struct vm_area_struct *vma_tmp; + + __rb_parent = *__rb_link; + vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + return vma; + __rb_link = &__rb_parent->rb_left; + } else { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } + } + + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + *rb_link = __rb_link; + *rb_parent = __rb_parent; + return vma; +} + +static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t * rb_parent) +{ + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); + else + vma->vm_next = NULL; + } +} + +static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + rb_link_node(&vma->vm_rb, rb_parent, rb_link); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); +} + +static inline void __vma_link_file(struct vm_area_struct * vma) +{ + struct file * file; + + file = vma->vm_file; + if (file) { + struct inode * inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct vm_area_struct **head; + + if (vma->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + + head = &mapping->i_mmap; + if (vma->vm_flags & VM_SHARED) + head = &mapping->i_mmap_shared; + + /* insert vma into inode's share list */ + if((vma->vm_next_share = *head) != NULL) + (*head)->vm_pprev_share = &vma->vm_next_share; + *head = vma; + vma->vm_pprev_share = head; + } +} + +static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_rb(mm, vma, rb_link, rb_parent); + __vma_link_file(vma); +} + +static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev, + rb_node_t ** rb_link, rb_node_t * rb_parent) +{ + lock_vma_mappings(vma); + spin_lock(&mm->page_table_lock); + __vma_link(mm, vma, prev, rb_link, rb_parent); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(vma); + + mm->map_count++; + validate_mm(mm); +} + +static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev, + rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags) +{ + spinlock_t * lock = &mm->page_table_lock; + if (!prev) { + prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); + goto merge_next; + } + if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) { + struct vm_area_struct * next; + + spin_lock(lock); + prev->vm_end = end; + next = prev->vm_next; + if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) { + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + return 1; + } + spin_unlock(lock); + return 1; + } + + prev = prev->vm_next; + if (prev) { + merge_next: + if (!can_vma_merge(prev, vm_flags)) + return 0; + if (end == prev->vm_start) { + spin_lock(lock); + prev->vm_start = addr; + spin_unlock(lock); + return 1; + } + } + + return 0; +} + +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, unsigned long pgoff) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned int vm_flags; + int correct_wcount = 0; + int error; + rb_node_t ** rb_link, * rb_parent; + + if (file) { + if (!file->f_op || !file->f_op->mmap) + return -ENODEV; + + if ((prot & PROT_EXEC) && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)) + return -EPERM; + } + + if (!len) + return addr; + + len = PAGE_ALIGN(len); + + if (len > TASK_SIZE || len == 0) + return -EINVAL; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EINVAL; + + /* Too many mappings? */ + if (mm->map_count > max_map_count) + return -ENOMEM; + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(file, addr, len, pgoff, flags); + if (addr & ~PAGE_MASK) + return addr; + + /* Do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + /* mlock MCL_FUTURE? */ + if (vm_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + if (file) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* Make sure we don't allow writing to an append-only file.. */ + if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* make sure there are no mandatory locks on the file. */ + if (locks_verify_locked(file->f_dentry->d_inode)) + return -EAGAIN; + + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + break; + + default: + return -EINVAL; + } + } else { + vm_flags |= VM_SHARED | VM_MAYSHARE; + switch (flags & MAP_TYPE) { + default: + return -EINVAL; + case MAP_PRIVATE: + vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + /* fall through */ + case MAP_SHARED: + break; + } + } + + /* Clear old maps */ +munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limit. */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + /* Private writable mapping? Check memory availability.. */ + if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + /* Can we just expand an old anonymous mapping? */ + if (!file && !(vm_flags & VM_SHARED) && rb_parent) + if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags)) + goto out; + + /* Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = vm_flags; + vma->vm_page_prot = protection_map[vm_flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = pgoff; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + vma->vm_raend = 0; + + if (file) { + error = -EINVAL; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + goto free_vma; + if (vm_flags & VM_DENYWRITE) { + error = deny_write_access(file); + if (error) + goto free_vma; + correct_wcount = 1; + } + vma->vm_file = file; + get_file(file); + error = file->f_op->mmap(file, vma); + if (error) + goto unmap_and_free_vma; + } else if (flags & MAP_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + if (addr != vma->vm_start) { + /* + * It is a bit too late to pretend changing the virtual + * area of the mapping, we just corrupted userspace + * in the do_munmap, so FIXME (not in 2.4 to avoid breaking + * the driver API). + */ + struct vm_area_struct * stale_vma; + /* Since addr changed, we rely on the mmap op to prevent + * collisions with existing vmas and just use find_vma_prepare + * to update the tree pointers. + */ + addr = vma->vm_start; + stale_vma = find_vma_prepare(mm, addr, &prev, + &rb_link, &rb_parent); + /* + * Make sure the lowlevel driver did its job right. + */ + if (unlikely(stale_vma && stale_vma->vm_start < vma->vm_end)) { + printk(KERN_ERR "buggy mmap operation: [<%p>]\n", + file ? file->f_op->mmap : NULL); + BUG(); + } + } + + vma_link(mm, vma, prev, rb_link, rb_parent); + if (correct_wcount) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (vm_flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; + +unmap_and_free_vma: + if (correct_wcount) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + vma->vm_file = NULL; + fput(file); + + /* Undo any partial mapping done by a device driver. */ + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); +free_vma: + kmem_cache_free(vm_area_cachep, vma); + return error; +} + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. + * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * ie + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA +static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct vm_area_struct *vma; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(current->mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + addr = PAGE_ALIGN(TASK_UNMAPPED_BASE); + + for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) + return -ENOMEM; + if (!vma || addr + len <= vma->vm_start) + return addr; + addr = vma->vm_end; + } +} +#else +extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +#endif + +#ifndef HAVE_ARCH_CHECK_FIXED_MAPPING +#define arch_check_fixed_mapping(_file,_addr,_len,_pgoff,_flags) 0 +#else +extern unsigned long +arch_check_fixed_mapping(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); +#endif + +unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) +{ + unsigned long ret; + + if (flags & MAP_FIXED) { + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (addr & ~PAGE_MASK) + return -EINVAL; + ret = arch_check_fixed_mapping(file, addr, len, pgoff, flags); + if (ret != 0) + return ret; + return addr; + } + + if (file && file->f_op && file->f_op->get_unmapped_area) + return file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); + + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + rb_node_t * rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + if (mm) { + /* Go through the RB tree quickly. */ + struct vm_area_struct * vma; + rb_node_t * rb_node, * rb_last_right, * rb_prev; + + rb_node = mm->mm_rb.rb_node; + rb_last_right = rb_prev = NULL; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + rb_prev = rb_last_right; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else { + rb_last_right = rb_node; + rb_node = rb_node->rb_right; + } + } + if (vma) { + if (vma->vm_rb.rb_left) { + rb_prev = vma->vm_rb.rb_left; + while (rb_prev->rb_right) + rb_prev = rb_prev->rb_right; + } + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + if ((rb_prev ? (*pprev)->vm_next : mm->mmap) != vma) + BUG(); + return vma; + } + } + *pprev = NULL; + return NULL; +} + +struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct * vma; + unsigned long start; + + addr &= PAGE_MASK; + vma = find_vma(mm,addr); + if (!vma) + return NULL; + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) { + make_pages_present(addr, start); + } + return vma; +} + +/* Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * This function works out what part of an area is affected and + * adjusts the mapping information. Since the actual page + * manipulation is done in do_mmap(), none need be done here, + * though it would probably be more appropriate. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list, so it needs to be + * reinserted if necessary. + * + * The 4 main cases are: + * Unmapping the whole area + * Unmapping from the start of the segment to a point in it + * Unmapping from an intermediate point to the end + * Unmapping between to intermediate points, making a hole. + * + * Case 4 involves the creation of 2 new areas, for each side of + * the hole. If possible, we reuse the existing area rather than + * allocate a new one, and the return indicates whether the old + * area was reused. + */ +static struct vm_area_struct * unmap_fixup(struct mm_struct *mm, + struct vm_area_struct *area, unsigned long addr, size_t len, + struct vm_area_struct *extra) +{ + struct vm_area_struct *mpnt; + unsigned long end = addr + len; + + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + + /* Unmapping the whole area. */ + if (addr == area->vm_start && end == area->vm_end) { + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_file) + fput(area->vm_file); + kmem_cache_free(vm_area_cachep, area); + return extra; + } + + /* Work out to one of the ends. */ + if (end == area->vm_end) { + /* + * here area isn't visible to the semaphore-less readers + * so we don't need to update it under the spinlock. + */ + area->vm_end = addr; + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + } else if (addr == area->vm_start) { + area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT; + /* same locking considerations of the above case */ + area->vm_start = end; + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + } else { + /* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */ + /* Add end mapping -- leave beginning for below */ + mpnt = extra; + extra = NULL; + + mpnt->vm_mm = area->vm_mm; + mpnt->vm_start = end; + mpnt->vm_end = area->vm_end; + mpnt->vm_page_prot = area->vm_page_prot; + mpnt->vm_flags = area->vm_flags; + mpnt->vm_raend = 0; + mpnt->vm_ops = area->vm_ops; + mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT); + mpnt->vm_file = area->vm_file; + mpnt->vm_private_data = area->vm_private_data; + if (mpnt->vm_file) + get_file(mpnt->vm_file); + if (mpnt->vm_ops && mpnt->vm_ops->open) + mpnt->vm_ops->open(mpnt); + area->vm_end = addr; /* Truncate area */ + + /* Because mpnt->vm_file == area->vm_file this locks + * things correctly. + */ + lock_vma_mappings(area); + spin_lock(&mm->page_table_lock); + __insert_vm_struct(mm, mpnt); + } + + __insert_vm_struct(mm, area); + spin_unlock(&mm->page_table_lock); + unlock_vma_mappings(area); + return extra; +} + +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. + */ +static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = end + PGDIR_SIZE - 1; + unsigned long start_index, end_index; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_start) + last = prev->vm_start; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end + PGDIR_SIZE - 1; + break; + } +no_mmaps: + if (last < first) + return; + /* + * If the PGD bits are not consecutive in the virtual address, the + * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. + */ + start_index = pgd_index(first); + end_index = pgd_index(last); + if (end_index > start_index) { + clear_page_tables(mm, start_index, end_index - start_index); + flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); + } +} + +/* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardine <jeremy@sw.oz.au> + */ +int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +{ + struct vm_area_struct *mpnt, *prev, **npp, *free, *extra; + + if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return -EINVAL; + + /* Check if this memory area is ok - put it on the temporary + * list if so.. The checks here are pretty simple -- + * every area affected in some way (by any overlap) is put + * on the list. If nothing is put on, nothing is affected. + */ + mpnt = find_vma_prev(mm, addr, &prev); + if (!mpnt) + return 0; + /* we have addr < mpnt->vm_end */ + + if (mpnt->vm_start >= addr+len) + return 0; + + /* If we'll make "hole", check the vm areas limit */ + if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) + && mm->map_count >= max_map_count) + return -ENOMEM; + + /* + * We may need one additional vma to fix up the mappings ... + * and this is the last chance for an easy error exit. + */ + extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!extra) + return -ENOMEM; + + npp = (prev ? &prev->vm_next : &mm->mmap); + free = NULL; + spin_lock(&mm->page_table_lock); + for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) { + *npp = mpnt->vm_next; + mpnt->vm_next = free; + free = mpnt; + rb_erase(&mpnt->vm_rb, &mm->mm_rb); + } + mm->mmap_cache = NULL; /* Kill the cache. */ + spin_unlock(&mm->page_table_lock); + + /* Ok - we have the memory areas we should free on the 'free' list, + * so release them, and unmap the page range.. + * If the one of the segments is only being partially unmapped, + * it will put new vm_area_struct(s) into the address space. + * In that case we have to be careful with VM_DENYWRITE. + */ + while ((mpnt = free) != NULL) { + unsigned long st, end, size; + struct file *file = NULL; + + free = free->vm_next; + + st = addr < mpnt->vm_start ? mpnt->vm_start : addr; + end = addr+len; + end = end > mpnt->vm_end ? mpnt->vm_end : end; + size = end - st; + + if (mpnt->vm_flags & VM_DENYWRITE && + (st != mpnt->vm_start || end != mpnt->vm_end) && + (file = mpnt->vm_file) != NULL) { + atomic_dec(&file->f_dentry->d_inode->i_writecount); + } + remove_shared_vm_struct(mpnt); + mm->map_count--; + + zap_page_range(mm, st, size); + + /* + * Fix the mapping, and free the old area if it wasn't reused. + */ + extra = unmap_fixup(mm, mpnt, st, size, extra); + if (file) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + } + validate_mm(mm); + + /* Release the extra vma struct if it wasn't used */ + if (extra) + kmem_cache_free(vm_area_cachep, extra); + + free_pgtables(mm, prev, addr, addr+len); + + return 0; +} + +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +/* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned long flags; + rb_node_t ** rb_link, * rb_parent; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + if ((addr + len) > TASK_SIZE || (addr + len) < addr) + return -EINVAL; + + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + /* + * Clear old maps. this also does some error checking for us + */ + munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limits *after* clearing old maps... */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (mm->map_count > max_map_count) + return -ENOMEM; + + if (!vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + flags = VM_DATA_DEFAULT_FLAGS | mm->def_flags; + + /* Can we just expand an old anonymous mapping? */ + if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags)) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) + return -ENOMEM; + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = flags; + vma->vm_page_prot = protection_map[flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = 0; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + + vma_link(mm, vma, prev, rb_link, rb_parent); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; +} + +/* Build the RB tree corresponding to the VMA list. */ +void build_mmap_rb(struct mm_struct * mm) +{ + struct vm_area_struct * vma; + rb_node_t ** rb_link, * rb_parent; + + mm->mm_rb = RB_ROOT; + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + __vma_link_rb(mm, vma, rb_link, rb_parent); + rb_parent = &vma->vm_rb; + rb_link = &rb_parent->rb_right; + } +} + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct * mm) +{ + struct vm_area_struct * mpnt; + + release_segments(mm); + spin_lock(&mm->page_table_lock); + mpnt = mm->mmap; + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; + mm->rss = 0; + spin_unlock(&mm->page_table_lock); + mm->total_vm = 0; + mm->locked_vm = 0; + + flush_cache_mm(mm); + while (mpnt) { + struct vm_area_struct * next = mpnt->vm_next; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long size = end - start; + + if (mpnt->vm_ops) { + if (mpnt->vm_ops->close) + mpnt->vm_ops->close(mpnt); + } + mm->map_count--; + remove_shared_vm_struct(mpnt); + zap_page_range(mm, start, size); + if (mpnt->vm_file) + fput(mpnt->vm_file); + kmem_cache_free(vm_area_cachep, mpnt); + mpnt = next; + } + + /* This is just debugging */ + if (mm->map_count) + BUG(); + + clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); + + flush_tlb_mm(mm); +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap ring. If vm_file is non-NULL + * then the i_shared_lock must be held here. + */ +void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + rb_node_t ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + __vma_link(mm, vma, prev, rb_link, rb_parent); + mm->map_count++; + validate_mm(mm); +} + +void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + rb_node_t ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + vma_link(mm, vma, prev, rb_link, rb_parent); + validate_mm(mm); +} diff --git a/linux-2.4.26-xen-sparse/mm/vmalloc.c b/linux-2.4.26-xen-sparse/mm/vmalloc.c index 812b2145fe..df02fcbf7a 100644 --- a/linux-2.4.26-xen-sparse/mm/vmalloc.c +++ b/linux-2.4.26-xen-sparse/mm/vmalloc.c @@ -152,7 +152,7 @@ static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, return 0; } -static inline int __vmalloc_area_pages (unsigned long address, +/*static inline*/ int __vmalloc_area_pages (unsigned long address, unsigned long size, int gfp_mask, pgprot_t prot, diff --git a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c index 2bbc3d8209..d3ffc00d9c 100644 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c @@ -1,5 +1,6 @@ #include <linux/slab.h> +#include <linux/version.h> #include <linux/mman.h> #include <linux/init.h> #include <asm/pgalloc.h> @@ -23,7 +24,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || ((addr + len) <= vma->vm_start))) return addr; } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) start_addr = addr = mm->free_area_cache; +#else + addr = PAGE_ALIGN(TASK_UNMAPPED_BASE); +#endif full_search: for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { @@ -43,7 +48,9 @@ full_search: /* * Remember the place where we stopped the search: */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) mm->free_area_cache = addr + len; +#endif return addr; } addr = vma->vm_end; diff --git a/linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c b/linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c index f77a5b34c1..2e35a50567 100644 --- a/linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c +++ b/linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c @@ -25,9 +25,11 @@ #include <linux/config.h> #include <linux/init.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/kernel.h> -#include <linux/highmem.h> +#include <linux/pagemap.h> #include <linux/vmalloc.h> +#include <linux/highmem.h> #include <asm/fixmap.h> #include <asm/pgtable.h> #include <asm/uaccess.h> @@ -44,6 +46,29 @@ #define DPRINTK(_f, _a...) ((void)0) #endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +#define TestSetPageLocked(_p) TryLockPage(_p) +#define PageAnon(_p) 0 /* no equivalent in 2.4 */ +#define pte_offset_kernel pte_offset +extern int __vmalloc_area_pages(unsigned long address, + unsigned long size, + int gfp_mask, + pgprot_t prot, + struct page ***pages); +#else +static inline int __vmalloc_area_pages(unsigned long address, + unsigned long size, + int gfp_mask, + pgprot_t prot, + struct page ***pages) +{ + struct vm_struct vma; + vma.addr = (void *)address; + vma.size = size + PAGE_SIZE; /* retarded interface */ + return map_vm_area(&vma, prot, pages); +} +#endif + static unsigned char *fixup_buf; #define FIXUP_BUF_USER PAGE_SIZE #define FIXUP_BUF_ORDER 1 @@ -214,35 +239,41 @@ static unsigned int parse_insn(unsigned char *insn, * Mainly this function checks that our patches can't erroneously get flushed * to a file on disc, which would screw us after reboot! */ -static int safe_to_patch(unsigned long addr) +#define SUCCESS 1 +#define FAIL 0 +static int safe_to_patch(struct mm_struct *mm, unsigned long addr) { - struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct file *file; unsigned char _name[30], *name; /* Always safe to patch the fixup buffer. */ if ( addr <= (FIXUP_BUF_USER + FIXUP_BUF_SIZE) ) - return 1; - - down_read(&mm->mmap_sem); + return SUCCESS; if ( (vma = find_vma(current->mm, addr)) == NULL ) { DPRINTK("No VMA contains fault address."); - goto fail; + return FAIL; } - /* No backing file, so safe to patch. */ + /* Only patch shared libraries. */ if ( (file = vma->vm_file) == NULL ) - goto success; + { + DPRINTK("VMA is anonymous!"); + return FAIL; + } /* No shared mappings => nobody can dirty the file. */ /* XXX Note the assumption that noone will dirty the file in future! */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) if ( file->f_mapping->i_mmap_writable != 0 ) +#else + if ( file->f_dentry->d_inode->i_mapping->i_mmap_shared != NULL ) +#endif { DPRINTK("Shared mappings exist."); - goto fail; + return FAIL; } /* @@ -251,24 +282,19 @@ static int safe_to_patch(unsigned long addr) * unlinking the old files and installing completely fresh ones. :-) */ name = d_path(file->f_dentry, file->f_vfsmnt, _name, sizeof(_name)); - if ( strncmp("/lib/tls", name, 8) != 0 ) + if ( IS_ERR(name) || (strncmp("/lib/tls", name, 8) != 0) ) { DPRINTK("Backing file is not in /lib/tls"); - goto fail; + return FAIL; } - success: - up_read(&mm->mmap_sem); - return 1; - - fail: - up_read(&mm->mmap_sem); - return 0; + return SUCCESS; } asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) { static unsigned int fixup_idx = 0; + struct mm_struct *mm = current->mm; unsigned int fi; int save_indirect_reg, hash, i; unsigned int insn_len = (unsigned int)error_code, new_insn_len; @@ -288,13 +314,16 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) return; } - if ( unlikely(!safe_to_patch(eip)) ) - return; + /* Hold the mmap_sem to prevent the mapping from disappearing under us. */ + down_read(&mm->mmap_sem); + + if ( unlikely(!safe_to_patch(mm, eip)) ) + goto out; if ( unlikely(copy_from_user(b, (void *)eip, sizeof(b)) != 0) ) { DPRINTK("Could not read instruction bytes from user space."); - return; + goto out; } /* Already created a fixup for this code sequence? */ @@ -312,7 +341,7 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) if ( !printed ) printk(KERN_ALERT "WARNING: Out of room in segment-fixup page.\n"); printed = 1; - return; + goto out; } /* Must be a handleable opcode with GS override. */ @@ -320,7 +349,7 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) !test_bit((unsigned int)b[1], (unsigned long *)handleable_code) ) { DPRINTK("No GS override, or not a MOV (%02x %02x).", b[0], b[1]); - return; + goto out; } modrm = b[2]; @@ -335,7 +364,7 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) if ( rm == 4 ) { DPRINTK("We don't grok SIB bytes."); - return; + goto out; } /* Ensure Mod/RM specifies (r32) or disp8(r32). */ @@ -345,14 +374,14 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) if ( rm == 5 ) { DPRINTK("Unhandleable disp32 EA %d.", rm); - return; + goto out; } break; /* m32 == (r32) */ case 1: break; /* m32 == disp8(r32) */ default: DPRINTK("Unhandleable Mod value %d.", mod); - return; + goto out; } /* Indirect jump pointer. */ @@ -398,7 +427,7 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) parse_insn(&b[insn_len], &opcode, &decode)) == 0) ) { DPRINTK("Could not decode following instruction."); - return; + goto out; } if ( (decode & CODE_MASK) == JMP ) @@ -520,7 +549,7 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) test_bit(opcode, (unsigned long *)opcode_uses_reg) ) { DPRINTK("Data movement to ESP unsupported."); - return; + goto out; } if ( rm == 4 ) @@ -528,7 +557,7 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) if ( mod == 3 ) { DPRINTK("Data movement to ESP is unsupported."); - return; + goto out; } sib = fixup_buf[fi++] = b[insn_len++]; @@ -585,14 +614,14 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) if ( (insn_len += new_insn_len) > 20 ) { DPRINTK("Code to patch is too long!"); - return; + goto out; } /* Can't have a RET in the middle of a patch sequence. */ if ( (opcode == 0xc3) && (insn_len < PATCH_LEN) ) { DPRINTK("RET in middle of patch seq!\n"); - return; + goto out; } } @@ -601,7 +630,7 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) if ( unlikely(fe == NULL) ) { DPRINTK("Not enough memory to allocate a fixup_entry."); - return; + goto out; } fe->patched_code_len = insn_len; memcpy(fe->patched_code, b, insn_len); @@ -619,7 +648,13 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) if ( unlikely(((eip ^ (eip + fe->patched_code_len)) & PAGE_MASK) != 0) ) { DPRINTK("Patch instruction would straddle a page boundary."); - return; + goto out; + } + + if ( put_user(eip + PATCH_LEN, (unsigned long *)regs->esp - 1) != 0 ) + { + DPRINTK("Failed to place return address on user stack."); + goto out; } /* Create the patching instructions in a temporary buffer. */ @@ -630,40 +665,56 @@ asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) for ( i = 5; i < fe->patched_code_len; i++ ) patch[i] = 0x90; /* nop */ - /* Find the physical page that is to be patched. Check it isn't dirty. */ + spin_lock(&mm->page_table_lock); + + /* Find the physical page that is to be patched. */ pgd = pgd_offset(current->mm, eip); pmd = pmd_offset(pgd, eip); pte = pte_offset_kernel(pmd, eip); page = pte_page(*pte); - if ( unlikely(PageDirty(page)) ) + + /* + * We get lock to prevent page going AWOL on us. Also a locked page + * might be getting flushed to disc! + */ + if ( unlikely(TestSetPageLocked(page)) ) { - DPRINTK("Page is already dirty."); - return; + DPRINTK("Page is locked."); + spin_unlock(&mm->page_table_lock); + goto out; } - if ( put_user(eip + PATCH_LEN, (unsigned long *)regs->esp - 1) != 0 ) + /* + * If page is dirty it will get flushed back to disc - bad news! An + * anonymous page may be moulinexed under our feet by another thread. + */ + if ( unlikely(PageDirty(page)) || unlikely(PageAnon(page)) ) { - DPRINTK("Failed to place return address on user stack."); - return; + DPRINTK("Page is dirty or anonymous."); + unlock_page(page); + spin_unlock(&mm->page_table_lock); + goto out; } - /* Success! Return to user land to execute 2nd insn of the pair. */ - regs->esp -= 4; - regs->eip = FIXUP_BUF_USER + fe->return_idx; - - /* [SMP] Need to pause other threads while patching. */ veip = kmap(page); memcpy((char *)veip + (eip & ~PAGE_MASK), patch, fe->patched_code_len); kunmap(page); - return; + unlock_page(page); + spin_unlock(&mm->page_table_lock); + + /* Success! Return to user land to execute 2nd insn of the pair. */ + regs->esp -= 4; + regs->eip = FIXUP_BUF_USER + fe->return_idx; + + out: + up_read(&mm->mmap_sem); } static int nosegfixup = 0; static int __init fixup_init(void) { - struct vm_struct vma; struct page *_pages[1<<FIXUP_BUF_ORDER], **pages=_pages; int i; @@ -677,9 +728,8 @@ static int __init fixup_init(void) for ( i = 0; i < (1<<FIXUP_BUF_ORDER); i++ ) _pages[i] = virt_to_page(fixup_buf) + i; - vma.addr = (void *)FIXUP_BUF_USER; - vma.size = FIXUP_BUF_SIZE + PAGE_SIZE; /* fucking stupid interface */ - if ( map_vm_area(&vma, PAGE_READONLY, &pages) != 0 ) + if ( __vmalloc_area_pages(FIXUP_BUF_USER, FIXUP_BUF_SIZE, + 0, PAGE_READONLY, &pages) != 0 ) BUG(); memset(fixup_hash, 0, sizeof(fixup_hash)); |