35 files changed, 217 insertions(+), 2998 deletions(-)
@@ -167,7 +167,6 @@
 3e5a4e66rw65CxyolW9PKz4GG42RcA linux-2.4.29-xen-sparse/drivers/char/tty_io.c
 40c9c0c1pPwYE3-4i-oI3ubUu7UgvQ linux-2.4.29-xen-sparse/drivers/scsi/aic7xxx/Makefile
 41f97f64nW0wmgLxhwzPTzkF4E5ERA linux-2.4.29-xen-sparse/drivers/usb/hcd.c
-3e5a4e669uzIE54VwucPYtGwXLAbzA linux-2.4.29-xen-sparse/fs/exec.c
 3e5a4e66wbeCpsJgVf_U8Jde-CNcsA linux-2.4.29-xen-sparse/include/asm-xen/bugs.h
 3e5a4e66HdSkvIV6SJ1evG_xmTmXHA linux-2.4.29-xen-sparse/include/asm-xen/desc.h
 3e5a4e66SYp_UpAVcF8Lc1wa3Qtgzw linux-2.4.29-xen-sparse/include/asm-xen/fixmap.h
@@ -205,8 +204,6 @@
 3f108af5VxPkLv13tXpXgoRKALQtXQ linux-2.4.29-xen-sparse/mm/mprotect.c
 3e5a4e681xMPdF9xCMwpyfuYMySU5g linux-2.4.29-xen-sparse/mm/mremap.c
 409ba2e7akOFqQUg6Qyg2s28xcXiMg linux-2.4.29-xen-sparse/mm/page_alloc.c
-3e5a4e683HKVU-sxtagrDasRB8eBVw linux-2.4.29-xen-sparse/mm/swapfile.c
-41180721bNns9Na7w1nJ0ZVt8bhUNA linux-2.4.29-xen-sparse/mm/vmalloc.c
 41505c57WAd5l1rlfCLNSCpx9J13vA linux-2.4.29-xen-sparse/net/core/skbuff.c
 40f562372u3A7_kfbYYixPHJJxYUxA linux-2.6.11-xen-sparse/arch/xen/Kconfig
 40f56237utH41NPukqHksuNf29IC9A linux-2.6.11-xen-sparse/arch/xen/Kconfig.drivers
diff --git a/BitKeeper/etc/logging_ok b/BitKeeper/etc/logging_ok
index 442fa4c826..a5cd08799d 100644
--- a/BitKeeper/etc/logging_ok
+++ b/BitKeeper/etc/logging_ok
@@ -74,6 +74,7 @@ rneugeba@wyvis.research.intel-research.net
 sd386@font.cl.cam.ac.uk
 shand@spidean.research.intel-research.net
 smh22@boulderdash.cl.cam.ac.uk
+smh22@firebug.cl.cam.ac.uk
 smh22@labyrinth.cl.cam.ac.uk
 smh22@tempest.cl.cam.ac.uk
 smh22@uridium.cl.cam.ac.uk
diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S b/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S
index cda41ae56c..c856a0bd29 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S
+++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S
@@ -1,6 +1,9 @@
 .section __xen_guest
-	.asciz "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
+	.ascii "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
+	.ascii ",LOADER=generic"
+	.ascii ",PT_MODE_WRITABLE"
+	.byte  0
 
 .text
 #include <linux/config.h>
diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c
index 374c9b6c30..61fc1eb824 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c
+++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c
@@ -84,6 +84,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
 	}
 	memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
 	make_pages_readonly(new->ldt, (new->size*LDT_ENTRY_SIZE)/PAGE_SIZE);
+	flush_page_update_queue();
 	return 0;
 }
diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c
index ada06dd973..f593714e02 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c
+++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c
@@ -623,6 +623,7 @@ void __init trap_init(void)
 	set_call_gate(&default_ldt[0],lcall7);
 	set_call_gate(&default_ldt[4],lcall27);
 	__make_page_readonly(&default_ldt[0]);
+	flush_page_update_queue();
 
 	cpu_init();
 }
diff --git a/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c b/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c
index d19218fe32..49a0afc887 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c
+++ b/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c
@@ -296,7 +296,6 @@ vmalloc_fault:
 		if (!pmd_present(*pmd_k))
 			goto no_context;
 		set_pmd(pmd, *pmd_k);
-		XEN_flush_page_update_queue(); /* flush PMD update */
 
 		pte_k = pte_offset(pmd_k, address);
 		if (!pte_present(*pte_k))
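The pattern running through this changeset: the guest now advertises LOADER=generic and PT_MODE_WRITABLE in its __xen_guest header, so its page tables are mapped read-only and an ordinary PTE store is trapped, validated and applied by Xen. Plain set_pte() writes therefore no longer need an explicit XEN_flush_page_update_queue(); only genuinely batched operations (queue_pgd_pin(), queue_pte_pin() and friends) still require flush_page_update_queue(), which is why ldt.c and traps.c gain one after make_pages_readonly()/__make_page_readonly(). A rough stand-alone sketch of the two models, with made-up stand-in types rather than the kernel's:

    /* Illustrative stand-ins only -- not the kernel's definitions. */
    typedef struct { unsigned long pte_low; } pte_t;

    static pte_t *queued_ptes[64];
    static unsigned long queued_vals[64];
    static int nqueued;

    /* Queued model: the update is only staged here... */
    static void queue_l1_entry_update_sketch(pte_t *ptep, unsigned long val)
    {
        queued_ptes[nqueued] = ptep;
        queued_vals[nqueued] = val;
        nqueued++;
    }

    /* ...and only takes effect when the batch is handed over (in
     * reality a single hypercall applies all staged updates). */
    static void flush_page_update_queue_sketch(void)
    {
        for (int i = 0; i < nqueued; i++)
            queued_ptes[i]->pte_low = queued_vals[i];
        nqueued = 0;
    }

    /* Writable-page-table model: one plain store; Xen traps the write
     * to the read-only page table, validates it, and applies it. */
    static void set_pte_sketch(pte_t *ptep, pte_t val)
    {
        *ptep = val;
    }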
diff --git a/linux-2.4.29-xen-sparse/arch/xen/mm/init.c b/linux-2.4.29-xen-sparse/arch/xen/mm/init.c
index 40a5af9273..88d775bcd4 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/mm/init.c
+++ b/linux-2.4.29-xen-sparse/arch/xen/mm/init.c
@@ -142,7 +142,7 @@ static inline void set_pte_phys (unsigned long vaddr,
 	}
 	pte = pte_offset(pmd, vaddr);
 
-	queue_l1_entry_update(pte, phys | pgprot_val(prot));
+	set_pte(pte, (pte_t) { phys | pgprot_val(prot) });
 
 	/*
 	 * It's enough to flush this one mapping.
@@ -201,17 +201,13 @@ static void __init fixrange_init (unsigned long start,
 				kpgd = pgd_offset_k((unsigned long)pte);
 				kpmd = pmd_offset(kpgd, (unsigned long)pte);
 				kpte = pte_offset(kpmd, (unsigned long)pte);
-				queue_l1_entry_update(kpte,
-					(*(unsigned long *)kpte)&~_PAGE_RW);
-
+				set_pte(kpte, pte_wrprotect(*kpte));
 				set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
 			}
 			vaddr += PMD_SIZE;
 		}
 		j = 0;
 	}
-
-	XEN_flush_page_update_queue();
 }
@@ -257,10 +253,8 @@ static void __init pagetable_init (void)
 		kpgd = pgd_offset_k((unsigned long)pte_base);
 		kpmd = pmd_offset(kpgd, (unsigned long)pte_base);
 		kpte = pte_offset(kpmd, (unsigned long)pte_base);
-		queue_l1_entry_update(kpte,
-			(*(unsigned long *)kpte)&~_PAGE_RW);
+		set_pte(kpte, pte_wrprotect(*kpte));
 		set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
-		XEN_flush_page_update_queue();
 	}
 }
@@ -311,6 +305,7 @@ void __init paging_init(void)
 	pagetable_init();
 	zone_sizes_init();
 
+	/* Switch to the real shared_info page, and clear the dummy page. */
 	set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
 	HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
diff --git a/linux-2.4.29-xen-sparse/fs/exec.c b/linux-2.4.29-xen-sparse/fs/exec.c
deleted file mode 100644
index 8a114151a9..0000000000
--- a/linux-2.4.29-xen-sparse/fs/exec.c
+++ /dev/null
@@ -1,1179 +0,0 @@
-/*
- * linux/fs/exec.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/*
- * #!-checking implemented by tytso.
- */
-/*
- * Demand-loading implemented 01.12.91 - no need to read anything but
- * the header into memory. The inode of the executable is put into
- * "current->executable", and page faults do the actual loading. Clean.
- *
- * Once more I can proudly say that linux stood up to being changed: it
- * was less than 2 hours work to get demand-loading completely implemented.
- *
- * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
- * current->executable is only used by the procfs. This allows a dispatch
- * table to check for several different types of binary formats. We keep
- * trying until we recognize the file or we run out of supported binary
- * formats.
- */ - -#include <linux/config.h> -#include <linux/slab.h> -#include <linux/file.h> -#include <linux/mman.h> -#include <linux/a.out.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/smp_lock.h> -#include <linux/init.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/spinlock.h> -#include <linux/personality.h> -#include <linux/swap.h> -#include <linux/utsname.h> -#define __NO_VERSION__ -#include <linux/module.h> - -#include <asm/uaccess.h> -#include <asm/pgalloc.h> -#include <asm/mmu_context.h> - -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif - -int core_uses_pid; -char core_pattern[65] = "core"; -int core_setuid_ok = 0; -/* The maximal length of core_pattern is also specified in sysctl.c */ - -static struct linux_binfmt *formats; -static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED; - -int register_binfmt(struct linux_binfmt * fmt) -{ - struct linux_binfmt ** tmp = &formats; - - if (!fmt) - return -EINVAL; - if (fmt->next) - return -EBUSY; - write_lock(&binfmt_lock); - while (*tmp) { - if (fmt == *tmp) { - write_unlock(&binfmt_lock); - return -EBUSY; - } - tmp = &(*tmp)->next; - } - fmt->next = formats; - formats = fmt; - write_unlock(&binfmt_lock); - return 0; -} - -int unregister_binfmt(struct linux_binfmt * fmt) -{ - struct linux_binfmt ** tmp = &formats; - - write_lock(&binfmt_lock); - while (*tmp) { - if (fmt == *tmp) { - *tmp = fmt->next; - write_unlock(&binfmt_lock); - return 0; - } - tmp = &(*tmp)->next; - } - write_unlock(&binfmt_lock); - return -EINVAL; -} - -static inline void put_binfmt(struct linux_binfmt * fmt) -{ - if (fmt->module) - __MOD_DEC_USE_COUNT(fmt->module); -} - -/* - * Note that a shared library must be both readable and executable due to - * security reasons. - * - * Also note that we take the address to load from from the file itself. - */ -asmlinkage long sys_uselib(const char * library) -{ - struct file * file; - struct nameidata nd; - int error; - - error = user_path_walk(library, &nd); - if (error) - goto out; - - error = -EINVAL; - if (!S_ISREG(nd.dentry->d_inode->i_mode)) - goto exit; - - error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC); - if (error) - goto exit; - - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - - error = -ENOEXEC; - if(file->f_op && file->f_op->read) { - struct linux_binfmt * fmt; - - read_lock(&binfmt_lock); - for (fmt = formats ; fmt ; fmt = fmt->next) { - if (!fmt->load_shlib) - continue; - if (!try_inc_mod_count(fmt->module)) - continue; - read_unlock(&binfmt_lock); - error = fmt->load_shlib(file); - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (error != -ENOEXEC) - break; - } - read_unlock(&binfmt_lock); - } - fput(file); -out: - return error; -exit: - path_release(&nd); - goto out; -} - -/* - * count() counts the number of arguments/envelopes - */ -static int count(char ** argv, int max) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - char * p; - - if (get_user(p, argv)) - return -EFAULT; - if (!p) - break; - argv++; - if(++i > max) - return -E2BIG; - } - } - return i; -} - -/* - * 'copy_strings()' copies argument/envelope strings from user - * memory to free pages in kernel mem. These are in a format ready - * to be put directly into the top of new user memory. 
- */ -int copy_strings(int argc,char ** argv, struct linux_binprm *bprm) -{ - struct page *kmapped_page = NULL; - char *kaddr = NULL; - int ret; - - while (argc-- > 0) { - char *str; - int len; - unsigned long pos; - - if (get_user(str, argv+argc) || - !(len = strnlen_user(str, bprm->p))) { - ret = -EFAULT; - goto out; - } - - if (bprm->p < len) { - ret = -E2BIG; - goto out; - } - - bprm->p -= len; - /* XXX: add architecture specific overflow check here. */ - pos = bprm->p; - - while (len > 0) { - int i, new, err; - int offset, bytes_to_copy; - struct page *page; - - offset = pos % PAGE_SIZE; - i = pos/PAGE_SIZE; - page = bprm->page[i]; - new = 0; - if (!page) { - page = alloc_page(GFP_HIGHUSER); - bprm->page[i] = page; - if (!page) { - ret = -ENOMEM; - goto out; - } - new = 1; - } - - if (page != kmapped_page) { - if (kmapped_page) - kunmap(kmapped_page); - kmapped_page = page; - kaddr = kmap(kmapped_page); - } - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - err = copy_from_user(kaddr+offset, str, bytes_to_copy); - if (err) { - ret = -EFAULT; - goto out; - } - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; - } - } - ret = 0; -out: - if (kmapped_page) - kunmap(kmapped_page); - return ret; -} - -/* - * Like copy_strings, but get argv and its values from kernel memory. - */ -int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) -{ - int r; - mm_segment_t oldfs = get_fs(); - set_fs(KERNEL_DS); - r = copy_strings(argc, argv, bprm); - set_fs(oldfs); - return r; -} - -/* - * This routine is used to map in a page into an address space: needed by - * execve() for the initial stack and environment pages. - * - * tsk->mmap_sem is held for writing. 
- */ -void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address) -{ - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; - struct vm_area_struct *vma; - pgprot_t prot = PAGE_COPY; - - if (page_count(page) != 1) - printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); - if (!pmd) - goto out; - pte = pte_alloc(tsk->mm, pmd, address); - if (!pte) - goto out; - if (!pte_none(*pte)) - goto out; - lru_cache_add(page); - flush_dcache_page(page); - flush_page_to_ram(page); - /* lookup is cheap because there is only a single entry in the list */ - vma = find_vma(tsk->mm, address); - if (vma) - prot = vma->vm_page_prot; - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - XEN_flush_page_update_queue(); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); - - /* no need for flush_tlb */ - return; -out: - spin_unlock(&tsk->mm->page_table_lock); - __free_page(page); - force_sig(SIGKILL, tsk); - return; -} - -int setup_arg_pages(struct linux_binprm *bprm) -{ - unsigned long stack_base; - struct vm_area_struct *mpnt; - int i, ret; - - stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE; - - bprm->p += stack_base; - if (bprm->loader) - bprm->loader += stack_base; - bprm->exec += stack_base; - - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!mpnt) - return -ENOMEM; - - down_write(¤t->mm->mmap_sem); - { - mpnt->vm_mm = current->mm; - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; - mpnt->vm_end = STACK_TOP; - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7]; - mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; - mpnt->vm_file = NULL; - mpnt->vm_private_data = (void *) 0; - if ((ret = insert_vm_struct(current->mm, mpnt))) { - up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } - current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - } - - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page *page = bprm->page[i]; - if (page) { - bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base); - } - stack_base += PAGE_SIZE; - } - up_write(¤t->mm->mmap_sem); - - return 0; -} - -struct file *open_exec(const char *name) -{ - struct nameidata nd; - struct inode *inode; - struct file *file; - int err = 0; - - err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); - file = ERR_PTR(err); - if (!err) { - inode = nd.dentry->d_inode; - file = ERR_PTR(-EACCES); - if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && - S_ISREG(inode->i_mode)) { - int err = permission(inode, MAY_EXEC); - if (!err && !(inode->i_mode & 0111)) - err = -EACCES; - file = ERR_PTR(err); - if (!err) { - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { - fput(file); - file = ERR_PTR(err); - } - } -out: - return file; - } - } - path_release(&nd); - } - goto out; -} - -int kernel_read(struct file *file, unsigned long offset, - char * addr, unsigned long count) -{ - mm_segment_t old_fs; - loff_t pos = offset; - int result = -ENOSYS; - - if (!file->f_op->read) - goto fail; - old_fs = get_fs(); - set_fs(get_ds()); - result = file->f_op->read(file, addr, count, &pos); - set_fs(old_fs); -fail: - return result; -} - -static int exec_mmap(void) -{ - struct mm_struct * mm, * old_mm; - - old_mm = current->mm; - - if (old_mm && atomic_read(&old_mm->mm_users) == 1) { - mm_release(); - down_write(&old_mm->mmap_sem); - 
exit_mmap(old_mm); - up_write(&old_mm->mmap_sem); - return 0; - } - - - mm = mm_alloc(); - if (mm) { - struct mm_struct *active_mm; - - if (init_new_context(current, mm)) { - mmdrop(mm); - return -ENOMEM; - } - - /* Add it to the list of mm's */ - spin_lock(&mmlist_lock); - list_add(&mm->mmlist, &init_mm.mmlist); - mmlist_nr++; - spin_unlock(&mmlist_lock); - - task_lock(current); - active_mm = current->active_mm; - current->mm = mm; - current->active_mm = mm; - task_unlock(current); - activate_mm(active_mm, mm); - mm_release(); - if (old_mm) { - if (active_mm != old_mm) BUG(); - mmput(old_mm); - return 0; - } - mmdrop(active_mm); - return 0; - } - return -ENOMEM; -} - -/* - * This function makes sure the current process has its own signal table, - * so that flush_signal_handlers can later reset the handlers without - * disturbing other processes. (Other processes might share the signal - * table via the CLONE_SIGNAL option to clone().) - */ - -static inline int make_private_signals(void) -{ - struct signal_struct * newsig; - - if (atomic_read(¤t->sig->count) <= 1) - return 0; - newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL); - if (newsig == NULL) - return -ENOMEM; - spin_lock_init(&newsig->siglock); - atomic_set(&newsig->count, 1); - memcpy(newsig->action, current->sig->action, sizeof(newsig->action)); - spin_lock_irq(¤t->sigmask_lock); - current->sig = newsig; - spin_unlock_irq(¤t->sigmask_lock); - return 0; -} - -/* - * If make_private_signals() made a copy of the signal table, decrement the - * refcount of the original table, and free it if necessary. - * We don't do that in make_private_signals() so that we can back off - * in flush_old_exec() if an error occurs after calling make_private_signals(). - */ - -static inline void release_old_signals(struct signal_struct * oldsig) -{ - if (current->sig == oldsig) - return; - if (atomic_dec_and_test(&oldsig->count)) - kmem_cache_free(sigact_cachep, oldsig); -} - -/* - * These functions flushes out all traces of the currently running executable - * so that a new one can be started - */ - -static inline void flush_old_files(struct files_struct * files) -{ - long j = -1; - - write_lock(&files->file_lock); - for (;;) { - unsigned long set, i; - - j++; - i = j * __NFDBITS; - if (i >= files->max_fds || i >= files->max_fdset) - break; - set = files->close_on_exec->fds_bits[j]; - if (!set) - continue; - files->close_on_exec->fds_bits[j] = 0; - write_unlock(&files->file_lock); - for ( ; set ; i++,set >>= 1) { - if (set & 1) { - sys_close(i); - } - } - write_lock(&files->file_lock); - - } - write_unlock(&files->file_lock); -} - -/* - * An execve() will automatically "de-thread" the process. - * Note: we don't have to hold the tasklist_lock to test - * whether we migth need to do this. If we're not part of - * a thread group, there is no way we can become one - * dynamically. And if we are, we only need to protect the - * unlink - even if we race with the last other thread exit, - * at worst the list_del_init() might end up being a no-op. - */ -static inline void de_thread(struct task_struct *tsk) -{ - if (!list_empty(&tsk->thread_group)) { - write_lock_irq(&tasklist_lock); - list_del_init(&tsk->thread_group); - write_unlock_irq(&tasklist_lock); - } - - /* Minor oddity: this might stay the same. 
*/ - tsk->tgid = tsk->pid; -} - -void get_task_comm(char *buf, struct task_struct *tsk) -{ - /* buf must be at least sizeof(tsk->comm) in size */ - task_lock(tsk); - memcpy(buf, tsk->comm, sizeof(tsk->comm)); - task_unlock(tsk); -} - -void set_task_comm(struct task_struct *tsk, char *buf) -{ - task_lock(tsk); - strncpy(tsk->comm, buf, sizeof(tsk->comm)); - tsk->comm[sizeof(tsk->comm)-1]='\0'; - task_unlock(tsk); -} - -int flush_old_exec(struct linux_binprm * bprm) -{ - char * name; - int i, ch, retval; - struct signal_struct * oldsig; - struct files_struct * files; - char tcomm[sizeof(current->comm)]; - - /* - * Make sure we have a private signal table - */ - oldsig = current->sig; - retval = make_private_signals(); - if (retval) goto flush_failed; - - /* - * Make sure we have private file handles. Ask the - * fork helper to do the work for us and the exit - * helper to do the cleanup of the old one. - */ - - files = current->files; /* refcounted so safe to hold */ - retval = unshare_files(); - if(retval) - goto flush_failed; - - /* - * Release all of the old mmap stuff - */ - retval = exec_mmap(); - if (retval) goto mmap_failed; - - /* This is the point of no return */ - steal_locks(files); - put_files_struct(files); - release_old_signals(oldsig); - - current->sas_ss_sp = current->sas_ss_size = 0; - - if (current->euid == current->uid && current->egid == current->gid) { - current->mm->dumpable = 1; - current->task_dumpable = 1; - } - name = bprm->filename; - for (i=0; (ch = *(name++)) != '\0';) { - if (ch == '/') - i = 0; - else - if (i < (sizeof(tcomm) - 1)) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; - set_task_comm(current, tcomm); - - flush_thread(); - - de_thread(current); - - if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || - permission(bprm->file->f_dentry->d_inode,MAY_READ)) - current->mm->dumpable = 0; - - /* An exec changes our domain. We are no longer part of the thread - group */ - - current->self_exec_id++; - - flush_signal_handlers(current); - flush_old_files(current->files); - - return 0; - -mmap_failed: - put_files_struct(current->files); - current->files = files; -flush_failed: - spin_lock_irq(¤t->sigmask_lock); - if (current->sig != oldsig) { - kmem_cache_free(sigact_cachep, current->sig); - current->sig = oldsig; - } - spin_unlock_irq(¤t->sigmask_lock); - return retval; -} - -/* - * We mustn't allow tracing of suid binaries, unless - * the tracer has the capability to trace anything.. - */ -static inline int must_not_trace_exec(struct task_struct * p) -{ - return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP); -} - -/* - * Fill the binprm structure from the inode. - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes - */ -int prepare_binprm(struct linux_binprm *bprm) -{ - int mode; - struct inode * inode = bprm->file->f_dentry->d_inode; - - mode = inode->i_mode; - /* - * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, - * vfs_permission lets a non-executable through - */ - if (!(mode & 0111)) /* with at least _one_ execute bit set */ - return -EACCES; - if (bprm->file->f_op == NULL) - return -EACCES; - - bprm->e_uid = current->euid; - bprm->e_gid = current->egid; - - if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { - /* Set-uid? */ - if (mode & S_ISUID) - bprm->e_uid = inode->i_uid; - - /* Set-gid? */ - /* - * If setgid is set but no group execute bit then this - * is a candidate for mandatory locking, not a setgid - * executable. 
- */ - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) - bprm->e_gid = inode->i_gid; - } - - /* We don't have VFS support for capabilities yet */ - cap_clear(bprm->cap_inheritable); - cap_clear(bprm->cap_permitted); - cap_clear(bprm->cap_effective); - - /* To support inheritance of root-permissions and suid-root - * executables under compatibility mode, we raise all three - * capability sets for the file. - * - * If only the real uid is 0, we only raise the inheritable - * and permitted sets of the executable file. - */ - - if (!issecure(SECURE_NOROOT)) { - if (bprm->e_uid == 0 || current->uid == 0) { - cap_set_full(bprm->cap_inheritable); - cap_set_full(bprm->cap_permitted); - } - if (bprm->e_uid == 0) - cap_set_full(bprm->cap_effective); - } - - memset(bprm->buf,0,BINPRM_BUF_SIZE); - return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE); -} - -/* - * This function is used to produce the new IDs and capabilities - * from the old ones and the file's capabilities. - * - * The formula used for evolving capabilities is: - * - * pI' = pI - * (***) pP' = (fP & X) | (fI & pI) - * pE' = pP' & fE [NB. fE is 0 or ~0] - * - * I=Inheritable, P=Permitted, E=Effective // p=process, f=file - * ' indicates post-exec(), and X is the global 'cap_bset'. - * - */ - -void compute_creds(struct linux_binprm *bprm) -{ - kernel_cap_t new_permitted, working; - int do_unlock = 0; - - new_permitted = cap_intersect(bprm->cap_permitted, cap_bset); - working = cap_intersect(bprm->cap_inheritable, - current->cap_inheritable); - new_permitted = cap_combine(new_permitted, working); - - if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || - !cap_issubset(new_permitted, current->cap_permitted)) { - current->mm->dumpable = 0; - - lock_kernel(); - if (must_not_trace_exec(current) - || atomic_read(¤t->fs->count) > 1 - || atomic_read(¤t->files->count) > 1 - || atomic_read(¤t->sig->count) > 1) { - if(!capable(CAP_SETUID)) { - bprm->e_uid = current->uid; - bprm->e_gid = current->gid; - } - if(!capable(CAP_SETPCAP)) { - new_permitted = cap_intersect(new_permitted, - current->cap_permitted); - } - } - do_unlock = 1; - } - - - /* For init, we want to retain the capabilities set - * in the init_task struct. Thus we skip the usual - * capability rules */ - if (current->pid != 1) { - current->cap_permitted = new_permitted; - current->cap_effective = - cap_intersect(new_permitted, bprm->cap_effective); - } - - /* AUD: Audit candidate if current->cap_effective is set */ - - current->suid = current->euid = current->fsuid = bprm->e_uid; - current->sgid = current->egid = current->fsgid = bprm->e_gid; - - if(do_unlock) - unlock_kernel(); - current->keep_capabilities = 0; -} - - -void remove_arg_zero(struct linux_binprm *bprm) -{ - if (bprm->argc) { - unsigned long offset; - char * kaddr; - struct page *page; - - offset = bprm->p % PAGE_SIZE; - goto inside; - - while (bprm->p++, *(kaddr+offset++)) { - if (offset != PAGE_SIZE) - continue; - offset = 0; - kunmap(page); -inside: - page = bprm->page[bprm->p/PAGE_SIZE]; - kaddr = kmap(page); - } - kunmap(page); - bprm->argc--; - } -} - -/* - * cycle the list of binary formats handler, until one recognizes the image - */ -int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) -{ - int try,retval=0; - struct linux_binfmt *fmt; -#ifdef __alpha__ - /* handle /sbin/loader.. 
*/ - { - struct exec * eh = (struct exec *) bprm->buf; - - if (!bprm->loader && eh->fh.f_magic == 0x183 && - (eh->fh.f_flags & 0x3000) == 0x3000) - { - struct file * file; - unsigned long loader; - - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - - loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - - file = open_exec("/sbin/loader"); - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - /* Remember if the application is TASO. */ - bprm->sh_bang = eh->ah.entry < 0x100000000; - - bprm->file = file; - bprm->loader = loader; - retval = prepare_binprm(bprm); - if (retval<0) - return retval; - /* should call search_binary_handler recursively here, - but it does not matter */ - } - } -#endif - /* kernel module loader fixup */ - /* so we don't try to load run modprobe in kernel space. */ - set_fs(USER_DS); - for (try=0; try<2; try++) { - read_lock(&binfmt_lock); - for (fmt = formats ; fmt ; fmt = fmt->next) { - int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; - if (!fn) - continue; - if (!try_inc_mod_count(fmt->module)) - continue; - read_unlock(&binfmt_lock); - retval = fn(bprm, regs); - if (retval >= 0) { - put_binfmt(fmt); - allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); - bprm->file = NULL; - current->did_exec = 1; - return retval; - } - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (retval != -ENOEXEC) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } - } - read_unlock(&binfmt_lock); - if (retval != -ENOEXEC) { - break; -#ifdef CONFIG_KMOD - }else{ -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) - char modname[20]; - if (printable(bprm->buf[0]) && - printable(bprm->buf[1]) && - printable(bprm->buf[2]) && - printable(bprm->buf[3])) - break; /* -ENOEXEC */ - sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); - request_module(modname); -#endif - } - } - return retval; -} - - -/* - * sys_execve() executes a new program. 
- */ -int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs) -{ - struct linux_binprm bprm; - struct file *file; - int retval; - int i; - - file = open_exec(filename); - - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); - - bprm.file = file; - bprm.filename = filename; - bprm.sh_bang = 0; - bprm.loader = 0; - bprm.exec = 0; - if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) { - allow_write_access(file); - fput(file); - return bprm.argc; - } - - if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) { - allow_write_access(file); - fput(file); - return bprm.envc; - } - - retval = prepare_binprm(&bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm.filename, &bprm); - if (retval < 0) - goto out; - - bprm.exec = bprm.p; - retval = copy_strings(bprm.envc, envp, &bprm); - if (retval < 0) - goto out; - - retval = copy_strings(bprm.argc, argv, &bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(&bprm,regs); - if (retval >= 0) - /* execve success */ - return retval; - -out: - /* Something went wrong, return the inode and free the argument pages*/ - allow_write_access(bprm.file); - if (bprm.file) - fput(bprm.file); - - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm.page[i]; - if (page) - __free_page(page); - } - - return retval; -} - -void set_binfmt(struct linux_binfmt *new) -{ - struct linux_binfmt *old = current->binfmt; - if (new && new->module) - __MOD_INC_USE_COUNT(new->module); - current->binfmt = new; - if (old && old->module) - __MOD_DEC_USE_COUNT(old->module); -} - -#define CORENAME_MAX_SIZE 64 - -/* format_corename will inspect the pattern parameter, and output a - * name into corename, which must have space for at least - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 
- */ -void format_corename(char *corename, const char *pattern, long signr) -{ - const char *pat_ptr = pattern; - char *out_ptr = corename; - char *const out_end = corename + CORENAME_MAX_SIZE; - int rc; - int pid_in_pattern = 0; - - /* Repeat as long as we have more pattern to process and more output - space */ - while (*pat_ptr) { - if (*pat_ptr != '%') { - if (out_ptr == out_end) - goto out; - *out_ptr++ = *pat_ptr++; - } else { - switch (*++pat_ptr) { - case 0: - goto out; - /* Double percent, output one percent */ - case '%': - if (out_ptr == out_end) - goto out; - *out_ptr++ = '%'; - break; - /* pid */ - case 'p': - pid_in_pattern = 1; - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current->pid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* uid */ - case 'u': - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current->uid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* gid */ - case 'g': - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current->gid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* signal that caused the coredump */ - case 's': - rc = snprintf(out_ptr, out_end - out_ptr, - "%ld", signr); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* UNIX time of coredump */ - case 't': { - struct timeval tv; - do_gettimeofday(&tv); - rc = snprintf(out_ptr, out_end - out_ptr, - "%ld", tv.tv_sec); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - } - /* hostname */ - case 'h': - down_read(&uts_sem); - rc = snprintf(out_ptr, out_end - out_ptr, - "%s", system_utsname.nodename); - up_read(&uts_sem); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* executable */ - case 'e': - rc = snprintf(out_ptr, out_end - out_ptr, - "%s", current->comm); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - default: - break; - } - ++pat_ptr; - } - } - /* Backward compatibility with core_uses_pid: - * - * If core_pattern does not include a %p (as is the default) - * and core_uses_pid is set, then .%pid will be appended to - * the filename */ - if (!pid_in_pattern - && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { - rc = snprintf(out_ptr, out_end - out_ptr, - ".%d", current->pid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - } - out: - *out_ptr = 0; -} - -int do_coredump(long signr, struct pt_regs * regs) -{ - struct linux_binfmt * binfmt; - char corename[CORENAME_MAX_SIZE + 1]; - struct file * file; - struct inode * inode; - int retval = 0; - int fsuid = current->fsuid; - - lock_kernel(); - binfmt = current->binfmt; - if (!binfmt || !binfmt->core_dump) - goto fail; - if (!is_dumpable(current)) - { - if(!core_setuid_ok || !current->task_dumpable) - goto fail; - current->fsuid = 0; - } - current->mm->dumpable = 0; - if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) - goto fail; - - format_corename(corename, core_pattern, signr); - file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600); - if (IS_ERR(file)) - goto fail; - inode = file->f_dentry->d_inode; - if (inode->i_nlink > 1) - goto close_fail; /* multiple links - don't dump */ - if (d_unhashed(file->f_dentry)) - goto close_fail; - - if (!S_ISREG(inode->i_mode)) - goto close_fail; - if (!file->f_op) - goto close_fail; - if (!file->f_op->write) - goto close_fail; - if (do_truncate(file->f_dentry, 0) != 0) - goto close_fail; - - retval = binfmt->core_dump(signr, regs, file); - -close_fail: - filp_close(file, NULL); -fail: - if (fsuid != 
current->fsuid)
-		current->fsuid = fsuid;
-	unlock_kernel();
-	return retval;
-}
diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/page.h b/linux-2.4.29-xen-sparse/include/asm-xen/page.h
index fbab7f5ff1..3150545429 100644
--- a/linux-2.4.29-xen-sparse/include/asm-xen/page.h
+++ b/linux-2.4.29-xen-sparse/include/asm-xen/page.h
@@ -85,23 +85,18 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 static inline unsigned long pmd_val(pmd_t x)
 {
     unsigned long ret = x.pmd;
-    if ( (ret & 1) ) ret = machine_to_phys(ret);
+    if ( ret ) ret = machine_to_phys(ret) | 1;
     return ret;
 }
 #define pmd_val_ma(x)   ((x).pmd)
 #define pgd_val(x)      ({ BUG(); (unsigned long)0; })
 #define pgprot_val(x)   ((x).pgprot)
 
-static inline pte_t __pte(unsigned long x)
-{
-    if ( (x & 1) ) x = phys_to_machine(x);
-    return ((pte_t) { (x) });
-}
-static inline pmd_t __pmd(unsigned long x)
-{
-    if ( (x & 1) ) x = phys_to_machine(x);
-    return ((pmd_t) { (x) });
-}
+#define __pte(x) ({ unsigned long _x = (x); \
+    (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
+#define __pte_ma(x) ((pte_t) { (x) } )
+#define __pmd(x) ({ unsigned long _x = (x); \
+    (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
 #define __pgd(x) ({ BUG(); (pgprot_t) { 0 }; })
 #define __pgprot(x) ((pgprot_t) { (x) } )
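The page.h rework keeps the pseudo-physical-to-machine translation rule but turns __pte()/__pmd() into macros and adds __pte_ma() for values that are already machine addresses, such as entries read back out of a live page table. A stand-alone model of the rule, with a fabricated pfn-to-mfn table standing in for the real phys_to_machine() mapping:

    #define PAGE_SHIFT_SK 12

    /* Fabricated pfn->mfn table, standing in for the real mapping. */
    static const unsigned long pfn_to_mfn_sk[4] = { 7, 3, 9, 1 };

    static unsigned long phys_to_machine_sk(unsigned long phys)
    {
        unsigned long mfn = pfn_to_mfn_sk[phys >> PAGE_SHIFT_SK];
        /* The low flag/offset bits are carried over unchanged. */
        return (mfn << PAGE_SHIFT_SK) | (phys & ((1UL << PAGE_SHIFT_SK) - 1));
    }

    /* __pte(): translate only when the present bit (bit 0) is set.
     * __pte_ma(): the value is already a machine PTE; wrap it untouched. */
    static unsigned long make_pte_sk(unsigned long x)
    {
        return (x & 1) ? phys_to_machine_sk(x) : x;
    }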
diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h
index 4e9584e918..2a0c226c71 100644
--- a/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h
+++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h
@@ -22,7 +22,6 @@
 #define pmd_populate(mm, pmd, pte) \
  do { \
   set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
-  XEN_flush_page_update_queue(); \
  } while ( 0 )
 
 /*
@@ -79,8 +78,9 @@ static inline pgd_t *get_pgd_slow(void)
 		memcpy(pgd + USER_PTRS_PER_PGD,
 			init_mm.pgd + USER_PTRS_PER_PGD,
 			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-		__make_page_readonly(pgd);
+		__make_page_readonly(pgd);
 		queue_pgd_pin(__pa(pgd));
+		flush_page_update_queue();
 	}
 	return pgd;
 }
@@ -111,7 +111,8 @@ static inline void free_pgd_slow(pgd_t *pgd)
 	kmem_cache_free(pae_pgd_cachep, pgd);
 #else
 	queue_pgd_unpin(__pa(pgd));
-	__make_page_writable(pgd);
+	__make_page_writable(pgd);
+	flush_page_update_queue();
 	free_page((unsigned long)pgd);
 #endif
 }
@@ -135,6 +136,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 		clear_page(pte);
 		__make_page_readonly(pte);
 		queue_pte_pin(__pa(pte));
+		flush_page_update_queue();
 	}
 
 	return pte;
@@ -155,6 +157,7 @@ static __inline__ void pte_free_slow(pte_t *pte)
 {
 	queue_pte_unpin(__pa(pte));
 	__make_page_writable(pte);
+	flush_page_update_queue();
 	free_page((unsigned long)pte);
 }
@@ -208,22 +211,19 @@ extern int do_check_pgt_cache(int, int);
 
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
-	if (mm == current->active_mm) queue_tlb_flush();
-	XEN_flush_page_update_queue();
+	if (mm == current->active_mm) xen_tlb_flush();
 }
 
 static inline void flush_tlb_page(struct vm_area_struct *vma,
 	unsigned long addr)
 {
-	if (vma->vm_mm == current->active_mm) queue_invlpg(addr);
-	XEN_flush_page_update_queue();
+	if (vma->vm_mm == current->active_mm) xen_invlpg(addr);
 }
 
 static inline void flush_tlb_range(struct mm_struct *mm,
 	unsigned long start, unsigned long end)
 {
-	if (mm == current->active_mm) queue_tlb_flush();
-	XEN_flush_page_update_queue();
+	if (mm == current->active_mm) xen_tlb_flush();
 }
 #else
@@ -261,7 +261,6 @@ static inline void flush_tlb_pgtables(struct mm_struct *mm,
 					unsigned long start, unsigned long end)
 {
 	/* i386 does not keep any page table caches in TLB */
-	XEN_flush_page_update_queue();
 }
 
 /*
diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h
index d91b48360e..70f8356fb1 100644
--- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h
+++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h
@@ -34,9 +34,19 @@ static inline int pgd_bad(pgd_t pgd) { return 0; }
 static inline int pgd_present(pgd_t pgd) { return 1; }
 #define pgd_clear(xp) do { } while (0)
 
-#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
-#define set_pte_atomic(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
-#define set_pmd(pmdptr, pmdval) queue_l2_entry_update((pmdptr), (pmdval))
+/*
+ * Certain architectures need to do special things when PTEs
+ * within a page table are directly modified. Thus, the following
+ * hook is made available.
+ */
+#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
+#define set_pte_atomic(pteptr, pteval) (*(pteptr) = pteval)
+
+/*
+ * (pmds are folded into pgds so this doesnt get actually called,
+ * but the define is needed for a generic inline function.)
+ */
+#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
 #define set_pgd(pgdptr, pgdval) ((void)0)
 
 #define pgd_page(pgd) \
@@ -47,6 +57,7 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 	return (pmd_t *) dir;
 }
 
+#define ptep_get_and_clear(xp)	__pte_ma(xchg(&(xp)->pte_low, 0))
 #define pte_same(a, b)		((a).pte_low == (b).pte_low)
 
 /*
@@ -83,21 +94,4 @@
 #define pte_none(x)		(!(x).pte_low)
 #define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
 
-/*
- * A note on implementation of this atomic 'get-and-clear' operation.
- * This is actually very simple because XenoLinux can only run on a single
- * processor. Therefore, we cannot race other processors setting the 'accessed'
- * or 'dirty' bits on a page-table entry.
- * Even if pages are shared between domains, that is not a problem because
- * each domain will have separate page tables, with their own versions of
- * accessed & dirty state.
- */
-static inline pte_t ptep_get_and_clear(pte_t *xp)
-{
-	pte_t pte = *xp;
-	if ( !pte_none(pte) )
-		queue_l1_entry_update(xp, 0);
-	return pte;
-}
-
 #endif /* _I386_PGTABLE_2LEVEL_H */
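The uniprocessor-only ptep_get_and_clear() above is replaced by a one-line xchg(): the old entry is fetched and zeroed in a single atomic step, so the hypervisor's own accessed/dirty updates cannot slip in between the read and the clear. Because the value read out of a live page table is a machine PTE, it is wrapped with the new __pte_ma() rather than __pte(), which would translate it a second time. A stand-alone C11 model of the idea (names suffixed _sk are stand-ins):

    #include <stdatomic.h>

    typedef struct { unsigned long pte_low; } pte_sk_t;

    /* Machine value already: wrap without any phys<->machine translation. */
    #define __pte_ma_sk(x) ((pte_sk_t) { (x) })

    static pte_sk_t ptep_get_and_clear_sk(_Atomic unsigned long *pte_low)
    {
        /* One atomic step: fetch the old entry and leave zero behind. */
        return __pte_ma_sk(atomic_exchange(pte_low, 0));
    }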
diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h
index c15f0e9509..f5a53adc82 100644
--- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h
+++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h
@@ -38,11 +38,11 @@ extern void paging_init(void);
 
 extern unsigned long pgkern_mask;
 
-#define __flush_tlb() ({ queue_tlb_flush(); XEN_flush_page_update_queue(); })
+#define __flush_tlb() xen_tlb_flush()
 #define __flush_tlb_global() __flush_tlb()
 #define __flush_tlb_all() __flush_tlb_global()
-#define __flush_tlb_one(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
-#define __flush_tlb_single(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
+#define __flush_tlb_one(addr) xen_invlpg(addr)
+#define __flush_tlb_single(addr) xen_invlpg(addr)
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
@@ -179,12 +179,14 @@ extern void * high_memory;
 #define __S111	PAGE_SHARED
 
 #define pte_present(x)	((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
-#define pte_clear(xp)	queue_l1_entry_update(xp, 0)
+#define pte_clear(xp)	do { set_pte(xp, __pte(0)); } while (0)
 
-#define pmd_none(x)	(!(x).pmd)
-#define pmd_present(x)	((x).pmd & _PAGE_PRESENT)
+#define pmd_none(x)	(!pmd_val(x))
+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+   can temporarily clear it. */
+#define pmd_present(x)	(pmd_val(x))
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-#define pmd_bad(x)	(((x).pmd & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+#define pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
 
 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
@@ -212,29 +214,28 @@ static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return p
 
 static inline int ptep_test_and_clear_dirty(pte_t *ptep)
 {
-	unsigned long pteval = *(unsigned long *)ptep;
-	int ret = pteval & _PAGE_DIRTY;
-	if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_DIRTY);
-	return ret;
+	if (!pte_dirty(*ptep))
+		return 0;
+	return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
 }
-static inline int ptep_test_and_clear_young(pte_t *ptep)
+
+static inline int ptep_test_and_clear_young(pte_t *ptep)
 {
-	unsigned long pteval = *(unsigned long *)ptep;
-	int ret = pteval & _PAGE_ACCESSED;
-	if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_ACCESSED);
-	return ret;
+	if (!pte_young(*ptep))
+		return 0;
+	return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
 }
+
 static inline void ptep_set_wrprotect(pte_t *ptep)
 {
-	unsigned long pteval = *(unsigned long *)ptep;
-	if ( (pteval & _PAGE_RW) )
-		queue_l1_entry_update(ptep, pteval & ~_PAGE_RW);
+	if (pte_write(*ptep))
+		clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
 }
+
 static inline void ptep_mkdirty(pte_t *ptep)
 {
-	unsigned long pteval = *(unsigned long *)ptep;
-	if ( !(pteval & _PAGE_DIRTY) )
-		queue_l1_entry_update(ptep, pteval | _PAGE_DIRTY);
+	if (!pte_dirty(*ptep))
+		set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
 }
 
 /*
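Two things change in pgtable.h. The accessed/dirty/RW helpers stop rewriting the whole PTE through the update queue and instead use locked bit operations on the live entry, which is safe now that the entry is directly writable. And pmd_present()/pmd_bad() stop insisting on _PAGE_PRESENT because, as the new comment notes, the writable-page-table machinery can temporarily clear that bit while a page table is unhooked for validation. A stand-alone model of the test-and-clear pattern (the bit number is the real x86 one; atomic_fetch_and() stands in for the kernel's test_and_clear_bit()):

    #include <stdatomic.h>

    #define _PAGE_BIT_ACCESSED_SK 5   /* x86: _PAGE_ACCESSED == 0x20 */

    static int ptep_test_and_clear_young_sk(_Atomic unsigned long *pte_low)
    {
        unsigned long mask = 1UL << _PAGE_BIT_ACCESSED_SK;

        if (!(atomic_load(pte_low) & mask))
            return 0;              /* fast path: bit already clear */
        /* Atomically clear the bit, reporting whether it was set. */
        return (atomic_fetch_and(pte_low, ~mask) & mask) != 0;
    }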
diff --git a/linux-2.4.29-xen-sparse/mm/highmem.c b/linux-2.4.29-xen-sparse/mm/highmem.c
index 341e6e29a9..f8182820ac 100644
--- a/linux-2.4.29-xen-sparse/mm/highmem.c
+++ b/linux-2.4.29-xen-sparse/mm/highmem.c
@@ -122,7 +122,6 @@ start:
 	}
 	vaddr = PKMAP_ADDR(last_pkmap_nr);
 	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
-	XEN_flush_page_update_queue();
 
 	pkmap_count[last_pkmap_nr] = 1;
 	page->virtual = (void *) vaddr;
diff --git a/linux-2.4.29-xen-sparse/mm/memory.c b/linux-2.4.29-xen-sparse/mm/memory.c
index 7d81c86589..880b6981c4 100644
--- a/linux-2.4.29-xen-sparse/mm/memory.c
+++ b/linux-2.4.29-xen-sparse/mm/memory.c
@@ -153,7 +153,6 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
 		free_one_pgd(page_dir);
 		page_dir++;
 	} while (--nr);
-	XEN_flush_page_update_queue();
 	spin_unlock(&mm->page_table_lock);
 
 	/* keep the page table cache within bounds */
@@ -249,10 +248,8 @@ skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
 
 				/* If it's a COW mapping, write protect it both in the parent and the child */
 				if (cow && pte_write(pte)) {
-					/* XEN modification: modified ordering here to avoid RaW hazard. */
-					pte = *src_pte;
-					pte = pte_wrprotect(pte);
 					ptep_set_wrprotect(src_pte);
+					pte = *src_pte;
 				}
 
 				/* If it's a shared mapping, mark it clean in the child */
@@ -914,7 +911,6 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr
 {
 #ifdef CONFIG_XEN
 	if ( likely(vma->vm_mm == current->mm) ) {
-		XEN_flush_page_update_queue();
 		HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
 	} else {
 		set_pte(page_table, entry);
@@ -1189,13 +1185,10 @@ static int do_swap_page(struct mm_struct * mm,
 	flush_page_to_ram(page);
 	flush_icache_page(vma, page);
 #ifdef CONFIG_XEN
-	if ( likely(vma->vm_mm == current->mm) ) {
-		XEN_flush_page_update_queue();
+	if ( likely(vma->vm_mm == current->mm) )
 		HYPERVISOR_update_va_mapping(address, pte, 0);
-	} else {
+	else
 		set_pte(page_table, pte);
-		XEN_flush_page_update_queue();
-	}
 #else
 	set_pte(page_table, pte);
 #endif
@@ -1245,13 +1238,10 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
 	}
 
 #ifdef CONFIG_XEN
-	if ( likely(vma->vm_mm == current->mm) ) {
-		XEN_flush_page_update_queue();
+	if ( likely(vma->vm_mm == current->mm) )
 		HYPERVISOR_update_va_mapping(addr, entry, 0);
-	} else {
+	else
 		set_pte(page_table, entry);
-		XEN_flush_page_update_queue();
-	}
 #else
 	set_pte(page_table, entry);
 #endif
@@ -1331,13 +1321,10 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (write_access)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 #ifdef CONFIG_XEN
-	if ( likely(vma->vm_mm == current->mm) ) {
-		XEN_flush_page_update_queue();
+	if ( likely(vma->vm_mm == current->mm) )
 		HYPERVISOR_update_va_mapping(address, entry, 0);
-	} else {
+	else
 		set_pte(page_table, entry);
-		XEN_flush_page_update_queue();
-	}
 #else
 	set_pte(page_table, entry);
 #endif
@@ -1484,7 +1471,6 @@ pte_t fastcall *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long addres
 	/* "fast" allocation can happen without dropping the lock.. */
 	new = pte_alloc_one_fast(mm, address);
 	if (!new) {
-		XEN_flush_page_update_queue();
 		spin_unlock(&mm->page_table_lock);
 		new = pte_alloc_one(mm, address);
 		spin_lock(&mm->page_table_lock);
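Two details in memory.c are worth noting. In copy_page_range(), the old code read and write-protected the PTE by hand before queueing, to dodge a read-after-write hazard against the not-yet-flushed queue; with immediate updates, ptep_set_wrprotect() can simply run first and the parent's PTE be re-read afterwards. In the fault handlers, HYPERVISOR_update_va_mapping() installs a PTE for the current address space in one hypercall (TLB handling included, with no preceding queue flush), while a foreign mm falls back to a plain set_pte(). A minimal sketch of that dispatch, with stand-in names:

    typedef struct { unsigned long pte_low; } pte_sk_t;

    /* Stand-ins for HYPERVISOR_update_va_mapping() and set_pte(). */
    static void update_va_mapping_sk(unsigned long va, pte_sk_t e)
    {
        (void)va; (void)e;          /* one hypercall in reality */
    }
    static void set_pte_sk(pte_sk_t *p, pte_sk_t e)
    {
        *p = e;                     /* trapped and validated by Xen */
    }

    static void establish_pte_sk(int mm_is_current, unsigned long address,
                                 pte_sk_t *page_table, pte_sk_t entry)
    {
        if (mm_is_current)
            update_va_mapping_sk(address, entry);
        else
            set_pte_sk(page_table, entry);
    }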
diff --git a/linux-2.4.29-xen-sparse/mm/mremap.c b/linux-2.4.29-xen-sparse/mm/mremap.c
index 330e194bae..475c308b1b 100644
--- a/linux-2.4.29-xen-sparse/mm/mremap.c
+++ b/linux-2.4.29-xen-sparse/mm/mremap.c
@@ -119,11 +119,9 @@ static int move_page_tables(struct mm_struct * mm,
 	 * the old page tables)
 	 */
 oops_we_failed:
-	XEN_flush_page_update_queue();
 	flush_cache_range(mm, new_addr, new_addr + len);
 	while ((offset += PAGE_SIZE) < len)
 		move_one_page(mm, new_addr + offset, old_addr + offset);
-	XEN_flush_page_update_queue();
 	zap_page_range(mm, new_addr, len);
 	return -1;
 }
diff --git a/linux-2.4.29-xen-sparse/mm/swapfile.c b/linux-2.4.29-xen-sparse/mm/swapfile.c
deleted file mode 100644
index 6457f19b74..0000000000
--- a/linux-2.4.29-xen-sparse/mm/swapfile.c
+++ /dev/null
@@ -1,1267 +0,0 @@
-/*
- * linux/mm/swapfile.c
- *
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- * Swap reorganised 29.12.95, Stephen Tweedie
- */
-
-#include <linux/slab.h>
-#include <linux/smp_lock.h>
-#include <linux/kernel_stat.h>
-#include <linux/swap.h>
-#include <linux/swapctl.h>
-#include <linux/blkdev.h> /* for blk_size */
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/shm.h>
-
-#include <asm/pgtable.h>
-
-spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
-unsigned int nr_swapfiles;
-int total_swap_pages;
-static int swap_overflow;
-
-static const char Bad_file[] = "Bad swap file entry ";
-static const char Unused_file[] = "Unused swap file entry ";
-static const char Bad_offset[] = "Bad swap offset entry ";
-static const char Unused_offset[] = "Unused swap offset entry ";
-
-struct swap_list_t swap_list = {-1, -1};
-
-struct swap_info_struct swap_info[MAX_SWAPFILES];
-
-#define SWAPFILE_CLUSTER 256
-
-static inline int scan_swap_map(struct swap_info_struct *si)
-{
-	unsigned long offset;
-	/*
-	 * We try to cluster swap pages by allocating them
-	 * sequentially in swap. Once we've allocated
-	 * SWAPFILE_CLUSTER pages this way, however, we resort to
-	 * first-free allocation, starting a new cluster. This
-	 * prevents us from scattering swap pages all over the entire
-	 * swap partition, so that we reduce overall disk seek times
-	 * between swap pages. -- sct */
-	if (si->cluster_nr) {
-		while (si->cluster_next <= si->highest_bit) {
-			offset = si->cluster_next++;
-			if (si->swap_map[offset])
-				continue;
-			si->cluster_nr--;
-			goto got_page;
-		}
-	}
-	si->cluster_nr = SWAPFILE_CLUSTER;
-
-	/* try to find an empty (even not aligned) cluster. */
-	offset = si->lowest_bit;
- check_next_cluster:
-	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
-	{
-		int nr;
-		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
-			if (si->swap_map[nr])
-			{
-				offset = nr+1;
-				goto check_next_cluster;
-			}
-		/* We found a completly empty cluster, so start
-		 * using it.
-		 */
-		goto got_page;
-	}
-	/* No luck, so now go finegrined as usual.
-Andrea */ - for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { - if (si->swap_map[offset]) - continue; - si->lowest_bit = offset+1; - got_page: - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - if (si->lowest_bit > si->highest_bit) { - si->lowest_bit = si->max; - si->highest_bit = 0; - } - si->swap_map[offset] = 1; - nr_swap_pages--; - si->cluster_next = offset+1; - return offset; - } - si->lowest_bit = si->max; - si->highest_bit = 0; - return 0; -} - -swp_entry_t get_swap_page(void) -{ - struct swap_info_struct * p; - unsigned long offset; - swp_entry_t entry; - int type, wrapped = 0; - - entry.val = 0; /* Out of memory */ - swap_list_lock(); - type = swap_list.next; - if (type < 0) - goto out; - if (nr_swap_pages <= 0) - goto out; - - while (1) { - p = &swap_info[type]; - if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - swap_device_lock(p); - offset = scan_swap_map(p); - swap_device_unlock(p); - if (offset) { - entry = SWP_ENTRY(type,offset); - type = swap_info[type].next; - if (type < 0 || - p->prio != swap_info[type].prio) { - swap_list.next = swap_list.head; - } else { - swap_list.next = type; - } - goto out; - } - } - type = p->next; - if (!wrapped) { - if (type < 0 || p->prio != swap_info[type].prio) { - type = swap_list.head; - wrapped = 1; - } - } else - if (type < 0) - goto out; /* out of swap space */ - } -out: - swap_list_unlock(); - return entry; -} - -static struct swap_info_struct * swap_info_get(swp_entry_t entry) -{ - struct swap_info_struct * p; - unsigned long offset, type; - - if (!entry.val) - goto out; - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) - goto bad_nofile; - p = & swap_info[type]; - if (!(p->flags & SWP_USED)) - goto bad_device; - offset = SWP_OFFSET(entry); - if (offset >= p->max) - goto bad_offset; - if (!p->swap_map[offset]) - goto bad_free; - swap_list_lock(); - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = type; - swap_device_lock(p); - return p; - -bad_free: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); - goto out; -bad_offset: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); - goto out; -bad_device: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); - goto out; -bad_nofile: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); -out: - return NULL; -} - -static void swap_info_put(struct swap_info_struct * p) -{ - swap_device_unlock(p); - swap_list_unlock(); -} - -static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) -{ - int count = p->swap_map[offset]; - - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = count; - if (!count) { - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - nr_swap_pages++; - } - } - return count; -} - -/* - * Caller has made sure that the swapdevice corresponding to entry - * is still around or has not been recycled. - */ -void swap_free(swp_entry_t entry) -{ - struct swap_info_struct * p; - - p = swap_info_get(entry); - if (p) { - swap_entry_free(p, SWP_OFFSET(entry)); - swap_info_put(p); - } -} - -/* - * Check if we're the only user of a swap page, - * when the page is locked. - */ -static int exclusive_swap_page(struct page *page) -{ - int retval = 0; - struct swap_info_struct * p; - swp_entry_t entry; - - entry.val = page->index; - p = swap_info_get(entry); - if (p) { - /* Is the only swap cache user the cache itself? 
*/ - if (p->swap_map[SWP_OFFSET(entry)] == 1) { - /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&pagecache_lock); - if (page_count(page) - !!page->buffers == 2) - retval = 1; - spin_unlock(&pagecache_lock); - } - swap_info_put(p); - } - return retval; -} - -/* - * We can use this swap cache entry directly - * if there are no other references to it. - * - * Here "exclusive_swap_page()" does the real - * work, but we opportunistically check whether - * we need to get all the locks first.. - */ -int fastcall can_share_swap_page(struct page *page) -{ - int retval = 0; - - if (!PageLocked(page)) - BUG(); - switch (page_count(page)) { - case 3: - if (!page->buffers) - break; - /* Fallthrough */ - case 2: - if (!PageSwapCache(page)) - break; - retval = exclusive_swap_page(page); - break; - case 1: - if (PageReserved(page)) - break; - retval = 1; - } - return retval; -} - -/* - * Work out if there are any other processes sharing this - * swap cache page. Free it if you can. Return success. - */ -int fastcall remove_exclusive_swap_page(struct page *page) -{ - int retval; - struct swap_info_struct * p; - swp_entry_t entry; - - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - return 0; - if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ - return 0; - - entry.val = page->index; - p = swap_info_get(entry); - if (!p) - return 0; - - /* Is the only swap cache user the cache itself? */ - retval = 0; - if (p->swap_map[SWP_OFFSET(entry)] == 1) { - /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&pagecache_lock); - if (page_count(page) - !!page->buffers == 2) { - __delete_from_swap_cache(page); - SetPageDirty(page); - retval = 1; - } - spin_unlock(&pagecache_lock); - } - swap_info_put(p); - - if (retval) { - block_flushpage(page, 0); - swap_free(entry); - page_cache_release(page); - } - - return retval; -} - -/* - * Free the swap entry like above, but also try to - * free the page cache entry if it is the last user. - */ -void free_swap_and_cache(swp_entry_t entry) -{ - struct swap_info_struct * p; - struct page *page = NULL; - - p = swap_info_get(entry); - if (p) { - if (swap_entry_free(p, SWP_OFFSET(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); - swap_info_put(p); - } - if (page) { - page_cache_get(page); - /* Only cache user (+us), or swap space full? Free it! */ - if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) { - delete_from_swap_cache(page); - SetPageDirty(page); - } - UnlockPage(page); - page_cache_release(page); - } -} - -/* - * The swap entry has been read in advance, and we return 1 to indicate - * that the page has been used or is no longer needed. - * - * Always set the resulting pte to be nowrite (the same as COW pages - * after one process has exited). We don't know just how many PTEs will - * share this swap entry, so be cautious and let do_wp_page work out - * what to do if a write is requested later. 
- */ -/* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, - pte_t *dir, swp_entry_t entry, struct page* page) -{ - pte_t pte = *dir; - - if (likely(pte_to_swp_entry(pte).val != entry.val)) - return; - if (unlikely(pte_none(pte) || pte_present(pte))) - return; - get_page(page); - set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); - swap_free(entry); - ++vma->vm_mm->rss; -} - -/* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, - unsigned long address, unsigned long size, unsigned long offset, - swp_entry_t entry, struct page* page) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*dir)) - return; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return; - } - pte = pte_offset(dir, address); - offset += address & PMD_MASK; - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -/* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, - unsigned long address, unsigned long size, - swp_entry_t entry, struct page* page) -{ - pmd_t * pmd; - unsigned long offset, end; - - if (pgd_none(*dir)) - return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return; - } - pmd = pmd_offset(dir, address); - offset = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - if (address >= end) - BUG(); - do { - unuse_pmd(vma, pmd, address, end - address, offset, entry, - page); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); -} - -/* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - swp_entry_t entry, struct page* page) -{ - unsigned long start = vma->vm_start, end = vma->vm_end; - - if (start >= end) - BUG(); - do { - unuse_pgd(vma, pgdir, start, end - start, entry, page); - start = (start + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (start && (start < end)); -} - -static void unuse_process(struct mm_struct * mm, - swp_entry_t entry, struct page* page) -{ - struct vm_area_struct* vma; - - /* - * Go through process' page directory. - */ - spin_lock(&mm->page_table_lock); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - pgd_t * pgd = pgd_offset(mm, vma->vm_start); - unuse_vma(vma, pgd, entry, page); - } - XEN_flush_page_update_queue(); - spin_unlock(&mm->page_table_lock); - return; -} - -/* - * Scan swap_map from current position to next entry still in use. - * Recycle to start on reaching the end, returning 0 when empty. - */ -static int find_next_to_unuse(struct swap_info_struct *si, int prev) -{ - int max = si->max; - int i = prev; - int count; - - /* - * No need for swap_device_lock(si) here: we're just looking - * for whether an entry is in use, not modifying it; false - * hits are okay, and sys_swapoff() has already prevented new - * allocations from this area (while holding swap_list_lock()). - */ - for (;;) { - if (++i >= max) { - if (!prev) { - i = 0; - break; - } - /* - * No entries in use at top of swap_map, - * loop back to start and recheck there. 
- */ - max = prev + 1; - prev = 0; - i = 1; - } - count = si->swap_map[i]; - if (count && count != SWAP_MAP_BAD) - break; - } - return i; -} - -/* - * We completely avoid races by reading each swap page in advance, - * and then search for the process using it. All the necessary - * page table adjustments can then be made atomically. - */ -static int try_to_unuse(unsigned int type) -{ - struct swap_info_struct * si = &swap_info[type]; - struct mm_struct *start_mm; - unsigned short *swap_map; - unsigned short swcount; - struct page *page; - swp_entry_t entry; - int i = 0; - int retval = 0; - int reset_overflow = 0; - int shmem; - - /* - * When searching mms for an entry, a good strategy is to - * start at the first mm we freed the previous entry from - * (though actually we don't notice whether we or coincidence - * freed the entry). Initialize this start_mm with a hold. - * - * A simpler strategy would be to start at the last mm we - * freed the previous entry from; but that would take less - * advantage of mmlist ordering (now preserved by swap_out()), - * which clusters forked address spaces together, most recent - * child immediately after parent. If we race with dup_mmap(), - * we very much want to resolve parent before child, otherwise - * we may miss some entries: using last mm would invert that. - */ - start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); - - /* - * Keep on scanning until all entries have gone. Usually, - * one pass through swap_map is enough, but not necessarily: - * mmput() removes mm from mmlist before exit_mmap() and its - * zap_page_range(). That's not too bad, those entries are - * on their way out, and handled faster there than here. - * do_munmap() behaves similarly, taking the range out of mm's - * vma list before zap_page_range(). But unfortunately, when - * unmapping a part of a vma, it takes the whole out first, - * then reinserts what's left after (might even reschedule if - * open() method called) - so swap entries may be invisible - * to swapoff for a while, then reappear - but that is rare. - */ - while ((i = find_next_to_unuse(si, i))) { - /* - * Get a page for the entry, using the existing swap - * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. - */ - swap_map = &si->swap_map[i]; - entry = SWP_ENTRY(type, i); - page = read_swap_cache_async(entry); - if (!page) { - /* - * Either swap_duplicate() failed because entry - * has been freed independently, and will not be - * reused since sys_swapoff() already disabled - * allocation from here, or alloc_page() failed. - */ - if (!*swap_map) - continue; - retval = -ENOMEM; - break; - } - - /* - * Don't hold on to start_mm if it looks like exiting. - */ - if (atomic_read(&start_mm->mm_users) == 1) { - mmput(start_mm); - start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); - } - - /* - * Wait for and lock page. When do_swap_page races with - * try_to_unuse, do_swap_page can handle the fault much - * faster than try_to_unuse can locate the entry. This - * apparently redundant "wait_on_page" lets try_to_unuse - * defer to do_swap_page in such a case - in some tests, - * do_swap_page and try_to_unuse repeatedly compete. - */ - wait_on_page(page); - lock_page(page); - - /* - * Remove all references to entry, without blocking. - * Whenever we reach init_mm, there's no address space - * to search, but use it as a reminder to search shmem. 
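find_next_to_unuse() above scans circularly: forward from the previous hit, wrapping once to the start, and returning 0 when nothing remains in use (entry 0 is the header page). A compact userspace model of the same scan; TOY_MAP_BAD is an illustrative stand-in for SWAP_MAP_BAD:

    #include <stdio.h>

    #define TOY_MAP_BAD 0x8000  /* stand-in for SWAP_MAP_BAD */

    /* Scan forward from prev for the next in-use entry, wrapping once
     * to the start; return 0 when the map is empty. */
    static int toy_find_next(const unsigned short *map, int max, int prev)
    {
        int i = prev;
        for (;;) {
            if (++i >= max) {
                if (!prev)
                    return 0;
                max = prev + 1;  /* recheck only what we skipped */
                prev = 0;
                i = 1;
            }
            if (map[i] && map[i] != TOY_MAP_BAD)
                return i;
        }
    }

    int main(void)
    {
        unsigned short map[8] = { 0, 0, 1, TOY_MAP_BAD, 0, 2, 0, 0 };
        int i = 0;
        while ((i = toy_find_next(map, 8, i)) != 0) {
            printf("in use: %d (count %u)\n", i, map[i]);
            map[i] = 0;  /* simulate the entry being unused */
        }
        return 0;
    }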
- */ - shmem = 0; - swcount = *swap_map; - if (swcount > 1) { - flush_page_to_ram(page); - if (start_mm == &init_mm) - shmem = shmem_unuse(entry, page); - else - unuse_process(start_mm, entry, page); - } - if (*swap_map > 1) { - int set_start_mm = (*swap_map >= swcount); - struct list_head *p = &start_mm->mmlist; - struct mm_struct *new_start_mm = start_mm; - struct mm_struct *mm; - - spin_lock(&mmlist_lock); - while (*swap_map > 1 && - (p = p->next) != &start_mm->mmlist) { - mm = list_entry(p, struct mm_struct, mmlist); - swcount = *swap_map; - if (mm == &init_mm) { - set_start_mm = 1; - spin_unlock(&mmlist_lock); - shmem = shmem_unuse(entry, page); - spin_lock(&mmlist_lock); - } else - unuse_process(mm, entry, page); - if (set_start_mm && *swap_map < swcount) { - new_start_mm = mm; - set_start_mm = 0; - } - } - atomic_inc(&new_start_mm->mm_users); - spin_unlock(&mmlist_lock); - mmput(start_mm); - start_mm = new_start_mm; - } - - /* - * How could swap count reach 0x7fff when the maximum - * pid is 0x7fff, and there's no way to repeat a swap - * page within an mm (except in shmem, where it's the - * shared object which takes the reference count)? - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. - * - * If that's wrong, then we should worry more about - * exit_mmap() and do_munmap() cases described above: - * we might be resetting SWAP_MAP_MAX too early here. - * We know "Undead"s can happen, they're okay, so don't - * report them; but do report if we reset SWAP_MAP_MAX. - */ - if (*swap_map == SWAP_MAP_MAX) { - swap_list_lock(); - swap_device_lock(si); - nr_swap_pages++; - *swap_map = 1; - swap_device_unlock(si); - swap_list_unlock(); - reset_overflow = 1; - } - - /* - * If a reference remains (rare), we would like to leave - * the page in the swap cache; but try_to_swap_out could - * then re-duplicate the entry once we drop page lock, - * so we might loop indefinitely; also, that page could - * not be swapped out to other storage meanwhile. So: - * delete from cache even if there's another reference, - * after ensuring that the data has been saved to disk - - * since if the reference remains (rarer), it will be - * read from disk into another page. Splitting into two - * pages would be incorrect if swap supported "shared - * private" pages, but they are handled by tmpfs files. - * - * Note shmem_unuse already deleted swappage from cache, - * unless corresponding filepage found already in cache: - * in which case it left swappage in cache, lowered its - * swap count to pass quickly through the loops above, - * and now we must reincrement count to try again later. - */ - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { - rw_swap_page(WRITE, page); - lock_page(page); - } - if (PageSwapCache(page)) { - if (shmem) - swap_duplicate(entry); - else - delete_from_swap_cache(page); - } - - /* - * So we could skip searching mms once swap count went - * to 1, we did not mark any present ptes as dirty: must - * mark page dirty so try_to_swap_out will preserve it. - */ - SetPageDirty(page); - UnlockPage(page); - page_cache_release(page); - - /* - * Make sure that we aren't completely killing - * interactive performance. Interruptible check on - * signal_pending() would be nice, but changes the spec? 
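The SWAP_MAP_MAX handling above treats each swap_map slot as a small saturating reference count: once pinned at the maximum it can no longer be trusted, and only this swapoff-time scan resets it. The increment side of that scheme, which mirrors swap_duplicate() further down the file, can be sketched in isolation; the constant here is illustrative:

    #include <stdio.h>

    #define TOY_MAP_MAX 0x7fff  /* illustrative saturation point */

    /* Increment a swap-map style count, pinning it at the maximum
     * instead of overflowing; a pinned count stays "permanent" until
     * a swapoff-style scan resets it, as in the block above. */
    static void toy_map_ref(unsigned short *count)
    {
        if (*count < TOY_MAP_MAX - 1)
            (*count)++;
        else
            *count = TOY_MAP_MAX;
    }

    int main(void)
    {
        unsigned short c = TOY_MAP_MAX - 2;
        for (int i = 0; i < 4; i++) {
            toy_map_ref(&c);
            printf("count = %#x\n", c);
        }
        return 0;
    }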
- */ - if (current->need_resched) - schedule(); - } - - mmput(start_mm); - if (reset_overflow) { - printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); - swap_overflow = 0; - } - return retval; -} - -asmlinkage long sys_swapoff(const char * specialfile) -{ - struct swap_info_struct * p = NULL; - unsigned short *swap_map; - struct nameidata nd; - int i, type, prev; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = user_path_walk(specialfile, &nd); - if (err) - goto out; - - lock_kernel(); - prev = -1; - swap_list_lock(); - for (type = swap_list.head; type >= 0; type = swap_info[type].next) { - p = swap_info + type; - if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - if (p->swap_file == nd.dentry) - break; - } - prev = type; - } - err = -EINVAL; - if (type < 0) { - swap_list_unlock(); - goto out_dput; - } - - if (prev < 0) { - swap_list.head = p->next; - } else { - swap_info[prev].next = p->next; - } - if (type == swap_list.next) { - /* just pick something that's safe... */ - swap_list.next = swap_list.head; - } - nr_swap_pages -= p->pages; - total_swap_pages -= p->pages; - p->flags = SWP_USED; - swap_list_unlock(); - unlock_kernel(); - err = try_to_unuse(type); - lock_kernel(); - if (err) { - /* re-insert swap space back into swap_list */ - swap_list_lock(); - for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) - if (p->prio >= swap_info[i].prio) - break; - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = p - swap_info; - else - swap_info[prev].next = p - swap_info; - nr_swap_pages += p->pages; - total_swap_pages += p->pages; - p->flags = SWP_WRITEOK; - swap_list_unlock(); - goto out_dput; - } - if (p->swap_device) - blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); - path_release(&nd); - - swap_list_lock(); - swap_device_lock(p); - nd.mnt = p->swap_vfsmnt; - nd.dentry = p->swap_file; - p->swap_vfsmnt = NULL; - p->swap_file = NULL; - p->swap_device = 0; - p->max = 0; - swap_map = p->swap_map; - p->swap_map = NULL; - p->flags = 0; - swap_device_unlock(p); - swap_list_unlock(); - vfree(swap_map); - err = 0; - -out_dput: - unlock_kernel(); - path_release(&nd); -out: - return err; -} - -int get_swaparea_info(char *buf) -{ - char * page = (char *) __get_free_page(GFP_KERNEL); - struct swap_info_struct *ptr = swap_info; - int i, j, len = 0, usedswap; - - if (!page) - return -ENOMEM; - - len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n"); - for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { - if ((ptr->flags & SWP_USED) && ptr->swap_map) { - char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt, - page, PAGE_SIZE); - - len += sprintf(buf + len, "%-31s ", path); - - if (!ptr->swap_device) - len += sprintf(buf + len, "file\t\t"); - else - len += sprintf(buf + len, "partition\t"); - - usedswap = 0; - for (j = 0; j < ptr->max; ++j) - switch (ptr->swap_map[j]) { - case SWAP_MAP_BAD: - case 0: - continue; - default: - usedswap++; - } - len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), - usedswap << (PAGE_SHIFT - 10), ptr->prio); - } - } - free_page((unsigned long) page); - return len; -} - -int is_swap_partition(kdev_t dev) { - struct swap_info_struct *ptr = swap_info; - int i; - - for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { - if (ptr->flags & SWP_USED) - if (ptr->swap_device == dev) - return 1; - } - return 0; -} - -/* - * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
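sys_swapoff()'s error path above reinserts the area into swap_list, an index-linked list kept in descending priority order; sys_swapon() below performs the same insertion. A standalone model of that loop, with an array of toy entries standing in for swap_info[]:

    #include <stdio.h>

    /* Toy index-linked list: next fields hold array indices, -1 ends
     * the list, head names the first entry. */
    struct toy_swap { int prio; int next; };

    static struct toy_swap tab[4] = {
        { .prio = 10, .next =  2 },
        { .prio = -1, .next = -1 },  /* not listed yet */
        { .prio =  5, .next =  3 },
        { .prio = -2, .next = -1 },
    };
    static int head = 0;

    /* Insert entry t in descending priority order, as the swapoff
     * error path and swapon both do. */
    static void toy_insert(int t)
    {
        int prev = -1, i;
        for (i = head; i >= 0; prev = i, i = tab[i].next)
            if (tab[t].prio >= tab[i].prio)
                break;
        tab[t].next = i;
        if (prev < 0)
            head = t;
        else
            tab[prev].next = t;
    }

    int main(void)
    {
        toy_insert(1);  /* prio -1 lands between 5 and -2 */
        for (int i = head; i >= 0; i = tab[i].next)
            printf("slot %d prio %d\n", i, tab[i].prio);
        return 0;
    }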
- * - * The swapon system call - */ -asmlinkage long sys_swapon(const char * specialfile, int swap_flags) -{ - struct swap_info_struct * p; - struct nameidata nd; - struct inode * swap_inode; - unsigned int type; - int i, j, prev; - int error; - static int least_priority = 0; - union swap_header *swap_header = 0; - int swap_header_version; - int nr_good_pages = 0; - unsigned long maxpages = 1; - int swapfilesize; - struct block_device *bdev = NULL; - unsigned short *swap_map; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - lock_kernel(); - swap_list_lock(); - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) - if (!(p->flags & SWP_USED)) - break; - error = -EPERM; - if (type >= MAX_SWAPFILES) { - swap_list_unlock(); - goto out; - } - if (type >= nr_swapfiles) - nr_swapfiles = type+1; - p->flags = SWP_USED; - p->swap_file = NULL; - p->swap_vfsmnt = NULL; - p->swap_device = 0; - p->swap_map = NULL; - p->lowest_bit = 0; - p->highest_bit = 0; - p->cluster_nr = 0; - p->sdev_lock = SPIN_LOCK_UNLOCKED; - p->next = -1; - if (swap_flags & SWAP_FLAG_PREFER) { - p->prio = - (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; - } else { - p->prio = --least_priority; - } - swap_list_unlock(); - error = user_path_walk(specialfile, &nd); - if (error) - goto bad_swap_2; - - p->swap_file = nd.dentry; - p->swap_vfsmnt = nd.mnt; - swap_inode = nd.dentry->d_inode; - error = -EINVAL; - - if (S_ISBLK(swap_inode->i_mode)) { - kdev_t dev = swap_inode->i_rdev; - struct block_device_operations *bdops; - devfs_handle_t de; - - if (is_mounted(dev)) { - error = -EBUSY; - goto bad_swap_2; - } - - p->swap_device = dev; - set_blocksize(dev, PAGE_SIZE); - - bd_acquire(swap_inode); - bdev = swap_inode->i_bdev; - de = devfs_get_handle_from_inode(swap_inode); - bdops = devfs_get_ops(de); /* Increments module use count */ - if (bdops) bdev->bd_op = bdops; - - error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); - devfs_put_ops(de);/*Decrement module use count now we're safe*/ - if (error) - goto bad_swap_2; - set_blocksize(dev, PAGE_SIZE); - error = -ENODEV; - if (!dev || (blk_size[MAJOR(dev)] && - !blk_size[MAJOR(dev)][MINOR(dev)])) - goto bad_swap; - swapfilesize = 0; - if (blk_size[MAJOR(dev)]) - swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] - >> (PAGE_SHIFT - 10); - } else if (S_ISREG(swap_inode->i_mode)) - swapfilesize = swap_inode->i_size >> PAGE_SHIFT; - else - goto bad_swap; - - error = -EBUSY; - for (i = 0 ; i < nr_swapfiles ; i++) { - struct swap_info_struct *q = &swap_info[i]; - if (i == type || !q->swap_file) - continue; - if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) - goto bad_swap; - } - - swap_header = (void *) __get_free_page(GFP_USER); - if (!swap_header) { - printk("Unable to start swapping: out of memory :-)\n"); - error = -ENOMEM; - goto bad_swap; - } - - lock_page(virt_to_page(swap_header)); - rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header); - - if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) - swap_header_version = 1; - else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) - swap_header_version = 2; - else { - printk("Unable to find swap-space signature\n"); - error = -EINVAL; - goto bad_swap; - } - - switch (swap_header_version) { - case 1: - memset(((char *) swap_header)+PAGE_SIZE-10,0,10); - j = 0; - p->lowest_bit = 0; - p->highest_bit = 0; - for (i = 1 ; i < 8*PAGE_SIZE ; i++) { - if (test_bit(i,(char *) swap_header)) { - if (!p->lowest_bit) - p->lowest_bit = i; - p->highest_bit = i; - maxpages = i+1; - j++; - } 
- } - nr_good_pages = j; - p->swap_map = vmalloc(maxpages * sizeof(short)); - if (!p->swap_map) { - error = -ENOMEM; - goto bad_swap; - } - for (i = 1 ; i < maxpages ; i++) { - if (test_bit(i,(char *) swap_header)) - p->swap_map[i] = 0; - else - p->swap_map[i] = SWAP_MAP_BAD; - } - break; - - case 2: - /* Check the swap header's sub-version and the size of - the swap file and bad block lists */ - if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); - error = -EINVAL; - goto bad_swap; - } - - p->lowest_bit = 1; - maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1; - if (maxpages > swap_header->info.last_page) - maxpages = swap_header->info.last_page; - p->highest_bit = maxpages - 1; - - error = -EINVAL; - if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; - - /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { - error = -ENOMEM; - goto bad_swap; - } - - error = 0; - memset(p->swap_map, 0, maxpages * sizeof(short)); - for (i=0; i<swap_header->info.nr_badpages; i++) { - int page = swap_header->info.badpages[i]; - if (page <= 0 || page >= swap_header->info.last_page) - error = -EINVAL; - else - p->swap_map[page] = SWAP_MAP_BAD; - } - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (error) - goto bad_swap; - } - - if (swapfilesize && maxpages > swapfilesize) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - error = -EINVAL; - goto bad_swap; - } - if (!nr_good_pages) { - printk(KERN_WARNING "Empty swap-file\n"); - error = -EINVAL; - goto bad_swap; - } - p->swap_map[0] = SWAP_MAP_BAD; - swap_list_lock(); - swap_device_lock(p); - p->max = maxpages; - p->flags = SWP_WRITEOK; - p->pages = nr_good_pages; - nr_swap_pages += nr_good_pages; - total_swap_pages += nr_good_pages; - printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", - nr_good_pages<<(PAGE_SHIFT-10), p->prio); - - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) { - break; - } - prev = i; - } - p->next = i; - if (prev < 0) { - swap_list.head = swap_list.next = p - swap_info; - } else { - swap_info[prev].next = p - swap_info; - } - swap_device_unlock(p); - swap_list_unlock(); - error = 0; - goto out; -bad_swap: - if (bdev) - blkdev_put(bdev, BDEV_SWAP); -bad_swap_2: - swap_list_lock(); - swap_map = p->swap_map; - nd.mnt = p->swap_vfsmnt; - nd.dentry = p->swap_file; - p->swap_device = 0; - p->swap_file = NULL; - p->swap_vfsmnt = NULL; - p->swap_map = NULL; - p->flags = 0; - if (!(swap_flags & SWAP_FLAG_PREFER)) - ++least_priority; - swap_list_unlock(); - if (swap_map) - vfree(swap_map); - path_release(&nd); -out: - if (swap_header) - free_page((long) swap_header); - unlock_kernel(); - return error; -} - -void si_swapinfo(struct sysinfo *val) -{ - unsigned int i; - unsigned long nr_to_be_unused = 0; - - swap_list_lock(); - for (i = 0; i < nr_swapfiles; i++) { - unsigned int j; - if (swap_info[i].flags != SWP_USED) - continue; - for (j = 0; j < swap_info[i].max; ++j) { - switch (swap_info[i].swap_map[j]) { - case 0: - case SWAP_MAP_BAD: - continue; - default: - nr_to_be_unused++; - } - } - } - val->freeswap = nr_swap_pages + nr_to_be_unused; - val->totalswap = total_swap_pages + nr_to_be_unused; - swap_list_unlock(); -} - -/* - * Verify that a swap entry is valid and increment its swap 
map count. - * - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as - * "permanent", but will be reclaimed by the next swapoff. - */ -int swap_duplicate(swp_entry_t entry) -{ - struct swap_info_struct * p; - unsigned long offset, type; - int result = 0; - - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) - goto bad_file; - p = type + swap_info; - offset = SWP_OFFSET(entry); - - swap_device_lock(p); - if (offset < p->max && p->swap_map[offset]) { - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { - p->swap_map[offset]++; - result = 1; - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { - if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; - result = 1; - } - } - swap_device_unlock(p); -out: - return result; - -bad_file: - printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; -} - -/* - * Prior swap_duplicate protects against swap device deletion. - */ -void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, - kdev_t *dev, struct inode **swapf) -{ - unsigned long type; - struct swap_info_struct *p; - - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); - return; - } - - p = &swap_info[type]; - *offset = SWP_OFFSET(entry); - if (*offset >= p->max && *offset != 0) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); - return; - } - if (p->swap_map && !p->swap_map[*offset]) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); - return; - } - if (!(p->flags & SWP_USED)) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); - return; - } - - if (p->swap_device) { - *dev = p->swap_device; - } else if (p->swap_file) { - *swapf = p->swap_file->d_inode; - } else { - printk(KERN_ERR "rw_swap_page: no swap file or device\n"); - } - return; -} - -/* - * swap_device_lock prevents swap_map being freed. Don't grab an extra - * reference on the swaphandle, it doesn't matter if it becomes unused. 
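swap_duplicate() and get_swaphandle_info() above both begin by splitting an entry into SWP_TYPE() and SWP_OFFSET(). The real packing is per-architecture and deliberately avoids pte bits the hardware interprets; the shifts below are assumptions chosen only to make the round trip visible, not the actual i386 layout:

    #include <stdio.h>

    /* Illustrative packing of (type, offset) into one word. */
    typedef struct { unsigned long val; } toy_entry_t;

    #define TOY_TYPE_SHIFT  1
    #define TOY_TYPE_BITS   7
    #define TOY_OFF_SHIFT   (TOY_TYPE_SHIFT + TOY_TYPE_BITS)

    #define TOY_ENTRY(type, off) \
        ((toy_entry_t){ ((unsigned long)(type) << TOY_TYPE_SHIFT) | \
                        ((unsigned long)(off)  << TOY_OFF_SHIFT) })
    #define TOY_TYPE(e)   (((e).val >> TOY_TYPE_SHIFT) & \
                           ((1UL << TOY_TYPE_BITS) - 1))
    #define TOY_OFFSET(e) ((e).val >> TOY_OFF_SHIFT)

    int main(void)
    {
        toy_entry_t e = TOY_ENTRY(3, 0x1234);
        printf("type=%lu offset=%#lx\n", TOY_TYPE(e),
               (unsigned long)TOY_OFFSET(e));
        return 0;
    }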
- */ -int valid_swaphandles(swp_entry_t entry, unsigned long *offset) -{ - int ret = 0, i = 1 << page_cluster; - unsigned long toff; - struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; - - if (!page_cluster) /* no readahead */ - return 0; - toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; - if (!toff) /* first page is swap header */ - toff++, i--; - *offset = toff; - - swap_device_lock(swapdev); - do { - /* Don't read-ahead past the end of the swap area */ - if (toff >= swapdev->max) - break; - /* Don't read in free or bad pages */ - if (!swapdev->swap_map[toff]) - break; - if (swapdev->swap_map[toff] == SWAP_MAP_BAD) - break; - toff++; - ret++; - } while (--i); - swap_device_unlock(swapdev); - return ret; -} diff --git a/linux-2.4.29-xen-sparse/mm/vmalloc.c b/linux-2.4.29-xen-sparse/mm/vmalloc.c deleted file mode 100644 index df02fcbf7a..0000000000 --- a/linux-2.4.29-xen-sparse/mm/vmalloc.c +++ /dev/null @@ -1,385 +0,0 @@ -/* - * linux/mm/vmalloc.c - * - * Copyright (C) 1993 Linus Torvalds - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 - */ - -#include <linux/config.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/spinlock.h> -#include <linux/highmem.h> -#include <linux/smp_lock.h> - -#include <asm/uaccess.h> -#include <asm/pgalloc.h> - -rwlock_t vmlist_lock = RW_LOCK_UNLOCKED; -struct vm_struct * vmlist; - -static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - pte = pte_offset(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - pte_t page; - page = ptep_get_and_clear(pte); - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - if (pte_present(page)) { - struct page *ptpage = pte_page(page); - if (VALID_PAGE(ptpage) && (!PageReserved(ptpage))) - __free_page(ptpage); - continue; - } - printk(KERN_CRIT "Whee.. 
Swapped out page in kernel page table\n"); - } while (address < end); -} - -static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size) -{ - pmd_t * pmd; - unsigned long end; - - if (pgd_none(*dir)) - return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return; - } - pmd = pmd_offset(dir, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - free_area_pte(pmd, address, end - address); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); -} - -void vmfree_area_pages(unsigned long address, unsigned long size) -{ - pgd_t * dir; - unsigned long end = address + size; - - dir = pgd_offset_k(address); - flush_cache_all(); - do { - free_area_pmd(dir, address, end - address); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - flush_tlb_all(); -} - -static inline int alloc_area_pte (pte_t * pte, unsigned long address, - unsigned long size, int gfp_mask, - pgprot_t prot, struct page ***pages) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - struct page * page; - - if (!pages) { - spin_unlock(&init_mm.page_table_lock); - page = alloc_page(gfp_mask); - spin_lock(&init_mm.page_table_lock); - } else { - page = (**pages); - (*pages)++; - - /* Add a reference to the page so we can free later */ - if (page) - atomic_inc(&page->count); - - } - if (!pte_none(*pte)) - printk(KERN_ERR "alloc_area_pte: page already exists\n"); - if (!page) - return -ENOMEM; - set_pte(pte, mk_pte(page, prot)); - address += PAGE_SIZE; - pte++; - } while (address < end); - return 0; -} - -static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, - unsigned long size, int gfp_mask, - pgprot_t prot, struct page ***pages) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - pte_t * pte = pte_alloc(&init_mm, pmd, address); - if (!pte) - return -ENOMEM; - if (alloc_area_pte(pte, address, end - address, - gfp_mask, prot, pages)) - return -ENOMEM; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - return 0; -} - -/*static inline*/ int __vmalloc_area_pages (unsigned long address, - unsigned long size, - int gfp_mask, - pgprot_t prot, - struct page ***pages) -{ - pgd_t * dir; - unsigned long start = address; - unsigned long end = address + size; - - dir = pgd_offset_k(address); - spin_lock(&init_mm.page_table_lock); - do { - pmd_t *pmd; - - pmd = pmd_alloc(&init_mm, dir, address); - if (!pmd) - goto err; - - if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot, pages)) - goto err; // The kernel NEVER reclaims pmds, so no need to undo pmd_alloc() here - - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); - flush_cache_all(); - XEN_flush_page_update_queue(); - return 0; -err: - spin_unlock(&init_mm.page_table_lock); - flush_cache_all(); - if (address > start) - vmfree_area_pages(start, address - start); - return -ENOMEM; -} - -int vmalloc_area_pages(unsigned long address, unsigned long size, - int gfp_mask, pgprot_t prot) -{ - return __vmalloc_area_pages(address, size, gfp_mask, prot, NULL); -} - -struct vm_struct * get_vm_area(unsigned long size, unsigned long flags) -{ - unsigned long addr, next; - struct vm_struct **p, *tmp, *area; - - area = (struct vm_struct *) 
kmalloc(sizeof(*area), GFP_KERNEL); - if (!area) - return NULL; - - size += PAGE_SIZE; - if (!size) { - kfree (area); - return NULL; - } - - addr = VMALLOC_START; - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { - if ((size + addr) < addr) - goto out; - if (size + addr <= (unsigned long) tmp->addr) - break; - next = tmp->size + (unsigned long) tmp->addr; - if (next > addr) - addr = next; - if (addr > VMALLOC_END-size) - goto out; - } - area->flags = flags; - area->addr = (void *)addr; - area->size = size; - area->next = *p; - *p = area; - write_unlock(&vmlist_lock); - return area; - -out: - write_unlock(&vmlist_lock); - kfree(area); - return NULL; -} - -void __vfree(void * addr, int free_area_pages) -{ - struct vm_struct **p, *tmp; - - if (!addr) - return; - if ((PAGE_SIZE-1) & (unsigned long) addr) { - printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); - return; - } - write_lock(&vmlist_lock); - for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { - if (tmp->addr == addr) { - *p = tmp->next; - if (free_area_pages) - vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size); - write_unlock(&vmlist_lock); - kfree(tmp); - return; - } - } - write_unlock(&vmlist_lock); - printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); -} - -void vfree(void * addr) -{ - __vfree(addr,1); -} - -void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot) -{ - void * addr; - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) - return NULL; - area = get_vm_area(size, VM_ALLOC); - if (!area) - return NULL; - addr = area->addr; - if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask, - prot, NULL)) { - __vfree(addr, 0); - return NULL; - } - return addr; -} - -void * vmap(struct page **pages, int count, - unsigned long flags, pgprot_t prot) -{ - void * addr; - struct vm_struct *area; - unsigned long size = count << PAGE_SHIFT; - - if (!size || size > (max_mapnr << PAGE_SHIFT)) - return NULL; - area = get_vm_area(size, flags); - if (!area) { - return NULL; - } - addr = area->addr; - if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, 0, - prot, &pages)) { - __vfree(addr, 0); - return NULL; - } - return addr; -} - -long vread(char *buf, char *addr, unsigned long count) -{ - struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - - read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - *buf = '\0'; - buf++; - addr++; - count--; - } - n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); - } -finished: - read_unlock(&vmlist_lock); - return buf - buf_start; -} - -long vwrite(char *buf, char *addr, unsigned long count) -{ - struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - - read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - buf++; - addr++; - count--; - } - n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if 
(count == 0) - goto finished; - *addr = *buf; - buf++; - addr++; - count--; - } while (--n > 0); - } -finished: - read_unlock(&vmlist_lock); - return buf - buf_start; -} diff --git a/linux-2.6.11-xen-sparse/arch/xen/Kconfig b/linux-2.6.11-xen-sparse/arch/xen/Kconfig index 2a8c5f200f..1c2ba9b4a2 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/Kconfig +++ b/linux-2.6.11-xen-sparse/arch/xen/Kconfig @@ -114,10 +114,6 @@ config XEN_BLKDEV_TAP to a character device, allowing device prototyping in application space. Odds are that you want to say N here. -config XEN_WRITABLE_PAGETABLES - bool - default y - config XEN_SCRUB_PAGES bool "Scrub memory before freeing it to Xen" default y diff --git a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig b/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig index e906f98521..a781740c94 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig +++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig @@ -19,7 +19,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set -CONFIG_XEN_WRITABLE_PAGETABLES=y CONFIG_XEN_SCRUB_PAGES=y CONFIG_X86=y # CONFIG_X86_64 is not set diff --git a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig index 95dee5b159..b1fc951a81 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig +++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig @@ -16,7 +16,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set -CONFIG_XEN_WRITABLE_PAGETABLES=y CONFIG_XEN_SCRUB_PAGES=y CONFIG_X86=y # CONFIG_X86_64 is not set diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c index b8829c8cdc..b7c29174fc 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c @@ -963,7 +963,7 @@ void __init trap_init(void) * and a callgate to lcall27 for Solaris/x86 binaries */ make_lowmem_page_readonly(&default_ldt[0]); - xen_flush_page_update_queue(); + flush_page_update_queue(); /* * Should be a barrier for any external CPU state. diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c index 7a0b091ca3..0cac0f30c3 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c @@ -553,7 +553,6 @@ vmalloc_fault: if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); - xen_flush_page_update_queue(); /* flush PMD update */ pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c index 62427b2301..f9d8e089e0 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c @@ -48,19 +48,12 @@ */ static spinlock_t update_lock = SPIN_LOCK_UNLOCKED; -/* Linux 2.6 isn't using the traditional batched interface. 
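The queue being resized in this hunk batches page-table updates as (pointer, value) pairs and hands them to Xen in one hypercall when the batch fills or a caller flushes explicitly. A generic sketch of that batching pattern follows; toy_apply_batch() merely stands in for the real HYPERVISOR_mmu_update() call, and all toy_* names are illustrative:

    #include <stdio.h>

    #define TOY_QUEUE_SIZE 128

    /* One queued page-table update: which entry, and its new value. */
    struct toy_mmu_update { unsigned long ptr, val; };

    static struct toy_mmu_update queue[TOY_QUEUE_SIZE];
    static unsigned int idx;

    /* Stand-in for the hypercall that applies a batch of updates. */
    static void toy_apply_batch(struct toy_mmu_update *req, unsigned int count)
    {
        printf("flushing %u queued updates\n", count);
    }

    static void toy_flush_queue(void)
    {
        if (idx) {
            toy_apply_batch(queue, idx);
            idx = 0;
        }
    }

    /* Queue one update, flushing when the batch fills up. */
    static void toy_queue_update(unsigned long ptr, unsigned long val)
    {
        queue[idx].ptr = ptr;
        queue[idx].val = val;
        if (++idx == TOY_QUEUE_SIZE)
            toy_flush_queue();
    }

    int main(void)
    {
        for (unsigned long i = 0; i < 300; i++)
            toy_queue_update(0x1000 + i * 8, i);
        toy_flush_queue();  /* callers flush before updates must be visible */
        return 0;
    }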
*/ +#define QUEUE_SIZE 128 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#define QUEUE_SIZE 2048 #define pte_offset_kernel pte_offset -#define pmd_val_ma(v) (v).pmd; #define pud_t pgd_t #define pud_offset(d, va) d #else -#ifdef CONFIG_SMP -#define QUEUE_SIZE 1 -#else -#define QUEUE_SIZE 128 -#endif #define pmd_val_ma(v) (v).pud.pgd.pgd; #endif diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c index 6fe3f08632..2682ac5b90 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c @@ -195,7 +195,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); if (pte) { make_page_readonly(pte); - xen_flush_page_update_queue(); + flush_page_update_queue(); } return pte; } diff --git a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c b/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c index f69db851a4..36c934fc5d 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c +++ b/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c @@ -109,10 +109,8 @@ static void __do_suspend(void) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); -#ifdef CONFIG_XEN_WRITABLE_PAGETABLES HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); -#endif shutting_down = -1; diff --git a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h index eb084e8f06..a55be6c23e 100644 --- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h +++ b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h @@ -30,6 +30,8 @@ /* -------[ debug / pretty printing ]--------------------------------- */ +#define PRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) #if 0 #define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ __FILE__ , __LINE__ , ## _a ) diff --git a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c index b503b1ec13..93594623b0 100644 --- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c +++ b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c @@ -299,7 +299,7 @@ int blktap_write_fe_ring(blkif_request_t *req) } if ( RING_FULL(&blktap_ufe_ring) ) { - DPRINTK("blktap: fe_ring is full, can't add.\n"); + PRINTK("blktap: fe_ring is full, can't add.\n"); return 0; } @@ -383,10 +383,9 @@ static int blktap_read_fe_ring(void) zap_page_range(blktap_vma, MMAP_VADDR(ID_TO_IDX(resp_s->id), 0), ar->nr_pages << PAGE_SHIFT, NULL); write_resp_to_fe_ring(blkif, resp_s); + blktap_ufe_ring.rsp_cons = i + 1; kick_fe_domain(blkif); } - - blktap_ufe_ring.rsp_cons = i; } return 0; } diff --git a/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c b/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c index c2cdbf9e72..219b218920 100644 --- a/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c +++ b/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c @@ -88,6 +88,8 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, { int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)? 
PRIVCMD_MMAP_SZ:(mmapcmd.num-i); + + if ( copy_from_user(&msg, p, n*sizeof(privcmd_mmap_entry_t)) ) return -EFAULT; @@ -96,6 +98,7 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, struct vm_area_struct *vma = find_vma( current->mm, msg[j].va ); + if ( !vma ) return -EINVAL; @@ -151,6 +154,7 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, addr = m.addr; for ( i = 0; i < m.num; i++, addr += PAGE_SIZE, p++ ) { + if ( get_user(mfn, p) ) return -EFAULT; @@ -166,10 +170,12 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, v = w; } + ret = 0; break; batch_err: + printk(KERN_ALERT "XXX SMH: ERROR IN MMAPBATCH\n"); printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%p %lx-%lx\n", ret, vma, m.addr, m.num, m.arr, vma->vm_start, vma->vm_end); break; @@ -183,7 +189,7 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, pgd_t *pgd = pgd_offset_k(m2pv); pud_t *pud = pud_offset(pgd, m2pv); pmd_t *pmd = pmd_offset(pud, m2pv); - unsigned long m2p_start_mfn = pfn_to_mfn(pmd_val(*pmd) >> PAGE_SHIFT); + unsigned long m2p_start_mfn = (*(unsigned long *)pmd) >> PAGE_SHIFT; ret = put_user(m2p_start_mfn, (unsigned long *)data) ? -EFAULT: 0; } break; diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h index 345b8264b8..1379b49694 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h @@ -111,7 +111,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; static inline unsigned long pgd_val(pgd_t x) { unsigned long ret = x.pgd; - if (ret) ret = machine_to_phys(ret); + if (ret) ret = machine_to_phys(ret) | 1; return ret; } #define pgprot_val(x) ((x).pgprot) diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h index d932c6c17f..dfc5b1e155 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h @@ -407,7 +407,6 @@ extern void noexec_setup(const char *str); do { \ if (__dirty) { \ if ( likely((__vma)->vm_mm == current->mm) ) { \ - xen_flush_page_update_queue(); \ HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \ } else { \ xen_l1_entry_update((__ptep), (__entry).pte_low); \ @@ -426,7 +425,6 @@ do { \ #define ptep_establish_new(__vma, __address, __ptep, __entry) \ do { \ if (likely((__vma)->vm_mm == current->mm)) { \ - xen_flush_page_update_queue(); \ HYPERVISOR_update_va_mapping((__address), \ __entry, 0); \ } else { \ diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h b/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h index 4d77312f6e..568e84bc2f 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h +++ b/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h @@ -117,8 +117,6 @@ void _flush_page_update_queue(void); if (per_cpu(mmu_update_queue_idx, smp_processor_id())) \ _flush_page_update_queue(); \ } while (0) -#define xen_flush_page_update_queue() (_flush_page_update_queue()) -#define XEN_flush_page_update_queue() (_flush_page_update_queue()) void MULTICALL_flush_page_update_queue(void); #ifdef CONFIG_XEN_PHYSDEV_ACCESS diff --git a/tools/blktap/Makefile b/tools/blktap/Makefile index 7f71a219bf..3478552ac4 100644 --- a/tools/blktap/Makefile +++ b/tools/blktap/Makefile @@ -58,7 +58,7 @@ OBJS = $(patsubst %.c,%.o,$(SRCS)) LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR) 
-all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(VDI_TOOLS) parallax +all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(VDI_TOOLS) parallax parallax-threaded blockstored $(MAKE) $(LIB) LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse) @@ -120,42 +120,42 @@ blkaio: $(LIB) blkaio.c blkaiolib.c $(CC) $(CFLAGS) -o blkaio -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap blkaio.c blkaiolib.c -laio -lpthread parallax: $(LIB) $(PLX_SRCS) - $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap $(PLX_SRCS) libgnbd/libgnbd.a + $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap -lpthread $(PLX_SRCS) libgnbd/libgnbd.a parallax-threaded: $(LIB) $(PLXT_SRCS) $(CC) $(CFLAGS) -o parallax-threaded -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lpthread -lblktap $(PLXT_SRCS) libgnbd/libgnbd.a vdi_test: $(LIB) $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE -lpthread $(VDI_SRCS) vdi_list: $(LIB) vdi_list.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c -lpthread $(VDI_SRCS) vdi_create: $(LIB) vdi_create.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c -lpthread $(VDI_SRCS) vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c -lpthread $(VDI_SRCS) vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c -lpthread $(VDI_SRCS) vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c -lpthread $(VDI_SRCS) vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c -lpthread $(VDI_SRCS) vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c -lpthread $(VDI_SRCS) vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c -lpthread $(VDI_SRCS) blockstored: blockstored.c - $(CC) $(CFLAGS) -g3 -o blockstored blockstored.c + $(CC) $(CFLAGS) -g3 -o blockstored -lpthread blockstored.c bstest: bstest.c blockstore.c - $(CC) $(CFLAGS) -g3 -o bstest bstest.c blockstore.c + $(CC) $(CFLAGS) -g3 -o bstest bstest.c -lpthread blockstore.c .PHONY: TAGS clean install mk-symlinks rpm TAGS: diff --git a/tools/blktap/blktaplib.c b/tools/blktap/blktaplib.c index 35b893f677..87b680d2cc 100644 --- a/tools/blktap/blktaplib.c +++ b/tools/blktap/blktaplib.c @@ -248,12 +248,21 @@ static void apply_rsp_hooks(blkif_response_t *rsp) } } +static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER; + void blktap_inject_response(blkif_response_t *rsp) { + apply_rsp_hooks(rsp); + write_rsp_to_fe_ring(rsp); + + pthread_mutex_lock(&push_mutex); + RING_PUSH_RESPONSES(&fe_ring); ioctl(fd, BLKTAP_IOCTL_KICK_FE); + + pthread_mutex_unlock(&push_mutex); } /*-----[ Polling fd listeners ]------------------------------------------*/ @@ -449,7 +458,9 @@ int blktap_listen(void) } /* Using this as a unidirectional ring. 
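The push_mutex added to blktap_inject_response() above serializes the push-and-kick sequence: the producer-index update and the ioctl that wakes the peer must not interleave between threads. A reduced sketch of that pattern, with a bare counter standing in for the shared ring and a printf for the kick:

    #include <stdio.h>
    #include <pthread.h>

    static unsigned int rsp_prod;
    static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Several threads publish responses; the index update plus the
     * notification must be atomic with respect to each other. */
    static void toy_push_response(void)
    {
        pthread_mutex_lock(&push_mutex);
        rsp_prod++;                                /* RING_PUSH_RESPONSES() */
        printf("kick peer, prod=%u\n", rsp_prod);  /* ioctl(..._KICK_FE)   */
        pthread_mutex_unlock(&push_mutex);
    }

    static void *worker(void *arg)
    {
        for (int i = 0; i < 3; i++)
            toy_push_response();
        return NULL;
    }

    int main(void)
    {
        pthread_t t[2];
        for (int i = 0; i < 2; i++)
            pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 2; i++)
            pthread_join(t[i], NULL);
        return 0;
    }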
*/ ctrl_ring.req_cons = ctrl_ring.rsp_prod_pvt = i; +pthread_mutex_lock(&push_mutex); RING_PUSH_RESPONSES(&ctrl_ring); +pthread_mutex_unlock(&push_mutex); /* empty the fe_ring */ notify_fe = 0; @@ -517,14 +528,18 @@ int blktap_listen(void) if (notify_be) { DPRINTF("notifying be\n"); +pthread_mutex_lock(&push_mutex); RING_PUSH_REQUESTS(&be_ring); ioctl(fd, BLKTAP_IOCTL_KICK_BE); +pthread_mutex_unlock(&push_mutex); } if (notify_fe) { DPRINTF("notifying fe\n"); +pthread_mutex_lock(&push_mutex); RING_PUSH_RESPONSES(&fe_ring); ioctl(fd, BLKTAP_IOCTL_KICK_FE); +pthread_mutex_unlock(&push_mutex); } } } diff --git a/tools/blktap/blockstore.c b/tools/blktap/blockstore.c index 5de2a6885a..36903fe09e 100644 --- a/tools/blktap/blockstore.c +++ b/tools/blktap/blockstore.c @@ -13,13 +13,16 @@ #include <string.h> #include <sys/types.h> #include <sys/stat.h> +#include <sys/time.h> #include <stdarg.h> #include "blockstore.h" #include <pthread.h> #include "parallax-threaded.h" #define BLOCKSTORE_REMOTE -#define BSDEBUG +//#define BSDEBUG + +#define RETRY_TIMEOUT 1000000 /* microseconds */ /***************************************************************************** * Debugging @@ -63,6 +66,37 @@ struct sockaddr_in sin_local; int bssock = 0; /***************************************************************************** + * Notification * + *****************************************************************************/ + +typedef struct pool_thread_t_struct { + pthread_mutex_t ptmutex; + pthread_cond_t ptcv; + int newdata; +} pool_thread_t; + +pool_thread_t pool_thread[READ_POOL_SIZE+1]; + +#define RECV_NOTIFY(tid) { \ + pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \ + pool_thread[tid].newdata = 1; \ + DB("CV Waking %u", tid); \ + pthread_cond_signal(&(pool_thread[tid].ptcv)); \ + pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); } +#define RECV_AWAIT(tid) { \ + pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \ + if (pool_thread[tid].newdata) { \ + pool_thread[tid].newdata = 0; \ + DB("CV Woken %u", tid); \ + } \ + else { \ + DB("CV Waiting %u", tid); \ + pthread_cond_wait(&(pool_thread[tid].ptcv), \ + &(pool_thread[tid].ptmutex)); \ + } \ + pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); } + +/***************************************************************************** * Message queue management * *****************************************************************************/ @@ -76,23 +110,6 @@ pthread_mutex_t ptmutex_recv; #define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv) #define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv) -int notify = 0; -pthread_mutex_t ptmutex_notify; -pthread_cond_t ptcv_notify; -#define RECV_NOTIFY { \ - pthread_mutex_lock(&ptmutex_notify); \ - notify = 1; \ - pthread_cond_signal(&ptcv_notify); \ - pthread_mutex_unlock(&ptmutex_notify); } -#define RECV_AWAIT { \ - pthread_mutex_lock(&ptmutex_notify); \ - if (notify) \ - notify = 0; \ - else \ - pthread_cond_wait(&ptcv_notify, &ptmutex_notify); \ - pthread_mutex_unlock(&ptmutex_notify); } - - /* A message queue entry. We allocate one of these for every request we send. * Asynchronous reply reception also used one of these. 
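The RECV_NOTIFY()/RECV_AWAIT() macros above give each pool thread its own mutex, condition variable, and newdata flag; the flag absorbs the race where a reply arrives before its thread sleeps, which would otherwise lose the wakeup. A self-contained sketch of the same handshake (this version re-checks the flag in a while loop, which also guards against spurious wakeups, where the macros use a single if):

    #include <stdio.h>
    #include <pthread.h>

    /* One notification slot per pool thread, as in the RECV_* macros. */
    struct toy_slot {
        pthread_mutex_t mutex;
        pthread_cond_t  cv;
        int newdata;
    };

    static struct toy_slot slot = { PTHREAD_MUTEX_INITIALIZER,
                                    PTHREAD_COND_INITIALIZER, 0 };

    static void toy_notify(struct toy_slot *s)
    {
        pthread_mutex_lock(&s->mutex);
        s->newdata = 1;                /* remember the wakeup if nobody waits */
        pthread_cond_signal(&s->cv);
        pthread_mutex_unlock(&s->mutex);
    }

    static void toy_await(struct toy_slot *s)
    {
        pthread_mutex_lock(&s->mutex);
        while (!s->newdata)
            pthread_cond_wait(&s->cv, &s->mutex);
        s->newdata = 0;
        pthread_mutex_unlock(&s->mutex);
    }

    static void *waiter(void *arg)
    {
        toy_await(&slot);
        printf("woken\n");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, waiter, NULL);
        toy_notify(&slot);
        pthread_join(t, NULL);
        return 0;
    }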
*/ @@ -104,6 +121,8 @@ typedef struct bsq_t_struct { int length; struct msghdr msghdr; struct iovec iov[2]; + int tid; + struct timeval tv_sent; bshdr_t message; void *block; } bsq_t; @@ -267,11 +286,13 @@ int send_message(bsq_t *qe) { qe->message.luid = new_luid(); qe->status = 0; + qe->tid = (int)pthread_getspecific(tid_key); if (enqueue(qe) < 0) { fprintf(stderr, "Error enqueuing request.\n"); return -1; } + gettimeofday(&(qe->tv_sent), NULL); DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid); rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT); //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0, @@ -407,6 +428,7 @@ void recv_recycle_buffer(bsq_t *q) { int wait_recv(bsq_t **reqs, int numreqs) { bsq_t *q, *m; unsigned int x, i; + int tid = (int)pthread_getspecific(tid_key); DB("ENTER wait_recv %u\n", numreqs); @@ -420,7 +442,7 @@ int wait_recv(bsq_t **reqs, int numreqs) { return numreqs; } - RECV_AWAIT; + RECV_AWAIT(tid); /* rxagain: @@ -442,6 +464,52 @@ int wait_recv(bsq_t **reqs, int numreqs) { } +/* retry + */ +static int retry_count = 0; +int retry(bsq_t *qe) +{ + int rc; + gettimeofday(&(qe->tv_sent), NULL); + DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid); + retry_count++; + rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT); + if (rc < 0) + return rc; + return 0; +} + +/* queue runner + */ +void *queue_runner(void *arg) +{ + for (;;) { + struct timeval now; + long long nowus, sus; + bsq_t *q; + int r; + + sleep(1); + + gettimeofday(&now, NULL); + nowus = now.tv_usec + now.tv_sec * 1000000; + ENTER_QUEUE_CR; + r = retry_count; + for (q = bs_head; q; q = q->next) { + sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000; + if ((nowus - sus) > RETRY_TIMEOUT) { + if (retry(q) < 0) { + fprintf(stderr, "Error on sendmsg retry.\n"); + } + } + } + if (r != retry_count) { + fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count); + } + LEAVE_QUEUE_CR; + } +} + /* receive loop */ void *receive_loop(void *arg) @@ -461,7 +529,7 @@ void *receive_loop(void *arg) } else { DB("RX MATCH"); - RECV_NOTIFY; + RECV_NOTIFY(m->tid); } } } @@ -1146,8 +1214,12 @@ int __init_blockstore(void) pthread_mutex_init(&ptmutex_queue, NULL); pthread_mutex_init(&ptmutex_luid, NULL); pthread_mutex_init(&ptmutex_recv, NULL); - pthread_mutex_init(&ptmutex_notify, NULL); - pthread_cond_init(&ptcv_notify, NULL); + /*pthread_mutex_init(&ptmutex_notify, NULL);*/ + for (i = 0; i <= READ_POOL_SIZE; i++) { + pool_thread[i].newdata = 0; + pthread_mutex_init(&(pool_thread[i].ptmutex), NULL); + pthread_cond_init(&(pool_thread[i].ptcv), NULL); + } bsservers[0].hostname = "firebug.cl.cam.ac.uk"; bsservers[1].hostname = "planb.cl.cam.ac.uk"; @@ -1225,6 +1297,7 @@ int __init_blockstore(void) } pthread_create(&pthread_recv, NULL, receive_loop, NULL); + pthread_create(&pthread_recv, NULL, queue_runner, NULL); #else /* /BLOCKSTORE_REMOTE */ block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); @@ -1262,9 +1335,14 @@ int __init_blockstore(void) void __exit_blockstore(void) { + int i; pthread_mutex_destroy(&ptmutex_recv); pthread_mutex_destroy(&ptmutex_luid); pthread_mutex_destroy(&ptmutex_queue); - pthread_mutex_destroy(&ptmutex_notify); - pthread_cond_destroy(&ptcv_notify); + /*pthread_mutex_destroy(&ptmutex_notify); + pthread_cond_destroy(&ptcv_notify);*/ + for (i = 0; i <= READ_POOL_SIZE; i++) { + pthread_mutex_destroy(&(pool_thread[i].ptmutex)); + pthread_cond_destroy(&(pool_thread[i].ptcv)); + } } diff --git a/tools/blktap/parallax-threaded.h 
b/tools/blktap/parallax-threaded.h index 17cdcb983e..de39609fcc 100644 --- a/tools/blktap/parallax-threaded.h +++ b/tools/blktap/parallax-threaded.h @@ -14,7 +14,8 @@ #define NOTHREADS #endif -#define READ_POOL_SIZE 128 +//#define READ_POOL_SIZE 128 +#define READ_POOL_SIZE 8 /* per-thread identifier */ pthread_key_t tid_key;
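The tid_key declared here is the usual pthreads pattern for a per-thread small-integer id: each pool thread stores its index with pthread_setspecific(), and deep callees such as send_message() and wait_recv() recover it with pthread_getspecific() instead of threading it through every call. A minimal sketch, assuming nothing beyond POSIX threads:

    #include <stdio.h>
    #include <pthread.h>

    static pthread_key_t tid_key;

    static void *pool_worker(void *arg)
    {
        /* Stash this thread's id in thread-specific data. */
        pthread_setspecific(tid_key, arg);
        /* ...much later, in some callee: */
        printf("I am pool thread %d\n",
               (int)(long)pthread_getspecific(tid_key));
        return NULL;
    }

    int main(void)
    {
        pthread_t t[3];
        pthread_key_create(&tid_key, NULL);
        for (long i = 0; i < 3; i++)
            pthread_create(&t[i], NULL, pool_worker, (void *)i);
        for (int i = 0; i < 3; i++)
            pthread_join(t[i], NULL);
        return 0;
    }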