Diffstat:
-rw-r--r--  .rootkeys                                                   |    3
-rw-r--r--  BitKeeper/etc/logging_ok                                    |    1
-rw-r--r--  linux-2.4.29-xen-sparse/arch/xen/kernel/head.S              |    5
-rw-r--r--  linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c               |    1
-rw-r--r--  linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c             |    1
-rw-r--r--  linux-2.4.29-xen-sparse/arch/xen/mm/fault.c                 |    1
-rw-r--r--  linux-2.4.29-xen-sparse/arch/xen/mm/init.c                  |   13
-rw-r--r--  linux-2.4.29-xen-sparse/fs/exec.c                           | 1179
-rw-r--r--  linux-2.4.29-xen-sparse/include/asm-xen/page.h              |   17
-rw-r--r--  linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h           |   19
-rw-r--r--  linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h    |   34
-rw-r--r--  linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h           |   45
-rw-r--r--  linux-2.4.29-xen-sparse/mm/highmem.c                        |    1
-rw-r--r--  linux-2.4.29-xen-sparse/mm/memory.c                         |   28
-rw-r--r--  linux-2.4.29-xen-sparse/mm/mremap.c                         |    2
-rw-r--r--  linux-2.4.29-xen-sparse/mm/swapfile.c                       | 1267
-rw-r--r--  linux-2.4.29-xen-sparse/mm/vmalloc.c                        |  385
-rw-r--r--  linux-2.6.11-xen-sparse/arch/xen/Kconfig                    |    4
-rw-r--r--  linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig     |    1
-rw-r--r--  linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig     |    1
-rw-r--r--  linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c        |    2
-rw-r--r--  linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c            |    1
-rw-r--r--  linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c       |    9
-rw-r--r--  linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c          |    2
-rw-r--r--  linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c            |    2
-rw-r--r--  linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h         |    2
-rw-r--r--  linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c |    5
-rw-r--r--  linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c       |    8
-rw-r--r--  linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h     |    2
-rw-r--r--  linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h  |    2
-rw-r--r--  linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h        |    2
-rw-r--r--  tools/blktap/Makefile                                       |   26
-rw-r--r--  tools/blktap/blktaplib.c                                    |   15
-rw-r--r--  tools/blktap/blockstore.c                                   |  126
-rw-r--r--  tools/blktap/parallax-threaded.h                            |    3
35 files changed, 217 insertions(+), 2998 deletions(-)
diff --git a/.rootkeys b/.rootkeys
index 1213b4c359..c93eea13ac 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -167,7 +167,6 @@
3e5a4e66rw65CxyolW9PKz4GG42RcA linux-2.4.29-xen-sparse/drivers/char/tty_io.c
40c9c0c1pPwYE3-4i-oI3ubUu7UgvQ linux-2.4.29-xen-sparse/drivers/scsi/aic7xxx/Makefile
41f97f64nW0wmgLxhwzPTzkF4E5ERA linux-2.4.29-xen-sparse/drivers/usb/hcd.c
-3e5a4e669uzIE54VwucPYtGwXLAbzA linux-2.4.29-xen-sparse/fs/exec.c
3e5a4e66wbeCpsJgVf_U8Jde-CNcsA linux-2.4.29-xen-sparse/include/asm-xen/bugs.h
3e5a4e66HdSkvIV6SJ1evG_xmTmXHA linux-2.4.29-xen-sparse/include/asm-xen/desc.h
3e5a4e66SYp_UpAVcF8Lc1wa3Qtgzw linux-2.4.29-xen-sparse/include/asm-xen/fixmap.h
@@ -205,8 +204,6 @@
3f108af5VxPkLv13tXpXgoRKALQtXQ linux-2.4.29-xen-sparse/mm/mprotect.c
3e5a4e681xMPdF9xCMwpyfuYMySU5g linux-2.4.29-xen-sparse/mm/mremap.c
409ba2e7akOFqQUg6Qyg2s28xcXiMg linux-2.4.29-xen-sparse/mm/page_alloc.c
-3e5a4e683HKVU-sxtagrDasRB8eBVw linux-2.4.29-xen-sparse/mm/swapfile.c
-41180721bNns9Na7w1nJ0ZVt8bhUNA linux-2.4.29-xen-sparse/mm/vmalloc.c
41505c57WAd5l1rlfCLNSCpx9J13vA linux-2.4.29-xen-sparse/net/core/skbuff.c
40f562372u3A7_kfbYYixPHJJxYUxA linux-2.6.11-xen-sparse/arch/xen/Kconfig
40f56237utH41NPukqHksuNf29IC9A linux-2.6.11-xen-sparse/arch/xen/Kconfig.drivers
diff --git a/BitKeeper/etc/logging_ok b/BitKeeper/etc/logging_ok
index 442fa4c826..a5cd08799d 100644
--- a/BitKeeper/etc/logging_ok
+++ b/BitKeeper/etc/logging_ok
@@ -74,6 +74,7 @@ rneugeba@wyvis.research.intel-research.net
sd386@font.cl.cam.ac.uk
shand@spidean.research.intel-research.net
smh22@boulderdash.cl.cam.ac.uk
+smh22@firebug.cl.cam.ac.uk
smh22@labyrinth.cl.cam.ac.uk
smh22@tempest.cl.cam.ac.uk
smh22@uridium.cl.cam.ac.uk
diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S b/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S
index cda41ae56c..c856a0bd29 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S
+++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S
@@ -1,6 +1,9 @@
.section __xen_guest
- .asciz "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
+ .ascii "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
+ .ascii ",LOADER=generic"
+ .ascii ",PT_MODE_WRITABLE"
+ .byte 0
.text
#include <linux/config.h>
diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c
index 374c9b6c30..61fc1eb824 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c
+++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c
@@ -84,6 +84,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
}
memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
make_pages_readonly(new->ldt, (new->size*LDT_ENTRY_SIZE)/PAGE_SIZE);
+ flush_page_update_queue();
return 0;
}
diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c
index ada06dd973..f593714e02 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c
+++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c
@@ -623,6 +623,7 @@ void __init trap_init(void)
set_call_gate(&default_ldt[0],lcall7);
set_call_gate(&default_ldt[4],lcall27);
__make_page_readonly(&default_ldt[0]);
+ flush_page_update_queue();
cpu_init();
}
diff --git a/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c b/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c
index d19218fe32..49a0afc887 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c
+++ b/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c
@@ -296,7 +296,6 @@ vmalloc_fault:
if (!pmd_present(*pmd_k))
goto no_context;
set_pmd(pmd, *pmd_k);
- XEN_flush_page_update_queue(); /* flush PMD update */
pte_k = pte_offset(pmd_k, address);
if (!pte_present(*pte_k))
diff --git a/linux-2.4.29-xen-sparse/arch/xen/mm/init.c b/linux-2.4.29-xen-sparse/arch/xen/mm/init.c
index 40a5af9273..88d775bcd4 100644
--- a/linux-2.4.29-xen-sparse/arch/xen/mm/init.c
+++ b/linux-2.4.29-xen-sparse/arch/xen/mm/init.c
@@ -142,7 +142,7 @@ static inline void set_pte_phys (unsigned long vaddr,
}
pte = pte_offset(pmd, vaddr);
- queue_l1_entry_update(pte, phys | pgprot_val(prot));
+ set_pte(pte, (pte_t) { phys | pgprot_val(prot) });
/*
* It's enough to flush this one mapping.
@@ -201,17 +201,13 @@ static void __init fixrange_init (unsigned long start,
kpgd = pgd_offset_k((unsigned long)pte);
kpmd = pmd_offset(kpgd, (unsigned long)pte);
kpte = pte_offset(kpmd, (unsigned long)pte);
- queue_l1_entry_update(kpte,
- (*(unsigned long *)kpte)&~_PAGE_RW);
-
+ set_pte(kpte, pte_wrprotect(*kpte));
set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
}
vaddr += PMD_SIZE;
}
j = 0;
}
-
- XEN_flush_page_update_queue();
}
@@ -257,10 +253,8 @@ static void __init pagetable_init (void)
kpgd = pgd_offset_k((unsigned long)pte_base);
kpmd = pmd_offset(kpgd, (unsigned long)pte_base);
kpte = pte_offset(kpmd, (unsigned long)pte_base);
- queue_l1_entry_update(kpte,
- (*(unsigned long *)kpte)&~_PAGE_RW);
+ set_pte(kpte, pte_wrprotect(*kpte));
set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
- XEN_flush_page_update_queue();
}
}
@@ -311,6 +305,7 @@ void __init paging_init(void)
pagetable_init();
zone_sizes_init();
+
/* Switch to the real shared_info page, and clear the dummy page. */
set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
diff --git a/linux-2.4.29-xen-sparse/fs/exec.c b/linux-2.4.29-xen-sparse/fs/exec.c
deleted file mode 100644
index 8a114151a9..0000000000
--- a/linux-2.4.29-xen-sparse/fs/exec.c
+++ /dev/null
@@ -1,1179 +0,0 @@
-/*
- * linux/fs/exec.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/*
- * #!-checking implemented by tytso.
- */
-/*
- * Demand-loading implemented 01.12.91 - no need to read anything but
- * the header into memory. The inode of the executable is put into
- * "current->executable", and page faults do the actual loading. Clean.
- *
- * Once more I can proudly say that linux stood up to being changed: it
- * was less than 2 hours work to get demand-loading completely implemented.
- *
- * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
- * current->executable is only used by the procfs. This allows a dispatch
- * table to check for several different types of binary formats. We keep
- * trying until we recognize the file or we run out of supported binary
- * formats.
- */
-
-#include <linux/config.h>
-#include <linux/slab.h>
-#include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/smp_lock.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/spinlock.h>
-#include <linux/personality.h>
-#include <linux/swap.h>
-#include <linux/utsname.h>
-#define __NO_VERSION__
-#include <linux/module.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
-#include <asm/mmu_context.h>
-
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
-
-int core_uses_pid;
-char core_pattern[65] = "core";
-int core_setuid_ok = 0;
-/* The maximal length of core_pattern is also specified in sysctl.c */
-
-static struct linux_binfmt *formats;
-static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;
-
-int register_binfmt(struct linux_binfmt * fmt)
-{
- struct linux_binfmt ** tmp = &formats;
-
- if (!fmt)
- return -EINVAL;
- if (fmt->next)
- return -EBUSY;
- write_lock(&binfmt_lock);
- while (*tmp) {
- if (fmt == *tmp) {
- write_unlock(&binfmt_lock);
- return -EBUSY;
- }
- tmp = &(*tmp)->next;
- }
- fmt->next = formats;
- formats = fmt;
- write_unlock(&binfmt_lock);
- return 0;
-}
-
-int unregister_binfmt(struct linux_binfmt * fmt)
-{
- struct linux_binfmt ** tmp = &formats;
-
- write_lock(&binfmt_lock);
- while (*tmp) {
- if (fmt == *tmp) {
- *tmp = fmt->next;
- write_unlock(&binfmt_lock);
- return 0;
- }
- tmp = &(*tmp)->next;
- }
- write_unlock(&binfmt_lock);
- return -EINVAL;
-}
-
-static inline void put_binfmt(struct linux_binfmt * fmt)
-{
- if (fmt->module)
- __MOD_DEC_USE_COUNT(fmt->module);
-}
-
-/*
- * Note that a shared library must be both readable and executable due to
- * security reasons.
- *
- * Also note that we take the address to load from from the file itself.
- */
-asmlinkage long sys_uselib(const char * library)
-{
- struct file * file;
- struct nameidata nd;
- int error;
-
- error = user_path_walk(library, &nd);
- if (error)
- goto out;
-
- error = -EINVAL;
- if (!S_ISREG(nd.dentry->d_inode->i_mode))
- goto exit;
-
- error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC);
- if (error)
- goto exit;
-
- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
- error = PTR_ERR(file);
- if (IS_ERR(file))
- goto out;
-
- error = -ENOEXEC;
- if(file->f_op && file->f_op->read) {
- struct linux_binfmt * fmt;
-
- read_lock(&binfmt_lock);
- for (fmt = formats ; fmt ; fmt = fmt->next) {
- if (!fmt->load_shlib)
- continue;
- if (!try_inc_mod_count(fmt->module))
- continue;
- read_unlock(&binfmt_lock);
- error = fmt->load_shlib(file);
- read_lock(&binfmt_lock);
- put_binfmt(fmt);
- if (error != -ENOEXEC)
- break;
- }
- read_unlock(&binfmt_lock);
- }
- fput(file);
-out:
- return error;
-exit:
- path_release(&nd);
- goto out;
-}
-
-/*
- * count() counts the number of arguments/envelopes
- */
-static int count(char ** argv, int max)
-{
- int i = 0;
-
- if (argv != NULL) {
- for (;;) {
- char * p;
-
- if (get_user(p, argv))
- return -EFAULT;
- if (!p)
- break;
- argv++;
- if(++i > max)
- return -E2BIG;
- }
- }
- return i;
-}
-
-/*
- * 'copy_strings()' copies argument/envelope strings from user
- * memory to free pages in kernel mem. These are in a format ready
- * to be put directly into the top of new user memory.
- */
-int copy_strings(int argc,char ** argv, struct linux_binprm *bprm)
-{
- struct page *kmapped_page = NULL;
- char *kaddr = NULL;
- int ret;
-
- while (argc-- > 0) {
- char *str;
- int len;
- unsigned long pos;
-
- if (get_user(str, argv+argc) ||
- !(len = strnlen_user(str, bprm->p))) {
- ret = -EFAULT;
- goto out;
- }
-
- if (bprm->p < len) {
- ret = -E2BIG;
- goto out;
- }
-
- bprm->p -= len;
- /* XXX: add architecture specific overflow check here. */
- pos = bprm->p;
-
- while (len > 0) {
- int i, new, err;
- int offset, bytes_to_copy;
- struct page *page;
-
- offset = pos % PAGE_SIZE;
- i = pos/PAGE_SIZE;
- page = bprm->page[i];
- new = 0;
- if (!page) {
- page = alloc_page(GFP_HIGHUSER);
- bprm->page[i] = page;
- if (!page) {
- ret = -ENOMEM;
- goto out;
- }
- new = 1;
- }
-
- if (page != kmapped_page) {
- if (kmapped_page)
- kunmap(kmapped_page);
- kmapped_page = page;
- kaddr = kmap(kmapped_page);
- }
- if (new && offset)
- memset(kaddr, 0, offset);
- bytes_to_copy = PAGE_SIZE - offset;
- if (bytes_to_copy > len) {
- bytes_to_copy = len;
- if (new)
- memset(kaddr+offset+len, 0,
- PAGE_SIZE-offset-len);
- }
- err = copy_from_user(kaddr+offset, str, bytes_to_copy);
- if (err) {
- ret = -EFAULT;
- goto out;
- }
-
- pos += bytes_to_copy;
- str += bytes_to_copy;
- len -= bytes_to_copy;
- }
- }
- ret = 0;
-out:
- if (kmapped_page)
- kunmap(kmapped_page);
- return ret;
-}
-
-/*
- * Like copy_strings, but get argv and its values from kernel memory.
- */
-int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
-{
- int r;
- mm_segment_t oldfs = get_fs();
- set_fs(KERNEL_DS);
- r = copy_strings(argc, argv, bprm);
- set_fs(oldfs);
- return r;
-}
-
-/*
- * This routine is used to map in a page into an address space: needed by
- * execve() for the initial stack and environment pages.
- *
- * tsk->mmap_sem is held for writing.
- */
-void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
-{
- pgd_t * pgd;
- pmd_t * pmd;
- pte_t * pte;
- struct vm_area_struct *vma;
- pgprot_t prot = PAGE_COPY;
-
- if (page_count(page) != 1)
- printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address);
- pgd = pgd_offset(tsk->mm, address);
-
- spin_lock(&tsk->mm->page_table_lock);
- pmd = pmd_alloc(tsk->mm, pgd, address);
- if (!pmd)
- goto out;
- pte = pte_alloc(tsk->mm, pmd, address);
- if (!pte)
- goto out;
- if (!pte_none(*pte))
- goto out;
- lru_cache_add(page);
- flush_dcache_page(page);
- flush_page_to_ram(page);
- /* lookup is cheap because there is only a single entry in the list */
- vma = find_vma(tsk->mm, address);
- if (vma)
- prot = vma->vm_page_prot;
- set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
- XEN_flush_page_update_queue();
- tsk->mm->rss++;
- spin_unlock(&tsk->mm->page_table_lock);
-
- /* no need for flush_tlb */
- return;
-out:
- spin_unlock(&tsk->mm->page_table_lock);
- __free_page(page);
- force_sig(SIGKILL, tsk);
- return;
-}
-
-int setup_arg_pages(struct linux_binprm *bprm)
-{
- unsigned long stack_base;
- struct vm_area_struct *mpnt;
- int i, ret;
-
- stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
-
- bprm->p += stack_base;
- if (bprm->loader)
- bprm->loader += stack_base;
- bprm->exec += stack_base;
-
- mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!mpnt)
- return -ENOMEM;
-
- down_write(&current->mm->mmap_sem);
- {
- mpnt->vm_mm = current->mm;
- mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
- mpnt->vm_end = STACK_TOP;
- mpnt->vm_flags = VM_STACK_FLAGS;
- mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7];
- mpnt->vm_ops = NULL;
- mpnt->vm_pgoff = 0;
- mpnt->vm_file = NULL;
- mpnt->vm_private_data = (void *) 0;
- if ((ret = insert_vm_struct(current->mm, mpnt))) {
- up_write(&current->mm->mmap_sem);
- kmem_cache_free(vm_area_cachep, mpnt);
- return ret;
- }
- current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
- }
-
- for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
- struct page *page = bprm->page[i];
- if (page) {
- bprm->page[i] = NULL;
- put_dirty_page(current,page,stack_base);
- }
- stack_base += PAGE_SIZE;
- }
- up_write(&current->mm->mmap_sem);
-
- return 0;
-}
-
-struct file *open_exec(const char *name)
-{
- struct nameidata nd;
- struct inode *inode;
- struct file *file;
- int err = 0;
-
- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
- file = ERR_PTR(err);
- if (!err) {
- inode = nd.dentry->d_inode;
- file = ERR_PTR(-EACCES);
- if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
- S_ISREG(inode->i_mode)) {
- int err = permission(inode, MAY_EXEC);
- if (!err && !(inode->i_mode & 0111))
- err = -EACCES;
- file = ERR_PTR(err);
- if (!err) {
- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
- if (!IS_ERR(file)) {
- err = deny_write_access(file);
- if (err) {
- fput(file);
- file = ERR_PTR(err);
- }
- }
-out:
- return file;
- }
- }
- path_release(&nd);
- }
- goto out;
-}
-
-int kernel_read(struct file *file, unsigned long offset,
- char * addr, unsigned long count)
-{
- mm_segment_t old_fs;
- loff_t pos = offset;
- int result = -ENOSYS;
-
- if (!file->f_op->read)
- goto fail;
- old_fs = get_fs();
- set_fs(get_ds());
- result = file->f_op->read(file, addr, count, &pos);
- set_fs(old_fs);
-fail:
- return result;
-}
-
-static int exec_mmap(void)
-{
- struct mm_struct * mm, * old_mm;
-
- old_mm = current->mm;
-
- if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
- mm_release();
- down_write(&old_mm->mmap_sem);
- exit_mmap(old_mm);
- up_write(&old_mm->mmap_sem);
- return 0;
- }
-
-
- mm = mm_alloc();
- if (mm) {
- struct mm_struct *active_mm;
-
- if (init_new_context(current, mm)) {
- mmdrop(mm);
- return -ENOMEM;
- }
-
- /* Add it to the list of mm's */
- spin_lock(&mmlist_lock);
- list_add(&mm->mmlist, &init_mm.mmlist);
- mmlist_nr++;
- spin_unlock(&mmlist_lock);
-
- task_lock(current);
- active_mm = current->active_mm;
- current->mm = mm;
- current->active_mm = mm;
- task_unlock(current);
- activate_mm(active_mm, mm);
- mm_release();
- if (old_mm) {
- if (active_mm != old_mm) BUG();
- mmput(old_mm);
- return 0;
- }
- mmdrop(active_mm);
- return 0;
- }
- return -ENOMEM;
-}
-
-/*
- * This function makes sure the current process has its own signal table,
- * so that flush_signal_handlers can later reset the handlers without
- * disturbing other processes. (Other processes might share the signal
- * table via the CLONE_SIGNAL option to clone().)
- */
-
-static inline int make_private_signals(void)
-{
- struct signal_struct * newsig;
-
- if (atomic_read(&current->sig->count) <= 1)
- return 0;
- newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
- if (newsig == NULL)
- return -ENOMEM;
- spin_lock_init(&newsig->siglock);
- atomic_set(&newsig->count, 1);
- memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
- spin_lock_irq(&current->sigmask_lock);
- current->sig = newsig;
- spin_unlock_irq(&current->sigmask_lock);
- return 0;
-}
-
-/*
- * If make_private_signals() made a copy of the signal table, decrement the
- * refcount of the original table, and free it if necessary.
- * We don't do that in make_private_signals() so that we can back off
- * in flush_old_exec() if an error occurs after calling make_private_signals().
- */
-
-static inline void release_old_signals(struct signal_struct * oldsig)
-{
- if (current->sig == oldsig)
- return;
- if (atomic_dec_and_test(&oldsig->count))
- kmem_cache_free(sigact_cachep, oldsig);
-}
-
-/*
- * These functions flushes out all traces of the currently running executable
- * so that a new one can be started
- */
-
-static inline void flush_old_files(struct files_struct * files)
-{
- long j = -1;
-
- write_lock(&files->file_lock);
- for (;;) {
- unsigned long set, i;
-
- j++;
- i = j * __NFDBITS;
- if (i >= files->max_fds || i >= files->max_fdset)
- break;
- set = files->close_on_exec->fds_bits[j];
- if (!set)
- continue;
- files->close_on_exec->fds_bits[j] = 0;
- write_unlock(&files->file_lock);
- for ( ; set ; i++,set >>= 1) {
- if (set & 1) {
- sys_close(i);
- }
- }
- write_lock(&files->file_lock);
-
- }
- write_unlock(&files->file_lock);
-}
-
-/*
- * An execve() will automatically "de-thread" the process.
- * Note: we don't have to hold the tasklist_lock to test
- * whether we might need to do this. If we're not part of
- * a thread group, there is no way we can become one
- * dynamically. And if we are, we only need to protect the
- * unlink - even if we race with the last other thread exit,
- * at worst the list_del_init() might end up being a no-op.
- */
-static inline void de_thread(struct task_struct *tsk)
-{
- if (!list_empty(&tsk->thread_group)) {
- write_lock_irq(&tasklist_lock);
- list_del_init(&tsk->thread_group);
- write_unlock_irq(&tasklist_lock);
- }
-
- /* Minor oddity: this might stay the same. */
- tsk->tgid = tsk->pid;
-}
-
-void get_task_comm(char *buf, struct task_struct *tsk)
-{
- /* buf must be at least sizeof(tsk->comm) in size */
- task_lock(tsk);
- memcpy(buf, tsk->comm, sizeof(tsk->comm));
- task_unlock(tsk);
-}
-
-void set_task_comm(struct task_struct *tsk, char *buf)
-{
- task_lock(tsk);
- strncpy(tsk->comm, buf, sizeof(tsk->comm));
- tsk->comm[sizeof(tsk->comm)-1]='\0';
- task_unlock(tsk);
-}
-
-int flush_old_exec(struct linux_binprm * bprm)
-{
- char * name;
- int i, ch, retval;
- struct signal_struct * oldsig;
- struct files_struct * files;
- char tcomm[sizeof(current->comm)];
-
- /*
- * Make sure we have a private signal table
- */
- oldsig = current->sig;
- retval = make_private_signals();
- if (retval) goto flush_failed;
-
- /*
- * Make sure we have private file handles. Ask the
- * fork helper to do the work for us and the exit
- * helper to do the cleanup of the old one.
- */
-
- files = current->files; /* refcounted so safe to hold */
- retval = unshare_files();
- if(retval)
- goto flush_failed;
-
- /*
- * Release all of the old mmap stuff
- */
- retval = exec_mmap();
- if (retval) goto mmap_failed;
-
- /* This is the point of no return */
- steal_locks(files);
- put_files_struct(files);
- release_old_signals(oldsig);
-
- current->sas_ss_sp = current->sas_ss_size = 0;
-
- if (current->euid == current->uid && current->egid == current->gid) {
- current->mm->dumpable = 1;
- current->task_dumpable = 1;
- }
- name = bprm->filename;
- for (i=0; (ch = *(name++)) != '\0';) {
- if (ch == '/')
- i = 0;
- else
- if (i < (sizeof(tcomm) - 1))
- tcomm[i++] = ch;
- }
- tcomm[i] = '\0';
- set_task_comm(current, tcomm);
-
- flush_thread();
-
- de_thread(current);
-
- if (bprm->e_uid != current->euid || bprm->e_gid != current->egid ||
- permission(bprm->file->f_dentry->d_inode,MAY_READ))
- current->mm->dumpable = 0;
-
- /* An exec changes our domain. We are no longer part of the thread
- group */
-
- current->self_exec_id++;
-
- flush_signal_handlers(current);
- flush_old_files(current->files);
-
- return 0;
-
-mmap_failed:
- put_files_struct(current->files);
- current->files = files;
-flush_failed:
- spin_lock_irq(&current->sigmask_lock);
- if (current->sig != oldsig) {
- kmem_cache_free(sigact_cachep, current->sig);
- current->sig = oldsig;
- }
- spin_unlock_irq(&current->sigmask_lock);
- return retval;
-}
-
-/*
- * We mustn't allow tracing of suid binaries, unless
- * the tracer has the capability to trace anything..
- */
-static inline int must_not_trace_exec(struct task_struct * p)
-{
- return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP);
-}
-
-/*
- * Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
- */
-int prepare_binprm(struct linux_binprm *bprm)
-{
- int mode;
- struct inode * inode = bprm->file->f_dentry->d_inode;
-
- mode = inode->i_mode;
- /*
- * Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
- * vfs_permission lets a non-executable through
- */
- if (!(mode & 0111)) /* with at least _one_ execute bit set */
- return -EACCES;
- if (bprm->file->f_op == NULL)
- return -EACCES;
-
- bprm->e_uid = current->euid;
- bprm->e_gid = current->egid;
-
- if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
- /* Set-uid? */
- if (mode & S_ISUID)
- bprm->e_uid = inode->i_uid;
-
- /* Set-gid? */
- /*
- * If setgid is set but no group execute bit then this
- * is a candidate for mandatory locking, not a setgid
- * executable.
- */
- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
- bprm->e_gid = inode->i_gid;
- }
-
- /* We don't have VFS support for capabilities yet */
- cap_clear(bprm->cap_inheritable);
- cap_clear(bprm->cap_permitted);
- cap_clear(bprm->cap_effective);
-
- /* To support inheritance of root-permissions and suid-root
- * executables under compatibility mode, we raise all three
- * capability sets for the file.
- *
- * If only the real uid is 0, we only raise the inheritable
- * and permitted sets of the executable file.
- */
-
- if (!issecure(SECURE_NOROOT)) {
- if (bprm->e_uid == 0 || current->uid == 0) {
- cap_set_full(bprm->cap_inheritable);
- cap_set_full(bprm->cap_permitted);
- }
- if (bprm->e_uid == 0)
- cap_set_full(bprm->cap_effective);
- }
-
- memset(bprm->buf,0,BINPRM_BUF_SIZE);
- return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
-}
-
-/*
- * This function is used to produce the new IDs and capabilities
- * from the old ones and the file's capabilities.
- *
- * The formula used for evolving capabilities is:
- *
- * pI' = pI
- * (***) pP' = (fP & X) | (fI & pI)
- * pE' = pP' & fE [NB. fE is 0 or ~0]
- *
- * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
- * ' indicates post-exec(), and X is the global 'cap_bset'.
- *
- */
-
-void compute_creds(struct linux_binprm *bprm)
-{
- kernel_cap_t new_permitted, working;
- int do_unlock = 0;
-
- new_permitted = cap_intersect(bprm->cap_permitted, cap_bset);
- working = cap_intersect(bprm->cap_inheritable,
- current->cap_inheritable);
- new_permitted = cap_combine(new_permitted, working);
-
- if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
- !cap_issubset(new_permitted, current->cap_permitted)) {
- current->mm->dumpable = 0;
-
- lock_kernel();
- if (must_not_trace_exec(current)
- || atomic_read(&current->fs->count) > 1
- || atomic_read(&current->files->count) > 1
- || atomic_read(&current->sig->count) > 1) {
- if(!capable(CAP_SETUID)) {
- bprm->e_uid = current->uid;
- bprm->e_gid = current->gid;
- }
- if(!capable(CAP_SETPCAP)) {
- new_permitted = cap_intersect(new_permitted,
- current->cap_permitted);
- }
- }
- do_unlock = 1;
- }
-
-
- /* For init, we want to retain the capabilities set
- * in the init_task struct. Thus we skip the usual
- * capability rules */
- if (current->pid != 1) {
- current->cap_permitted = new_permitted;
- current->cap_effective =
- cap_intersect(new_permitted, bprm->cap_effective);
- }
-
- /* AUD: Audit candidate if current->cap_effective is set */
-
- current->suid = current->euid = current->fsuid = bprm->e_uid;
- current->sgid = current->egid = current->fsgid = bprm->e_gid;
-
- if(do_unlock)
- unlock_kernel();
- current->keep_capabilities = 0;
-}
-
-
-void remove_arg_zero(struct linux_binprm *bprm)
-{
- if (bprm->argc) {
- unsigned long offset;
- char * kaddr;
- struct page *page;
-
- offset = bprm->p % PAGE_SIZE;
- goto inside;
-
- while (bprm->p++, *(kaddr+offset++)) {
- if (offset != PAGE_SIZE)
- continue;
- offset = 0;
- kunmap(page);
-inside:
- page = bprm->page[bprm->p/PAGE_SIZE];
- kaddr = kmap(page);
- }
- kunmap(page);
- bprm->argc--;
- }
-}
-
-/*
- * cycle the list of binary formats handler, until one recognizes the image
- */
-int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
-{
- int try,retval=0;
- struct linux_binfmt *fmt;
-#ifdef __alpha__
- /* handle /sbin/loader.. */
- {
- struct exec * eh = (struct exec *) bprm->buf;
-
- if (!bprm->loader && eh->fh.f_magic == 0x183 &&
- (eh->fh.f_flags & 0x3000) == 0x3000)
- {
- struct file * file;
- unsigned long loader;
-
- allow_write_access(bprm->file);
- fput(bprm->file);
- bprm->file = NULL;
-
- loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
-
- file = open_exec("/sbin/loader");
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- return retval;
-
- /* Remember if the application is TASO. */
- bprm->sh_bang = eh->ah.entry < 0x100000000;
-
- bprm->file = file;
- bprm->loader = loader;
- retval = prepare_binprm(bprm);
- if (retval<0)
- return retval;
- /* should call search_binary_handler recursively here,
- but it does not matter */
- }
- }
-#endif
- /* kernel module loader fixup */
- /* so we don't try to load run modprobe in kernel space. */
- set_fs(USER_DS);
- for (try=0; try<2; try++) {
- read_lock(&binfmt_lock);
- for (fmt = formats ; fmt ; fmt = fmt->next) {
- int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
- if (!fn)
- continue;
- if (!try_inc_mod_count(fmt->module))
- continue;
- read_unlock(&binfmt_lock);
- retval = fn(bprm, regs);
- if (retval >= 0) {
- put_binfmt(fmt);
- allow_write_access(bprm->file);
- if (bprm->file)
- fput(bprm->file);
- bprm->file = NULL;
- current->did_exec = 1;
- return retval;
- }
- read_lock(&binfmt_lock);
- put_binfmt(fmt);
- if (retval != -ENOEXEC)
- break;
- if (!bprm->file) {
- read_unlock(&binfmt_lock);
- return retval;
- }
- }
- read_unlock(&binfmt_lock);
- if (retval != -ENOEXEC) {
- break;
-#ifdef CONFIG_KMOD
- }else{
-#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
- char modname[20];
- if (printable(bprm->buf[0]) &&
- printable(bprm->buf[1]) &&
- printable(bprm->buf[2]) &&
- printable(bprm->buf[3]))
- break; /* -ENOEXEC */
- sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
- request_module(modname);
-#endif
- }
- }
- return retval;
-}
-
-
-/*
- * sys_execve() executes a new program.
- */
-int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
-{
- struct linux_binprm bprm;
- struct file *file;
- int retval;
- int i;
-
- file = open_exec(filename);
-
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- return retval;
-
- bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
- memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));
-
- bprm.file = file;
- bprm.filename = filename;
- bprm.sh_bang = 0;
- bprm.loader = 0;
- bprm.exec = 0;
- if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) {
- allow_write_access(file);
- fput(file);
- return bprm.argc;
- }
-
- if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) {
- allow_write_access(file);
- fput(file);
- return bprm.envc;
- }
-
- retval = prepare_binprm(&bprm);
- if (retval < 0)
- goto out;
-
- retval = copy_strings_kernel(1, &bprm.filename, &bprm);
- if (retval < 0)
- goto out;
-
- bprm.exec = bprm.p;
- retval = copy_strings(bprm.envc, envp, &bprm);
- if (retval < 0)
- goto out;
-
- retval = copy_strings(bprm.argc, argv, &bprm);
- if (retval < 0)
- goto out;
-
- retval = search_binary_handler(&bprm,regs);
- if (retval >= 0)
- /* execve success */
- return retval;
-
-out:
- /* Something went wrong, return the inode and free the argument pages*/
- allow_write_access(bprm.file);
- if (bprm.file)
- fput(bprm.file);
-
- for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
- struct page * page = bprm.page[i];
- if (page)
- __free_page(page);
- }
-
- return retval;
-}
-
-void set_binfmt(struct linux_binfmt *new)
-{
- struct linux_binfmt *old = current->binfmt;
- if (new && new->module)
- __MOD_INC_USE_COUNT(new->module);
- current->binfmt = new;
- if (old && old->module)
- __MOD_DEC_USE_COUNT(old->module);
-}
-
-#define CORENAME_MAX_SIZE 64
-
-/* format_corename will inspect the pattern parameter, and output a
- * name into corename, which must have space for at least
- * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
- */
-void format_corename(char *corename, const char *pattern, long signr)
-{
- const char *pat_ptr = pattern;
- char *out_ptr = corename;
- char *const out_end = corename + CORENAME_MAX_SIZE;
- int rc;
- int pid_in_pattern = 0;
-
- /* Repeat as long as we have more pattern to process and more output
- space */
- while (*pat_ptr) {
- if (*pat_ptr != '%') {
- if (out_ptr == out_end)
- goto out;
- *out_ptr++ = *pat_ptr++;
- } else {
- switch (*++pat_ptr) {
- case 0:
- goto out;
- /* Double percent, output one percent */
- case '%':
- if (out_ptr == out_end)
- goto out;
- *out_ptr++ = '%';
- break;
- /* pid */
- case 'p':
- pid_in_pattern = 1;
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%d", current->pid);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
- break;
- /* uid */
- case 'u':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%d", current->uid);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
- break;
- /* gid */
- case 'g':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%d", current->gid);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
- break;
- /* signal that caused the coredump */
- case 's':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%ld", signr);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
- break;
- /* UNIX time of coredump */
- case 't': {
- struct timeval tv;
- do_gettimeofday(&tv);
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%ld", tv.tv_sec);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
- break;
- }
- /* hostname */
- case 'h':
- down_read(&uts_sem);
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%s", system_utsname.nodename);
- up_read(&uts_sem);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
- break;
- /* executable */
- case 'e':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%s", current->comm);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
- break;
- default:
- break;
- }
- ++pat_ptr;
- }
- }
- /* Backward compatibility with core_uses_pid:
- *
- * If core_pattern does not include a %p (as is the default)
- * and core_uses_pid is set, then .%pid will be appended to
- * the filename */
- if (!pid_in_pattern
- && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
- rc = snprintf(out_ptr, out_end - out_ptr,
- ".%d", current->pid);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
- }
- out:
- *out_ptr = 0;
-}
-
-int do_coredump(long signr, struct pt_regs * regs)
-{
- struct linux_binfmt * binfmt;
- char corename[CORENAME_MAX_SIZE + 1];
- struct file * file;
- struct inode * inode;
- int retval = 0;
- int fsuid = current->fsuid;
-
- lock_kernel();
- binfmt = current->binfmt;
- if (!binfmt || !binfmt->core_dump)
- goto fail;
- if (!is_dumpable(current))
- {
- if(!core_setuid_ok || !current->task_dumpable)
- goto fail;
- current->fsuid = 0;
- }
- current->mm->dumpable = 0;
- if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
- goto fail;
-
- format_corename(corename, core_pattern, signr);
- file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600);
- if (IS_ERR(file))
- goto fail;
- inode = file->f_dentry->d_inode;
- if (inode->i_nlink > 1)
- goto close_fail; /* multiple links - don't dump */
- if (d_unhashed(file->f_dentry))
- goto close_fail;
-
- if (!S_ISREG(inode->i_mode))
- goto close_fail;
- if (!file->f_op)
- goto close_fail;
- if (!file->f_op->write)
- goto close_fail;
- if (do_truncate(file->f_dentry, 0) != 0)
- goto close_fail;
-
- retval = binfmt->core_dump(signr, regs, file);
-
-close_fail:
- filp_close(file, NULL);
-fail:
- if (fsuid != current->fsuid)
- current->fsuid = fsuid;
- unlock_kernel();
- return retval;
-}
diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/page.h b/linux-2.4.29-xen-sparse/include/asm-xen/page.h
index fbab7f5ff1..3150545429 100644
--- a/linux-2.4.29-xen-sparse/include/asm-xen/page.h
+++ b/linux-2.4.29-xen-sparse/include/asm-xen/page.h
@@ -85,23 +85,18 @@ typedef struct { unsigned long pgprot; } pgprot_t;
static inline unsigned long pmd_val(pmd_t x)
{
unsigned long ret = x.pmd;
- if ( (ret & 1) ) ret = machine_to_phys(ret);
+ if ( ret ) ret = machine_to_phys(ret) | 1;
return ret;
}
#define pmd_val_ma(x) ((x).pmd)
#define pgd_val(x) ({ BUG(); (unsigned long)0; })
#define pgprot_val(x) ((x).pgprot)
-static inline pte_t __pte(unsigned long x)
-{
- if ( (x & 1) ) x = phys_to_machine(x);
- return ((pte_t) { (x) });
-}
-static inline pmd_t __pmd(unsigned long x)
-{
- if ( (x & 1) ) x = phys_to_machine(x);
- return ((pmd_t) { (x) });
-}
+#define __pte(x) ({ unsigned long _x = (x); \
+ (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
+#define __pte_ma(x) ((pte_t) { (x) } )
+#define __pmd(x) ({ unsigned long _x = (x); \
+ (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
#define __pgd(x) ({ BUG(); (pgprot_t) { 0 }; })
#define __pgprot(x) ((pgprot_t) { (x) } )
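
The new __pte()/__pmd() converters above translate an entry between pseudo-physical and machine addresses only when its present bit (bit 0) is set, while the added __pte_ma() wraps a value that is already a machine address without touching it. Below is a minimal user-space sketch of that behaviour; the fixed-offset phys_to_machine() stub is a hypothetical stand-in for the real translation table in this tree, and the statement-expression macro needs GCC, as in the kernel headers.

    /* Sketch only: stub the translation with a constant offset so the
     * present/absent distinction is visible in the output. */
    #include <stdio.h>

    typedef struct { unsigned long pte; } pte_t;

    /* Hypothetical stand-in for the Xen physical->machine table. */
    static unsigned long phys_to_machine(unsigned long p) { return p + 0x100000UL; }

    /* As in the patched page.h: translate only if the present bit is set. */
    #define __pte(x) ({ unsigned long _x = (x); \
        (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
    #define __pte_ma(x) ((pte_t) { (x) })  /* machine address: no translation */

    int main(void)
    {
        pte_t present = __pte(0x2000UL | 1);  /* translated */
        pte_t absent  = __pte(0x2000UL);      /* left as-is */
        printf("present=%#lx absent=%#lx\n", present.pte, absent.pte);
        return 0;
    }

Compiled with gcc, this prints a shifted value for the present entry and the raw value for the absent one, which is the property the reworked pmd_val() at the top of this hunk relies on when it reapplies bit 0 after machine_to_phys().
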
diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h
index 4e9584e918..2a0c226c71 100644
--- a/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h
+++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h
@@ -22,7 +22,6 @@
#define pmd_populate(mm, pmd, pte) \
do { \
set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \
- XEN_flush_page_update_queue(); \
} while ( 0 )
/*
@@ -79,8 +78,9 @@ static inline pgd_t *get_pgd_slow(void)
memcpy(pgd + USER_PTRS_PER_PGD,
init_mm.pgd + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
- __make_page_readonly(pgd);
+ __make_page_readonly(pgd);
queue_pgd_pin(__pa(pgd));
+ flush_page_update_queue();
}
return pgd;
}
@@ -111,7 +111,8 @@ static inline void free_pgd_slow(pgd_t *pgd)
kmem_cache_free(pae_pgd_cachep, pgd);
#else
queue_pgd_unpin(__pa(pgd));
- __make_page_writable(pgd);
+ __make_page_writable(pgd);
+ flush_page_update_queue();
free_page((unsigned long)pgd);
#endif
}
@@ -135,6 +136,7 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address)
clear_page(pte);
__make_page_readonly(pte);
queue_pte_pin(__pa(pte));
+ flush_page_update_queue();
}
return pte;
@@ -155,6 +157,7 @@ static __inline__ void pte_free_slow(pte_t *pte)
{
queue_pte_unpin(__pa(pte));
__make_page_writable(pte);
+ flush_page_update_queue();
free_page((unsigned long)pte);
}
@@ -208,22 +211,19 @@ extern int do_check_pgt_cache(int, int);
static inline void flush_tlb_mm(struct mm_struct *mm)
{
- if (mm == current->active_mm) queue_tlb_flush();
- XEN_flush_page_update_queue();
+ if (mm == current->active_mm) xen_tlb_flush();
}
static inline void flush_tlb_page(struct vm_area_struct *vma,
unsigned long addr)
{
- if (vma->vm_mm == current->active_mm) queue_invlpg(addr);
- XEN_flush_page_update_queue();
+ if (vma->vm_mm == current->active_mm) xen_invlpg(addr);
}
static inline void flush_tlb_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
- if (mm == current->active_mm) queue_tlb_flush();
- XEN_flush_page_update_queue();
+ if (mm == current->active_mm) xen_tlb_flush();
}
#else
@@ -261,7 +261,6 @@ static inline void flush_tlb_pgtables(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
/* i386 does not keep any page table caches in TLB */
- XEN_flush_page_update_queue();
}
/*
diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h
index d91b48360e..70f8356fb1 100644
--- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h
+++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h
@@ -34,9 +34,19 @@ static inline int pgd_bad(pgd_t pgd) { return 0; }
static inline int pgd_present(pgd_t pgd) { return 1; }
#define pgd_clear(xp) do { } while (0)
-#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
-#define set_pte_atomic(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
-#define set_pmd(pmdptr, pmdval) queue_l2_entry_update((pmdptr), (pmdval))
+/*
+ * Certain architectures need to do special things when PTEs
+ * within a page table are directly modified. Thus, the following
+ * hook is made available.
+ */
+#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
+#define set_pte_atomic(pteptr, pteval) (*(pteptr) = pteval)
+
+/*
+ * (pmds are folded into pgds so this doesn't get actually called,
+ * but the define is needed for a generic inline function.)
+ */
+#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
#define set_pgd(pgdptr, pgdval) ((void)0)
#define pgd_page(pgd) \
@@ -47,6 +57,7 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
return (pmd_t *) dir;
}
+#define ptep_get_and_clear(xp) __pte_ma(xchg(&(xp)->pte_low, 0))
#define pte_same(a, b) ((a).pte_low == (b).pte_low)
/*
@@ -83,21 +94,4 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
#define pte_none(x) (!(x).pte_low)
#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
-/*
- * A note on implementation of this atomic 'get-and-clear' operation.
- * This is actually very simple because XenoLinux can only run on a single
- * processor. Therefore, we cannot race other processors setting the 'accessed'
- * or 'dirty' bits on a page-table entry.
- * Even if pages are shared between domains, that is not a problem because
- * each domain will have separate page tables, with their own versions of
- * accessed & dirty state.
- */
-static inline pte_t ptep_get_and_clear(pte_t *xp)
-{
- pte_t pte = *xp;
- if ( !pte_none(pte) )
- queue_l1_entry_update(xp, 0);
- return pte;
-}
-
#endif /* _I386_PGTABLE_2LEVEL_H */
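
With writable page tables the guest may modify PTEs directly, so the atomic get-and-clear above shrinks to a single xchg() of pte_low, wrapped with __pte_ma() because the stored word is already a machine address (this replaces the queued-update version deleted at the end of the file). Here is a user-space sketch of the same pattern, with GCC's __atomic_exchange_n() as a hypothetical stand-in for the kernel's xchg():

    #include <stdio.h>

    typedef struct { unsigned long pte_low; } pte_t;
    #define __pte_ma(x) ((pte_t) { (x) })

    static pte_t ptep_get_and_clear(pte_t *xp)
    {
        /* Atomically store 0 and return the previous entry, mirroring
         * __pte_ma(xchg(&(xp)->pte_low, 0)) from the patch. */
        return __pte_ma(__atomic_exchange_n(&xp->pte_low, 0UL, __ATOMIC_SEQ_CST));
    }

    int main(void)
    {
        pte_t e = { 0xb001UL };
        pte_t old = ptep_get_and_clear(&e);
        printf("old=%#lx now=%#lx\n", old.pte_low, e.pte_low);
        return 0;
    }

The first call hands back the old entry and leaves zero behind; a second call would return zero, since read and clear are now one atomic step.
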
diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h
index c15f0e9509..f5a53adc82 100644
--- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h
+++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h
@@ -38,11 +38,11 @@ extern void paging_init(void);
extern unsigned long pgkern_mask;
-#define __flush_tlb() ({ queue_tlb_flush(); XEN_flush_page_update_queue(); })
+#define __flush_tlb() xen_tlb_flush()
#define __flush_tlb_global() __flush_tlb()
#define __flush_tlb_all() __flush_tlb_global()
-#define __flush_tlb_one(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
-#define __flush_tlb_single(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
+#define __flush_tlb_one(addr) xen_invlpg(addr)
+#define __flush_tlb_single(addr) xen_invlpg(addr)
/*
* ZERO_PAGE is a global shared page that is always zero: used
@@ -179,12 +179,14 @@ extern void * high_memory;
#define __S111 PAGE_SHARED
#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
-#define pte_clear(xp) queue_l1_entry_update(xp, 0)
+#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0)
-#define pmd_none(x) (!(x).pmd)
-#define pmd_present(x) ((x).pmd & _PAGE_PRESENT)
+#define pmd_none(x) (!pmd_val(x))
+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+ can temporarily clear it. */
+#define pmd_present(x) (pmd_val(x))
#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
-#define pmd_bad(x) (((x).pmd & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
@@ -212,29 +214,28 @@ static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return p
static inline int ptep_test_and_clear_dirty(pte_t *ptep)
{
- unsigned long pteval = *(unsigned long *)ptep;
- int ret = pteval & _PAGE_DIRTY;
- if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_DIRTY);
- return ret;
+ if (!pte_dirty(*ptep))
+ return 0;
+ return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
}
-static inline int ptep_test_and_clear_young(pte_t *ptep)
+
+static inline int ptep_test_and_clear_young(pte_t *ptep)
{
- unsigned long pteval = *(unsigned long *)ptep;
- int ret = pteval & _PAGE_ACCESSED;
- if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_ACCESSED);
- return ret;
+ if (!pte_young(*ptep))
+ return 0;
+ return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
}
+
static inline void ptep_set_wrprotect(pte_t *ptep)
{
- unsigned long pteval = *(unsigned long *)ptep;
- if ( (pteval & _PAGE_RW) )
- queue_l1_entry_update(ptep, pteval & ~_PAGE_RW);
+ if (pte_write(*ptep))
+ clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
}
+
static inline void ptep_mkdirty(pte_t *ptep)
{
- unsigned long pteval = *(unsigned long *)ptep;
- if ( !(pteval & _PAGE_DIRTY) )
- queue_l1_entry_update(ptep, pteval | _PAGE_DIRTY);
+ if (!pte_dirty(*ptep))
+ set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
}
/*
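
The rewritten ptep_test_and_clear_dirty()/ptep_test_and_clear_young() above share one pattern: a cheap plain read first, then an atomic bit-clear only when the bit is actually set, so the common unset case costs no locked operation. A small sketch under stated assumptions (_PAGE_BIT_DIRTY is bit 6 on i386; a GCC-builtin stand-in replaces the kernel's test_and_clear_bit()):

    #include <stdio.h>

    typedef struct { unsigned long pte_low; } pte_t;
    #define _PAGE_BIT_DIRTY 6
    #define _PAGE_DIRTY (1UL << _PAGE_BIT_DIRTY)
    #define pte_dirty(p) ((p).pte_low & _PAGE_DIRTY)

    /* Stand-in for the kernel primitive: atomically clear bit nr and
     * return its previous value. */
    static int test_and_clear_bit(int nr, unsigned long *addr)
    {
        return (__sync_fetch_and_and(addr, ~(1UL << nr)) >> nr) & 1;
    }

    static int ptep_test_and_clear_dirty(pte_t *ptep)
    {
        if (!pte_dirty(*ptep))   /* fast path: nothing to clear */
            return 0;
        return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
    }

    int main(void)
    {
        pte_t e = { _PAGE_DIRTY | 1 };
        int first  = ptep_test_and_clear_dirty(&e);
        int second = ptep_test_and_clear_dirty(&e);
        printf("%d %d\n", first, second);  /* prints: 1 0 */
        return 0;
    }
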
diff --git a/linux-2.4.29-xen-sparse/mm/highmem.c b/linux-2.4.29-xen-sparse/mm/highmem.c
index 341e6e29a9..f8182820ac 100644
--- a/linux-2.4.29-xen-sparse/mm/highmem.c
+++ b/linux-2.4.29-xen-sparse/mm/highmem.c
@@ -122,7 +122,6 @@ start:
}
vaddr = PKMAP_ADDR(last_pkmap_nr);
set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
- XEN_flush_page_update_queue();
pkmap_count[last_pkmap_nr] = 1;
page->virtual = (void *) vaddr;
diff --git a/linux-2.4.29-xen-sparse/mm/memory.c b/linux-2.4.29-xen-sparse/mm/memory.c
index 7d81c86589..880b6981c4 100644
--- a/linux-2.4.29-xen-sparse/mm/memory.c
+++ b/linux-2.4.29-xen-sparse/mm/memory.c
@@ -153,7 +153,6 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
free_one_pgd(page_dir);
page_dir++;
} while (--nr);
- XEN_flush_page_update_queue();
spin_unlock(&mm->page_table_lock);
/* keep the page table cache within bounds */
@@ -249,10 +248,8 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow && pte_write(pte)) {
- /* XEN modification: modified ordering here to avoid RaW hazard. */
- pte = *src_pte;
- pte = pte_wrprotect(pte);
ptep_set_wrprotect(src_pte);
+ pte = *src_pte;
}
/* If it's a shared mapping, mark it clean in the child */
@@ -914,7 +911,6 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr
{
#ifdef CONFIG_XEN
if ( likely(vma->vm_mm == current->mm) ) {
- XEN_flush_page_update_queue();
HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
} else {
set_pte(page_table, entry);
@@ -1189,13 +1185,10 @@ static int do_swap_page(struct mm_struct * mm,
flush_page_to_ram(page);
flush_icache_page(vma, page);
#ifdef CONFIG_XEN
- if ( likely(vma->vm_mm == current->mm) ) {
- XEN_flush_page_update_queue();
+ if ( likely(vma->vm_mm == current->mm) )
HYPERVISOR_update_va_mapping(address, pte, 0);
- } else {
+ else
set_pte(page_table, pte);
- XEN_flush_page_update_queue();
- }
#else
set_pte(page_table, pte);
#endif
@@ -1245,13 +1238,10 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
}
#ifdef CONFIG_XEN
- if ( likely(vma->vm_mm == current->mm) ) {
- XEN_flush_page_update_queue();
+ if ( likely(vma->vm_mm == current->mm) )
HYPERVISOR_update_va_mapping(addr, entry, 0);
- } else {
+ else
set_pte(page_table, entry);
- XEN_flush_page_update_queue();
- }
#else
set_pte(page_table, entry);
#endif
@@ -1331,13 +1321,10 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
if (write_access)
entry = pte_mkwrite(pte_mkdirty(entry));
#ifdef CONFIG_XEN
- if ( likely(vma->vm_mm == current->mm) ) {
- XEN_flush_page_update_queue();
+ if ( likely(vma->vm_mm == current->mm) )
HYPERVISOR_update_va_mapping(address, entry, 0);
- } else {
+ else
set_pte(page_table, entry);
- XEN_flush_page_update_queue();
- }
#else
set_pte(page_table, entry);
#endif
@@ -1484,7 +1471,6 @@ pte_t fastcall *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long addres
/* "fast" allocation can happen without dropping the lock.. */
new = pte_alloc_one_fast(mm, address);
if (!new) {
- XEN_flush_page_update_queue();
spin_unlock(&mm->page_table_lock);
new = pte_alloc_one(mm, address);
spin_lock(&mm->page_table_lock);
diff --git a/linux-2.4.29-xen-sparse/mm/mremap.c b/linux-2.4.29-xen-sparse/mm/mremap.c
index 330e194bae..475c308b1b 100644
--- a/linux-2.4.29-xen-sparse/mm/mremap.c
+++ b/linux-2.4.29-xen-sparse/mm/mremap.c
@@ -119,11 +119,9 @@ static int move_page_tables(struct mm_struct * mm,
* the old page tables)
*/
oops_we_failed:
- XEN_flush_page_update_queue();
flush_cache_range(mm, new_addr, new_addr + len);
while ((offset += PAGE_SIZE) < len)
move_one_page(mm, new_addr + offset, old_addr + offset);
- XEN_flush_page_update_queue();
zap_page_range(mm, new_addr, len);
return -1;
}
diff --git a/linux-2.4.29-xen-sparse/mm/swapfile.c b/linux-2.4.29-xen-sparse/mm/swapfile.c
deleted file mode 100644
index 6457f19b74..0000000000
--- a/linux-2.4.29-xen-sparse/mm/swapfile.c
+++ /dev/null
@@ -1,1267 +0,0 @@
-/*
- * linux/mm/swapfile.c
- *
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- * Swap reorganised 29.12.95, Stephen Tweedie
- */
-
-#include <linux/slab.h>
-#include <linux/smp_lock.h>
-#include <linux/kernel_stat.h>
-#include <linux/swap.h>
-#include <linux/swapctl.h>
-#include <linux/blkdev.h> /* for blk_size */
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/shm.h>
-
-#include <asm/pgtable.h>
-
-spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
-unsigned int nr_swapfiles;
-int total_swap_pages;
-static int swap_overflow;
-
-static const char Bad_file[] = "Bad swap file entry ";
-static const char Unused_file[] = "Unused swap file entry ";
-static const char Bad_offset[] = "Bad swap offset entry ";
-static const char Unused_offset[] = "Unused swap offset entry ";
-
-struct swap_list_t swap_list = {-1, -1};
-
-struct swap_info_struct swap_info[MAX_SWAPFILES];
-
-#define SWAPFILE_CLUSTER 256
-
-static inline int scan_swap_map(struct swap_info_struct *si)
-{
- unsigned long offset;
- /*
- * We try to cluster swap pages by allocating them
- * sequentially in swap. Once we've allocated
- * SWAPFILE_CLUSTER pages this way, however, we resort to
- * first-free allocation, starting a new cluster. This
- * prevents us from scattering swap pages all over the entire
- * swap partition, so that we reduce overall disk seek times
- * between swap pages. -- sct */
- if (si->cluster_nr) {
- while (si->cluster_next <= si->highest_bit) {
- offset = si->cluster_next++;
- if (si->swap_map[offset])
- continue;
- si->cluster_nr--;
- goto got_page;
- }
- }
- si->cluster_nr = SWAPFILE_CLUSTER;
-
- /* try to find an empty (even not aligned) cluster. */
- offset = si->lowest_bit;
- check_next_cluster:
- if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
- {
- int nr;
- for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
- if (si->swap_map[nr])
- {
- offset = nr+1;
- goto check_next_cluster;
- }
- /* We found a completely empty cluster, so start
- * using it.
- */
- goto got_page;
- }
- /* No luck, so now go fine-grained as usual. -Andrea */
- for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
- if (si->swap_map[offset])
- continue;
- si->lowest_bit = offset+1;
- got_page:
- if (offset == si->lowest_bit)
- si->lowest_bit++;
- if (offset == si->highest_bit)
- si->highest_bit--;
- if (si->lowest_bit > si->highest_bit) {
- si->lowest_bit = si->max;
- si->highest_bit = 0;
- }
- si->swap_map[offset] = 1;
- nr_swap_pages--;
- si->cluster_next = offset+1;
- return offset;
- }
- si->lowest_bit = si->max;
- si->highest_bit = 0;
- return 0;
-}
-
-swp_entry_t get_swap_page(void)
-{
- struct swap_info_struct * p;
- unsigned long offset;
- swp_entry_t entry;
- int type, wrapped = 0;
-
- entry.val = 0; /* Out of memory */
- swap_list_lock();
- type = swap_list.next;
- if (type < 0)
- goto out;
- if (nr_swap_pages <= 0)
- goto out;
-
- while (1) {
- p = &swap_info[type];
- if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
- swap_device_lock(p);
- offset = scan_swap_map(p);
- swap_device_unlock(p);
- if (offset) {
- entry = SWP_ENTRY(type,offset);
- type = swap_info[type].next;
- if (type < 0 ||
- p->prio != swap_info[type].prio) {
- swap_list.next = swap_list.head;
- } else {
- swap_list.next = type;
- }
- goto out;
- }
- }
- type = p->next;
- if (!wrapped) {
- if (type < 0 || p->prio != swap_info[type].prio) {
- type = swap_list.head;
- wrapped = 1;
- }
- } else
- if (type < 0)
- goto out; /* out of swap space */
- }
-out:
- swap_list_unlock();
- return entry;
-}
-
-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
-{
- struct swap_info_struct * p;
- unsigned long offset, type;
-
- if (!entry.val)
- goto out;
- type = SWP_TYPE(entry);
- if (type >= nr_swapfiles)
- goto bad_nofile;
- p = & swap_info[type];
- if (!(p->flags & SWP_USED))
- goto bad_device;
- offset = SWP_OFFSET(entry);
- if (offset >= p->max)
- goto bad_offset;
- if (!p->swap_map[offset])
- goto bad_free;
- swap_list_lock();
- if (p->prio > swap_info[swap_list.next].prio)
- swap_list.next = type;
- swap_device_lock(p);
- return p;
-
-bad_free:
- printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
- goto out;
-bad_offset:
- printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
- goto out;
-bad_device:
- printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
- goto out;
-bad_nofile:
- printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
-out:
- return NULL;
-}
-
-static void swap_info_put(struct swap_info_struct * p)
-{
- swap_device_unlock(p);
- swap_list_unlock();
-}
-
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
-{
- int count = p->swap_map[offset];
-
- if (count < SWAP_MAP_MAX) {
- count--;
- p->swap_map[offset] = count;
- if (!count) {
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit)
- p->highest_bit = offset;
- nr_swap_pages++;
- }
- }
- return count;
-}
-
-/*
- * Caller has made sure that the swapdevice corresponding to entry
- * is still around or has not been recycled.
- */
-void swap_free(swp_entry_t entry)
-{
- struct swap_info_struct * p;
-
- p = swap_info_get(entry);
- if (p) {
- swap_entry_free(p, SWP_OFFSET(entry));
- swap_info_put(p);
- }
-}
-
-/*
- * Check if we're the only user of a swap page,
- * when the page is locked.
- */
-static int exclusive_swap_page(struct page *page)
-{
- int retval = 0;
- struct swap_info_struct * p;
- swp_entry_t entry;
-
- entry.val = page->index;
- p = swap_info_get(entry);
- if (p) {
- /* Is the only swap cache user the cache itself? */
- if (p->swap_map[SWP_OFFSET(entry)] == 1) {
- /* Recheck the page count with the pagecache lock held.. */
- spin_lock(&pagecache_lock);
- if (page_count(page) - !!page->buffers == 2)
- retval = 1;
- spin_unlock(&pagecache_lock);
- }
- swap_info_put(p);
- }
- return retval;
-}
-
-/*
- * We can use this swap cache entry directly
- * if there are no other references to it.
- *
- * Here "exclusive_swap_page()" does the real
- * work, but we opportunistically check whether
- * we need to get all the locks first..
- */
-int fastcall can_share_swap_page(struct page *page)
-{
- int retval = 0;
-
- if (!PageLocked(page))
- BUG();
- switch (page_count(page)) {
- case 3:
- if (!page->buffers)
- break;
- /* Fallthrough */
- case 2:
- if (!PageSwapCache(page))
- break;
- retval = exclusive_swap_page(page);
- break;
- case 1:
- if (PageReserved(page))
- break;
- retval = 1;
- }
- return retval;
-}
-
-/*
- * Work out if there are any other processes sharing this
- * swap cache page. Free it if you can. Return success.
- */
-int fastcall remove_exclusive_swap_page(struct page *page)
-{
- int retval;
- struct swap_info_struct * p;
- swp_entry_t entry;
-
- if (!PageLocked(page))
- BUG();
- if (!PageSwapCache(page))
- return 0;
- if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */
- return 0;
-
- entry.val = page->index;
- p = swap_info_get(entry);
- if (!p)
- return 0;
-
- /* Is the only swap cache user the cache itself? */
- retval = 0;
- if (p->swap_map[SWP_OFFSET(entry)] == 1) {
- /* Recheck the page count with the pagecache lock held.. */
- spin_lock(&pagecache_lock);
- if (page_count(page) - !!page->buffers == 2) {
- __delete_from_swap_cache(page);
- SetPageDirty(page);
- retval = 1;
- }
- spin_unlock(&pagecache_lock);
- }
- swap_info_put(p);
-
- if (retval) {
- block_flushpage(page, 0);
- swap_free(entry);
- page_cache_release(page);
- }
-
- return retval;
-}
-
-/*
- * Free the swap entry like above, but also try to
- * free the page cache entry if it is the last user.
- */
-void free_swap_and_cache(swp_entry_t entry)
-{
- struct swap_info_struct * p;
- struct page *page = NULL;
-
- p = swap_info_get(entry);
- if (p) {
- if (swap_entry_free(p, SWP_OFFSET(entry)) == 1)
- page = find_trylock_page(&swapper_space, entry.val);
- swap_info_put(p);
- }
- if (page) {
- page_cache_get(page);
- /* Only cache user (+us), or swap space full? Free it! */
- if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) {
- delete_from_swap_cache(page);
- SetPageDirty(page);
- }
- UnlockPage(page);
- page_cache_release(page);
- }
-}
-
-/*
- * The swap entry has been read in advance, and we return 1 to indicate
- * that the page has been used or is no longer needed.
- *
- * Always set the resulting pte to be nowrite (the same as COW pages
- * after one process has exited). We don't know just how many PTEs will
- * share this swap entry, so be cautious and let do_wp_page work out
- * what to do if a write is requested later.
- */
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
- pte_t *dir, swp_entry_t entry, struct page* page)
-{
- pte_t pte = *dir;
-
- if (likely(pte_to_swp_entry(pte).val != entry.val))
- return;
- if (unlikely(pte_none(pte) || pte_present(pte)))
- return;
- get_page(page);
- set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
- swap_free(entry);
- ++vma->vm_mm->rss;
-}
-
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
- unsigned long address, unsigned long size, unsigned long offset,
- swp_entry_t entry, struct page* page)
-{
- pte_t * pte;
- unsigned long end;
-
- if (pmd_none(*dir))
- return;
- if (pmd_bad(*dir)) {
- pmd_ERROR(*dir);
- pmd_clear(dir);
- return;
- }
- pte = pte_offset(dir, address);
- offset += address & PMD_MASK;
- address &= ~PMD_MASK;
- end = address + size;
- if (end > PMD_SIZE)
- end = PMD_SIZE;
- do {
- unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
- address += PAGE_SIZE;
- pte++;
- } while (address && (address < end));
-}
-
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
- unsigned long address, unsigned long size,
- swp_entry_t entry, struct page* page)
-{
- pmd_t * pmd;
- unsigned long offset, end;
-
- if (pgd_none(*dir))
- return;
- if (pgd_bad(*dir)) {
- pgd_ERROR(*dir);
- pgd_clear(dir);
- return;
- }
- pmd = pmd_offset(dir, address);
- offset = address & PGDIR_MASK;
- address &= ~PGDIR_MASK;
- end = address + size;
- if (end > PGDIR_SIZE)
- end = PGDIR_SIZE;
- if (address >= end)
- BUG();
- do {
- unuse_pmd(vma, pmd, address, end - address, offset, entry,
- page);
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address && (address < end));
-}
-
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
- swp_entry_t entry, struct page* page)
-{
- unsigned long start = vma->vm_start, end = vma->vm_end;
-
- if (start >= end)
- BUG();
- do {
- unuse_pgd(vma, pgdir, start, end - start, entry, page);
- start = (start + PGDIR_SIZE) & PGDIR_MASK;
- pgdir++;
- } while (start && (start < end));
-}
-
-static void unuse_process(struct mm_struct * mm,
- swp_entry_t entry, struct page* page)
-{
- struct vm_area_struct* vma;
-
- /*
- * Go through process' page directory.
- */
- spin_lock(&mm->page_table_lock);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- pgd_t * pgd = pgd_offset(mm, vma->vm_start);
- unuse_vma(vma, pgd, entry, page);
- }
- XEN_flush_page_update_queue();
- spin_unlock(&mm->page_table_lock);
- return;
-}
-
-/*
- * Scan swap_map from current position to next entry still in use.
- * Recycle to start on reaching the end, returning 0 when empty.
- */
-static int find_next_to_unuse(struct swap_info_struct *si, int prev)
-{
- int max = si->max;
- int i = prev;
- int count;
-
- /*
- * No need for swap_device_lock(si) here: we're just looking
- * for whether an entry is in use, not modifying it; false
- * hits are okay, and sys_swapoff() has already prevented new
- * allocations from this area (while holding swap_list_lock()).
- */
- for (;;) {
- if (++i >= max) {
- if (!prev) {
- i = 0;
- break;
- }
- /*
- * No entries in use at top of swap_map,
- * loop back to start and recheck there.
- */
- max = prev + 1;
- prev = 0;
- i = 1;
- }
- count = si->swap_map[i];
- if (count && count != SWAP_MAP_BAD)
- break;
- }
- return i;
-}
-
-/*
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it. All the necessary
- * page table adjustments can then be made atomically.
- */
-static int try_to_unuse(unsigned int type)
-{
- struct swap_info_struct * si = &swap_info[type];
- struct mm_struct *start_mm;
- unsigned short *swap_map;
- unsigned short swcount;
- struct page *page;
- swp_entry_t entry;
- int i = 0;
- int retval = 0;
- int reset_overflow = 0;
- int shmem;
-
- /*
- * When searching mms for an entry, a good strategy is to
- * start at the first mm we freed the previous entry from
- * (though actually we don't notice whether we or coincidence
- * freed the entry). Initialize this start_mm with a hold.
- *
- * A simpler strategy would be to start at the last mm we
- * freed the previous entry from; but that would take less
- * advantage of mmlist ordering (now preserved by swap_out()),
- * which clusters forked address spaces together, most recent
- * child immediately after parent. If we race with dup_mmap(),
- * we very much want to resolve parent before child, otherwise
- * we may miss some entries: using last mm would invert that.
- */
- start_mm = &init_mm;
- atomic_inc(&init_mm.mm_users);
-
- /*
- * Keep on scanning until all entries have gone. Usually,
- * one pass through swap_map is enough, but not necessarily:
- * mmput() removes mm from mmlist before exit_mmap() and its
- * zap_page_range(). That's not too bad, those entries are
- * on their way out, and handled faster there than here.
- * do_munmap() behaves similarly, taking the range out of mm's
- * vma list before zap_page_range(). But unfortunately, when
- * unmapping a part of a vma, it takes the whole out first,
- * then reinserts what's left after (might even reschedule if
- * open() method called) - so swap entries may be invisible
- * to swapoff for a while, then reappear - but that is rare.
- */
- while ((i = find_next_to_unuse(si, i))) {
- /*
- * Get a page for the entry, using the existing swap
- * cache page if there is one. Otherwise, get a clean
- * page and read the swap into it.
- */
- swap_map = &si->swap_map[i];
- entry = SWP_ENTRY(type, i);
- page = read_swap_cache_async(entry);
- if (!page) {
- /*
- * Either swap_duplicate() failed because entry
- * has been freed independently, and will not be
- * reused since sys_swapoff() already disabled
- * allocation from here, or alloc_page() failed.
- */
- if (!*swap_map)
- continue;
- retval = -ENOMEM;
- break;
- }
-
- /*
- * Don't hold on to start_mm if it looks like exiting.
- */
- if (atomic_read(&start_mm->mm_users) == 1) {
- mmput(start_mm);
- start_mm = &init_mm;
- atomic_inc(&init_mm.mm_users);
- }
-
- /*
- * Wait for and lock page. When do_swap_page races with
- * try_to_unuse, do_swap_page can handle the fault much
- * faster than try_to_unuse can locate the entry. This
- * apparently redundant "wait_on_page" lets try_to_unuse
- * defer to do_swap_page in such a case - in some tests,
- * do_swap_page and try_to_unuse repeatedly compete.
- */
- wait_on_page(page);
- lock_page(page);
-
- /*
- * Remove all references to entry, without blocking.
- * Whenever we reach init_mm, there's no address space
- * to search, but use it as a reminder to search shmem.
- */
- shmem = 0;
- swcount = *swap_map;
- if (swcount > 1) {
- flush_page_to_ram(page);
- if (start_mm == &init_mm)
- shmem = shmem_unuse(entry, page);
- else
- unuse_process(start_mm, entry, page);
- }
- if (*swap_map > 1) {
- int set_start_mm = (*swap_map >= swcount);
- struct list_head *p = &start_mm->mmlist;
- struct mm_struct *new_start_mm = start_mm;
- struct mm_struct *mm;
-
- spin_lock(&mmlist_lock);
- while (*swap_map > 1 &&
- (p = p->next) != &start_mm->mmlist) {
- mm = list_entry(p, struct mm_struct, mmlist);
- swcount = *swap_map;
- if (mm == &init_mm) {
- set_start_mm = 1;
- spin_unlock(&mmlist_lock);
- shmem = shmem_unuse(entry, page);
- spin_lock(&mmlist_lock);
- } else
- unuse_process(mm, entry, page);
- if (set_start_mm && *swap_map < swcount) {
- new_start_mm = mm;
- set_start_mm = 0;
- }
- }
- atomic_inc(&new_start_mm->mm_users);
- spin_unlock(&mmlist_lock);
- mmput(start_mm);
- start_mm = new_start_mm;
- }
-
- /*
- * How could swap count reach 0x7fff when the maximum
- * pid is 0x7fff, and there's no way to repeat a swap
- * page within an mm (except in shmem, where it's the
- * shared object which takes the reference count)?
- * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
- *
- * If that's wrong, then we should worry more about
- * exit_mmap() and do_munmap() cases described above:
- * we might be resetting SWAP_MAP_MAX too early here.
- * We know "Undead"s can happen, they're okay, so don't
- * report them; but do report if we reset SWAP_MAP_MAX.
- */
- if (*swap_map == SWAP_MAP_MAX) {
- swap_list_lock();
- swap_device_lock(si);
- nr_swap_pages++;
- *swap_map = 1;
- swap_device_unlock(si);
- swap_list_unlock();
- reset_overflow = 1;
- }
-
- /*
- * If a reference remains (rare), we would like to leave
- * the page in the swap cache; but try_to_swap_out could
- * then re-duplicate the entry once we drop page lock,
- * so we might loop indefinitely; also, that page could
- * not be swapped out to other storage meanwhile. So:
- * delete from cache even if there's another reference,
- * after ensuring that the data has been saved to disk -
- * since if the reference remains (rarer), it will be
- * read from disk into another page. Splitting into two
- * pages would be incorrect if swap supported "shared
- * private" pages, but they are handled by tmpfs files.
- *
- * Note shmem_unuse already deleted swappage from cache,
- * unless corresponding filepage found already in cache:
- * in which case it left swappage in cache, lowered its
- * swap count to pass quickly through the loops above,
- * and now we must reincrement count to try again later.
- */
- if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
- rw_swap_page(WRITE, page);
- lock_page(page);
- }
- if (PageSwapCache(page)) {
- if (shmem)
- swap_duplicate(entry);
- else
- delete_from_swap_cache(page);
- }
-
- /*
- * So we could skip searching mms once swap count went
- * to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so try_to_swap_out will preserve it.
- */
- SetPageDirty(page);
- UnlockPage(page);
- page_cache_release(page);
-
- /*
- * Make sure that we aren't completely killing
- * interactive performance. Interruptible check on
- * signal_pending() would be nice, but changes the spec?
- */
- if (current->need_resched)
- schedule();
- }
-
- mmput(start_mm);
- if (reset_overflow) {
- printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
- swap_overflow = 0;
- }
- return retval;
-}
-
-asmlinkage long sys_swapoff(const char * specialfile)
-{
- struct swap_info_struct * p = NULL;
- unsigned short *swap_map;
- struct nameidata nd;
- int i, type, prev;
- int err;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- err = user_path_walk(specialfile, &nd);
- if (err)
- goto out;
-
- lock_kernel();
- prev = -1;
- swap_list_lock();
- for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
- p = swap_info + type;
- if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
- if (p->swap_file == nd.dentry)
- break;
- }
- prev = type;
- }
- err = -EINVAL;
- if (type < 0) {
- swap_list_unlock();
- goto out_dput;
- }
-
- if (prev < 0) {
- swap_list.head = p->next;
- } else {
- swap_info[prev].next = p->next;
- }
- if (type == swap_list.next) {
- /* just pick something that's safe... */
- swap_list.next = swap_list.head;
- }
- nr_swap_pages -= p->pages;
- total_swap_pages -= p->pages;
- p->flags = SWP_USED;
- swap_list_unlock();
- unlock_kernel();
- err = try_to_unuse(type);
- lock_kernel();
- if (err) {
- /* re-insert swap space back into swap_list */
- swap_list_lock();
- for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
- if (p->prio >= swap_info[i].prio)
- break;
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = p - swap_info;
- else
- swap_info[prev].next = p - swap_info;
- nr_swap_pages += p->pages;
- total_swap_pages += p->pages;
- p->flags = SWP_WRITEOK;
- swap_list_unlock();
- goto out_dput;
- }
- if (p->swap_device)
- blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
- path_release(&nd);
-
- swap_list_lock();
- swap_device_lock(p);
- nd.mnt = p->swap_vfsmnt;
- nd.dentry = p->swap_file;
- p->swap_vfsmnt = NULL;
- p->swap_file = NULL;
- p->swap_device = 0;
- p->max = 0;
- swap_map = p->swap_map;
- p->swap_map = NULL;
- p->flags = 0;
- swap_device_unlock(p);
- swap_list_unlock();
- vfree(swap_map);
- err = 0;
-
-out_dput:
- unlock_kernel();
- path_release(&nd);
-out:
- return err;
-}
-
-int get_swaparea_info(char *buf)
-{
- char * page = (char *) __get_free_page(GFP_KERNEL);
- struct swap_info_struct *ptr = swap_info;
- int i, j, len = 0, usedswap;
-
- if (!page)
- return -ENOMEM;
-
- len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
- for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
- if ((ptr->flags & SWP_USED) && ptr->swap_map) {
- char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt,
- page, PAGE_SIZE);
-
- len += sprintf(buf + len, "%-31s ", path);
-
- if (!ptr->swap_device)
- len += sprintf(buf + len, "file\t\t");
- else
- len += sprintf(buf + len, "partition\t");
-
- usedswap = 0;
- for (j = 0; j < ptr->max; ++j)
- switch (ptr->swap_map[j]) {
- case SWAP_MAP_BAD:
- case 0:
- continue;
- default:
- usedswap++;
- }
- len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10),
- usedswap << (PAGE_SHIFT - 10), ptr->prio);
- }
- }
- free_page((unsigned long) page);
- return len;
-}
-
-int is_swap_partition(kdev_t dev) {
- struct swap_info_struct *ptr = swap_info;
- int i;
-
- for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
- if (ptr->flags & SWP_USED)
- if (ptr->swap_device == dev)
- return 1;
- }
- return 0;
-}
-
-/*
- * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
- *
- * The swapon system call
- */
-asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
-{
- struct swap_info_struct * p;
- struct nameidata nd;
- struct inode * swap_inode;
- unsigned int type;
- int i, j, prev;
- int error;
- static int least_priority = 0;
- union swap_header *swap_header = 0;
- int swap_header_version;
- int nr_good_pages = 0;
- unsigned long maxpages = 1;
- int swapfilesize;
- struct block_device *bdev = NULL;
- unsigned short *swap_map;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- lock_kernel();
- swap_list_lock();
- p = swap_info;
- for (type = 0 ; type < nr_swapfiles ; type++,p++)
- if (!(p->flags & SWP_USED))
- break;
- error = -EPERM;
- if (type >= MAX_SWAPFILES) {
- swap_list_unlock();
- goto out;
- }
- if (type >= nr_swapfiles)
- nr_swapfiles = type+1;
- p->flags = SWP_USED;
- p->swap_file = NULL;
- p->swap_vfsmnt = NULL;
- p->swap_device = 0;
- p->swap_map = NULL;
- p->lowest_bit = 0;
- p->highest_bit = 0;
- p->cluster_nr = 0;
- p->sdev_lock = SPIN_LOCK_UNLOCKED;
- p->next = -1;
- if (swap_flags & SWAP_FLAG_PREFER) {
- p->prio =
- (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
- } else {
- p->prio = --least_priority;
- }
- swap_list_unlock();
- error = user_path_walk(specialfile, &nd);
- if (error)
- goto bad_swap_2;
-
- p->swap_file = nd.dentry;
- p->swap_vfsmnt = nd.mnt;
- swap_inode = nd.dentry->d_inode;
- error = -EINVAL;
-
- if (S_ISBLK(swap_inode->i_mode)) {
- kdev_t dev = swap_inode->i_rdev;
- struct block_device_operations *bdops;
- devfs_handle_t de;
-
- if (is_mounted(dev)) {
- error = -EBUSY;
- goto bad_swap_2;
- }
-
- p->swap_device = dev;
- set_blocksize(dev, PAGE_SIZE);
-
- bd_acquire(swap_inode);
- bdev = swap_inode->i_bdev;
- de = devfs_get_handle_from_inode(swap_inode);
- bdops = devfs_get_ops(de); /* Increments module use count */
- if (bdops) bdev->bd_op = bdops;
-
- error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
- devfs_put_ops(de);/*Decrement module use count now we're safe*/
- if (error)
- goto bad_swap_2;
- set_blocksize(dev, PAGE_SIZE);
- error = -ENODEV;
- if (!dev || (blk_size[MAJOR(dev)] &&
- !blk_size[MAJOR(dev)][MINOR(dev)]))
- goto bad_swap;
- swapfilesize = 0;
- if (blk_size[MAJOR(dev)])
- swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
- >> (PAGE_SHIFT - 10);
- } else if (S_ISREG(swap_inode->i_mode))
- swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
- else
- goto bad_swap;
-
- error = -EBUSY;
- for (i = 0 ; i < nr_swapfiles ; i++) {
- struct swap_info_struct *q = &swap_info[i];
- if (i == type || !q->swap_file)
- continue;
- if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
- goto bad_swap;
- }
-
- swap_header = (void *) __get_free_page(GFP_USER);
- if (!swap_header) {
- printk("Unable to start swapping: out of memory :-)\n");
- error = -ENOMEM;
- goto bad_swap;
- }
-
- lock_page(virt_to_page(swap_header));
- rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
-
- if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
- swap_header_version = 1;
- else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
- swap_header_version = 2;
- else {
- printk("Unable to find swap-space signature\n");
- error = -EINVAL;
- goto bad_swap;
- }
-
- switch (swap_header_version) {
- case 1:
- memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
- j = 0;
- p->lowest_bit = 0;
- p->highest_bit = 0;
- for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
- if (test_bit(i,(char *) swap_header)) {
- if (!p->lowest_bit)
- p->lowest_bit = i;
- p->highest_bit = i;
- maxpages = i+1;
- j++;
- }
- }
- nr_good_pages = j;
- p->swap_map = vmalloc(maxpages * sizeof(short));
- if (!p->swap_map) {
- error = -ENOMEM;
- goto bad_swap;
- }
- for (i = 1 ; i < maxpages ; i++) {
- if (test_bit(i,(char *) swap_header))
- p->swap_map[i] = 0;
- else
- p->swap_map[i] = SWAP_MAP_BAD;
- }
- break;
-
- case 2:
- /* Check the swap header's sub-version and the size of
- the swap file and bad block lists */
- if (swap_header->info.version != 1) {
- printk(KERN_WARNING
- "Unable to handle swap header version %d\n",
- swap_header->info.version);
- error = -EINVAL;
- goto bad_swap;
- }
-
- p->lowest_bit = 1;
- maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
- if (maxpages > swap_header->info.last_page)
- maxpages = swap_header->info.last_page;
- p->highest_bit = maxpages - 1;
-
- error = -EINVAL;
- if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
- goto bad_swap;
-
- /* OK, set up the swap map and apply the bad block list */
- if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
- error = -ENOMEM;
- goto bad_swap;
- }
-
- error = 0;
- memset(p->swap_map, 0, maxpages * sizeof(short));
- for (i=0; i<swap_header->info.nr_badpages; i++) {
- int page = swap_header->info.badpages[i];
- if (page <= 0 || page >= swap_header->info.last_page)
- error = -EINVAL;
- else
- p->swap_map[page] = SWAP_MAP_BAD;
- }
- nr_good_pages = swap_header->info.last_page -
- swap_header->info.nr_badpages -
- 1 /* header page */;
- if (error)
- goto bad_swap;
- }
-
- if (swapfilesize && maxpages > swapfilesize) {
- printk(KERN_WARNING
- "Swap area shorter than signature indicates\n");
- error = -EINVAL;
- goto bad_swap;
- }
- if (!nr_good_pages) {
- printk(KERN_WARNING "Empty swap-file\n");
- error = -EINVAL;
- goto bad_swap;
- }
- p->swap_map[0] = SWAP_MAP_BAD;
- swap_list_lock();
- swap_device_lock(p);
- p->max = maxpages;
- p->flags = SWP_WRITEOK;
- p->pages = nr_good_pages;
- nr_swap_pages += nr_good_pages;
- total_swap_pages += nr_good_pages;
- printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
- nr_good_pages<<(PAGE_SHIFT-10), p->prio);
-
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
- if (p->prio >= swap_info[i].prio) {
- break;
- }
- prev = i;
- }
- p->next = i;
- if (prev < 0) {
- swap_list.head = swap_list.next = p - swap_info;
- } else {
- swap_info[prev].next = p - swap_info;
- }
- swap_device_unlock(p);
- swap_list_unlock();
- error = 0;
- goto out;
-bad_swap:
- if (bdev)
- blkdev_put(bdev, BDEV_SWAP);
-bad_swap_2:
- swap_list_lock();
- swap_map = p->swap_map;
- nd.mnt = p->swap_vfsmnt;
- nd.dentry = p->swap_file;
- p->swap_device = 0;
- p->swap_file = NULL;
- p->swap_vfsmnt = NULL;
- p->swap_map = NULL;
- p->flags = 0;
- if (!(swap_flags & SWAP_FLAG_PREFER))
- ++least_priority;
- swap_list_unlock();
- if (swap_map)
- vfree(swap_map);
- path_release(&nd);
-out:
- if (swap_header)
- free_page((long) swap_header);
- unlock_kernel();
- return error;
-}
-
-void si_swapinfo(struct sysinfo *val)
-{
- unsigned int i;
- unsigned long nr_to_be_unused = 0;
-
- swap_list_lock();
- for (i = 0; i < nr_swapfiles; i++) {
- unsigned int j;
- if (swap_info[i].flags != SWP_USED)
- continue;
- for (j = 0; j < swap_info[i].max; ++j) {
- switch (swap_info[i].swap_map[j]) {
- case 0:
- case SWAP_MAP_BAD:
- continue;
- default:
- nr_to_be_unused++;
- }
- }
- }
- val->freeswap = nr_swap_pages + nr_to_be_unused;
- val->totalswap = total_swap_pages + nr_to_be_unused;
- swap_list_unlock();
-}
-
-/*
- * Verify that a swap entry is valid and increment its swap map count.
- *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
- */
-int swap_duplicate(swp_entry_t entry)
-{
- struct swap_info_struct * p;
- unsigned long offset, type;
- int result = 0;
-
- type = SWP_TYPE(entry);
- if (type >= nr_swapfiles)
- goto bad_file;
- p = type + swap_info;
- offset = SWP_OFFSET(entry);
-
- swap_device_lock(p);
- if (offset < p->max && p->swap_map[offset]) {
- if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
- p->swap_map[offset]++;
- result = 1;
- } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
- if (swap_overflow++ < 5)
- printk(KERN_WARNING "swap_dup: swap entry overflow\n");
- p->swap_map[offset] = SWAP_MAP_MAX;
- result = 1;
- }
- }
- swap_device_unlock(p);
-out:
- return result;
-
-bad_file:
- printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
-}
-
-/*
- * Prior swap_duplicate protects against swap device deletion.
- */
-void get_swaphandle_info(swp_entry_t entry, unsigned long *offset,
- kdev_t *dev, struct inode **swapf)
-{
- unsigned long type;
- struct swap_info_struct *p;
-
- type = SWP_TYPE(entry);
- if (type >= nr_swapfiles) {
- printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val);
- return;
- }
-
- p = &swap_info[type];
- *offset = SWP_OFFSET(entry);
- if (*offset >= p->max && *offset != 0) {
- printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val);
- return;
- }
- if (p->swap_map && !p->swap_map[*offset]) {
- printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val);
- return;
- }
- if (!(p->flags & SWP_USED)) {
- printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val);
- return;
- }
-
- if (p->swap_device) {
- *dev = p->swap_device;
- } else if (p->swap_file) {
- *swapf = p->swap_file->d_inode;
- } else {
- printk(KERN_ERR "rw_swap_page: no swap file or device\n");
- }
- return;
-}
-
-/*
- * swap_device_lock prevents swap_map being freed. Don't grab an extra
- * reference on the swaphandle, it doesn't matter if it becomes unused.
- */
-int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
-{
- int ret = 0, i = 1 << page_cluster;
- unsigned long toff;
- struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
-
- if (!page_cluster) /* no readahead */
- return 0;
- toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
- if (!toff) /* first page is swap header */
- toff++, i--;
- *offset = toff;
-
- swap_device_lock(swapdev);
- do {
- /* Don't read-ahead past the end of the swap area */
- if (toff >= swapdev->max)
- break;
- /* Don't read in free or bad pages */
- if (!swapdev->swap_map[toff])
- break;
- if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
- break;
- toff++;
- ret++;
- } while (--i);
- swap_device_unlock(swapdev);
- return ret;
-}
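
[The deleted valid_swaphandles() above derives the swap readahead window by aligning the faulting offset down to a 2^page_cluster boundary and skipping slot 0, which holds the swap header. A minimal standalone sketch of just that window arithmetic — illustrative names, not the kernel API:

    #include <stdio.h>

    /* Compute a readahead window in the style of the deleted 2.4
     * valid_swaphandles(): align the faulting offset down to a
     * 2^page_cluster boundary, and never include slot 0, which
     * holds the swap header.
     */
    static void readahead_window(unsigned long offset, int page_cluster,
                                 unsigned long *start, int *pages)
    {
        unsigned long toff = (offset >> page_cluster) << page_cluster;
        int n = 1 << page_cluster;

        if (toff == 0) {        /* first slot is the swap header */
            toff++;
            n--;
        }
        *start = toff;
        *pages = n;
    }

    int main(void)
    {
        unsigned long start;
        int pages;

        readahead_window(42, 3, &start, &pages);  /* cluster of 8 slots */
        printf("offset 42 -> window [%lu, %lu)\n", start, start + pages);
        /* prints: offset 42 -> window [40, 48) */
        return 0;
    }
]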
diff --git a/linux-2.4.29-xen-sparse/mm/vmalloc.c b/linux-2.4.29-xen-sparse/mm/vmalloc.c
deleted file mode 100644
index df02fcbf7a..0000000000
--- a/linux-2.4.29-xen-sparse/mm/vmalloc.c
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- * linux/mm/vmalloc.c
- *
- * Copyright (C) 1993 Linus Torvalds
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
- */
-
-#include <linux/config.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/spinlock.h>
-#include <linux/highmem.h>
-#include <linux/smp_lock.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
-
-rwlock_t vmlist_lock = RW_LOCK_UNLOCKED;
-struct vm_struct * vmlist;
-
-static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
-{
- pte_t * pte;
- unsigned long end;
-
- if (pmd_none(*pmd))
- return;
- if (pmd_bad(*pmd)) {
- pmd_ERROR(*pmd);
- pmd_clear(pmd);
- return;
- }
- pte = pte_offset(pmd, address);
- address &= ~PMD_MASK;
- end = address + size;
- if (end > PMD_SIZE)
- end = PMD_SIZE;
- do {
- pte_t page;
- page = ptep_get_and_clear(pte);
- address += PAGE_SIZE;
- pte++;
- if (pte_none(page))
- continue;
- if (pte_present(page)) {
- struct page *ptpage = pte_page(page);
- if (VALID_PAGE(ptpage) && (!PageReserved(ptpage)))
- __free_page(ptpage);
- continue;
- }
- printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n");
- } while (address < end);
-}
-
-static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size)
-{
- pmd_t * pmd;
- unsigned long end;
-
- if (pgd_none(*dir))
- return;
- if (pgd_bad(*dir)) {
- pgd_ERROR(*dir);
- pgd_clear(dir);
- return;
- }
- pmd = pmd_offset(dir, address);
- address &= ~PGDIR_MASK;
- end = address + size;
- if (end > PGDIR_SIZE)
- end = PGDIR_SIZE;
- do {
- free_area_pte(pmd, address, end - address);
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address < end);
-}
-
-void vmfree_area_pages(unsigned long address, unsigned long size)
-{
- pgd_t * dir;
- unsigned long end = address + size;
-
- dir = pgd_offset_k(address);
- flush_cache_all();
- do {
- free_area_pmd(dir, address, end - address);
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- dir++;
- } while (address && (address < end));
- flush_tlb_all();
-}
-
-static inline int alloc_area_pte (pte_t * pte, unsigned long address,
- unsigned long size, int gfp_mask,
- pgprot_t prot, struct page ***pages)
-{
- unsigned long end;
-
- address &= ~PMD_MASK;
- end = address + size;
- if (end > PMD_SIZE)
- end = PMD_SIZE;
- do {
- struct page * page;
-
- if (!pages) {
- spin_unlock(&init_mm.page_table_lock);
- page = alloc_page(gfp_mask);
- spin_lock(&init_mm.page_table_lock);
- } else {
- page = (**pages);
- (*pages)++;
-
- /* Add a reference to the page so we can free later */
- if (page)
- atomic_inc(&page->count);
-
- }
- if (!pte_none(*pte))
- printk(KERN_ERR "alloc_area_pte: page already exists\n");
- if (!page)
- return -ENOMEM;
- set_pte(pte, mk_pte(page, prot));
- address += PAGE_SIZE;
- pte++;
- } while (address < end);
- return 0;
-}
-
-static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address,
- unsigned long size, int gfp_mask,
- pgprot_t prot, struct page ***pages)
-{
- unsigned long end;
-
- address &= ~PGDIR_MASK;
- end = address + size;
- if (end > PGDIR_SIZE)
- end = PGDIR_SIZE;
- do {
- pte_t * pte = pte_alloc(&init_mm, pmd, address);
- if (!pte)
- return -ENOMEM;
- if (alloc_area_pte(pte, address, end - address,
- gfp_mask, prot, pages))
- return -ENOMEM;
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address < end);
- return 0;
-}
-
-/*static inline*/ int __vmalloc_area_pages (unsigned long address,
- unsigned long size,
- int gfp_mask,
- pgprot_t prot,
- struct page ***pages)
-{
- pgd_t * dir;
- unsigned long start = address;
- unsigned long end = address + size;
-
- dir = pgd_offset_k(address);
- spin_lock(&init_mm.page_table_lock);
- do {
- pmd_t *pmd;
-
- pmd = pmd_alloc(&init_mm, dir, address);
- if (!pmd)
- goto err;
-
- if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot, pages))
- goto err; // The kernel NEVER reclaims pmds, so no need to undo pmd_alloc() here
-
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- dir++;
- } while (address && (address < end));
- spin_unlock(&init_mm.page_table_lock);
- flush_cache_all();
- XEN_flush_page_update_queue();
- return 0;
-err:
- spin_unlock(&init_mm.page_table_lock);
- flush_cache_all();
- if (address > start)
- vmfree_area_pages(start, address - start);
- return -ENOMEM;
-}
-
-int vmalloc_area_pages(unsigned long address, unsigned long size,
- int gfp_mask, pgprot_t prot)
-{
- return __vmalloc_area_pages(address, size, gfp_mask, prot, NULL);
-}
-
-struct vm_struct * get_vm_area(unsigned long size, unsigned long flags)
-{
- unsigned long addr, next;
- struct vm_struct **p, *tmp, *area;
-
- area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
- if (!area)
- return NULL;
-
- size += PAGE_SIZE;
- if (!size) {
- kfree (area);
- return NULL;
- }
-
- addr = VMALLOC_START;
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
- if ((size + addr) < addr)
- goto out;
- if (size + addr <= (unsigned long) tmp->addr)
- break;
- next = tmp->size + (unsigned long) tmp->addr;
- if (next > addr)
- addr = next;
- if (addr > VMALLOC_END-size)
- goto out;
- }
- area->flags = flags;
- area->addr = (void *)addr;
- area->size = size;
- area->next = *p;
- *p = area;
- write_unlock(&vmlist_lock);
- return area;
-
-out:
- write_unlock(&vmlist_lock);
- kfree(area);
- return NULL;
-}
-
-void __vfree(void * addr, int free_area_pages)
-{
- struct vm_struct **p, *tmp;
-
- if (!addr)
- return;
- if ((PAGE_SIZE-1) & (unsigned long) addr) {
- printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
- return;
- }
- write_lock(&vmlist_lock);
- for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
- if (tmp->addr == addr) {
- *p = tmp->next;
- if (free_area_pages)
- vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
- write_unlock(&vmlist_lock);
- kfree(tmp);
- return;
- }
- }
- write_unlock(&vmlist_lock);
- printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr);
-}
-
-void vfree(void * addr)
-{
- __vfree(addr,1);
-}
-
-void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot)
-{
- void * addr;
- struct vm_struct *area;
-
- size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > num_physpages)
- return NULL;
- area = get_vm_area(size, VM_ALLOC);
- if (!area)
- return NULL;
- addr = area->addr;
- if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask,
- prot, NULL)) {
- __vfree(addr, 0);
- return NULL;
- }
- return addr;
-}
-
-void * vmap(struct page **pages, int count,
- unsigned long flags, pgprot_t prot)
-{
- void * addr;
- struct vm_struct *area;
- unsigned long size = count << PAGE_SHIFT;
-
- if (!size || size > (max_mapnr << PAGE_SHIFT))
- return NULL;
- area = get_vm_area(size, flags);
- if (!area) {
- return NULL;
- }
- addr = area->addr;
- if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, 0,
- prot, &pages)) {
- __vfree(addr, 0);
- return NULL;
- }
- return addr;
-}
-
-long vread(char *buf, char *addr, unsigned long count)
-{
- struct vm_struct *tmp;
- char *vaddr, *buf_start = buf;
- unsigned long n;
-
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
-
- read_lock(&vmlist_lock);
- for (tmp = vmlist; tmp; tmp = tmp->next) {
- vaddr = (char *) tmp->addr;
- if (addr >= vaddr + tmp->size - PAGE_SIZE)
- continue;
- while (addr < vaddr) {
- if (count == 0)
- goto finished;
- *buf = '\0';
- buf++;
- addr++;
- count--;
- }
- n = vaddr + tmp->size - PAGE_SIZE - addr;
- do {
- if (count == 0)
- goto finished;
- *buf = *addr;
- buf++;
- addr++;
- count--;
- } while (--n > 0);
- }
-finished:
- read_unlock(&vmlist_lock);
- return buf - buf_start;
-}
-
-long vwrite(char *buf, char *addr, unsigned long count)
-{
- struct vm_struct *tmp;
- char *vaddr, *buf_start = buf;
- unsigned long n;
-
- /* Don't allow overflow */
- if ((unsigned long) addr + count < count)
- count = -(unsigned long) addr;
-
- read_lock(&vmlist_lock);
- for (tmp = vmlist; tmp; tmp = tmp->next) {
- vaddr = (char *) tmp->addr;
- if (addr >= vaddr + tmp->size - PAGE_SIZE)
- continue;
- while (addr < vaddr) {
- if (count == 0)
- goto finished;
- buf++;
- addr++;
- count--;
- }
- n = vaddr + tmp->size - PAGE_SIZE - addr;
- do {
- if (count == 0)
- goto finished;
- *addr = *buf;
- buf++;
- addr++;
- count--;
- } while (--n > 0);
- }
-finished:
- read_unlock(&vmlist_lock);
- return buf - buf_start;
-}
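
[The deleted get_vm_area() pads every request with one extra page before its first-fit walk of the address-sorted vmlist, so each mapping is followed by an unmapped guard page and an overrun faults rather than silently corrupting its neighbour. A user-space sketch of that first-fit-plus-guard-page search, under simplified bounds and with hypothetical names:

    #include <stdlib.h>

    #define PAGE_SIZE   4096UL
    #define AREA_START  0x10000000UL
    #define AREA_END    0x20000000UL

    struct area {
        unsigned long addr, size;   /* size includes the guard page */
        struct area *next;
    };

    static struct area *list;       /* kept sorted by addr */

    /* First-fit in the style of the deleted get_vm_area(): pad the
     * request with one guard page, then take the first hole between
     * existing areas that fits. Storing the padded size means the
     * search treats every guard page as occupied.
     */
    static struct area *area_alloc(unsigned long size)
    {
        struct area **p, *tmp, *a;
        unsigned long addr = AREA_START;

        size += PAGE_SIZE;                    /* trailing guard page */
        if (size < PAGE_SIZE)                 /* request overflowed */
            return NULL;
        a = malloc(sizeof(*a));
        if (!a)
            return NULL;
        for (p = &list; (tmp = *p) != NULL; p = &tmp->next) {
            if (addr + size <= tmp->addr)     /* hole before tmp fits */
                break;
            addr = tmp->addr + tmp->size;     /* skip past tmp */
        }
        if (addr > AREA_END - size) {         /* ran off the arena */
            free(a);
            return NULL;
        }
        a->addr = addr;
        a->size = size;
        a->next = *p;
        *p = a;
        return a;
    }
]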
diff --git a/linux-2.6.11-xen-sparse/arch/xen/Kconfig b/linux-2.6.11-xen-sparse/arch/xen/Kconfig
index 2a8c5f200f..1c2ba9b4a2 100644
--- a/linux-2.6.11-xen-sparse/arch/xen/Kconfig
+++ b/linux-2.6.11-xen-sparse/arch/xen/Kconfig
@@ -114,10 +114,6 @@ config XEN_BLKDEV_TAP
to a character device, allowing device prototyping in application
space. Odds are that you want to say N here.
-config XEN_WRITABLE_PAGETABLES
- bool
- default y
-
config XEN_SCRUB_PAGES
bool "Scrub memory before freeing it to Xen"
default y
diff --git a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig b/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig
index e906f98521..a781740c94 100644
--- a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig
+++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig
@@ -19,7 +19,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
# CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
# CONFIG_XEN_BLKDEV_TAP is not set
-CONFIG_XEN_WRITABLE_PAGETABLES=y
CONFIG_XEN_SCRUB_PAGES=y
CONFIG_X86=y
# CONFIG_X86_64 is not set
diff --git a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig
index 95dee5b159..b1fc951a81 100644
--- a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig
+++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig
@@ -16,7 +16,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
# CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
# CONFIG_XEN_BLKDEV_TAP is not set
-CONFIG_XEN_WRITABLE_PAGETABLES=y
CONFIG_XEN_SCRUB_PAGES=y
CONFIG_X86=y
# CONFIG_X86_64 is not set
diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c
index b8829c8cdc..b7c29174fc 100644
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c
+++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c
@@ -963,7 +963,7 @@ void __init trap_init(void)
* and a callgate to lcall27 for Solaris/x86 binaries
*/
make_lowmem_page_readonly(&default_ldt[0]);
- xen_flush_page_update_queue();
+ flush_page_update_queue();
/*
* Should be a barrier for any external CPU state.
diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c
index 7a0b091ca3..0cac0f30c3 100644
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c
+++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c
@@ -553,7 +553,6 @@ vmalloc_fault:
if (!pmd_present(*pmd_k))
goto no_context;
set_pmd(pmd, *pmd_k);
- xen_flush_page_update_queue(); /* flush PMD update */
pte_k = pte_offset_kernel(pmd_k, address);
if (!pte_present(*pte_k))
diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c
index 62427b2301..f9d8e089e0 100644
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c
+++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c
@@ -48,19 +48,12 @@
*/
static spinlock_t update_lock = SPIN_LOCK_UNLOCKED;
-/* Linux 2.6 isn't using the traditional batched interface. */
+#define QUEUE_SIZE 128
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
-#define QUEUE_SIZE 2048
#define pte_offset_kernel pte_offset
-#define pmd_val_ma(v) (v).pmd;
#define pud_t pgd_t
#define pud_offset(d, va) d
#else
-#ifdef CONFIG_SMP
-#define QUEUE_SIZE 1
-#else
-#define QUEUE_SIZE 128
-#endif
#define pmd_val_ma(v) (v).pud.pgd.pgd;
#endif
diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c
index 6fe3f08632..2682ac5b90 100644
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c
+++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c
@@ -195,7 +195,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
if (pte) {
make_page_readonly(pte);
- xen_flush_page_update_queue();
+ flush_page_update_queue();
}
return pte;
}
diff --git a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c b/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c
index f69db851a4..36c934fc5d 100644
--- a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c
+++ b/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c
@@ -109,10 +109,8 @@ static void __do_suspend(void)
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_4gb_segments);
-#ifdef CONFIG_XEN_WRITABLE_PAGETABLES
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_writable_pagetables);
-#endif
shutting_down = -1;
diff --git a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h
index eb084e8f06..a55be6c23e 100644
--- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h
+++ b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h
@@ -30,6 +30,8 @@
/* -------[ debug / pretty printing ]--------------------------------- */
+#define PRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
#if 0
#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
__FILE__ , __LINE__ , ## _a )
diff --git a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c
index b503b1ec13..93594623b0 100644
--- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c
+++ b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c
@@ -299,7 +299,7 @@ int blktap_write_fe_ring(blkif_request_t *req)
}
if ( RING_FULL(&blktap_ufe_ring) ) {
- DPRINTK("blktap: fe_ring is full, can't add.\n");
+ PRINTK("blktap: fe_ring is full, can't add.\n");
return 0;
}
@@ -383,10 +383,9 @@ static int blktap_read_fe_ring(void)
zap_page_range(blktap_vma, MMAP_VADDR(ID_TO_IDX(resp_s->id), 0),
ar->nr_pages << PAGE_SHIFT, NULL);
write_resp_to_fe_ring(blkif, resp_s);
+ blktap_ufe_ring.rsp_cons = i + 1;
kick_fe_domain(blkif);
}
-
- blktap_ufe_ring.rsp_cons = i;
}
return 0;
}
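
[The blktap_userdev.c fix advances rsp_cons to i + 1 as each response is handled, before the frontend is kicked, instead of assigning it once after the loop; the peer therefore never receives a kick while the entry just handled still looks unconsumed. A sketch of that consumer discipline on a hypothetical ring, assuming a single consumer thread:

    struct ring {
        unsigned int rsp_prod;      /* advanced by the producer */
        unsigned int rsp_cons;      /* advanced by us, the consumer */
        void *slot[8];              /* power-of-two entry array */
    };

    extern void handle_response(void *rsp);
    extern void kick_peer(void);

    /* Drain all published responses. rsp_cons moves to i + 1 inside
     * the loop, so by the time the peer is kicked the entry just
     * handled is already marked consumed and its slot reusable.
     */
    static void drain_responses(struct ring *r)
    {
        unsigned int i;

        for (i = r->rsp_cons; i != r->rsp_prod; i++) {
            handle_response(r->slot[i & 7]);
            r->rsp_cons = i + 1;
            kick_peer();
        }
    }
]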
diff --git a/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c b/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c
index c2cdbf9e72..219b218920 100644
--- a/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c
+++ b/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c
@@ -88,6 +88,8 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
{
int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)?
PRIVCMD_MMAP_SZ:(mmapcmd.num-i);
+
+
if ( copy_from_user(&msg, p, n*sizeof(privcmd_mmap_entry_t)) )
return -EFAULT;
@@ -96,6 +98,7 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
struct vm_area_struct *vma =
find_vma( current->mm, msg[j].va );
+
if ( !vma )
return -EINVAL;
@@ -151,6 +154,7 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
addr = m.addr;
for ( i = 0; i < m.num; i++, addr += PAGE_SIZE, p++ )
{
+
if ( get_user(mfn, p) )
return -EFAULT;
@@ -166,10 +170,12 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
v = w;
}
+
ret = 0;
break;
batch_err:
+ printk(KERN_ALERT "XXX SMH: ERROR IN MMAPBATCH\n");
printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%p %lx-%lx\n",
ret, vma, m.addr, m.num, m.arr, vma->vm_start, vma->vm_end);
break;
@@ -183,7 +189,7 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
pgd_t *pgd = pgd_offset_k(m2pv);
pud_t *pud = pud_offset(pgd, m2pv);
pmd_t *pmd = pmd_offset(pud, m2pv);
- unsigned long m2p_start_mfn = pfn_to_mfn(pmd_val(*pmd) >> PAGE_SHIFT);
+ unsigned long m2p_start_mfn = (*(unsigned long *)pmd) >> PAGE_SHIFT;
ret = put_user(m2p_start_mfn, (unsigned long *)data) ? -EFAULT: 0;
}
break;
diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h
index 345b8264b8..1379b49694 100644
--- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h
@@ -111,7 +111,7 @@ typedef struct { unsigned long pgprot; } pgprot_t;
static inline unsigned long pgd_val(pgd_t x)
{
unsigned long ret = x.pgd;
- if (ret) ret = machine_to_phys(ret);
+ if (ret) ret = machine_to_phys(ret) | 1;
return ret;
}
#define pgprot_val(x) ((x).pgprot)
diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h
index d932c6c17f..dfc5b1e155 100644
--- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h
@@ -407,7 +407,6 @@ extern void noexec_setup(const char *str);
do { \
if (__dirty) { \
if ( likely((__vma)->vm_mm == current->mm) ) { \
- xen_flush_page_update_queue(); \
HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \
} else { \
xen_l1_entry_update((__ptep), (__entry).pte_low); \
@@ -426,7 +425,6 @@ do { \
#define ptep_establish_new(__vma, __address, __ptep, __entry) \
do { \
if (likely((__vma)->vm_mm == current->mm)) { \
- xen_flush_page_update_queue(); \
HYPERVISOR_update_va_mapping((__address), \
__entry, 0); \
} else { \
diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h b/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h
index 4d77312f6e..568e84bc2f 100644
--- a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h
@@ -117,8 +117,6 @@ void _flush_page_update_queue(void);
if (per_cpu(mmu_update_queue_idx, smp_processor_id())) \
_flush_page_update_queue(); \
} while (0)
-#define xen_flush_page_update_queue() (_flush_page_update_queue())
-#define XEN_flush_page_update_queue() (_flush_page_update_queue())
void MULTICALL_flush_page_update_queue(void);
#ifdef CONFIG_XEN_PHYSDEV_ACCESS
diff --git a/tools/blktap/Makefile b/tools/blktap/Makefile
index 7f71a219bf..3478552ac4 100644
--- a/tools/blktap/Makefile
+++ b/tools/blktap/Makefile
@@ -58,7 +58,7 @@ OBJS = $(patsubst %.c,%.o,$(SRCS))
LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
-all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(VDI_TOOLS) parallax
+all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(VDI_TOOLS) parallax parallax-threaded blockstored
$(MAKE) $(LIB)
LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
@@ -120,42 +120,42 @@ blkaio: $(LIB) blkaio.c blkaiolib.c
$(CC) $(CFLAGS) -o blkaio -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap blkaio.c blkaiolib.c -laio -lpthread
parallax: $(LIB) $(PLX_SRCS)
- $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap $(PLX_SRCS) libgnbd/libgnbd.a
+ $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap -lpthread $(PLX_SRCS) libgnbd/libgnbd.a
parallax-threaded: $(LIB) $(PLXT_SRCS)
$(CC) $(CFLAGS) -o parallax-threaded -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lpthread -lblktap $(PLXT_SRCS) libgnbd/libgnbd.a
vdi_test: $(LIB) $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE -lpthread $(VDI_SRCS)
vdi_list: $(LIB) vdi_list.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c -lpthread $(VDI_SRCS)
vdi_create: $(LIB) vdi_create.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c -lpthread $(VDI_SRCS)
vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c -lpthread $(VDI_SRCS)
vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c -lpthread $(VDI_SRCS)
vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c -lpthread $(VDI_SRCS)
vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c -lpthread $(VDI_SRCS)
vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c -lpthread $(VDI_SRCS)
vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
- $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c -lpthread $(VDI_SRCS)
blockstored: blockstored.c
- $(CC) $(CFLAGS) -g3 -o blockstored blockstored.c
+ $(CC) $(CFLAGS) -g3 -o blockstored -lpthread blockstored.c
bstest: bstest.c blockstore.c
- $(CC) $(CFLAGS) -g3 -o bstest bstest.c blockstore.c
+ $(CC) $(CFLAGS) -g3 -o bstest bstest.c -lpthread blockstore.c
.PHONY: TAGS clean install mk-symlinks rpm
TAGS:
diff --git a/tools/blktap/blktaplib.c b/tools/blktap/blktaplib.c
index 35b893f677..87b680d2cc 100644
--- a/tools/blktap/blktaplib.c
+++ b/tools/blktap/blktaplib.c
@@ -248,12 +248,21 @@ static void apply_rsp_hooks(blkif_response_t *rsp)
}
}
+static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER;
+
void blktap_inject_response(blkif_response_t *rsp)
{
+
apply_rsp_hooks(rsp);
+
write_rsp_to_fe_ring(rsp);
+
+ pthread_mutex_lock(&push_mutex);
+
RING_PUSH_RESPONSES(&fe_ring);
ioctl(fd, BLKTAP_IOCTL_KICK_FE);
+
+ pthread_mutex_unlock(&push_mutex);
}
/*-----[ Polling fd listeners ]------------------------------------------*/
@@ -449,7 +458,9 @@ int blktap_listen(void)
}
/* Using this as a unidirectional ring. */
ctrl_ring.req_cons = ctrl_ring.rsp_prod_pvt = i;
+pthread_mutex_lock(&push_mutex);
RING_PUSH_RESPONSES(&ctrl_ring);
+pthread_mutex_unlock(&push_mutex);
/* empty the fe_ring */
notify_fe = 0;
@@ -517,14 +528,18 @@ int blktap_listen(void)
if (notify_be) {
DPRINTF("notifying be\n");
+pthread_mutex_lock(&push_mutex);
RING_PUSH_REQUESTS(&be_ring);
ioctl(fd, BLKTAP_IOCTL_KICK_BE);
+pthread_mutex_unlock(&push_mutex);
}
if (notify_fe) {
DPRINTF("notifying fe\n");
+pthread_mutex_lock(&push_mutex);
RING_PUSH_RESPONSES(&fe_ring);
ioctl(fd, BLKTAP_IOCTL_KICK_FE);
+pthread_mutex_unlock(&push_mutex);
}
}
}
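
[blktap_inject_response() can now be called from worker threads, so every RING_PUSH_*/kick pair in this file is serialized under one push_mutex: a push publishes the private producer count to the shared ring, and two unserialized pushers could interleave and leave a stale index published. A sketch of the publish-under-lock pattern, with hypothetical names:

    #include <pthread.h>

    struct ring { unsigned int rsp_prod, rsp_prod_pvt; };

    extern void kick_frontend(void);

    static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Publish privately queued responses and notify the peer. The
     * shared producer index must not be written while another thread
     * is mid-push, so the store and the kick sit under one mutex.
     */
    static void push_responses(struct ring *r)
    {
        pthread_mutex_lock(&push_mutex);
        /* the core of RING_PUSH_RESPONSES, minus the write barrier */
        r->rsp_prod = r->rsp_prod_pvt;
        kick_frontend();
        pthread_mutex_unlock(&push_mutex);
    }
]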
diff --git a/tools/blktap/blockstore.c b/tools/blktap/blockstore.c
index 5de2a6885a..36903fe09e 100644
--- a/tools/blktap/blockstore.c
+++ b/tools/blktap/blockstore.c
@@ -13,13 +13,16 @@
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/time.h>
#include <stdarg.h>
#include "blockstore.h"
#include <pthread.h>
#include "parallax-threaded.h"
#define BLOCKSTORE_REMOTE
-#define BSDEBUG
+//#define BSDEBUG
+
+#define RETRY_TIMEOUT 1000000 /* microseconds */
/*****************************************************************************
* Debugging
@@ -63,6 +66,37 @@ struct sockaddr_in sin_local;
int bssock = 0;
/*****************************************************************************
+ * Notification *
+ *****************************************************************************/
+
+typedef struct pool_thread_t_struct {
+ pthread_mutex_t ptmutex;
+ pthread_cond_t ptcv;
+ int newdata;
+} pool_thread_t;
+
+pool_thread_t pool_thread[READ_POOL_SIZE+1];
+
+#define RECV_NOTIFY(tid) { \
+ pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
+ pool_thread[tid].newdata = 1; \
+ DB("CV Waking %u", tid); \
+ pthread_cond_signal(&(pool_thread[tid].ptcv)); \
+ pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
+#define RECV_AWAIT(tid) { \
+ pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
+ if (pool_thread[tid].newdata) { \
+ pool_thread[tid].newdata = 0; \
+ DB("CV Woken %u", tid); \
+ } \
+ else { \
+ DB("CV Waiting %u", tid); \
+ pthread_cond_wait(&(pool_thread[tid].ptcv), \
+ &(pool_thread[tid].ptmutex)); \
+ } \
+ pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
+
+/*****************************************************************************
* Message queue management *
*****************************************************************************/
@@ -76,23 +110,6 @@ pthread_mutex_t ptmutex_recv;
#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
-int notify = 0;
-pthread_mutex_t ptmutex_notify;
-pthread_cond_t ptcv_notify;
-#define RECV_NOTIFY { \
- pthread_mutex_lock(&ptmutex_notify); \
- notify = 1; \
- pthread_cond_signal(&ptcv_notify); \
- pthread_mutex_unlock(&ptmutex_notify); }
-#define RECV_AWAIT { \
- pthread_mutex_lock(&ptmutex_notify); \
- if (notify) \
- notify = 0; \
- else \
- pthread_cond_wait(&ptcv_notify, &ptmutex_notify); \
- pthread_mutex_unlock(&ptmutex_notify); }
-
-
/* A message queue entry. We allocate one of these for every request we send.
* Asynchronous reply reception also used one of these.
*/
@@ -104,6 +121,8 @@ typedef struct bsq_t_struct {
int length;
struct msghdr msghdr;
struct iovec iov[2];
+ int tid;
+ struct timeval tv_sent;
bshdr_t message;
void *block;
} bsq_t;
@@ -267,11 +286,13 @@ int send_message(bsq_t *qe) {
qe->message.luid = new_luid();
qe->status = 0;
+ qe->tid = (int)pthread_getspecific(tid_key);
if (enqueue(qe) < 0) {
fprintf(stderr, "Error enqueuing request.\n");
return -1;
}
+ gettimeofday(&(qe->tv_sent), NULL);
DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
//rc = sendto(bssock, (void *)&(qe->message), qe->length, 0,
@@ -407,6 +428,7 @@ void recv_recycle_buffer(bsq_t *q) {
int wait_recv(bsq_t **reqs, int numreqs) {
bsq_t *q, *m;
unsigned int x, i;
+ int tid = (int)pthread_getspecific(tid_key);
DB("ENTER wait_recv %u\n", numreqs);
@@ -420,7 +442,7 @@ int wait_recv(bsq_t **reqs, int numreqs) {
return numreqs;
}
- RECV_AWAIT;
+ RECV_AWAIT(tid);
/*
rxagain:
@@ -442,6 +464,52 @@ int wait_recv(bsq_t **reqs, int numreqs) {
}
+/* retry
+ */
+static int retry_count = 0;
+int retry(bsq_t *qe)
+{
+ int rc;
+ gettimeofday(&(qe->tv_sent), NULL);
+ DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
+ retry_count++;
+ rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
+ if (rc < 0)
+ return rc;
+ return 0;
+}
+
+/* queue runner
+ */
+void *queue_runner(void *arg)
+{
+ for (;;) {
+ struct timeval now;
+ long long nowus, sus;
+ bsq_t *q;
+ int r;
+
+ sleep(1);
+
+ gettimeofday(&now, NULL);
+ nowus = now.tv_usec + now.tv_sec * 1000000;
+ ENTER_QUEUE_CR;
+ r = retry_count;
+ for (q = bs_head; q; q = q->next) {
+ sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000;
+ if ((nowus - sus) > RETRY_TIMEOUT) {
+ if (retry(q) < 0) {
+ fprintf(stderr, "Error on sendmsg retry.\n");
+ }
+ }
+ }
+ if (r != retry_count) {
+ fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
+ }
+ LEAVE_QUEUE_CR;
+ }
+}
+
/* receive loop
*/
void *receive_loop(void *arg)
@@ -461,7 +529,7 @@ void *receive_loop(void *arg)
}
else {
DB("RX MATCH");
- RECV_NOTIFY;
+ RECV_NOTIFY(m->tid);
}
}
}
@@ -1146,8 +1214,12 @@ int __init_blockstore(void)
pthread_mutex_init(&ptmutex_queue, NULL);
pthread_mutex_init(&ptmutex_luid, NULL);
pthread_mutex_init(&ptmutex_recv, NULL);
- pthread_mutex_init(&ptmutex_notify, NULL);
- pthread_cond_init(&ptcv_notify, NULL);
+ /*pthread_mutex_init(&ptmutex_notify, NULL);*/
+ for (i = 0; i <= READ_POOL_SIZE; i++) {
+ pool_thread[i].newdata = 0;
+ pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
+ pthread_cond_init(&(pool_thread[i].ptcv), NULL);
+ }
bsservers[0].hostname = "firebug.cl.cam.ac.uk";
bsservers[1].hostname = "planb.cl.cam.ac.uk";
@@ -1225,6 +1297,7 @@ int __init_blockstore(void)
}
pthread_create(&pthread_recv, NULL, receive_loop, NULL);
+ pthread_create(&pthread_recv, NULL, queue_runner, NULL);
#else /* /BLOCKSTORE_REMOTE */
block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
@@ -1262,9 +1335,14 @@ int __init_blockstore(void)
void __exit_blockstore(void)
{
+ int i;
pthread_mutex_destroy(&ptmutex_recv);
pthread_mutex_destroy(&ptmutex_luid);
pthread_mutex_destroy(&ptmutex_queue);
- pthread_mutex_destroy(&ptmutex_notify);
- pthread_cond_destroy(&ptcv_notify);
+ /*pthread_mutex_destroy(&ptmutex_notify);
+ pthread_cond_destroy(&ptcv_notify);*/
+ for (i = 0; i <= READ_POOL_SIZE; i++) {
+ pthread_mutex_destroy(&(pool_thread[i].ptmutex));
+ pthread_cond_destroy(&(pool_thread[i].ptcv));
+ }
}
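
[The blockstore.c rework replaces the single global notify condvar with a mutex, condvar and newdata flag per pool thread, and tags each queued request with its sender's tid so the receive loop wakes exactly the thread waiting on that reply. The flag is what makes the handoff safe: a signal delivered before the waiter reaches pthread_cond_wait() would otherwise be lost. A minimal sketch of the flag-guarded wait, assuming POSIX threads and illustrative names:

    #include <pthread.h>

    struct waiter {
        pthread_mutex_t mu;
        pthread_cond_t  cv;
        int newdata;            /* records a wakeup that arrived early */
    };

    /* Receive side: mark the waiter's data ready and signal it. */
    static void waiter_notify(struct waiter *w)
    {
        pthread_mutex_lock(&w->mu);
        w->newdata = 1;
        pthread_cond_signal(&w->cv);
        pthread_mutex_unlock(&w->mu);
    }

    /* Request side: if the notify already happened, consume the flag
     * and return at once; otherwise sleep until signalled.
     */
    static void waiter_await(struct waiter *w)
    {
        pthread_mutex_lock(&w->mu);
        if (w->newdata)
            w->newdata = 0;
        else
            pthread_cond_wait(&w->cv, &w->mu);
        pthread_mutex_unlock(&w->mu);
    }

As in the patch, the wait uses if rather than while, so it absorbs one early notification but would fall through on a spurious wakeup; a stricter variant loops on the predicate.]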
diff --git a/tools/blktap/parallax-threaded.h b/tools/blktap/parallax-threaded.h
index 17cdcb983e..de39609fcc 100644
--- a/tools/blktap/parallax-threaded.h
+++ b/tools/blktap/parallax-threaded.h
@@ -14,7 +14,8 @@
#define NOTHREADS
#endif
-#define READ_POOL_SIZE 128
+//#define READ_POOL_SIZE 128
+#define READ_POOL_SIZE 8
/* per-thread identifier */
pthread_key_t tid_key;
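
[parallax-threaded.h drops READ_POOL_SIZE from 128 to 8, and blockstore.c sizes pool_thread[] at READ_POOL_SIZE + 1: pthread_getspecific() returns NULL, i.e. slot 0, in any thread that never set tid_key, which is presumably what the extra entry is for. A sketch of handing each pool thread its slot index through the key, under that assumption — illustrative code, not the patch's actual thread-pool setup:

    #include <pthread.h>
    #include <stdio.h>

    #define READ_POOL_SIZE 8

    pthread_key_t tid_key;

    /* Each pool thread stashes its slot index in tid_key so code deep
     * in the request path (send_message/wait_recv in the patch) can
     * recover it without threading it through every call.
     */
    static void *pool_worker(void *arg)
    {
        pthread_setspecific(tid_key, arg);        /* arg = slot index */
        int tid = (int)(long)pthread_getspecific(tid_key);
        printf("worker %d running\n", tid);
        return NULL;
    }

    int main(void)
    {
        pthread_t th[READ_POOL_SIZE];
        long i;

        pthread_key_create(&tid_key, NULL);
        /* slots 1..READ_POOL_SIZE for workers; 0 is left for threads
         * that never set the key, matching the extra array entry */
        for (i = 0; i < READ_POOL_SIZE; i++)
            pthread_create(&th[i], NULL, pool_worker, (void *)(i + 1));
        for (i = 0; i < READ_POOL_SIZE; i++)
            pthread_join(th[i], NULL);
        return 0;
    }
]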