author     kaf24@labyrinth.cl.cam.ac.uk <kaf24@labyrinth.cl.cam.ac.uk>   2003-01-03 18:24:03 +0000
committer  kaf24@labyrinth.cl.cam.ac.uk <kaf24@labyrinth.cl.cam.ac.uk>   2003-01-03 18:24:03 +0000
commit     66519329ac71070a73949e1d795422e55e66dd00
tree       561f690c7567887b961db089bd444c6d43d34a79
parent     e1c642ea89639d8eff4cafd611542bb7bbb084ae
bitkeeper revision 1.13 (3e15d543UbLg8zdxnspCHQplKUlnzQ)
Many files:
  Clean up page-table update interface. BIG MODIFICATIONS HERE.
mremap.c, swapfile.c, memory.c, exec.c:
  new file
20 files changed, 4621 insertions, 199 deletions
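The heart of this change is the reworked pt_update hypercall interface: the two least-significant bits of each request's 'ptr' field now select the operation (PGREQ_NORMAL, PGREQ_UNCHECKED_UPDATE, PGREQ_EXTENDED_COMMAND), and extended requests carry a PGEXT_* command -- pin/unpin a page table, install a new base pointer, flush the TLB, or invlpg -- in the low byte of 'val'. The old PGREQ_ADD_BASEPTR/PGREQ_REMOVE_BASEPTR pseudo-pointers and the separate __HYPERVISOR_set_pagetable hypercall go away. As a purely illustrative sketch (not part of the committed patch), a guest built against the new hypervisor-if.h could batch requests roughly like this; the function name, the *_maddr parameters, and the header path are invented for the example:

    /*
     * Illustrative sketch only -- not part of this patch.  Encodes a small
     * batch of requests against the new interface.  Addresses passed in are
     * assumed to be fake-physical (already adjusted for phys_base), as the
     * interface comment in asm-xeno/hypervisor.h requires.
     */
    #include <asm/hypervisor.h>   /* HYPERVISOR_pt_update() and the PGREQ/PGEXT encoding */

    static page_update_request_t req[4];

    static void example_pt_update(unsigned long pte_maddr, unsigned long new_pte,
                                  unsigned long pgd_maddr)
    {
        /* Normal update: the low bits of 'ptr' select the operation. */
        req[0].ptr = pte_maddr | PGREQ_NORMAL;
        req[0].val = new_pte;

        /* Pin the new page directory so its type count cannot fall to zero. */
        req[1].ptr = pgd_maddr | PGREQ_EXTENDED_COMMAND;
        req[1].val = PGEXT_PIN_L2_TABLE;

        /* Install it as the new base pointer (this also forces a TLB flush). */
        req[2].ptr = pgd_maddr | PGREQ_EXTENDED_COMMAND;
        req[2].val = PGEXT_NEW_BASEPTR;

        /* A stand-alone TLB flush, shown only to illustrate the encoding. */
        req[3].ptr = PGREQ_EXTENDED_COMMAND;
        req[3].val = PGEXT_TLB_FLUSH;

        HYPERVISOR_pt_update(req, 4);
    }

The xenolinux sparse tree in this patch wraps the same encoding in a 2048-entry update queue (arch/xeno/mm/hypervisor.c) and hands the whole batch to Xen with a single HYPERVISOR_pt_update() call when the queue fills or a flush is forced.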
@@ -290,6 +290,7 @@ 3ddb79bbx682YH6vR2zbVOXwg73ULg xenolinux-2.4.16-sparse/drivers/block/ll_rw_blk.c 3ddb79bcJfHdwrPsjqgI33_OsGdVCg xenolinux-2.4.16-sparse/drivers/block/rd.c 3ddb79bcpVu-IbnqwQqpRqsEbLpsuw xenolinux-2.4.16-sparse/drivers/char/tty_io.c +3e15d5273gfR2fbcYe05kqBSAvCX_w xenolinux-2.4.16-sparse/fs/exec.c 3ddb79bba_zKpuurHVeWfgDkyPoq8A xenolinux-2.4.16-sparse/fs/nfs/nfsroot.c 3ddb79b8VFtfWSCrXKPN2K21zd_vtw xenolinux-2.4.16-sparse/include/asm-xeno/a.out.h 3ddb79b8Zzi13p3OAPV25QgiC3THAQ xenolinux-2.4.16-sparse/include/asm-xeno/apic.h @@ -400,3 +401,6 @@ 3ddb79bb_7YG4U75ZmEic9YXWTW7Vw xenolinux-2.4.16-sparse/include/linux/sunrpc/debug.h 3ddb79bcxkVPfWlZ1PQKvDrfArzOVw xenolinux-2.4.16-sparse/kernel/panic.c 3ddb79bbP31im-mx2NbfthSeqty1Dg xenolinux-2.4.16-sparse/mk +3e15d52e0_j129JPvo7xfYGndVFpwQ xenolinux-2.4.16-sparse/mm/memory.c +3e15d535DLvpzTrLRUIerB69LpJD1g xenolinux-2.4.16-sparse/mm/mremap.c +3e15d531m1Y1_W8ki64AFOU_ua4C4w xenolinux-2.4.16-sparse/mm/swapfile.c diff --git a/xen-2.4.16/arch/i386/Rules.mk b/xen-2.4.16/arch/i386/Rules.mk index 172777193a..9149644cf4 100644 --- a/xen-2.4.16/arch/i386/Rules.mk +++ b/xen-2.4.16/arch/i386/Rules.mk @@ -8,7 +8,7 @@ MONITOR_BASE := 0xE0100000 # Bootloader should load monitor to this real address LOAD_BASE := 0x00100000 CFLAGS := -fno-builtin -O3 -Wall -DMONITOR_BASE=$(MONITOR_BASE) -CFLAGS += -I$(BASEDIR)/include -D__KERNEL__ +CFLAGS += -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG LDFLAGS := -T xeno.lds -N diff --git a/xen-2.4.16/arch/i386/entry.S b/xen-2.4.16/arch/i386/entry.S index ecd878acd1..6152b6bc3a 100644 --- a/xen-2.4.16/arch/i386/entry.S +++ b/xen-2.4.16/arch/i386/entry.S @@ -516,7 +516,7 @@ ENTRY(hypervisor_call_table) .long SYMBOL_NAME(do_set_trap_table) .long SYMBOL_NAME(do_process_page_updates) .long SYMBOL_NAME(do_console_write) - .long SYMBOL_NAME(do_set_pagetable) + .long SYMBOL_NAME(sys_ni_syscall) .long SYMBOL_NAME(do_set_guest_stack) .long SYMBOL_NAME(do_net_update) .long SYMBOL_NAME(do_fpu_taskswitch) diff --git a/xen-2.4.16/common/memory.c b/xen-2.4.16/common/memory.c index 5227ecf05a..4cfb4348f9 100644 --- a/xen-2.4.16/common/memory.c +++ b/xen-2.4.16/common/memory.c @@ -7,8 +7,7 @@ * * Domains trap to process_page_updates with a list of update requests. * This is a list of (ptr, val) pairs, where the requested operation - * is *ptr = val. The exceptions are when ptr is PGREQ_ADD_BASEPTR, or - * PGREQ_REMOVE_BASEPTR. + * is *ptr = val. * * Reference counting of pages: * ---------------------------- @@ -28,6 +27,15 @@ * referred to in its current incarnation. Therefore, a page can only * change its type when its type count is zero. * + * Pinning the page type: + * ---------------------- + * The type of a page can be pinned/unpinned with the commands + * PGEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is, + * pinning is not reference counted, so it can't be nested). + * This is useful to prevent a page's type count falling to zero, at which + * point safety checks would need to be carried out next time the count + * is increased again. 
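For illustration only (not part of the patch hunks): the pinning rule described in the comment above is exactly what the guest-side queue_pgd_pin()/queue_pgd_unpin() helpers added later in this patch emit. A guest that has just built and write-protected a new page directory might use them roughly as follows; the two functions and the 'pgd' argument are hypothetical names for the example:

    /*
     * Sketch only: pin, install, and later unpin a page directory using the
     * queue helpers this patch adds to arch/xeno/mm/hypervisor.c.
     */
    #include <asm/hypervisor.h>   /* queue_*() helpers, XENO_flush_page_update_queue() */
    #include <asm/page.h>         /* __pa(), pgd_t */

    static void example_install_pgd(pgd_t *pgd)   /* pgd: new, write-protected directory */
    {
        queue_pgd_pin(__pa(pgd));        /* PGEXT_PIN_L2_TABLE: type count can't reach zero */
        queue_pt_switch(__pa(pgd));      /* PGEXT_NEW_BASEPTR: make it the current base     */
        XENO_flush_page_update_queue();  /* submit the queued batch in one hypercall        */
    }

    static void example_release_pgd(pgd_t *pgd)   /* called after switching away from pgd */
    {
        queue_pgd_unpin(__pa(pgd));      /* PGEXT_UNPIN_TABLE: the single pin is dropped    */
        XENO_flush_page_update_queue();
    }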
+ * * A further note on writeable page mappings: * ------------------------------------------ * For simplicity, the count of writeable mappings for a page may not @@ -194,6 +202,7 @@ unsigned long max_page; struct list_head free_list; unsigned int free_pfns; +static int tlb_flush[NR_CPUS]; /* * init_frametable: @@ -208,6 +217,8 @@ unsigned long __init init_frametable(unsigned long nr_pages) struct pfn_info *pf; unsigned long page_index; + memset(tlb_flush, 0, sizeof(tlb_flush)); + max_page = nr_pages; frame_table_size = nr_pages * sizeof(struct pfn_info); frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; @@ -440,13 +451,14 @@ static void put_page(unsigned long page_nr, int writeable) ASSERT(page_nr < max_page); page = frame_table + page_nr; ASSERT((page->flags & PG_domain_mask) == current->domain); - ASSERT((((page->flags & PG_type_mask) == PGT_writeable_page) && - (page_type_count(page) != 0)) || - (((page->flags & PG_type_mask) == PGT_none) && - (page_type_count(page) == 0))); - ASSERT((!writeable) || (page_type_count(page) != 0)); + ASSERT((!writeable) || + ((page_type_count(page) != 0) && + ((page->flags & PG_type_mask) == PGT_writeable_page))); if ( writeable && (put_page_type(page) == 0) ) + { + tlb_flush[smp_processor_id()] = 1; page->flags &= ~PG_type_mask; + } put_page_tot(page); } @@ -458,7 +470,7 @@ static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry) if ( (((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >= DOMAIN_ENTRIES_PER_L2_PAGETABLE ) { - MEM_LOG("Illegal L2 update attempt in hypervisor area %p\n", + MEM_LOG("Illegal L2 update attempt in hypervisor area %p", p_l2_entry); goto fail; } @@ -544,6 +556,95 @@ static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry) } +static int do_extended_command(unsigned long ptr, unsigned long val) +{ + int err = 0; + unsigned long pfn = ptr >> PAGE_SHIFT; + struct pfn_info *page = frame_table + pfn; + + switch ( (val & PGEXT_CMD_MASK) ) + { + case PGEXT_PIN_L1_TABLE: + err = get_l1_table(pfn); + goto mark_as_pinned; + case PGEXT_PIN_L2_TABLE: + err = get_l2_table(pfn); + mark_as_pinned: + if ( err ) + { + MEM_LOG("Error while pinning pfn %08lx", pfn); + break; + } + put_page_type(page); + put_page_tot(page); + if ( !(page->type_count & REFCNT_PIN_BIT) ) + { + page->type_count |= REFCNT_PIN_BIT; + page->tot_count |= REFCNT_PIN_BIT; + } + else + { + MEM_LOG("Pfn %08lx already pinned", pfn); + err = 1; + } + break; + + case PGEXT_UNPIN_TABLE: + if ( (page->flags & PG_domain_mask) != current->domain ) + { + err = 1; + MEM_LOG("Page %08lx bad domain (dom=%ld)", + ptr, page->flags & PG_domain_mask); + } + else if ( (page->type_count & REFCNT_PIN_BIT) ) + { + page->type_count &= ~REFCNT_PIN_BIT; + page->tot_count &= ~REFCNT_PIN_BIT; + get_page_type(page); + get_page_tot(page); + ((page->flags & PG_type_mask) == PGT_l1_page_table) ? 
+ put_l1_table(pfn) : put_l2_table(pfn); + } + else + { + err = 1; + MEM_LOG("Pfn %08lx not pinned", pfn); + } + break; + + case PGEXT_NEW_BASEPTR: + err = get_l2_table(pfn); + if ( !err ) + { + put_l2_table(__pa(pagetable_ptr(current->mm.pagetable)) + >> PAGE_SHIFT); + current->mm.pagetable = + mk_pagetable((unsigned long)__va(pfn<<PAGE_SHIFT)); + } + else + { + MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err); + } + /* fall through */ + + case PGEXT_TLB_FLUSH: + tlb_flush[smp_processor_id()] = 1; + break; + + case PGEXT_INVLPG: + __asm__ __volatile__ ("invlpg %0" : : + "m" (*(char*)(val & ~PGEXT_CMD_MASK))); + break; + + default: + MEM_LOG("Invalid extended pt command 0x%08lx", val & PGEXT_CMD_MASK); + err = 1; + break; + } + + return err; +} + /* Apply updates to page table @pagetable_id within the current domain. */ int do_process_page_updates(page_update_request_t *updates, int count) { @@ -559,39 +660,23 @@ int do_process_page_updates(page_update_request_t *updates, int count) kill_domain_with_errmsg("Cannot read page update request"); } - err = 1; - pfn = cur.ptr >> PAGE_SHIFT; - if ( !pfn ) - { - switch ( cur.ptr ) - { - case PGREQ_ADD_BASEPTR: - err = get_l2_table(cur.val >> PAGE_SHIFT); - break; - case PGREQ_REMOVE_BASEPTR: - if ( cur.val == __pa(pagetable_ptr(current->mm.pagetable)) ) - { - MEM_LOG("Attempt to remove current baseptr! %08lx", - cur.val); - } - else - { - err = put_l2_table(cur.val >> PAGE_SHIFT); - } - break; - default: - MEM_LOG("Invalid page update command %08lx", cur.ptr); - break; - } - } - else if ( (cur.ptr & (sizeof(l1_pgentry_t)-1)) || (pfn >= max_page) ) + if ( pfn >= max_page ) { - MEM_LOG("Page out of range (%08lx>%08lx) or misalign %08lx", - pfn, max_page, cur.ptr); + MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page); + kill_domain_with_errmsg("Page update request out of range"); } - else + + err = 1; + + /* Least significant bits of 'ptr' demux the operation type. */ + switch ( cur.ptr & (sizeof(l1_pgentry_t)-1) ) { + + /* + * PGREQ_NORMAL: Normal update to any level of page table. + */ + case PGREQ_NORMAL: page = frame_table + pfn; flags = page->flags; if ( (flags & PG_domain_mask) == current->domain ) @@ -607,20 +692,47 @@ int do_process_page_updates(page_update_request_t *updates, int count) mk_l2_pgentry(cur.val)); break; default: - /* - * This might occur if a page-table update is - * requested before we've inferred the type - * of the containing page. It shouldn't happen - * if page tables are built strictly top-down, so - * we have a MEM_LOG warning message. - */ - MEM_LOG("Unnecessary update to non-pt page %08lx", - cur.ptr); - *(unsigned long *)__va(cur.ptr) = cur.val; - err = 0; + MEM_LOG("Update to non-pt page %08lx", cur.ptr); break; } } + break; + + /* + * PGREQ_UNCHECKED_UPDATE: Make an unchecked update to a + * bottom-level page-table entry. + * Restrictions apply: + * 1. Update only allowed by domain 0. + * 2. Update must be to a level-1 pte belonging to dom0. + */ + case PGREQ_UNCHECKED_UPDATE: + cur.ptr &= ~(sizeof(l1_pgentry_t) - 1); + page = frame_table + pfn; + flags = page->flags; + if ( (flags | current->domain) == PGT_l1_page_table ) + { + *(unsigned long *)__va(cur.ptr) = cur.val; + err = 0; + } + else + { + MEM_LOG("UNCHECKED_UPDATE: Bad domain %d, or" + " bad pte type %08lx", current->domain, flags); + } + break; + + /* + * PGREQ_EXTENDED_COMMAND: Extended command is specified + * in the least-siginificant bits of the 'value' field. 
+ */ + case PGREQ_EXTENDED_COMMAND: + cur.ptr &= ~(sizeof(l1_pgentry_t) - 1); + err = do_extended_command(cur.ptr, cur.val); + break; + + default: + MEM_LOG("Invalid page update command %08lx", cur.ptr); + break; } if ( err ) @@ -631,40 +743,14 @@ int do_process_page_updates(page_update_request_t *updates, int count) updates++; } - __asm__ __volatile__ ("movl %%eax,%%cr3" : : - "a" (__pa(pagetable_ptr(current->mm.pagetable)))); - return(0); -} - - -int do_set_pagetable(unsigned long ptr) -{ - struct pfn_info *page; - unsigned long pfn, flags; - - if ( (ptr & ~PAGE_MASK) ) - { - MEM_LOG("Misaligned new baseptr %08lx", ptr); - return -1; - } - pfn = ptr >> PAGE_SHIFT; - if ( pfn >= max_page ) + if ( tlb_flush[smp_processor_id()] ) { - MEM_LOG("Page out of range (%08lx>%08lx)", pfn, max_page); - return -1; + tlb_flush[smp_processor_id()] = 0; + __asm__ __volatile__ ( + "movl %%eax,%%cr3" : : + "a" (__pa(pagetable_ptr(current->mm.pagetable)))); } - page = frame_table + (ptr >> PAGE_SHIFT); - flags = page->flags; - if ( (flags & (PG_domain_mask|PG_type_mask)) != - (current->domain|PGT_l2_page_table) ) - { - MEM_LOG("Page %08lx bad type/domain (dom=%ld) " - "(type %08lx != expected %08x)", - ptr, flags & PG_domain_mask, flags & PG_type_mask, - PGT_l2_page_table); - return -1; - } - current->mm.pagetable = mk_pagetable((unsigned long)__va(ptr)); - __asm__ __volatile__ ("movl %%eax,%%cr3" : : "a" (ptr)); - return 0; + + return(0); } + diff --git a/xen-2.4.16/common/page_alloc.c b/xen-2.4.16/common/page_alloc.c index 72785c8a35..48966e2acc 100644 --- a/xen-2.4.16/common/page_alloc.c +++ b/xen-2.4.16/common/page_alloc.c @@ -188,7 +188,7 @@ unsigned long __get_free_pages(int mask, int order) if ( i == FREELIST_SIZE ) { printk("Cannot handle page request order %d!\n", order); - return NULL; + return 0; } /* Unlink a chunk. */ diff --git a/xen-2.4.16/include/hypervisor-ifs/hypervisor-if.h b/xen-2.4.16/include/hypervisor-ifs/hypervisor-if.h index 9961f4b96d..3633423bd2 100644 --- a/xen-2.4.16/include/hypervisor-ifs/hypervisor-if.h +++ b/xen-2.4.16/include/hypervisor-ifs/hypervisor-if.h @@ -21,9 +21,30 @@ typedef struct trap_info_st typedef struct { -#define PGREQ_ADD_BASEPTR 0 -#define PGREQ_REMOVE_BASEPTR 1 +/* + * PGREQ_XXX: specified in least-significant bits of 'ptr' field. + * All requests specify relevent PTE or PT address in 'ptr'. + * Normal requests specify update value in 'value'. + * Extended requests specify command in least 8 bits of 'value'. + */ +/* A normal page-table update request. */ +#define PGREQ_NORMAL 0 +/* Make an unchecked update to a base-level pte. */ +#define PGREQ_UNCHECKED_UPDATE 1 +/* An extended command. */ +#define PGREQ_EXTENDED_COMMAND 2 unsigned long ptr, val; /* *ptr = val */ +/* Announce a new top-level page table. 
*/ +#define PGEXT_PIN_L1_TABLE 0 +#define PGEXT_PIN_L2_TABLE 1 +#define PGEXT_PIN_L3_TABLE 2 +#define PGEXT_PIN_L4_TABLE 3 +#define PGEXT_UNPIN_TABLE 4 +#define PGEXT_NEW_BASEPTR 5 +#define PGEXT_TLB_FLUSH 6 +#define PGEXT_INVLPG 7 +#define PGEXT_CMD_MASK 255 +#define PGEXT_CMD_SHIFT 8 } page_update_request_t; @@ -32,7 +53,7 @@ typedef struct #define __HYPERVISOR_set_trap_table 0 #define __HYPERVISOR_pt_update 1 #define __HYPERVISOR_console_write 2 -#define __HYPERVISOR_set_pagetable 3 +/* vector 3 unused */ #define __HYPERVISOR_set_guest_stack 4 #define __HYPERVISOR_net_update 5 #define __HYPERVISOR_fpu_taskswitch 6 diff --git a/xen-2.4.16/include/xeno/mm.h b/xen-2.4.16/include/xeno/mm.h index 8a547ea9cc..9179ee201c 100644 --- a/xen-2.4.16/include/xeno/mm.h +++ b/xen-2.4.16/include/xeno/mm.h @@ -62,6 +62,13 @@ typedef struct pfn_info { unsigned long type_count; /* pagetable/dir, or domain-writeable refs. */ } frame_table_t; +/* + * We use a high bit to indicate that a page is pinned. + * We do not use the top bit as that would mean that we'd get confused with + * -ve error numbers in some places in common/memory.c. + */ +#define REFCNT_PIN_BIT 0x40000000UL + #define get_page_tot(p) ((p)->tot_count++) #define put_page_tot(p) (--(p)->tot_count) #define page_tot_count(p) ((p)->tot_count) diff --git a/xenolinux-2.4.16-sparse/arch/xeno/kernel/head.S b/xenolinux-2.4.16-sparse/arch/xeno/kernel/head.S index b1a4e3eb21..f05ebc7b6f 100644 --- a/xenolinux-2.4.16-sparse/arch/xeno/kernel/head.S +++ b/xenolinux-2.4.16-sparse/arch/xeno/kernel/head.S @@ -57,5 +57,11 @@ ENTRY(stack_start) ENTRY(empty_zero_page) .org 0x2000 +ENTRY(cpu0_pte_quicklist) + +.org 0x2400 +ENTRY(cpu0_pgd_quicklist) + +.org 0x2800 ENTRY(stext) ENTRY(_stext) diff --git a/xenolinux-2.4.16-sparse/arch/xeno/kernel/setup.c b/xenolinux-2.4.16-sparse/arch/xeno/kernel/setup.c index 3d4325f0e9..6a98b88883 100644 --- a/xenolinux-2.4.16-sparse/arch/xeno/kernel/setup.c +++ b/xenolinux-2.4.16-sparse/arch/xeno/kernel/setup.c @@ -145,14 +145,20 @@ void __init setup_arch(char **cmdline_p) unsigned long bootmap_size; char str[256]; int strcnt; - void hypervisor_callback(void); - void failsafe_callback(void); + extern void hypervisor_callback(void); + extern void failsafe_callback(void); + + extern unsigned long cpu0_pte_quicklist[]; + extern unsigned long cpu0_pgd_quicklist[]; HYPERVISOR_shared_info->event_address = (unsigned long)hypervisor_callback; HYPERVISOR_shared_info->failsafe_address = (unsigned long)failsafe_callback; + boot_cpu_data.pgd_quick = cpu0_pgd_quicklist; + boot_cpu_data.pte_quick = cpu0_pte_quicklist; + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); memset(&drive_info, 0, sizeof(drive_info)); memset(&screen_info, 0, sizeof(screen_info)); diff --git a/xenolinux-2.4.16-sparse/arch/xeno/mm/fault.c b/xenolinux-2.4.16-sparse/arch/xeno/mm/fault.c index 18e21674a9..c2cd7262e9 100644 --- a/xenolinux-2.4.16-sparse/arch/xeno/mm/fault.c +++ b/xenolinux-2.4.16-sparse/arch/xeno/mm/fault.c @@ -155,9 +155,32 @@ asmlinkage void do_page_fault(struct pt_regs *regs, siginfo_t info; /* Set the "privileged fault" bit to something sane. 
*/ - error_code &= ~4; + error_code &= 3; error_code |= (regs->xcs & 2) << 1; +#if PT_UPDATE_DEBUG > 0 + if ( (error_code == 0) && (address >= TASK_SIZE) ) + { + unsigned long paddr = __pa(address); + int i; + for ( i = 0; i < pt_update_queue_idx; i++ ) + { + if ( update_debug_queue[i].ptr == paddr ) + { + printk("XXX now(EIP=%08lx:ptr=%08lx) " + "then(%s/%d:p/v=%08lx/%08lx)\n", + regs->eip, address, + update_debug_queue[i].file, + update_debug_queue[i].line, + update_debug_queue[i].ptr, + update_debug_queue[i].val); + } + } + } +#endif + + if ( flush_page_update_queue() != 0 ) return; + /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. @@ -291,12 +314,14 @@ no_context: printk(" printing eip:\n"); printk("%08lx\n", regs->eip); page = ((unsigned long *) cur_pgd)[address >> 22]; - printk(KERN_ALERT "*pde = %08lx\n", page); + printk(KERN_ALERT "*pde = %08lx(%08lx)\n", page, page - start_info.phys_base); if (page & 1) { page &= PAGE_MASK; address &= 0x003ff000; + page -= start_info.phys_base; page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); + printk(KERN_ALERT "*pte = %08lx(%08lx)\n", page, + page - start_info.phys_base); } die("Oops", regs, error_code); bust_spinlocks(0); @@ -366,6 +391,7 @@ vmalloc_fault: if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); + XENO_flush_page_update_queue(); /* flush PMD update */ pte_k = pte_offset(pmd_k, address); if (!pte_present(*pte_k)) diff --git a/xenolinux-2.4.16-sparse/arch/xeno/mm/hypervisor.c b/xenolinux-2.4.16-sparse/arch/xeno/mm/hypervisor.c index 8454c9b36a..7e33eaa53e 100644 --- a/xenolinux-2.4.16-sparse/arch/xeno/mm/hypervisor.c +++ b/xenolinux-2.4.16-sparse/arch/xeno/mm/hypervisor.c @@ -7,52 +7,162 @@ */ #include <linux/config.h> +#include <linux/sched.h> #include <asm/hypervisor.h> #include <asm/page.h> #include <asm/pgtable.h> -#define QUEUE_SIZE 1 +#define QUEUE_SIZE 2048 static page_update_request_t update_queue[QUEUE_SIZE]; +unsigned int pt_update_queue_idx = 0; +#define idx pt_update_queue_idx + +#if PT_UPDATE_DEBUG > 0 +page_update_debug_t update_debug_queue[QUEUE_SIZE] = {{0}}; +#undef queue_l1_entry_update +#undef queue_l2_entry_update +static void DEBUG_allow_pt_reads(void) +{ + pte_t *pte; + page_update_request_t update; + int i; + for ( i = idx-1; i >= 0; i-- ) + { + pte = update_debug_queue[i].ptep; + if ( pte == NULL ) continue; + update_debug_queue[i].ptep = NULL; + update.ptr = __pa(pte) + start_info.phys_base; + update.val = update_debug_queue[i].pteval; + HYPERVISOR_pt_update(&update, 1); + } +} +static void DEBUG_disallow_pt_read(unsigned long pa) +{ + pte_t *pte; + pmd_t *pmd; + pgd_t *pgd; + unsigned long pteval; + /* + * We may fault because of an already outstanding update. + * That's okay -- it'll get fixed up in the fault handler. 
+ */ + page_update_request_t update; + unsigned long va = (unsigned long)__va(pa); + pgd = pgd_offset_k(va); + pmd = pmd_offset(pgd, va); + pte = pte_offset(pmd, va); + update.ptr = __pa(pte) + start_info.phys_base; + pteval = *(unsigned long *)pte; + update.val = pteval & ~_PAGE_PRESENT; + HYPERVISOR_pt_update(&update, 1); + update_debug_queue[idx].ptep = pte; + update_debug_queue[idx].pteval = pteval; +} +#endif + +#if PT_UPDATE_DEBUG > 1 +#undef queue_pt_switch +#undef queue_tlb_flush +#undef queue_invlpg +#undef queue_pgd_pin +#undef queue_pgd_unpin +#undef queue_pte_pin +#undef queue_pte_unpin +#endif + + +/* + * This is the current pagetable base pointer, which is updated + * on context switch. + */ +unsigned long pt_baseptr; + +void _flush_page_update_queue(void) +{ + if ( idx == 0 ) return; +#if PT_UPDATE_DEBUG > 1 + printk("Flushing %d entries from pt update queue\n", idx); +#endif +#if PT_UPDATE_DEBUG > 0 + DEBUG_allow_pt_reads(); +#endif + HYPERVISOR_pt_update(update_queue, idx); + idx = 0; +} + +static void increment_index(void) +{ + if ( ++idx == QUEUE_SIZE ) _flush_page_update_queue(); +} void queue_l1_entry_update(unsigned long ptr, unsigned long val) { - update_queue[0].ptr = ptr + start_info.phys_base; - update_queue[0].val = val; - flush_page_update_queue(); +#if PT_UPDATE_DEBUG > 0 + DEBUG_disallow_pt_read(ptr); +#endif + update_queue[idx].ptr = ptr + start_info.phys_base; + update_queue[idx].val = val; + increment_index(); } void queue_l2_entry_update(unsigned long ptr, unsigned long val) { - update_queue[0].ptr = ptr + start_info.phys_base; - update_queue[0].val = val; - flush_page_update_queue(); + update_queue[idx].ptr = ptr + start_info.phys_base; + update_queue[idx].val = val; + increment_index(); } -void queue_baseptr_create(unsigned long ptr) +void queue_pt_switch(unsigned long ptr) { - update_queue[0].ptr = PGREQ_ADD_BASEPTR; - update_queue[0].val = ptr + start_info.phys_base; - flush_page_update_queue(); + update_queue[idx].ptr = ptr + start_info.phys_base; + update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND; + update_queue[idx].val = PGEXT_NEW_BASEPTR; + increment_index(); } -void queue_baseptr_remove(unsigned long ptr) +void queue_tlb_flush(void) { - update_queue[0].ptr = PGREQ_REMOVE_BASEPTR; - update_queue[0].val = ptr + start_info.phys_base; - flush_page_update_queue(); + update_queue[idx].ptr = PGREQ_EXTENDED_COMMAND; + update_queue[idx].val = PGEXT_TLB_FLUSH; + increment_index(); } -void queue_tlb_flush(void) +void queue_invlpg(unsigned long ptr) +{ + update_queue[idx].ptr = PGREQ_EXTENDED_COMMAND; + update_queue[idx].val = ptr & PAGE_MASK; + update_queue[idx].val |= PGEXT_INVLPG; + increment_index(); +} + +void queue_pgd_pin(unsigned long ptr) +{ + update_queue[idx].ptr = ptr + start_info.phys_base; + update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND; + update_queue[idx].val = PGEXT_PIN_L2_TABLE; + increment_index(); +} + +void queue_pgd_unpin(unsigned long ptr) { - /* nothing */ + update_queue[idx].ptr = ptr + start_info.phys_base; + update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND; + update_queue[idx].val = PGEXT_UNPIN_TABLE; + increment_index(); } -void queue_tlb_flush_one(unsigned long ptr) +void queue_pte_pin(unsigned long ptr) { - /* nothing */ + update_queue[idx].ptr = ptr + start_info.phys_base; + update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND; + update_queue[idx].val = PGEXT_PIN_L1_TABLE; + increment_index(); } -void flush_page_update_queue(void) +void queue_pte_unpin(unsigned long ptr) { - HYPERVISOR_pt_update(update_queue, 1); + 
update_queue[idx].ptr = ptr + start_info.phys_base; + update_queue[idx].ptr |= PGREQ_EXTENDED_COMMAND; + update_queue[idx].val = PGEXT_UNPIN_TABLE; + increment_index(); } diff --git a/xenolinux-2.4.16-sparse/arch/xeno/mm/init.c b/xenolinux-2.4.16-sparse/arch/xeno/mm/init.c index 022a511cdd..3641429eb7 100644 --- a/xenolinux-2.4.16-sparse/arch/xeno/mm/init.c +++ b/xenolinux-2.4.16-sparse/arch/xeno/mm/init.c @@ -46,15 +46,11 @@ int do_check_pgt_cache(int low, int high) int freed = 0; if(pgtable_cache_size > high) { do { - if (pgd_quicklist) { + if (!QUICKLIST_EMPTY(pgd_quicklist)) { free_pgd_slow(get_pgd_fast()); freed++; } - if (pmd_quicklist) { - pmd_free_slow(pmd_alloc_one_fast(NULL, 0)); - freed++; - } - if (pte_quicklist) { + if (!QUICKLIST_EMPTY(pte_quicklist)) { pte_free_slow(pte_alloc_one_fast(NULL, 0)); freed++; } diff --git a/xenolinux-2.4.16-sparse/fs/exec.c b/xenolinux-2.4.16-sparse/fs/exec.c new file mode 100644 index 0000000000..700c6caa50 --- /dev/null +++ b/xenolinux-2.4.16-sparse/fs/exec.c @@ -0,0 +1,986 @@ +/* + * linux/fs/exec.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * #!-checking implemented by tytso. + */ +/* + * Demand-loading implemented 01.12.91 - no need to read anything but + * the header into memory. The inode of the executable is put into + * "current->executable", and page faults do the actual loading. Clean. + * + * Once more I can proudly say that linux stood up to being changed: it + * was less than 2 hours work to get demand-loading completely implemented. + * + * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, + * current->executable is only used by the procfs. This allows a dispatch + * table to check for several different types of binary formats. We keep + * trying until we recognize the file or we run out of supported binary + * formats. + */ + +#include <linux/config.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/mman.h> +#include <linux/a.out.h> +#include <linux/stat.h> +#include <linux/fcntl.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> +#include <linux/personality.h> +#define __NO_VERSION__ +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/mmu_context.h> + +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif + +int core_uses_pid; + +static struct linux_binfmt *formats; +static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED; + +int register_binfmt(struct linux_binfmt * fmt) +{ + struct linux_binfmt ** tmp = &formats; + + if (!fmt) + return -EINVAL; + if (fmt->next) + return -EBUSY; + write_lock(&binfmt_lock); + while (*tmp) { + if (fmt == *tmp) { + write_unlock(&binfmt_lock); + return -EBUSY; + } + tmp = &(*tmp)->next; + } + fmt->next = formats; + formats = fmt; + write_unlock(&binfmt_lock); + return 0; +} + +int unregister_binfmt(struct linux_binfmt * fmt) +{ + struct linux_binfmt ** tmp = &formats; + + write_lock(&binfmt_lock); + while (*tmp) { + if (fmt == *tmp) { + *tmp = fmt->next; + write_unlock(&binfmt_lock); + return 0; + } + tmp = &(*tmp)->next; + } + write_unlock(&binfmt_lock); + return -EINVAL; +} + +static inline void put_binfmt(struct linux_binfmt * fmt) +{ + if (fmt->module) + __MOD_DEC_USE_COUNT(fmt->module); +} + +/* + * Note that a shared library must be both readable and executable due to + * security reasons. + * + * Also note that we take the address to load from from the file itself. 
+ */ +asmlinkage long sys_uselib(const char * library) +{ + struct file * file; + struct nameidata nd; + int error; + + error = user_path_walk(library, &nd); + if (error) + goto out; + + error = -EINVAL; + if (!S_ISREG(nd.dentry->d_inode->i_mode)) + goto exit; + + error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC); + if (error) + goto exit; + + file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + error = -ENOEXEC; + if(file->f_op && file->f_op->read) { + struct linux_binfmt * fmt; + + read_lock(&binfmt_lock); + for (fmt = formats ; fmt ; fmt = fmt->next) { + if (!fmt->load_shlib) + continue; + if (!try_inc_mod_count(fmt->module)) + continue; + read_unlock(&binfmt_lock); + error = fmt->load_shlib(file); + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (error != -ENOEXEC) + break; + } + read_unlock(&binfmt_lock); + } + fput(file); +out: + return error; +exit: + path_release(&nd); + goto out; +} + +/* + * count() counts the number of arguments/envelopes + */ +static int count(char ** argv, int max) +{ + int i = 0; + + if (argv != NULL) { + for (;;) { + char * p; + + if (get_user(p, argv)) + return -EFAULT; + if (!p) + break; + argv++; + if(++i > max) + return -E2BIG; + } + } + return i; +} + +/* + * 'copy_strings()' copies argument/envelope strings from user + * memory to free pages in kernel mem. These are in a format ready + * to be put directly into the top of new user memory. + */ +int copy_strings(int argc,char ** argv, struct linux_binprm *bprm) +{ + while (argc-- > 0) { + char *str; + int len; + unsigned long pos; + + if (get_user(str, argv+argc) || !(len = strnlen_user(str, bprm->p))) + return -EFAULT; + if (bprm->p < len) + return -E2BIG; + + bprm->p -= len; + /* XXX: add architecture specific overflow check here. */ + + pos = bprm->p; + while (len > 0) { + char *kaddr; + int i, new, err; + struct page *page; + int offset, bytes_to_copy; + + offset = pos % PAGE_SIZE; + i = pos/PAGE_SIZE; + page = bprm->page[i]; + new = 0; + if (!page) { + page = alloc_page(GFP_HIGHUSER); + bprm->page[i] = page; + if (!page) + return -ENOMEM; + new = 1; + } + kaddr = kmap(page); + + if (new && offset) + memset(kaddr, 0, offset); + bytes_to_copy = PAGE_SIZE - offset; + if (bytes_to_copy > len) { + bytes_to_copy = len; + if (new) + memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len); + } + err = copy_from_user(kaddr + offset, str, bytes_to_copy); + kunmap(page); + + if (err) + return -EFAULT; + + pos += bytes_to_copy; + str += bytes_to_copy; + len -= bytes_to_copy; + } + } + return 0; +} + +/* + * Like copy_strings, but get argv and its values from kernel memory. + */ +int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) +{ + int r; + mm_segment_t oldfs = get_fs(); + set_fs(KERNEL_DS); + r = copy_strings(argc, argv, bprm); + set_fs(oldfs); + return r; +} + +/* + * This routine is used to map in a page into an address space: needed by + * execve() for the initial stack and environment pages. + * + * tsk->mmap_sem is held for writing. 
+ */ +void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address) +{ + pgd_t * pgd; + pmd_t * pmd; + pte_t * pte; + + if (page_count(page) != 1) + printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); + pgd = pgd_offset(tsk->mm, address); + + spin_lock(&tsk->mm->page_table_lock); + pmd = pmd_alloc(tsk->mm, pgd, address); + if (!pmd) + goto out; + pte = pte_alloc(tsk->mm, pmd, address); + if (!pte) + goto out; + if (!pte_none(*pte)) + goto out; + lru_cache_add(page); + flush_dcache_page(page); + flush_page_to_ram(page); + set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + XENO_flush_page_update_queue(); + tsk->mm->rss++; + spin_unlock(&tsk->mm->page_table_lock); + + /* no need for flush_tlb */ + return; +out: + spin_unlock(&tsk->mm->page_table_lock); + __free_page(page); + force_sig(SIGKILL, tsk); + return; +} + +int setup_arg_pages(struct linux_binprm *bprm) +{ + unsigned long stack_base; + struct vm_area_struct *mpnt; + int i; + + stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE; + + bprm->p += stack_base; + if (bprm->loader) + bprm->loader += stack_base; + bprm->exec += stack_base; + + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!mpnt) + return -ENOMEM; + + down_write(¤t->mm->mmap_sem); + { + mpnt->vm_mm = current->mm; + mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; + mpnt->vm_end = STACK_TOP; + mpnt->vm_page_prot = PAGE_COPY; + mpnt->vm_flags = VM_STACK_FLAGS; + mpnt->vm_ops = NULL; + mpnt->vm_pgoff = 0; + mpnt->vm_file = NULL; + mpnt->vm_private_data = (void *) 0; + insert_vm_struct(current->mm, mpnt); + current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + } + + for (i = 0 ; i < MAX_ARG_PAGES ; i++) { + struct page *page = bprm->page[i]; + if (page) { + bprm->page[i] = NULL; + put_dirty_page(current,page,stack_base); + } + stack_base += PAGE_SIZE; + } + up_write(¤t->mm->mmap_sem); + + return 0; +} + +struct file *open_exec(const char *name) +{ + struct nameidata nd; + struct inode *inode; + struct file *file; + int err = 0; + + if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) + err = path_walk(name, &nd); + file = ERR_PTR(err); + if (!err) { + inode = nd.dentry->d_inode; + file = ERR_PTR(-EACCES); + if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && + S_ISREG(inode->i_mode)) { + int err = permission(inode, MAY_EXEC); + if (!err && !(inode->i_mode & 0111)) + err = -EACCES; + file = ERR_PTR(err); + if (!err) { + file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { + fput(file); + file = ERR_PTR(err); + } + } +out: + return file; + } + } + path_release(&nd); + } + goto out; +} + +int kernel_read(struct file *file, unsigned long offset, + char * addr, unsigned long count) +{ + mm_segment_t old_fs; + loff_t pos = offset; + int result = -ENOSYS; + + if (!file->f_op->read) + goto fail; + old_fs = get_fs(); + set_fs(get_ds()); + result = file->f_op->read(file, addr, count, &pos); + set_fs(old_fs); +fail: + return result; +} + +static int exec_mmap(void) +{ + struct mm_struct * mm, * old_mm; + + old_mm = current->mm; + if (old_mm && atomic_read(&old_mm->mm_users) == 1) { + mm_release(); + exit_mmap(old_mm); + return 0; + } + + mm = mm_alloc(); + if (mm) { + struct mm_struct *active_mm; + + if (init_new_context(current, mm)) { + mmdrop(mm); + return -ENOMEM; + } + + /* Add it to the list of mm's */ + spin_lock(&mmlist_lock); + list_add(&mm->mmlist, &init_mm.mmlist); + mmlist_nr++; + spin_unlock(&mmlist_lock); + + 
task_lock(current); + active_mm = current->active_mm; + current->mm = mm; + current->active_mm = mm; + task_unlock(current); + activate_mm(active_mm, mm); + mm_release(); + if (old_mm) { + if (active_mm != old_mm) BUG(); + mmput(old_mm); + return 0; + } + mmdrop(active_mm); + return 0; + } + return -ENOMEM; +} + +/* + * This function makes sure the current process has its own signal table, + * so that flush_signal_handlers can later reset the handlers without + * disturbing other processes. (Other processes might share the signal + * table via the CLONE_SIGNAL option to clone().) + */ + +static inline int make_private_signals(void) +{ + struct signal_struct * newsig; + + if (atomic_read(¤t->sig->count) <= 1) + return 0; + newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL); + if (newsig == NULL) + return -ENOMEM; + spin_lock_init(&newsig->siglock); + atomic_set(&newsig->count, 1); + memcpy(newsig->action, current->sig->action, sizeof(newsig->action)); + spin_lock_irq(¤t->sigmask_lock); + current->sig = newsig; + spin_unlock_irq(¤t->sigmask_lock); + return 0; +} + +/* + * If make_private_signals() made a copy of the signal table, decrement the + * refcount of the original table, and free it if necessary. + * We don't do that in make_private_signals() so that we can back off + * in flush_old_exec() if an error occurs after calling make_private_signals(). + */ + +static inline void release_old_signals(struct signal_struct * oldsig) +{ + if (current->sig == oldsig) + return; + if (atomic_dec_and_test(&oldsig->count)) + kmem_cache_free(sigact_cachep, oldsig); +} + +/* + * These functions flushes out all traces of the currently running executable + * so that a new one can be started + */ + +static inline void flush_old_files(struct files_struct * files) +{ + long j = -1; + + write_lock(&files->file_lock); + for (;;) { + unsigned long set, i; + + j++; + i = j * __NFDBITS; + if (i >= files->max_fds || i >= files->max_fdset) + break; + set = files->close_on_exec->fds_bits[j]; + if (!set) + continue; + files->close_on_exec->fds_bits[j] = 0; + write_unlock(&files->file_lock); + for ( ; set ; i++,set >>= 1) { + if (set & 1) { + sys_close(i); + } + } + write_lock(&files->file_lock); + + } + write_unlock(&files->file_lock); +} + +/* + * An execve() will automatically "de-thread" the process. + * Note: we don't have to hold the tasklist_lock to test + * whether we migth need to do this. If we're not part of + * a thread group, there is no way we can become one + * dynamically. And if we are, we only need to protect the + * unlink - even if we race with the last other thread exit, + * at worst the list_del_init() might end up being a no-op. + */ +static inline void de_thread(struct task_struct *tsk) +{ + if (!list_empty(&tsk->thread_group)) { + write_lock_irq(&tasklist_lock); + list_del_init(&tsk->thread_group); + write_unlock_irq(&tasklist_lock); + } + + /* Minor oddity: this might stay the same. 
*/ + tsk->tgid = tsk->pid; +} + +int flush_old_exec(struct linux_binprm * bprm) +{ + char * name; + int i, ch, retval; + struct signal_struct * oldsig; + + /* + * Make sure we have a private signal table + */ + oldsig = current->sig; + retval = make_private_signals(); + if (retval) goto flush_failed; + + /* + * Release all of the old mmap stuff + */ + retval = exec_mmap(); + if (retval) goto mmap_failed; + + /* This is the point of no return */ + release_old_signals(oldsig); + + current->sas_ss_sp = current->sas_ss_size = 0; + + if (current->euid == current->uid && current->egid == current->gid) + current->mm->dumpable = 1; + name = bprm->filename; + for (i=0; (ch = *(name++)) != '\0';) { + if (ch == '/') + i = 0; + else + if (i < 15) + current->comm[i++] = ch; + } + current->comm[i] = '\0'; + + flush_thread(); + + de_thread(current); + + if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || + permission(bprm->file->f_dentry->d_inode,MAY_READ)) + current->mm->dumpable = 0; + + /* An exec changes our domain. We are no longer part of the thread + group */ + + current->self_exec_id++; + + flush_signal_handlers(current); + flush_old_files(current->files); + + return 0; + +mmap_failed: +flush_failed: + spin_lock_irq(¤t->sigmask_lock); + if (current->sig != oldsig) { + kfree(current->sig); + current->sig = oldsig; + } + spin_unlock_irq(¤t->sigmask_lock); + return retval; +} + +/* + * We mustn't allow tracing of suid binaries, unless + * the tracer has the capability to trace anything.. + */ +static inline int must_not_trace_exec(struct task_struct * p) +{ + return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP); +} + +/* + * Fill the binprm structure from the inode. + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + */ +int prepare_binprm(struct linux_binprm *bprm) +{ + int mode; + struct inode * inode = bprm->file->f_dentry->d_inode; + + mode = inode->i_mode; + /* + * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, + * vfs_permission lets a non-executable through + */ + if (!(mode & 0111)) /* with at least _one_ execute bit set */ + return -EACCES; + if (bprm->file->f_op == NULL) + return -EACCES; + + bprm->e_uid = current->euid; + bprm->e_gid = current->egid; + + if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { + /* Set-uid? */ + if (mode & S_ISUID) + bprm->e_uid = inode->i_uid; + + /* Set-gid? */ + /* + * If setgid is set but no group execute bit then this + * is a candidate for mandatory locking, not a setgid + * executable. + */ + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) + bprm->e_gid = inode->i_gid; + } + + /* We don't have VFS support for capabilities yet */ + cap_clear(bprm->cap_inheritable); + cap_clear(bprm->cap_permitted); + cap_clear(bprm->cap_effective); + + /* To support inheritance of root-permissions and suid-root + * executables under compatibility mode, we raise all three + * capability sets for the file. + * + * If only the real uid is 0, we only raise the inheritable + * and permitted sets of the executable file. + */ + + if (!issecure(SECURE_NOROOT)) { + if (bprm->e_uid == 0 || current->uid == 0) { + cap_set_full(bprm->cap_inheritable); + cap_set_full(bprm->cap_permitted); + } + if (bprm->e_uid == 0) + cap_set_full(bprm->cap_effective); + } + + memset(bprm->buf,0,BINPRM_BUF_SIZE); + return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE); +} + +/* + * This function is used to produce the new IDs and capabilities + * from the old ones and the file's capabilities. 
+ * + * The formula used for evolving capabilities is: + * + * pI' = pI + * (***) pP' = (fP & X) | (fI & pI) + * pE' = pP' & fE [NB. fE is 0 or ~0] + * + * I=Inheritable, P=Permitted, E=Effective // p=process, f=file + * ' indicates post-exec(), and X is the global 'cap_bset'. + * + */ + +void compute_creds(struct linux_binprm *bprm) +{ + kernel_cap_t new_permitted, working; + int do_unlock = 0; + + new_permitted = cap_intersect(bprm->cap_permitted, cap_bset); + working = cap_intersect(bprm->cap_inheritable, + current->cap_inheritable); + new_permitted = cap_combine(new_permitted, working); + + if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || + !cap_issubset(new_permitted, current->cap_permitted)) { + current->mm->dumpable = 0; + + lock_kernel(); + if (must_not_trace_exec(current) + || atomic_read(¤t->fs->count) > 1 + || atomic_read(¤t->files->count) > 1 + || atomic_read(¤t->sig->count) > 1) { + if(!capable(CAP_SETUID)) { + bprm->e_uid = current->uid; + bprm->e_gid = current->gid; + } + if(!capable(CAP_SETPCAP)) { + new_permitted = cap_intersect(new_permitted, + current->cap_permitted); + } + } + do_unlock = 1; + } + + + /* For init, we want to retain the capabilities set + * in the init_task struct. Thus we skip the usual + * capability rules */ + if (current->pid != 1) { + current->cap_permitted = new_permitted; + current->cap_effective = + cap_intersect(new_permitted, bprm->cap_effective); + } + + /* AUD: Audit candidate if current->cap_effective is set */ + + current->suid = current->euid = current->fsuid = bprm->e_uid; + current->sgid = current->egid = current->fsgid = bprm->e_gid; + + if(do_unlock) + unlock_kernel(); + current->keep_capabilities = 0; +} + + +void remove_arg_zero(struct linux_binprm *bprm) +{ + if (bprm->argc) { + unsigned long offset; + char * kaddr; + struct page *page; + + offset = bprm->p % PAGE_SIZE; + goto inside; + + while (bprm->p++, *(kaddr+offset++)) { + if (offset != PAGE_SIZE) + continue; + offset = 0; + kunmap(page); +inside: + page = bprm->page[bprm->p/PAGE_SIZE]; + kaddr = kmap(page); + } + kunmap(page); + bprm->argc--; + } +} + +/* + * cycle the list of binary formats handler, until one recognizes the image + */ +int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) +{ + int try,retval=0; + struct linux_binfmt *fmt; +#ifdef __alpha__ + /* handle /sbin/loader.. */ + { + struct exec * eh = (struct exec *) bprm->buf; + + if (!bprm->loader && eh->fh.f_magic == 0x183 && + (eh->fh.f_flags & 0x3000) == 0x3000) + { + struct file * file; + unsigned long loader; + + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + + loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); + + file = open_exec("/sbin/loader"); + retval = PTR_ERR(file); + if (IS_ERR(file)) + return retval; + + /* Remember if the application is TASO. */ + bprm->sh_bang = eh->ah.entry < 0x100000000; + + bprm->file = file; + bprm->loader = loader; + retval = prepare_binprm(bprm); + if (retval<0) + return retval; + /* should call search_binary_handler recursively here, + but it does not matter */ + } + } +#endif + /* kernel module loader fixup */ + /* so we don't try to load run modprobe in kernel space. 
*/ + set_fs(USER_DS); + for (try=0; try<2; try++) { + read_lock(&binfmt_lock); + for (fmt = formats ; fmt ; fmt = fmt->next) { + int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; + if (!fn) + continue; + if (!try_inc_mod_count(fmt->module)) + continue; + read_unlock(&binfmt_lock); + retval = fn(bprm, regs); + if (retval >= 0) { + put_binfmt(fmt); + allow_write_access(bprm->file); + if (bprm->file) + fput(bprm->file); + bprm->file = NULL; + current->did_exec = 1; + return retval; + } + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (retval != -ENOEXEC) + break; + if (!bprm->file) { + read_unlock(&binfmt_lock); + return retval; + } + } + read_unlock(&binfmt_lock); + if (retval != -ENOEXEC) { + break; +#ifdef CONFIG_KMOD + }else{ +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) + char modname[20]; + if (printable(bprm->buf[0]) && + printable(bprm->buf[1]) && + printable(bprm->buf[2]) && + printable(bprm->buf[3])) + break; /* -ENOEXEC */ + sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); + request_module(modname); +#endif + } + } + return retval; +} + + +/* + * sys_execve() executes a new program. + */ +int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs) +{ + struct linux_binprm bprm; + struct file *file; + int retval; + int i; + + file = open_exec(filename); + + retval = PTR_ERR(file); + if (IS_ERR(file)) + return retval; + + bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); + memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); + + bprm.file = file; + bprm.filename = filename; + bprm.sh_bang = 0; + bprm.loader = 0; + bprm.exec = 0; + if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) { + allow_write_access(file); + fput(file); + return bprm.argc; + } + + if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) { + allow_write_access(file); + fput(file); + return bprm.envc; + } + + retval = prepare_binprm(&bprm); + if (retval < 0) + goto out; + + retval = copy_strings_kernel(1, &bprm.filename, &bprm); + if (retval < 0) + goto out; + + bprm.exec = bprm.p; + retval = copy_strings(bprm.envc, envp, &bprm); + if (retval < 0) + goto out; + + retval = copy_strings(bprm.argc, argv, &bprm); + if (retval < 0) + goto out; + + retval = search_binary_handler(&bprm,regs); + if (retval >= 0) + /* execve success */ + return retval; + +out: + /* Something went wrong, return the inode and free the argument pages*/ + allow_write_access(bprm.file); + if (bprm.file) + fput(bprm.file); + + for (i = 0 ; i < MAX_ARG_PAGES ; i++) { + struct page * page = bprm.page[i]; + if (page) + __free_page(page); + } + + return retval; +} + +void set_binfmt(struct linux_binfmt *new) +{ + struct linux_binfmt *old = current->binfmt; + if (new && new->module) + __MOD_INC_USE_COUNT(new->module); + current->binfmt = new; + if (old && old->module) + __MOD_DEC_USE_COUNT(old->module); +} + +int do_coredump(long signr, struct pt_regs * regs) +{ + struct linux_binfmt * binfmt; + char corename[6+sizeof(current->comm)+10]; + struct file * file; + struct inode * inode; + int retval = 0; + + lock_kernel(); + binfmt = current->binfmt; + if (!binfmt || !binfmt->core_dump) + goto fail; + if (!current->mm->dumpable) + goto fail; + current->mm->dumpable = 0; + if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) + goto fail; + + memcpy(corename,"core.", 5); + corename[4] = '\0'; + if (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1) + sprintf(&corename[4], ".%d", current->pid); + file = filp_open(corename, O_CREAT | 
2 | O_NOFOLLOW, 0600); + if (IS_ERR(file)) + goto fail; + inode = file->f_dentry->d_inode; + if (inode->i_nlink > 1) + goto close_fail; /* multiple links - don't dump */ + if (d_unhashed(file->f_dentry)) + goto close_fail; + + if (!S_ISREG(inode->i_mode)) + goto close_fail; + if (!file->f_op) + goto close_fail; + if (!file->f_op->write) + goto close_fail; + if (do_truncate(file->f_dentry, 0) != 0) + goto close_fail; + + down_read(¤t->mm->mmap_sem); + retval = binfmt->core_dump(signr, regs, file); + up_read(¤t->mm->mmap_sem); + +close_fail: + filp_close(file, NULL); +fail: + unlock_kernel(); + return retval; +} diff --git a/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor.h b/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor.h index f46bfe4c44..839feed153 100644 --- a/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor.h +++ b/xenolinux-2.4.16-sparse/include/asm-xeno/hypervisor.h @@ -24,18 +24,111 @@ extern union start_info_union start_info_union; /* arch/xeno/kernel/hypervisor.c */ void do_hypervisor_callback(struct pt_regs *regs); + /* arch/xeno/mm/hypervisor.c */ /* - * NB. ptr values should be fake-physical. 'vals' should be alread + * NB. ptr values should be fake-physical. 'vals' should be already * fully adjusted (ie. for start_info.phys_base). */ + +extern unsigned int pt_update_queue_idx; + void queue_l1_entry_update(unsigned long ptr, unsigned long val); void queue_l2_entry_update(unsigned long ptr, unsigned long val); -void queue_baseptr_create(unsigned long ptr); -void queue_baseptr_remove(unsigned long ptr); +void queue_pt_switch(unsigned long ptr); void queue_tlb_flush(void); -void queue_tlb_flush_one(unsigned long ptr); -void flush_page_update_queue(void); +void queue_invlpg(unsigned long ptr); +void queue_pgd_pin(unsigned long ptr); +void queue_pgd_unpin(unsigned long ptr); +void queue_pte_pin(unsigned long ptr); +void queue_pte_unpin(unsigned long ptr); + +#define PT_UPDATE_DEBUG 0 + +#if PT_UPDATE_DEBUG > 0 +typedef struct { + unsigned long ptr, val, pteval; + void *ptep; + int line; char *file; +} page_update_debug_t; +extern page_update_debug_t update_debug_queue[]; +#define queue_l1_entry_update(_p,_v) ({ \ + update_debug_queue[pt_update_queue_idx].ptr = (_p); \ + update_debug_queue[pt_update_queue_idx].val = (_v); \ + update_debug_queue[pt_update_queue_idx].line = __LINE__; \ + update_debug_queue[pt_update_queue_idx].file = __FILE__; \ + queue_l1_entry_update((_p),(_v)); \ +}) +#define queue_l2_entry_update(_p,_v) ({ \ + update_debug_queue[pt_update_queue_idx].ptr = (_p); \ + update_debug_queue[pt_update_queue_idx].val = (_v); \ + update_debug_queue[pt_update_queue_idx].line = __LINE__; \ + update_debug_queue[pt_update_queue_idx].file = __FILE__; \ + queue_l2_entry_update((_p),(_v)); \ +}) +#endif + +#if PT_UPDATE_DEBUG > 1 +#undef queue_l1_entry_update +#undef queue_l2_entry_update +#define queue_l1_entry_update(_p,_v) ({ \ + update_debug_queue[pt_update_queue_idx].ptr = (_p); \ + update_debug_queue[pt_update_queue_idx].val = (_v); \ + update_debug_queue[pt_update_queue_idx].line = __LINE__; \ + update_debug_queue[pt_update_queue_idx].file = __FILE__; \ + printk("L1 %s %d: %08lx (%08lx -> %08lx)\n", __FILE__, __LINE__, \ + (_p)+start_info.phys_base, *(unsigned long *)__va(_p), \ + (unsigned long)(_v)); \ + queue_l1_entry_update((_p),(_v)); \ +}) +#define queue_l2_entry_update(_p,_v) ({ \ + update_debug_queue[pt_update_queue_idx].ptr = (_p); \ + update_debug_queue[pt_update_queue_idx].val = (_v); \ + update_debug_queue[pt_update_queue_idx].line = __LINE__; \ + 
update_debug_queue[pt_update_queue_idx].file = __FILE__; \ + printk("L2 %s %d: %08lx (%08lx -> %08lx)\n", __FILE__, __LINE__, \ + (_p)+start_info.phys_base, *(unsigned long *)__va(_p), \ + (unsigned long)(_v)); \ + queue_l2_entry_update((_p),(_v)); \ +}) +#define queue_pt_switch(_p) ({ \ + printk("PTSWITCH %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ + queue_pt_switch(_p); \ +}) +#define queue_tlb_flush() ({ \ + printk("TLB FLUSH %s %d\n", __FILE__, __LINE__); \ + queue_tlb_flush(); \ +}) +#define queue_invlpg(_p) ({ \ + printk("INVLPG %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ + queue_invlpg(_p); \ +}) +#define queue_pgd_pin(_p) ({ \ + printk("PGD PIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ + queue_pgd_pin(_p); \ +}) +#define queue_pgd_unpin(_p) ({ \ + printk("PGD UNPIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ + queue_pgd_unpin(_p); \ +}) +#define queue_pte_pin(_p) ({ \ + printk("PTE PIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ + queue_pte_pin(_p); \ +}) +#define queue_pte_unpin(_p) ({ \ + printk("PTE UNPIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \ + queue_pte_unpin(_p); \ +}) +#endif + +void _flush_page_update_queue(void); +static inline int flush_page_update_queue(void) +{ + unsigned int idx = pt_update_queue_idx; + if ( idx != 0 ) _flush_page_update_queue(); + return idx; +} +#define XENO_flush_page_update_queue() (_flush_page_update_queue()) /* @@ -78,17 +171,6 @@ static inline int HYPERVISOR_console_write(const char *str, int count) return ret; } -static inline int HYPERVISOR_set_pagetable(unsigned long ptr) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) : "0" (__HYPERVISOR_set_pagetable), - "b" (ptr) ); - - return ret; -} - static inline int HYPERVISOR_set_guest_stack( unsigned long ss, unsigned long esp) { diff --git a/xenolinux-2.4.16-sparse/include/asm-xeno/mmu_context.h b/xenolinux-2.4.16-sparse/include/asm-xeno/mmu_context.h index aecc68582d..db09973f3d 100644 --- a/xenolinux-2.4.16-sparse/include/asm-xeno/mmu_context.h +++ b/xenolinux-2.4.16-sparse/include/asm-xeno/mmu_context.h @@ -45,7 +45,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, str set_bit(cpu, &next->context.cpuvalid); /* Re-load page tables */ cur_pgd = next->pgd; - HYPERVISOR_set_pagetable(__pa(cur_pgd) + start_info.phys_base); + queue_pt_switch(__pa(cur_pgd)); + XENO_flush_page_update_queue(); } #ifdef CONFIG_SMP else { diff --git a/xenolinux-2.4.16-sparse/include/asm-xeno/pgalloc.h b/xenolinux-2.4.16-sparse/include/asm-xeno/pgalloc.h index da6bf8422d..a3a90ad9f1 100644 --- a/xenolinux-2.4.16-sparse/include/asm-xeno/pgalloc.h +++ b/xenolinux-2.4.16-sparse/include/asm-xeno/pgalloc.h @@ -7,13 +7,23 @@ #include <asm/fixmap.h> #include <linux/threads.h> +/* + * Quick lists are aligned so that least significant bits of array pointer + * are all zero when list is empty, and all one when list is full. 
+ */ +#define QUICKLIST_ENTRIES 256 +#define QUICKLIST_EMPTY(_l) !((unsigned long)(_l) & ((QUICKLIST_ENTRIES*4)-1)) +#define QUICKLIST_FULL(_l) QUICKLIST_EMPTY((_l)+1) #define pgd_quicklist (current_cpu_data.pgd_quick) #define pmd_quicklist (current_cpu_data.pmd_quick) #define pte_quicklist (current_cpu_data.pte_quick) #define pgtable_cache_size (current_cpu_data.pgtable_cache_sz) -#define pmd_populate(mm, pmd, pte) \ - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) +#define pmd_populate(mm, pmd, pte) \ + do { \ + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ + XENO_flush_page_update_queue(); \ + } while ( 0 ) static __inline__ pgd_t *get_pgd_slow(void) { @@ -31,43 +41,18 @@ static __inline__ pgd_t *get_pgd_slow(void) kpmd = pmd_offset(kpgd, (unsigned long)pgd); kpte = pte_offset(kpmd, (unsigned long)pgd); queue_l1_entry_update(__pa(kpte), (*(unsigned long *)kpte)&~_PAGE_RW); - queue_baseptr_create(__pa(pgd)); + queue_pgd_pin(__pa(pgd)); } return pgd; } -#if 0 -static __inline__ pgd_t *get_pgd_fast(void) -{ - unsigned long *ret; - - if ((ret = pgd_quicklist) != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - pgtable_cache_size--; - } else - ret = (unsigned long *)get_pgd_slow(); - return (pgd_t *)ret; -} - -static __inline__ void free_pgd_fast(pgd_t *pgd) -{ - *(unsigned long *)pgd = (unsigned long) pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size++; -} -#else -#define get_pgd_fast get_pgd_slow -#define free_pgd_fast free_pgd_slow -#endif - static __inline__ void free_pgd_slow(pgd_t *pgd) { pgd_t *kpgd; pmd_t *kpmd; pte_t *kpte; - queue_baseptr_remove(__pa(pgd)); + queue_pgd_unpin(__pa(pgd)); kpgd = pgd_offset_k((unsigned long)pgd); kpmd = pmd_offset(kpgd, (unsigned long)pgd); kpte = pte_offset(kpmd, (unsigned long)pgd); @@ -75,6 +60,27 @@ static __inline__ void free_pgd_slow(pgd_t *pgd) free_page((unsigned long)pgd); } +static __inline__ pgd_t *get_pgd_fast(void) +{ + unsigned long ret; + + if ( !QUICKLIST_EMPTY(pgd_quicklist) ) { + ret = *(--pgd_quicklist); + pgtable_cache_size--; + } else + ret = (unsigned long)get_pgd_slow(); + return (pgd_t *)ret; +} + +static __inline__ void free_pgd_fast(pgd_t *pgd) +{ + if ( !QUICKLIST_FULL(pgd_quicklist) ) { + *(pgd_quicklist++) = (unsigned long)pgd; + pgtable_cache_size++; + } else + free_pgd_slow(pgd); +} + static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) { pte_t *pte; @@ -90,39 +96,17 @@ static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) kpmd = pmd_offset(kpgd, (unsigned long)pte); kpte = pte_offset(kpmd, (unsigned long)pte); queue_l1_entry_update(__pa(kpte), (*(unsigned long *)kpte)&~_PAGE_RW); + queue_pte_pin(__pa(pte)); } return pte; } -#if 0 -static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) -{ - unsigned long *ret; - - if ((ret = (unsigned long *)pte_quicklist) != NULL) { - pte_quicklist = (unsigned long *)(*ret); - ret[0] = ret[1]; - pgtable_cache_size--; - } - return (pte_t *)ret; -} - -static __inline__ void pte_free_fast(pte_t *pte) -{ - *(unsigned long *)pte = (unsigned long) pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; -} -#else -#define pte_alloc_one_fast pte_alloc_one -#define pte_free_fast pte_free_slow -#endif - static __inline__ void pte_free_slow(pte_t *pte) { pgd_t *kpgd; pmd_t *kpmd; pte_t *kpte; + queue_pte_unpin(__pa(pte)); kpgd = pgd_offset_k((unsigned long)pte); kpmd = pmd_offset(kpgd, (unsigned long)pte); kpte = pte_offset(kpmd, (unsigned 
long)pte); @@ -130,6 +114,25 @@ static __inline__ void pte_free_slow(pte_t *pte) free_page((unsigned long)pte); } +static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) +{ + unsigned long ret = 0; + if ( !QUICKLIST_EMPTY(pte_quicklist) ) { + ret = *(--pte_quicklist); + pgtable_cache_size--; + } + return (pte_t *)ret; +} + +static __inline__ void pte_free_fast(pte_t *pte) +{ + if ( !QUICKLIST_FULL(pte_quicklist) ) { + *(pte_quicklist++) = (unsigned long)pte; + pgtable_cache_size++; + } else + pte_free_slow(pte); +} + #define pte_free(pte) pte_free_fast(pte) #define pgd_alloc(mm) get_pgd_fast() #define pgd_free(pgd) free_pgd_fast(pgd) @@ -158,28 +161,29 @@ extern int do_check_pgt_cache(int, int); static inline void flush_tlb_mm(struct mm_struct *mm) { - if (mm == current->active_mm) - __flush_tlb(); + if ( mm == current->active_mm ) queue_tlb_flush(); + XENO_flush_page_update_queue(); } static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); + if ( vma->vm_mm == current->active_mm ) queue_invlpg(addr); + XENO_flush_page_update_queue(); } static inline void flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) { - if (mm == current->active_mm) - __flush_tlb(); + if ( mm == current->active_mm ) queue_tlb_flush(); + XENO_flush_page_update_queue(); } static inline void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end) { /* i386 does not keep any page table caches in TLB */ + XENO_flush_page_update_queue(); } #endif /* _I386_PGALLOC_H */ diff --git a/xenolinux-2.4.16-sparse/include/asm-xeno/pgtable.h b/xenolinux-2.4.16-sparse/include/asm-xeno/pgtable.h index 2eaa28733d..6273ef9d98 100644 --- a/xenolinux-2.4.16-sparse/include/asm-xeno/pgtable.h +++ b/xenolinux-2.4.16-sparse/include/asm-xeno/pgtable.h @@ -37,10 +37,10 @@ extern void paging_init(void); extern unsigned long pgkern_mask; -#define __flush_tlb() queue_tlb_flush() +#define __flush_tlb() ({ queue_tlb_flush(); XENO_flush_page_update_queue(); }) #define __flush_tlb_global() __flush_tlb() #define __flush_tlb_all() __flush_tlb_global() -#define __flush_tlb_one(addr) queue_tlb_flush_one(addr) +#define __flush_tlb_one(addr) ({ queue_invlpg(addr); XENO_flush_page_update_queue(); }) /* * ZERO_PAGE is a global shared page that is always zero: used @@ -281,7 +281,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ - ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset(dir, address) ((pte_t *) pmd_page(*(dir)) + \ __pte_offset(address)) diff --git a/xenolinux-2.4.16-sparse/mm/memory.c b/xenolinux-2.4.16-sparse/mm/memory.c new file mode 100644 index 0000000000..58eb472e2d --- /dev/null +++ b/xenolinux-2.4.16-sparse/mm/memory.c @@ -0,0 +1,1442 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. 
+ * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + */ + +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/smp_lock.h> +#include <linux/swapctl.h> +#include <linux/iobuf.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> + +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> + +unsigned long max_mapnr; +unsigned long num_physpages; +void * high_memory; +struct page *highmem_start_page; + +/* + * We special-case the C-O-W ZERO_PAGE, because it's such + * a common occurrence (no need to read the page to know + * that it's zero - better for the cache and memory subsystem). + */ +static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) +{ + if (from == ZERO_PAGE(address)) { + clear_user_highpage(to, address); + return; + } + copy_user_highpage(to, from, address); +} + +mem_map_t * mem_map; + +/* + * Called by TLB shootdown + */ +void __free_pte(pte_t pte) +{ + struct page *page = pte_page(pte); + if ((!VALID_PAGE(page)) || PageReserved(page)) + return; + if (pte_dirty(pte)) + set_page_dirty(page); + free_page_and_swap_cache(page); +} + + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static inline void free_one_pmd(pmd_t * dir) +{ + pte_t * pte; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, 0); + pmd_clear(dir); + pte_free(pte); +} + +static inline void free_one_pgd(pgd_t * dir) +{ + int j; + pmd_t * pmd; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, 0); + pgd_clear(dir); + for (j = 0; j < PTRS_PER_PMD ; j++) { + prefetchw(pmd+j+(PREFETCH_STRIDE/16)); + free_one_pmd(pmd+j); + } + pmd_free(pmd); +} + +/* Low and high watermarks for page table cache. + The system should try to have pgt_water[0] <= cache elements <= pgt_water[1] + */ +int pgt_cache_water[2] = { 25, 50 }; + +/* Returns the number of pages freed */ +int check_pgt_cache(void) +{ + return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); +} + + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. + */ +void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) +{ + pgd_t * page_dir = mm->pgd; + + spin_lock(&mm->page_table_lock); + page_dir += first; + do { + free_one_pgd(page_dir); + page_dir++; + } while (--nr); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + + /* keep the page table cache within bounds */ + check_pgt_cache(); +} + +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + +/* + * copy one vm_area from one task to the other. 
Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + * + * 08Jan98 Merged into one routine from several inline routines to reduce + * variable count and make things faster. -jj + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within pmd_alloc() and pte_alloc(). + */ +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t * src_pgd, * dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + /* copy_pmd_range */ + + if (pgd_none(*src_pgd)) + goto skip_copy_pmd_range; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); +skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (!address || (address >= end)) + goto out; + continue; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_alloc(dst, dst_pgd, address); + if (!dst_pmd) + goto nomem; + + do { + pte_t * src_pte, * dst_pte; + + /* copy_pte_range */ + + if (pmd_none(*src_pmd)) + goto skip_copy_pte_range; + if (pmd_bad(*src_pmd)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); +skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out; + goto cont_copy_pmd_range; + } + + src_pte = pte_offset(src_pmd, address); + dst_pte = pte_alloc(dst, dst_pmd, address); + if (!dst_pte) + goto nomem; + + spin_lock(&src->page_table_lock); + do { + pte_t pte = *src_pte; + struct page *ptepage; + + /* copy_one_pte */ + + if (pte_none(pte)) + goto cont_copy_pte_range_noset; + if (!pte_present(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + goto cont_copy_pte_range; + } + ptepage = pte_page(pte); + if ((!VALID_PAGE(ptepage)) || + PageReserved(ptepage)) + goto cont_copy_pte_range; + + /* If it's a COW mapping, write protect it both in the parent and the child */ + if (cow) { + /* XENO modification: modified ordering here to avoid RaW hazard. 
*/ + pte = *src_pte; + pte = pte_wrprotect(pte); + ptep_set_wrprotect(src_pte); + } + + /* If it's a shared mapping, mark it clean in the child */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(ptepage); + dst->rss++; + +cont_copy_pte_range: set_pte(dst_pte, pte); +cont_copy_pte_range_noset: address += PAGE_SIZE; + if (address >= end) + goto out_unlock; + src_pte++; + dst_pte++; + } while ((unsigned long)src_pte & PTE_TABLE_MASK); + spin_unlock(&src->page_table_lock); + +cont_copy_pmd_range: src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + } +out_unlock: + spin_unlock(&src->page_table_lock); +out: + return 0; +nomem: + return -ENOMEM; +} + +/* + * Return indicates whether a page was freed so caller can adjust rss + */ +static inline void forget_pte(pte_t page) +{ + if (!pte_none(page)) { + printk("forget_pte: old mapping existed!\n"); + BUG(); + } +} + +static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) +{ + unsigned long offset; + pte_t * ptep; + int freed = 0; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + ptep = pte_offset(pmd, address); + offset = address & ~PMD_MASK; + if (offset + size > PMD_SIZE) + size = PMD_SIZE - offset; + size &= PAGE_MASK; + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + pte_t pte = *ptep; + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page)) + freed ++; + /* This will eventually call __free_pte on the pte. */ + tlb_remove_page(tlb, ptep, address + offset); + } else { + free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear(ptep); + } + } + + return freed; +} + +static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + int freed; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return 0; + } + pmd = pmd_offset(dir, address); + end = address + size; + if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + end = ((address + PGDIR_SIZE) & PGDIR_MASK); + freed = 0; + do { + freed += zap_pte_range(tlb, pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return freed; +} + +/* + * remove user pages in a given range. + */ +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +{ + mmu_gather_t *tlb; + pgd_t * dir; + unsigned long start = address, end = address + size; + int freed = 0; + + dir = pgd_offset(mm, address); + + /* + * This is a long-lived spinlock. That's fine. + * There's no contention, because the page table + * lock only protects against kswapd anyway, and + * even if kswapd happened to be looking at this + * process we _want_ it to get stuck. + */ + if (address >= end) + BUG(); + spin_lock(&mm->page_table_lock); + flush_cache_range(mm, address, end); + tlb = tlb_gather_mmu(mm); + + do { + freed += zap_pmd_range(tlb, dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + + /* this will flush any remaining tlb entries */ + tlb_finish_mmu(tlb, start, end); + + /* + * Update rss for the mm_struct (not necessarily current->mm) + * Notice that rss is an unsigned long. 
+ */ + if (mm->rss > freed) + mm->rss -= freed; + else + mm->rss = 0; + spin_unlock(&mm->page_table_lock); +} + + +/* + * Do a quick page-table lookup for a single page. + */ +static struct page * follow_page(unsigned long address, int write) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *ptep, pte; + + pgd = pgd_offset(current->mm, address); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + goto out; + + ptep = pte_offset(pmd, address); + if (!ptep) + goto out; + + pte = *ptep; + if (pte_present(pte)) { + if (!write || + (pte_write(pte) && pte_dirty(pte))) + return pte_page(pte); + } + +out: + return 0; +} + +/* + * Given a physical address, is there a useful struct page pointing to + * it? This may become more complex in the future if we start dealing + * with IO-aperture pages in kiobufs. + */ + +static inline struct page * get_page_map(struct page *page) +{ + if (!VALID_PAGE(page)) + return 0; + return page; +} + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + */ + +#define dprintk(x...) +int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) +{ + unsigned long ptr, end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma = 0; + struct page * map; + int i; + int datain = (rw == READ); + + /* Make sure the iobuf is not already mapped somewhere. */ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); + if (err) + return err; + + down_read(&mm->mmap_sem); + + err = -EFAULT; + iobuf->locked = 0; + iobuf->offset = va & ~PAGE_MASK; + iobuf->length = len; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + page_cache_get(map); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; + } + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return 0; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; +} + +/* + * Mark all of the pages in a kiobuf as dirty + * + * We need to be able to deal with short reads from disk: if an IO error + * occurs, the number of bytes read into memory may be less than the + * size of the kiobuf, so we have to stop marking pages dirty once the + * requested byte count has been reached. 
+ */ + +void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes) +{ + int index, offset, remaining; + struct page *page; + + index = iobuf->offset >> PAGE_SHIFT; + offset = iobuf->offset & ~PAGE_MASK; + remaining = bytes; + if (remaining > iobuf->length) + remaining = iobuf->length; + + while (remaining > 0 && index < iobuf->nr_pages) { + page = iobuf->maplist[index]; + + if (!PageReserved(page)) + SetPageDirty(page); + + remaining -= (PAGE_SIZE - offset); + offset = 0; + index++; + } +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kiobuf (struct kiobuf *iobuf) +{ + int i; + struct page *map; + + for (i = 0; i < iobuf->nr_pages; i++) { + map = iobuf->maplist[i]; + if (map) { + if (iobuf->locked) + UnlockPage(map); + page_cache_release(map); + } + } + + iobuf->nr_pages = 0; + iobuf->locked = 0; +} + + +/* + * Lock down all of the pages of a kiovec for IO. + * + * If any page is mapped twice in the kiovec, we return the error -EINVAL. + * + * The optional wait parameter causes the lock call to block until all + * pages can be locked if set. If wait==0, the lock operation is + * aborted if any locked pages are found and -EAGAIN is returned. + */ + +int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + int doublepage = 0; + int repeat = 0; + + repeat: + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (iobuf->locked) + continue; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + + if (TryLockPage(page)) { + while (j--) { + struct page *tmp = *--ppage; + if (tmp) + UnlockPage(tmp); + } + goto retry; + } + } + iobuf->locked = 1; + } + + return 0; + + retry: + + /* + * We couldn't lock one of the pages. Undo the locking so far, + * wait on the page we got to, and try again. + */ + + unlock_kiovec(nr, iovec); + if (!wait) + return -EAGAIN; + + /* + * Did the release also unlock the page we got stuck on? + */ + if (!PageLocked(page)) { + /* + * If so, we may well have the page mapped twice + * in the IO address range. Bad news. Of + * course, it _might_ just be a coincidence, + * but if it happens more than once, chances + * are we have a double-mapped page. + */ + if (++doublepage >= 3) + return -EINVAL; + + /* Try again... */ + wait_on_page(page); + } + + if (++repeat < 16) + goto repeat; + return -EAGAIN; +} + +/* + * Unlock all of the pages of a kiovec after IO. 
+ */ + +int unlock_kiovec(int nr, struct kiobuf *iovec[]) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (!iobuf->locked) + continue; + iobuf->locked = 0; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + UnlockPage(page); + } + } + return 0; +} + +static inline void zeromap_pte_range(pte_t * pte, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); + pte_t oldpage = ptep_get_and_clear(pte); + set_pte(pte, zero_pte); + forget_pte(oldpage); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + zeromap_pte_range(pte, address, end - address, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = address; + unsigned long end = address + size; + struct mm_struct *mm = current->mm; + + dir = pgd_offset(mm, address); + flush_cache_range(mm, beg, end); + if (address >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + if (error) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + struct page *page; + pte_t oldpage; + oldpage = ptep_get_and_clear(pte); + + page = virt_to_page(__va(phys_addr)); + if ((!VALID_PAGE(page)) || PageReserved(page)) + set_pte(pte, mk_pte_phys(phys_addr, prot)); + forget_pte(oldpage); + address += PAGE_SIZE; + phys_addr += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + phys_addr -= address; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + remap_pte_range(pte, address, end - address, address + phys_addr, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +/* Note: this is only safe if the mm semaphore is held when called. 
*/ +int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = from; + unsigned long end = from + size; + struct mm_struct *mm = current->mm; + + phys_addr -= from; + dir = pgd_offset(mm, from); + flush_cache_range(mm, beg, end); + if (from >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, from); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + if (error) + break; + from = (from + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (from && (from < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * Establish a new mapping: + * - flush the old one + * - update the page tables + * - inform the TLB about the new one + * + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +{ + set_pte(page_table, entry); + flush_tlb_page(vma, address); + update_mmu_cache(vma, address, entry); +} + +/* + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, + pte_t *page_table) +{ + flush_page_to_ram(new_page); + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We hold the mm semaphore and the page_table_lock on entry and exit + * with the page_table_lock released. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table, pte_t pte) +{ + struct page *old_page, *new_page; + + old_page = pte_page(pte); + if (!VALID_PAGE(old_page)) + goto bad_wp_page; + + if (!TryLockPage(old_page)) { + int reuse = can_share_swap_page(old_page); + unlock_page(old_page); + if (reuse) { + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + } + } + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); + spin_unlock(&mm->page_table_lock); + + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) + goto no_mem; + copy_cow_page(old_page,new_page,address); + + /* + * Re-check the pte - we dropped the lock + */ + spin_lock(&mm->page_table_lock); + if (pte_same(*page_table, pte)) { + if (PageReserved(old_page)) + ++mm->rss; + break_cow(vma, new_page, address, page_table); + lru_cache_add(new_page); + + /* Free the old page.. 
*/ + new_page = old_page; + } + spin_unlock(&mm->page_table_lock); + page_cache_release(new_page); + page_cache_release(old_page); + return 1; /* Minor fault */ + +bad_wp_page: + spin_unlock(&mm->page_table_lock); + printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); + return -1; +no_mem: + page_cache_release(old_page); + return -1; +} + +static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff) +{ + do { + struct mm_struct *mm = mpnt->vm_mm; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long len = end - start; + unsigned long diff; + + /* mapping wholly truncated? */ + if (mpnt->vm_pgoff >= pgoff) { + zap_page_range(mm, start, len); + continue; + } + + /* mapping wholly unaffected? */ + len = len >> PAGE_SHIFT; + diff = pgoff - mpnt->vm_pgoff; + if (diff >= len) + continue; + + /* Ok, partially affected.. */ + start += diff << PAGE_SHIFT; + len = (len - diff) << PAGE_SHIFT; + zap_page_range(mm, start, len); + } while ((mpnt = mpnt->vm_next_share) != NULL); +} + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + unsigned long pgoff; + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + inode->i_size = offset; + spin_lock(&mapping->i_shared_lock); + if (!mapping->i_mmap && !mapping->i_mmap_shared) + goto out_unlock; + + pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (mapping->i_mmap != NULL) + vmtruncate_list(mapping->i_mmap, pgoff); + if (mapping->i_mmap_shared != NULL) + vmtruncate_list(mapping->i_mmap_shared, pgoff); + +out_unlock: + spin_unlock(&mapping->i_shared_lock); + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY) { + if (inode->i_size >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (offset > limit) { + send_sig(SIGXFSZ, current, 0); + offset = limit; + } + } + inode->i_size = offset; + +out_truncate: + if (inode->i_op && inode->i_op->truncate) { + lock_kernel(); + inode->i_op->truncate(inode); + unlock_kernel(); + } +out: + return 0; +} + +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + */ +void swapin_readahead(swp_entry_t entry) +{ + int i, num; + struct page *new_page; + unsigned long offset; + + /* + * Get the number of handles we should do readahead io to. + */ + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); + if (!new_page) + break; + page_cache_release(new_page); + } + return; +} + +/* + * We hold the mm semaphore and the page_table_lock on entry and + * should release the pagetable lock on exit.. 
+ */ +static int do_swap_page(struct mm_struct * mm, + struct vm_area_struct * vma, unsigned long address, + pte_t * page_table, pte_t orig_pte, int write_access) +{ + struct page *page; + swp_entry_t entry = pte_to_swp_entry(orig_pte); + pte_t pte; + int ret = 1; + + spin_unlock(&mm->page_table_lock); + page = lookup_swap_cache(entry); + if (!page) { + swapin_readahead(entry); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Back out if somebody else faulted in this pte while + * we released the page table lock. + */ + int retval; + spin_lock(&mm->page_table_lock); + retval = pte_same(*page_table, orig_pte) ? -1 : 1; + spin_unlock(&mm->page_table_lock); + return retval; + } + + /* Had to read the page from swap area: Major fault */ + ret = 2; + } + + lock_page(page); + + /* + * Back out if somebody else faulted in this pte while we + * released the page table lock. + */ + spin_lock(&mm->page_table_lock); + if (!pte_same(*page_table, orig_pte)) { + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + return 1; + } + + /* The page isn't present yet, go ahead with the fault. */ + + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + + mm->rss++; + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) + pte = pte_mkdirty(pte_mkwrite(pte)); + unlock_page(page); + + flush_page_to_ram(page); + flush_icache_page(vma, page); + set_pte(page_table, pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return ret; +} + +/* + * We are called with the MM semaphore and page_table_lock + * spinlock held to protect against concurrent faults in + * multithreaded programs. + */ +static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +{ + pte_t entry; + + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + + /* ..except if it's a write access */ + if (write_access) { + struct page *page; + + /* Allocate our own private page. */ + spin_unlock(&mm->page_table_lock); + + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto no_mem; + clear_user_highpage(page, addr); + + spin_lock(&mm->page_table_lock); + if (!pte_none(*page_table)) { + page_cache_release(page); + spin_unlock(&mm->page_table_lock); + return 1; + } + mm->rss++; + flush_page_to_ram(page); + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + lru_cache_add(page); + } + + set_pte(page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, entry); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + +no_mem: + return -1; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore held and the page table + * spinlock held. Exit with the spinlock released. 
+ */ +static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) +{ + struct page * new_page; + pte_t entry; + + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, page_table, write_access, address); + spin_unlock(&mm->page_table_lock); + + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + + if (new_page == NULL) /* no page was available -- SIGBUS */ + return 0; + if (new_page == NOPAGE_OOM) + return -1; + + /* + * Should we do an early C-O-W break? + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) { + struct page * page = alloc_page(GFP_HIGHUSER); + if (!page) + return -1; + copy_highpage(page, new_page); + page_cache_release(new_page); + lru_cache_add(page); + new_page = page; + } + + spin_lock(&mm->page_table_lock); + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + ++mm->rss; + flush_page_to_ram(new_page); + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + if (write_access) + entry = pte_mkwrite(pte_mkdirty(entry)); + set_pte(page_table, entry); + } else { + /* One of our sibling threads was faster, back out. */ + page_cache_release(new_page); + spin_unlock(&mm->page_table_lock); + return 1; + } + + /* no need to invalidate: a not-present page shouldn't be cached */ + update_mmu_cache(vma, address, entry); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return 2; /* Major fault */ +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * Note the "page_table_lock". It is to protect against kswapd removing + * pages from under us. Note that kswapd only ever _removes_ pages, never + * adds them. As such, once we have noticed that the page is not present, + * we can drop the lock early. + * + * The adding of pages is protected by the MM semaphore (which we hold), + * so we don't need to worry about a page being suddenly been added into + * our VM. + * + * We enter with the pagetable spinlock held, we are supposed to + * release it when done. + */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t * pte) +{ + pte_t entry; + + entry = *pte; + if (!pte_present(entry)) { + /* + * If it truly wasn't present, we know that kswapd + * and the PTE updates will not touch it later. So + * drop the lock. 
+ */ + if (pte_none(entry)) + return do_no_page(mm, vma, address, write_access, pte); + return do_swap_page(mm, vma, address, pte, entry, write_access); + } + + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, pte, entry); + + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + establish_pte(vma, address, pte, entry); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return 1; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pmd_t *pmd; + + current->state = TASK_RUNNING; + pgd = pgd_offset(mm, address); + + /* + * We need the page table lock to synchronize with kswapd + * and the SMP-safe atomic PTE updates. + */ + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc(mm, pgd, address); + + if (pmd) { + pte_t * pte = pte_alloc(mm, pmd, address); + if (pte) + return handle_pte_fault(mm, vma, address, write_access, pte); + } + spin_unlock(&mm->page_table_lock); + return -1; +} + +/* + * Allocate page middle directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + * + * On a two-level page table, this ends up actually being entirely + * optimized away. + */ +pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + /* "fast" allocation can happen without dropping the lock.. */ + new = pmd_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pgd_none(*pgd)) { + pmd_free(new); + goto out; + } + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset(pgd, address); +} + +/* + * Allocate the page table directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + */ +pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (pmd_none(*pmd)) { + pte_t *new; + + /* "fast" allocation can happen without dropping the lock.. */ + new = pte_alloc_one_fast(mm, address); + if (!new) { + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pmd_none(*pmd)) { + pte_free(new); + goto out; + } + } + pmd_populate(mm, pmd, new); + } +out: + return pte_offset(pmd, address); +} + +/* + * Simplistic page force-in.. 
+ */ +int make_pages_present(unsigned long addr, unsigned long end) +{ + int write; + struct mm_struct *mm = current->mm; + struct vm_area_struct * vma; + + vma = find_vma(mm, addr); + write = (vma->vm_flags & VM_WRITE) != 0; + if (addr >= end) + BUG(); + do { + if (handle_mm_fault(mm, vma, addr, write) < 0) + return -1; + addr += PAGE_SIZE; + } while (addr < end); + return 0; +} diff --git a/xenolinux-2.4.16-sparse/mm/mremap.c b/xenolinux-2.4.16-sparse/mm/mremap.c new file mode 100644 index 0000000000..a2e0d860dd --- /dev/null +++ b/xenolinux-2.4.16-sparse/mm/mremap.c @@ -0,0 +1,354 @@ +/* + * linux/mm/remap.c + * + * (C) Copyright 1996 Linus Torvalds + */ + +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/swap.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> + +extern int vm_enough_memory(long pages); + +static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pgd_t * pgd; + pmd_t * pmd; + pte_t * pte = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + goto end; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + goto end; + } + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto end; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto end; + } + + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) + pte = NULL; +end: + return pte; +} + +static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pmd_t * pmd; + pte_t * pte = NULL; + + pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + if (pmd) + pte = pte_alloc(mm, pmd, addr); + return pte; +} + +static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) +{ + int error = 0; + pte_t pte; + + if (!pte_none(*src)) { + pte = ptep_get_and_clear(src); + if (!dst) { + /* No dest? We must put it back. */ + dst = src; + error++; + } + set_pte(dst, pte); + } + return error; +} + +static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +{ + int error = 0; + pte_t * src; + + spin_lock(&mm->page_table_lock); + src = get_one_pte(mm, old_addr); + if (src) + error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr)); + spin_unlock(&mm->page_table_lock); + return error; +} + +static int move_page_tables(struct mm_struct * mm, + unsigned long new_addr, unsigned long old_addr, unsigned long len) +{ + unsigned long offset = len; + + flush_cache_range(mm, old_addr, old_addr + len); + + /* + * This is not the clever way to do this, but we're taking the + * easy way out on the assumption that most remappings will be + * only a few pages.. This also makes error recovery easier. + */ + while (offset) { + offset -= PAGE_SIZE; + if (move_one_page(mm, old_addr + offset, new_addr + offset)) + goto oops_we_failed; + } + flush_tlb_range(mm, old_addr, old_addr + len); + return 0; + + /* + * Ok, the move failed because we didn't have enough pages for + * the new page table tree. This is unlikely, but we have to + * take the possibility into account. 
In that case we just move + * all the pages back (this will work, because we still have + * the old page tables) + */ +oops_we_failed: + XENO_flush_page_update_queue(); + flush_cache_range(mm, new_addr, new_addr + len); + while ((offset += PAGE_SIZE) < len) + move_one_page(mm, new_addr + offset, old_addr + offset); + XENO_flush_page_update_queue(); + zap_page_range(mm, new_addr, len); + return -1; +} + +static inline unsigned long move_vma(struct vm_area_struct * vma, + unsigned long addr, unsigned long old_len, unsigned long new_len, + unsigned long new_addr) +{ + struct mm_struct * mm = vma->vm_mm; + struct vm_area_struct * new_vma, * next, * prev; + int allocated_vma; + + new_vma = NULL; + next = find_vma_prev(mm, new_addr, &prev); + if (next) { + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + if (next != prev->vm_next) + BUG(); + if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(&mm->page_table_lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + } + } else if (next->vm_start == new_addr + new_len && + can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + next->vm_start = new_addr; + spin_unlock(&mm->page_table_lock); + new_vma = next; + } + } else { + prev = find_vma(mm, new_addr-1); + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + } + } + + allocated_vma = 0; + if (!new_vma) { + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!new_vma) + goto out; + allocated_vma = 1; + } + + if (!move_page_tables(current->mm, new_addr, addr, old_len)) { + if (allocated_vma) { + *new_vma = *vma; + new_vma->vm_start = new_addr; + new_vma->vm_end = new_addr+new_len; + new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_raend = 0; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + insert_vm_struct(current->mm, new_vma); + } + do_munmap(current->mm, addr, old_len); + current->mm->total_vm += new_len >> PAGE_SHIFT; + if (new_vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += new_len >> PAGE_SHIFT; + make_pages_present(new_vma->vm_start, + new_vma->vm_end); + } + return new_addr; + } + if (allocated_vma) + kmem_cache_free(vm_area_cachep, new_vma); + out: + return -ENOMEM; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. 
+ */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + goto out; + + if (addr & ~PAGE_MASK) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* new_addr is only valid if MREMAP_FIXED is specified */ + if (flags & MREMAP_FIXED) { + if (new_addr & ~PAGE_MASK) + goto out; + if (!(flags & MREMAP_MAYMOVE)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + do_munmap(current->mm, new_addr, new_len); + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + */ + ret = addr; + if (old_len >= new_len) { + do_munmap(current->mm, addr+new_len, old_len - new_len); + if (!(flags & MREMAP_FIXED) || (new_addr == addr)) + goto out; + } + + /* + * Ok, we need to grow.. or relocate. + */ + ret = -EFAULT; + vma = find_vma(current->mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto out; + if (vma->vm_flags & VM_DONTEXPAND) { + if (new_len > old_len) + goto out; + } + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + locked += new_len - old_len; + ret = -EAGAIN; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + goto out; + } + ret = -ENOMEM; + if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + > current->rlim[RLIMIT_AS].rlim_cur) + goto out; + /* Private writable mapping? Check memory availability.. */ + if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) + goto out; + + /* old_len exactly to the end of the area.. + * And we're not relocating the area. + */ + if (old_len == vma->vm_end - addr && + !((flags & MREMAP_FIXED) && (addr != new_addr)) && + (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { + unsigned long max_addr = TASK_SIZE; + if (vma->vm_next) + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_end = addr + new_len; + spin_unlock(&vma->vm_mm->page_table_lock); + current->mm->total_vm += pages; + if (vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += pages; + make_pages_present(addr + old_len, + addr + new_len); + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. 
+ */
+	ret = -ENOMEM;
+	if (flags & MREMAP_MAYMOVE) {
+		if (!(flags & MREMAP_FIXED)) {
+			unsigned long map_flags = 0;
+			if (vma->vm_flags & VM_SHARED)
+				map_flags |= MAP_SHARED;
+
+			new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags);
+			ret = new_addr;
+			if (new_addr & ~PAGE_MASK)
+				goto out;
+		}
+		ret = move_vma(vma, addr, old_len, new_len, new_addr);
+	}
+out:
+	return ret;
+}
+
+asmlinkage unsigned long sys_mremap(unsigned long addr,
+	unsigned long old_len, unsigned long new_len,
+	unsigned long flags, unsigned long new_addr)
+{
+	unsigned long ret;
+
+	down_write(&current->mm->mmap_sem);
+	ret = do_mremap(addr, old_len, new_len, flags, new_addr);
+	up_write(&current->mm->mmap_sem);
+	return ret;
+}
diff --git a/xenolinux-2.4.16-sparse/mm/swapfile.c b/xenolinux-2.4.16-sparse/mm/swapfile.c
new file mode 100644
index 0000000000..48846184d4
--- /dev/null
+++ b/xenolinux-2.4.16-sparse/mm/swapfile.c
@@ -0,0 +1,1291 @@
+/*
+ *  linux/mm/swapfile.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ */
+
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel_stat.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
+#include <linux/blkdev.h> /* for blk_size */
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/shm.h>
+#include <linux/compiler.h>
+
+#include <asm/pgtable.h>
+
+spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
+unsigned int nr_swapfiles;
+int total_swap_pages;
+static int swap_overflow;
+
+static const char Bad_file[] = "Bad swap file entry ";
+static const char Unused_file[] = "Unused swap file entry ";
+static const char Bad_offset[] = "Bad swap offset entry ";
+static const char Unused_offset[] = "Unused swap offset entry ";
+
+struct swap_list_t swap_list = {-1, -1};
+
+struct swap_info_struct swap_info[MAX_SWAPFILES];
+
+#define SWAPFILE_CLUSTER 256
+
+static inline int scan_swap_map(struct swap_info_struct *si)
+{
+	unsigned long offset;
+	/*
+	 * We try to cluster swap pages by allocating them
+	 * sequentially in swap.  Once we've allocated
+	 * SWAPFILE_CLUSTER pages this way, however, we resort to
+	 * first-free allocation, starting a new cluster.  This
+	 * prevents us from scattering swap pages all over the entire
+	 * swap partition, so that we reduce overall disk seek times
+	 * between swap pages.  -- sct */
+	if (si->cluster_nr) {
+		while (si->cluster_next <= si->highest_bit) {
+			offset = si->cluster_next++;
+			if (si->swap_map[offset])
+				continue;
+			si->cluster_nr--;
+			goto got_page;
+		}
+	}
+	si->cluster_nr = SWAPFILE_CLUSTER;
+
+	/* try to find an empty (even not aligned) cluster. */
+	offset = si->lowest_bit;
+ check_next_cluster:
+	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
+	{
+		int nr;
+		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
+			if (si->swap_map[nr])
+			{
+				offset = nr+1;
+				goto check_next_cluster;
+			}
+		/* We found a completly empty cluster, so start
+		 * using it.
+		 */
+		goto got_page;
+	}
+	/* No luck, so now go finegrined as usual.
-Andrea */ + for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { + if (si->swap_map[offset]) + continue; + si->lowest_bit = offset+1; + got_page: + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + if (si->lowest_bit > si->highest_bit) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + nr_swap_pages--; + si->cluster_next = offset+1; + return offset; + } + si->lowest_bit = si->max; + si->highest_bit = 0; + return 0; +} + +swp_entry_t get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned long offset; + swp_entry_t entry; + int type, wrapped = 0; + + entry.val = 0; /* Out of memory */ + swap_list_lock(); + type = swap_list.next; + if (type < 0) + goto out; + if (nr_swap_pages <= 0) + goto out; + + while (1) { + p = &swap_info[type]; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + swap_device_lock(p); + offset = scan_swap_map(p); + swap_device_unlock(p); + if (offset) { + entry = SWP_ENTRY(type,offset); + type = swap_info[type].next; + if (type < 0 || + p->prio != swap_info[type].prio) { + swap_list.next = swap_list.head; + } else { + swap_list.next = type; + } + goto out; + } + } + type = p->next; + if (!wrapped) { + if (type < 0 || p->prio != swap_info[type].prio) { + type = swap_list.head; + wrapped = 1; + } + } else + if (type < 0) + goto out; /* out of swap space */ + } +out: + swap_list_unlock(); + return entry; +} + +static struct swap_info_struct * swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + swap_list_lock(); + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = type; + swap_device_lock(p); + return p; + +bad_free: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_device: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + goto out; +bad_nofile: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static void swap_info_put(struct swap_info_struct * p) +{ + swap_device_unlock(p); + swap_list_unlock(); +} + +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +{ + int count = p->swap_map[offset]; + + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = count; + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + nr_swap_pages++; + } + } + return count; +} + +/* + * Caller has made sure that the swapdevice corresponding to entry + * is still around or has not been recycled. + */ +void swap_free(swp_entry_t entry) +{ + struct swap_info_struct * p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, SWP_OFFSET(entry)); + swap_info_put(p); + } +} + +/* + * Check if we're the only user of a swap page, + * when the page is locked. + */ +static int exclusive_swap_page(struct page *page) +{ + int retval = 0; + struct swap_info_struct * p; + swp_entry_t entry; + + entry.val = page->index; + p = swap_info_get(entry); + if (p) { + /* Is the only swap cache user the cache itself? 
*/ + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) + retval = 1; + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + } + return retval; +} + +/* + * We can use this swap cache entry directly + * if there are no other references to it. + * + * Here "exclusive_swap_page()" does the real + * work, but we opportunistically check whether + * we need to get all the locks first.. + */ +int can_share_swap_page(struct page *page) +{ + int retval = 0; + + if (!PageLocked(page)) + BUG(); + switch (page_count(page)) { + case 3: + if (!page->buffers) + break; + /* Fallthrough */ + case 2: + if (!PageSwapCache(page)) + break; + retval = exclusive_swap_page(page); + break; + case 1: + if (PageReserved(page)) + break; + retval = 1; + } + return retval; +} + +/* + * Work out if there are any other processes sharing this + * swap cache page. Free it if you can. Return success. + */ +int remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + return 0; + if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + return 0; + + entry.val = page->index; + p = swap_info_get(entry); + if (!p) + return 0; + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + + if (retval) { + block_flushpage(page, 0); + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + +/* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. + */ +void free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct * p; + struct page *page = NULL; + + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, SWP_OFFSET(entry)) == 1) + page = find_trylock_page(&swapper_space, entry.val); + swap_info_put(p); + } + if (page) { + page_cache_get(page); + /* Only cache user (+us), or swap space full? Free it! */ + if (page_count(page) == 2 || vm_swap_full()) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + UnlockPage(page); + page_cache_release(page); + } +} + +/* + * The swap entry has been read in advance, and we return 1 to indicate + * that the page has been used or is no longer needed. + * + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many PTEs will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. 
+ */ +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, + pte_t *dir, swp_entry_t entry, struct page* page) +{ + pte_t pte = *dir; + + if (likely(pte_to_swp_entry(pte).val != entry.val)) + return; + if (unlikely(pte_none(pte) || pte_present(pte))) + return; + get_page(page); + set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + swap_free(entry); + ++vma->vm_mm->rss; +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long size, unsigned long offset, + swp_entry_t entry, struct page* page) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long size, + swp_entry_t entry, struct page* page) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + if (address >= end) + BUG(); + do { + unuse_pmd(vma, pmd, address, end - address, offset, entry, + page); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, + swp_entry_t entry, struct page* page) +{ + unsigned long start = vma->vm_start, end = vma->vm_end; + + if (start >= end) + BUG(); + do { + unuse_pgd(vma, pgdir, start, end - start, entry, page); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while (start && (start < end)); +} + +static void unuse_process(struct mm_struct * mm, + swp_entry_t entry, struct page* page) +{ + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + pgd_t * pgd = pgd_offset(mm, vma->vm_start); + unuse_vma(vma, pgd, entry, page); + } + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return; +} + +/* + * Scan swap_map from current position to next entry still in use. + * Recycle to start on reaching the end, returning 0 when empty. + */ +static int find_next_to_unuse(struct swap_info_struct *si, int prev) +{ + int max = si->max; + int i = prev; + int count; + + /* + * No need for swap_device_lock(si) here: we're just looking + * for whether an entry is in use, not modifying it; false + * hits are okay, and sys_swapoff() has already prevented new + * allocations from this area (while holding swap_list_lock()). + */ + for (;;) { + if (++i >= max) { + if (!prev) { + i = 0; + break; + } + /* + * No entries in use at top of swap_map, + * loop back to start and recheck there. 
+ */ + max = prev + 1; + prev = 0; + i = 1; + } + count = si->swap_map[i]; + if (count && count != SWAP_MAP_BAD) + break; + } + return i; +} + +/* + * We completely avoid races by reading each swap page in advance, + * and then search for the process using it. All the necessary + * page table adjustments can then be made atomically. + */ +static int try_to_unuse(unsigned int type) +{ + struct swap_info_struct * si = &swap_info[type]; + struct mm_struct *start_mm; + unsigned short *swap_map; + unsigned short swcount; + struct page *page; + swp_entry_t entry; + int i = 0; + int retval = 0; + int reset_overflow = 0; + + /* + * When searching mms for an entry, a good strategy is to + * start at the first mm we freed the previous entry from + * (though actually we don't notice whether we or coincidence + * freed the entry). Initialize this start_mm with a hold. + * + * A simpler strategy would be to start at the last mm we + * freed the previous entry from; but that would take less + * advantage of mmlist ordering (now preserved by swap_out()), + * which clusters forked address spaces together, most recent + * child immediately after parent. If we race with dup_mmap(), + * we very much want to resolve parent before child, otherwise + * we may miss some entries: using last mm would invert that. + */ + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + + /* + * Keep on scanning until all entries have gone. Usually, + * one pass through swap_map is enough, but not necessarily: + * mmput() removes mm from mmlist before exit_mmap() and its + * zap_page_range(). That's not too bad, those entries are + * on their way out, and handled faster there than here. + * do_munmap() behaves similarly, taking the range out of mm's + * vma list before zap_page_range(). But unfortunately, when + * unmapping a part of a vma, it takes the whole out first, + * then reinserts what's left after (might even reschedule if + * open() method called) - so swap entries may be invisible + * to swapoff for a while, then reappear - but that is rare. + */ + while ((i = find_next_to_unuse(si, i))) { + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. + */ + swap_map = &si->swap_map[i]; + entry = SWP_ENTRY(type, i); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. + */ + if (!*swap_map) + continue; + retval = -ENOMEM; + break; + } + + /* + * Don't hold on to start_mm if it looks like exiting. + */ + if (atomic_read(&start_mm->mm_users) == 1) { + mmput(start_mm); + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } + + /* + * Wait for and lock page. When do_swap_page races with + * try_to_unuse, do_swap_page can handle the fault much + * faster than try_to_unuse can locate the entry. This + * apparently redundant "wait_on_page" lets try_to_unuse + * defer to do_swap_page in such a case - in some tests, + * do_swap_page and try_to_unuse repeatedly compete. + */ + wait_on_page(page); + lock_page(page); + + /* + * Remove all references to entry, without blocking. + * Whenever we reach init_mm, there's no address space + * to search, but use it as a reminder to search shmem. 
+ */ + swcount = *swap_map; + if (swcount > 1) { + flush_page_to_ram(page); + if (start_mm == &init_mm) + shmem_unuse(entry, page); + else + unuse_process(start_mm, entry, page); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &start_mm->mmlist; + struct mm_struct *new_start_mm = start_mm; + struct mm_struct *mm; + + spin_lock(&mmlist_lock); + while (*swap_map > 1 && + (p = p->next) != &start_mm->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + swcount = *swap_map; + if (mm == &init_mm) { + set_start_mm = 1; + shmem_unuse(entry, page); + } else + unuse_process(mm, entry, page); + if (set_start_mm && *swap_map < swcount) { + new_start_mm = mm; + set_start_mm = 0; + } + } + atomic_inc(&new_start_mm->mm_users); + spin_unlock(&mmlist_lock); + mmput(start_mm); + start_mm = new_start_mm; + } + + /* + * How could swap count reach 0x7fff when the maximum + * pid is 0x7fff, and there's no way to repeat a swap + * page within an mm (except in shmem, where it's the + * shared object which takes the reference count)? + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. + * + * If that's wrong, then we should worry more about + * exit_mmap() and do_munmap() cases described above: + * we might be resetting SWAP_MAP_MAX too early here. + * We know "Undead"s can happen, they're okay, so don't + * report them; but do report if we reset SWAP_MAP_MAX. + */ + if (*swap_map == SWAP_MAP_MAX) { + swap_list_lock(); + swap_device_lock(si); + nr_swap_pages++; + *swap_map = 1; + swap_device_unlock(si); + swap_list_unlock(); + reset_overflow = 1; + } + + /* + * If a reference remains (rare), we would like to leave + * the page in the swap cache; but try_to_swap_out could + * then re-duplicate the entry once we drop page lock, + * so we might loop indefinitely; also, that page could + * not be swapped out to other storage meanwhile. So: + * delete from cache even if there's another reference, + * after ensuring that the data has been saved to disk - + * since if the reference remains (rarer), it will be + * read from disk into another page. Splitting into two + * pages would be incorrect if swap supported "shared + * private" pages, but they are handled by tmpfs files. + * Note shmem_unuse already deleted its from swap cache. + */ + swcount = *swap_map; + if ((swcount > 0) != PageSwapCache(page)) + BUG(); + if ((swcount > 1) && PageDirty(page)) { + rw_swap_page(WRITE, page); + lock_page(page); + } + if (PageSwapCache(page)) + delete_from_swap_cache(page); + + /* + * So we could skip searching mms once swap count went + * to 1, we did not mark any present ptes as dirty: must + * mark page dirty so try_to_swap_out will preserve it. + */ + SetPageDirty(page); + UnlockPage(page); + page_cache_release(page); + + /* + * Make sure that we aren't completely killing + * interactive performance. Interruptible check on + * signal_pending() would be nice, but changes the spec? 
+ */ + if (current->need_resched) + schedule(); + } + + mmput(start_mm); + if (reset_overflow) { + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); + swap_overflow = 0; + } + return retval; +} + +asmlinkage long sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p = NULL; + unsigned short *swap_map; + struct nameidata nd; + int i, type, prev; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = user_path_walk(specialfile, &nd); + if (err) + goto out; + + lock_kernel(); + prev = -1; + swap_list_lock(); + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { + p = swap_info + type; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + if (p->swap_file == nd.dentry) + break; + } + prev = type; + } + err = -EINVAL; + if (type < 0) { + swap_list_unlock(); + goto out_dput; + } + + if (prev < 0) { + swap_list.head = p->next; + } else { + swap_info[prev].next = p->next; + } + if (type == swap_list.next) { + /* just pick something that's safe... */ + swap_list.next = swap_list.head; + } + nr_swap_pages -= p->pages; + total_swap_pages -= p->pages; + p->flags = SWP_USED; + swap_list_unlock(); + unlock_kernel(); + err = try_to_unuse(type); + lock_kernel(); + if (err) { + /* re-insert swap space back into swap_list */ + swap_list_lock(); + for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio >= swap_info[i].prio) + break; + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p - swap_info; + else + swap_info[prev].next = p - swap_info; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + p->flags = SWP_WRITEOK; + swap_list_unlock(); + goto out_dput; + } + if (p->swap_device) + blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); + path_release(&nd); + + swap_list_lock(); + swap_device_lock(p); + nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; + p->swap_vfsmnt = NULL; + p->swap_file = NULL; + p->swap_device = 0; + p->max = 0; + swap_map = p->swap_map; + p->swap_map = NULL; + p->flags = 0; + swap_device_unlock(p); + swap_list_unlock(); + vfree(swap_map); + err = 0; + +out_dput: + unlock_kernel(); + path_release(&nd); +out: + return err; +} + +int get_swaparea_info(char *buf) +{ + char * page = (char *) __get_free_page(GFP_KERNEL); + struct swap_info_struct *ptr = swap_info; + int i, j, len = 0, usedswap; + + if (!page) + return -ENOMEM; + + len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n"); + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if ((ptr->flags & SWP_USED) && ptr->swap_map) { + char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt, + page, PAGE_SIZE); + + len += sprintf(buf + len, "%-31s ", path); + + if (!ptr->swap_device) + len += sprintf(buf + len, "file\t\t"); + else + len += sprintf(buf + len, "partition\t"); + + usedswap = 0; + for (j = 0; j < ptr->max; ++j) + switch (ptr->swap_map[j]) { + case SWAP_MAP_BAD: + case 0: + continue; + default: + usedswap++; + } + len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), + usedswap << (PAGE_SHIFT - 10), ptr->prio); + } + } + free_page((unsigned long) page); + return len; +} + +int is_swap_partition(kdev_t dev) { + struct swap_info_struct *ptr = swap_info; + int i; + + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if (ptr->flags & SWP_USED) + if (ptr->swap_device == dev) + return 1; + } + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
+ * + * The swapon system call + */ +asmlinkage long sys_swapon(const char * specialfile, int swap_flags) +{ + struct swap_info_struct * p; + struct nameidata nd; + struct inode * swap_inode; + unsigned int type; + int i, j, prev; + int error; + static int least_priority = 0; + union swap_header *swap_header = 0; + int swap_header_version; + int nr_good_pages = 0; + unsigned long maxpages = 1; + int swapfilesize; + struct block_device *bdev = NULL; + unsigned short *swap_map; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + lock_kernel(); + swap_list_lock(); + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + error = -EPERM; + if (type >= MAX_SWAPFILES) { + swap_list_unlock(); + goto out; + } + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_vfsmnt = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->cluster_nr = 0; + p->sdev_lock = SPIN_LOCK_UNLOCKED; + p->next = -1; + if (swap_flags & SWAP_FLAG_PREFER) { + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; + } else { + p->prio = --least_priority; + } + swap_list_unlock(); + error = user_path_walk(specialfile, &nd); + if (error) + goto bad_swap_2; + + p->swap_file = nd.dentry; + p->swap_vfsmnt = nd.mnt; + swap_inode = nd.dentry->d_inode; + error = -EINVAL; + + if (S_ISBLK(swap_inode->i_mode)) { + kdev_t dev = swap_inode->i_rdev; + struct block_device_operations *bdops; + + p->swap_device = dev; + set_blocksize(dev, PAGE_SIZE); + + bd_acquire(swap_inode); + bdev = swap_inode->i_bdev; + bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode)); + if (bdops) bdev->bd_op = bdops; + + error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); + if (error) + goto bad_swap_2; + set_blocksize(dev, PAGE_SIZE); + error = -ENODEV; + if (!dev || (blk_size[MAJOR(dev)] && + !blk_size[MAJOR(dev)][MINOR(dev)])) + goto bad_swap; + swapfilesize = 0; + if (blk_size[MAJOR(dev)]) + swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] + >> (PAGE_SHIFT - 10); + } else if (S_ISREG(swap_inode->i_mode)) + swapfilesize = swap_inode->i_size >> PAGE_SHIFT; + else + goto bad_swap; + + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + struct swap_info_struct *q = &swap_info[i]; + if (i == type || !q->swap_file) + continue; + if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) + goto bad_swap; + } + + swap_header = (void *) __get_free_page(GFP_USER); + if (!swap_header) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + + lock_page(virt_to_page(swap_header)); + rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header); + + if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) + swap_header_version = 1; + else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) + swap_header_version = 2; + else { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + + switch (swap_header_version) { + case 1: + memset(((char *) swap_header)+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,(char *) swap_header)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + maxpages = i+1; + j++; + } + } + nr_good_pages = j; + p->swap_map = vmalloc(maxpages * sizeof(short)); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < maxpages ; i++) { + if (test_bit(i,(char 
*) swap_header)) + p->swap_map[i] = 0; + else + p->swap_map[i] = SWAP_MAP_BAD; + } + break; + + case 2: + /* Check the swap header's sub-version and the size of + the swap file and bad block lists */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); + error = -EINVAL; + goto bad_swap; + } + + p->lowest_bit = 1; + maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; + + error = -EINVAL; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; + + /* OK, set up the swap map and apply the bad block list */ + if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + error = -ENOMEM; + goto bad_swap; + } + + error = 0; + memset(p->swap_map, 0, maxpages * sizeof(short)); + for (i=0; i<swap_header->info.nr_badpages; i++) { + int page = swap_header->info.badpages[i]; + if (page <= 0 || page >= swap_header->info.last_page) + error = -EINVAL; + else + p->swap_map[page] = SWAP_MAP_BAD; + } + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; + if (error) + goto bad_swap; + } + + if (swapfilesize && maxpages > swapfilesize) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + error = -EINVAL; + goto bad_swap; + } + if (!nr_good_pages) { + printk(KERN_WARNING "Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map[0] = SWAP_MAP_BAD; + swap_list_lock(); + swap_device_lock(p); + p->max = maxpages; + p->flags = SWP_WRITEOK; + p->pages = nr_good_pages; + nr_swap_pages += nr_good_pages; + total_swap_pages += nr_good_pages; + printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", + nr_good_pages<<(PAGE_SHIFT-10), p->prio); + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { + if (p->prio >= swap_info[i].prio) { + break; + } + prev = i; + } + p->next = i; + if (prev < 0) { + swap_list.head = swap_list.next = p - swap_info; + } else { + swap_info[prev].next = p - swap_info; + } + swap_device_unlock(p); + swap_list_unlock(); + error = 0; + goto out; +bad_swap: + if (bdev) + blkdev_put(bdev, BDEV_SWAP); +bad_swap_2: + swap_list_lock(); + swap_map = p->swap_map; + nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; + p->swap_device = 0; + p->swap_file = NULL; + p->swap_vfsmnt = NULL; + p->swap_map = NULL; + p->flags = 0; + if (!(swap_flags & SWAP_FLAG_PREFER)) + ++least_priority; + swap_list_unlock(); + if (swap_map) + vfree(swap_map); + path_release(&nd); +out: + if (swap_header) + free_page((long) swap_header); + unlock_kernel(); + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i; + unsigned long nr_to_be_unused = 0; + + swap_list_lock(); + for (i = 0; i < nr_swapfiles; i++) { + unsigned int j; + if (swap_info[i].flags != SWP_USED) + continue; + for (j = 0; j < swap_info[i].max; ++j) { + switch (swap_info[i].swap_map[j]) { + case 0: + case SWAP_MAP_BAD: + continue; + default: + nr_to_be_unused++; + } + } + } + val->freeswap = nr_swap_pages + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + swap_list_unlock(); +} + +/* + * Verify that a swap entry is valid and increment its swap map count. + * + * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as + * "permanent", but will be reclaimed by the next swapoff. 
+ */ +int swap_duplicate(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int result = 0; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + + swap_device_lock(p); + if (offset < p->max && p->swap_map[offset]) { + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { + p->swap_map[offset]++; + result = 1; + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = SWAP_MAP_MAX; + result = 1; + } + } + swap_device_unlock(p); +out: + return result; + +bad_file: + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + goto out; +} + +/* + * Page lock needs to be held in all cases to prevent races with + * swap file deletion. + */ +int swap_count(struct page *page) +{ + struct swap_info_struct * p; + unsigned long offset, type; + swp_entry_t entry; + int retval = 0; + + entry.val = page->index; + if (!entry.val) + goto bad_entry; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_unused; + retval = p->swap_map[offset]; +out: + return retval; + +bad_entry: + printk(KERN_ERR "swap_count: null entry!\n"); + goto out; +bad_file: + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_unused: + printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val); + goto out; +} + +/* + * Prior swap_duplicate protects against swap device deletion. + */ +void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, + kdev_t *dev, struct inode **swapf) +{ + unsigned long type; + struct swap_info_struct *p; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); + return; + } + + p = &swap_info[type]; + *offset = SWP_OFFSET(entry); + if (*offset >= p->max && *offset != 0) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); + return; + } + if (p->swap_map && !p->swap_map[*offset]) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); + return; + } + if (!(p->flags & SWP_USED)) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); + return; + } + + if (p->swap_device) { + *dev = p->swap_device; + } else if (p->swap_file) { + *swapf = p->swap_file->d_inode; + } else { + printk(KERN_ERR "rw_swap_page: no swap file or device\n"); + } + return; +} + +/* + * swap_device_lock prevents swap_map being freed. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. 
+ */ +int valid_swaphandles(swp_entry_t entry, unsigned long *offset) +{ + int ret = 0, i = 1 << page_cluster; + unsigned long toff; + struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; + + if (!page_cluster) /* no readahead */ + return 0; + toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; + if (!toff) /* first page is swap header */ + toff++, i--; + *offset = toff; + + swap_device_lock(swapdev); + do { + /* Don't read-ahead past the end of the swap area */ + if (toff >= swapdev->max) + break; + /* Don't read in free or bad pages */ + if (!swapdev->swap_map[toff]) + break; + if (swapdev->swap_map[toff] == SWAP_MAP_BAD) + break; + toff++; + ret++; + } while (--i); + swap_device_unlock(swapdev); + return ret; +} |
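Editorial note (not part of the patch): the three standalone C sketches below illustrate, under stated assumptions, mechanisms used by the swapfile.c code added above. All identifiers in them (NSLOTS, next_in_use, toy_swap_duplicate, readahead_window, and so on) are local stand-ins invented for the examples, not kernel definitions, and locking is deliberately omitted.

The first sketch mirrors the wrap-around scan performed by find_next_to_unuse() above: starting just after the previously handled slot, find the next swap_map slot still in use, loop back over the slots that were skipped, and return 0 once nothing remains.

    /* Toy model of the find_next_to_unuse() scan; slot 0 is reserved
     * for the swap header and is never returned. */
    #include <stdio.h>

    #define SWAP_MAP_BAD 0x8000
    #define NSLOTS       8

    static unsigned short swap_map[NSLOTS];   /* 0 = free, >0 = in use */

    static int next_in_use(int prev)
    {
        int max = NSLOTS;
        int i = prev;
        unsigned short count;

        for (;;) {
            if (++i >= max) {
                if (!prev) {        /* already rechecked from the start  */
                    i = 0;          /* 0 means "no entries left"         */
                    break;
                }
                max = prev + 1;     /* only recheck the slots we skipped */
                prev = 0;
                i = 1;              /* skip slot 0, the header           */
            }
            count = swap_map[i];
            if (count && count != SWAP_MAP_BAD)
                break;
        }
        return i;
    }

    int main(void)
    {
        int i = 0;

        swap_map[2] = 1;
        swap_map[5] = 3;
        swap_map[6] = SWAP_MAP_BAD;

        /* Visits slots 2 and 5, then stops once the map is empty. */
        while ((i = next_in_use(i))) {
            printf("slot %d still in use\n", i);
            swap_map[i] = 0;        /* pretend try_to_unuse() freed it */
        }
        return 0;
    }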
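The second sketch shows, in isolation, the use-count convention that swap_duplicate() above relies on: each swap_map slot holds a reference count, SWAP_MAP_BAD marks unusable slots, and counts are clamped at SWAP_MAP_MAX (and thereafter treated as "permanent" until swapoff) rather than being allowed to overflow. This is a simplified illustration; the real code also maintains lowest_bit/highest_bit and the free-page accounting.

    /* Toy model of the swap_map reference-count rules. */
    #include <stdio.h>

    #define SWAP_MAP_MAX  0x7fff
    #define SWAP_MAP_BAD  0x8000
    #define NSLOTS        16

    static unsigned short swap_map[NSLOTS];

    /* Take an extra reference on slot 'off'; returns 1 on success,
     * 0 if the slot is free or bad. */
    static int toy_swap_duplicate(unsigned int off)
    {
        if (off >= NSLOTS || !swap_map[off] || swap_map[off] == SWAP_MAP_BAD)
            return 0;
        if (swap_map[off] < SWAP_MAP_MAX - 1) {
            swap_map[off]++;
        } else {
            /* Clamp instead of wrapping; such entries stay "permanent"
             * until the next swapoff, as the comment above explains. */
            printf("warning: swap entry overflow at slot %u\n", off);
            swap_map[off] = SWAP_MAP_MAX;
        }
        return 1;
    }

    /* Drop one reference; returns the remaining count. */
    static unsigned short toy_swap_free(unsigned int off)
    {
        if (off >= NSLOTS || !swap_map[off] || swap_map[off] == SWAP_MAP_BAD)
            return 0;
        if (swap_map[off] < SWAP_MAP_MAX)    /* permanent entries stay put */
            swap_map[off]--;
        return swap_map[off];
    }

    int main(void)
    {
        swap_map[3] = 1;                              /* one user of slot 3 */
        toy_swap_duplicate(3);                        /* now two users      */
        printf("slot 3 count: %u\n", toy_swap_free(3)); /* back to 1        */
        return 0;
    }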
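The last sketch reproduces the readahead-window arithmetic of valid_swaphandles() above: the faulting offset is aligned down to a 2^page_cluster boundary, slot 0 is skipped because it holds the swap header, and the scan stops at the first free or bad slot or at the end of the area. The swap_map array, MAX_SLOTS and page_cluster values below are toy stand-ins for the kernel structures.

    /* Standalone illustration of the swap readahead window computation. */
    #include <stdio.h>

    #define SWAP_MAP_BAD  0x8000          /* marker for an unusable slot   */
    #define MAX_SLOTS     64              /* toy swap area size            */

    static unsigned short swap_map[MAX_SLOTS];   /* 0 = free, >0 = in use  */
    static int page_cluster = 3;                 /* window = 2^3 = 8 slots */

    /* Return how many consecutive in-use slots, starting at the aligned
     * window containing 'offset', are worth reading ahead; store the
     * window start in *start. */
    static int readahead_window(unsigned long offset, unsigned long *start)
    {
        int ret = 0, i = 1 << page_cluster;
        unsigned long toff;

        if (!page_cluster)                     /* readahead disabled */
            return 0;
        toff = (offset >> page_cluster) << page_cluster;  /* align down */
        if (!toff)                             /* slot 0 is the header */
            toff++, i--;
        *start = toff;

        do {
            if (toff >= MAX_SLOTS)             /* don't run off the end */
                break;
            if (!swap_map[toff])               /* free slot: stop       */
                break;
            if (swap_map[toff] == SWAP_MAP_BAD)/* bad slot: stop        */
                break;
            toff++;
            ret++;
        } while (--i);
        return ret;
    }

    int main(void)
    {
        unsigned long start;
        int j, n;

        /* Mark slots 8..13 as allocated and slot 14 as bad. */
        for (j = 8; j <= 13; j++)
            swap_map[j] = 1;
        swap_map[14] = SWAP_MAP_BAD;

        n = readahead_window(10, &start);      /* fault in mid-window   */
        printf("window starts at %lu, %d slots to read ahead\n", start, n);
        return 0;
    }

With the toy data above, a fault on slot 10 yields a window starting at slot 8 with 6 slots to read ahead, stopping at the bad slot 14; this is the same shape of result the kernel uses to size its swap readahead.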