diff options
Diffstat (limited to 'old/xenolinux-2.4.16-sparse/mm')
-rw-r--r-- | old/xenolinux-2.4.16-sparse/mm/memory.c | 1442 | ||||
-rw-r--r-- | old/xenolinux-2.4.16-sparse/mm/mremap.c | 354 | ||||
-rw-r--r-- | old/xenolinux-2.4.16-sparse/mm/swapfile.c | 1291 |
3 files changed, 3087 insertions, 0 deletions
diff --git a/old/xenolinux-2.4.16-sparse/mm/memory.c b/old/xenolinux-2.4.16-sparse/mm/memory.c new file mode 100644 index 0000000000..58eb472e2d --- /dev/null +++ b/old/xenolinux-2.4.16-sparse/mm/memory.c @@ -0,0 +1,1442 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + */ + +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/smp_lock.h> +#include <linux/swapctl.h> +#include <linux/iobuf.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> + +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> + +unsigned long max_mapnr; +unsigned long num_physpages; +void * high_memory; +struct page *highmem_start_page; + +/* + * We special-case the C-O-W ZERO_PAGE, because it's such + * a common occurrence (no need to read the page to know + * that it's zero - better for the cache and memory subsystem). + */ +static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) +{ + if (from == ZERO_PAGE(address)) { + clear_user_highpage(to, address); + return; + } + copy_user_highpage(to, from, address); +} + +mem_map_t * mem_map; + +/* + * Called by TLB shootdown + */ +void __free_pte(pte_t pte) +{ + struct page *page = pte_page(pte); + if ((!VALID_PAGE(page)) || PageReserved(page)) + return; + if (pte_dirty(pte)) + set_page_dirty(page); + free_page_and_swap_cache(page); +} + + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static inline void free_one_pmd(pmd_t * dir) +{ + pte_t * pte; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, 0); + pmd_clear(dir); + pte_free(pte); +} + +static inline void free_one_pgd(pgd_t * dir) +{ + int j; + pmd_t * pmd; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, 0); + pgd_clear(dir); + for (j = 0; j < PTRS_PER_PMD ; j++) { + prefetchw(pmd+j+(PREFETCH_STRIDE/16)); + free_one_pmd(pmd+j); + } + pmd_free(pmd); +} + +/* Low and high watermarks for page table cache. + The system should try to have pgt_water[0] <= cache elements <= pgt_water[1] + */ +int pgt_cache_water[2] = { 25, 50 }; + +/* Returns the number of pages freed */ +int check_pgt_cache(void) +{ + return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); +} + + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. + */ +void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) +{ + pgd_t * page_dir = mm->pgd; + + spin_lock(&mm->page_table_lock); + page_dir += first; + do { + free_one_pgd(page_dir); + page_dir++; + } while (--nr); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + + /* keep the page table cache within bounds */ + check_pgt_cache(); +} + +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + * + * 08Jan98 Merged into one routine from several inline routines to reduce + * variable count and make things faster. -jj + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within pmd_alloc() and pte_alloc(). + */ +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t * src_pgd, * dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE; + + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + /* copy_pmd_range */ + + if (pgd_none(*src_pgd)) + goto skip_copy_pmd_range; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); +skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (!address || (address >= end)) + goto out; + continue; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_alloc(dst, dst_pgd, address); + if (!dst_pmd) + goto nomem; + + do { + pte_t * src_pte, * dst_pte; + + /* copy_pte_range */ + + if (pmd_none(*src_pmd)) + goto skip_copy_pte_range; + if (pmd_bad(*src_pmd)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); +skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out; + goto cont_copy_pmd_range; + } + + src_pte = pte_offset(src_pmd, address); + dst_pte = pte_alloc(dst, dst_pmd, address); + if (!dst_pte) + goto nomem; + + spin_lock(&src->page_table_lock); + do { + pte_t pte = *src_pte; + struct page *ptepage; + + /* copy_one_pte */ + + if (pte_none(pte)) + goto cont_copy_pte_range_noset; + if (!pte_present(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + goto cont_copy_pte_range; + } + ptepage = pte_page(pte); + if ((!VALID_PAGE(ptepage)) || + PageReserved(ptepage)) + goto cont_copy_pte_range; + + /* If it's a COW mapping, write protect it both in the parent and the child */ + if (cow) { + /* XENO modification: modified ordering here to avoid RaW hazard. */ + pte = *src_pte; + pte = pte_wrprotect(pte); + ptep_set_wrprotect(src_pte); + } + + /* If it's a shared mapping, mark it clean in the child */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(ptepage); + dst->rss++; + +cont_copy_pte_range: set_pte(dst_pte, pte); +cont_copy_pte_range_noset: address += PAGE_SIZE; + if (address >= end) + goto out_unlock; + src_pte++; + dst_pte++; + } while ((unsigned long)src_pte & PTE_TABLE_MASK); + spin_unlock(&src->page_table_lock); + +cont_copy_pmd_range: src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + } +out_unlock: + spin_unlock(&src->page_table_lock); +out: + return 0; +nomem: + return -ENOMEM; +} + +/* + * Return indicates whether a page was freed so caller can adjust rss + */ +static inline void forget_pte(pte_t page) +{ + if (!pte_none(page)) { + printk("forget_pte: old mapping existed!\n"); + BUG(); + } +} + +static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) +{ + unsigned long offset; + pte_t * ptep; + int freed = 0; + + if (pmd_none(*pmd)) + return 0; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return 0; + } + ptep = pte_offset(pmd, address); + offset = address & ~PMD_MASK; + if (offset + size > PMD_SIZE) + size = PMD_SIZE - offset; + size &= PAGE_MASK; + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + pte_t pte = *ptep; + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + struct page *page = pte_page(pte); + if (VALID_PAGE(page) && !PageReserved(page)) + freed ++; + /* This will eventually call __free_pte on the pte. */ + tlb_remove_page(tlb, ptep, address + offset); + } else { + free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear(ptep); + } + } + + return freed; +} + +static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + int freed; + + if (pgd_none(*dir)) + return 0; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return 0; + } + pmd = pmd_offset(dir, address); + end = address + size; + if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + end = ((address + PGDIR_SIZE) & PGDIR_MASK); + freed = 0; + do { + freed += zap_pte_range(tlb, pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); + return freed; +} + +/* + * remove user pages in a given range. + */ +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +{ + mmu_gather_t *tlb; + pgd_t * dir; + unsigned long start = address, end = address + size; + int freed = 0; + + dir = pgd_offset(mm, address); + + /* + * This is a long-lived spinlock. That's fine. + * There's no contention, because the page table + * lock only protects against kswapd anyway, and + * even if kswapd happened to be looking at this + * process we _want_ it to get stuck. + */ + if (address >= end) + BUG(); + spin_lock(&mm->page_table_lock); + flush_cache_range(mm, address, end); + tlb = tlb_gather_mmu(mm); + + do { + freed += zap_pmd_range(tlb, dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + + /* this will flush any remaining tlb entries */ + tlb_finish_mmu(tlb, start, end); + + /* + * Update rss for the mm_struct (not necessarily current->mm) + * Notice that rss is an unsigned long. + */ + if (mm->rss > freed) + mm->rss -= freed; + else + mm->rss = 0; + spin_unlock(&mm->page_table_lock); +} + + +/* + * Do a quick page-table lookup for a single page. + */ +static struct page * follow_page(unsigned long address, int write) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *ptep, pte; + + pgd = pgd_offset(current->mm, address); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + goto out; + + ptep = pte_offset(pmd, address); + if (!ptep) + goto out; + + pte = *ptep; + if (pte_present(pte)) { + if (!write || + (pte_write(pte) && pte_dirty(pte))) + return pte_page(pte); + } + +out: + return 0; +} + +/* + * Given a physical address, is there a useful struct page pointing to + * it? This may become more complex in the future if we start dealing + * with IO-aperture pages in kiobufs. + */ + +static inline struct page * get_page_map(struct page *page) +{ + if (!VALID_PAGE(page)) + return 0; + return page; +} + +/* + * Force in an entire range of pages from the current process's user VA, + * and pin them in physical memory. + */ + +#define dprintk(x...) +int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) +{ + unsigned long ptr, end; + int err; + struct mm_struct * mm; + struct vm_area_struct * vma = 0; + struct page * map; + int i; + int datain = (rw == READ); + + /* Make sure the iobuf is not already mapped somewhere. */ + if (iobuf->nr_pages) + return -EINVAL; + + mm = current->mm; + dprintk ("map_user_kiobuf: begin\n"); + + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); + if (err) + return err; + + down_read(&mm->mmap_sem); + + err = -EFAULT; + iobuf->locked = 0; + iobuf->offset = va & ~PAGE_MASK; + iobuf->length = len; + + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + page_cache_get(map); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; + } + + up_read(&mm->mmap_sem); + dprintk ("map_user_kiobuf: end OK\n"); + return 0; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; +} + +/* + * Mark all of the pages in a kiobuf as dirty + * + * We need to be able to deal with short reads from disk: if an IO error + * occurs, the number of bytes read into memory may be less than the + * size of the kiobuf, so we have to stop marking pages dirty once the + * requested byte count has been reached. + */ + +void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes) +{ + int index, offset, remaining; + struct page *page; + + index = iobuf->offset >> PAGE_SHIFT; + offset = iobuf->offset & ~PAGE_MASK; + remaining = bytes; + if (remaining > iobuf->length) + remaining = iobuf->length; + + while (remaining > 0 && index < iobuf->nr_pages) { + page = iobuf->maplist[index]; + + if (!PageReserved(page)) + SetPageDirty(page); + + remaining -= (PAGE_SIZE - offset); + offset = 0; + index++; + } +} + +/* + * Unmap all of the pages referenced by a kiobuf. We release the pages, + * and unlock them if they were locked. + */ + +void unmap_kiobuf (struct kiobuf *iobuf) +{ + int i; + struct page *map; + + for (i = 0; i < iobuf->nr_pages; i++) { + map = iobuf->maplist[i]; + if (map) { + if (iobuf->locked) + UnlockPage(map); + page_cache_release(map); + } + } + + iobuf->nr_pages = 0; + iobuf->locked = 0; +} + + +/* + * Lock down all of the pages of a kiovec for IO. + * + * If any page is mapped twice in the kiovec, we return the error -EINVAL. + * + * The optional wait parameter causes the lock call to block until all + * pages can be locked if set. If wait==0, the lock operation is + * aborted if any locked pages are found and -EAGAIN is returned. + */ + +int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + int doublepage = 0; + int repeat = 0; + + repeat: + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (iobuf->locked) + continue; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + + if (TryLockPage(page)) { + while (j--) { + struct page *tmp = *--ppage; + if (tmp) + UnlockPage(tmp); + } + goto retry; + } + } + iobuf->locked = 1; + } + + return 0; + + retry: + + /* + * We couldn't lock one of the pages. Undo the locking so far, + * wait on the page we got to, and try again. + */ + + unlock_kiovec(nr, iovec); + if (!wait) + return -EAGAIN; + + /* + * Did the release also unlock the page we got stuck on? + */ + if (!PageLocked(page)) { + /* + * If so, we may well have the page mapped twice + * in the IO address range. Bad news. Of + * course, it _might_ just be a coincidence, + * but if it happens more than once, chances + * are we have a double-mapped page. + */ + if (++doublepage >= 3) + return -EINVAL; + + /* Try again... */ + wait_on_page(page); + } + + if (++repeat < 16) + goto repeat; + return -EAGAIN; +} + +/* + * Unlock all of the pages of a kiovec after IO. + */ + +int unlock_kiovec(int nr, struct kiobuf *iovec[]) +{ + struct kiobuf *iobuf; + int i, j; + struct page *page, **ppage; + + for (i = 0; i < nr; i++) { + iobuf = iovec[i]; + + if (!iobuf->locked) + continue; + iobuf->locked = 0; + + ppage = iobuf->maplist; + for (j = 0; j < iobuf->nr_pages; ppage++, j++) { + page = *ppage; + if (!page) + continue; + UnlockPage(page); + } + } + return 0; +} + +static inline void zeromap_pte_range(pte_t * pte, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); + pte_t oldpage = ptep_get_and_clear(pte); + set_pte(pte, zero_pte); + forget_pte(oldpage); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + zeromap_pte_range(pte, address, end - address, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = address; + unsigned long end = address + size; + struct mm_struct *mm = current->mm; + + dir = pgd_offset(mm, address); + flush_cache_range(mm, beg, end); + if (address >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + if (error) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + struct page *page; + pte_t oldpage; + oldpage = ptep_get_and_clear(pte); + + page = virt_to_page(__va(phys_addr)); + if ((!VALID_PAGE(page)) || PageReserved(page)) + set_pte(pte, mk_pte_phys(phys_addr, prot)); + forget_pte(oldpage); + address += PAGE_SIZE; + phys_addr += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + phys_addr -= address; + do { + pte_t * pte = pte_alloc(mm, pmd, address); + if (!pte) + return -ENOMEM; + remap_pte_range(pte, address, end - address, address + phys_addr, prot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +/* Note: this is only safe if the mm semaphore is held when called. */ +int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = from; + unsigned long end = from + size; + struct mm_struct *mm = current->mm; + + phys_addr -= from; + dir = pgd_offset(mm, from); + flush_cache_range(mm, beg, end); + if (from >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, from); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + if (error) + break; + from = (from + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (from && (from < end)); + spin_unlock(&mm->page_table_lock); + flush_tlb_range(mm, beg, end); + return error; +} + +/* + * Establish a new mapping: + * - flush the old one + * - update the page tables + * - inform the TLB about the new one + * + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +{ + set_pte(page_table, entry); + flush_tlb_page(vma, address); + update_mmu_cache(vma, address, entry); +} + +/* + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, + pte_t *page_table) +{ + flush_page_to_ram(new_page); + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We hold the mm semaphore and the page_table_lock on entry and exit + * with the page_table_lock released. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table, pte_t pte) +{ + struct page *old_page, *new_page; + + old_page = pte_page(pte); + if (!VALID_PAGE(old_page)) + goto bad_wp_page; + + if (!TryLockPage(old_page)) { + int reuse = can_share_swap_page(old_page); + unlock_page(old_page); + if (reuse) { + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + } + } + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); + spin_unlock(&mm->page_table_lock); + + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) + goto no_mem; + copy_cow_page(old_page,new_page,address); + + /* + * Re-check the pte - we dropped the lock + */ + spin_lock(&mm->page_table_lock); + if (pte_same(*page_table, pte)) { + if (PageReserved(old_page)) + ++mm->rss; + break_cow(vma, new_page, address, page_table); + lru_cache_add(new_page); + + /* Free the old page.. */ + new_page = old_page; + } + spin_unlock(&mm->page_table_lock); + page_cache_release(new_page); + page_cache_release(old_page); + return 1; /* Minor fault */ + +bad_wp_page: + spin_unlock(&mm->page_table_lock); + printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); + return -1; +no_mem: + page_cache_release(old_page); + return -1; +} + +static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff) +{ + do { + struct mm_struct *mm = mpnt->vm_mm; + unsigned long start = mpnt->vm_start; + unsigned long end = mpnt->vm_end; + unsigned long len = end - start; + unsigned long diff; + + /* mapping wholly truncated? */ + if (mpnt->vm_pgoff >= pgoff) { + zap_page_range(mm, start, len); + continue; + } + + /* mapping wholly unaffected? */ + len = len >> PAGE_SHIFT; + diff = pgoff - mpnt->vm_pgoff; + if (diff >= len) + continue; + + /* Ok, partially affected.. */ + start += diff << PAGE_SHIFT; + len = (len - diff) << PAGE_SHIFT; + zap_page_range(mm, start, len); + } while ((mpnt = mpnt->vm_next_share) != NULL); +} + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + unsigned long pgoff; + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + inode->i_size = offset; + spin_lock(&mapping->i_shared_lock); + if (!mapping->i_mmap && !mapping->i_mmap_shared) + goto out_unlock; + + pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (mapping->i_mmap != NULL) + vmtruncate_list(mapping->i_mmap, pgoff); + if (mapping->i_mmap_shared != NULL) + vmtruncate_list(mapping->i_mmap_shared, pgoff); + +out_unlock: + spin_unlock(&mapping->i_shared_lock); + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY) { + if (inode->i_size >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (offset > limit) { + send_sig(SIGXFSZ, current, 0); + offset = limit; + } + } + inode->i_size = offset; + +out_truncate: + if (inode->i_op && inode->i_op->truncate) { + lock_kernel(); + inode->i_op->truncate(inode); + unlock_kernel(); + } +out: + return 0; +} + +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + */ +void swapin_readahead(swp_entry_t entry) +{ + int i, num; + struct page *new_page; + unsigned long offset; + + /* + * Get the number of handles we should do readahead io to. + */ + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); + if (!new_page) + break; + page_cache_release(new_page); + } + return; +} + +/* + * We hold the mm semaphore and the page_table_lock on entry and + * should release the pagetable lock on exit.. + */ +static int do_swap_page(struct mm_struct * mm, + struct vm_area_struct * vma, unsigned long address, + pte_t * page_table, pte_t orig_pte, int write_access) +{ + struct page *page; + swp_entry_t entry = pte_to_swp_entry(orig_pte); + pte_t pte; + int ret = 1; + + spin_unlock(&mm->page_table_lock); + page = lookup_swap_cache(entry); + if (!page) { + swapin_readahead(entry); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Back out if somebody else faulted in this pte while + * we released the page table lock. + */ + int retval; + spin_lock(&mm->page_table_lock); + retval = pte_same(*page_table, orig_pte) ? -1 : 1; + spin_unlock(&mm->page_table_lock); + return retval; + } + + /* Had to read the page from swap area: Major fault */ + ret = 2; + } + + lock_page(page); + + /* + * Back out if somebody else faulted in this pte while we + * released the page table lock. + */ + spin_lock(&mm->page_table_lock); + if (!pte_same(*page_table, orig_pte)) { + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + return 1; + } + + /* The page isn't present yet, go ahead with the fault. */ + + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + + mm->rss++; + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) + pte = pte_mkdirty(pte_mkwrite(pte)); + unlock_page(page); + + flush_page_to_ram(page); + flush_icache_page(vma, page); + set_pte(page_table, pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return ret; +} + +/* + * We are called with the MM semaphore and page_table_lock + * spinlock held to protect against concurrent faults in + * multithreaded programs. + */ +static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) +{ + pte_t entry; + + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + + /* ..except if it's a write access */ + if (write_access) { + struct page *page; + + /* Allocate our own private page. */ + spin_unlock(&mm->page_table_lock); + + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto no_mem; + clear_user_highpage(page, addr); + + spin_lock(&mm->page_table_lock); + if (!pte_none(*page_table)) { + page_cache_release(page); + spin_unlock(&mm->page_table_lock); + return 1; + } + mm->rss++; + flush_page_to_ram(page); + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + lru_cache_add(page); + } + + set_pte(page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, entry); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ + +no_mem: + return -1; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore held and the page table + * spinlock held. Exit with the spinlock released. + */ +static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *page_table) +{ + struct page * new_page; + pte_t entry; + + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, page_table, write_access, address); + spin_unlock(&mm->page_table_lock); + + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + + if (new_page == NULL) /* no page was available -- SIGBUS */ + return 0; + if (new_page == NOPAGE_OOM) + return -1; + + /* + * Should we do an early C-O-W break? + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) { + struct page * page = alloc_page(GFP_HIGHUSER); + if (!page) + return -1; + copy_highpage(page, new_page); + page_cache_release(new_page); + lru_cache_add(page); + new_page = page; + } + + spin_lock(&mm->page_table_lock); + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + ++mm->rss; + flush_page_to_ram(new_page); + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + if (write_access) + entry = pte_mkwrite(pte_mkdirty(entry)); + set_pte(page_table, entry); + } else { + /* One of our sibling threads was faster, back out. */ + page_cache_release(new_page); + spin_unlock(&mm->page_table_lock); + return 1; + } + + /* no need to invalidate: a not-present page shouldn't be cached */ + update_mmu_cache(vma, address, entry); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return 2; /* Major fault */ +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * Note the "page_table_lock". It is to protect against kswapd removing + * pages from under us. Note that kswapd only ever _removes_ pages, never + * adds them. As such, once we have noticed that the page is not present, + * we can drop the lock early. + * + * The adding of pages is protected by the MM semaphore (which we hold), + * so we don't need to worry about a page being suddenly been added into + * our VM. + * + * We enter with the pagetable spinlock held, we are supposed to + * release it when done. + */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t * pte) +{ + pte_t entry; + + entry = *pte; + if (!pte_present(entry)) { + /* + * If it truly wasn't present, we know that kswapd + * and the PTE updates will not touch it later. So + * drop the lock. + */ + if (pte_none(entry)) + return do_no_page(mm, vma, address, write_access, pte); + return do_swap_page(mm, vma, address, pte, entry, write_access); + } + + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, pte, entry); + + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + establish_pte(vma, address, pte, entry); + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return 1; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pmd_t *pmd; + + current->state = TASK_RUNNING; + pgd = pgd_offset(mm, address); + + /* + * We need the page table lock to synchronize with kswapd + * and the SMP-safe atomic PTE updates. + */ + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc(mm, pgd, address); + + if (pmd) { + pte_t * pte = pte_alloc(mm, pmd, address); + if (pte) + return handle_pte_fault(mm, vma, address, write_access, pte); + } + spin_unlock(&mm->page_table_lock); + return -1; +} + +/* + * Allocate page middle directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + * + * On a two-level page table, this ends up actually being entirely + * optimized away. + */ +pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + /* "fast" allocation can happen without dropping the lock.. */ + new = pmd_alloc_one_fast(mm, address); + if (!new) { + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pgd_none(*pgd)) { + pmd_free(new); + goto out; + } + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset(pgd, address); +} + +/* + * Allocate the page table directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + */ +pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (pmd_none(*pmd)) { + pte_t *new; + + /* "fast" allocation can happen without dropping the lock.. */ + new = pte_alloc_one_fast(mm, address); + if (!new) { + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (!pmd_none(*pmd)) { + pte_free(new); + goto out; + } + } + pmd_populate(mm, pmd, new); + } +out: + return pte_offset(pmd, address); +} + +/* + * Simplistic page force-in.. + */ +int make_pages_present(unsigned long addr, unsigned long end) +{ + int write; + struct mm_struct *mm = current->mm; + struct vm_area_struct * vma; + + vma = find_vma(mm, addr); + write = (vma->vm_flags & VM_WRITE) != 0; + if (addr >= end) + BUG(); + do { + if (handle_mm_fault(mm, vma, addr, write) < 0) + return -1; + addr += PAGE_SIZE; + } while (addr < end); + return 0; +} diff --git a/old/xenolinux-2.4.16-sparse/mm/mremap.c b/old/xenolinux-2.4.16-sparse/mm/mremap.c new file mode 100644 index 0000000000..a2e0d860dd --- /dev/null +++ b/old/xenolinux-2.4.16-sparse/mm/mremap.c @@ -0,0 +1,354 @@ +/* + * linux/mm/remap.c + * + * (C) Copyright 1996 Linus Torvalds + */ + +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/swap.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> + +extern int vm_enough_memory(long pages); + +static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pgd_t * pgd; + pmd_t * pmd; + pte_t * pte = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + goto end; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + goto end; + } + + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto end; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto end; + } + + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) + pte = NULL; +end: + return pte; +} + +static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) +{ + pmd_t * pmd; + pte_t * pte = NULL; + + pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + if (pmd) + pte = pte_alloc(mm, pmd, addr); + return pte; +} + +static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) +{ + int error = 0; + pte_t pte; + + if (!pte_none(*src)) { + pte = ptep_get_and_clear(src); + if (!dst) { + /* No dest? We must put it back. */ + dst = src; + error++; + } + set_pte(dst, pte); + } + return error; +} + +static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) +{ + int error = 0; + pte_t * src; + + spin_lock(&mm->page_table_lock); + src = get_one_pte(mm, old_addr); + if (src) + error = copy_one_pte(mm, src, alloc_one_pte(mm, new_addr)); + spin_unlock(&mm->page_table_lock); + return error; +} + +static int move_page_tables(struct mm_struct * mm, + unsigned long new_addr, unsigned long old_addr, unsigned long len) +{ + unsigned long offset = len; + + flush_cache_range(mm, old_addr, old_addr + len); + + /* + * This is not the clever way to do this, but we're taking the + * easy way out on the assumption that most remappings will be + * only a few pages.. This also makes error recovery easier. + */ + while (offset) { + offset -= PAGE_SIZE; + if (move_one_page(mm, old_addr + offset, new_addr + offset)) + goto oops_we_failed; + } + flush_tlb_range(mm, old_addr, old_addr + len); + return 0; + + /* + * Ok, the move failed because we didn't have enough pages for + * the new page table tree. This is unlikely, but we have to + * take the possibility into account. In that case we just move + * all the pages back (this will work, because we still have + * the old page tables) + */ +oops_we_failed: + XENO_flush_page_update_queue(); + flush_cache_range(mm, new_addr, new_addr + len); + while ((offset += PAGE_SIZE) < len) + move_one_page(mm, new_addr + offset, old_addr + offset); + XENO_flush_page_update_queue(); + zap_page_range(mm, new_addr, len); + return -1; +} + +static inline unsigned long move_vma(struct vm_area_struct * vma, + unsigned long addr, unsigned long old_len, unsigned long new_len, + unsigned long new_addr) +{ + struct mm_struct * mm = vma->vm_mm; + struct vm_area_struct * new_vma, * next, * prev; + int allocated_vma; + + new_vma = NULL; + next = find_vma_prev(mm, new_addr, &prev); + if (next) { + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + if (next != prev->vm_next) + BUG(); + if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = next->vm_end; + __vma_unlink(mm, next, prev); + spin_unlock(&mm->page_table_lock); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); + } + } else if (next->vm_start == new_addr + new_len && + can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + next->vm_start = new_addr; + spin_unlock(&mm->page_table_lock); + new_vma = next; + } + } else { + prev = find_vma(mm, new_addr-1); + if (prev && prev->vm_end == new_addr && + can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { + spin_lock(&mm->page_table_lock); + prev->vm_end = new_addr + new_len; + spin_unlock(&mm->page_table_lock); + new_vma = prev; + } + } + + allocated_vma = 0; + if (!new_vma) { + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!new_vma) + goto out; + allocated_vma = 1; + } + + if (!move_page_tables(current->mm, new_addr, addr, old_len)) { + if (allocated_vma) { + *new_vma = *vma; + new_vma->vm_start = new_addr; + new_vma->vm_end = new_addr+new_len; + new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + new_vma->vm_raend = 0; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + insert_vm_struct(current->mm, new_vma); + } + do_munmap(current->mm, addr, old_len); + current->mm->total_vm += new_len >> PAGE_SHIFT; + if (new_vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += new_len >> PAGE_SHIFT; + make_pages_present(new_vma->vm_start, + new_vma->vm_end); + } + return new_addr; + } + if (allocated_vma) + kmem_cache_free(vm_area_cachep, new_vma); + out: + return -ENOMEM; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. + */ +unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + goto out; + + if (addr & ~PAGE_MASK) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* new_addr is only valid if MREMAP_FIXED is specified */ + if (flags & MREMAP_FIXED) { + if (new_addr & ~PAGE_MASK) + goto out; + if (!(flags & MREMAP_MAYMOVE)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + do_munmap(current->mm, new_addr, new_len); + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + */ + ret = addr; + if (old_len >= new_len) { + do_munmap(current->mm, addr+new_len, old_len - new_len); + if (!(flags & MREMAP_FIXED) || (new_addr == addr)) + goto out; + } + + /* + * Ok, we need to grow.. or relocate. + */ + ret = -EFAULT; + vma = find_vma(current->mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto out; + if (vma->vm_flags & VM_DONTEXPAND) { + if (new_len > old_len) + goto out; + } + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; + locked += new_len - old_len; + ret = -EAGAIN; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + goto out; + } + ret = -ENOMEM; + if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) + > current->rlim[RLIMIT_AS].rlim_cur) + goto out; + /* Private writable mapping? Check memory availability.. */ + if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && + !(flags & MAP_NORESERVE) && + !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) + goto out; + + /* old_len exactly to the end of the area.. + * And we're not relocating the area. + */ + if (old_len == vma->vm_end - addr && + !((flags & MREMAP_FIXED) && (addr != new_addr)) && + (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { + unsigned long max_addr = TASK_SIZE; + if (vma->vm_next) + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + spin_lock(&vma->vm_mm->page_table_lock); + vma->vm_end = addr + new_len; + spin_unlock(&vma->vm_mm->page_table_lock); + current->mm->total_vm += pages; + if (vma->vm_flags & VM_LOCKED) { + current->mm->locked_vm += pages; + make_pages_present(addr + old_len, + addr + new_len); + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. + */ + ret = -ENOMEM; + if (flags & MREMAP_MAYMOVE) { + if (!(flags & MREMAP_FIXED)) { + unsigned long map_flags = 0; + if (vma->vm_flags & VM_SHARED) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); + ret = new_addr; + if (new_addr & ~PAGE_MASK) + goto out; + } + ret = move_vma(vma, addr, old_len, new_len, new_addr); + } +out: + return ret; +} + +asmlinkage unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + unsigned long ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; +} diff --git a/old/xenolinux-2.4.16-sparse/mm/swapfile.c b/old/xenolinux-2.4.16-sparse/mm/swapfile.c new file mode 100644 index 0000000000..48846184d4 --- /dev/null +++ b/old/xenolinux-2.4.16-sparse/mm/swapfile.c @@ -0,0 +1,1291 @@ +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/blkdev.h> /* for blk_size */ +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/shm.h> +#include <linux/compiler.h> + +#include <asm/pgtable.h> + +spinlock_t swaplock = SPIN_LOCK_UNLOCKED; +unsigned int nr_swapfiles; +int total_swap_pages; +static int swap_overflow; + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; + +struct swap_list_t swap_list = {-1, -1}; + +struct swap_info_struct swap_info[MAX_SWAPFILES]; + +#define SWAPFILE_CLUSTER 256 + +static inline int scan_swap_map(struct swap_info_struct *si) +{ + unsigned long offset; + /* + * We try to cluster swap pages by allocating them + * sequentially in swap. Once we've allocated + * SWAPFILE_CLUSTER pages this way, however, we resort to + * first-free allocation, starting a new cluster. This + * prevents us from scattering swap pages all over the entire + * swap partition, so that we reduce overall disk seek times + * between swap pages. -- sct */ + if (si->cluster_nr) { + while (si->cluster_next <= si->highest_bit) { + offset = si->cluster_next++; + if (si->swap_map[offset]) + continue; + si->cluster_nr--; + goto got_page; + } + } + si->cluster_nr = SWAPFILE_CLUSTER; + + /* try to find an empty (even not aligned) cluster. */ + offset = si->lowest_bit; + check_next_cluster: + if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) + { + int nr; + for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) + if (si->swap_map[nr]) + { + offset = nr+1; + goto check_next_cluster; + } + /* We found a completly empty cluster, so start + * using it. + */ + goto got_page; + } + /* No luck, so now go finegrined as usual. -Andrea */ + for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { + if (si->swap_map[offset]) + continue; + si->lowest_bit = offset+1; + got_page: + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + if (si->lowest_bit > si->highest_bit) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + nr_swap_pages--; + si->cluster_next = offset+1; + return offset; + } + si->lowest_bit = si->max; + si->highest_bit = 0; + return 0; +} + +swp_entry_t get_swap_page(void) +{ + struct swap_info_struct * p; + unsigned long offset; + swp_entry_t entry; + int type, wrapped = 0; + + entry.val = 0; /* Out of memory */ + swap_list_lock(); + type = swap_list.next; + if (type < 0) + goto out; + if (nr_swap_pages <= 0) + goto out; + + while (1) { + p = &swap_info[type]; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + swap_device_lock(p); + offset = scan_swap_map(p); + swap_device_unlock(p); + if (offset) { + entry = SWP_ENTRY(type,offset); + type = swap_info[type].next; + if (type < 0 || + p->prio != swap_info[type].prio) { + swap_list.next = swap_list.head; + } else { + swap_list.next = type; + } + goto out; + } + } + type = p->next; + if (!wrapped) { + if (type < 0 || p->prio != swap_info[type].prio) { + type = swap_list.head; + wrapped = 1; + } + } else + if (type < 0) + goto out; /* out of swap space */ + } +out: + swap_list_unlock(); + return entry; +} + +static struct swap_info_struct * swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + + if (!entry.val) + goto out; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = & swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + swap_list_lock(); + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = type; + swap_device_lock(p); + return p; + +bad_free: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_device: + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + goto out; +bad_nofile: + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static void swap_info_put(struct swap_info_struct * p) +{ + swap_device_unlock(p); + swap_list_unlock(); +} + +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) +{ + int count = p->swap_map[offset]; + + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = count; + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + nr_swap_pages++; + } + } + return count; +} + +/* + * Caller has made sure that the swapdevice corresponding to entry + * is still around or has not been recycled. + */ +void swap_free(swp_entry_t entry) +{ + struct swap_info_struct * p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, SWP_OFFSET(entry)); + swap_info_put(p); + } +} + +/* + * Check if we're the only user of a swap page, + * when the page is locked. + */ +static int exclusive_swap_page(struct page *page) +{ + int retval = 0; + struct swap_info_struct * p; + swp_entry_t entry; + + entry.val = page->index; + p = swap_info_get(entry); + if (p) { + /* Is the only swap cache user the cache itself? */ + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) + retval = 1; + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + } + return retval; +} + +/* + * We can use this swap cache entry directly + * if there are no other references to it. + * + * Here "exclusive_swap_page()" does the real + * work, but we opportunistically check whether + * we need to get all the locks first.. + */ +int can_share_swap_page(struct page *page) +{ + int retval = 0; + + if (!PageLocked(page)) + BUG(); + switch (page_count(page)) { + case 3: + if (!page->buffers) + break; + /* Fallthrough */ + case 2: + if (!PageSwapCache(page)) + break; + retval = exclusive_swap_page(page); + break; + case 1: + if (PageReserved(page)) + break; + retval = 1; + } + return retval; +} + +/* + * Work out if there are any other processes sharing this + * swap cache page. Free it if you can. Return success. + */ +int remove_exclusive_swap_page(struct page *page) +{ + int retval; + struct swap_info_struct * p; + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + return 0; + if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + return 0; + + entry.val = page->index; + p = swap_info_get(entry); + if (!p) + return 0; + + /* Is the only swap cache user the cache itself? */ + retval = 0; + if (p->swap_map[SWP_OFFSET(entry)] == 1) { + /* Recheck the page count with the pagecache lock held.. */ + spin_lock(&pagecache_lock); + if (page_count(page) - !!page->buffers == 2) { + __delete_from_swap_cache(page); + SetPageDirty(page); + retval = 1; + } + spin_unlock(&pagecache_lock); + } + swap_info_put(p); + + if (retval) { + block_flushpage(page, 0); + swap_free(entry); + page_cache_release(page); + } + + return retval; +} + +/* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. + */ +void free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct * p; + struct page *page = NULL; + + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, SWP_OFFSET(entry)) == 1) + page = find_trylock_page(&swapper_space, entry.val); + swap_info_put(p); + } + if (page) { + page_cache_get(page); + /* Only cache user (+us), or swap space full? Free it! */ + if (page_count(page) == 2 || vm_swap_full()) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + UnlockPage(page); + page_cache_release(page); + } +} + +/* + * The swap entry has been read in advance, and we return 1 to indicate + * that the page has been used or is no longer needed. + * + * Always set the resulting pte to be nowrite (the same as COW pages + * after one process has exited). We don't know just how many PTEs will + * share this swap entry, so be cautious and let do_wp_page work out + * what to do if a write is requested later. + */ +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, + pte_t *dir, swp_entry_t entry, struct page* page) +{ + pte_t pte = *dir; + + if (likely(pte_to_swp_entry(pte).val != entry.val)) + return; + if (unlikely(pte_none(pte) || pte_present(pte))) + return; + get_page(page); + set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + swap_free(entry); + ++vma->vm_mm->rss; +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, + unsigned long address, unsigned long size, unsigned long offset, + swp_entry_t entry, struct page* page) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + pte = pte_offset(dir, address); + offset += address & PMD_MASK; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, + unsigned long address, unsigned long size, + swp_entry_t entry, struct page* page) +{ + pmd_t * pmd; + unsigned long offset, end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + offset = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + if (address >= end) + BUG(); + do { + unuse_pmd(vma, pmd, address, end - address, offset, entry, + page); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +/* mmlist_lock and vma->vm_mm->page_table_lock are held */ +static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, + swp_entry_t entry, struct page* page) +{ + unsigned long start = vma->vm_start, end = vma->vm_end; + + if (start >= end) + BUG(); + do { + unuse_pgd(vma, pgdir, start, end - start, entry, page); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while (start && (start < end)); +} + +static void unuse_process(struct mm_struct * mm, + swp_entry_t entry, struct page* page) +{ + struct vm_area_struct* vma; + + /* + * Go through process' page directory. + */ + spin_lock(&mm->page_table_lock); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + pgd_t * pgd = pgd_offset(mm, vma->vm_start); + unuse_vma(vma, pgd, entry, page); + } + XENO_flush_page_update_queue(); + spin_unlock(&mm->page_table_lock); + return; +} + +/* + * Scan swap_map from current position to next entry still in use. + * Recycle to start on reaching the end, returning 0 when empty. + */ +static int find_next_to_unuse(struct swap_info_struct *si, int prev) +{ + int max = si->max; + int i = prev; + int count; + + /* + * No need for swap_device_lock(si) here: we're just looking + * for whether an entry is in use, not modifying it; false + * hits are okay, and sys_swapoff() has already prevented new + * allocations from this area (while holding swap_list_lock()). + */ + for (;;) { + if (++i >= max) { + if (!prev) { + i = 0; + break; + } + /* + * No entries in use at top of swap_map, + * loop back to start and recheck there. + */ + max = prev + 1; + prev = 0; + i = 1; + } + count = si->swap_map[i]; + if (count && count != SWAP_MAP_BAD) + break; + } + return i; +} + +/* + * We completely avoid races by reading each swap page in advance, + * and then search for the process using it. All the necessary + * page table adjustments can then be made atomically. + */ +static int try_to_unuse(unsigned int type) +{ + struct swap_info_struct * si = &swap_info[type]; + struct mm_struct *start_mm; + unsigned short *swap_map; + unsigned short swcount; + struct page *page; + swp_entry_t entry; + int i = 0; + int retval = 0; + int reset_overflow = 0; + + /* + * When searching mms for an entry, a good strategy is to + * start at the first mm we freed the previous entry from + * (though actually we don't notice whether we or coincidence + * freed the entry). Initialize this start_mm with a hold. + * + * A simpler strategy would be to start at the last mm we + * freed the previous entry from; but that would take less + * advantage of mmlist ordering (now preserved by swap_out()), + * which clusters forked address spaces together, most recent + * child immediately after parent. If we race with dup_mmap(), + * we very much want to resolve parent before child, otherwise + * we may miss some entries: using last mm would invert that. + */ + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + + /* + * Keep on scanning until all entries have gone. Usually, + * one pass through swap_map is enough, but not necessarily: + * mmput() removes mm from mmlist before exit_mmap() and its + * zap_page_range(). That's not too bad, those entries are + * on their way out, and handled faster there than here. + * do_munmap() behaves similarly, taking the range out of mm's + * vma list before zap_page_range(). But unfortunately, when + * unmapping a part of a vma, it takes the whole out first, + * then reinserts what's left after (might even reschedule if + * open() method called) - so swap entries may be invisible + * to swapoff for a while, then reappear - but that is rare. + */ + while ((i = find_next_to_unuse(si, i))) { + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. + */ + swap_map = &si->swap_map[i]; + entry = SWP_ENTRY(type, i); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. + */ + if (!*swap_map) + continue; + retval = -ENOMEM; + break; + } + + /* + * Don't hold on to start_mm if it looks like exiting. + */ + if (atomic_read(&start_mm->mm_users) == 1) { + mmput(start_mm); + start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } + + /* + * Wait for and lock page. When do_swap_page races with + * try_to_unuse, do_swap_page can handle the fault much + * faster than try_to_unuse can locate the entry. This + * apparently redundant "wait_on_page" lets try_to_unuse + * defer to do_swap_page in such a case - in some tests, + * do_swap_page and try_to_unuse repeatedly compete. + */ + wait_on_page(page); + lock_page(page); + + /* + * Remove all references to entry, without blocking. + * Whenever we reach init_mm, there's no address space + * to search, but use it as a reminder to search shmem. + */ + swcount = *swap_map; + if (swcount > 1) { + flush_page_to_ram(page); + if (start_mm == &init_mm) + shmem_unuse(entry, page); + else + unuse_process(start_mm, entry, page); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &start_mm->mmlist; + struct mm_struct *new_start_mm = start_mm; + struct mm_struct *mm; + + spin_lock(&mmlist_lock); + while (*swap_map > 1 && + (p = p->next) != &start_mm->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + swcount = *swap_map; + if (mm == &init_mm) { + set_start_mm = 1; + shmem_unuse(entry, page); + } else + unuse_process(mm, entry, page); + if (set_start_mm && *swap_map < swcount) { + new_start_mm = mm; + set_start_mm = 0; + } + } + atomic_inc(&new_start_mm->mm_users); + spin_unlock(&mmlist_lock); + mmput(start_mm); + start_mm = new_start_mm; + } + + /* + * How could swap count reach 0x7fff when the maximum + * pid is 0x7fff, and there's no way to repeat a swap + * page within an mm (except in shmem, where it's the + * shared object which takes the reference count)? + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. + * + * If that's wrong, then we should worry more about + * exit_mmap() and do_munmap() cases described above: + * we might be resetting SWAP_MAP_MAX too early here. + * We know "Undead"s can happen, they're okay, so don't + * report them; but do report if we reset SWAP_MAP_MAX. + */ + if (*swap_map == SWAP_MAP_MAX) { + swap_list_lock(); + swap_device_lock(si); + nr_swap_pages++; + *swap_map = 1; + swap_device_unlock(si); + swap_list_unlock(); + reset_overflow = 1; + } + + /* + * If a reference remains (rare), we would like to leave + * the page in the swap cache; but try_to_swap_out could + * then re-duplicate the entry once we drop page lock, + * so we might loop indefinitely; also, that page could + * not be swapped out to other storage meanwhile. So: + * delete from cache even if there's another reference, + * after ensuring that the data has been saved to disk - + * since if the reference remains (rarer), it will be + * read from disk into another page. Splitting into two + * pages would be incorrect if swap supported "shared + * private" pages, but they are handled by tmpfs files. + * Note shmem_unuse already deleted its from swap cache. + */ + swcount = *swap_map; + if ((swcount > 0) != PageSwapCache(page)) + BUG(); + if ((swcount > 1) && PageDirty(page)) { + rw_swap_page(WRITE, page); + lock_page(page); + } + if (PageSwapCache(page)) + delete_from_swap_cache(page); + + /* + * So we could skip searching mms once swap count went + * to 1, we did not mark any present ptes as dirty: must + * mark page dirty so try_to_swap_out will preserve it. + */ + SetPageDirty(page); + UnlockPage(page); + page_cache_release(page); + + /* + * Make sure that we aren't completely killing + * interactive performance. Interruptible check on + * signal_pending() would be nice, but changes the spec? + */ + if (current->need_resched) + schedule(); + } + + mmput(start_mm); + if (reset_overflow) { + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); + swap_overflow = 0; + } + return retval; +} + +asmlinkage long sys_swapoff(const char * specialfile) +{ + struct swap_info_struct * p = NULL; + unsigned short *swap_map; + struct nameidata nd; + int i, type, prev; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = user_path_walk(specialfile, &nd); + if (err) + goto out; + + lock_kernel(); + prev = -1; + swap_list_lock(); + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { + p = swap_info + type; + if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { + if (p->swap_file == nd.dentry) + break; + } + prev = type; + } + err = -EINVAL; + if (type < 0) { + swap_list_unlock(); + goto out_dput; + } + + if (prev < 0) { + swap_list.head = p->next; + } else { + swap_info[prev].next = p->next; + } + if (type == swap_list.next) { + /* just pick something that's safe... */ + swap_list.next = swap_list.head; + } + nr_swap_pages -= p->pages; + total_swap_pages -= p->pages; + p->flags = SWP_USED; + swap_list_unlock(); + unlock_kernel(); + err = try_to_unuse(type); + lock_kernel(); + if (err) { + /* re-insert swap space back into swap_list */ + swap_list_lock(); + for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio >= swap_info[i].prio) + break; + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p - swap_info; + else + swap_info[prev].next = p - swap_info; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + p->flags = SWP_WRITEOK; + swap_list_unlock(); + goto out_dput; + } + if (p->swap_device) + blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); + path_release(&nd); + + swap_list_lock(); + swap_device_lock(p); + nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; + p->swap_vfsmnt = NULL; + p->swap_file = NULL; + p->swap_device = 0; + p->max = 0; + swap_map = p->swap_map; + p->swap_map = NULL; + p->flags = 0; + swap_device_unlock(p); + swap_list_unlock(); + vfree(swap_map); + err = 0; + +out_dput: + unlock_kernel(); + path_release(&nd); +out: + return err; +} + +int get_swaparea_info(char *buf) +{ + char * page = (char *) __get_free_page(GFP_KERNEL); + struct swap_info_struct *ptr = swap_info; + int i, j, len = 0, usedswap; + + if (!page) + return -ENOMEM; + + len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n"); + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if ((ptr->flags & SWP_USED) && ptr->swap_map) { + char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt, + page, PAGE_SIZE); + + len += sprintf(buf + len, "%-31s ", path); + + if (!ptr->swap_device) + len += sprintf(buf + len, "file\t\t"); + else + len += sprintf(buf + len, "partition\t"); + + usedswap = 0; + for (j = 0; j < ptr->max; ++j) + switch (ptr->swap_map[j]) { + case SWAP_MAP_BAD: + case 0: + continue; + default: + usedswap++; + } + len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), + usedswap << (PAGE_SHIFT - 10), ptr->prio); + } + } + free_page((unsigned long) page); + return len; +} + +int is_swap_partition(kdev_t dev) { + struct swap_info_struct *ptr = swap_info; + int i; + + for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { + if (ptr->flags & SWP_USED) + if (ptr->swap_device == dev) + return 1; + } + return 0; +} + +/* + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. + * + * The swapon system call + */ +asmlinkage long sys_swapon(const char * specialfile, int swap_flags) +{ + struct swap_info_struct * p; + struct nameidata nd; + struct inode * swap_inode; + unsigned int type; + int i, j, prev; + int error; + static int least_priority = 0; + union swap_header *swap_header = 0; + int swap_header_version; + int nr_good_pages = 0; + unsigned long maxpages = 1; + int swapfilesize; + struct block_device *bdev = NULL; + unsigned short *swap_map; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + lock_kernel(); + swap_list_lock(); + p = swap_info; + for (type = 0 ; type < nr_swapfiles ; type++,p++) + if (!(p->flags & SWP_USED)) + break; + error = -EPERM; + if (type >= MAX_SWAPFILES) { + swap_list_unlock(); + goto out; + } + if (type >= nr_swapfiles) + nr_swapfiles = type+1; + p->flags = SWP_USED; + p->swap_file = NULL; + p->swap_vfsmnt = NULL; + p->swap_device = 0; + p->swap_map = NULL; + p->lowest_bit = 0; + p->highest_bit = 0; + p->cluster_nr = 0; + p->sdev_lock = SPIN_LOCK_UNLOCKED; + p->next = -1; + if (swap_flags & SWAP_FLAG_PREFER) { + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; + } else { + p->prio = --least_priority; + } + swap_list_unlock(); + error = user_path_walk(specialfile, &nd); + if (error) + goto bad_swap_2; + + p->swap_file = nd.dentry; + p->swap_vfsmnt = nd.mnt; + swap_inode = nd.dentry->d_inode; + error = -EINVAL; + + if (S_ISBLK(swap_inode->i_mode)) { + kdev_t dev = swap_inode->i_rdev; + struct block_device_operations *bdops; + + p->swap_device = dev; + set_blocksize(dev, PAGE_SIZE); + + bd_acquire(swap_inode); + bdev = swap_inode->i_bdev; + bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode)); + if (bdops) bdev->bd_op = bdops; + + error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); + if (error) + goto bad_swap_2; + set_blocksize(dev, PAGE_SIZE); + error = -ENODEV; + if (!dev || (blk_size[MAJOR(dev)] && + !blk_size[MAJOR(dev)][MINOR(dev)])) + goto bad_swap; + swapfilesize = 0; + if (blk_size[MAJOR(dev)]) + swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] + >> (PAGE_SHIFT - 10); + } else if (S_ISREG(swap_inode->i_mode)) + swapfilesize = swap_inode->i_size >> PAGE_SHIFT; + else + goto bad_swap; + + error = -EBUSY; + for (i = 0 ; i < nr_swapfiles ; i++) { + struct swap_info_struct *q = &swap_info[i]; + if (i == type || !q->swap_file) + continue; + if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) + goto bad_swap; + } + + swap_header = (void *) __get_free_page(GFP_USER); + if (!swap_header) { + printk("Unable to start swapping: out of memory :-)\n"); + error = -ENOMEM; + goto bad_swap; + } + + lock_page(virt_to_page(swap_header)); + rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header); + + if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) + swap_header_version = 1; + else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) + swap_header_version = 2; + else { + printk("Unable to find swap-space signature\n"); + error = -EINVAL; + goto bad_swap; + } + + switch (swap_header_version) { + case 1: + memset(((char *) swap_header)+PAGE_SIZE-10,0,10); + j = 0; + p->lowest_bit = 0; + p->highest_bit = 0; + for (i = 1 ; i < 8*PAGE_SIZE ; i++) { + if (test_bit(i,(char *) swap_header)) { + if (!p->lowest_bit) + p->lowest_bit = i; + p->highest_bit = i; + maxpages = i+1; + j++; + } + } + nr_good_pages = j; + p->swap_map = vmalloc(maxpages * sizeof(short)); + if (!p->swap_map) { + error = -ENOMEM; + goto bad_swap; + } + for (i = 1 ; i < maxpages ; i++) { + if (test_bit(i,(char *) swap_header)) + p->swap_map[i] = 0; + else + p->swap_map[i] = SWAP_MAP_BAD; + } + break; + + case 2: + /* Check the swap header's sub-version and the size of + the swap file and bad block lists */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); + error = -EINVAL; + goto bad_swap; + } + + p->lowest_bit = 1; + maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; + + error = -EINVAL; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; + + /* OK, set up the swap map and apply the bad block list */ + if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + error = -ENOMEM; + goto bad_swap; + } + + error = 0; + memset(p->swap_map, 0, maxpages * sizeof(short)); + for (i=0; i<swap_header->info.nr_badpages; i++) { + int page = swap_header->info.badpages[i]; + if (page <= 0 || page >= swap_header->info.last_page) + error = -EINVAL; + else + p->swap_map[page] = SWAP_MAP_BAD; + } + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; + if (error) + goto bad_swap; + } + + if (swapfilesize && maxpages > swapfilesize) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + error = -EINVAL; + goto bad_swap; + } + if (!nr_good_pages) { + printk(KERN_WARNING "Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + p->swap_map[0] = SWAP_MAP_BAD; + swap_list_lock(); + swap_device_lock(p); + p->max = maxpages; + p->flags = SWP_WRITEOK; + p->pages = nr_good_pages; + nr_swap_pages += nr_good_pages; + total_swap_pages += nr_good_pages; + printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", + nr_good_pages<<(PAGE_SHIFT-10), p->prio); + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { + if (p->prio >= swap_info[i].prio) { + break; + } + prev = i; + } + p->next = i; + if (prev < 0) { + swap_list.head = swap_list.next = p - swap_info; + } else { + swap_info[prev].next = p - swap_info; + } + swap_device_unlock(p); + swap_list_unlock(); + error = 0; + goto out; +bad_swap: + if (bdev) + blkdev_put(bdev, BDEV_SWAP); +bad_swap_2: + swap_list_lock(); + swap_map = p->swap_map; + nd.mnt = p->swap_vfsmnt; + nd.dentry = p->swap_file; + p->swap_device = 0; + p->swap_file = NULL; + p->swap_vfsmnt = NULL; + p->swap_map = NULL; + p->flags = 0; + if (!(swap_flags & SWAP_FLAG_PREFER)) + ++least_priority; + swap_list_unlock(); + if (swap_map) + vfree(swap_map); + path_release(&nd); +out: + if (swap_header) + free_page((long) swap_header); + unlock_kernel(); + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int i; + unsigned long nr_to_be_unused = 0; + + swap_list_lock(); + for (i = 0; i < nr_swapfiles; i++) { + unsigned int j; + if (swap_info[i].flags != SWP_USED) + continue; + for (j = 0; j < swap_info[i].max; ++j) { + switch (swap_info[i].swap_map[j]) { + case 0: + case SWAP_MAP_BAD: + continue; + default: + nr_to_be_unused++; + } + } + } + val->freeswap = nr_swap_pages + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + swap_list_unlock(); +} + +/* + * Verify that a swap entry is valid and increment its swap map count. + * + * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as + * "permanent", but will be reclaimed by the next swapoff. + */ +int swap_duplicate(swp_entry_t entry) +{ + struct swap_info_struct * p; + unsigned long offset, type; + int result = 0; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + + swap_device_lock(p); + if (offset < p->max && p->swap_map[offset]) { + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { + p->swap_map[offset]++; + result = 1; + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + if (swap_overflow++ < 5) + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = SWAP_MAP_MAX; + result = 1; + } + } + swap_device_unlock(p); +out: + return result; + +bad_file: + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + goto out; +} + +/* + * Page lock needs to be held in all cases to prevent races with + * swap file deletion. + */ +int swap_count(struct page *page) +{ + struct swap_info_struct * p; + unsigned long offset, type; + swp_entry_t entry; + int retval = 0; + + entry.val = page->index; + if (!entry.val) + goto bad_entry; + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) + goto bad_file; + p = type + swap_info; + offset = SWP_OFFSET(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_unused; + retval = p->swap_map[offset]; +out: + return retval; + +bad_entry: + printk(KERN_ERR "swap_count: null entry!\n"); + goto out; +bad_file: + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val); + goto out; +bad_offset: + printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_unused: + printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val); + goto out; +} + +/* + * Prior swap_duplicate protects against swap device deletion. + */ +void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, + kdev_t *dev, struct inode **swapf) +{ + unsigned long type; + struct swap_info_struct *p; + + type = SWP_TYPE(entry); + if (type >= nr_swapfiles) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); + return; + } + + p = &swap_info[type]; + *offset = SWP_OFFSET(entry); + if (*offset >= p->max && *offset != 0) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); + return; + } + if (p->swap_map && !p->swap_map[*offset]) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); + return; + } + if (!(p->flags & SWP_USED)) { + printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); + return; + } + + if (p->swap_device) { + *dev = p->swap_device; + } else if (p->swap_file) { + *swapf = p->swap_file->d_inode; + } else { + printk(KERN_ERR "rw_swap_page: no swap file or device\n"); + } + return; +} + +/* + * swap_device_lock prevents swap_map being freed. Don't grab an extra + * reference on the swaphandle, it doesn't matter if it becomes unused. + */ +int valid_swaphandles(swp_entry_t entry, unsigned long *offset) +{ + int ret = 0, i = 1 << page_cluster; + unsigned long toff; + struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; + + if (!page_cluster) /* no readahead */ + return 0; + toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; + if (!toff) /* first page is swap header */ + toff++, i--; + *offset = toff; + + swap_device_lock(swapdev); + do { + /* Don't read-ahead past the end of the swap area */ + if (toff >= swapdev->max) + break; + /* Don't read in free or bad pages */ + if (!swapdev->swap_map[toff]) + break; + if (swapdev->swap_map[toff] == SWAP_MAP_BAD) + break; + toff++; + ret++; + } while (--i); + swap_device_unlock(swapdev); + return ret; +} |