Diffstat (limited to 'linux-2.6-xen-sparse/arch/i386/mm')
-rw-r--r--  linux-2.6-xen-sparse/arch/i386/mm/Makefile       |  18 -
-rw-r--r--  linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c    | 769 -
-rw-r--r--  linux-2.6-xen-sparse/arch/i386/mm/highmem-xen.c  | 136 -
-rw-r--r--  linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c   | 432 -
-rw-r--r--  linux-2.6-xen-sparse/arch/i386/mm/init-xen.c     | 850 -
-rw-r--r--  linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c  | 443 -
-rw-r--r--  linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c  | 727 -
7 files changed, 0 insertions, 3375 deletions
diff --git a/linux-2.6-xen-sparse/arch/i386/mm/Makefile b/linux-2.6-xen-sparse/arch/i386/mm/Makefile deleted file mode 100644 index 2b33b20038..0000000000 --- a/linux-2.6-xen-sparse/arch/i386/mm/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# -# Makefile for the linux i386-specific parts of the memory manager. -# - -obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o - -obj-$(CONFIG_NUMA) += discontig.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_HIGHMEM) += highmem.o -obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o - -ifdef CONFIG_XEN -include $(srctree)/scripts/Makefile.xen - -obj-y += hypervisor.o - -obj-y := $(call cherrypickxen, $(obj-y)) -endif diff --git a/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c deleted file mode 100644 index 78639201bc..0000000000 --- a/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c +++ /dev/null @@ -1,769 +0,0 @@ -/* - * linux/arch/i386/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/desc.h> -#include <asm/kdebug.h> - -extern void die(const char *,struct pt_regs *,long); - -#ifdef CONFIG_KPROBES -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); -int register_page_fault_notifier(struct notifier_block *nb) -{ - vmalloc_sync_all(); - return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); -} - -int unregister_page_fault_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); -} - -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - struct die_args args = { - .regs = regs, - .str = str, - .err = err, - .trapnr = trap, - .signr = sig - }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); -} -#else -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - return NOTIFY_DONE; -} -#endif - - -/* - * Unlock any spinlocks which will prevent us from getting the - * message out - */ -void bust_spinlocks(int yes) -{ - int loglevel_save = console_loglevel; - - if (yes) { - oops_in_progress = 1; - return; - } -#ifdef CONFIG_VT - unblank_screen(); -#endif - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; -} - -/* - * Return EIP plus the CS segment base. The segment limit is also - * adjusted, clamped to the kernel/user address space (whichever is - * appropriate), and returned in *eip_limit. - * - * The segment is checked, because it might have been changed by another - * task between the original faulting instruction and here. 
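[Annotation] The notifier chain at the top of fault-xen.c is the hook other subsystems use to observe page faults. A minimal sketch of a client, assuming a kernel-module context; the callback and notifier_block names are hypothetical, while register_page_fault_notifier(), struct die_args and DIE_PAGE_FAULT come from this file and <asm/kdebug.h>:

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <asm/kdebug.h>

static int my_fault_event(struct notifier_block *self,
			  unsigned long val, void *data)
{
	struct die_args *args = data;

	if (val == DIE_PAGE_FAULT)	/* raised by notify_page_fault() above */
		printk(KERN_DEBUG "page fault, error_code=%lx\n",
		       (unsigned long)args->err);
	return NOTIFY_DONE;		/* let the normal fault path continue */
}

static struct notifier_block my_fault_nb = {
	.notifier_call = my_fault_event,
};

/* module init:  register_page_fault_notifier(&my_fault_nb);   */
/* module exit:  unregister_page_fault_notifier(&my_fault_nb); */

Note that register_page_fault_notifier() calls vmalloc_sync_all() first, so a callback can never itself trigger an unsynchronized vmalloc fault.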
- * - * If CS is no longer a valid code segment, or if EIP is beyond the - * limit, or if it is a kernel address when CS is not a kernel segment, - * then the returned value will be greater than *eip_limit. - * - * This is slow, but is very rarely executed. - */ -static inline unsigned long get_segment_eip(struct pt_regs *regs, - unsigned long *eip_limit) -{ - unsigned long eip = regs->eip; - unsigned seg = regs->xcs & 0xffff; - u32 seg_ar, seg_limit, base, *desc; - - /* Unlikely, but must come before segment checks. */ - if (unlikely(regs->eflags & VM_MASK)) { - base = seg << 4; - *eip_limit = base + 0xffff; - return base + (eip & 0xffff); - } - - /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg; - - /* By far the most common cases. */ - if (likely(seg == __USER_CS || seg == GET_KERNEL_CS())) - return eip; - - /* Check the segment exists, is within the current LDT/GDT size, - that kernel/user (ring 0..3) has the appropriate privilege, - that it's a code segment, and get the limit. */ - __asm__ ("larl %3,%0; lsll %3,%1" - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); - if ((~seg_ar & 0x9800) || eip > seg_limit) { - *eip_limit = 0; - return 1; /* So that returned eip > *eip_limit. */ - } - - /* Get the GDT/LDT descriptor base. - When you look for races in this code remember that - LDT and other horrors are only used in user space. */ - if (seg & (1<<2)) { - /* Must lock the LDT while reading it. */ - down(¤t->mm->context.sem); - desc = current->mm->context.ldt; - desc = (void *)desc + (seg & ~7); - } else { - /* Must disable preemption while reading the GDT. */ - desc = (u32 *)get_cpu_gdt_table(get_cpu()); - desc = (void *)desc + (seg & ~7); - } - - /* Decode the code segment base from the descriptor */ - base = get_desc_base((unsigned long *)desc); - - if (seg & (1<<2)) { - up(¤t->mm->context.sem); - } else - put_cpu(); - - /* Adjust EIP and segment limit, and clamp at the kernel limit. - It's legitimate for segments to wrap at 0xffffffff. */ - seg_limit += base; - if (seg_limit < *eip_limit && seg_limit >= base) - *eip_limit = seg_limit; - return eip + base; -} - -/* - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. - */ -static int __is_prefetch(struct pt_regs *regs, unsigned long addr) -{ - unsigned long limit; - unsigned long instr = get_segment_eip (regs, &limit); - int scan_more = 1; - int prefetch = 0; - int i; - - for (i = 0; scan_more && i < 15; i++) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (instr > limit) - break; - if (__get_user(opcode, (unsigned char __user *) instr)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. 
*/ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (instr > limit) - break; - if (__get_user(opcode, (unsigned char __user *) instr)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 6)) { - /* Catch an obscure case of prefetch inside an NX page. */ - if (nx_enabled && (error_code & 16)) - return 0; - return __is_prefetch(regs, addr); - } - return 0; -} - -static noinline void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) -{ - siginfo_t info; - - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; - force_sig_info(si_signo, &info, tsk); -} - -fastcall void do_invalid_op(struct pt_regs *, unsigned long); - -#ifdef CONFIG_X86_PAE -static void dump_fault_path(unsigned long address) -{ - unsigned long *p, page; - unsigned long mfn; - - page = read_cr3(); - p = (unsigned long *)__va(page); - p += (address >> 30) * 2; - printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]); - if (p[0] & 1) { - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); - page = mfn_to_pfn(mfn) << PAGE_SHIFT; - p = (unsigned long *)__va(page); - address &= 0x3fffffff; - p += (address >> 21) * 2; - printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", - page, p[1], p[0]); - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); -#ifdef CONFIG_HIGHPTE - if (mfn_to_pfn(mfn) >= highstart_pfn) - return; -#endif - if (p[0] & 1) { - page = mfn_to_pfn(mfn) << PAGE_SHIFT; - p = (unsigned long *) __va(page); - address &= 0x001fffff; - p += (address >> 12) * 2; - printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n", - page, p[1], p[0]); - } - } -} -#else -static void dump_fault_path(unsigned long address) -{ - unsigned long page; - - page = read_cr3(); - page = ((unsigned long *) __va(page))[address >> 22]; - if (oops_may_print()) - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, - machine_to_phys(page)); - /* - * We must not directly access the pte in the highpte - * case if the page table is located in highmem. - * And lets rather not kmap-atomic the pte, just in case - * it's allocated already. - */ -#ifdef CONFIG_HIGHPTE - if ((page >> PAGE_SHIFT) >= highstart_pfn) - return; -#endif - if ((page & 1) && oops_may_print()) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = machine_to_phys(page); - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, - machine_to_phys(page)); - } -} -#endif - -static int spurious_fault(struct pt_regs *regs, - unsigned long address, - unsigned long error_code) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Reserved-bit violation or user access to kernel space? 
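[Annotation] __is_prefetch() above is easier to follow outside the fault path. Below is a user-space re-implementation of the same prefix scan, as a hedged sketch: no segment handling and no __get_user(), the instruction bytes and length are supplied directly.

#include <stdio.h>

static int is_prefetch_insn(const unsigned char *p, int len)
{
	int i;

	for (i = 0; i < len && i < 15; i++) {
		unsigned char hi = p[i] & 0xf0, lo = p[i] & 0x0f;

		switch (hi) {
		case 0x20: case 0x30:	/* segment overrides 26/2E/36/3E */
			if ((lo & 7) != 6)
				return 0;
			break;
		case 0x60:		/* prefixes 64-67, valid in all modes */
			if ((lo & 0xC) != 0x4)
				return 0;
			break;
		case 0xF0:		/* LOCK/REPNE/REP: F0, F2, F3 */
			if (lo && (lo >> 1) != 1)
				return 0;
			break;
		case 0x00:		/* prefetch is 0F 0D or 0F 18 */
			return lo == 0xF && i + 1 < len &&
			       (p[i + 1] == 0x0D || p[i + 1] == 0x18);
		default:
			return 0;
		}
	}
	return 0;
}

int main(void)
{
	unsigned char prefetchnta[] = { 0x0F, 0x18, 0x06 }; /* prefetchnta (%esi) */
	unsigned char movl[]       = { 0x8B, 0x06 };        /* movl (%esi),%eax */

	printf("%d %d\n", is_prefetch_insn(prefetchnta, 3),
	       is_prefetch_insn(movl, 2));                  /* prints: 1 0 */
	return 0;
}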
*/ - if (error_code & 0x0c) - return 0; - - pgd = init_mm.pgd + pgd_index(address); - if (!pgd_present(*pgd)) - return 0; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return 0; - if ((error_code & 0x02) && !pte_write(*pte)) - return 0; -#ifdef CONFIG_X86_PAE - if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX)) - return 0; -#endif - - return 1; -} - -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) -#ifndef CONFIG_XEN - set_pmd(pmd, *pmd_k); -#else - /* - * When running on Xen we must launder *pmd_k through - * pmd_val() to ensure that _PAGE_PRESENT is correctly set. - */ - set_pmd(pmd, __pmd(pmd_val(*pmd_k))); -#endif - else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - return pmd_k; -} - -/* - * Handle a fault on the vmalloc or module mapping area - * - * This assumes no large pages in there. - */ -static inline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - return 0; -} - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - * - * error_code: - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch - */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - int write, si_code; - - /* get the address */ - address = read_cr2(); - - /* Set the "privileged fault" bit to something sane. */ - error_code &= ~4; - error_code |= (regs->xcs & 2) << 1; - if (regs->eflags & X86_EFLAGS_VM) - error_code |= 4; - - tsk = current; - - si_code = SEGV_MAPERR; - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. 
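[Annotation] The error_code bit layout documented above can be exercised directly. An illustrative user-space decoder (a hypothetical helper, not part of the original file):

#include <stdio.h>

static void decode_fault_error_code(unsigned long ec)
{
	printf("%s fault, %s access, %s mode%s%s\n",
	       (ec & 1) ? "protection" : "not-present",
	       (ec & 2) ? "write" : "read",
	       (ec & 4) ? "user" : "kernel",
	       (ec & 8) ? ", reserved bit set" : "",
	       (ec & 16) ? ", instruction fetch" : "");
}

int main(void)
{
	decode_fault_error_code(0x6);	/* user-mode write to a non-present page */
	return 0;
}

On Xen, bit 2 cannot be trusted as delivered, which is why do_page_fault() above recomputes it from the saved CS RPL before anything else.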
- */ - if (unlikely(address >= TASK_SIZE)) { -#ifdef CONFIG_XEN - /* Faults in hypervisor area can never be patched up. */ - if (address >= hypervisor_virt_start) - goto bad_area_nosemaphore; -#endif - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) - return; - /* Can take a spurious fault if mapping changes R/O -> R/W. */ - if (spurious_fault(regs, address, error_code)) - return; - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) - return; - - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) - local_irq_enable(); - - mm = tsk->mm; - - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault.. - */ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; - - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunatly, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibilty of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & 4) == 0 && - !search_exception_tables(regs->eip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* - * Accessing the stack below %esp is always a bug. - * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes - * 32 pointers and then decrements %esp by 65535.) - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & 3) { - default: /* 3: write, present */ -#ifdef TEST_VERIFY_AREA - if (regs->cs == GET_KERNEL_CS()) - printk("WP fault at %08lx\n", regs->eip); -#endif - /* fall through */ - case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case 1: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) - goto bad_area; - } - - survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. 
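[Annotation] The cushion in the %esp check above comes from the worst legitimate case: "enter $65535,$31" pushes %ebp plus 31 more frame pointers (32 words) and then subtracts 65535 from %esp. A throwaway program reproducing the constant:

#include <stdio.h>

int main(void)
{
	unsigned long cushion = 65536 + 32 * sizeof(unsigned long);

	/* 65664 bytes when sizeof(unsigned long) == 4, as on i386 */
	printf("largest tolerated fault depth below %%esp: %lu bytes\n",
	       cushion);
	return 0;
}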
- */ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: - BUG(); - } - - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - -#ifdef CONFIG_X86_F00F_BUG - /* - * Pentium F0 0F C7 C8 bug workaround. - */ - if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - do_invalid_op(regs, 0); - return; - } - } -#endif - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - */ - if (is_prefetch(regs, address, error_code)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - bust_spinlocks(1); - - if (oops_may_print()) { - #ifdef CONFIG_X86_PAE - if (error_code & 16) { - pte_t *pte = lookup_address(address); - - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); - } - #endif - if (address < PAGE_SIZE) - printk(KERN_ALERT "BUG: unable to handle kernel NULL " - "pointer dereference"); - else - printk(KERN_ALERT "BUG: unable to handle kernel paging" - " request"); - printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT " printing eip:\n"); - printk("%08lx\n", regs->eip); - } - dump_fault_path(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (tsk->pid == 1) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? 
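[Annotation] The vm86 screen_bitmap bookkeeping above records which 4K pages of the VGA window at 0xA0000 a DOS task has touched; 32 bits cover the full 128K window (0xA0000-0xBFFFF). A standalone illustration of the bit arithmetic:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long address = 0xA3000;
	unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;

	if (bit < 32)	/* 0xA3000 lands in bit 3 */
		printf("address %#lx -> screen_bitmap bit %lu\n", address, bit);
	return 0;
}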
Handle exceptions or die */ - if (!(error_code & 4)) - goto no_context; - - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); -} - -#if !HAVE_SHARED_KERNEL_PMD -void vmalloc_sync_all(void) -{ - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). - */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = TASK_SIZE; - unsigned long address; - - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = - (struct page *)page->index) - if (!vmalloc_sync_one(page_address(page), - address)) { - BUG_ON(page != pgd_list); - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - if (!page) - set_bit(pgd_index(address), insync); - } - if (address == start && test_bit(pgd_index(address), insync)) - start = address + PGDIR_SIZE; - } -} -#endif diff --git a/linux-2.6-xen-sparse/arch/i386/mm/highmem-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/highmem-xen.c deleted file mode 100644 index 20838cce53..0000000000 --- a/linux-2.6-xen-sparse/arch/i386/mm/highmem-xen.c +++ /dev/null @@ -1,136 +0,0 @@ -#include <linux/highmem.h> -#include <linux/module.h> - -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap is is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ -static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot) -{ - enum fixed_addresses idx; - unsigned long vaddr; - - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ - inc_preempt_count(); - if (!PageHighMem(page)) - return page_address(page); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); -#endif - set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); - - return (void*) vaddr; -} - -void *kmap_atomic(struct page *page, enum km_type type) -{ - return __kmap_atomic(page, type, kmap_prot); -} - -/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */ -void *kmap_atomic_pte(struct page *page, enum km_type type) -{ - return __kmap_atomic(page, type, - test_bit(PG_pinned, &page->flags) - ? 
PAGE_KERNEL_RO : kmap_prot); -} - -void kunmap_atomic(void *kvaddr, enum km_type type) -{ -#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN) - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - - if (vaddr < FIXADDR_START) { // FIXME - dec_preempt_count(); - preempt_check_resched(); - return; - } -#endif - -#if defined(CONFIG_DEBUG_HIGHMEM) - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - __flush_tlb_one(vaddr); -#elif defined(CONFIG_XEN) - /* - * We must ensure there are no dangling pagetable references when - * returning memory to Xen (decrease_reservation). - * XXX TODO: We could make this faster by only zapping when - * kmap_flush_unused is called but that is trickier and more invasive. - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); -#endif - - dec_preempt_count(); - preempt_check_resched(); -} - -/* This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) -{ - enum fixed_addresses idx; - unsigned long vaddr; - - inc_preempt_count(); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); - __flush_tlb_one(vaddr); - - return (void*) vaddr; -} - -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kmap_atomic_pte); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); diff --git a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c deleted file mode 100644 index 3222b9d5ae..0000000000 --- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c +++ /dev/null @@ -1,432 +0,0 @@ -/****************************************************************************** - * mm/hypervisor.c - * - * Update page tables via the hypervisor. - * - * Copyright (c) 2002-2004, K A Fraser - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/hypervisor.h> -#include <xen/balloon.h> -#include <xen/features.h> -#include <xen/interface/memory.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <asm/tlbflush.h> - -void xen_l1_entry_update(pte_t *ptr, pte_t val) -{ - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = __pte_val(val); - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} - -void xen_l2_entry_update(pmd_t *ptr, pmd_t val) -{ - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = __pmd_val(val); - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} - -#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -void xen_l3_entry_update(pud_t *ptr, pud_t val) -{ - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = __pud_val(val); - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} -#endif - -#ifdef CONFIG_X86_64 -void xen_l4_entry_update(pgd_t *ptr, pgd_t val) -{ - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = __pgd_val(val); - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} -#endif /* CONFIG_X86_64 */ - -void xen_pt_switch(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_NEW_BASEPTR; - op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_new_user_pt(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_NEW_USER_BASEPTR; - op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_tlb_flush(void) -{ - struct mmuext_op op; - op.cmd = MMUEXT_TLB_FLUSH_LOCAL; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} -EXPORT_SYMBOL(xen_tlb_flush); - -void xen_invlpg(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_INVLPG_LOCAL; - op.arg1.linear_addr = ptr & PAGE_MASK; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} -EXPORT_SYMBOL(xen_invlpg); - -#ifdef CONFIG_SMP - -void xen_tlb_flush_all(void) -{ - struct mmuext_op op; - op.cmd = MMUEXT_TLB_FLUSH_ALL; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_tlb_flush_mask(cpumask_t *mask) -{ - struct mmuext_op op; - if ( cpus_empty(*mask) ) - return; - op.cmd = MMUEXT_TLB_FLUSH_MULTI; - set_xen_guest_handle(op.arg2.vcpumask, mask->bits); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_invlpg_all(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_INVLPG_ALL; - op.arg1.linear_addr = ptr & PAGE_MASK; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr) -{ - struct mmuext_op op; - if ( cpus_empty(*mask) ) - return; - op.cmd = MMUEXT_INVLPG_MULTI; - op.arg1.linear_addr = ptr & PAGE_MASK; - set_xen_guest_handle(op.arg2.vcpumask, mask->bits); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -#endif /* CONFIG_SMP */ - -void xen_pgd_pin(unsigned long ptr) -{ - struct mmuext_op op; -#ifdef CONFIG_X86_64 - op.cmd = MMUEXT_PIN_L4_TABLE; -#elif defined(CONFIG_X86_PAE) - op.cmd = MMUEXT_PIN_L3_TABLE; -#else - op.cmd = MMUEXT_PIN_L2_TABLE; -#endif 
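[Annotation] The xen_l*_entry_update() helpers above issue one mmu_update_t per hypercall; callers that touch many PTEs usually batch. A hedged sketch of such a batch, using only interfaces visible in this file (the helper name and the fixed batch size are invented for illustration):

/* Batch several PTE writes into a single mmu_update hypercall. */
static void xen_set_ptes(pte_t *ptes[], pte_t vals[], unsigned int n)
{
	mmu_update_t u[16];
	unsigned int i;

	BUG_ON(n > 16);
	for (i = 0; i < n; i++) {
		u[i].ptr = virt_to_machine(ptes[i]); /* machine addr of the PTE */
		u[i].val = __pte_val(vals[i]);
	}
	BUG_ON(HYPERVISOR_mmu_update(u, n, NULL, DOMID_SELF) < 0);
}

One hypercall for n updates instead of n hypercalls is the whole point: crossing into the hypervisor dominates the cost of each individual update.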
- op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_pgd_unpin(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_set_ldt(unsigned long ptr, unsigned long len) -{ - struct mmuext_op op; - op.cmd = MMUEXT_SET_LDT; - op.arg1.linear_addr = ptr; - op.arg2.nr_ents = len; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -/* - * Bitmap is indexed by page number. If bit is set, the page is part of a - * xen_create_contiguous_region() area of memory. - */ -unsigned long *contiguous_bitmap; - -static void contiguous_bitmap_set( - unsigned long first_page, unsigned long nr_pages) -{ - unsigned long start_off, end_off, curr_idx, end_idx; - - curr_idx = first_page / BITS_PER_LONG; - start_off = first_page & (BITS_PER_LONG-1); - end_idx = (first_page + nr_pages) / BITS_PER_LONG; - end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); - - if (curr_idx == end_idx) { - contiguous_bitmap[curr_idx] |= - ((1UL<<end_off)-1) & -(1UL<<start_off); - } else { - contiguous_bitmap[curr_idx] |= -(1UL<<start_off); - while ( ++curr_idx < end_idx ) - contiguous_bitmap[curr_idx] = ~0UL; - contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1; - } -} - -static void contiguous_bitmap_clear( - unsigned long first_page, unsigned long nr_pages) -{ - unsigned long start_off, end_off, curr_idx, end_idx; - - curr_idx = first_page / BITS_PER_LONG; - start_off = first_page & (BITS_PER_LONG-1); - end_idx = (first_page + nr_pages) / BITS_PER_LONG; - end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); - - if (curr_idx == end_idx) { - contiguous_bitmap[curr_idx] &= - -(1UL<<end_off) | ((1UL<<start_off)-1); - } else { - contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1; - while ( ++curr_idx != end_idx ) - contiguous_bitmap[curr_idx] = 0; - contiguous_bitmap[curr_idx] &= -(1UL<<end_off); - } -} - -/* Protected by balloon_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; -static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER]; - -/* Ensure multi-page extents are contiguous in machine memory. */ -int xen_create_contiguous_region( - unsigned long vstart, unsigned int order, unsigned int address_bits) -{ - unsigned long *in_frames = discontig_frames, out_frame; - unsigned long frame, i, flags; - long rc; - int success; - struct xen_memory_exchange exchange = { - .in = { - .nr_extents = 1UL << order, - .extent_order = 0, - .domid = DOMID_SELF - }, - .out = { - .nr_extents = 1, - .extent_order = order, - .address_bits = address_bits, - .domid = DOMID_SELF - } - }; - - /* - * Currently an auto-translated guest will not perform I/O, nor will - * it require PAE page directories below 4GB. Therefore any calls to - * this function are redundant and can be ignored. - */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - - if (unlikely(order > MAX_CONTIG_ORDER)) - return -ENOMEM; - - set_xen_guest_handle(exchange.in.extent_start, in_frames); - set_xen_guest_handle(exchange.out.extent_start, &out_frame); - - scrub_pages(vstart, 1 << order); - - balloon_lock(flags); - - /* 1. Zap current PTEs, remembering MFNs. 
*/ - for (i = 0; i < (1UL<<order); i++) { - in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i); - MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), - __pte_ma(0), 0); - set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, - INVALID_P2M_ENTRY); - } - if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) - BUG(); - - /* 2. Get a new contiguous memory extent. */ - out_frame = __pa(vstart) >> PAGE_SHIFT; - rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); - success = (exchange.nr_exchanged == (1UL << order)); - BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); - BUG_ON(success && (rc != 0)); -#if CONFIG_XEN_COMPAT <= 0x030002 - if (unlikely(rc == -ENOSYS)) { - /* Compatibility when XENMEM_exchange is unsupported. */ - if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &exchange.in) != (1UL << order)) - BUG(); - success = (HYPERVISOR_memory_op(XENMEM_populate_physmap, - &exchange.out) == 1); - if (!success) { - /* Couldn't get special memory: fall back to normal. */ - for (i = 0; i < (1UL<<order); i++) - in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i; - if (HYPERVISOR_memory_op(XENMEM_populate_physmap, - &exchange.in) != (1UL<<order)) - BUG(); - } - } -#endif - - /* 3. Map the new extent in place of old pages. */ - for (i = 0; i < (1UL<<order); i++) { - frame = success ? (out_frame + i) : in_frames[i]; - MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), - pfn_pte_ma(frame, PAGE_KERNEL), 0); - set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame); - } - - cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order - ? UVMF_TLB_FLUSH|UVMF_ALL - : UVMF_INVLPG|UVMF_ALL; - if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) - BUG(); - - if (success) - contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT, - 1UL << order); - - balloon_unlock(flags); - - return success ? 0 : -ENOMEM; -} -EXPORT_SYMBOL_GPL(xen_create_contiguous_region); - -void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) -{ - unsigned long *out_frames = discontig_frames, in_frame; - unsigned long frame, i, flags; - long rc; - int success; - struct xen_memory_exchange exchange = { - .in = { - .nr_extents = 1, - .extent_order = order, - .domid = DOMID_SELF - }, - .out = { - .nr_extents = 1UL << order, - .extent_order = 0, - .domid = DOMID_SELF - } - }; - - if (xen_feature(XENFEAT_auto_translated_physmap) || - !test_bit(__pa(vstart) >> PAGE_SHIFT, contiguous_bitmap)) - return; - - if (unlikely(order > MAX_CONTIG_ORDER)) - return; - - set_xen_guest_handle(exchange.in.extent_start, &in_frame); - set_xen_guest_handle(exchange.out.extent_start, out_frames); - - scrub_pages(vstart, 1 << order); - - balloon_lock(flags); - - contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order); - - /* 1. Find start MFN of contiguous extent. */ - in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT); - - /* 2. Zap current PTEs. */ - for (i = 0; i < (1UL<<order); i++) { - MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), - __pte_ma(0), 0); - set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, - INVALID_P2M_ENTRY); - out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i; - } - if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) - BUG(); - - /* 3. Do the exchange for non-contiguous MFNs. 
*/ - rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); - success = (exchange.nr_exchanged == 1); - BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); - BUG_ON(success && (rc != 0)); -#if CONFIG_XEN_COMPAT <= 0x030002 - if (unlikely(rc == -ENOSYS)) { - /* Compatibility when XENMEM_exchange is unsupported. */ - if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &exchange.in) != 1) - BUG(); - if (HYPERVISOR_memory_op(XENMEM_populate_physmap, - &exchange.out) != (1UL << order)) - BUG(); - success = 1; - } -#endif - - /* 4. Map new pages in place of old pages. */ - for (i = 0; i < (1UL<<order); i++) { - frame = success ? out_frames[i] : (in_frame + i); - MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), - pfn_pte_ma(frame, PAGE_KERNEL), 0); - set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame); - } - - cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order - ? UVMF_TLB_FLUSH|UVMF_ALL - : UVMF_INVLPG|UVMF_ALL; - if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) - BUG(); - - balloon_unlock(flags); -} -EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); - -#ifdef __i386__ -int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) -{ - __u32 *lp = (__u32 *)((char *)ldt + entry * 8); - maddr_t mach_lp = arbitrary_virt_to_machine(lp); - return HYPERVISOR_update_descriptor( - mach_lp, (u64)entry_a | ((u64)entry_b<<32)); -} -#endif diff --git a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c deleted file mode 100644 index 9a04cb6b1e..0000000000 --- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c +++ /dev/null @@ -1,850 +0,0 @@ -/* - * linux/arch/i386/mm/init.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - */ - -#include <linux/module.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/hugetlb.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/init.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/poison.h> -#include <linux/bootmem.h> -#include <linux/slab.h> -#include <linux/proc_fs.h> -#include <linux/efi.h> -#include <linux/memory_hotplug.h> -#include <linux/initrd.h> -#include <linux/cpumask.h> -#include <linux/dma-mapping.h> -#include <linux/scatterlist.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/dma.h> -#include <asm/fixmap.h> -#include <asm/e820.h> -#include <asm/apic.h> -#include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/sections.h> -#include <asm/hypervisor.h> -#include <asm/swiotlb.h> - -extern unsigned long *contiguous_bitmap; - -unsigned int __VMALLOC_RESERVE = 128 << 20; - -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -unsigned long highstart_pfn, highend_pfn; - -static int noinline do_test_wp_bit(void); - -/* - * Creates a middle page table and puts a pointer to it in the - * given global directory entry. This only returns the gd entry - * in non-PAE compilation mode, since the middle layer is folded. 
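[Annotation] one_md_table_init()'s note about the folded middle layer reflects the two i386 layouts: non-PAE splits a virtual address 10+10+12 bits (two levels, so the pmd collapses into the pgd), while PAE splits it 2+9+9+12. This is also where the >>22, >>30 and >>21 shifts in dump_fault_path() earlier in this diff come from. A standalone illustration:

#include <stdio.h>

int main(void)
{
	unsigned long addr = 0xC0123456UL;

	/* non-PAE i386: two levels, 10 + 10 + 12 bits */
	printf("non-PAE: pgd index %lu, pte index %lu\n",
	       addr >> 22, (addr >> 12) & 0x3ff);

	/* PAE: three levels, 2 + 9 + 9 + 12 bits */
	printf("PAE: pgd index %lu, pmd index %lu, pte index %lu\n",
	       addr >> 30, (addr >> 21) & 0x1ff, (addr >> 12) & 0x1ff);
	return 0;
}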
- */ -static pmd_t * __init one_md_table_init(pgd_t *pgd) -{ - pud_t *pud; - pmd_t *pmd_table; - -#ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); -#else - pud = pud_offset(pgd, 0); - pmd_table = pmd_offset(pud, 0); -#endif - - return pmd_table; -} - -/* - * Create a page table and place a pointer to it in a middle page - * directory entry. - */ -static pte_t * __init one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - make_lowmem_page_readonly(page_table, - XENFEAT_writable_page_tables); - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; - } - - return pte_offset_kernel(pmd, 0); -} - -/* - * This function initializes a certain range of kernel virtual memory - * with new bootmem page tables, everywhere page tables are missing in - * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without - * checking the pgd every time. - */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - int pgd_idx, pmd_idx; - unsigned long vaddr; - - vaddr = start; - pgd_idx = pgd_index(vaddr); - pmd_idx = pmd_index(vaddr); - pgd = pgd_base + pgd_idx; - - for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (vaddr < hypervisor_virt_start && pmd_none(*pmd)) - one_page_table_init(pmd); - - vaddr += PMD_SIZE; - } - pmd_idx = 0; - } -} - -static inline int is_kernel_text(unsigned long addr) -{ - if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) - return 1; - return 0; -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. - */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; - - unsigned long max_ram_pfn = xen_start_info->nr_pages; - if (max_ram_pfn > max_low_pfn) - max_ram_pfn = max_low_pfn; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - pmd_idx = pmd_index(PAGE_OFFSET); - pte_ofs = pte_index(PAGE_OFFSET); - - for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { -#ifdef CONFIG_XEN - /* - * Native linux hasn't PAE-paging enabled yet at this - * point. When running as xen domain we are in PAE - * mode already, thus we can't simply hook a empty - * pmd. That would kill the mappings we are currently - * using ... - */ - pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET); -#else - pmd = one_md_table_init(pgd); -#endif - if (pfn >= max_low_pfn) - continue; - pmd += pmd_idx; - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; - if (address >= hypervisor_virt_start) - continue; - - /* Map with big pages if possible, otherwise create normal page tables. 
*/ - if (cpu_has_pse) { - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - - if (is_kernel_text(address) || is_kernel_text(address2)) - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); - else - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); - pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - pte += pte_ofs; - for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { - /* XEN: Only map initial RAM allocation. */ - if ((pfn >= max_ram_pfn) || pte_present(*pte)) - continue; - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } - pte_ofs = 0; - } - } - pmd_idx = 0; - } -} - -#ifndef CONFIG_XEN - -static inline int page_kills_ppro(unsigned long pagenr) -{ - if (pagenr >= 0x70000 && pagenr <= 0x7003F) - return 1; - return 0; -} - -#else - -#define page_kills_ppro(p) 0 - -#endif - -extern int is_available_memory(efi_memory_desc_t *); - -int page_is_ram(unsigned long pagenr) -{ - int i; - unsigned long addr, end; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; - } - - for (i = 0; i < e820.nr_map; i++) { - - if (e820.map[i].type != E820_RAM) /* not usable memory */ - continue; - /* - * !!!FIXME!!! Some BIOSen report areas as RAM that - * are not. Notably the 640->1Mb area. We need a sanity - * check here. - */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -pgprot_t kmap_prot; - -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) - -static void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} - -static void __init permanent_kmaps_init(pgd_t *pgd_base) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; -} - -static void __meminit free_new_highpage(struct page *page, int pfn) -{ - init_page_count(page); - if (pfn < xen_start_info->nr_pages) - __free_page(page); - totalhigh_pages++; -} - -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) -{ - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { - ClearPageReserved(page); - free_new_highpage(page, pfn); - } else - SetPageReserved(page); -} - -static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) -{ - free_new_highpage(page, pfn); - totalram_pages++; -#ifdef CONFIG_FLATMEM - max_mapnr = max(pfn, max_mapnr); -#endif - num_physpages++; - return 0; -} - -/* - * Not currently handling the NUMA case. 
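[Annotation] page_is_ram() above only counts pages lying wholly inside a usable region, hence the asymmetric rounding of the region bounds. The same arithmetic as a user-space sketch (the region values below are made up):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static int pfn_in_region(unsigned long pfn,
			 unsigned long long base, unsigned long long size)
{
	unsigned long start = (base + PAGE_SIZE - 1) >> PAGE_SHIFT; /* round up */
	unsigned long end = (base + size) >> PAGE_SHIFT;            /* round down */

	return pfn >= start && pfn < end;
}

int main(void)
{
	/* 1MB..8MB region: pfns 0x100-0x7ff count as RAM */
	printf("%d\n", pfn_in_region(0x100, 0x100000ULL, 0x700000ULL)); /* 1 */
	/* a region smaller than one page contains no whole page */
	printf("%d\n", pfn_in_region(0x9f, 0x9fc00ULL, 0x400ULL));      /* 0 */
	return 0;
}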
- * Assuming single node and all memory that - * has been added dynamically that would be - * onlined here is in HIGHMEM - */ -void online_page(struct page *page) -{ - ClearPageReserved(page); - add_one_highpage_hotplug(page, page_to_pfn(page)); -} - - -#ifdef CONFIG_NUMA -extern void set_highmem_pages_init(int); -#else -static void __init set_highmem_pages_init(int bad_ppro) -{ - int pfn; - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - totalram_pages += totalhigh_pages; -} -#endif /* CONFIG_FLATMEM */ - -#else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) -#endif /* CONFIG_HIGHMEM */ - -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; -EXPORT_SYMBOL(__PAGE_KERNEL); -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; - -#ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); -#else -#define remap_numa_kva() do {} while (0) -#endif - -pgd_t *swapper_pg_dir; - -static void __init pagetable_init (void) -{ - unsigned long vaddr; - pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; - - swapper_pg_dir = pgd_base; - init_mm.pgd = pgd_base; - - /* Enable PSE if available */ - if (cpu_has_pse) { - set_in_cr4(X86_CR4_PSE); - } - - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; - } - - kernel_physical_mapping_init(pgd_base); - remap_numa_kva(); - - /* - * Fixed mappings, only the page table structure has to be - * created - mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, hypervisor_virt_start, pgd_base); - - permanent_kmaps_init(pgd_base); -} - -#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) -/* - * Swap suspend & friends need this for resume because things like the intel-agp - * driver might have split up a kernel 4MB mapping. - */ -char __nosavedata swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); - -static inline void save_pg_dir(void) -{ - memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); -} -#else -static inline void save_pg_dir(void) -{ -} -#endif - -void zap_low_mappings (void) -{ - int i; - - save_pg_dir(); - - /* - * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. - */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - flush_tlb_all(); -} - -static int disable_nx __initdata = 0; -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; -EXPORT_SYMBOL(__supported_pte_mask); - -/* - * noexec = on|off - * - * Control non executable mappings. 
- * - * on Enable - * off Disable - */ -void __init noexec_setup(const char *str) -{ - if (!strncmp(str, "on",2) && cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str,"off",3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } -} - -int nx_enabled = 0; -#ifdef CONFIG_X86_PAE - -static void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} - -/* - * Enables/disables executability of a given kernel page and - * returns the previous setting. - */ -int __init set_kernel_exec(unsigned long vaddr, int enable) -{ - pte_t *pte; - int ret = 1; - - if (!nx_enabled) - goto out; - - pte = lookup_address(vaddr); - BUG_ON(!pte); - - if (!pte_exec_kernel(*pte)) - ret = 0; - - if (enable) - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); - else - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); - __flush_tlb_all(); -out: - return ret; -} - -#endif - -/* - * paging_init() sets up the page tables - note that the first 8MB are - * already mapped by head.S. - * - * This routines also unmaps the page at virtual kernel address 0, so - * that we can trap those pesky NULL-reference errors in the kernel. - */ -void __init paging_init(void) -{ - int i; - -#ifdef CONFIG_X86_PAE - set_nx(); - if (nx_enabled) - printk("NX (Execute Disable) protection: active\n"); -#endif - - pagetable_init(); - -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) - /* - * We will bail out later - printk doesn't work right now so - * the user would just see a hanging kernel. - * when running as xen domain we are already in PAE mode at - * this point. - */ - if (cpu_has_pae) - set_in_cr4(X86_CR4_PAE); -#endif - __flush_tlb_all(); - - kmap_init(); - - /* Switch to the real shared_info page, and clear the - * dummy page. */ - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - memset(empty_zero_page, 0, sizeof(empty_zero_page)); - - /* Setup mapping of lower 1st MB */ - for (i = 0; i < NR_FIX_ISAMAPS; i++) - if (is_initial_xendomain()) - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); - else - __set_fixmap(FIX_ISAMAP_BEGIN - i, - virt_to_machine(empty_zero_page), - PAGE_KERNEL_RO); -} - -/* - * Test if the WP bit works in supervisor mode. It isn't supported on 386's - * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This - * used to involve black magic jumps to work around some nasty CPU bugs, - * but fortunately the switch to using exceptions got rid of all that. - */ - -static void __init test_wp_bit(void) -{ - printk("Checking if this processor honours the WP bit even in supervisor mode... "); - - /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); - boot_cpu_data.wp_works_ok = do_test_wp_bit(); - clear_fixmap(FIX_WP_TEST); - - if (!boot_cpu_data.wp_works_ok) { - printk("No.\n"); -#ifdef CONFIG_X86_WP_WORKS_OK - panic("This kernel doesn't support CPU's with broken WP. 
Recompile it for a 386!"); -#endif - } else { - printk("Ok.\n"); - } -} - -static void __init set_max_mapnr_init(void) -{ -#ifdef CONFIG_HIGHMEM - num_physpages = highend_pfn; -#else - num_physpages = max_low_pfn; -#endif -#ifdef CONFIG_FLATMEM - max_mapnr = num_physpages; -#endif -} - -static struct kcore_list kcore_mem, kcore_vmalloc; - -void __init mem_init(void) -{ - extern int ppro_with_ram_bug(void); - int codesize, reservedpages, datasize, initsize; - int tmp; - int bad_ppro; - unsigned long pfn; - - contiguous_bitmap = alloc_bootmem_low_pages( - (max_low_pfn + 2*BITS_PER_LONG) >> 3); - BUG_ON(!contiguous_bitmap); - memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3); - -#if defined(CONFIG_SWIOTLB) - swiotlb_init(); -#endif - -#ifdef CONFIG_FLATMEM - if (!mem_map) - BUG(); -#endif - - bad_ppro = ppro_with_ram_bug(); - -#ifdef CONFIG_HIGHMEM - /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); - BUG(); - } -#endif - - set_max_mapnr_init(); - -#ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif - printk("vmalloc area: %lx-%lx, maxmem %lx\n", - VMALLOC_START,VMALLOC_END,MAXMEM); - BUG_ON(VMALLOC_START > VMALLOC_END); - - /* this will put all low memory onto the freelists */ - totalram_pages += free_all_bootmem(); - /* XEN: init and count low-mem pages outside initial allocation. */ - for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) { - ClearPageReserved(pfn_to_page(pfn)); - init_page_count(pfn_to_page(pfn)); - totalram_pages++; - } - - reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) - /* - * Only count reserved RAM pages - */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) - reservedpages++; - - set_highmem_pages_init(bad_ppro); - - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) - ); - -#ifdef CONFIG_X86_PAE - if (!cpu_has_pae) - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); -#endif - if (boot_cpu_data.wp_works_ok < 0) - test_wp_bit(); - - /* - * Subtle. SMP is doing it's boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. - */ -#ifndef CONFIG_SMP - zap_low_mappings(); -#endif - - set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags); -} - -/* - * this is for the non-NUMA, single node SMP system case. - * Specifically, in the case of x86, we will always add - * memory to the highmem for now. 
- */ -#ifdef CONFIG_MEMORY_HOTPLUG -#ifndef CONFIG_NEED_MULTIPLE_NODES -int arch_add_memory(int nid, u64 start, u64 size) -{ - struct pglist_data *pgdata = &contig_page_data; - struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - return __add_pages(zone, start_pfn, nr_pages); -} - -int remove_memory(u64 start, u64 size) -{ - return -EINVAL; -} -#endif -#endif - -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; - -void __init pgtable_cache_init(void) -{ - if (PTRS_PER_PMD > 1) { - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - 0, - pmd_ctor, - NULL); - if (!pmd_cache) - panic("pgtable_cache_init(): cannot create pmd cache"); - } - pgd_cache = kmem_cache_create("pgd", -#ifndef CONFIG_XEN - PTRS_PER_PGD*sizeof(pgd_t), - PTRS_PER_PGD*sizeof(pgd_t), -#else - PAGE_SIZE, - PAGE_SIZE, -#endif - 0, - pgd_ctor, - PTRS_PER_PMD == 1 ? pgd_dtor : NULL); - if (!pgd_cache) - panic("pgtable_cache_init(): Cannot create pgd cache"); -} - -/* - * This function cannot be __init, since exceptions don't work in that - * section. Put this after the callers, so that it cannot be inlined. - */ -static int noinline do_test_wp_bit(void) -{ - char tmp_reg; - int flag; - - __asm__ __volatile__( - " movb %0,%1 \n" - "1: movb %1,%0 \n" - " xorl %2,%2 \n" - "2: \n" - ".section __ex_table,\"a\"\n" - " .align 4 \n" - " .long 1b,2b \n" - ".previous \n" - :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), - "=q" (tmp_reg), - "=r" (flag) - :"2" (1) - :"memory"); - - return flag; -} - -#ifdef CONFIG_DEBUG_RODATA - -void mark_rodata_ro(void) -{ - unsigned long addr = (unsigned long)__start_rodata; - - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); - - printk("Write protecting the kernel read-only data: %uk\n", - (__end_rodata - __start_rodata) >> 10); - - /* - * change_page_attr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); -} -#endif - -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - diff --git a/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c deleted file mode 100644 index 6d6edd203a..0000000000 --- a/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c +++ /dev/null @@ -1,443 +0,0 @@ -/* - * arch/i386/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. 
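
A typical driver-side use of the remapping API below, with a hypothetical device address and register offset (ioremap() in this era is a thin wrapper that calls __ioremap() with flags 0):

	void __iomem *regs = ioremap(0xfebf0000UL, 0x1000);
	if (regs) {
		u32 status = readl(regs + 0x10);	/* MMIO read via the mapping */
		(void)status;
		iounmap(regs);	/* exactly one iounmap per ioremap */
	}
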
- * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> - -#define ISA_START_ADDRESS 0x0 -#define ISA_END_ADDRESS 0x100000 - -static int direct_remap_area_pte_fn(pte_t *pte, - struct page *pmd_page, - unsigned long address, - void *data) -{ - mmu_update_t **v = (mmu_update_t **)data; - - BUG_ON(!pte_none(*pte)); - - (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); - (*v)++; - - return 0; -} - -static int __direct_remap_pfn_range(struct mm_struct *mm, - unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - int rc; - unsigned long i, start_address; - mmu_update_t *u, *v, *w; - - u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - if (u == NULL) - return -ENOMEM; - - start_address = address; - - flush_cache_all(); - - for (i = 0; i < size; i += PAGE_SIZE) { - if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) { - /* Flush a full batch after filling in the PTE ptrs. */ - rc = apply_to_page_range(mm, start_address, - address - start_address, - direct_remap_area_pte_fn, &w); - if (rc) - goto out; - rc = -EFAULT; - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) - goto out; - v = w = u; - start_address = address; - } - - /* - * Fill in the machine address: PTE ptr is done later by - * __direct_remap_area_pages(). - */ - v->val = __pte_val(pfn_pte_ma(mfn, prot)); - - mfn++; - address += PAGE_SIZE; - v++; - } - - if (v != u) { - /* Final batch. 
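
The flush below drains whatever the loop left over. Each batch is capped at one page of mmu_update_t entries: assuming the usual two-u64, 16-byte mmu_update_t layout (an assumption of this note, not spelled out here), a 4 KiB page holds 4096 / 16 = 256 updates per HYPERVISOR_mmu_update hypercall.
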
*/ - rc = apply_to_page_range(mm, start_address, - address - start_address, - direct_remap_area_pte_fn, &w); - if (rc) - goto out; - rc = -EFAULT; - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) - goto out; - } - - rc = 0; - - out: - flush_tlb_all(); - - free_page((unsigned long)u); - - return rc; -} - -int direct_remap_pfn_range(struct vm_area_struct *vma, - unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return remap_pfn_range(vma, address, mfn, size, prot); - - if (domid == DOMID_SELF) - return -EINVAL; - - vma->vm_flags |= VM_IO | VM_RESERVED; - - vma->vm_mm->context.has_foreign_mappings = 1; - - return __direct_remap_pfn_range( - vma->vm_mm, address, mfn, size, prot, domid); -} -EXPORT_SYMBOL(direct_remap_pfn_range); - -int direct_kernel_remap_pfn_range(unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - return __direct_remap_pfn_range( - &init_mm, address, mfn, size, prot, domid); -} -EXPORT_SYMBOL(direct_kernel_remap_pfn_range); - -static int lookup_pte_fn( - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) -{ - uint64_t *ptep = (uint64_t *)data; - if (ptep) - *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); - return 0; -} - -int create_lookup_pte_addr(struct mm_struct *mm, - unsigned long address, - uint64_t *ptep) -{ - return apply_to_page_range(mm, address, PAGE_SIZE, - lookup_pte_fn, ptep); -} - -EXPORT_SYMBOL(create_lookup_pte_addr); - -static int noop_fn( - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) -{ - return 0; -} - -int touch_pte_range(struct mm_struct *mm, - unsigned long address, - unsigned long size) -{ - return apply_to_page_range(mm, address, size, noop_fn, NULL); -} - -EXPORT_SYMBOL(touch_pte_range); - -/* - * Does @address reside within a non-highmem page that is local to this virtual - * machine (i.e., not an I/O page, nor a memory page belonging to another VM). - * See the comment that accompanies mfn_to_local_pfn() in page.h to understand - * why this works. - */ -static inline int is_local_lowmem(unsigned long address) -{ - extern unsigned long max_low_pfn; - return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn); -} - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void __iomem * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - domid_t domid = DOMID_IO; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_initial_xendomain() && - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (void __iomem *) isa_bus_to_virt(phys_addr); - - /* - * Don't allow anybody to remap normal RAM that we're using.. 
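
The check below permits lowmem targets only when every page in the range is PageReserved, i.e. not ordinary allocatable RAM, and in that case switches the mapping owner from DOMID_IO to DOMID_SELF, since reserved local lowmem belongs to this domain rather than to machine I/O space. The alignment fixup that follows is easiest to see with hypothetical numbers: for phys_addr 0xfebf0004 and size 0x10, offset becomes 4, phys_addr rounds down to 0xfebf0000, size rounds up to one 4 KiB page, and the caller gets back mapping base + 4.
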
- */ - if (is_local_lowmem(phys_addr)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = bus_to_virt(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - - domid = DOMID_SELF; - } - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - flags |= _KERNPG_TABLE; - if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, - phys_addr>>PAGE_SHIFT, - size, __pgprot(flags), domid)) { - vunmap((void __force *) addr); - return NULL; - } - return (void __iomem *) (offset + (char __iomem *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr; - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); - if (!p) - return p; - - /* Guaranteed to be > phys_addr, as per __ioremap() */ - last_addr = phys_addr + size - 1; - - if (is_local_lowmem(last_addr)) { - struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); - unsigned long npages; - - phys_addr &= PAGE_MASK; - - /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); - - /* .. but that's ok, because modulo-2**n arithmetic will make - * the page-aligned "last - first" come out right. - */ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; - - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { - iounmap(p); - p = NULL; - } - global_flush_tlb(); - } - - return p; -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if ((void __force *)addr <= high_memory) - return; - - /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. - */ - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. 
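
Concretely: a belated second iounmap() of an already-freed cookie falls off the vmlist walk below and merely logs "iounmap: bad address", while two concurrent iounmap() calls on the same cookie can both find the area, and whichever loses the remove_vm_area() race trips the BUG_ON at the end; hence the one-unmapping rule stated above.
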
- cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { - /* p->size includes the guard page, but cpa doesn't like that */ - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), - (p->size - PAGE_SIZE) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); - } - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_initial_xendomain() && - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return isa_bus_to_virt(phys_addr); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) - return; - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - clear_fixmap(idx); - --idx; - --nrpages; - } -} diff --git a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c deleted file mode 100644 index df20e591c1..0000000000 --- a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c +++ /dev/null @@ -1,727 +0,0 @@ -/* - * linux/arch/i386/mm/pgtable.c - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/highmem.h> -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/spinlock.h> -#include <linux/module.h> - -#include <asm/system.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/fixmap.h> -#include <asm/e820.h> -#include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/io.h> -#include <asm/mmu_context.h> - -#include <xen/features.h> -#include <asm/hypervisor.h> - -static void pgd_test_and_unpin(pgd_t *pgd); - -void show_mem(void) -{ - int total = 0, reserved = 0; - int shared = 0, cached = 0; - int highmem = 0; - struct page *page; - pg_data_t *pgdat; - unsigned long i; - unsigned long flags; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_online_pgdat(pgdat) { - pgdat_resize_lock(pgdat, &flags); - for (i 
= 0; i < pgdat->node_spanned_pages; ++i) { - page = pgdat_page_nr(pgdat, i); - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - pgdat_resize_unlock(pgdat, &flags); - } - printk(KERN_INFO "%d pages of RAM\n", total); - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); - printk(KERN_INFO "%d reserved pages\n", reserved); - printk(KERN_INFO "%d pages shared\n", shared); - printk(KERN_INFO "%d pages swap cached\n", cached); - - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); - printk(KERN_INFO "%lu pages writeback\n", - global_page_state(NR_WRITEBACK)); - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); - printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB)); - printk(KERN_INFO "%lu pages pagetables\n", - global_page_state(NR_PAGETABLE)); -} - -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. - */ -static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - if (pgprot_val(flags)) - /* <pfn,flags> stored as-is, to permit clearing entries */ - set_pte(pte, pfn_pte(pfn, flags)); - else - pte_clear(&init_mm, vaddr, pte); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. - */ -static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn, - pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - if (pgprot_val(flags)) - /* <pfn,flags> stored as-is, to permit clearing entries */ - set_pte(pte, pfn_pte_ma(pfn, flags)); - else - pte_clear(&init_mm, vaddr, pte); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -/* - * Associate a large virtual page frame with a given physical page frame - * and protection flags for that frame. pfn is for the base of the page, - * vaddr is what the page gets mapped to - both must be properly aligned. - * The pmd must already be instantiated. Assumes PAE mode. 
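
Under PAE with 4 KiB pages, PMD_SIZE is 2 MiB and PTRS_PER_PTE is 512, so the two checks below both amount to 2 MiB alignment (in bytes for vaddr, in pages for pfn). A hypothetical call mapping the 2 MiB of RAM at physical 16 MiB; PAGE_KERNEL_LARGE is assumed from this era's i386 headers:

	set_pmd_pfn(0xffa00000UL,		/* 2 MiB-aligned vaddr */
		    0x1000000UL >> PAGE_SHIFT,	/* pfn 4096, a multiple of 512 */
		    PAGE_KERNEL_LARGE);
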
- */ -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ - printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); - return; /* BUG(); */ - } - if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ - printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); - return; /* BUG(); */ - } - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); - return; /* BUG(); */ - } - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - set_pmd(pmd, pfn_pmd(pfn, flags)); - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -static int nr_fixmaps = 0; -unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START; -unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE); -EXPORT_SYMBOL(__FIXADDR_TOP); - -void __init set_fixaddr_top(unsigned long top) -{ - BUG_ON(nr_fixmaps > 0); - hypervisor_virt_start = top; - __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE; -} - -void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - BUG(); - return; - } - switch (idx) { - case FIX_WP_TEST: -#ifdef CONFIG_X86_F00F_BUG - case FIX_F00F_IDT: -#endif - case FIX_VDSO: - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); - break; - default: - set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags); - break; - } - nr_fixmaps++; -} - -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - if (pte) - make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables); - return pte; -} - -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -#endif - if (pte) { - SetPageForeign(pte, pte_free); - init_page_count(pte); - } - return pte; -} - -void pte_free(struct page *pte) -{ - unsigned long pfn = page_to_pfn(pte); - - if (!PageHighMem(pte)) { - unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT); - - if (!pte_write(*virt_to_ptep(va))) - if (HYPERVISOR_update_va_mapping( - va, pfn_pte(pfn, PAGE_KERNEL), 0)) - BUG(); - } else - clear_bit(PG_pinned, &pte->flags); - - ClearPageForeign(pte); - init_page_count(pte); - - __free_page(pte); -} - -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) -{ - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * The locking scheme was chosen on the basis of manfred's - * recommendations and having no core impact whatsoever. 
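
The list is threaded through struct page itself: page->index holds the next element, and page_private() points back at the previous element's next field, so pgd_list_del() unlinks in O(1) with no search. Traversal is simply the following (the same walk mm_pin_all() uses further down):

	struct page *page;
	for (page = pgd_list; page; page = (struct page *)page->index)
		/* page_address(page) is a pgd */;
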
- * -- wli - */ -DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; - -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - page->index = (unsigned long)pgd_list; - if (pgd_list) - set_page_private(pgd_list, (unsigned long)&page->index); - pgd_list = page; - set_page_private(page, (unsigned long)&pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *next, **pprev, *page = virt_to_page(pgd); - next = (struct page *)page->index; - pprev = (struct page **)page_private(page); - *pprev = next; - if (next) - set_page_private(next, (unsigned long)pprev); -} - -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) -{ - unsigned long flags; - - if (PTRS_PER_PMD > 1) { - if (HAVE_SHARED_KERNEL_PMD) - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - } else { - spin_lock_irqsave(&pgd_lock, flags); - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - } -} - -/* never called when PTRS_PER_PMD > 1 */ -void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) -{ - unsigned long flags; /* can be called from interrupt context */ - - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - - pgd_test_and_unpin(pgd); -} - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - int i; - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - pmd_t **pmd; - unsigned long flags; - - pgd_test_and_unpin(pgd); - - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; - - if (HAVE_SHARED_KERNEL_PMD) { - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd) - goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); - } - return pgd; - } - - /* - * We can race save/restore (if we sleep during a GFP_KERNEL memory - * allocation). We therefore store virtual addresses of pmds as they - * do not change across save/restore, and poke the machine addresses - * into the pgdir under the pgd_lock. - */ - pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); - if (!pmd) { - kmem_cache_free(pgd_cache, pgd); - return NULL; - } - - /* Allocate pmds, remember virtual addresses. */ - for (i = 0; i < PTRS_PER_PGD; ++i) { - pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd[i]) - goto out_oom; - } - - spin_lock_irqsave(&pgd_lock, flags); - - /* Protect against save/restore: move below 4GB under pgd_lock. */ - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) { - int rc = xen_create_contiguous_region( - (unsigned long)pgd, 0, 32); - if (rc) { - spin_unlock_irqrestore(&pgd_lock, flags); - goto out_oom; - } - } - - /* Copy kernel pmd contents and write-protect the new pmds. */ - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { - unsigned long v = (unsigned long)i << PGDIR_SHIFT; - pgd_t *kpgd = pgd_offset_k(v); - pud_t *kpud = pud_offset(kpgd, v); - pmd_t *kpmd = pmd_offset(kpud, v); - memcpy(pmd[i], kpmd, PAGE_SIZE); - make_lowmem_page_readonly( - pmd[i], XENFEAT_writable_page_tables); - } - - /* It is safe to poke machine addresses of pmds under the pmd_lock. */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i]))); - - /* Ensure this pgd gets picked up and pinned on save/restore. 
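
Doing pgd_list_add() before dropping pgd_lock closes the race against a concurrent mm_pin_all() from the save/restore path: once the lock is released, any suspend will find this pgd on the list and write-protect and pin it like every other one, rather than missing a pgd whose machine addresses were already poked in above.
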
*/ - pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); - - kfree(pmd); - - return pgd; - -out_oom: - if (HAVE_SHARED_KERNEL_PMD) { - for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, - (void *)__va(pgd_val(pgd[i])-1)); - } else { - for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, pmd[i]); - kfree(pmd); - } - kmem_cache_free(pgd_cache, pgd); - return NULL; -} - -void pgd_free(pgd_t *pgd) -{ - int i; - - /* - * After this the pgd should not be pinned for the duration of this - * function's execution. We should never sleep and thus never race: - * 1. User pmds will not become write-protected under our feet due - * to a concurrent mm_pin_all(). - * 2. The machine addresses in PGD entries will not become invalid - * due to a concurrent save/restore. - */ - pgd_test_and_unpin(pgd); - - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) { - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); - kmem_cache_free(pmd_cache, pmd); - } - - if (!HAVE_SHARED_KERNEL_PMD) { - unsigned long flags; - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { - pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); - make_lowmem_page_writable( - pmd, XENFEAT_writable_page_tables); - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); - kmem_cache_free(pmd_cache, pmd); - } - - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) - xen_destroy_contiguous_region( - (unsigned long)pgd, 0); - } - } - - /* in the non-PAE case, free_pgtables() clears user pgd entries */ - kmem_cache_free(pgd_cache, pgd); -} - -void make_lowmem_page_readonly(void *va, unsigned int feature) -{ - pte_t *pte; - int rc; - - if (xen_feature(feature)) - return; - - pte = virt_to_ptep(va); - rc = HYPERVISOR_update_va_mapping( - (unsigned long)va, pte_wrprotect(*pte), 0); - BUG_ON(rc); -} - -void make_lowmem_page_writable(void *va, unsigned int feature) -{ - pte_t *pte; - int rc; - - if (xen_feature(feature)) - return; - - pte = virt_to_ptep(va); - rc = HYPERVISOR_update_va_mapping( - (unsigned long)va, pte_mkwrite(*pte), 0); - BUG_ON(rc); -} - -void make_page_readonly(void *va, unsigned int feature) -{ - pte_t *pte; - int rc; - - if (xen_feature(feature)) - return; - - pte = virt_to_ptep(va); - rc = HYPERVISOR_update_va_mapping( - (unsigned long)va, pte_wrprotect(*pte), 0); - if (rc) /* fallback? */ - xen_l1_entry_update(pte, pte_wrprotect(*pte)); - if ((unsigned long)va >= (unsigned long)high_memory) { - unsigned long pfn = pte_pfn(*pte); -#ifdef CONFIG_HIGHMEM - if (pfn >= highstart_pfn) - kmap_flush_unused(); /* flush stale writable kmaps */ - else -#endif - make_lowmem_page_readonly( - phys_to_virt(pfn << PAGE_SHIFT), feature); - } -} - -void make_page_writable(void *va, unsigned int feature) -{ - pte_t *pte; - int rc; - - if (xen_feature(feature)) - return; - - pte = virt_to_ptep(va); - rc = HYPERVISOR_update_va_mapping( - (unsigned long)va, pte_mkwrite(*pte), 0); - if (rc) /* fallback? 
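
That is: when HYPERVISOR_update_va_mapping() refuses the change, fall back to writing the L1 entry directly with xen_l1_entry_update(); the pte pointer is already in hand, so the same PTE update goes through the generic mmu-update interface instead.
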
*/ - xen_l1_entry_update(pte, pte_mkwrite(*pte)); - if ((unsigned long)va >= (unsigned long)high_memory) { - unsigned long pfn = pte_pfn(*pte); -#ifdef CONFIG_HIGHMEM - if (pfn < highstart_pfn) -#endif - make_lowmem_page_writable( - phys_to_virt(pfn << PAGE_SHIFT), feature); - } -} - -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature) -{ - if (xen_feature(feature)) - return; - - while (nr-- != 0) { - make_page_readonly(va, feature); - va = (void *)((unsigned long)va + PAGE_SIZE); - } -} - -void make_pages_writable(void *va, unsigned int nr, unsigned int feature) -{ - if (xen_feature(feature)) - return; - - while (nr-- != 0) { - make_page_writable(va, feature); - va = (void *)((unsigned long)va + PAGE_SIZE); - } -} - -static inline void pgd_walk_set_prot(struct page *page, pgprot_t flags) -{ - unsigned long pfn = page_to_pfn(page); - int rc; - - if (PageHighMem(page)) { - if (pgprot_val(flags) & _PAGE_RW) - clear_bit(PG_pinned, &page->flags); - else - set_bit(PG_pinned, &page->flags); - } else { - rc = HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(pfn, flags), 0); - if (rc) - BUG(); - } -} - -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) -{ - pgd_t *pgd = pgd_base; - pud_t *pud; - pmd_t *pmd; - int g, u, m, rc; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, 0); - if (PTRS_PER_PUD > 1) /* not folded */ - pgd_walk_set_prot(virt_to_page(pud),flags); - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, 0); - if (PTRS_PER_PMD > 1) /* not folded */ - pgd_walk_set_prot(virt_to_page(pmd),flags); - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { - if (pmd_none(*pmd)) - continue; - pgd_walk_set_prot(pmd_page(*pmd),flags); - } - } - } - - rc = HYPERVISOR_update_va_mapping( - (unsigned long)pgd_base, - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), - UVMF_TLB_FLUSH); - if (rc) - BUG(); -} - -static void __pgd_pin(pgd_t *pgd) -{ - pgd_walk(pgd, PAGE_KERNEL_RO); - kmap_flush_unused(); - xen_pgd_pin(__pa(pgd)); - set_bit(PG_pinned, &virt_to_page(pgd)->flags); -} - -static void __pgd_unpin(pgd_t *pgd) -{ - xen_pgd_unpin(__pa(pgd)); - pgd_walk(pgd, PAGE_KERNEL); - clear_bit(PG_pinned, &virt_to_page(pgd)->flags); -} - -static void pgd_test_and_unpin(pgd_t *pgd) -{ - if (test_bit(PG_pinned, &virt_to_page(pgd)->flags)) - __pgd_unpin(pgd); -} - -void mm_pin(struct mm_struct *mm) -{ - if (xen_feature(XENFEAT_writable_page_tables)) - return; - spin_lock(&mm->page_table_lock); - __pgd_pin(mm->pgd); - spin_unlock(&mm->page_table_lock); -} - -void mm_unpin(struct mm_struct *mm) -{ - if (xen_feature(XENFEAT_writable_page_tables)) - return; - spin_lock(&mm->page_table_lock); - __pgd_unpin(mm->pgd); - spin_unlock(&mm->page_table_lock); -} - -void mm_pin_all(void) -{ - struct page *page; - unsigned long flags; - - if (xen_feature(XENFEAT_writable_page_tables)) - return; - - /* - * Allow uninterrupted access to the pgd_list. Also protects - * __pgd_pin() by disabling preemption. - * All other CPUs must be at a safe point (e.g., in stop_machine - * or offlined entirely). 
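
Pinning here is what __pgd_pin() implements above: pgd_walk() remaps every pagetable page of the tree read-only, then xen_pgd_pin() has the hypervisor validate and take a reference on the whole tree. mm_pin_all() is the save/restore-time sweep that applies this to every pgd in the system, which is why it may only run once the other CPUs are quiescent.
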
- */ - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = (struct page *)page->index) { - if (!test_bit(PG_pinned, &page->flags)) - __pgd_pin((pgd_t *)page_address(page)); - } - spin_unlock_irqrestore(&pgd_lock, flags); -} - -void _arch_dup_mmap(struct mm_struct *mm) -{ - if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags)) - mm_pin(mm); -} - -void _arch_exit_mmap(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - - /* - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() - * *much* faster this way, as no tlb flushes means bigger wrpt batches. - */ - if (tsk->active_mm == mm) { - tsk->active_mm = &init_mm; - atomic_inc(&init_mm.mm_count); - - switch_mm(mm, &init_mm, tsk); - - atomic_dec(&mm->mm_count); - BUG_ON(atomic_read(&mm->mm_count) == 0); - } - - task_unlock(tsk); - - if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) && - (atomic_read(&mm->mm_count) == 1) && - !mm->context.has_foreign_mappings) - mm_unpin(mm); -} |
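
Taken together, the pin state lives entirely in the PG_pinned bit of the page backing the pgd: _arch_dup_mmap() pins a copied mm on first touch, and _arch_exit_mmap() unpins early only when the mm is moribund (pinned, down to its last reference, and free of foreign mappings), so the teardown that follows runs against ordinary writable page tables. A minimal predicate in the spirit of this file; the helper name is hypothetical, not part of the original source:

	static inline int pgd_is_pinned(pgd_t *pgd)
	{
		/* the same test pgd_test_and_unpin() and _arch_exit_mmap() use */
		return test_bit(PG_pinned, &virt_to_page(pgd)->flags);
	}
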