diff options
Diffstat (limited to 'linux-2.6-xen-sparse')
117 files changed, 5600 insertions, 2389 deletions
diff --git a/linux-2.6-xen-sparse/arch/i386/Kconfig b/linux-2.6-xen-sparse/arch/i386/Kconfig index 661d0bbecd..b3ef1013c4 100644 --- a/linux-2.6-xen-sparse/arch/i386/Kconfig +++ b/linux-2.6-xen-sparse/arch/i386/Kconfig @@ -789,6 +789,9 @@ config DOUBLEFAULT endmenu +config ARCH_ENABLE_MEMORY_HOTPLUG + def_bool y + depends on HIGHMEM menu "Power management options (ACPI, APM)" depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST) diff --git a/linux-2.6-xen-sparse/arch/i386/kernel/fixup.c b/linux-2.6-xen-sparse/arch/i386/kernel/fixup.c index 20535e8fd4..2bf16fb732 100644 --- a/linux-2.6-xen-sparse/arch/i386/kernel/fixup.c +++ b/linux-2.6-xen-sparse/arch/i386/kernel/fixup.c @@ -46,6 +46,9 @@ fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) if (test_and_set_bit(0, &printed)) return; + if (current->tgid == 1) /* Ignore statically linked init */ + return; + HYPERVISOR_vm_assist( VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify); diff --git a/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S b/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S index dcef669792..1d0278c69b 100644 --- a/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S +++ b/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S @@ -9,7 +9,7 @@ #include <asm/page.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> -#include <xen/interface/arch-x86_32.h> +#include <xen/interface/xen.h> #include <xen/interface/elfnote.h> /* @@ -192,6 +192,7 @@ ENTRY(cpu_gdt_table) #endif /* !CONFIG_XEN_COMPAT_030002 */ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") #ifdef CONFIG_X86_PAE ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") diff --git a/linux-2.6-xen-sparse/arch/i386/kernel/microcode-xen.c b/linux-2.6-xen-sparse/arch/i386/kernel/microcode-xen.c index 65d6ff995e..926ba175c3 100644 --- a/linux-2.6-xen-sparse/arch/i386/kernel/microcode-xen.c +++ b/linux-2.6-xen-sparse/arch/i386/kernel/microcode-xen.c @@ -50,9 +50,6 @@ MODULE_LICENSE("GPL"); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DECLARE_MUTEX(microcode_sem); - -static void __user *user_buffer; /* user area microcode data buffer */ -static unsigned int user_buffer_size; /* it's size */ static int microcode_open (struct inode *unused1, struct file *unused2) { @@ -60,21 +57,26 @@ static int microcode_open (struct inode *unused1, struct file *unused2) } -static int do_microcode_update (void) +static int do_microcode_update (const void __user *ubuf, size_t len) { int err; - dom0_op_t op; + void *kbuf; + + kbuf = vmalloc(len); + if (!kbuf) + return -ENOMEM; - err = sys_mlock((unsigned long)user_buffer, user_buffer_size); - if (err != 0) - return err; + if (copy_from_user(kbuf, ubuf, len) == 0) { + dom0_op_t op; - op.cmd = DOM0_MICROCODE; - set_xen_guest_handle(op.u.microcode.data, user_buffer); - op.u.microcode.length = user_buffer_size; - err = HYPERVISOR_dom0_op(&op); + op.cmd = DOM0_MICROCODE; + set_xen_guest_handle(op.u.microcode.data, kbuf); + op.u.microcode.length = len; + err = HYPERVISOR_dom0_op(&op); + } else + err = -EFAULT; - (void)sys_munlock((unsigned long)user_buffer, user_buffer_size); + vfree(kbuf); return err; } @@ -88,17 +90,9 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ return -EINVAL; } - if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); - return -EINVAL; - } - down(µcode_sem); - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); + ret = do_microcode_update(buf, len); if (!ret) ret = (ssize_t)len; diff --git a/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c b/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c index 3457e31c8c..2586296dbb 100644 --- a/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c +++ b/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c @@ -65,6 +65,7 @@ #include <xen/interface/physdev.h> #include <xen/interface/memory.h> #include <xen/features.h> +#include <xen/xencons.h> #include "setup_arch_pre.h" #include <bios_ebda.h> @@ -155,6 +156,9 @@ struct ist_info ist_info; EXPORT_SYMBOL(ist_info); #endif struct e820map e820; +#ifdef CONFIG_XEN +struct e820map machine_e820; +#endif extern void early_cpu_init(void); extern void generic_apic_probe(char *); @@ -1450,7 +1454,6 @@ e820_setup_gap(struct e820entry *e820, int nr_map) static void __init register_memory(void) { #ifdef CONFIG_XEN - struct e820entry *machine_e820; struct xen_memory_map memmap; #endif int i; @@ -1460,14 +1463,14 @@ static void __init register_memory(void) return; #ifdef CONFIG_XEN - machine_e820 = alloc_bootmem_low_pages(PAGE_SIZE); - memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, machine_e820); + set_xen_guest_handle(memmap.buffer, machine_e820.map); - BUG_ON(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)); + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) + BUG(); + machine_e820.nr_map = memmap.nr_entries; - legacy_init_iomem_resources(machine_e820, memmap.nr_entries, + legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map, &code_resource, &data_resource); #else if (efi_enabled) @@ -1485,8 +1488,7 @@ static void __init register_memory(void) request_resource(&ioport_resource, &standard_io_resources[i]); #ifdef CONFIG_XEN - e820_setup_gap(machine_e820, memmap.nr_entries); - free_bootmem(__pa(machine_e820), PAGE_SIZE); + e820_setup_gap(machine_e820.map, machine_e820.nr_map); #else e820_setup_gap(e820.map, e820.nr_map); #endif @@ -1665,33 +1667,15 @@ void __init setup_arch(char **cmdline_p) screen_info.orig_video_cols = 80; screen_info.orig_video_ega_bx = 3; screen_info.orig_video_points = 16; + screen_info.orig_y = screen_info.orig_video_lines - 1; if (xen_start_info->console.dom0.info_size >= sizeof(struct dom0_vga_console_info)) { const struct dom0_vga_console_info *info = (struct dom0_vga_console_info *)( (char *)xen_start_info + xen_start_info->console.dom0.info_off); - screen_info.orig_video_mode = info->txt_mode; - screen_info.orig_video_isVGA = info->video_type; - screen_info.orig_video_lines = info->video_height; - screen_info.orig_video_cols = info->video_width; - screen_info.orig_video_points = info->txt_points; - screen_info.lfb_width = info->video_width; - screen_info.lfb_height = info->video_height; - screen_info.lfb_depth = info->lfb_depth; - screen_info.lfb_base = info->lfb_base; - screen_info.lfb_size = info->lfb_size; - screen_info.lfb_linelength = info->lfb_linelen; - screen_info.red_size = info->red_size; - screen_info.red_pos = info->red_pos; - screen_info.green_size = info->green_size; - screen_info.green_pos = info->green_pos; - screen_info.blue_size = info->blue_size; - screen_info.blue_pos = info->blue_pos; - screen_info.rsvd_size = info->rsvd_size; - screen_info.rsvd_pos = info->rsvd_pos; + dom0_init_screen_info(info); } - screen_info.orig_y = screen_info.orig_video_lines - 1; xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; } else diff --git a/linux-2.6-xen-sparse/arch/i386/kernel/sysenter.c b/linux-2.6-xen-sparse/arch/i386/kernel/sysenter.c index 844c87e78c..f300bd159e 100644 --- a/linux-2.6-xen-sparse/arch/i386/kernel/sysenter.c +++ b/linux-2.6-xen-sparse/arch/i386/kernel/sysenter.c @@ -60,7 +60,7 @@ int __init sysenter_setup(void) #ifdef CONFIG_XEN if (boot_cpu_has(X86_FEATURE_SEP)) { - struct callback_register sysenter = { + static struct callback_register __initdata sysenter = { .type = CALLBACKTYPE_sysenter, .address = { __KERNEL_CS, (unsigned long)sysenter_entry }, }; diff --git a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c index cc9907901b..05f3c47e50 100644 --- a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c +++ b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c @@ -716,6 +716,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) rcu_check_callbacks(cpu, user_mode(regs)); scheduler_tick(); run_posix_cpu_timers(current); + profile_tick(CPU_PROFILING, regs); return IRQ_HANDLED; } diff --git a/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c index 16a0155ecb..4939ab106e 100644 --- a/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c +++ b/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c @@ -282,12 +282,6 @@ static int spurious_fault(struct pt_regs *regs, pmd_t *pmd; pte_t *pte; -#ifdef CONFIG_XEN - /* Faults in hypervisor area are never spurious. */ - if (address >= HYPERVISOR_VIRT_START) - return 0; -#endif - /* Reserved-bit violation or user access to kernel space? */ if (error_code & 0x0c) return 0; @@ -372,7 +366,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, if (unlikely(address >= TASK_SIZE)) { #ifdef CONFIG_XEN /* Faults in hypervisor area can never be patched up. */ - if (address >= HYPERVISOR_VIRT_START) + if (address >= hypervisor_virt_start) goto bad_area_nosemaphore; #endif if (!(error_code & 5)) diff --git a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c index 5dc6646cf5..3c20351e92 100644 --- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c +++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c @@ -99,18 +99,6 @@ void xen_l4_entry_update(pgd_t *ptr, pgd_t val) } #endif /* CONFIG_X86_64 */ -void xen_machphys_update(unsigned long mfn, unsigned long pfn) -{ - mmu_update_t u; - if (xen_feature(XENFEAT_auto_translated_physmap)) { - BUG_ON(pfn != mfn); - return; - } - u.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; - u.val = pfn; - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} - void xen_pt_switch(unsigned long ptr) { struct mmuext_op op; @@ -325,6 +313,7 @@ int xen_create_contiguous_region( success = (exchange.nr_exchanged == (1UL << order)); BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); BUG_ON(success && (rc != 0)); +#ifdef CONFIG_XEN_COMPAT_030002 if (unlikely(rc == -ENOSYS)) { /* Compatibility when XENMEM_exchange is unsupported. */ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, @@ -341,6 +330,7 @@ int xen_create_contiguous_region( BUG(); } } +#endif /* 3. Map the new extent in place of old pages. */ for (i = 0; i < (1UL<<order); i++) { @@ -419,6 +409,7 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) success = (exchange.nr_exchanged == 1); BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); BUG_ON(success && (rc != 0)); +#ifdef CONFIG_XEN_COMPAT_030002 if (unlikely(rc == -ENOSYS)) { /* Compatibility when XENMEM_exchange is unsupported. */ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, @@ -429,6 +420,7 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) BUG(); success = 1; } +#endif /* 4. Map new pages in place of old pages. */ for (i = 0; i < (1UL<<order); i++) { diff --git a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c index c0c0bb2fa8..4d2b33068f 100644 --- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c +++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c @@ -130,7 +130,7 @@ static void __init page_table_range_init (unsigned long start, unsigned long end pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd)) + if (vaddr < hypervisor_virt_start && pmd_none(*pmd)) one_page_table_init(pmd); vaddr += PMD_SIZE; @@ -187,7 +187,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pmd += pmd_idx; for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; - if (address >= HYPERVISOR_VIRT_START) + if (address >= hypervisor_virt_start) continue; /* Map with big pages if possible, otherwise create normal page tables. */ @@ -410,7 +410,7 @@ static void __init pagetable_init (void) * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + page_table_range_init(vaddr, hypervisor_virt_start, pgd_base); permanent_kmaps_init(pgd_base); } @@ -663,8 +663,8 @@ void __init mem_init(void) totalram_pages += free_all_bootmem(); /* XEN: init and count low-mem pages outside initial allocation. */ for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) { - ClearPageReserved(&mem_map[pfn]); - set_page_count(&mem_map[pfn], 1); + ClearPageReserved(pfn_to_page(pfn)); + set_page_count(pfn_to_page(pfn), 1); totalram_pages++; } diff --git a/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c index 2fac26719c..b2e8832b85 100644 --- a/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c +++ b/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c @@ -29,6 +29,8 @@ static int direct_remap_area_pte_fn(pte_t *pte, { mmu_update_t **v = (mmu_update_t **)data; + BUG_ON(!pte_none(*pte)); + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); (*v)++; @@ -110,12 +112,14 @@ int direct_remap_pfn_range(struct vm_area_struct *vma, pgprot_t prot, domid_t domid) { - /* Same as remap_pfn_range(). */ - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return remap_pfn_range(vma, address, mfn, size, prot); if (domid == DOMID_SELF) return -EINVAL; + vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_mm->context.has_foreign_mappings = 1; return __direct_remap_pfn_range( @@ -245,7 +249,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l return NULL; area->phys_addr = phys_addr; addr = (void __iomem *) area->addr; - flags |= _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED; + flags |= _KERNPG_TABLE; if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, phys_addr>>PAGE_SHIFT, size, __pgprot(flags), domid)) { diff --git a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c index 843c9d0fd3..0ff01f52f8 100644 --- a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c +++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c @@ -102,8 +102,11 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) return; } pte = pte_offset_kernel(pmd, vaddr); - /* <pfn,flags> stored as-is, to permit clearing entries */ - set_pte(pte, pfn_pte(pfn, flags)); + if (pgprot_val(flags)) + /* <pfn,flags> stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte(pfn, flags)); + else + pte_clear(&init_mm, vaddr, pte); /* * It's enough to flush this one mapping. @@ -140,8 +143,11 @@ static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn, return; } pte = pte_offset_kernel(pmd, vaddr); - /* <pfn,flags> stored as-is, to permit clearing entries */ - set_pte(pte, pfn_pte_ma(pfn, flags)); + if (pgprot_val(flags)) + /* <pfn,flags> stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte_ma(pfn, flags)); + else + pte_clear(&init_mm, vaddr, pte); /* * It's enough to flush this one mapping. @@ -186,9 +192,16 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) } static int nr_fixmaps = 0; +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START; unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE); EXPORT_SYMBOL(__FIXADDR_TOP); +void __init set_fixaddr_top() +{ + BUG_ON(nr_fixmaps > 0); + __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE; +} + void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags) { unsigned long address = __fix_to_virt(idx); @@ -211,12 +224,6 @@ void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags) nr_fixmaps++; } -void set_fixaddr_top(unsigned long top) -{ - BUG_ON(nr_fixmaps > 0); - __FIXADDR_TOP = top - PAGE_SIZE; -} - pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); diff --git a/linux-2.6-xen-sparse/arch/i386/oprofile/Makefile b/linux-2.6-xen-sparse/arch/i386/oprofile/Makefile index e596c39c09..caaff108dc 100644 --- a/linux-2.6-xen-sparse/arch/i386/oprofile/Makefile +++ b/linux-2.6-xen-sparse/arch/i386/oprofile/Makefile @@ -7,7 +7,10 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) ifdef CONFIG_XEN -oprofile-y := $(DRIVER_OBJS) xenoprof.o +XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \ + xenoprofile.o) +oprofile-y := $(DRIVER_OBJS) \ + $(XENOPROF_COMMON_OBJS) xenoprof.o else oprofile-y := $(DRIVER_OBJS) init.o backtrace.o oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ diff --git a/linux-2.6-xen-sparse/arch/i386/oprofile/xenoprof.c b/linux-2.6-xen-sparse/arch/i386/oprofile/xenoprof.c index ed648c72cd..cf6d463fe7 100644 --- a/linux-2.6-xen-sparse/arch/i386/oprofile/xenoprof.c +++ b/linux-2.6-xen-sparse/arch/i386/oprofile/xenoprof.c @@ -9,249 +9,83 @@ * Modified by Aravind Menon and Jose Renato Santos for Xen * These modifications are: * Copyright (C) 2005 Hewlett-Packard Co. + * + * x86-specific part + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. */ #include <linux/init.h> -#include <linux/notifier.h> -#include <linux/smp.h> #include <linux/oprofile.h> -#include <linux/sysdev.h> -#include <linux/slab.h> -#include <linux/interrupt.h> -#include <linux/vmalloc.h> -#include <asm/nmi.h> -#include <asm/msr.h> -#include <asm/apic.h> +#include <linux/sched.h> #include <asm/pgtable.h> -#include <xen/evtchn.h> -#include "op_counter.h" #include <xen/driver_util.h> #include <xen/interface/xen.h> #include <xen/interface/xenoprof.h> -#include <../../../drivers/oprofile/cpu_buffer.h> -#include <../../../drivers/oprofile/event_buffer.h> - -#define MAX_XENOPROF_SAMPLES 16 - -static int xenoprof_start(void); -static void xenoprof_stop(void); +#include <xen/xenoprof.h> +#include "op_counter.h" -static int xenoprof_enabled = 0; static unsigned int num_events = 0; -static int is_primary = 0; -static int active_defined; -/* sample buffers shared with Xen */ -xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS]; -/* Shared buffer area */ -char * shared_buffer = NULL; -/* Number of buffers in shared area (one per VCPU) */ -int nbuf; -/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */ -int ovf_irq[NR_CPUS]; -/* cpu model type string - copied from Xen memory space on XENOPROF_init command */ -char cpu_type[XENOPROF_CPU_TYPE_SIZE]; - -/* Passive sample buffers shared with Xen */ -xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS]; -/* Passive shared buffer area */ -char *p_shared_buffer[MAX_OPROF_DOMAINS]; - -#ifdef CONFIG_PM - -static int xenoprof_suspend(struct sys_device * dev, pm_message_t state) +void __init xenoprof_arch_init_counter(struct xenoprof_init *init) { - if (xenoprof_enabled == 1) - xenoprof_stop(); - return 0; -} - - -static int xenoprof_resume(struct sys_device * dev) -{ - if (xenoprof_enabled == 1) - xenoprof_start(); - return 0; -} - - -static struct sysdev_class oprofile_sysclass = { - set_kset_name("oprofile"), - .resume = xenoprof_resume, - .suspend = xenoprof_suspend -}; - - -static struct sys_device device_oprofile = { - .id = 0, - .cls = &oprofile_sysclass, -}; - - -static int __init init_driverfs(void) -{ - int error; - if (!(error = sysdev_class_register(&oprofile_sysclass))) - error = sysdev_register(&device_oprofile); - return error; -} - - -static void __exit exit_driverfs(void) -{ - sysdev_unregister(&device_oprofile); - sysdev_class_unregister(&oprofile_sysclass); -} - -#else -#define init_driverfs() do { } while (0) -#define exit_driverfs() do { } while (0) -#endif /* CONFIG_PM */ - -unsigned long long oprofile_samples = 0; -unsigned long long p_oprofile_samples = 0; - -unsigned int pdomains; -struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS]; - -static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive) -{ - int head, tail, size; - - head = buf->event_head; - tail = buf->event_tail; - size = buf->event_size; - - if (tail > head) { - while (tail < size) { - oprofile_add_pc(buf->event_log[tail].eip, - buf->event_log[tail].mode, - buf->event_log[tail].event); - if (!is_passive) - oprofile_samples++; - else - p_oprofile_samples++; - tail++; - } - tail = 0; - } - while (tail < head) { - oprofile_add_pc(buf->event_log[tail].eip, - buf->event_log[tail].mode, - buf->event_log[tail].event); - if (!is_passive) - oprofile_samples++; - else - p_oprofile_samples++; - tail++; + num_events = init->num_events; + /* just in case - make sure we do not overflow event list + (i.e. counter_config list) */ + if (num_events > OP_MAX_COUNTER) { + num_events = OP_MAX_COUNTER; + init->num_events = num_events; } - - buf->event_tail = tail; } -static void xenoprof_handle_passive(void) +void xenoprof_arch_counter(void) { - int i, j; - int flag_domain, flag_switch = 0; - - for (i = 0; i < pdomains; i++) { - flag_domain = 0; - for (j = 0; j < passive_domains[i].nbuf; j++) { - xenoprof_buf_t *buf = p_xenoprof_buf[i][j]; - if (buf->event_head == buf->event_tail) - continue; - if (!flag_domain) { - if (!oprofile_add_domain_switch(passive_domains[i]. - domain_id)) - goto done; - flag_domain = 1; - } - xenoprof_add_pc(buf, 1); - flag_switch = 1; - } + int i; + struct xenoprof_counter counter; + + for (i=0; i<num_events; i++) { + counter.ind = i; + counter.count = (uint64_t)counter_config[i].count; + counter.enabled = (uint32_t)counter_config[i].enabled; + counter.event = (uint32_t)counter_config[i].event; + counter.kernel = (uint32_t)counter_config[i].kernel; + counter.user = (uint32_t)counter_config[i].user; + counter.unit_mask = (uint64_t)counter_config[i].unit_mask; + HYPERVISOR_xenoprof_op(XENOPROF_counter, + &counter); } -done: - if (flag_switch) - oprofile_add_domain_switch(COORDINATOR_DOMAIN); } -static irqreturn_t -xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs) +void xenoprof_arch_start(void) { - struct xenoprof_buf * buf; - int cpu; - static unsigned long flag; - - cpu = smp_processor_id(); - buf = xenoprof_buf[cpu]; - - xenoprof_add_pc(buf, 0); - - if (is_primary && !test_and_set_bit(0, &flag)) { - xenoprof_handle_passive(); - smp_mb__before_clear_bit(); - clear_bit(0, &flag); - } - - return IRQ_HANDLED; + /* nothing */ } - -static void unbind_virq(void) +void xenoprof_arch_stop(void) { - int i; - - for_each_cpu(i) { - if (ovf_irq[i] >= 0) { - unbind_from_irqhandler(ovf_irq[i], NULL); - ovf_irq[i] = -1; - } - } + /* nothing */ } - -static int bind_virq(void) +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf) { - int i, result; - - for_each_cpu(i) { - result = bind_virq_to_irqhandler(VIRQ_XENOPROF, - i, - xenoprof_ovf_interrupt, - SA_INTERRUPT, - "xenoprof", - NULL); - - if (result < 0) { - unbind_virq(); - return result; - } - - ovf_irq[i] = result; + if (sbuf->buffer) { + vunmap(sbuf->buffer); + sbuf->buffer = NULL; } - - return 0; } - -static int map_xenoprof_buffer(int max_samples) +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer, + struct xenoprof_shared_buffer * sbuf) { - struct xenoprof_get_buffer get_buffer; - struct xenoprof_buf *buf; - int npages, ret, i; + int npages, ret; struct vm_struct *area; - if ( shared_buffer ) - return 0; - - get_buffer.max_samples = max_samples; - - if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, &get_buffer)) ) + sbuf->buffer = NULL; + if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) ) return ret; - nbuf = get_buffer.nbuf; - npages = (get_buffer.bufsize * nbuf - 1) / PAGE_SIZE + 1; + npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1; area = alloc_vm_area(npages * PAGE_SIZE); if (area == NULL) @@ -259,231 +93,55 @@ static int map_xenoprof_buffer(int max_samples) if ( (ret = direct_kernel_remap_pfn_range( (unsigned long)area->addr, - get_buffer.buf_maddr >> PAGE_SHIFT, - npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE), DOMID_SELF)) ) { + get_buffer->buf_gmaddr >> PAGE_SHIFT, + npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE), + DOMID_SELF)) ) { vunmap(area->addr); return ret; } - shared_buffer = area->addr; - for (i=0; i< nbuf; i++) { - buf = (struct xenoprof_buf*) - &shared_buffer[i * get_buffer.bufsize]; - BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS); - xenoprof_buf[buf->vcpu_id] = buf; - } - - return 0; -} - - -static int xenoprof_setup(void) -{ - int ret; - int i; - - if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) ) - return ret; - - if ( (ret = bind_virq()) ) - return ret; - - if (is_primary) { - struct xenoprof_counter counter; - - /* Define dom0 as an active domain if not done yet */ - if (!active_defined) { - domid_t domid; - ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); - if (ret) - goto err; - domid = 0; - ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); - if (ret) - goto err; - active_defined = 1; - } - - ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL); - if (ret) - goto err; - for (i=0; i<num_events; i++) { - counter.ind = i; - counter.count = (uint64_t)counter_config[i].count; - counter.enabled = (uint32_t)counter_config[i].enabled; - counter.event = (uint32_t)counter_config[i].event; - counter.kernel = (uint32_t)counter_config[i].kernel; - counter.user = (uint32_t)counter_config[i].user; - counter.unit_mask = (uint64_t)counter_config[i].unit_mask; - HYPERVISOR_xenoprof_op(XENOPROF_counter, - &counter); - } - ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL); - - if (ret) - goto err; - } - - ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL); - if (ret) - goto err; - - xenoprof_enabled = 1; - return 0; - err: - unbind_virq(); - return ret; -} - - -static void xenoprof_shutdown(void) -{ - xenoprof_enabled = 0; - - HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL); - - if (is_primary) { - HYPERVISOR_xenoprof_op(XENOPROF_release_counters, NULL); - active_defined = 0; - } - - unbind_virq(); - -} - - -static int xenoprof_start(void) -{ - int ret = 0; - - if (is_primary) - ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL); - - return ret; -} - - -static void xenoprof_stop(void) -{ - if (is_primary) - HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL); -} - - -static int xenoprof_set_active(int * active_domains, - unsigned int adomains) -{ - int ret = 0; - int i; - int set_dom0 = 0; - domid_t domid; - - if (!is_primary) - return 0; - - if (adomains > MAX_OPROF_DOMAINS) - return -E2BIG; - - ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); - if (ret) - return ret; - - for (i=0; i<adomains; i++) { - domid = active_domains[i]; - if (domid != active_domains[i]) { - ret = -EINVAL; - goto out; - } - ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); - if (ret) - goto out; - if (active_domains[i] == 0) - set_dom0 = 1; - } - /* dom0 must always be active but may not be in the list */ - if (!set_dom0) { - domid = 0; - ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); - } - -out: - if (ret) - HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); - active_defined = !ret; + sbuf->buffer = area->addr; return ret; } -static int xenoprof_set_passive(int * p_domains, - unsigned int pdoms) +int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain, + struct xenoprof_shared_buffer * sbuf) { int ret; - int i, j; int npages; - struct xenoprof_buf *buf; struct vm_struct *area; pgprot_t prot = __pgprot(_KERNPG_TABLE); - if (!is_primary) - return 0; - - if (pdoms > MAX_OPROF_DOMAINS) - return -E2BIG; - - ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL); + sbuf->buffer = NULL; + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain); if (ret) - return ret; - - for (i = 0; i < pdoms; i++) { - passive_domains[i].domain_id = p_domains[i]; - passive_domains[i].max_samples = 2048; - ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, - &passive_domains[i]); - if (ret) - goto out; - - npages = (passive_domains[i].bufsize * passive_domains[i].nbuf - 1) / PAGE_SIZE + 1; - - area = alloc_vm_area(npages * PAGE_SIZE); - if (area == NULL) { - ret = -ENOMEM; - goto out; - } - - ret = direct_kernel_remap_pfn_range( - (unsigned long)area->addr, - passive_domains[i].buf_maddr >> PAGE_SHIFT, - npages * PAGE_SIZE, prot, DOMID_SELF); - if (ret) { - vunmap(area->addr); - goto out; - } + goto out; - p_shared_buffer[i] = area->addr; - - for (j = 0; j < passive_domains[i].nbuf; j++) { - buf = (struct xenoprof_buf *) - &p_shared_buffer[i][j * passive_domains[i].bufsize]; - BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS); - p_xenoprof_buf[i][buf->vcpu_id] = buf; - } + npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1; + area = alloc_vm_area(npages * PAGE_SIZE); + if (area == NULL) { + ret = -ENOMEM; + goto out; } - pdomains = pdoms; - return 0; - -out: - for (j = 0; j < i; j++) { - vunmap(p_shared_buffer[j]); - p_shared_buffer[j] = NULL; + ret = direct_kernel_remap_pfn_range( + (unsigned long)area->addr, + pdomain->buf_gmaddr >> PAGE_SHIFT, + npages * PAGE_SIZE, prot, DOMID_SELF); + if (ret) { + vunmap(area->addr); + goto out; } + sbuf->buffer = area->addr; - return ret; +out: + return ret; } struct op_counter_config counter_config[OP_MAX_COUNTER]; -static int xenoprof_create_files(struct super_block * sb, struct dentry * root) +int xenoprof_create_files(struct super_block * sb, struct dentry * root) { unsigned int i; @@ -510,75 +168,12 @@ static int xenoprof_create_files(struct super_block * sb, struct dentry * root) return 0; } - -struct oprofile_operations xenoprof_ops = { - .create_files = xenoprof_create_files, - .set_active = xenoprof_set_active, - .set_passive = xenoprof_set_passive, - .setup = xenoprof_setup, - .shutdown = xenoprof_shutdown, - .start = xenoprof_start, - .stop = xenoprof_stop -}; - - -/* in order to get driverfs right */ -static int using_xenoprof; - int __init oprofile_arch_init(struct oprofile_operations * ops) { - struct xenoprof_init init; - int ret, i; - - ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init); - - if (!ret) { - num_events = init.num_events; - is_primary = init.is_primary; - - /* just in case - make sure we do not overflow event list - (i.e. counter_config list) */ - if (num_events > OP_MAX_COUNTER) - num_events = OP_MAX_COUNTER; - - /* cpu_type is detected by Xen */ - cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0; - strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1); - xenoprof_ops.cpu_type = cpu_type; - - init_driverfs(); - using_xenoprof = 1; - *ops = xenoprof_ops; - - for (i=0; i<NR_CPUS; i++) - ovf_irq[i] = -1; - - active_defined = 0; - } - printk(KERN_INFO "oprofile_arch_init: ret %d, events %d, " - "is_primary %d\n", ret, num_events, is_primary); - return ret; + return xenoprofile_init(ops); } - -void __exit oprofile_arch_exit(void) +void oprofile_arch_exit(void) { - int i; - - if (using_xenoprof) - exit_driverfs(); - - if (shared_buffer) { - vunmap(shared_buffer); - shared_buffer = NULL; - } - if (is_primary) { - for (i = 0; i < pdomains; i++) - if (p_shared_buffer[i]) { - vunmap(p_shared_buffer[i]); - p_shared_buffer[i] = NULL; - } - HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL); - } - + xenoprofile_exit(); } diff --git a/linux-2.6-xen-sparse/arch/ia64/Kconfig b/linux-2.6-xen-sparse/arch/ia64/Kconfig index c32f4cee4c..4073a04638 100644 --- a/linux-2.6-xen-sparse/arch/ia64/Kconfig +++ b/linux-2.6-xen-sparse/arch/ia64/Kconfig @@ -64,6 +64,20 @@ config XEN_IA64_VDSO_PARAVIRT help vDSO paravirtualization +config XEN_IA64_EXPOSE_P2M + bool "Xen/IA64 exposure p2m table" + depends on XEN + default y + help + expose p2m from xen + +config XEN_IA64_EXPOSE_P2M_USE_DTR + bool "Xen/IA64 map p2m table with dtr" + depends on XEN_IA64_EXPOSE_P2M + default y + help + use dtr to map the exposed p2m table + config SCHED_NO_NO_OMIT_FRAME_POINTER bool default y @@ -276,6 +290,9 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. +config ARCH_ENABLE_MEMORY_HOTPLUG + def_bool y + config SCHED_SMT bool "SMT scheduler support" depends on SMP diff --git a/linux-2.6-xen-sparse/arch/ia64/dig/setup.c b/linux-2.6-xen-sparse/arch/ia64/dig/setup.c index 90d6ab64fa..7f3826991a 100644 --- a/linux-2.6-xen-sparse/arch/ia64/dig/setup.c +++ b/linux-2.6-xen-sparse/arch/ia64/dig/setup.c @@ -25,6 +25,8 @@ #include <asm/machvec.h> #include <asm/system.h> +#include <xen/xencons.h> + void __init dig_setup (char **cmdline_p) { @@ -78,27 +80,8 @@ dig_setup (char **cmdline_p) (struct dom0_vga_console_info *)( (char *)xen_start_info + xen_start_info->console.dom0.info_off); - screen_info.orig_video_mode = info->txt_mode; - screen_info.orig_video_isVGA = info->video_type; - screen_info.orig_video_lines = info->video_height; - screen_info.orig_video_cols = info->video_width; - screen_info.orig_video_points = info->txt_points; - screen_info.lfb_width = info->video_width; - screen_info.lfb_height = info->video_height; - screen_info.lfb_depth = info->lfb_depth; - screen_info.lfb_base = info->lfb_base; - screen_info.lfb_size = info->lfb_size; - screen_info.lfb_linelength = info->lfb_linelen; - screen_info.red_size = info->red_size; - screen_info.red_pos = info->red_pos; - screen_info.green_size = info->green_size; - screen_info.green_pos = info->green_pos; - screen_info.blue_size = info->blue_size; - screen_info.blue_pos = info->blue_pos; - screen_info.rsvd_size = info->rsvd_size; - screen_info.rsvd_pos = info->rsvd_pos; + dom0_init_screen_info(info); } - screen_info.orig_y = screen_info.orig_video_lines - 1; xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; #endif diff --git a/linux-2.6-xen-sparse/arch/ia64/kernel/Makefile b/linux-2.6-xen-sparse/arch/ia64/kernel/Makefile new file mode 100644 index 0000000000..003e9ee600 --- /dev/null +++ b/linux-2.6-xen-sparse/arch/ia64/kernel/Makefile @@ -0,0 +1,62 @@ +# +# Makefile for the linux kernel. +# + +extra-y := head.o init_task.o vmlinux.lds + +obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ + irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ + salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ + unwind.o mca.o mca_asm.o topology.o + +obj-$(CONFIG_IA64_BRL_EMU) += brl_emu.o +obj-$(CONFIG_IA64_GENERIC) += acpi-ext.o +obj-$(CONFIG_IA64_HP_ZX1) += acpi-ext.o +obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o + +ifneq ($(CONFIG_ACPI_PROCESSOR),) +obj-y += acpi-processor.o +endif + +obj-$(CONFIG_IA64_PALINFO) += palinfo.o +obj-$(CONFIG_IOSAPIC) += iosapic.o +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_SMP) += smp.o smpboot.o +obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o +obj-$(CONFIG_IA64_CYCLONE) += cyclone.o +obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o +obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o +obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o +mca_recovery-y += mca_drv.o mca_drv_asm.o + +# The gate DSO image is built using a special linker script. +targets += gate.so gate-syms.o + +extra-y += gate.so gate-syms.o gate.lds gate.o + +# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. +CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 + +CPPFLAGS_gate.lds := -P -C -U$(ARCH) + +quiet_cmd_gate = GATE $@ + cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ + +GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) +$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +$(obj)/built-in.o: $(obj)/gate-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o + +GATECFLAGS_gate-syms.o = -r +$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +# gate-data.o contains the gate DSO image as data in section .data.gate. +# We must build gate.so before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/gate-data.o: $(obj)/gate.so diff --git a/linux-2.6-xen-sparse/arch/ia64/kernel/gate.lds.S b/linux-2.6-xen-sparse/arch/ia64/kernel/gate.lds.S index 5f0163f0be..45377beaa4 100644 --- a/linux-2.6-xen-sparse/arch/ia64/kernel/gate.lds.S +++ b/linux-2.6-xen-sparse/arch/ia64/kernel/gate.lds.S @@ -13,6 +13,7 @@ SECTIONS . = GATE_ADDR + SIZEOF_HEADERS; .hash : { *(.hash) } :readable + .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } diff --git a/linux-2.6-xen-sparse/arch/ia64/kernel/setup.c b/linux-2.6-xen-sparse/arch/ia64/kernel/setup.c index f792900cf9..8f15b3001c 100644 --- a/linux-2.6-xen-sparse/arch/ia64/kernel/setup.c +++ b/linux-2.6-xen-sparse/arch/ia64/kernel/setup.c @@ -63,6 +63,7 @@ #include <asm/system.h> #ifdef CONFIG_XEN #include <asm/hypervisor.h> +#include <asm/xen/xencomm.h> #endif #include <linux/dma-mapping.h> @@ -433,6 +434,9 @@ setup_arch (char **cmdline_p) #ifdef CONFIG_XEN if (is_running_on_xen()) { + /* Must be done before any hypercall. */ + xencomm_init(); + setup_xen_features(); /* Register a call for panic conditions. */ notifier_chain_register(&panic_notifier_list, &xen_panic_block); diff --git a/linux-2.6-xen-sparse/arch/ia64/xen/Makefile b/linux-2.6-xen-sparse/arch/ia64/xen/Makefile index c2b4f94edd..36434aac72 100644 --- a/linux-2.6-xen-sparse/arch/ia64/xen/Makefile +++ b/linux-2.6-xen-sparse/arch/ia64/xen/Makefile @@ -3,6 +3,7 @@ # obj-y := hypercall.o xenivt.o xenentry.o xensetup.o xenpal.o xenhpski.o \ - hypervisor.o pci-dma-xen.o util.o + hypervisor.o pci-dma-xen.o util.o xencomm.o xcom_hcall.o \ + xcom_mini.o xcom_privcmd.o pci-dma-xen-y := ../../i386/kernel/pci-dma-xen.o diff --git a/linux-2.6-xen-sparse/arch/ia64/xen/hypervisor.c b/linux-2.6-xen-sparse/arch/ia64/xen/hypervisor.c index 0b286047b9..2a85caa0d5 100644 --- a/linux-2.6-xen-sparse/arch/ia64/xen/hypervisor.c +++ b/linux-2.6-xen-sparse/arch/ia64/xen/hypervisor.c @@ -40,59 +40,11 @@ EXPORT_SYMBOL(xen_start_info); int running_on_xen; EXPORT_SYMBOL(running_on_xen); -//XXX xen/ia64 copy_from_guest() is broken. -// This is a temporal work around until it is fixed. -// used by balloon.c netfront.c - -// get_xen_guest_handle is defined only when __XEN_TOOLS__ is defined -// if the definition in arch-ia64.h is changed, this must be updated. -#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) - -int -ia64_xenmem_reservation_op(unsigned long op, - struct xen_memory_reservation* reservation__) -{ - struct xen_memory_reservation reservation = *reservation__; - unsigned long* frame_list; - unsigned long nr_extents = reservation__->nr_extents; - int ret = 0; - get_xen_guest_handle(frame_list, reservation__->extent_start); - - BUG_ON(op != XENMEM_increase_reservation && - op != XENMEM_decrease_reservation && - op != XENMEM_populate_physmap); - - while (nr_extents > 0) { - int tmp_ret; - volatile unsigned long dummy; - - set_xen_guest_handle(reservation.extent_start, frame_list); - reservation.nr_extents = nr_extents; - - dummy = frame_list[0];// re-install tlb entry before hypercall - tmp_ret = ____HYPERVISOR_memory_op(op, &reservation); - if (tmp_ret < 0) { - if (ret == 0) { - ret = tmp_ret; - } - break; - } - if (tmp_ret == 0) { - //XXX dirty work around for skbuff_ctor() - // of a non-privileged domain, - if ((op == XENMEM_increase_reservation || - op == XENMEM_populate_physmap) && - !is_initial_xendomain() && - reservation.extent_order > 0) - return ret; - } - frame_list += tmp_ret; - nr_extents -= tmp_ret; - ret += tmp_ret; - } - return ret; -} -EXPORT_SYMBOL(ia64_xenmem_reservation_op); +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M +static int p2m_expose_init(void); +#else +#define p2m_expose_init() (-ENOSYS) +#endif //XXX same as i386, x86_64 contiguous_bitmap_set(), contiguous_bitmap_clear() // move those to lib/contiguous_bitmap? @@ -371,8 +323,6 @@ gnttab_map_grant_ref_pre(struct gnttab_map_grant_ref *uop) int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count) { - __u64 va1, va2, pa1, pa2; - if (cmd == GNTTABOP_map_grant_ref) { unsigned int i; for (i = 0; i < count; i++) { @@ -380,29 +330,7 @@ HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count) (struct gnttab_map_grant_ref*)uop + i); } } - va1 = (__u64)uop & PAGE_MASK; - pa1 = pa2 = 0; - if ((REGION_NUMBER(va1) == 5) && - ((va1 - KERNEL_START) >= KERNEL_TR_PAGE_SIZE)) { - pa1 = ia64_tpa(va1); - if (cmd <= GNTTABOP_transfer) { - static uint32_t uop_size[GNTTABOP_transfer + 1] = { - sizeof(struct gnttab_map_grant_ref), - sizeof(struct gnttab_unmap_grant_ref), - sizeof(struct gnttab_setup_table), - sizeof(struct gnttab_dump_table), - sizeof(struct gnttab_transfer), - }; - va2 = (__u64)uop + (uop_size[cmd] * count) - 1; - va2 &= PAGE_MASK; - if (va1 != va2) { - /* maximum size of uop is 2pages */ - BUG_ON(va2 > va1 + PAGE_SIZE); - pa2 = ia64_tpa(va2); - } - } - } - return ____HYPERVISOR_grant_table_op(cmd, uop, count, pa1, pa2); + return xencomm_mini_hypercall_grant_table_op(cmd, uop, count); } EXPORT_SYMBOL(HYPERVISOR_grant_table_op); @@ -526,6 +454,10 @@ out: privcmd_resource_min, privcmd_resource_max, (privcmd_resource_max - privcmd_resource_min) >> 20); BUG_ON(privcmd_resource_min >= privcmd_resource_max); + + // XXX this should be somewhere appropriate + (void)p2m_expose_init(); + return 0; } late_initcall(xen_ia64_privcmd_init); @@ -546,6 +478,7 @@ struct xen_ia64_privcmd_range { }; struct xen_ia64_privcmd_vma { + int is_privcmd_mmapped; struct xen_ia64_privcmd_range* range; unsigned long num_entries; @@ -684,12 +617,15 @@ __xen_ia64_privcmd_vma_open(struct vm_area_struct* vma, static void xen_ia64_privcmd_vma_open(struct vm_area_struct* vma) { + struct xen_ia64_privcmd_vma* old_privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data; struct xen_ia64_privcmd_vma* privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data; struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range; atomic_inc(&privcmd_range->ref_count); // vm_op->open() can't fail. privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL | __GFP_NOFAIL); + // copy original value if necessary + privcmd_vma->is_privcmd_mmapped = old_privcmd_vma->is_privcmd_mmapped; __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range); } @@ -725,6 +661,14 @@ xen_ia64_privcmd_vma_close(struct vm_area_struct* vma) } int +privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +{ + struct xen_ia64_privcmd_vma* privcmd_vma = + (struct xen_ia64_privcmd_vma *)vma->vm_private_data; + return (xchg(&privcmd_vma->is_privcmd_mmapped, 1) == 0); +} + +int privcmd_mmap(struct file * file, struct vm_area_struct * vma) { int error; @@ -749,6 +693,8 @@ privcmd_mmap(struct file * file, struct vm_area_struct * vma) if (privcmd_vma == NULL) { goto out_enomem1; } + privcmd_vma->is_privcmd_mmapped = 0; + res = kzalloc(sizeof(*res), GFP_KERNEL); if (res == NULL) { goto out_enomem1; @@ -831,3 +777,276 @@ time_resume(void) /* Just trigger a tick. */ ia64_cpu_local_tick(); } + +/////////////////////////////////////////////////////////////////////////// +// expose p2m table +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M +#include <linux/cpu.h> +#include <asm/uaccess.h> + +int p2m_initialized __read_mostly = 0; + +unsigned long p2m_min_low_pfn __read_mostly; +unsigned long p2m_max_low_pfn __read_mostly; +unsigned long p2m_convert_min_pfn __read_mostly; +unsigned long p2m_convert_max_pfn __read_mostly; + +static struct resource p2m_resource = { + .name = "Xen p2m table", + .flags = IORESOURCE_MEM, +}; +static unsigned long p2m_assign_start_pfn __read_mostly; +static unsigned long p2m_assign_end_pfn __read_mostly; +volatile const pte_t* p2m_pte __read_mostly; + +#define GRNULE_PFN PTRS_PER_PTE +static unsigned long p2m_granule_pfn __read_mostly = GRNULE_PFN; + +#define ROUNDDOWN(x, y) ((x) & ~((y) - 1)) +#define ROUNDUP(x, y) (((x) + (y) - 1) & ~((y) - 1)) + +#define P2M_PREFIX "Xen p2m: " + +static int xen_ia64_p2m_expose __read_mostly = 1; +module_param(xen_ia64_p2m_expose, int, 0); +MODULE_PARM_DESC(xen_ia64_p2m_expose, + "enable/disable xen/ia64 p2m exposure optimization\n"); + +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR +static int xen_ia64_p2m_expose_use_dtr __read_mostly = 1; +module_param(xen_ia64_p2m_expose_use_dtr, int, 0); +MODULE_PARM_DESC(xen_ia64_p2m_expose_use_dtr, + "use/unuse dtr to map exposed p2m table\n"); + +static const int p2m_page_shifts[] = { + _PAGE_SIZE_4K, + _PAGE_SIZE_8K, + _PAGE_SIZE_16K, + _PAGE_SIZE_64K, + _PAGE_SIZE_256K, + _PAGE_SIZE_1M, + _PAGE_SIZE_4M, + _PAGE_SIZE_16M, + _PAGE_SIZE_64M, + _PAGE_SIZE_256M, +}; + +struct p2m_itr_arg { + unsigned long vaddr; + unsigned long pteval; + unsigned long log_page_size; +}; +static struct p2m_itr_arg p2m_itr_arg __read_mostly; + +// This should be in asm-ia64/kregs.h +#define IA64_TR_P2M_TABLE 3 + +static void +p2m_itr(void* info) +{ + struct p2m_itr_arg* arg = (struct p2m_itr_arg*)info; + ia64_itr(0x2, IA64_TR_P2M_TABLE, + arg->vaddr, arg->pteval, arg->log_page_size); + ia64_srlz_d(); +} + +static int +p2m_expose_dtr_call(struct notifier_block *self, + unsigned long event, void* ptr) +{ + unsigned int cpu = (unsigned int)(long)ptr; + if (event != CPU_ONLINE) + return 0; + if (!(p2m_initialized && xen_ia64_p2m_expose_use_dtr)) + smp_call_function_single(cpu, &p2m_itr, &p2m_itr_arg, 1, 1); + return 0; +} + +static struct notifier_block p2m_expose_dtr_hotplug_notifier = { + .notifier_call = p2m_expose_dtr_call, + .next = NULL, + .priority = 0 +}; +#endif + +static int +p2m_expose_init(void) +{ + unsigned long num_pfn; + unsigned long size = 0; + unsigned long p2m_size = 0; + unsigned long align = ~0UL; + int error = 0; +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR + int i; + unsigned long page_size; + unsigned long log_page_size = 0; +#endif + + if (!xen_ia64_p2m_expose) + return -ENOSYS; + if (p2m_initialized) + return 0; + +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR + error = register_cpu_notifier(&p2m_expose_dtr_hotplug_notifier); + if (error < 0) + return error; +#endif + + lock_cpu_hotplug(); + if (p2m_initialized) + goto out; + +#ifdef CONFIG_DISCONTIGMEM + p2m_min_low_pfn = min_low_pfn; + p2m_max_low_pfn = max_low_pfn; +#else + p2m_min_low_pfn = 0; + p2m_max_low_pfn = max_pfn; +#endif + +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR + if (xen_ia64_p2m_expose_use_dtr) { + unsigned long granule_pfn = 0; + p2m_size = p2m_max_low_pfn - p2m_min_low_pfn; + for (i = 0; + i < sizeof(p2m_page_shifts)/sizeof(p2m_page_shifts[0]); + i++) { + log_page_size = p2m_page_shifts[i]; + page_size = 1UL << log_page_size; + if (page_size < p2m_size) + continue; + + granule_pfn = max(page_size >> PAGE_SHIFT, + p2m_granule_pfn); + p2m_convert_min_pfn = ROUNDDOWN(p2m_min_low_pfn, + granule_pfn); + p2m_convert_max_pfn = ROUNDUP(p2m_max_low_pfn, + granule_pfn); + num_pfn = p2m_convert_max_pfn - p2m_convert_min_pfn; + size = num_pfn << PAGE_SHIFT; + p2m_size = num_pfn / PTRS_PER_PTE; + p2m_size = ROUNDUP(p2m_size, granule_pfn << PAGE_SHIFT); + if (p2m_size == page_size) + break; + } + if (p2m_size != page_size) { + printk(KERN_ERR "p2m_size != page_size\n"); + error = -EINVAL; + goto out; + } + align = max(privcmd_resource_align, granule_pfn << PAGE_SHIFT); + } else +#endif + { + BUG_ON(p2m_granule_pfn & (p2m_granule_pfn - 1)); + p2m_convert_min_pfn = ROUNDDOWN(p2m_min_low_pfn, + p2m_granule_pfn); + p2m_convert_max_pfn = ROUNDUP(p2m_max_low_pfn, p2m_granule_pfn); + num_pfn = p2m_convert_max_pfn - p2m_convert_min_pfn; + size = num_pfn << PAGE_SHIFT; + p2m_size = num_pfn / PTRS_PER_PTE; + p2m_size = ROUNDUP(p2m_size, p2m_granule_pfn << PAGE_SHIFT); + align = max(privcmd_resource_align, + p2m_granule_pfn << PAGE_SHIFT); + } + + // use privcmd region + error = allocate_resource(&iomem_resource, &p2m_resource, p2m_size, + privcmd_resource_min, privcmd_resource_max, + align, NULL, NULL); + if (error) { + printk(KERN_ERR P2M_PREFIX + "can't allocate region for p2m exposure " + "[0x%016lx, 0x%016lx) 0x%016lx\n", + p2m_convert_min_pfn, p2m_convert_max_pfn, p2m_size); + goto out; + } + + p2m_assign_start_pfn = p2m_resource.start >> PAGE_SHIFT; + p2m_assign_end_pfn = p2m_resource.end >> PAGE_SHIFT; + + error = HYPERVISOR_expose_p2m(p2m_convert_min_pfn, + p2m_assign_start_pfn, + size, p2m_granule_pfn); + if (error) { + printk(KERN_ERR P2M_PREFIX "failed expose p2m hypercall %d\n", + error); + printk(KERN_ERR P2M_PREFIX "conv 0x%016lx assign 0x%016lx " + "size 0x%016lx granule 0x%016lx\n", + p2m_convert_min_pfn, p2m_assign_start_pfn, + size, p2m_granule_pfn);; + release_resource(&p2m_resource); + goto out; + } + p2m_pte = (volatile const pte_t*)pfn_to_kaddr(p2m_assign_start_pfn); +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR + if (xen_ia64_p2m_expose_use_dtr) { + p2m_itr_arg.vaddr = (unsigned long)__va(p2m_assign_start_pfn + << PAGE_SHIFT); + p2m_itr_arg.pteval = pte_val(pfn_pte(p2m_assign_start_pfn, + PAGE_KERNEL)); + p2m_itr_arg.log_page_size = log_page_size; + smp_mb(); + smp_call_function(&p2m_itr, &p2m_itr_arg, 1, 1); + p2m_itr(&p2m_itr_arg); + } +#endif + smp_mb(); + p2m_initialized = 1; + printk(P2M_PREFIX "assign p2m table of [0x%016lx, 0x%016lx)\n", + p2m_convert_min_pfn << PAGE_SHIFT, + p2m_convert_max_pfn << PAGE_SHIFT); + printk(P2M_PREFIX "to [0x%016lx, 0x%016lx) (%ld KBytes)\n", + p2m_assign_start_pfn << PAGE_SHIFT, + p2m_assign_end_pfn << PAGE_SHIFT, + p2m_size / 1024); +out: + unlock_cpu_hotplug(); + return error; +} + +#ifdef notyet +void +p2m_expose_cleanup(void) +{ + BUG_ON(!p2m_initialized); +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M_USE_DTR + unregister_cpu_notifier(&p2m_expose_dtr_hotplug_notifier); +#endif + release_resource(&p2m_resource); +} +#endif + +//XXX inlinize? +unsigned long +p2m_phystomach(unsigned long gpfn) +{ + volatile const pte_t* pte; + unsigned long mfn; + unsigned long pteval; + + if (!p2m_initialized || + gpfn < p2m_min_low_pfn || gpfn > p2m_max_low_pfn + /* || !pfn_valid(gpfn) */) + return INVALID_MFN; + pte = p2m_pte + (gpfn - p2m_convert_min_pfn); + + mfn = INVALID_MFN; + if (likely(__get_user(pteval, (unsigned long __user *)pte) == 0 && + pte_present(__pte(pteval)) && + pte_pfn(__pte(pteval)) != (INVALID_MFN >> PAGE_SHIFT))) + mfn = (pteval & _PFN_MASK) >> PAGE_SHIFT; + + return mfn; +} + +EXPORT_SYMBOL_GPL(p2m_initialized); +EXPORT_SYMBOL_GPL(p2m_min_low_pfn); +EXPORT_SYMBOL_GPL(p2m_max_low_pfn); +EXPORT_SYMBOL_GPL(p2m_convert_min_pfn); +EXPORT_SYMBOL_GPL(p2m_convert_max_pfn); +EXPORT_SYMBOL_GPL(p2m_pte); +EXPORT_SYMBOL_GPL(p2m_phystomach); +#endif diff --git a/linux-2.6-xen-sparse/arch/ia64/xen/util.c b/linux-2.6-xen-sparse/arch/ia64/xen/util.c index 02dfaabc66..7df0c5f72c 100644 --- a/linux-2.6-xen-sparse/arch/ia64/xen/util.c +++ b/linux-2.6-xen-sparse/arch/ia64/xen/util.c @@ -28,6 +28,8 @@ #include <linux/vmalloc.h> #include <asm/uaccess.h> #include <xen/driver_util.h> +#include <xen/interface/memory.h> +#include <asm/hypercall.h> struct vm_struct *alloc_vm_area(unsigned long size) { diff --git a/linux-2.6-xen-sparse/arch/ia64/xen/xcom_hcall.c b/linux-2.6-xen-sparse/arch/ia64/xen/xcom_hcall.c new file mode 100644 index 0000000000..fda0a45ec5 --- /dev/null +++ b/linux-2.6-xen-sparse/arch/ia64/xen/xcom_hcall.c @@ -0,0 +1,303 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Tristan Gingold <tristan.gingold@bull.net> + */ +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/module.h> +#include <xen/interface/xen.h> +#include <xen/interface/dom0_ops.h> +#include <xen/interface/memory.h> +#include <xen/interface/xencomm.h> +#include <xen/interface/version.h> +#include <xen/interface/sched.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/physdev.h> +#include <xen/interface/grant_table.h> +#include <xen/interface/callback.h> +#include <xen/interface/acm_ops.h> +#include <xen/interface/hvm/params.h> +#include <asm/hypercall.h> +#include <asm/page.h> +#include <asm/uaccess.h> +#include <asm/xen/xencomm.h> + +/* Xencomm notes: + * This file defines hypercalls to be used by xencomm. The hypercalls simply + * create inlines descriptors for pointers and then call the raw arch hypercall + * xencomm_arch_hypercall_XXX + * + * If the arch wants to directly use these hypercalls, simply define macros + * in asm/hypercall.h, eg: + * #define HYPERVISOR_sched_op xencomm_hypercall_sched_op + * + * The arch may also define HYPERVISOR_xxx as a function and do more operations + * before/after doing the hypercall. + * + * Note: because only inline descriptors are created these functions must only + * be called with in kernel memory parameters. + */ + +int +xencomm_hypercall_console_io(int cmd, int count, char *str) +{ + return xencomm_arch_hypercall_console_io + (cmd, count, xencomm_create_inline(str)); +} + +int +xencomm_hypercall_event_channel_op(int cmd, void *op) +{ + return xencomm_arch_hypercall_event_channel_op + (cmd, xencomm_create_inline(op)); +} + +int +xencomm_hypercall_xen_version(int cmd, void *arg) +{ + switch (cmd) { + case XENVER_version: + case XENVER_extraversion: + case XENVER_compile_info: + case XENVER_capabilities: + case XENVER_changeset: + case XENVER_platform_parameters: + case XENVER_pagesize: + case XENVER_get_features: + break; + default: + printk("%s: unknown version cmd %d\n", __func__, cmd); + return -ENOSYS; + } + + return xencomm_arch_hypercall_xen_version + (cmd, xencomm_create_inline(arg)); +} + +int +xencomm_hypercall_physdev_op(int cmd, void *op) +{ + return xencomm_arch_hypercall_physdev_op + (cmd, xencomm_create_inline(op)); +} + +static void * +xencommize_grant_table_op(unsigned int cmd, void *op, unsigned int count) +{ + switch (cmd) { + case GNTTABOP_map_grant_ref: + case GNTTABOP_unmap_grant_ref: + break; + case GNTTABOP_setup_table: + { + struct gnttab_setup_table *setup = op; + struct xencomm_handle *frame_list; + + frame_list = xencomm_create_inline + (xen_guest_handle(setup->frame_list)); + + set_xen_guest_handle(setup->frame_list, (void *)frame_list); + break; + } + case GNTTABOP_dump_table: + case GNTTABOP_transfer: + case GNTTABOP_copy: + break; + default: + printk("%s: unknown grant table op %d\n", __func__, cmd); + BUG(); + } + + return xencomm_create_inline(op); +} + +int +xencomm_hypercall_grant_table_op(unsigned int cmd, void *op, unsigned int count) +{ + void *desc = xencommize_grant_table_op (cmd, op, count); + + return xencomm_arch_hypercall_grant_table_op(cmd, desc, count); +} + +int +xencomm_hypercall_sched_op(int cmd, void *arg) +{ + switch (cmd) { + case SCHEDOP_yield: + case SCHEDOP_block: + case SCHEDOP_shutdown: + case SCHEDOP_remote_shutdown: + break; + case SCHEDOP_poll: + { + sched_poll_t *poll = arg; + struct xencomm_handle *ports; + + ports = xencomm_create_inline(xen_guest_handle(poll->ports)); + + set_xen_guest_handle(poll->ports, (void *)ports); + break; + } + default: + printk("%s: unknown sched op %d\n", __func__, cmd); + return -ENOSYS; + } + + return xencomm_arch_hypercall_sched_op(cmd, xencomm_create_inline(arg)); +} + +int +xencomm_hypercall_multicall(void *call_list, int nr_calls) +{ + int i; + multicall_entry_t *mce; + + for (i = 0; i < nr_calls; i++) { + mce = (multicall_entry_t *)call_list + i; + + switch (mce->op) { + case __HYPERVISOR_update_va_mapping: + case __HYPERVISOR_mmu_update: + /* No-op on ia64. */ + break; + case __HYPERVISOR_grant_table_op: + mce->args[1] = (unsigned long)xencommize_grant_table_op + (mce->args[0], (void *)mce->args[1], + mce->args[2]); + break; + case __HYPERVISOR_memory_op: + default: + printk("%s: unhandled multicall op entry op %lu\n", + __func__, mce->op); + return -ENOSYS; + } + } + + return xencomm_arch_hypercall_multicall + (xencomm_create_inline(call_list), nr_calls); +} + +int +xencomm_hypercall_callback_op(int cmd, void *arg) +{ + switch (cmd) + { + case CALLBACKOP_register: + case CALLBACKOP_unregister: + break; + default: + printk("%s: unknown callback op %d\n", __func__, cmd); + return -ENOSYS; + } + + return xencomm_arch_hypercall_callback_op + (cmd, xencomm_create_inline(arg)); +} + +static void +xencommize_memory_reservation (xen_memory_reservation_t *mop) +{ + struct xencomm_handle *desc; + + desc = xencomm_create_inline(xen_guest_handle(mop->extent_start)); + set_xen_guest_handle(mop->extent_start, (void *)desc); +} + +int +xencomm_hypercall_memory_op(unsigned int cmd, void *arg) +{ + XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2]; + xen_memory_reservation_t *xmr = NULL, *xme_in = NULL, *xme_out = NULL; + int rc; + + switch (cmd) { + case XENMEM_increase_reservation: + case XENMEM_decrease_reservation: + case XENMEM_populate_physmap: + xmr = (xen_memory_reservation_t *)arg; + xen_guest_handle(extent_start_va[0]) = + xen_guest_handle(xmr->extent_start); + xencommize_memory_reservation((xen_memory_reservation_t *)arg); + break; + + case XENMEM_maximum_ram_page: + break; + + case XENMEM_exchange: + xme_in = &((xen_memory_exchange_t *)arg)->in; + xme_out = &((xen_memory_exchange_t *)arg)->out; + xen_guest_handle(extent_start_va[0]) = + xen_guest_handle(xme_in->extent_start); + xen_guest_handle(extent_start_va[1]) = + xen_guest_handle(xme_out->extent_start); + xencommize_memory_reservation + (&((xen_memory_exchange_t *)arg)->in); + xencommize_memory_reservation + (&((xen_memory_exchange_t *)arg)->out); + break; + + default: + printk("%s: unknown memory op %d\n", __func__, cmd); + return -ENOSYS; + } + + rc = xencomm_arch_hypercall_memory_op(cmd, xencomm_create_inline(arg)); + + switch (cmd) { + case XENMEM_increase_reservation: + case XENMEM_decrease_reservation: + case XENMEM_populate_physmap: + xen_guest_handle(xmr->extent_start) = + xen_guest_handle(extent_start_va[0]); + break; + + case XENMEM_exchange: + xen_guest_handle(xme_in->extent_start) = + xen_guest_handle(extent_start_va[0]); + xen_guest_handle(xme_out->extent_start) = + xen_guest_handle(extent_start_va[1]); + break; + } + + return rc; +} + +unsigned long +xencomm_hypercall_hvm_op(int cmd, void *arg) +{ + switch (cmd) { + case HVMOP_set_param: + case HVMOP_get_param: + break; + default: + printk("%s: unknown hvm op %d\n", __func__, cmd); + return -ENOSYS; + } + + return xencomm_arch_hypercall_hvm_op(cmd, xencomm_create_inline(arg)); +} + +int +xencomm_hypercall_suspend(unsigned long srec) +{ + struct sched_shutdown arg; + + arg.reason = SHUTDOWN_suspend; + + return xencomm_arch_hypercall_suspend(xencomm_create_inline(&arg)); +} diff --git a/linux-2.6-xen-sparse/arch/ia64/xen/xcom_mini.c b/linux-2.6-xen-sparse/arch/ia64/xen/xcom_mini.c new file mode 100644 index 0000000000..5adec0c325 --- /dev/null +++ b/linux-2.6-xen-sparse/arch/ia64/xen/xcom_mini.c @@ -0,0 +1,319 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Tristan Gingold <tristan.gingold@bull.net> + */ +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <xen/interface/xen.h> +#include <xen/interface/dom0_ops.h> +#include <xen/interface/memory.h> +#include <xen/interface/xencomm.h> +#include <xen/interface/version.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/physdev.h> +#include <xen/interface/grant_table.h> +#include <xen/interface/hvm/params.h> +#ifdef CONFIG_VMX_GUEST +#include <asm/hypervisor.h> +#else +#include <asm/hypercall.h> +#endif +#include <asm/xen/xencomm.h> + +int +xencomm_mini_hypercall_event_channel_op(int cmd, void *op) +{ + struct xencomm_mini xc_area[2]; + int nbr_area = 2; + struct xencomm_handle *desc; + int rc; + + rc = xencomm_create_mini(xc_area, &nbr_area, + op, sizeof(evtchn_op_t), &desc); + if (rc) + return rc; + + return xencomm_arch_hypercall_event_channel_op(cmd, desc); +} +EXPORT_SYMBOL(xencomm_mini_hypercall_event_channel_op); + +static int +xencommize_mini_grant_table_op(struct xencomm_mini *xc_area, int *nbr_area, + unsigned int cmd, void *op, unsigned int count, + struct xencomm_handle **desc) +{ + struct xencomm_handle *desc1; + unsigned int argsize; + int rc; + + switch (cmd) { + case GNTTABOP_map_grant_ref: + argsize = sizeof(struct gnttab_map_grant_ref); + break; + case GNTTABOP_unmap_grant_ref: + argsize = sizeof(struct gnttab_unmap_grant_ref); + break; + case GNTTABOP_setup_table: + { + struct gnttab_setup_table *setup = op; + + argsize = sizeof(*setup); + + if (count != 1) + return -EINVAL; + rc = xencomm_create_mini + (xc_area, nbr_area, + xen_guest_handle(setup->frame_list), + setup->nr_frames + * sizeof(*xen_guest_handle(setup->frame_list)), + &desc1); + if (rc) + return rc; + set_xen_guest_handle(setup->frame_list, (void *)desc1); + break; + } + case GNTTABOP_dump_table: + argsize = sizeof(struct gnttab_dump_table); + break; + case GNTTABOP_transfer: + argsize = sizeof(struct gnttab_transfer); + break; + default: + printk("%s: unknown mini grant table op %d\n", __func__, cmd); + BUG(); + } + + rc = xencomm_create_mini(xc_area, nbr_area, op, count * argsize, desc); + if (rc) + return rc; + + return 0; +} + +int +xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op, + unsigned int count) +{ + int rc; + struct xencomm_handle *desc; + int nbr_area = 2; + struct xencomm_mini xc_area[2]; + + rc = xencommize_mini_grant_table_op(xc_area, &nbr_area, + cmd, op, count, &desc); + if (rc) + return rc; + + return xencomm_arch_hypercall_grant_table_op(cmd, desc, count); +} +EXPORT_SYMBOL(xencomm_mini_hypercall_grant_table_op); + +int +xencomm_mini_hypercall_multicall(void *call_list, int nr_calls) +{ + int i; + multicall_entry_t *mce; + int nbr_area = 2 + nr_calls * 3; + struct xencomm_mini xc_area[nbr_area]; + struct xencomm_handle *desc; + int rc; + + for (i = 0; i < nr_calls; i++) { + mce = (multicall_entry_t *)call_list + i; + + switch (mce->op) { + case __HYPERVISOR_update_va_mapping: + case __HYPERVISOR_mmu_update: + /* No-op on ia64. */ + break; + case __HYPERVISOR_grant_table_op: + rc = xencommize_mini_grant_table_op + (xc_area, &nbr_area, + mce->args[0], (void *)mce->args[1], + mce->args[2], &desc); + if (rc) + return rc; + mce->args[1] = (unsigned long)desc; + break; + case __HYPERVISOR_memory_op: + default: + printk("%s: unhandled multicall op entry op %lu\n", + __func__, mce->op); + return -ENOSYS; + } + } + + rc = xencomm_create_mini(xc_area, &nbr_area, call_list, + nr_calls * sizeof(multicall_entry_t), &desc); + if (rc) + return rc; + + return xencomm_arch_hypercall_multicall(desc, nr_calls); +} +EXPORT_SYMBOL(xencomm_mini_hypercall_multicall); + +static int +xencommize_mini_memory_reservation(struct xencomm_mini *area, int *nbr_area, + xen_memory_reservation_t *mop) +{ + struct xencomm_handle *desc; + int rc; + + rc = xencomm_create_mini + (area, nbr_area, + xen_guest_handle(mop->extent_start), + mop->nr_extents + * sizeof(*xen_guest_handle(mop->extent_start)), + &desc); + if (rc) + return rc; + + set_xen_guest_handle(mop->extent_start, (void *)desc); + + return 0; +} + +int +xencomm_mini_hypercall_memory_op(unsigned int cmd, void *arg) +{ + int nbr_area = 4; + struct xencomm_mini xc_area[4]; + struct xencomm_handle *desc; + int rc; + unsigned int argsize; + + switch (cmd) { + case XENMEM_increase_reservation: + case XENMEM_decrease_reservation: + case XENMEM_populate_physmap: + argsize = sizeof(xen_memory_reservation_t); + rc = xencommize_mini_memory_reservation + (xc_area, &nbr_area, (xen_memory_reservation_t *)arg); + if (rc) + return rc; + break; + + case XENMEM_maximum_ram_page: + argsize = 0; + break; + + case XENMEM_exchange: + argsize = sizeof(xen_memory_exchange_t); + rc = xencommize_mini_memory_reservation + (xc_area, &nbr_area, + &((xen_memory_exchange_t *)arg)->in); + if (rc) + return rc; + rc = xencommize_mini_memory_reservation + (xc_area, &nbr_area, + &((xen_memory_exchange_t *)arg)->out); + if (rc) + return rc; + break; + + case XENMEM_add_to_physmap: + argsize = sizeof (xen_add_to_physmap_t); + break; + + default: + printk("%s: unknown mini memory op %d\n", __func__, cmd); + return -ENOSYS; + } + + rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc); + if (rc) + return rc; + + return xencomm_arch_hypercall_memory_op(cmd, desc); +} +EXPORT_SYMBOL(xencomm_mini_hypercall_memory_op); + +unsigned long +xencomm_mini_hypercall_hvm_op(int cmd, void *arg) +{ + struct xencomm_handle *desc; + int nbr_area = 2; + struct xencomm_mini xc_area[2]; + unsigned int argsize; + int rc; + + switch (cmd) { + case HVMOP_get_param: + case HVMOP_set_param: + argsize = sizeof(xen_hvm_param_t); + break; + default: + printk("%s: unknown HVMOP %d\n", __func__, cmd); + return -EINVAL; + } + + rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc); + if (rc) + return rc; + + return xencomm_arch_hypercall_hvm_op(cmd, desc); +} +EXPORT_SYMBOL(xencomm_mini_hypercall_hvm_op); + +int +xencomm_mini_hypercall_xen_version(int cmd, void *arg) +{ + struct xencomm_handle *desc; + int nbr_area = 2; + struct xencomm_mini xc_area[2]; + unsigned int argsize; + int rc; + + switch (cmd) { + case XENVER_version: + /* do not actually pass an argument */ + return xencomm_arch_hypercall_xen_version(cmd, 0); + case XENVER_extraversion: + argsize = sizeof(xen_extraversion_t); + break; + case XENVER_compile_info: + argsize = sizeof(xen_compile_info_t); + break; + case XENVER_capabilities: + argsize = sizeof(xen_capabilities_info_t); + break; + case XENVER_changeset: + argsize = sizeof(xen_changeset_info_t); + break; + case XENVER_platform_parameters: + argsize = sizeof(xen_platform_parameters_t); + break; + case XENVER_pagesize: + argsize = (arg == NULL) ? 0 : sizeof(void *); + break; + case XENVER_get_features: + argsize = (arg == NULL) ? 0 : sizeof(xen_feature_info_t); + break; + + default: + printk("%s: unknown version op %d\n", __func__, cmd); + return -ENOSYS; + } + + rc = xencomm_create_mini(xc_area, &nbr_area, arg, argsize, &desc); + if (rc) + return rc; + + return xencomm_arch_hypercall_xen_version(cmd, desc); +} +EXPORT_SYMBOL(xencomm_mini_hypercall_xen_version); diff --git a/linux-2.6-xen-sparse/arch/ia64/xen/xcom_privcmd.c b/linux-2.6-xen-sparse/arch/ia64/xen/xcom_privcmd.c new file mode 100644 index 0000000000..51cbb9f4c0 --- /dev/null +++ b/linux-2.6-xen-sparse/arch/ia64/xen/xcom_privcmd.c @@ -0,0 +1,656 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + * Tristan Gingold <tristan.gingold@bull.net> + */ +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/module.h> +#include <xen/interface/xen.h> +#include <xen/interface/dom0_ops.h> +#define __XEN__ +#include <xen/interface/domctl.h> +#include <xen/interface/sysctl.h> +#include <xen/interface/memory.h> +#include <xen/interface/version.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/acm_ops.h> +#include <xen/interface/hvm/params.h> +#include <xen/public/privcmd.h> +#include <asm/hypercall.h> +#include <asm/page.h> +#include <asm/uaccess.h> +#include <asm/xen/xencomm.h> + +#define ROUND_DIV(v,s) (((v) + (s) - 1) / (s)) + +static int +xencomm_privcmd_dom0_op(privcmd_hypercall_t *hypercall) +{ + dom0_op_t kern_op; + dom0_op_t __user *user_op = (dom0_op_t __user *)hypercall->arg[0]; + struct xencomm_handle *op_desc; + struct xencomm_handle *desc = NULL; + int ret = 0; + + if (copy_from_user(&kern_op, user_op, sizeof(dom0_op_t))) + return -EFAULT; + + if (kern_op.interface_version != DOM0_INTERFACE_VERSION) + return -EACCES; + + op_desc = xencomm_create_inline(&kern_op); + + switch (kern_op.cmd) { + default: + printk("%s: unknown dom0 cmd %d\n", __func__, kern_op.cmd); + return -ENOSYS; + } + + if (ret) { + /* error mapping the nested pointer */ + return ret; + } + + ret = xencomm_arch_hypercall_dom0_op(op_desc); + + /* FIXME: should we restore the handle? */ + if (copy_to_user(user_op, &kern_op, sizeof(dom0_op_t))) + ret = -EFAULT; + + if (desc) + xencomm_free(desc); + return ret; +} + +/* + * Temporarily disable the NUMA PHYSINFO code until the rest of the + * changes are upstream. + */ +#undef IA64_NUMA_PHYSINFO + +static int +xencomm_privcmd_sysctl(privcmd_hypercall_t *hypercall) +{ + xen_sysctl_t kern_op; + xen_sysctl_t __user *user_op; + struct xencomm_handle *op_desc; + struct xencomm_handle *desc = NULL; + struct xencomm_handle *desc1 = NULL; + int ret = 0; + + user_op = (xen_sysctl_t __user *)hypercall->arg[0]; + + if (copy_from_user(&kern_op, user_op, sizeof(xen_sysctl_t))) + return -EFAULT; + + if (kern_op.interface_version != XEN_SYSCTL_INTERFACE_VERSION) + return -EACCES; + + op_desc = xencomm_create_inline(&kern_op); + + switch (kern_op.cmd) { + case XEN_SYSCTL_readconsole: + ret = xencomm_create( + xen_guest_handle(kern_op.u.readconsole.buffer), + kern_op.u.readconsole.count, + &desc, GFP_KERNEL); + set_xen_guest_handle(kern_op.u.readconsole.buffer, + (void *)desc); + break; + case XEN_SYSCTL_tbuf_op: +#ifndef IA64_NUMA_PHYSINFO + case XEN_SYSCTL_physinfo: +#endif + case XEN_SYSCTL_sched_id: + break; + case XEN_SYSCTL_perfc_op: + { + struct xencomm_handle *tmp_desc; + xen_sysctl_t tmp_op = { + .cmd = XEN_SYSCTL_perfc_op, + .interface_version = XEN_SYSCTL_INTERFACE_VERSION, + .u.perfc_op = { + .cmd = XEN_SYSCTL_PERFCOP_query, + // .desc.p = NULL, + // .val.p = NULL, + }, + }; + + if (xen_guest_handle(kern_op.u.perfc_op.desc) == NULL) { + if (xen_guest_handle(kern_op.u.perfc_op.val) != NULL) + return -EINVAL; + break; + } + + /* query the buffer size for xencomm */ + tmp_desc = xencomm_create_inline(&tmp_op); + ret = xencomm_arch_hypercall_sysctl(tmp_desc); + if (ret) + return ret; + + ret = xencomm_create(xen_guest_handle(kern_op.u.perfc_op.desc), + tmp_op.u.perfc_op.nr_counters * + sizeof(xen_sysctl_perfc_desc_t), + &desc, GFP_KERNEL); + if (ret) + return ret; + + set_xen_guest_handle(kern_op.u.perfc_op.desc, (void *)desc); + + ret = xencomm_create(xen_guest_handle(kern_op.u.perfc_op.val), + tmp_op.u.perfc_op.nr_vals * + sizeof(xen_sysctl_perfc_val_t), + &desc1, GFP_KERNEL); + if (ret) + xencomm_free(desc); + + set_xen_guest_handle(kern_op.u.perfc_op.val, (void *)desc1); + break; + } + case XEN_SYSCTL_getdomaininfolist: + ret = xencomm_create( + xen_guest_handle(kern_op.u.getdomaininfolist.buffer), + kern_op.u.getdomaininfolist.max_domains * + sizeof(xen_domctl_getdomaininfo_t), + &desc, GFP_KERNEL); + set_xen_guest_handle(kern_op.u.getdomaininfolist.buffer, + (void *)desc); + break; +#ifdef IA64_NUMA_PHYSINFO + case XEN_SYSCTL_physinfo: + ret = xencomm_create( + xen_guest_handle(kern_op.u.physinfo.memory_chunks), + PUBLIC_MAXCHUNKS * sizeof(node_data_t), + &desc, GFP_KERNEL); + if (ret) + return ret; + set_xen_guest_handle(kern_op.u.physinfo.memory_chunks, + (void *)desc); + + ret = xencomm_create( + xen_guest_handle(kern_op.u.physinfo.cpu_to_node), + PUBLIC_MAX_NUMNODES * sizeof(u64), + &desc1, GFP_KERNEL); + if (ret) + xencomm_free(desc); + set_xen_guest_handle(kern_op.u.physinfo.cpu_to_node, + (void *)desc1); + break; +#endif + default: + printk("%s: unknown sysctl cmd %d\n", __func__, kern_op.cmd); + return -ENOSYS; + } + + if (ret) { + /* error mapping the nested pointer */ + return ret; + } + + ret = xencomm_arch_hypercall_sysctl(op_desc); + + /* FIXME: should we restore the handles? */ + if (copy_to_user(user_op, &kern_op, sizeof(xen_sysctl_t))) + ret = -EFAULT; + + if (desc) + xencomm_free(desc); + if (desc1) + xencomm_free(desc1); + return ret; +} + +static int +xencomm_privcmd_domctl(privcmd_hypercall_t *hypercall) +{ + xen_domctl_t kern_op; + xen_domctl_t __user *user_op; + struct xencomm_handle *op_desc; + struct xencomm_handle *desc = NULL; + int ret = 0; + + user_op = (xen_domctl_t __user *)hypercall->arg[0]; + + if (copy_from_user(&kern_op, user_op, sizeof(xen_domctl_t))) + return -EFAULT; + + if (kern_op.interface_version != XEN_DOMCTL_INTERFACE_VERSION) + return -EACCES; + + op_desc = xencomm_create_inline(&kern_op); + + switch (kern_op.cmd) { + case XEN_DOMCTL_createdomain: + case XEN_DOMCTL_destroydomain: + case XEN_DOMCTL_pausedomain: + case XEN_DOMCTL_unpausedomain: + case XEN_DOMCTL_getdomaininfo: + break; + case XEN_DOMCTL_getmemlist: + { + unsigned long nr_pages = kern_op.u.getmemlist.max_pfns; + + ret = xencomm_create( + xen_guest_handle(kern_op.u.getmemlist.buffer), + nr_pages * sizeof(unsigned long), + &desc, GFP_KERNEL); + set_xen_guest_handle(kern_op.u.getmemlist.buffer, + (void *)desc); + break; + } + case XEN_DOMCTL_getpageframeinfo: + break; + case XEN_DOMCTL_getpageframeinfo2: + ret = xencomm_create( + xen_guest_handle(kern_op.u.getpageframeinfo2.array), + kern_op.u.getpageframeinfo2.num, + &desc, GFP_KERNEL); + set_xen_guest_handle(kern_op.u.getpageframeinfo2.array, + (void *)desc); + break; + case XEN_DOMCTL_shadow_op: + ret = xencomm_create( + xen_guest_handle(kern_op.u.shadow_op.dirty_bitmap), + ROUND_DIV(kern_op.u.shadow_op.pages, 8), + &desc, GFP_KERNEL); + set_xen_guest_handle(kern_op.u.shadow_op.dirty_bitmap, + (void *)desc); + break; + case XEN_DOMCTL_max_mem: + break; + case XEN_DOMCTL_setvcpucontext: + case XEN_DOMCTL_getvcpucontext: + ret = xencomm_create( + xen_guest_handle(kern_op.u.vcpucontext.ctxt), + sizeof(vcpu_guest_context_t), + &desc, GFP_KERNEL); + set_xen_guest_handle(kern_op.u.vcpucontext.ctxt, (void *)desc); + break; + case XEN_DOMCTL_getvcpuinfo: + break; + case XEN_DOMCTL_setvcpuaffinity: + case XEN_DOMCTL_getvcpuaffinity: + ret = xencomm_create( + xen_guest_handle(kern_op.u.vcpuaffinity.cpumap.bitmap), + ROUND_DIV(kern_op.u.vcpuaffinity.cpumap.nr_cpus, 8), + &desc, GFP_KERNEL); + set_xen_guest_handle(kern_op.u.vcpuaffinity.cpumap.bitmap, + (void *)desc); + break; + case XEN_DOMCTL_max_vcpus: + case XEN_DOMCTL_scheduler_op: + case XEN_DOMCTL_setdomainhandle: + case XEN_DOMCTL_setdebugging: + case XEN_DOMCTL_irq_permission: + case XEN_DOMCTL_iomem_permission: + case XEN_DOMCTL_ioport_permission: + case XEN_DOMCTL_hypercall_init: + case XEN_DOMCTL_arch_setup: + case XEN_DOMCTL_settimeoffset: + break; + default: + printk("%s: unknown domctl cmd %d\n", __func__, kern_op.cmd); + return -ENOSYS; + } + + if (ret) { + /* error mapping the nested pointer */ + return ret; + } + + ret = xencomm_arch_hypercall_domctl (op_desc); + + /* FIXME: should we restore the handle? */ + if (copy_to_user(user_op, &kern_op, sizeof(xen_domctl_t))) + ret = -EFAULT; + + if (desc) + xencomm_free(desc); + return ret; +} + +static int +xencomm_privcmd_acm_op(privcmd_hypercall_t *hypercall) +{ + int cmd = hypercall->arg[0]; + void __user *arg = (void __user *)hypercall->arg[1]; + struct xencomm_handle *op_desc; + struct xencomm_handle *desc = NULL; + int ret; + + switch (cmd) { + case ACMOP_getssid: + { + struct acm_getssid kern_arg; + + if (copy_from_user(&kern_arg, arg, sizeof (kern_arg))) + return -EFAULT; + + op_desc = xencomm_create_inline(&kern_arg); + + ret = xencomm_create(xen_guest_handle(kern_arg.ssidbuf), + kern_arg.ssidbuf_size, &desc, GFP_KERNEL); + if (ret) + return ret; + + set_xen_guest_handle(kern_arg.ssidbuf, (void *)desc); + + ret = xencomm_arch_hypercall_acm_op(cmd, op_desc); + + xencomm_free(desc); + + if (copy_to_user(arg, &kern_arg, sizeof (kern_arg))) + return -EFAULT; + + return ret; + } + default: + printk("%s: unknown acm_op cmd %d\n", __func__, cmd); + return -ENOSYS; + } + + return ret; +} + +static int +xencomm_privcmd_memory_op(privcmd_hypercall_t *hypercall) +{ + const unsigned long cmd = hypercall->arg[0]; + int ret = 0; + + switch (cmd) { + case XENMEM_increase_reservation: + case XENMEM_decrease_reservation: + case XENMEM_populate_physmap: + { + xen_memory_reservation_t kern_op; + xen_memory_reservation_t __user *user_op; + struct xencomm_handle *desc = NULL; + struct xencomm_handle *desc_op; + + user_op = (xen_memory_reservation_t __user *)hypercall->arg[1]; + if (copy_from_user(&kern_op, user_op, + sizeof(xen_memory_reservation_t))) + return -EFAULT; + desc_op = xencomm_create_inline(&kern_op); + + if (xen_guest_handle(kern_op.extent_start)) { + void * addr; + + addr = xen_guest_handle(kern_op.extent_start); + ret = xencomm_create + (addr, + kern_op.nr_extents * + sizeof(*xen_guest_handle + (kern_op.extent_start)), + &desc, GFP_KERNEL); + if (ret) + return ret; + set_xen_guest_handle(kern_op.extent_start, + (void *)desc); + } + + ret = xencomm_arch_hypercall_memory_op(cmd, desc_op); + + if (desc) + xencomm_free(desc); + + if (ret != 0) + return ret; + + if (copy_to_user(user_op, &kern_op, + sizeof(xen_memory_reservation_t))) + return -EFAULT; + + return ret; + } + case XENMEM_translate_gpfn_list: + { + xen_translate_gpfn_list_t kern_op; + xen_translate_gpfn_list_t __user *user_op; + struct xencomm_handle *desc_gpfn = NULL; + struct xencomm_handle *desc_mfn = NULL; + struct xencomm_handle *desc_op; + void *addr; + + user_op = (xen_translate_gpfn_list_t __user *) + hypercall->arg[1]; + if (copy_from_user(&kern_op, user_op, + sizeof(xen_translate_gpfn_list_t))) + return -EFAULT; + desc_op = xencomm_create_inline(&kern_op); + + if (kern_op.nr_gpfns) { + /* gpfn_list. */ + addr = xen_guest_handle(kern_op.gpfn_list); + + ret = xencomm_create(addr, kern_op.nr_gpfns * + sizeof(*xen_guest_handle + (kern_op.gpfn_list)), + &desc_gpfn, GFP_KERNEL); + if (ret) + return ret; + set_xen_guest_handle(kern_op.gpfn_list, + (void *)desc_gpfn); + + /* mfn_list. */ + addr = xen_guest_handle(kern_op.mfn_list); + + ret = xencomm_create(addr, kern_op.nr_gpfns * + sizeof(*xen_guest_handle + (kern_op.mfn_list)), + &desc_mfn, GFP_KERNEL); + if (ret) + return ret; + set_xen_guest_handle(kern_op.mfn_list, + (void *)desc_mfn); + } + + ret = xencomm_arch_hypercall_memory_op(cmd, desc_op); + + if (desc_gpfn) + xencomm_free(desc_gpfn); + + if (desc_mfn) + xencomm_free(desc_mfn); + + if (ret != 0) + return ret; + + return ret; + } + default: + printk("%s: unknown memory op %lu\n", __func__, cmd); + ret = -ENOSYS; + } + return ret; +} + +static int +xencomm_privcmd_xen_version(privcmd_hypercall_t *hypercall) +{ + int cmd = hypercall->arg[0]; + void __user *arg = (void __user *)hypercall->arg[1]; + struct xencomm_handle *desc; + size_t argsize; + int rc; + + switch (cmd) { + case XENVER_version: + /* do not actually pass an argument */ + return xencomm_arch_hypercall_xen_version(cmd, 0); + case XENVER_extraversion: + argsize = sizeof(xen_extraversion_t); + break; + case XENVER_compile_info: + argsize = sizeof(xen_compile_info_t); + break; + case XENVER_capabilities: + argsize = sizeof(xen_capabilities_info_t); + break; + case XENVER_changeset: + argsize = sizeof(xen_changeset_info_t); + break; + case XENVER_platform_parameters: + argsize = sizeof(xen_platform_parameters_t); + break; + case XENVER_pagesize: + argsize = (arg == NULL) ? 0 : sizeof(void *); + break; + case XENVER_get_features: + argsize = (arg == NULL) ? 0 : sizeof(xen_feature_info_t); + break; + + default: + printk("%s: unknown version op %d\n", __func__, cmd); + return -ENOSYS; + } + + rc = xencomm_create(arg, argsize, &desc, GFP_KERNEL); + if (rc) + return rc; + + rc = xencomm_arch_hypercall_xen_version(cmd, desc); + + xencomm_free(desc); + + return rc; +} + +static int +xencomm_privcmd_event_channel_op(privcmd_hypercall_t *hypercall) +{ + int cmd = hypercall->arg[0]; + struct xencomm_handle *desc; + unsigned int argsize; + int ret; + + switch (cmd) { + case EVTCHNOP_alloc_unbound: + argsize = sizeof(evtchn_alloc_unbound_t); + break; + + case EVTCHNOP_status: + argsize = sizeof(evtchn_status_t); + break; + + default: + printk("%s: unknown EVTCHNOP %d\n", __func__, cmd); + return -EINVAL; + } + + ret = xencomm_create((void *)hypercall->arg[1], argsize, + &desc, GFP_KERNEL); + if (ret) + return ret; + + ret = xencomm_arch_hypercall_event_channel_op(cmd, desc); + + xencomm_free(desc); + return ret; +} + +static int +xencomm_privcmd_hvm_op(privcmd_hypercall_t *hypercall) +{ + int cmd = hypercall->arg[0]; + struct xencomm_handle *desc; + unsigned int argsize; + int ret; + + switch (cmd) { + case HVMOP_get_param: + case HVMOP_set_param: + argsize = sizeof(xen_hvm_param_t); + break; + case HVMOP_set_irq_level: + argsize = sizeof(xen_hvm_set_irq_level_t); + break; + default: + printk("%s: unknown HVMOP %d\n", __func__, cmd); + return -EINVAL; + } + + ret = xencomm_create((void *)hypercall->arg[1], argsize, + &desc, GFP_KERNEL); + if (ret) + return ret; + + ret = xencomm_arch_hypercall_hvm_op(cmd, desc); + + xencomm_free(desc); + return ret; +} + +static int +xencomm_privcmd_sched_op(privcmd_hypercall_t *hypercall) +{ + int cmd = hypercall->arg[0]; + struct xencomm_handle *desc; + unsigned int argsize; + int ret; + + switch (cmd) { + case SCHEDOP_remote_shutdown: + argsize = sizeof(sched_remote_shutdown_t); + break; + default: + printk("%s: unknown SCHEDOP %d\n", __func__, cmd); + return -EINVAL; + } + + ret = xencomm_create((void *)hypercall->arg[1], argsize, + &desc, GFP_KERNEL); + if (ret) + return ret; + + ret = xencomm_arch_hypercall_sched_op(cmd, desc); + + xencomm_free(desc); + return ret; +} + +int +privcmd_hypercall(privcmd_hypercall_t *hypercall) +{ + switch (hypercall->op) { + case __HYPERVISOR_dom0_op: + return xencomm_privcmd_dom0_op(hypercall); + case __HYPERVISOR_domctl: + return xencomm_privcmd_domctl(hypercall); + case __HYPERVISOR_sysctl: + return xencomm_privcmd_sysctl(hypercall); + case __HYPERVISOR_acm_op: + return xencomm_privcmd_acm_op(hypercall); + case __HYPERVISOR_xen_version: + return xencomm_privcmd_xen_version(hypercall); + case __HYPERVISOR_memory_op: + return xencomm_privcmd_memory_op(hypercall); + case __HYPERVISOR_event_channel_op: + return xencomm_privcmd_event_channel_op(hypercall); + case __HYPERVISOR_hvm_op: + return xencomm_privcmd_hvm_op(hypercall); + case __HYPERVISOR_sched_op: + return xencomm_privcmd_sched_op(hypercall); + default: + printk("%s: unknown hcall (%ld)\n", __func__, hypercall->op); + return -ENOSYS; + } +} + diff --git a/linux-2.6-xen-sparse/arch/ia64/xen/xencomm.c b/linux-2.6-xen-sparse/arch/ia64/xen/xencomm.c new file mode 100644 index 0000000000..367b6b32de --- /dev/null +++ b/linux-2.6-xen-sparse/arch/ia64/xen/xencomm.c @@ -0,0 +1,263 @@ +/* + * Copyright (C) 2006 Hollis Blanchard <hollisb@us.ibm.com>, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/gfp.h> +#include <linux/mm.h> +#include <xen/interface/xen.h> +#include <asm/page.h> + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +#include <asm/xen/xencomm.h> + +static int xencomm_debug = 0; + +static unsigned long kernel_start_pa; + +void +xencomm_init (void) +{ + kernel_start_pa = KERNEL_START - ia64_tpa(KERNEL_START); +} + +/* Translate virtual address to physical address. */ +unsigned long +xencomm_vaddr_to_paddr(unsigned long vaddr) +{ +#ifndef CONFIG_VMX_GUEST + struct page *page; + struct vm_area_struct *vma; +#endif + + if (vaddr == 0) + return 0; + +#ifdef __ia64__ + if (REGION_NUMBER(vaddr) == 5) { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + + /* On ia64, TASK_SIZE refers to current. It is not initialized + during boot. + Furthermore the kernel is relocatable and __pa() doesn't + work on addresses. */ + if (vaddr >= KERNEL_START + && vaddr < (KERNEL_START + KERNEL_TR_PAGE_SIZE)) { + return vaddr - kernel_start_pa; + } + + /* In kernel area -- virtually mapped. */ + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + return ~0UL; + + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud) || pud_bad(*pud)) + return ~0UL; + + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + return ~0UL; + + ptep = pte_offset_kernel(pmd, vaddr); + if (!ptep) + return ~0UL; + + return (pte_val(*ptep) & _PFN_MASK) | (vaddr & ~PAGE_MASK); + } +#endif + + if (vaddr > TASK_SIZE) { + /* kernel address */ + return __pa(vaddr); + } + + +#ifdef CONFIG_VMX_GUEST + /* No privcmd within vmx guest. */ + return ~0UL; +#else + /* XXX double-check (lack of) locking */ + vma = find_extend_vma(current->mm, vaddr); + if (!vma) + return ~0UL; + + /* We assume the page is modified. */ + page = follow_page(vma, vaddr, FOLL_WRITE | FOLL_TOUCH); + if (!page) + return ~0UL; + + return (page_to_pfn(page) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); +#endif +} + +static int +xencomm_init_desc(struct xencomm_desc *desc, void *buffer, unsigned long bytes) +{ + unsigned long recorded = 0; + int i = 0; + + BUG_ON((buffer == NULL) && (bytes > 0)); + + /* record the physical pages used */ + if (buffer == NULL) + desc->nr_addrs = 0; + + while ((recorded < bytes) && (i < desc->nr_addrs)) { + unsigned long vaddr = (unsigned long)buffer + recorded; + unsigned long paddr; + int offset; + int chunksz; + + offset = vaddr % PAGE_SIZE; /* handle partial pages */ + chunksz = min(PAGE_SIZE - offset, bytes - recorded); + + paddr = xencomm_vaddr_to_paddr(vaddr); + if (paddr == ~0UL) { + printk("%s: couldn't translate vaddr %lx\n", + __func__, vaddr); + return -EINVAL; + } + + desc->address[i++] = paddr; + recorded += chunksz; + } + + if (recorded < bytes) { + printk("%s: could only translate %ld of %ld bytes\n", + __func__, recorded, bytes); + return -ENOSPC; + } + + /* mark remaining addresses invalid (just for safety) */ + while (i < desc->nr_addrs) + desc->address[i++] = XENCOMM_INVALID; + + desc->magic = XENCOMM_MAGIC; + + return 0; +} + +static struct xencomm_desc * +xencomm_alloc(gfp_t gfp_mask) +{ + struct xencomm_desc *desc; + + desc = (struct xencomm_desc *)__get_free_page(gfp_mask); + if (desc == NULL) + panic("%s: page allocation failed\n", __func__); + + desc->nr_addrs = (PAGE_SIZE - sizeof(struct xencomm_desc)) / + sizeof(*desc->address); + + return desc; +} + +void +xencomm_free(struct xencomm_handle *desc) +{ + if (desc) + free_page((unsigned long)__va(desc)); +} + +int +xencomm_create(void *buffer, unsigned long bytes, + struct xencomm_handle **ret, gfp_t gfp_mask) +{ + struct xencomm_desc *desc; + struct xencomm_handle *handle; + int rc; + + if (xencomm_debug) + printk("%s: %p[%ld]\n", __func__, buffer, bytes); + + if (buffer == NULL || bytes == 0) { + *ret = (struct xencomm_handle *)NULL; + return 0; + } + + desc = xencomm_alloc(gfp_mask); + if (!desc) { + printk("%s failure\n", "xencomm_alloc"); + return -ENOMEM; + } + handle = (struct xencomm_handle *)__pa(desc); + + rc = xencomm_init_desc(desc, buffer, bytes); + if (rc) { + printk("%s failure: %d\n", "xencomm_init_desc", rc); + xencomm_free(handle); + return rc; + } + + *ret = handle; + return 0; +} + +/* "mini" routines, for stack-based communications: */ + +static void * +xencomm_alloc_mini(struct xencomm_mini *area, int *nbr_area) +{ + unsigned long base; + unsigned int pageoffset; + + while (*nbr_area >= 0) { + /* Allocate an area. */ + (*nbr_area)--; + + base = (unsigned long)(area + *nbr_area); + pageoffset = base % PAGE_SIZE; + + /* If the area does not cross a page, use it. */ + if ((PAGE_SIZE - pageoffset) >= sizeof(struct xencomm_mini)) + return &area[*nbr_area]; + } + /* No more area. */ + return NULL; +} + +int +xencomm_create_mini(struct xencomm_mini *area, int *nbr_area, + void *buffer, unsigned long bytes, + struct xencomm_handle **ret) +{ + struct xencomm_desc *desc; + int rc; + unsigned long res; + + desc = xencomm_alloc_mini(area, nbr_area); + if (!desc) + return -ENOMEM; + desc->nr_addrs = XENCOMM_MINI_ADDRS; + + rc = xencomm_init_desc(desc, buffer, bytes); + if (rc) + return rc; + + res = xencomm_vaddr_to_paddr((unsigned long)desc); + if (res == ~0UL) + return -EINVAL; + + *ret = (struct xencomm_handle*)res; + return 0; +} diff --git a/linux-2.6-xen-sparse/arch/ia64/xen/xensetup.S b/linux-2.6-xen-sparse/arch/ia64/xen/xensetup.S index 918622918e..e761278670 100644 --- a/linux-2.6-xen-sparse/arch/ia64/xen/xensetup.S +++ b/linux-2.6-xen-sparse/arch/ia64/xen/xensetup.S @@ -23,12 +23,11 @@ GLOBAL_ENTRY(early_xen_setup) mov cr.iva=r10 -#if XSI_BASE != 0xf100000000000000UL - /* Backward compatibility. */ -(isBP) mov r2=0x600 + /* Set xsi base. */ +#define FW_HYPERCALL_SET_SHARED_INFO_VA 0x600 +(isBP) mov r2=FW_HYPERCALL_SET_SHARED_INFO_VA (isBP) movl r28=XSI_BASE;; (isBP) break 0x1000;; -#endif br.ret.sptk.many rp ;; @@ -38,18 +37,18 @@ END(early_xen_setup) /* Stub for suspend. Just force the stacked registers to be written in memory. */ -GLOBAL_ENTRY(HYPERVISOR_suspend) +GLOBAL_ENTRY(xencomm_arch_hypercall_suspend) + mov r15=r32 + ;; alloc r20=ar.pfs,0,0,0,0 - mov r14=2 - mov r15=r12 - ;; + mov r2=__HYPERVISOR_sched_op + ;; /* We don't want to deal with RSE. */ flushrs - mov r2=__HYPERVISOR_sched_op - st4 [r12]=r14 + mov r14=2 // SCHEDOP_shutdown ;; break 0x1000 ;; mov ar.pfs=r20 br.ret.sptk.many b0 -END(HYPERVISOR_suspend) +END(xencomm_arch_hypercall_suspend) diff --git a/linux-2.6-xen-sparse/arch/x86_64/Kconfig b/linux-2.6-xen-sparse/arch/x86_64/Kconfig index 45d8302cb4..79bd0044fa 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig +++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig @@ -368,6 +368,8 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. +config ARCH_ENABLE_MEMORY_HOTPLUG + def_bool y config HPET_TIMER bool diff --git a/linux-2.6-xen-sparse/arch/x86_64/kernel/acpi/Makefile b/linux-2.6-xen-sparse/arch/x86_64/kernel/acpi/Makefile index aa84f6eb98..57b7fe1c11 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/acpi/Makefile +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/acpi/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += processor.o +processor-y := ../../../i386/kernel/acpi/processor.o ../../../i386/kernel/acpi/cstate.o endif boot-$(CONFIG_XEN) := ../../../i386/kernel/acpi/boot-xen.o diff --git a/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S b/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S index 7edd8d5cb6..687c486878 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S @@ -316,12 +316,7 @@ tracesys: ja 1f movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) -1: SAVE_REST - movq %rsp,%rdi - call syscall_trace_leave - RESTORE_TOP_OF_STACK %rbx - RESTORE_REST +1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ jmp int_ret_from_sys_call CFI_ENDPROC diff --git a/linux-2.6-xen-sparse/arch/x86_64/kernel/process-xen.c b/linux-2.6-xen-sparse/arch/x86_64/kernel/process-xen.c index ca96708d1c..65ba83c625 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/process-xen.c +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/process-xen.c @@ -350,7 +350,6 @@ static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) struct user_desc ud = { .base_addr = addr, .limit = 0xfffff, - .contents = (3 << 3), /* user */ .seg_32bit = 1, .limit_in_pages = 1, .useable = 1, diff --git a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c index 8fe13ee2b0..711ce5d198 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c @@ -74,6 +74,7 @@ #include <asm/hypervisor.h> #include <xen/interface/nmi.h> #include <xen/features.h> +#include <xen/xencons.h> #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) #define PFN_PHYS(x) ((x) << PAGE_SHIFT) #include <asm/mach-xen/setup_arch_post.h> @@ -143,6 +144,9 @@ struct sys_desc_table_struct { struct edid_info edid_info; struct e820map e820; +#ifdef CONFIG_XEN +struct e820map machine_e820; +#endif extern int root_mountflags; @@ -625,7 +629,6 @@ static void __init reserve_ebda_region(void) void __init setup_arch(char **cmdline_p) { unsigned long kernel_end; - struct e820entry *machine_e820; struct xen_memory_map memmap; #ifdef CONFIG_XEN @@ -645,33 +648,15 @@ void __init setup_arch(char **cmdline_p) screen_info.orig_video_cols = 80; screen_info.orig_video_ega_bx = 3; screen_info.orig_video_points = 16; + screen_info.orig_y = screen_info.orig_video_lines - 1; if (xen_start_info->console.dom0.info_size >= sizeof(struct dom0_vga_console_info)) { const struct dom0_vga_console_info *info = (struct dom0_vga_console_info *)( (char *)xen_start_info + xen_start_info->console.dom0.info_off); - screen_info.orig_video_mode = info->txt_mode; - screen_info.orig_video_isVGA = info->video_type; - screen_info.orig_video_lines = info->video_height; - screen_info.orig_video_cols = info->video_width; - screen_info.orig_video_points = info->txt_points; - screen_info.lfb_width = info->video_width; - screen_info.lfb_height = info->video_height; - screen_info.lfb_depth = info->lfb_depth; - screen_info.lfb_base = info->lfb_base; - screen_info.lfb_size = info->lfb_size; - screen_info.lfb_linelength = info->lfb_linelen; - screen_info.red_size = info->red_size; - screen_info.red_pos = info->red_pos; - screen_info.green_size = info->green_size; - screen_info.green_pos = info->green_pos; - screen_info.blue_size = info->blue_size; - screen_info.blue_pos = info->blue_pos; - screen_info.rsvd_size = info->rsvd_size; - screen_info.rsvd_pos = info->rsvd_pos; + dom0_init_screen_info(info); } - screen_info.orig_y = screen_info.orig_video_lines - 1; xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; } else @@ -936,14 +921,14 @@ void __init setup_arch(char **cmdline_p) probe_roms(); #ifdef CONFIG_XEN if (is_initial_xendomain()) { - machine_e820 = alloc_bootmem_low_pages(PAGE_SIZE); - memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, machine_e820); + set_xen_guest_handle(memmap.buffer, machine_e820.map); - BUG_ON(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)); + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) + BUG(); + machine_e820.nr_map = memmap.nr_entries; - e820_reserve_resources(machine_e820, memmap.nr_entries); + e820_reserve_resources(machine_e820.map, machine_e820.nr_map); } #else e820_reserve_resources(e820.map, e820.nr_map); @@ -959,10 +944,8 @@ void __init setup_arch(char **cmdline_p) } #ifdef CONFIG_XEN - if (is_initial_xendomain()) { - e820_setup_gap(machine_e820, memmap.nr_entries); - free_bootmem(__pa(machine_e820), PAGE_SIZE); - } + if (is_initial_xendomain()) + e820_setup_gap(machine_e820.map, machine_e820.nr_map); #else e820_setup_gap(e820.map, e820.nr_map); #endif diff --git a/linux-2.6-xen-sparse/arch/x86_64/kernel/traps-xen.c b/linux-2.6-xen-sparse/arch/x86_64/kernel/traps-xen.c index 9fd0b94cb5..653af67c08 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/traps-xen.c +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/traps-xen.c @@ -30,6 +30,7 @@ #include <linux/moduleparam.h> #include <linux/nmi.h> #include <linux/kprobes.h> +#include <linux/kexec.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -438,6 +439,8 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) printk(KERN_ALERT "RIP "); printk_address(regs->rip); printk(" RSP <%016lx>\n", regs->rsp); + if (kexec_should_crash(current)) + crash_kexec(regs); } void die(const char * str, struct pt_regs * regs, long err) @@ -461,6 +464,8 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs) */ printk(str, safe_smp_processor_id()); show_registers(regs); + if (kexec_should_crash(current)) + crash_kexec(regs); if (panic_on_timeout || panic_on_oops) panic("nmi watchdog"); printk("console shuts up ...\n"); diff --git a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c index d3ce58355f..032c31e532 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c +++ b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c @@ -56,6 +56,11 @@ struct dma_mapping_ops* dma_ops; EXPORT_SYMBOL(dma_ops); +#ifdef CONFIG_XEN_COMPAT_030002 +unsigned int __kernel_page_user; +EXPORT_SYMBOL(__kernel_page_user); +#endif + extern unsigned long *contiguous_bitmap; static unsigned long dma_reserve __initdata; @@ -260,7 +265,10 @@ static void set_pte_phys(unsigned long vaddr, return; } } - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); + if (pgprot_val(prot)) + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); + else + new_pte = __pte(0); pte = pte_offset_kernel(pmd, vaddr); if (!pte_none(*pte) && @@ -524,6 +532,33 @@ void __init xen_init_pt(void) addr = page[pud_index(__START_KERNEL_map)]; addr_to_page(addr, page); +#ifdef CONFIG_XEN_COMPAT_030002 + /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER + in kernel PTEs. We check that here. */ + if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) { + unsigned long *pg; + pte_t pte; + + /* Mess with the initial mapping of page 0. It's not needed. */ + BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map); + addr = page[pmd_index(__START_KERNEL_map)]; + addr_to_page(addr, pg); + pte.pte = pg[pte_index(__START_KERNEL_map)]; + BUG_ON(!(pte.pte & _PAGE_PRESENT)); + + /* If _PAGE_USER isn't set, we obviously do not need it. */ + if (pte.pte & _PAGE_USER) { + /* _PAGE_USER is needed, but is it set implicitly? */ + pte.pte &= ~_PAGE_USER; + if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map, + pte, 0) != 0) || + !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER)) + /* We need to explicitly specify _PAGE_USER. */ + __kernel_page_user = _PAGE_USER; + } + } +#endif + /* Construct mapping of initial pte page in our own directories. */ init_level4_pgt[pgd_index(__START_KERNEL_map)] = mk_kernel_pgd(__pa_symbol(level3_kernel_pgt)); @@ -913,8 +948,8 @@ void __init mem_init(void) #endif /* XEN: init and count pages outside initial allocation. */ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { - ClearPageReserved(&mem_map[pfn]); - set_page_count(&mem_map[pfn], 1); + ClearPageReserved(pfn_to_page(pfn)); + set_page_count(pfn_to_page(pfn), 1); totalram_pages++; } reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); diff --git a/linux-2.6-xen-sparse/arch/x86_64/oprofile/Makefile b/linux-2.6-xen-sparse/arch/x86_64/oprofile/Makefile index 589a7966e8..cc3b9939b0 100644 --- a/linux-2.6-xen-sparse/arch/x86_64/oprofile/Makefile +++ b/linux-2.6-xen-sparse/arch/x86_64/oprofile/Makefile @@ -12,6 +12,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) ifdef CONFIG_XEN +XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \ + xenoprofile.o) OPROFILE-y := xenoprof.o else OPROFILE-y := init.o backtrace.o @@ -19,4 +21,5 @@ OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \ op_model_ppro.o OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o endif -oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y)) +oprofile-y = $(DRIVER_OBJS) $(XENOPROF_COMMON_OBJS) \ + $(addprefix ../../i386/oprofile/, $(OPROFILE-y)) diff --git a/linux-2.6-xen-sparse/drivers/char/tpm/tpm_xen.c b/linux-2.6-xen-sparse/drivers/char/tpm/tpm_xen.c index 71c7dd3a00..adf016ba90 100644 --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm_xen.c +++ b/linux-2.6-xen-sparse/drivers/char/tpm/tpm_xen.c @@ -41,6 +41,7 @@ #include <xen/evtchn.h> #include <xen/interface/grant_table.h> #include <xen/interface/io/tpmif.h> +#include <xen/gnttab.h> #include <xen/xenbus.h> #include "tpm.h" #include "tpm_vtpm.h" @@ -343,6 +344,7 @@ static void backend_changed(struct xenbus_device *dev, case XenbusStateInitialising: case XenbusStateInitWait: case XenbusStateInitialised: + case XenbusStateUnknown: break; case XenbusStateConnected: @@ -351,13 +353,14 @@ static void backend_changed(struct xenbus_device *dev, case XenbusStateClosing: tpmif_set_connected_state(tp, 0); + xenbus_frontend_closed(dev); break; - case XenbusStateUnknown: case XenbusStateClosed: + tpmif_set_connected_state(tp, 0); if (tp->is_suspended == 0) device_unregister(&dev->dev); - xenbus_switch_state(dev, XenbusStateClosed); + xenbus_frontend_closed(dev); break; } } @@ -419,9 +422,10 @@ static int tpmfront_suspend(struct xenbus_device *dev) mutex_lock(&suspend_lock); tp->is_suspended = 1; - for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 25; ctr++) { + for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 300; ctr++) { if ((ctr % 10) == 0) - printk("TPM-FE [INFO]: Waiting for outstanding request.\n"); + printk("TPM-FE [INFO]: Waiting for outstanding " + "request.\n"); /* * Wait for a request to be responded to. */ diff --git a/linux-2.6-xen-sparse/drivers/char/tty_io.c b/linux-2.6-xen-sparse/drivers/char/tty_io.c index f6f0689771..0372d93bca 100644 --- a/linux-2.6-xen-sparse/drivers/char/tty_io.c +++ b/linux-2.6-xen-sparse/drivers/char/tty_io.c @@ -2761,7 +2761,7 @@ static void flush_to_ldisc(void *private_) struct tty_struct *tty = (struct tty_struct *) private_; unsigned long flags; struct tty_ldisc *disc; - struct tty_buffer *tbuf; + struct tty_buffer *tbuf, *head; int count; char *char_buf; unsigned char *flag_buf; @@ -2778,7 +2778,9 @@ static void flush_to_ldisc(void *private_) goto out; } spin_lock_irqsave(&tty->buf.lock, flags); - while((tbuf = tty->buf.head) != NULL) { + head = tty->buf.head; + tty->buf.head = NULL; + while((tbuf = head) != NULL) { while ((count = tbuf->commit - tbuf->read) != 0) { char_buf = tbuf->char_buf_ptr + tbuf->read; flag_buf = tbuf->flag_buf_ptr + tbuf->read; @@ -2787,10 +2789,12 @@ static void flush_to_ldisc(void *private_) disc->receive_buf(tty, char_buf, flag_buf, count); spin_lock_irqsave(&tty->buf.lock, flags); } - if (tbuf->active) + if (tbuf->active) { + tty->buf.head = head; break; - tty->buf.head = tbuf->next; - if (tty->buf.head == NULL) + } + head = tbuf->next; + if (head == NULL) tty->buf.tail = NULL; tty_buffer_free(tty, tbuf); } diff --git a/linux-2.6-xen-sparse/drivers/serial/Kconfig b/linux-2.6-xen-sparse/drivers/serial/Kconfig index fa1fdb0b37..c6be86d83e 100644 --- a/linux-2.6-xen-sparse/drivers/serial/Kconfig +++ b/linux-2.6-xen-sparse/drivers/serial/Kconfig @@ -821,6 +821,7 @@ config SERIAL_ICOM tristate "IBM Multiport Serial Adapter" depends on PCI && (PPC_ISERIES || PPC_PSERIES) select SERIAL_CORE + select FW_LOADER help This driver is for a family of multiport serial adapters including 2 port RVX, 2 port internal modem, 4 port internal diff --git a/linux-2.6-xen-sparse/drivers/xen/balloon/Makefile b/linux-2.6-xen-sparse/drivers/xen/balloon/Makefile index 0e3a3485c4..3fc3d0bae5 100644 --- a/linux-2.6-xen-sparse/drivers/xen/balloon/Makefile +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/Makefile @@ -1,2 +1,2 @@ -obj-y += balloon.o +obj-y := balloon.o sysfs.o diff --git a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c index a6a8396c05..b621d76383 100644 --- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c @@ -53,10 +53,8 @@ #include <asm/uaccess.h> #include <asm/tlb.h> #include <linux/list.h> - #include <xen/xenbus.h> - -#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) +#include "common.h" #ifdef CONFIG_PROC_FS static struct proc_dir_entry *balloon_pde; @@ -71,9 +69,7 @@ static DECLARE_MUTEX(balloon_mutex); */ DEFINE_SPINLOCK(balloon_lock); -/* We aim for 'current allocation' == 'target allocation'. */ -static unsigned long current_pages; -static unsigned long target_pages; +struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; @@ -81,18 +77,8 @@ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; /* VM /proc information for memory */ extern unsigned long totalram_pages; -/* We may hit the hard limit in Xen. If we do then we remember it. */ -static unsigned long hard_limit; - -/* - * Drivers may alter the memory reservation independently, but they must - * inform the balloon driver so that we can avoid hitting the hard limit. - */ -static unsigned long driver_pages; - /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); -static unsigned long balloon_low, balloon_high; /* Main work function, always executed in process context. */ static void balloon_process(void *unused); @@ -124,10 +110,10 @@ static void balloon_append(struct page *page) /* Lowmem is re-populated first, so highmem pages go at list tail. */ if (PageHighMem(page)) { list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); - balloon_high++; + bs.balloon_high++; } else { list_add(PAGE_TO_LIST(page), &ballooned_pages); - balloon_low++; + bs.balloon_low++; } } @@ -143,9 +129,9 @@ static struct page *balloon_retrieve(void) UNLIST_PAGE(page); if (PageHighMem(page)) - balloon_high--; + bs.balloon_high--; else - balloon_low--; + bs.balloon_low--; return page; } @@ -172,9 +158,9 @@ static void balloon_alarm(unsigned long unused) static unsigned long current_target(void) { - unsigned long target = min(target_pages, hard_limit); - if (target > (current_pages + balloon_low + balloon_high)) - target = current_pages + balloon_low + balloon_high; + unsigned long target = min(bs.target_pages, bs.hard_limit); + if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high)) + target = bs.current_pages + bs.balloon_low + bs.balloon_high; return target; } @@ -216,7 +202,8 @@ static int increase_reservation(unsigned long nr_pages) BUG_ON(ret != rc); } if (rc >= 0) - hard_limit = current_pages + rc - driver_pages; + bs.hard_limit = (bs.current_pages + rc - + bs.driver_pages); goto out; } @@ -228,9 +215,7 @@ static int increase_reservation(unsigned long nr_pages) BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && phys_to_machine_mapping_valid(pfn)); - /* Update P->M and M->P tables. */ set_phys_to_machine(pfn, frame_list[i]); - xen_machphys_update(frame_list[i], pfn); /* Link back into the page tables if not highmem. */ if (pfn < max_low_pfn) { @@ -248,8 +233,8 @@ static int increase_reservation(unsigned long nr_pages) __free_page(page); } - current_pages += nr_pages; - totalram_pages = current_pages; + bs.current_pages += nr_pages; + totalram_pages = bs.current_pages; out: balloon_unlock(flags); @@ -317,8 +302,8 @@ static int decrease_reservation(unsigned long nr_pages) ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); BUG_ON(ret != nr_pages); - current_pages -= nr_pages; - totalram_pages = current_pages; + bs.current_pages -= nr_pages; + totalram_pages = bs.current_pages; balloon_unlock(flags); @@ -339,7 +324,7 @@ static void balloon_process(void *unused) down(&balloon_mutex); do { - credit = current_target() - current_pages; + credit = current_target() - bs.current_pages; if (credit > 0) need_sleep = (increase_reservation(credit) != 0); if (credit < 0) @@ -352,18 +337,18 @@ static void balloon_process(void *unused) } while ((credit != 0) && !need_sleep); /* Schedule more work if there is some still to be done. */ - if (current_target() != current_pages) + if (current_target() != bs.current_pages) mod_timer(&balloon_timer, jiffies + HZ); up(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ -static void set_new_target(unsigned long target) +void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ - hard_limit = ~0UL; - target_pages = target; + bs.hard_limit = ~0UL; + bs.target_pages = target; schedule_work(&balloon_worker); } @@ -388,7 +373,7 @@ static void watch_target(struct xenbus_watch *watch, /* The given memory/target value is in KiB, so it needs converting to * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. */ - set_new_target(new_target >> (PAGE_SHIFT - 10)); + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); } static int balloon_init_watcher(struct notifier_block *notifier, @@ -424,7 +409,7 @@ static int balloon_write(struct file *file, const char __user *buffer, memstring[sizeof(memstring)-1] = '\0'; target_bytes = memparse(memstring, &endchar); - set_new_target(target_bytes >> PAGE_SHIFT); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); return count; } @@ -442,12 +427,13 @@ static int balloon_read(char *page, char **start, off_t off, "High-mem balloon: %8lu kB\n" "Driver pages: %8lu kB\n" "Xen hard limit: ", - PAGES2KB(current_pages), PAGES2KB(target_pages), - PAGES2KB(balloon_low), PAGES2KB(balloon_high), - PAGES2KB(driver_pages)); + PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages), + PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high), + PAGES2KB(bs.driver_pages)); - if (hard_limit != ~0UL) - len += sprintf(page + len, "%8lu kB\n", PAGES2KB(hard_limit)); + if (bs.hard_limit != ~0UL) + len += sprintf(page + len, "%8lu kB\n", + PAGES2KB(bs.hard_limit)); else len += sprintf(page + len, " ??? kB\n"); @@ -468,13 +454,13 @@ static int __init balloon_init(void) IPRINTK("Initialising balloon driver.\n"); - current_pages = min(xen_start_info->nr_pages, max_pfn); - totalram_pages = current_pages; - target_pages = current_pages; - balloon_low = 0; - balloon_high = 0; - driver_pages = 0UL; - hard_limit = ~0UL; + bs.current_pages = min(xen_start_info->nr_pages, max_pfn); + totalram_pages = bs.current_pages; + bs.target_pages = bs.current_pages; + bs.balloon_low = 0; + bs.balloon_high = 0; + bs.driver_pages = 0UL; + bs.hard_limit = ~0UL; init_timer(&balloon_timer); balloon_timer.data = 0; @@ -489,6 +475,7 @@ static int __init balloon_init(void) balloon_pde->read_proc = balloon_read; balloon_pde->write_proc = balloon_write; #endif + balloon_sysfs_init(); /* Initialise the balloon with excess memory space. */ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { @@ -512,7 +499,7 @@ void balloon_update_driver_allowance(long delta) unsigned long flags; balloon_lock(flags); - driver_pages += delta; + bs.driver_pages += delta; balloon_unlock(flags); } @@ -534,75 +521,87 @@ static int dealloc_pte_fn( return 0; } -struct page *balloon_alloc_empty_page_range(unsigned long nr_pages) +struct page **alloc_empty_pages_and_pagevec(int nr_pages) { - unsigned long vstart, flags; - unsigned int order = get_order(nr_pages * PAGE_SIZE); - int ret; - unsigned long i; - struct page *page; + unsigned long vaddr, flags; + struct page *page, **pagevec; + int i, ret; - vstart = __get_free_pages(GFP_KERNEL, order); - if (vstart == 0) + pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL); + if (pagevec == NULL) return NULL; - scrub_pages(vstart, 1 << order); - - balloon_lock(flags); - if (xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long gmfn = __pa(vstart) >> PAGE_SHIFT; - struct xen_memory_reservation reservation = { - .nr_extents = 1, - .extent_order = order, - .domid = DOMID_SELF - }; - set_xen_guest_handle(reservation.extent_start, &gmfn); - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - if (ret == -ENOSYS) - goto err; - BUG_ON(ret != 1); - } else { - ret = apply_to_page_range(&init_mm, vstart, PAGE_SIZE << order, - dealloc_pte_fn, NULL); - if (ret == -ENOSYS) + for (i = 0; i < nr_pages; i++) { + page = pagevec[i] = alloc_page(GFP_KERNEL); + if (page == NULL) goto err; - BUG_ON(ret); - } - current_pages -= 1UL << order; - totalram_pages = current_pages; - balloon_unlock(flags); - schedule_work(&balloon_worker); + vaddr = (unsigned long)page_address(page); - flush_tlb_all(); + scrub_pages(vaddr, 1); - page = virt_to_page(vstart); + balloon_lock(flags); - for (i = 0; i < (1UL << order); i++) - set_page_count(page + i, 1); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long gmfn = page_to_pfn(page); + struct xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &gmfn); + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + if (ret == 1) + ret = 0; /* success */ + } else { + ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE, + dealloc_pte_fn, NULL); + } - return page; + if (ret != 0) { + balloon_unlock(flags); + __free_page(page); + goto err; + } + + totalram_pages = --bs.current_pages; + + balloon_unlock(flags); + } + + out: + schedule_work(&balloon_worker); + flush_tlb_all(); + return pagevec; err: - free_pages(vstart, order); + balloon_lock(flags); + while (--i >= 0) + balloon_append(pagevec[i]); balloon_unlock(flags); - return NULL; + kfree(pagevec); + pagevec = NULL; + goto out; } -void balloon_dealloc_empty_page_range( - struct page *page, unsigned long nr_pages) +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages) { - unsigned long i, flags; - unsigned int order = get_order(nr_pages * PAGE_SIZE); + unsigned long flags; + int i; + + if (pagevec == NULL) + return; balloon_lock(flags); - for (i = 0; i < (1UL << order); i++) { - BUG_ON(page_count(page + i) != 1); - balloon_append(page + i); + for (i = 0; i < nr_pages; i++) { + BUG_ON(page_count(pagevec[i]) != 1); + balloon_append(pagevec[i]); } balloon_unlock(flags); + kfree(pagevec); + schedule_work(&balloon_worker); } @@ -612,15 +611,15 @@ void balloon_release_driver_page(struct page *page) balloon_lock(flags); balloon_append(page); - driver_pages--; + bs.driver_pages--; balloon_unlock(flags); schedule_work(&balloon_worker); } EXPORT_SYMBOL_GPL(balloon_update_driver_allowance); -EXPORT_SYMBOL_GPL(balloon_alloc_empty_page_range); -EXPORT_SYMBOL_GPL(balloon_dealloc_empty_page_range); +EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec); +EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec); EXPORT_SYMBOL_GPL(balloon_release_driver_page); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/linux-2.6-xen-sparse/drivers/xen/balloon/common.h b/linux-2.6-xen-sparse/drivers/xen/balloon/common.h new file mode 100644 index 0000000000..4496d215e2 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/common.h @@ -0,0 +1,58 @@ +/****************************************************************************** + * balloon/common.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_BALLOON_COMMON_H__ +#define __XEN_BALLOON_COMMON_H__ + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +struct balloon_stats { + /* We aim for 'current allocation' == 'target allocation'. */ + unsigned long current_pages; + unsigned long target_pages; + /* We may hit the hard limit in Xen. If we do then we remember it. */ + unsigned long hard_limit; + /* + * Drivers may alter the memory reservation independently, but they + * must inform the balloon driver so we avoid hitting the hard limit. + */ + unsigned long driver_pages; + /* Number of pages in high- and low-memory balloons. */ + unsigned long balloon_low; + unsigned long balloon_high; +}; + +extern struct balloon_stats balloon_stats; +#define bs balloon_stats + +int balloon_sysfs_init(void); +void balloon_sysfs_exit(void); + +void balloon_set_new_target(unsigned long target); + +#endif /* __XEN_BALLOON_COMMON_H__ */ diff --git a/linux-2.6-xen-sparse/drivers/xen/balloon/sysfs.c b/linux-2.6-xen-sparse/drivers/xen/balloon/sysfs.c new file mode 100644 index 0000000000..a4ed8a6f1e --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/sysfs.c @@ -0,0 +1,165 @@ +/****************************************************************************** + * balloon/sysfs.c + * + * Xen balloon driver - sysfs interfaces. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/config.h> +#include <linux/capability.h> +#include <linux/stat.h> +#include <linux/sysdev.h> +#include "common.h" + +#define BALLOON_CLASS_NAME "memory" + +#define BALLOON_SHOW(name, format, args...) \ + static ssize_t show_##name(struct sys_device *dev, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high)); +BALLOON_SHOW(hard_limit_kb, + (bs.hard_limit!=~0UL) ? "%lu\n" : "???\n", + (bs.hard_limit!=~0UL) ? PAGES2KB(bs.hard_limit) : 0); +BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages)); + +static ssize_t show_target_kb(struct sys_device *dev, char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages)); +} + +static ssize_t store_target_kb(struct sys_device *dev, + const char *buf, + size_t count) +{ + char memstring[64], *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + if (count > sizeof(memstring)) + return -EFBIG; /* too long */ + strcpy(memstring, buf); + + target_bytes = memparse(memstring, &endchar); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, + show_target_kb, store_target_kb); + +static struct sysdev_attribute *balloon_attrs[] = { + &attr_target_kb, +}; + +static struct attribute *balloon_info_attrs[] = { + &attr_current_kb.attr, + &attr_low_kb.attr, + &attr_high_kb.attr, + &attr_hard_limit_kb.attr, + &attr_driver_kb.attr, + NULL +}; + +static struct attribute_group balloon_info_group = { + .name = "info", + .attrs = balloon_info_attrs, +}; + +static struct sysdev_class balloon_sysdev_class = { + set_kset_name(BALLOON_CLASS_NAME), +}; + +static struct sys_device balloon_sysdev; + +static int register_balloon(struct sys_device *sysdev) +{ + int i, error; + + error = sysdev_class_register(&balloon_sysdev_class); + if (error) + return error; + + sysdev->id = 0; + sysdev->cls = &balloon_sysdev_class; + + error = sysdev_register(sysdev); + if (error) { + sysdev_class_unregister(&balloon_sysdev_class); + return error; + } + + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { + error = sysdev_create_file(sysdev, balloon_attrs[i]); + if (error) + goto fail; + } + + error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); + if (error) + goto fail; + + return 0; + + fail: + while (--i >= 0) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); + return error; +} + +static void unregister_balloon(struct sys_device *sysdev) +{ + int i; + + sysfs_remove_group(&sysdev->kobj, &balloon_info_group); + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); +} + +int balloon_sysfs_init(void) +{ + return register_balloon(&balloon_sysdev); +} + +void balloon_sysfs_exit(void) +{ + unregister_balloon(&balloon_sysdev); +} diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c index 416f7bc18c..e8df9e0346 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c @@ -56,8 +56,6 @@ static int blkif_reqs = 64; module_param_named(reqs, blkif_reqs, int, 0); MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); -static int mmap_pages; - /* Run-time switchable: /sys/module/blkback/parameters/ */ static unsigned int log_stats = 0; static unsigned int debug_lvl = 0; @@ -87,8 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); #define BLKBACK_INVALID_HANDLE (~0) -static unsigned long mmap_vstart; -static unsigned long *pending_vaddrs; +static struct page **pending_pages; static grant_handle_t *pending_grant_handles; static inline int vaddr_pagenr(pending_req_t *req, int seg) @@ -98,7 +95,8 @@ static inline int vaddr_pagenr(pending_req_t *req, int seg) static inline unsigned long vaddr(pending_req_t *req, int seg) { - return pending_vaddrs[vaddr_pagenr(req, seg)]; + unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]); + return (unsigned long)pfn_to_kaddr(pfn); } #define pending_handle(_req, _seg) \ @@ -191,9 +189,9 @@ static void fast_flush_area(pending_req_t *req) static void print_stats(blkif_t *blkif) { - printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n", + printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req); + blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; blkif->st_wr_req = 0; @@ -243,11 +241,17 @@ int blkif_schedule(void *arg) * COMPLETION CALLBACK -- Called as bh->b_end_io() */ -static void __end_block_io_op(pending_req_t *pending_req, int uptodate) +static void __end_block_io_op(pending_req_t *pending_req, int error) { /* An error fails the entire request. */ - if (!uptodate) { - DPRINTK("Buffer not up-to-date at end of operation\n"); + if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + (error == -EOPNOTSUPP)) { + DPRINTK("blkback: write barrier op failed, not supported\n"); + blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if (error) { + DPRINTK("Buffer not up-to-date at end of operation, " + "error=%d\n", error); pending_req->status = BLKIF_RSP_ERROR; } @@ -264,7 +268,7 @@ static int end_block_io_op(struct bio *bio, unsigned int done, int error) { if (bio->bi_size != 0) return 1; - __end_block_io_op(bio->bi_private, !error); + __end_block_io_op(bio->bi_private, error); bio_put(bio); return error; } @@ -295,7 +299,7 @@ irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) static int do_block_io_op(blkif_t *blkif) { blkif_back_ring_t *blk_ring = &blkif->blk_ring; - blkif_request_t *req; + blkif_request_t req; pending_req_t *pending_req; RING_IDX rc, rp; int more_to_do = 0; @@ -313,22 +317,25 @@ static int do_block_io_op(blkif_t *blkif) break; } - req = RING_GET_REQUEST(blk_ring, rc); + memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req)); blk_ring->req_cons = ++rc; /* before make_response() */ - switch (req->operation) { + switch (req.operation) { case BLKIF_OP_READ: blkif->st_rd_req++; - dispatch_rw_block_io(blkif, req, pending_req); + dispatch_rw_block_io(blkif, &req, pending_req); break; + case BLKIF_OP_WRITE_BARRIER: + blkif->st_br_req++; + /* fall through */ case BLKIF_OP_WRITE: blkif->st_wr_req++; - dispatch_rw_block_io(blkif, req, pending_req); + dispatch_rw_block_io(blkif, &req, pending_req); break; default: DPRINTK("error: unknown block io operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, + req.operation); + make_response(blkif, req.id, req.operation, BLKIF_RSP_ERROR); free_req(pending_req); break; @@ -342,7 +349,6 @@ static void dispatch_rw_block_io(blkif_t *blkif, pending_req_t *pending_req) { extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); - int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct phys_req preq; struct { @@ -351,6 +357,22 @@ static void dispatch_rw_block_io(blkif_t *blkif, unsigned int nseg; struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int ret, i, nbio = 0; + int operation; + + switch (req->operation) { + case BLKIF_OP_READ: + operation = READ; + break; + case BLKIF_OP_WRITE: + operation = WRITE; + break; + case BLKIF_OP_WRITE_BARRIER: + operation = WRITE_BARRIER; + break; + default: + operation = 0; /* make gcc happy */ + BUG(); + } /* Check that number of segments is sane. */ nseg = req->nr_segments; @@ -366,7 +388,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, pending_req->blkif = blkif; pending_req->id = req->id; - pending_req->operation = operation; + pending_req->operation = req->operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; @@ -377,12 +399,12 @@ static void dispatch_rw_block_io(blkif_t *blkif, req->seg[i].first_sect + 1; if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || - (seg[i].nsec <= 0)) + (req->seg[i].last_sect < req->seg[i].first_sect)) goto fail_response; preq.nr_sects += seg[i].nsec; flags = GNTMAP_host_map; - if ( operation == WRITE ) + if (operation != READ) flags |= GNTMAP_readonly; gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, req->seg[i].gref, blkif->domid); @@ -394,10 +416,15 @@ static void dispatch_rw_block_io(blkif_t *blkif, for (i = 0; i < nseg; i++) { if (unlikely(map[i].status != 0)) { DPRINTK("invalid buffer -- could not remap it\n"); - goto fail_flush; + map[i].handle = BLKBACK_INVALID_HANDLE; + ret |= 1; } pending_handle(pending_req, i) = map[i].handle; + + if (ret) + continue; + set_phys_to_machine(__pa(vaddr( pending_req, i)) >> PAGE_SHIFT, FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); @@ -405,6 +432,9 @@ static void dispatch_rw_block_io(blkif_t *blkif, (req->seg[i].first_sect << 9); } + if (ret) + goto fail_flush; + if (vbd_translate(&preq, blkif, operation) != 0) { DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", @@ -506,52 +536,43 @@ static void make_response(blkif_t *blkif, unsigned long id, static int __init blkif_init(void) { - struct page *page; - int i; + int i, mmap_pages; if (!is_running_on_xen()) return -ENODEV; - mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; - - page = balloon_alloc_empty_page_range(mmap_pages); - if (page == NULL) - return -ENOMEM; - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; pending_reqs = kmalloc(sizeof(pending_reqs[0]) * blkif_reqs, GFP_KERNEL); pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * mmap_pages, GFP_KERNEL); - pending_vaddrs = kmalloc(sizeof(pending_vaddrs[0]) * - mmap_pages, GFP_KERNEL); - if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) { - kfree(pending_reqs); - kfree(pending_grant_handles); - kfree(pending_vaddrs); - printk("%s: out of memory\n", __FUNCTION__); - return -ENOMEM; - } + pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); - blkif_interface_init(); - - printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n", - __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart); - BUG_ON(mmap_vstart == 0); - for (i = 0; i < mmap_pages; i++) { - pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); + if (!pending_reqs || !pending_grant_handles || !pending_pages) + goto out_of_memory; + + for (i = 0; i < mmap_pages; i++) pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; - } + + blkif_interface_init(); memset(pending_reqs, 0, sizeof(pending_reqs)); INIT_LIST_HEAD(&pending_free); for (i = 0; i < blkif_reqs; i++) list_add_tail(&pending_reqs[i].free_list, &pending_free); - + blkif_xenbus_init(); return 0; + + out_of_memory: + kfree(pending_reqs); + kfree(pending_grant_handles); + free_empty_pages_and_pagevec(pending_pages, mmap_pages); + printk("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; } module_init(blkif_init); diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h index 38cb756964..1b5b6a427e 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h @@ -44,6 +44,7 @@ #include <xen/interface/io/ring.h> #include <xen/gnttab.h> #include <xen/driver_util.h> +#include <xen/xenbus.h> #define DPRINTK(_f, _a...) \ pr_debug("(file=%s, line=%d) " _f, \ @@ -87,6 +88,7 @@ typedef struct blkif_st { int st_rd_req; int st_wr_req; int st_oo_req; + int st_br_req; wait_queue_head_t waiting_to_free; @@ -111,7 +113,7 @@ int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, unsigned minor, int readonly); void vbd_free(struct vbd *vbd); -unsigned long vbd_size(struct vbd *vbd); +unsigned long long vbd_size(struct vbd *vbd); unsigned int vbd_info(struct vbd *vbd); unsigned long vbd_secsize(struct vbd *vbd); @@ -131,4 +133,7 @@ void blkif_xenbus_init(void); irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); int blkif_schedule(void *arg); +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); + #endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c b/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c index a809b04cd1..34048b32c4 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c @@ -31,12 +31,11 @@ */ #include "common.h" -#include <xen/xenbus.h> #define vbd_sz(_v) ((_v)->bdev->bd_part ? \ (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) -unsigned long vbd_size(struct vbd *vbd) +unsigned long long vbd_size(struct vbd *vbd) { return vbd_sz(vbd); } @@ -104,7 +103,7 @@ int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) struct vbd *vbd = &blkif->vbd; int rc = -EACCES; - if ((operation == WRITE) && vbd->readonly) + if ((operation != READ) && vbd->readonly) goto out; if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c index 02f90a6803..349ae64d0f 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c @@ -20,7 +20,6 @@ #include <stdarg.h> #include <linux/module.h> #include <linux/kthread.h> -#include <xen/xenbus.h> #include "common.h" #undef DPRINTK @@ -91,11 +90,13 @@ static void update_blkif_status(blkif_t *blkif) VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); +VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); static struct attribute *vbdstat_attrs[] = { &dev_attr_oo_req.attr, &dev_attr_rd_req.attr, &dev_attr_wr_req.attr, + &dev_attr_br_req.attr, NULL }; @@ -165,6 +166,19 @@ static int blkback_remove(struct xenbus_device *dev) return 0; } +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + "%d", state); + if (err) + xenbus_dev_fatal(dev, err, "writing feature-barrier"); + + return err; +} /** * Entry point to this code when a new device is created. Allocate the basic @@ -366,13 +380,16 @@ static void connect(struct backend_info *be) /* Supply the information about the device the frontend needs */ again: err = xenbus_transaction_start(&xbt); - if (err) { xenbus_dev_fatal(dev, err, "starting transaction"); return; } - err = xenbus_printf(xbt, dev->nodename, "sectors", "%lu", + err = blkback_barrier(xbt, be, 1); + if (err) + goto abort; + + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", vbd_size(&be->blkif->vbd)); if (err) { xenbus_dev_fatal(dev, err, "writing %s/sectors", diff --git a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c index 4c44d7608d..95cff46ff9 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c @@ -48,6 +48,10 @@ #include <asm/hypervisor.h> #include <asm/maddr.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define BLKIF_STATE_DISCONNECTED 0 #define BLKIF_STATE_CONNECTED 1 #define BLKIF_STATE_SUSPENDED 2 @@ -134,10 +138,10 @@ static int blkfront_resume(struct xenbus_device *dev) DPRINTK("blkfront_resume: %s\n", dev->nodename); - blkif_free(info, 1); + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); err = talk_to_backend(dev, info); - if (!err) + if (info->connected == BLKIF_STATE_SUSPENDED && !err) blkif_recover(info); return err; @@ -273,7 +277,7 @@ static void backend_changed(struct xenbus_device *dev, xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); down(&bd->bd_sem); - if (info->users > 0 && system_state == SYSTEM_RUNNING) + if (info->users > 0) xenbus_dev_error(dev, -EBUSY, "Device in use; refusing to close"); else @@ -294,7 +298,8 @@ static void backend_changed(struct xenbus_device *dev, */ static void connect(struct blkfront_info *info) { - unsigned long sectors, sector_size; + unsigned long long sectors; + unsigned long sector_size; unsigned int binfo; int err; @@ -305,7 +310,7 @@ static void connect(struct blkfront_info *info) DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend); err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "sectors", "%lu", §ors, + "sectors", "%llu", §ors, "info", "%u", &binfo, "sector-size", "%lu", §or_size, NULL); @@ -316,6 +321,12 @@ static void connect(struct blkfront_info *info) return; } + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%lu", &info->feature_barrier, + NULL); + if (err) + info->feature_barrier = 0; + err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", @@ -355,9 +366,11 @@ static void blkfront_closing(struct xenbus_device *dev) blk_stop_queue(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - flush_scheduled_work(); spin_unlock_irqrestore(&blkif_io_lock, flags); + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + xlvbd_del(info); xenbus_frontend_closed(dev); @@ -466,6 +479,27 @@ int blkif_ioctl(struct inode *inode, struct file *filep, command, (long)argument, inode->i_rdev); switch (command) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + case HDIO_GETGEO: { + struct block_device *bd = inode->i_bdev; + struct hd_geometry geo; + int ret; + + if (!argument) + return -EINVAL; + + geo.start = get_start_sect(bd); + ret = blkif_getgeo(bd, &geo); + if (ret) + return ret; + + if (copy_to_user((struct hd_geometry __user *)argument, &geo, + sizeof(geo))) + return -EFAULT; + + return 0; + } +#endif case CDROMMULTISESSION: DPRINTK("FIXME: support multisession CDs later\n"); for (i = 0; i < sizeof(struct cdrom_multisession); i++) @@ -542,11 +576,14 @@ static int blkif_queue_request(struct request *req) info->shadow[id].request = (unsigned long)req; ring_req->id = id; - ring_req->operation = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; ring_req->sector_number = (blkif_sector_t)req->sector; ring_req->handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (blk_barrier_rq(req)) + ring_req->operation = BLKIF_OP_WRITE_BARRIER; + ring_req->nr_segments = 0; rq_for_each_bio (bio, req) { bio_for_each_segment (bvec, bio, idx) { @@ -643,6 +680,7 @@ static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) RING_IDX i, rp; unsigned long flags; struct blkfront_info *info = (struct blkfront_info *)dev_id; + int uptodate; spin_lock_irqsave(&blkif_io_lock, flags); @@ -667,19 +705,27 @@ static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) ADD_ID_TO_FREELIST(info, id); + uptodate = (bret->status == BLKIF_RSP_OKAY); switch (bret->operation) { + case BLKIF_OP_WRITE_BARRIER: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + printk("blkfront: %s: write barrier op failed\n", + info->gd->disk_name); + uptodate = -EOPNOTSUPP; + info->feature_barrier = 0; + xlvbd_barrier(info); + } + /* fall through */ case BLKIF_OP_READ: case BLKIF_OP_WRITE: if (unlikely(bret->status != BLKIF_RSP_OKAY)) DPRINTK("Bad return from blkdev data " "request: %x\n", bret->status); - ret = end_that_request_first( - req, (bret->status == BLKIF_RSP_OKAY), + ret = end_that_request_first(req, uptodate, req->hard_nr_sectors); BUG_ON(ret); - end_that_request_last( - req, (bret->status == BLKIF_RSP_OKAY)); + end_that_request_last(req, uptodate); break; default: BUG(); @@ -714,9 +760,11 @@ static void blkif_free(struct blkfront_info *info, int suspend) blk_stop_queue(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - flush_scheduled_work(); spin_unlock_irq(&blkif_io_lock); + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + /* Free resources associated with old device channel. */ if (info->ring_ref != GRANT_INVALID_REF) { gnttab_end_foreign_access(info->ring_ref, 0, diff --git a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h index 5ba3d1ebc3..b86360f405 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h @@ -126,6 +126,7 @@ struct blkfront_info struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; unsigned long shadow_free; + int feature_barrier; /** * The number of people holding this device open. We won't allow a @@ -152,5 +153,6 @@ extern void do_blkif_request (request_queue_t *rq); int xlvbd_add(blkif_sector_t capacity, int device, u16 vdisk_info, u16 sector_size, struct blkfront_info *info); void xlvbd_del(struct blkfront_info *info); +int xlvbd_barrier(struct blkfront_info *info); #endif /* __XEN_DRIVERS_BLOCK_H__ */ diff --git a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c index 8aa453d3a0..f040a2b7e3 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c @@ -36,6 +36,10 @@ #include <linux/blkdev.h> #include <linux/list.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define BLKIF_MAJOR(dev) ((dev)>>8) #define BLKIF_MINOR(dev) ((dev) & 0xff) @@ -46,7 +50,7 @@ */ #define NUM_IDE_MAJORS 10 -#define NUM_SCSI_MAJORS 9 +#define NUM_SCSI_MAJORS 17 #define NUM_VBD_MAJORS 1 static struct xlbd_type_info xlbd_ide_type = { @@ -91,7 +95,9 @@ static struct block_device_operations xlvbd_block_fops = .open = blkif_open, .release = blkif_release, .ioctl = blkif_ioctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) .getgeo = blkif_getgeo +#endif }; DEFINE_SPINLOCK(blkif_io_lock); @@ -159,8 +165,11 @@ xlbd_get_major_info(int vdevice) case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR: index = 11 + major - SCSI_DISK1_MAJOR; break; - case SCSI_CDROM_MAJOR: index = 18; break; - default: index = 19; break; + case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR: + index = 18 + major - SCSI_DISK8_MAJOR; + break; + case SCSI_CDROM_MAJOR: index = 26; break; + default: index = 27; break; } mi = ((major_info[index] != NULL) ? major_info[index] : @@ -186,7 +195,11 @@ xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) if (rq == NULL) return -1; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) elevator_init(rq, "noop"); +#else + elevator_init(rq, &elevator_noop); +#endif /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_hardsect_size(rq, sector_size); @@ -217,6 +230,7 @@ xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, struct xlbd_major_info *mi; int nr_minors = 1; int err = -ENODEV; + unsigned int offset; BUG_ON(info->gd != NULL); BUG_ON(info->mi != NULL); @@ -234,15 +248,33 @@ xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, if (gd == NULL) goto out; - if (nr_minors > 1) - sprintf(gd->disk_name, "%s%c", mi->type->diskname, - 'a' + mi->index * mi->type->disks_per_major + - (minor >> mi->type->partn_shift)); - else - sprintf(gd->disk_name, "%s%c%d", mi->type->diskname, - 'a' + mi->index * mi->type->disks_per_major + - (minor >> mi->type->partn_shift), - minor & ((1 << mi->type->partn_shift) - 1)); + offset = mi->index * mi->type->disks_per_major + + (minor >> mi->type->partn_shift); + if (nr_minors > 1) { + if (offset < 26) { + sprintf(gd->disk_name, "%s%c", + mi->type->diskname, 'a' + offset ); + } + else { + sprintf(gd->disk_name, "%s%c%c", + mi->type->diskname, + 'a' + ((offset/26)-1), 'a' + (offset%26) ); + } + } + else { + if (offset < 26) { + sprintf(gd->disk_name, "%s%c%d", + mi->type->diskname, + 'a' + offset, + minor & ((1 << mi->type->partn_shift) - 1)); + } + else { + sprintf(gd->disk_name, "%s%c%c%d", + mi->type->diskname, + 'a' + ((offset/26)-1), 'a' + (offset%26), + minor & ((1 << mi->type->partn_shift) - 1)); + } + } gd->major = mi->major; gd->first_minor = minor; @@ -257,6 +289,10 @@ xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, } info->rq = gd->queue; + info->gd = gd; + + if (info->feature_barrier) + xlvbd_barrier(info); if (vdisk_info & VDISK_READONLY) set_disk_ro(gd, 1); @@ -267,8 +303,6 @@ xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, if (vdisk_info & VDISK_CDROM) gd->flags |= GENHD_FL_CD; - info->gd = gd; - return 0; out: @@ -316,3 +350,26 @@ xlvbd_del(struct blkfront_info *info) blk_cleanup_queue(info->rq); info->rq = NULL; } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +int +xlvbd_barrier(struct blkfront_info *info) +{ + int err; + + err = blk_queue_ordered(info->rq, + info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL); + if (err) + return err; + printk("blkfront: %s: barriers %s\n", + info->gd->disk_name, info->feature_barrier ? "enabled" : "disabled"); + return 0; +} +#else +int +xlvbd_barrier(struct blkfront_info *info) +{ + printk("blkfront: %s: barriers disabled\n", info->gd->disk_name); + return -ENOSYS; +} +#endif diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c index a6f1379c27..e0d898ab98 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c @@ -10,6 +10,9 @@ * * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield * + * Clean ups and fix ups: + * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc. + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation; or, when distributed @@ -44,7 +47,6 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/mm.h> -#include <linux/miscdevice.h> #include <linux/errno.h> #include <linux/major.h> #include <linux/gfp.h> @@ -52,9 +54,33 @@ #include <asm/tlbflush.h> #include <linux/devfs_fs_kernel.h> -#define MAX_TAP_DEV 100 /*the maximum number of tapdisk ring devices */ +#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */ #define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */ + +struct class *xen_class; +EXPORT_SYMBOL_GPL(xen_class); + +/* + * Setup the xen class. This should probably go in another file, but + * since blktap is the only user of it so far, it gets to keep it. + */ +int setup_xen_class(void) +{ + int ret; + + if (xen_class) + return 0; + + xen_class = class_create(THIS_MODULE, "xen"); + if ((ret = IS_ERR(xen_class))) { + xen_class = NULL; + return ret; + } + + return 0; +} + /* * The maximum number of requests that can be outstanding at any time * is determined by @@ -67,8 +93,9 @@ * mmap_alloc is initialised to 2 and should be adjustable on the fly via * sysfs. */ -#define MAX_DYNAMIC_MEM 64 -#define MAX_PENDING_REQS 64 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) +#define MAX_DYNAMIC_MEM BLK_RING_SIZE +#define MAX_PENDING_REQS BLK_RING_SIZE #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) #define MMAP_VADDR(_start, _req,_seg) \ (_start + \ @@ -82,6 +109,12 @@ static int mmap_pages = MMAP_PAGES; * memory rings. */ +/*Data struct handed back to userspace for tapdisk device to VBD mapping*/ +typedef struct domid_translate { + unsigned short domid; + unsigned short busid; +} domid_translate_t ; + /*Data struct associated with each of the tapdisk devices*/ typedef struct tap_blkif { struct vm_area_struct *vma; /*Shared memory area */ @@ -100,22 +133,11 @@ typedef struct tap_blkif { unsigned long *idx_map; /*Record the user ring id to kern [req id, idx] tuple */ blkif_t *blkif; /*Associate blkif with tapdev */ + struct domid_translate trans; /*Translation from domid to bus. */ } tap_blkif_t; -/*Private data struct associated with the inode*/ -typedef struct private_info { - int idx; -} private_info_t; - -/*Data struct handed back to userspace for tapdisk device to VBD mapping*/ -typedef struct domid_translate { - unsigned short domid; - unsigned short busid; -} domid_translate_t ; - - -static domid_translate_t translate_domid[MAX_TAP_DEV]; -static tap_blkif_t *tapfds[MAX_TAP_DEV]; +static struct tap_blkif *tapfds[MAX_TAP_DEV]; +static int blktap_next_minor; static int __init set_blkif_reqs(char *str) { @@ -168,16 +190,18 @@ static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) { #define BLKBACK_INVALID_HANDLE (~0) -typedef struct mmap_page { - unsigned long start; - struct page *mpage; -} mmap_page_t; +static struct page **foreign_pages[MAX_DYNAMIC_MEM]; +static inline unsigned long idx_to_kaddr( + unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx) +{ + unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx; + unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]); + return (unsigned long)pfn_to_kaddr(pfn); +} -static mmap_page_t mmap_start[MAX_DYNAMIC_MEM]; static unsigned short mmap_alloc = 0; static unsigned short mmap_lock = 0; static unsigned short mmap_inuse = 0; -static unsigned long *pending_addrs[MAX_DYNAMIC_MEM]; /****************************************************************** * GRANT HANDLES @@ -192,6 +216,7 @@ struct grant_handle_pair grant_handle_t kernel; grant_handle_t user; }; +#define INVALID_GRANT_HANDLE 0xFFFF static struct grant_handle_pair pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES]; @@ -200,15 +225,13 @@ static struct grant_handle_pair + (_i)]) -static int blktap_read_ufe_ring(int idx); /*local prototypes*/ +static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/ -#define BLKTAP_MINOR 0 /*/dev/xen/blktap resides at device number - major=254, minor numbers begin at 0 */ -#define BLKTAP_DEV_MAJOR 254 /* TODO: Make major number dynamic * - * and create devices in the kernel * - */ +#define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */ #define BLKTAP_DEV_DIR "/dev/xen" +static int blktap_major; + /* blktap IOCTLs: */ #define BLKTAP_IOCTL_KICK_FE 1 #define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */ @@ -264,17 +287,19 @@ static inline int GET_NEXT_REQ(unsigned long *idx_map) { int i; for (i = 0; i < MAX_PENDING_REQS; i++) - if (idx_map[i] == INVALID_REQ) return i; + if (idx_map[i] == INVALID_REQ) + return i; return INVALID_REQ; } #define BLKTAP_INVALID_HANDLE(_g) \ - (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF)) + (((_g->kernel) == INVALID_GRANT_HANDLE) && \ + ((_g->user) == INVALID_GRANT_HANDLE)) #define BLKTAP_INVALIDATE_HANDLE(_g) do { \ - (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \ + (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \ } while(0) @@ -303,7 +328,7 @@ struct vm_operations_struct blktap_vm_ops = { */ /*Function Declarations*/ -static int get_next_free_dev(void); +static tap_blkif_t *get_next_free_dev(void); static int blktap_open(struct inode *inode, struct file *filp); static int blktap_release(struct inode *inode, struct file *filp); static int blktap_mmap(struct file *filp, struct vm_area_struct *vma); @@ -311,8 +336,6 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); static unsigned int blktap_poll(struct file *file, poll_table *wait); -struct miscdevice *set_misc(int minor, char *name, int dev); - static struct file_operations blktap_fops = { .owner = THIS_MODULE, .poll = blktap_poll, @@ -323,41 +346,96 @@ static struct file_operations blktap_fops = { }; -static int get_next_free_dev(void) +static tap_blkif_t *get_next_free_dev(void) { tap_blkif_t *info; - int i = 0, ret = -1; - unsigned long flags; + int minor; - spin_lock_irqsave(&pending_free_lock, flags); - - while (i < MAX_TAP_DEV) { - info = tapfds[i]; - if ( (tapfds[i] != NULL) && (info->dev_inuse == 0) - && (info->dev_pending == 0) ) { + /* + * This is called only from the ioctl, which + * means we should always have interrupts enabled. + */ + BUG_ON(irqs_disabled()); + + spin_lock_irq(&pending_free_lock); + + /* tapfds[0] is always NULL */ + + for (minor = 1; minor < blktap_next_minor; minor++) { + info = tapfds[minor]; + /* we could have failed a previous attempt. */ + if (!info || + ((info->dev_inuse == 0) && + (info->dev_pending == 0)) ) { info->dev_pending = 1; - ret = i; - goto done; + goto found; } - i++; } - -done: - spin_unlock_irqrestore(&pending_free_lock, flags); - return ret; + info = NULL; + minor = -1; + + /* + * We didn't find free device. If we can still allocate + * more, then we grab the next device minor that is + * available. This is done while we are still under + * the protection of the pending_free_lock. + */ + if (blktap_next_minor < MAX_TAP_DEV) + minor = blktap_next_minor++; +found: + spin_unlock_irq(&pending_free_lock); + + if (!info && minor > 0) { + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (unlikely(!info)) { + /* + * If we failed here, try to put back + * the next minor number. But if one + * was just taken, then we just lose this + * minor. We can try to allocate this + * minor again later. + */ + spin_lock_irq(&pending_free_lock); + if (blktap_next_minor == minor+1) + blktap_next_minor--; + spin_unlock_irq(&pending_free_lock); + goto out; + } + + info->minor = minor; + /* + * Make sure that we have a minor before others can + * see us. + */ + wmb(); + tapfds[minor] = info; + + class_device_create(xen_class, NULL, + MKDEV(blktap_major, minor), NULL, + "blktap%d", minor); + devfs_mk_cdev(MKDEV(blktap_major, minor), + S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", minor); + } + +out: + return info; } int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) { + tap_blkif_t *info; int i; - - for (i = 0; i < MAX_TAP_DEV; i++) - if ( (translate_domid[i].domid == domid) - && (translate_domid[i].busid == xenbus_id) ) { - tapfds[i]->blkif = blkif; - tapfds[i]->status = RUNNING; + + for (i = 1; i < blktap_next_minor; i++) { + info = tapfds[i]; + if ( info && + (info->trans.domid == domid) && + (info->trans.busid == xenbus_id) ) { + info->blkif = blkif; + info->status = RUNNING; return i; } + } return -1; } @@ -367,13 +445,16 @@ void signal_tapdisk(int idx) struct task_struct *ptask; info = tapfds[idx]; - if ( (idx > 0) && (idx < MAX_TAP_DEV) && (info->pid > 0) ) { + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) + return; + + if (info->pid > 0) { ptask = find_task_by_pid(info->pid); - if (ptask) { + if (ptask) info->status = CLEANSHUTDOWN; - } } info->blkif = NULL; + return; } @@ -382,18 +463,22 @@ static int blktap_open(struct inode *inode, struct file *filp) blkif_sring_t *sring; int idx = iminor(inode) - BLKTAP_MINOR; tap_blkif_t *info; - private_info_t *prv; int i; - if (tapfds[idx] == NULL) { + /* ctrl device, treat differently */ + if (!idx) + return 0; + + info = tapfds[idx]; + + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) { WPRINTK("Unable to open device /dev/xen/blktap%d\n", - idx); - return -ENOMEM; + idx); + return -ENODEV; } + DPRINTK("Opening device /dev/xen/blktap%d\n",idx); - info = tapfds[idx]; - /*Only one process can access device at a time*/ if (test_and_set_bit(0, &info->dev_inuse)) return -EBUSY; @@ -410,9 +495,7 @@ static int blktap_open(struct inode *inode, struct file *filp) SHARED_RING_INIT(sring); FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE); - prv = kzalloc(sizeof(private_info_t),GFP_KERNEL); - prv->idx = idx; - filp->private_data = prv; + filp->private_data = info; info->vma = NULL; info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, @@ -433,17 +516,14 @@ static int blktap_open(struct inode *inode, struct file *filp) static int blktap_release(struct inode *inode, struct file *filp) { - int idx = iminor(inode) - BLKTAP_MINOR; - tap_blkif_t *info; + tap_blkif_t *info = filp->private_data; - if (tapfds[idx] == NULL) { - WPRINTK("Trying to free device that doesn't exist " - "[/dev/xen/blktap%d]\n",idx); - return -1; - } - info = tapfds[idx]; + /* check for control device */ + if (!info) + return 0; + info->dev_inuse = 0; - DPRINTK("Freeing device [/dev/xen/blktap%d]\n",idx); + DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor); /* Free the ring page. */ ClearPageReserved(virt_to_page(info->ufe_ring.sring)); @@ -457,11 +537,11 @@ static int blktap_release(struct inode *inode, struct file *filp) info->vma = NULL; } - if (filp->private_data) kfree(filp->private_data); - if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) { - kthread_stop(info->blkif->xenblkd); - info->blkif->xenblkd = NULL; + if (info->blkif->xenblkd != NULL) { + kthread_stop(info->blkif->xenblkd); + info->blkif->xenblkd = NULL; + } info->status = CLEANSHUTDOWN; } return 0; @@ -491,16 +571,12 @@ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) int size; struct page **map; int i; - private_info_t *prv; - tap_blkif_t *info; + tap_blkif_t *info = filp->private_data; - /*Retrieve the dev info*/ - prv = (private_info_t *)filp->private_data; - if (prv == NULL) { + if (info == NULL) { WPRINTK("blktap: mmap, retrieving idx failed\n"); return -ENOMEM; } - info = tapfds[prv->idx]; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &blktap_vm_ops; @@ -517,8 +593,6 @@ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); /* Map the ring pages to the start of the region and reserve it. */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (remap_pfn_range(vma, vma->vm_start, __pa(info->ufe_ring.sring) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) { @@ -556,20 +630,17 @@ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) static int blktap_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - int idx = iminor(inode) - BLKTAP_MINOR; + tap_blkif_t *info = filp->private_data; + switch(cmd) { case BLKTAP_IOCTL_KICK_FE: { /* There are fe messages to process. */ - return blktap_read_ufe_ring(idx); + return blktap_read_ufe_ring(info); } case BLKTAP_IOCTL_SETMODE: { - tap_blkif_t *info = tapfds[idx]; - - if ( (idx > 0) && (idx < MAX_TAP_DEV) - && (tapfds[idx] != NULL) ) - { + if (info) { if (BLKTAP_MODE_VALID(arg)) { info->mode = arg; /* XXX: may need to flush rings here. */ @@ -582,11 +653,7 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, } case BLKTAP_IOCTL_PRINT_IDXS: { - tap_blkif_t *info = tapfds[idx]; - - if ( (idx > 0) && (idx < MAX_TAP_DEV) - && (tapfds[idx] != NULL) ) - { + if (info) { printk("User Rings: \n-----------\n"); printk("UF: rsp_cons: %2d, req_prod_prv: %2d " "| req_prod: %2d, rsp_prod: %2d\n", @@ -599,11 +666,7 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, } case BLKTAP_IOCTL_SENDPID: { - tap_blkif_t *info = tapfds[idx]; - - if ( (idx > 0) && (idx < MAX_TAP_DEV) - && (tapfds[idx] != NULL) ) - { + if (info) { info->pid = (pid_t)arg; DPRINTK("blktap: pid received %d\n", info->pid); @@ -614,43 +677,49 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, { uint64_t val = (uint64_t)arg; domid_translate_t *tr = (domid_translate_t *)&val; - int newdev; DPRINTK("NEWINTF Req for domid %d and bus id %d\n", tr->domid, tr->busid); - newdev = get_next_free_dev(); - if (newdev < 1) { + info = get_next_free_dev(); + if (!info) { WPRINTK("Error initialising /dev/xen/blktap - " "No more devices\n"); return -1; } - translate_domid[newdev].domid = tr->domid; - translate_domid[newdev].busid = tr->busid; - return newdev; + info->trans.domid = tr->domid; + info->trans.busid = tr->busid; + return info->minor; } case BLKTAP_IOCTL_FREEINTF: { unsigned long dev = arg; - tap_blkif_t *info = NULL; + unsigned long flags; + + info = tapfds[dev]; - if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev]; + if ((dev > MAX_TAP_DEV) || !info) + return 0; /* should this be an error? */ - if ( (info != NULL) && (info->dev_pending) ) + spin_lock_irqsave(&pending_free_lock, flags); + if (info->dev_pending) info->dev_pending = 0; + spin_unlock_irqrestore(&pending_free_lock, flags); + return 0; } case BLKTAP_IOCTL_MINOR: { unsigned long dev = arg; - tap_blkif_t *info = NULL; - - if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev]; - - if (info != NULL) return info->minor; - else return -1; + + info = tapfds[dev]; + + if ((dev > MAX_TAP_DEV) || !info) + return -EINVAL; + + return info->minor; } case BLKTAP_IOCTL_MAJOR: - return BLKTAP_DEV_MAJOR; + return blktap_major; case BLKTAP_QUERY_ALLOC_REQS: { @@ -662,25 +731,16 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, return -ENOIOCTLCMD; } -static unsigned int blktap_poll(struct file *file, poll_table *wait) +static unsigned int blktap_poll(struct file *filp, poll_table *wait) { - private_info_t *prv; - tap_blkif_t *info; + tap_blkif_t *info = filp->private_data; - /*Retrieve the dev info*/ - prv = (private_info_t *)file->private_data; - if (prv == NULL) { - WPRINTK(" poll, retrieving idx failed\n"); + /* do not work on the control device */ + if (!info) return 0; - } - - if (prv->idx == 0) return 0; - - info = tapfds[prv->idx]; - - poll_wait(file, &info->wait, wait); + + poll_wait(filp, &info->wait, wait); if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) { - flush_tlb_all(); RING_PUSH_REQUESTS(&info->ufe_ring); return POLLIN | POLLRDNORM; } @@ -691,11 +751,13 @@ void blktap_kick_user(int idx) { tap_blkif_t *info; - if (idx == 0) return; - info = tapfds[idx]; - - if (info != NULL) wake_up_interruptible(&info->wait); + + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) + return; + + wake_up_interruptible(&info->wait); + return; } @@ -712,66 +774,21 @@ static void make_response(blkif_t *blkif, unsigned long id, static int req_increase(void) { int i, j; - struct page *page; - unsigned long flags; - int ret; - spin_lock_irqsave(&pending_free_lock, flags); - - ret = -EINVAL; if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) - goto done; - -#ifdef __ia64__ - extern unsigned long alloc_empty_foreign_map_page_range( - unsigned long pages); - mmap_start[mmap_alloc].start = (unsigned long) - alloc_empty_foreign_map_page_range(mmap_pages); -#else /* ! ia64 */ - page = balloon_alloc_empty_page_range(mmap_pages); - ret = -ENOMEM; - if (page == NULL) { - printk("%s balloon_alloc_empty_page_range gave NULL\n", __FUNCTION__); - goto done; - } - - /* Pin all of the pages. */ - for (i=0; i<mmap_pages; i++) - get_page(&page[i]); - - mmap_start[mmap_alloc].start = - (unsigned long)pfn_to_kaddr(page_to_pfn(page)); - mmap_start[mmap_alloc].mpage = page; - -#endif - - pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t) * - blkif_reqs, GFP_KERNEL); - pending_addrs[mmap_alloc] = kzalloc(sizeof(unsigned long) * - mmap_pages, GFP_KERNEL); - - ret = -ENOMEM; - if (!pending_reqs[mmap_alloc] || !pending_addrs[mmap_alloc]) { - kfree(pending_reqs[mmap_alloc]); - kfree(pending_addrs[mmap_alloc]); - WPRINTK("%s: out of memory\n", __FUNCTION__); - ret = -ENOMEM; - goto done; - } - - ret = 0; + return -EINVAL; - DPRINTK("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n", - __FUNCTION__, blkif_reqs, mmap_pages, - mmap_start[mmap_alloc].start); + pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t) + * blkif_reqs, GFP_KERNEL); + foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages); - BUG_ON(mmap_start[mmap_alloc].start == 0); + if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc]) + goto out_of_memory; - for (i = 0; i < mmap_pages; i++) - pending_addrs[mmap_alloc][i] = - mmap_start[mmap_alloc].start + (i << PAGE_SHIFT); + DPRINTK("%s: reqs=%d, pages=%d\n", + __FUNCTION__, blkif_reqs, mmap_pages); - for (i = 0; i < MAX_PENDING_REQS ; i++) { + for (i = 0; i < MAX_PENDING_REQS; i++) { list_add_tail(&pending_reqs[mmap_alloc][i].free_list, &pending_free); pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc; @@ -782,67 +799,30 @@ static int req_increase(void) mmap_alloc++; DPRINTK("# MMAPs increased to %d\n",mmap_alloc); - done: - spin_unlock_irqrestore(&pending_free_lock, flags); - return ret; + return 0; + + out_of_memory: + free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages); + kfree(pending_reqs[mmap_alloc]); + WPRINTK("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; } static void mmap_req_del(int mmap) { - int i; - struct page *page; + BUG_ON(!spin_is_locked(&pending_free_lock)); - /*Spinlock already acquired*/ kfree(pending_reqs[mmap]); - kfree(pending_addrs[mmap]); - -#ifdef __ia64__ - /*Not sure what goes here yet!*/ -#else - - /* Unpin all of the pages. */ - page = mmap_start[mmap].mpage; - for (i=0; i<mmap_pages; i++) - put_page(&page[i]); + pending_reqs[mmap] = NULL; - balloon_dealloc_empty_page_range(mmap_start[mmap].mpage, mmap_pages); -#endif + free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages); + foreign_pages[mmap] = NULL; mmap_lock = 0; DPRINTK("# MMAPs decreased to %d\n",mmap_alloc); mmap_alloc--; } -/*N.B. Currently unused - will be accessed via sysfs*/ -static void req_decrease(void) -{ - pending_req_t *req; - int i; - unsigned long flags; - - spin_lock_irqsave(&pending_free_lock, flags); - - DPRINTK("Req decrease called.\n"); - if (mmap_lock || mmap_alloc == 1) - goto done; - - mmap_lock = 1; - mmap_inuse = MAX_PENDING_REQS; - - /*Go through reqs and remove any that aren't in use*/ - for (i = 0; i < MAX_PENDING_REQS ; i++) { - req = &pending_reqs[mmap_alloc-1][i]; - if (req->inuse == 0) { - list_del(&req->free_list); - mmap_inuse--; - } - } - if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1); - done: - spin_unlock_irqrestore(&pending_free_lock, flags); - return; -} - static pending_req_t* alloc_req(void) { pending_req_t *req = NULL; @@ -888,8 +868,8 @@ static void free_req(pending_req_t *req) wake_up(&pending_free_wq); } -static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int - tapidx) +static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, + int tapidx) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; unsigned int i, invcount = 0; @@ -897,49 +877,65 @@ static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int uint64_t ptep; int ret, mmap_idx; unsigned long kvaddr, uvaddr; - - tap_blkif_t *info = tapfds[tapidx]; + tap_blkif_t *info; - if (info == NULL) { + + info = tapfds[tapidx]; + + if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) { WPRINTK("fast_flush: Couldn't get info!\n"); return; } + + if (info->vma != NULL && + xen_feature(XENFEAT_auto_translated_physmap)) { + down_write(&info->vma->vm_mm->mmap_sem); + zap_page_range(info->vma, + MMAP_VADDR(info->user_vstart, u_idx, 0), + req->nr_pages << PAGE_SHIFT, NULL); + up_write(&info->vma->vm_mm->mmap_sem); + } + mmap_idx = req->mem_idx; for (i = 0; i < req->nr_pages; i++) { - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i); + kvaddr = idx_to_kaddr(mmap_idx, k_idx, i); uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i); khandle = &pending_handle(mmap_idx, k_idx, i); - if (BLKTAP_INVALID_HANDLE(khandle)) { - WPRINTK("BLKTAP_INVALID_HANDLE\n"); - continue; + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[invcount], + idx_to_kaddr(mmap_idx, k_idx, i), + GNTMAP_host_map, khandle->kernel); + invcount++; } - gnttab_set_unmap_op(&unmap[invcount], - MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i), - GNTMAP_host_map, khandle->kernel); - invcount++; - - if (create_lookup_pte_addr( - info->vma->vm_mm, - MMAP_VADDR(info->user_vstart, u_idx, i), - &ptep) !=0) { - WPRINTK("Couldn't get a pte addr!\n"); - return; + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + if (create_lookup_pte_addr( + info->vma->vm_mm, + MMAP_VADDR(info->user_vstart, u_idx, i), + &ptep) !=0) { + WPRINTK("Couldn't get a pte addr!\n"); + return; + } + + gnttab_set_unmap_op(&unmap[invcount], ptep, + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + invcount++; } - gnttab_set_unmap_op(&unmap[invcount], - ptep, GNTMAP_host_map, - khandle->user); - invcount++; - BLKTAP_INVALIDATE_HANDLE(khandle); } ret = HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, unmap, invcount); BUG_ON(ret); - if (info->vma != NULL) + if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) zap_page_range(info->vma, MMAP_VADDR(info->user_vstart, u_idx, 0), req->nr_pages << PAGE_SHIFT, NULL); @@ -1002,7 +998,7 @@ int tap_blkif_schedule(void *arg) * COMPLETION CALLBACK -- Called by user level ioctl() */ -static int blktap_read_ufe_ring(int idx) +static int blktap_read_ufe_ring(tap_blkif_t *info) { /* This is called to read responses from the UFE ring. */ RING_IDX i, j, rp; @@ -1010,12 +1006,9 @@ static int blktap_read_ufe_ring(int idx) blkif_t *blkif=NULL; int pending_idx, usr_idx, mmap_idx; pending_req_t *pending_req; - tap_blkif_t *info; - info = tapfds[idx]; - if (info == NULL) { + if (!info) return 0; - } /* We currently only forward packets in INTERCEPT_FE mode. */ if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE)) @@ -1026,11 +1019,14 @@ static int blktap_read_ufe_ring(int idx) rmb(); for (i = info->ufe_ring.rsp_cons; i != rp; i++) { + blkif_response_t res; resp = RING_GET_RESPONSE(&info->ufe_ring, i); + memcpy(&res, resp, sizeof(res)); + mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */ ++info->ufe_ring.rsp_cons; /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/ - usr_idx = (int)resp->id; + usr_idx = (int)res.id; pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx])); mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]); @@ -1053,9 +1049,8 @@ static int blktap_read_ufe_ring(int idx) struct page *pg; int offset; - uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, - pending_idx, j); + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j); pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); ClearPageReserved(pg); @@ -1063,10 +1058,10 @@ static int blktap_read_ufe_ring(int idx) >> PAGE_SHIFT; map[offset] = NULL; } - fast_flush_area(pending_req, pending_idx, usr_idx, idx); - make_response(blkif, pending_req->id, resp->operation, - resp->status); + fast_flush_area(pending_req, pending_idx, usr_idx, info->minor); info->idx_map[usr_idx] = INVALID_REQ; + make_response(blkif, pending_req->id, res.operation, + res.status); blkif_put(pending_req->blkif); free_req(pending_req); } @@ -1100,7 +1095,7 @@ static int print_dbug = 1; static int do_block_io_op(blkif_t *blkif) { blkif_back_ring_t *blk_ring = &blkif->blk_ring; - blkif_request_t *req; + blkif_request_t req; pending_req_t *pending_req; RING_IDX rc, rp; int more_to_do = 0; @@ -1111,7 +1106,7 @@ static int do_block_io_op(blkif_t *blkif) rmb(); /* Ensure we see queued requests up to 'rp'. */ /*Check blkif has corresponding UE ring*/ - if (blkif->dev_num == -1) { + if (blkif->dev_num < 0) { /*oops*/ if (print_dbug) { WPRINTK("Corresponding UE " @@ -1122,7 +1117,8 @@ static int do_block_io_op(blkif_t *blkif) } info = tapfds[blkif->dev_num]; - if (info == NULL || !info->dev_inuse) { + + if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) { if (print_dbug) { WPRINTK("Can't get UE info!\n"); print_dbug = 0; @@ -1152,24 +1148,24 @@ static int do_block_io_op(blkif_t *blkif) break; } - req = RING_GET_REQUEST(blk_ring, rc); + memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req)); blk_ring->req_cons = ++rc; /* before make_response() */ - switch (req->operation) { + switch (req.operation) { case BLKIF_OP_READ: blkif->st_rd_req++; - dispatch_rw_block_io(blkif, req, pending_req); + dispatch_rw_block_io(blkif, &req, pending_req); break; case BLKIF_OP_WRITE: blkif->st_wr_req++; - dispatch_rw_block_io(blkif, req, pending_req); + dispatch_rw_block_io(blkif, &req, pending_req); break; default: WPRINTK("unknown operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, + req.operation); + make_response(blkif, req.id, req.operation, BLKIF_RSP_ERROR); free_req(pending_req); break; @@ -1190,17 +1186,27 @@ static void dispatch_rw_block_io(blkif_t *blkif, struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; unsigned int nseg; int ret, i; - tap_blkif_t *info = tapfds[blkif->dev_num]; + tap_blkif_t *info; uint64_t sector; - blkif_request_t *target; int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx); - int usr_idx = GET_NEXT_REQ(info->idx_map); + int usr_idx; uint16_t mmap_idx = pending_req->mem_idx; - /*Check we have space on user ring - should never fail*/ - if(usr_idx == INVALID_REQ) goto fail_flush; - + if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV) + goto fail_response; + + info = tapfds[blkif->dev_num]; + if (info == NULL) + goto fail_response; + + /* Check we have space on user ring - should never fail. */ + usr_idx = GET_NEXT_REQ(info->idx_map); + if (usr_idx == INVALID_REQ) { + BUG(); + goto fail_response; + } + /* Check that number of segments is sane. */ nseg = req->nr_segments; if ( unlikely(nseg == 0) || @@ -1233,15 +1239,12 @@ static void dispatch_rw_block_io(blkif_t *blkif, unsigned long uvaddr; unsigned long kvaddr; uint64_t ptep; - struct page *page; uint32_t flags; uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, - pending_idx, i); - page = virt_to_page(kvaddr); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); - sector = req->sector_number + (8*i); + sector = req->sector_number + ((PAGE_SIZE / 512) * i); if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) { WPRINTK("BLKTAP: Sector request greater" "than size\n"); @@ -1251,7 +1254,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, BLKIF_OP_WRITE ? "WRITE" : "READ"), (long long unsigned) sector, (long long unsigned) sector>>9, - blkif->sectors); + (long long unsigned) blkif->sectors); } flags = GNTMAP_host_map; @@ -1261,71 +1264,123 @@ static void dispatch_rw_block_io(blkif_t *blkif, req->seg[i].gref, blkif->domid); op++; - /* Now map it to user. */ - ret = create_lookup_pte_addr(info->vma->vm_mm, - uvaddr, &ptep); - if (ret) { - WPRINTK("Couldn't get a pte addr!\n"); - fast_flush_area(pending_req, pending_idx, usr_idx, - blkif->dev_num); - goto fail_flush; - } + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Now map it to user. */ + ret = create_lookup_pte_addr(info->vma->vm_mm, + uvaddr, &ptep); + if (ret) { + WPRINTK("Couldn't get a pte addr!\n"); + goto fail_flush; + } - flags = GNTMAP_host_map | GNTMAP_application_map - | GNTMAP_contains_pte; - if (operation == WRITE) - flags |= GNTMAP_readonly; - gnttab_set_map_op(&map[op], ptep, flags, - req->seg[i].gref, blkif->domid); - op++; + flags = GNTMAP_host_map | GNTMAP_application_map + | GNTMAP_contains_pte; + if (operation == WRITE) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[op], ptep, flags, + req->seg[i].gref, blkif->domid); + op++; + } } ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op); BUG_ON(ret); - for (i = 0; i < (nseg*2); i+=2) { - unsigned long uvaddr; - unsigned long kvaddr; - unsigned long offset; - struct page *pg; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + for (i = 0; i < (nseg*2); i+=2) { + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + struct page *pg; - uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, - pending_idx, i/2); + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2); - if (unlikely(map[i].status != 0)) { - WPRINTK("invalid kernel buffer -- " - "could not remap it\n"); - goto fail_flush; - } + if (unlikely(map[i].status != 0)) { + WPRINTK("invalid kernel buffer -- " + "could not remap it\n"); + ret |= 1; + map[i].handle = INVALID_GRANT_HANDLE; + } - if (unlikely(map[i+1].status != 0)) { - WPRINTK("invalid user buffer -- " - "could not remap it\n"); - goto fail_flush; + if (unlikely(map[i+1].status != 0)) { + WPRINTK("invalid user buffer -- " + "could not remap it\n"); + ret |= 1; + map[i+1].handle = INVALID_GRANT_HANDLE; + } + + pending_handle(mmap_idx, pending_idx, i/2).kernel + = map[i].handle; + pending_handle(mmap_idx, pending_idx, i/2).user + = map[i+1].handle; + + if (ret) + continue; + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + FOREIGN_FRAME(map[i].dev_bus_addr + >> PAGE_SHIFT)); + offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + ((struct page **)info->vma->vm_private_data)[offset] = + pg; } + } else { + for (i = 0; i < nseg; i++) { + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + struct page *pg; - pending_handle(mmap_idx, pending_idx, i/2).kernel - = map[i].handle; - pending_handle(mmap_idx, pending_idx, i/2).user - = map[i+1].handle; - set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, - FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); - offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; - pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); - ((struct page **)info->vma->vm_private_data)[offset] = - pg; + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); + + if (unlikely(map[i].status != 0)) { + WPRINTK("invalid kernel buffer -- " + "could not remap it\n"); + ret |= 1; + map[i].handle = INVALID_GRANT_HANDLE; + } + + pending_handle(mmap_idx, pending_idx, i).kernel + = map[i].handle; + + if (ret) + continue; + + offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + ((struct page **)info->vma->vm_private_data)[offset] = + pg; + } } + + if (ret) + goto fail_flush; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + down_write(&info->vma->vm_mm->mmap_sem); /* Mark mapped pages as reserved: */ for (i = 0; i < req->nr_segments; i++) { unsigned long kvaddr; struct page *pg; - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, - pending_idx, i); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); SetPageReserved(pg); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + ret = vm_insert_page(info->vma, + MMAP_VADDR(info->user_vstart, + usr_idx, i), pg); + if (ret) { + up_write(&info->vma->vm_mm->mmap_sem); + goto fail_flush; + } + } } + if (xen_feature(XENFEAT_auto_translated_physmap)) + up_write(&info->vma->vm_mm->mmap_sem); /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/ info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx); @@ -1336,6 +1391,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, info->ufe_ring.req_prod_pvt); memcpy(target, req, sizeof(*req)); target->id = usr_idx; + wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ info->ufe_ring.req_prod_pvt++; return; @@ -1393,7 +1449,6 @@ static void make_response(blkif_t *blkif, unsigned long id, static int __init blkif_init(void) { int i,ret,blktap_dir; - tap_blkif_t *info; if (!is_running_on_xen()) return -ENODEV; @@ -1413,10 +1468,8 @@ static int __init blkif_init(void) tap_blkif_xenbus_init(); - /*Create the blktap devices, but do not map memory or waitqueue*/ - for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF; - - ret = register_chrdev(BLKTAP_DEV_MAJOR,"blktap",&blktap_fops); + /* Dynamically allocate a major for this device */ + ret = register_chrdev(0, "blktap", &blktap_fops); blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL); if ( (ret < 0)||(blktap_dir < 0) ) { @@ -1424,22 +1477,36 @@ static int __init blkif_init(void) return -ENOMEM; } - for(i = 0; i < MAX_TAP_DEV; i++ ) { - info = tapfds[i] = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL); - if(tapfds[i] == NULL) return -ENOMEM; - info->minor = i; - info->pid = 0; - info->blkif = NULL; + blktap_major = ret; - ret = devfs_mk_cdev(MKDEV(BLKTAP_DEV_MAJOR, i), - S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i); + /* tapfds[0] is always NULL */ + blktap_next_minor++; - if(ret != 0) return -ENOMEM; - info->dev_pending = info->dev_inuse = 0; + ret = devfs_mk_cdev(MKDEV(blktap_major, i), + S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i); - DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); + if(ret != 0) + return -ENOMEM; + + DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); + + /* Make sure the xen class exists */ + if (!setup_xen_class()) { + /* + * This will allow udev to create the blktap ctrl device. + * We only want to create blktap0 first. We don't want + * to flood the sysfs system with needless blktap devices. + * We only create the device when a request of a new device is + * made. + */ + class_device_create(xen_class, NULL, + MKDEV(blktap_major, 0), NULL, + "blktap0"); + } else { + /* this is bad, but not fatal */ + WPRINTK("blktap: sysfs xen_class not created\n"); } - + DPRINTK("Blktap device successfully created\n"); return 0; diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c index 6c16a2e60b..553ad45c48 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c @@ -189,7 +189,7 @@ static int blktap_probe(struct xenbus_device *dev, return 0; fail: - DPRINTK("blktap probe failed"); + DPRINTK("blktap probe failed\n"); blktap_remove(dev); return err; } @@ -243,7 +243,7 @@ static void tap_frontend_changed(struct xenbus_device *dev, struct backend_info *be = dev->dev.driver_data; int err; - DPRINTK(""); + DPRINTK("\n"); switch (frontend_state) { case XenbusStateInitialising: @@ -273,7 +273,6 @@ static void tap_frontend_changed(struct xenbus_device *dev, kthread_stop(be->blkif->xenblkd); be->blkif->xenblkd = NULL; } - tap_blkif_unmap(be->blkif); xenbus_switch_state(dev, XenbusStateClosing); break; @@ -319,7 +318,7 @@ static int connect_ring(struct backend_info *be) unsigned int evtchn; int err; - DPRINTK("%s", dev->otherend); + DPRINTK("%s\n", dev->otherend); err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref, "event-channel", "%u", &evtchn, NULL); diff --git a/linux-2.6-xen-sparse/drivers/xen/char/mem.c b/linux-2.6-xen-sparse/drivers/xen/char/mem.c index 6576135c99..ac85c8dbb2 100644 --- a/linux-2.6-xen-sparse/drivers/xen/char/mem.c +++ b/linux-2.6-xen-sparse/drivers/xen/char/mem.c @@ -28,13 +28,12 @@ #include <asm/io.h> #include <asm/hypervisor.h> -static inline int uncached_access(struct file *file) +#ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE +static inline int valid_phys_addr_range(unsigned long addr, size_t *count) { - if (file->f_flags & O_SYNC) - return 1; - /* Xen sets correct MTRR type on non-RAM for us. */ - return 0; + return 1; } +#endif /* * This funcion reads the *physical* memory. The f_pos points directly to the @@ -47,6 +46,9 @@ static ssize_t read_mem(struct file * file, char __user * buf, ssize_t read = 0, sz; void __iomem *v; + if (!valid_phys_addr_range(p, &count)) + return -EFAULT; + while (count > 0) { /* * Handle first page in case it's not aligned @@ -58,13 +60,15 @@ static ssize_t read_mem(struct file * file, char __user * buf, sz = min_t(unsigned long, sz, count); - if ((v = ioremap(p, sz)) == NULL) { + v = xlate_dev_mem_ptr(p, sz); + if (IS_ERR(v) || v == NULL) { /* - * Some programs (e.g., dmidecode) groove off into weird RAM - * areas where no tables can possibly exist (because Xen will - * have stomped on them!). These programs get rather upset if - * we let them know that Xen failed their access, so we fake - * out a read of all zeroes. :-) + * Some programs (e.g., dmidecode) groove off into + * weird RAM areas where no tables can possibly exist + * (because Xen will have stomped on them!). These + * programs get rather upset if we let them know that + * Xen failed their access, so we fake out a read of + * all zeroes. */ if (clear_user(buf, count)) return -EFAULT; @@ -73,7 +77,7 @@ static ssize_t read_mem(struct file * file, char __user * buf, } ignored = copy_to_user(buf, v, sz); - iounmap(v); + xlate_dev_mem_ptr_unmap(v); if (ignored) return -EFAULT; buf += sz; @@ -93,6 +97,9 @@ static ssize_t write_mem(struct file * file, const char __user * buf, ssize_t written = 0, sz; void __iomem *v; + if (!valid_phys_addr_range(p, &count)) + return -EFAULT; + while (count > 0) { /* * Handle first page in case it's not aligned @@ -104,11 +111,17 @@ static ssize_t write_mem(struct file * file, const char __user * buf, sz = min_t(unsigned long, sz, count); - if ((v = ioremap(p, sz)) == NULL) + v = xlate_dev_mem_ptr(p, sz); + if (v == NULL) break; + if (IS_ERR(v)) { + if (written == 0) + return PTR_ERR(v); + break; + } ignored = copy_from_user(v, buf, sz); - iounmap(v); + xlate_dev_mem_ptr_unmap(v); if (ignored) { written += sz - ignored; if (written) @@ -125,6 +138,15 @@ static ssize_t write_mem(struct file * file, const char __user * buf, return written; } +#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM +static inline int uncached_access(struct file *file) +{ + if (file->f_flags & O_SYNC) + return 1; + /* Xen sets correct MTRR type on non-RAM for us. */ + return 0; +} + static int mmap_mem(struct file * file, struct vm_area_struct * vma) { size_t size = vma->vm_end - vma->vm_start; @@ -136,6 +158,7 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, vma->vm_page_prot, DOMID_IO); } +#endif /* * The memory devices use the full 32/64 bits of the offset, and so we cannot diff --git a/linux-2.6-xen-sparse/drivers/xen/console/console.c b/linux-2.6-xen-sparse/drivers/xen/console/console.c index a45d21a69c..ff3e2a411e 100644 --- a/linux-2.6-xen-sparse/drivers/xen/console/console.c +++ b/linux-2.6-xen-sparse/drivers/xen/console/console.c @@ -49,6 +49,7 @@ #include <linux/console.h> #include <linux/bootmem.h> #include <linux/sysrq.h> +#include <linux/screen_info.h> #include <asm/io.h> #include <asm/irq.h> #include <asm/uaccess.h> @@ -266,6 +267,41 @@ void xencons_force_flush(void) } +void dom0_init_screen_info(const struct dom0_vga_console_info *info) +{ + switch (info->video_type) { + case XEN_VGATYPE_TEXT_MODE_3: + screen_info.orig_video_mode = 3; + screen_info.orig_video_ega_bx = 3; + screen_info.orig_video_isVGA = 1; + screen_info.orig_video_lines = info->u.text_mode_3.rows; + screen_info.orig_video_cols = info->u.text_mode_3.columns; + screen_info.orig_x = info->u.text_mode_3.cursor_x; + screen_info.orig_y = info->u.text_mode_3.cursor_y; + screen_info.orig_video_points = + info->u.text_mode_3.font_height; + break; + case XEN_VGATYPE_VESA_LFB: + screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; + screen_info.lfb_width = info->u.vesa_lfb.width; + screen_info.lfb_height = info->u.vesa_lfb.height; + screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel; + screen_info.lfb_base = info->u.vesa_lfb.lfb_base; + screen_info.lfb_size = info->u.vesa_lfb.lfb_size; + screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line; + screen_info.red_size = info->u.vesa_lfb.red_size; + screen_info.red_pos = info->u.vesa_lfb.red_pos; + screen_info.green_size = info->u.vesa_lfb.green_size; + screen_info.green_pos = info->u.vesa_lfb.green_pos; + screen_info.blue_size = info->u.vesa_lfb.blue_size; + screen_info.blue_pos = info->u.vesa_lfb.blue_pos; + screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size; + screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos; + break; + } +} + + /******************** User-space console driver (/dev/console) ************/ #define DRV(_d) (_d) diff --git a/linux-2.6-xen-sparse/drivers/xen/core/Makefile b/linux-2.6-xen-sparse/drivers/xen/core/Makefile index c1b0c1bd51..6154454339 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/Makefile +++ b/linux-2.6-xen-sparse/drivers/xen/core/Makefile @@ -9,5 +9,5 @@ obj-$(CONFIG_SYSFS) += hypervisor_sysfs.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o obj-$(CONFIG_XEN_SKBUFF) += skbuff.o -obj-$(CONFIG_XEN_REBOOT) += reboot.o +obj-$(CONFIG_XEN_REBOOT) += reboot.o machine_reboot.o obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o diff --git a/linux-2.6-xen-sparse/drivers/xen/core/features.c b/linux-2.6-xen-sparse/drivers/xen/core/features.c index 4d50caf50b..a76f58c04d 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/features.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/features.c @@ -11,6 +11,10 @@ #include <asm/hypervisor.h> #include <xen/features.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; /* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */ EXPORT_SYMBOL(xen_features); diff --git a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c index 3195279a87..c5132c13bb 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c @@ -44,6 +44,10 @@ #include <asm/io.h> #include <xen/interface/memory.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 diff --git a/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c b/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c new file mode 100644 index 0000000000..02ee7f4728 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c @@ -0,0 +1,185 @@ +#define __KERNEL_SYSCALLS__ +#include <linux/version.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/unistd.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> +#include <linux/stringify.h> +#include <asm/irq.h> +#include <asm/mmu_context.h> +#include <xen/evtchn.h> +#include <asm/hypervisor.h> +#include <xen/interface/dom0_ops.h> +#include <xen/xenbus.h> +#include <linux/cpu.h> +#include <linux/kthread.h> +#include <xen/gnttab.h> +#include <xen/xencons.h> +#include <xen/cpu_hotplug.h> + +#if defined(__i386__) || defined(__x86_64__) + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +void machine_emergency_restart(void) +{ + /* We really want to get pending console data out before we die. */ + xencons_force_flush(); + HYPERVISOR_shutdown(SHUTDOWN_reboot); +} + +void machine_restart(char * __unused) +{ + machine_emergency_restart(); +} + +void machine_halt(void) +{ + machine_power_off(); +} + +void machine_power_off(void) +{ + /* We really want to get pending console data out before we die. */ + xencons_force_flush(); + if (pm_power_off) + pm_power_off(); + HYPERVISOR_shutdown(SHUTDOWN_poweroff); +} + +int reboot_thru_bios = 0; /* for dmi_scan.c */ +EXPORT_SYMBOL(machine_restart); +EXPORT_SYMBOL(machine_halt); +EXPORT_SYMBOL(machine_power_off); + +/* Ensure we run on the idle task page tables so that we will + switch page tables before running user space. This is needed + on architectures with separate kernel and user page tables + because the user page table pointer is not saved/restored. */ +static void switch_idle_mm(void) +{ + struct mm_struct *mm = current->active_mm; + + if (mm == &init_mm) + return; + + atomic_inc(&init_mm.mm_count); + switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; + mmdrop(mm); +} + +static void pre_suspend(void) +{ + HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; + clear_fixmap(FIX_SHARED_INFO); + + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); +} + +static void post_suspend(void) +{ + int i, j, k, fpp; + extern unsigned long max_pfn; + extern unsigned long *pfn_to_mfn_frame_list_list; + extern unsigned long *pfn_to_mfn_frame_list[]; + + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + + memset(empty_zero_page, 0, PAGE_SIZE); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(pfn_to_mfn_frame_list_list); + + fpp = PAGE_SIZE/sizeof(unsigned long); + for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { + if ((j % fpp) == 0) { + k++; + pfn_to_mfn_frame_list_list[k] = + virt_to_mfn(pfn_to_mfn_frame_list[k]); + j = 0; + } + pfn_to_mfn_frame_list[k][j] = + virt_to_mfn(&phys_to_machine_mapping[i]); + } + HYPERVISOR_shared_info->arch.max_pfn = max_pfn; +} + +#else /* !(defined(__i386__) || defined(__x86_64__)) */ + +#define switch_idle_mm() ((void)0) +#define mm_pin_all() ((void)0) +#define pre_suspend() ((void)0) +#define post_suspend() ((void)0) + +#endif + +int __xen_suspend(void) +{ + int err; + + extern void time_resume(void); + + BUG_ON(smp_processor_id() != 0); + BUG_ON(in_interrupt()); + +#if defined(__i386__) || defined(__x86_64__) + if (xen_feature(XENFEAT_auto_translated_physmap)) { + printk(KERN_WARNING "Cannot suspend in " + "auto_translated_physmap mode.\n"); + return -EOPNOTSUPP; + } +#endif + + err = smp_suspend(); + if (err) + return err; + + xenbus_suspend(); + + preempt_disable(); + + mm_pin_all(); + local_irq_disable(); + preempt_enable(); + + gnttab_suspend(); + + pre_suspend(); + + /* + * We'll stop somewhere inside this hypercall. When it returns, + * we'll start resuming after the restore. + */ + HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + + post_suspend(); + + gnttab_resume(); + + irq_resume(); + + time_resume(); + + switch_idle_mm(); + + local_irq_enable(); + + xencons_resume(); + + xenbus_resume(); + + smp_resume(); + + return err; +} diff --git a/linux-2.6-xen-sparse/drivers/xen/core/reboot.c b/linux-2.6-xen-sparse/drivers/xen/core/reboot.c index 34c3930961..af3fe3a15c 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/reboot.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/reboot.c @@ -1,25 +1,15 @@ #define __KERNEL_SYSCALLS__ #include <linux/version.h> #include <linux/kernel.h> -#include <linux/mm.h> #include <linux/unistd.h> #include <linux/module.h> #include <linux/reboot.h> #include <linux/sysrq.h> -#include <linux/stringify.h> -#include <asm/irq.h> -#include <asm/mmu_context.h> -#include <xen/evtchn.h> #include <asm/hypervisor.h> -#include <xen/interface/dom0_ops.h> #include <xen/xenbus.h> -#include <linux/cpu.h> #include <linux/kthread.h> -#include <xen/gnttab.h> -#include <xen/xencons.h> -#include <xen/cpu_hotplug.h> -extern void ctrl_alt_del(void); +MODULE_LICENSE("Dual BSD/GPL"); #define SHUTDOWN_INVALID -1 #define SHUTDOWN_POWEROFF 0 @@ -31,186 +21,18 @@ extern void ctrl_alt_del(void); */ #define SHUTDOWN_HALT 4 -#if defined(__i386__) || defined(__x86_64__) - -/* - * Power off function, if any - */ -void (*pm_power_off)(void); -EXPORT_SYMBOL(pm_power_off); - -void machine_emergency_restart(void) -{ - /* We really want to get pending console data out before we die. */ - xencons_force_flush(); - HYPERVISOR_shutdown(SHUTDOWN_reboot); -} - -void machine_restart(char * __unused) -{ - machine_emergency_restart(); -} - -void machine_halt(void) -{ - machine_power_off(); -} - -void machine_power_off(void) -{ - /* We really want to get pending console data out before we die. */ - xencons_force_flush(); - if (pm_power_off) - pm_power_off(); - HYPERVISOR_shutdown(SHUTDOWN_poweroff); -} - -int reboot_thru_bios = 0; /* for dmi_scan.c */ -EXPORT_SYMBOL(machine_restart); -EXPORT_SYMBOL(machine_halt); -EXPORT_SYMBOL(machine_power_off); - -#endif /* defined(__i386__) || defined(__x86_64__) */ - -/****************************************************************************** - * Stop/pickle callback handling. - */ - /* Ignore multiple shutdown requests. */ static int shutting_down = SHUTDOWN_INVALID; + static void __shutdown_handler(void *unused); static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); -#if defined(__i386__) || defined(__x86_64__) - -/* Ensure we run on the idle task page tables so that we will - switch page tables before running user space. This is needed - on architectures with separate kernel and user page tables - because the user page table pointer is not saved/restored. */ -static void switch_idle_mm(void) -{ - struct mm_struct *mm = current->active_mm; - - if (mm == &init_mm) - return; - - atomic_inc(&init_mm.mm_count); - switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; - mmdrop(mm); -} - -static void pre_suspend(void) -{ - HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; - clear_fixmap(FIX_SHARED_INFO); - - xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); - xen_start_info->console.domU.mfn = - mfn_to_pfn(xen_start_info->console.domU.mfn); -} - -static void post_suspend(void) -{ - int i, j, k, fpp; - extern unsigned long max_pfn; - extern unsigned long *pfn_to_mfn_frame_list_list; - extern unsigned long *pfn_to_mfn_frame_list[]; - - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); - - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - - memset(empty_zero_page, 0, PAGE_SIZE); - - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(pfn_to_mfn_frame_list_list); - - fpp = PAGE_SIZE/sizeof(unsigned long); - for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { - if ((j % fpp) == 0) { - k++; - pfn_to_mfn_frame_list_list[k] = - virt_to_mfn(pfn_to_mfn_frame_list[k]); - j = 0; - } - pfn_to_mfn_frame_list[k][j] = - virt_to_mfn(&phys_to_machine_mapping[i]); - } - HYPERVISOR_shared_info->arch.max_pfn = max_pfn; -} - -#else /* !(defined(__i386__) || defined(__x86_64__)) */ - -#define switch_idle_mm() ((void)0) -#define mm_pin_all() ((void)0) -#define pre_suspend() ((void)0) -#define post_suspend() ((void)0) - -#endif - -static int __do_suspend(void *ignore) -{ - int err; - - extern void time_resume(void); - - BUG_ON(smp_processor_id() != 0); - BUG_ON(in_interrupt()); - -#if defined(__i386__) || defined(__x86_64__) - if (xen_feature(XENFEAT_auto_translated_physmap)) { - printk(KERN_WARNING "Cannot suspend in " - "auto_translated_physmap mode.\n"); - return -EOPNOTSUPP; - } +#ifdef CONFIG_XEN +int __xen_suspend(void); +#else +#define __xen_suspend() (void)0 #endif - err = smp_suspend(); - if (err) - return err; - - xenbus_suspend(); - - preempt_disable(); - - mm_pin_all(); - local_irq_disable(); - preempt_enable(); - - gnttab_suspend(); - - pre_suspend(); - - /* - * We'll stop somewhere inside this hypercall. When it returns, - * we'll start resuming after the restore. - */ - HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); - - shutting_down = SHUTDOWN_INVALID; - - post_suspend(); - - gnttab_resume(); - - irq_resume(); - - time_resume(); - - switch_idle_mm(); - - local_irq_enable(); - - xencons_resume(); - - xenbus_resume(); - - smp_resume(); - - return err; -} - static int shutdown_process(void *__unused) { static char *envp[] = { "HOME=/", "TERM=linux", @@ -222,11 +44,13 @@ static int shutdown_process(void *__unused) if ((shutting_down == SHUTDOWN_POWEROFF) || (shutting_down == SHUTDOWN_HALT)) { - if (execve("/sbin/poweroff", poweroff_argv, envp) < 0) { + if (call_usermodehelper("/sbin/poweroff", poweroff_argv, envp, 0) < 0) { +#ifdef CONFIG_XEN sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_POWER_OFF, NULL); +#endif /* CONFIG_XEN */ } } @@ -235,6 +59,13 @@ static int shutdown_process(void *__unused) return 0; } +static int xen_suspend(void *__unused) +{ + __xen_suspend(); + shutting_down = SHUTDOWN_INVALID; + return 0; +} + static int kthread_create_on_cpu(int (*f)(void *arg), void *arg, const char *name, @@ -257,7 +88,7 @@ static void __shutdown_handler(void *unused) err = kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES); else - err = kthread_create_on_cpu(__do_suspend, NULL, "suspend", 0); + err = kthread_create_on_cpu(xen_suspend, NULL, "suspend", 0); if (err < 0) { printk(KERN_WARNING "Error creating shutdown process (%d): " @@ -298,7 +129,7 @@ static void shutdown_handler(struct xenbus_watch *watch, if (strcmp(str, "poweroff") == 0) shutting_down = SHUTDOWN_POWEROFF; else if (strcmp(str, "reboot") == 0) - ctrl_alt_del(); + kill_proc(1, SIGINT, 1); /* interrupt init */ else if (strcmp(str, "suspend") == 0) shutting_down = SHUTDOWN_SUSPEND; else if (strcmp(str, "halt") == 0) @@ -364,10 +195,14 @@ static int setup_shutdown_watcher(struct notifier_block *notifier, err = register_xenbus_watch(&shutdown_watch); if (err) printk(KERN_ERR "Failed to set shutdown watcher\n"); + else + xenbus_write(XBT_NIL, "control", "feature-reboot", "1"); err = register_xenbus_watch(&sysrq_watch); if (err) printk(KERN_ERR "Failed to set sysrq watcher\n"); + else + xenbus_write(XBT_NIL, "control", "feature-sysrq", "1"); return NOTIFY_DONE; } @@ -378,6 +213,7 @@ static int __init setup_shutdown_event(void) .notifier_call = setup_shutdown_watcher }; register_xenstore_notifier(&xenstore_notifier); + return 0; } diff --git a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c index a4a2e4edce..2fa88069c4 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c @@ -18,7 +18,12 @@ /*static*/ kmem_cache_t *skbuff_cachep; EXPORT_SYMBOL(skbuff_cachep); -#define MAX_SKBUFF_ORDER 4 +/* Allow up to 64kB or page-sized packets (whichever is greater). */ +#if PAGE_SHIFT < 16 +#define MAX_SKBUFF_ORDER (16 - PAGE_SHIFT) +#else +#define MAX_SKBUFF_ORDER 0 +#endif static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1]; static struct { diff --git a/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c b/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c index 76bfab82e6..32f8de5bff 100644 --- a/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c +++ b/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c @@ -419,10 +419,9 @@ static struct file_operations evtchn_fops = { }; static struct miscdevice evtchn_miscdev = { - .minor = EVTCHN_MINOR, + .minor = MISC_DYNAMIC_MINOR, .name = "evtchn", .fops = &evtchn_fops, - .devfs_name = "misc/evtchn", }; static int __init evtchn_init(void) diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/common.h b/linux-2.6-xen-sparse/drivers/xen/netback/common.h index 434ff6bcf2..367c008d3b 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/common.h +++ b/linux-2.6-xen-sparse/drivers/xen/netback/common.h @@ -92,6 +92,9 @@ typedef struct netif_st { unsigned long remaining_credit; struct timer_list credit_timeout; + /* Enforce draining of the transmit queue. */ + struct timer_list tx_queue_timeout; + /* Miscellaneous private stuff. */ struct list_head list; /* scheduling list */ atomic_t refcnt; @@ -106,7 +109,7 @@ typedef struct netif_st { void netif_disconnect(netif_t *netif); -netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]); +netif_t *netif_alloc(domid_t domid, unsigned int handle); int netif_map(netif_t *netif, unsigned long tx_ring_ref, unsigned long rx_ring_ref, unsigned int evtchn); @@ -119,6 +122,8 @@ int netif_map(netif_t *netif, unsigned long tx_ring_ref, void netif_xenbus_init(void); +#define netif_schedulable(dev) (netif_running(dev) && netif_carrier_ok(dev)) + void netif_schedule_work(netif_t *netif); void netif_deschedule_work(netif_t *netif); diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/interface.c b/linux-2.6-xen-sparse/drivers/xen/netback/interface.c index d60b23b0f2..9fae954bd2 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/interface.c +++ b/linux-2.6-xen-sparse/drivers/xen/netback/interface.c @@ -34,6 +34,23 @@ #include <linux/ethtool.h> #include <linux/rtnetlink.h> +/* + * Module parameter 'queue_length': + * + * Enables queuing in the network stack when a client has run out of receive + * descriptors. Although this feature can improve receive bandwidth by avoiding + * packet loss, it can also result in packets sitting in the 'tx_queue' for + * unbounded time. This is bad if those packets hold onto foreign resources. + * For example, consider a packet that holds onto resources belonging to the + * guest for which it is queued (e.g., packet received on vif1.0, destined for + * vif1.1 which is not activated in the guest): in this situation the guest + * will never be destroyed, unless vif1.1 is taken down. To avoid this, we + * run a timer (tx_queue_timeout) to drain the queue when the interface is + * blocked. + */ +static unsigned long netbk_queue_length = 32; +module_param_named(queue_length, netbk_queue_length, ulong, 0); + static void __netif_up(netif_t *netif) { enable_irq(netif->irq); @@ -107,9 +124,9 @@ static struct ethtool_ops network_ethtool_ops = .get_link = ethtool_op_get_link, }; -netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]) +netif_t *netif_alloc(domid_t domid, unsigned int handle) { - int err = 0, i; + int err = 0; struct net_device *dev; netif_t *netif; char name[IFNAMSIZ] = {}; @@ -134,6 +151,10 @@ netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]) netif->credit_bytes = netif->remaining_credit = ~0UL; netif->credit_usec = 0UL; init_timer(&netif->credit_timeout); + /* Initialize 'expires' now: it's used to track the credit window. */ + netif->credit_timeout.expires = jiffies; + + init_timer(&netif->tx_queue_timeout); dev->hard_start_xmit = netif_be_start_xmit; dev->get_stats = netif_be_get_stats; @@ -144,26 +165,16 @@ netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]) SET_ETHTOOL_OPS(dev, &network_ethtool_ops); + dev->tx_queue_len = netbk_queue_length; + /* - * Reduce default TX queuelen so that each guest interface only - * allows it to eat around 6.4MB of host memory. - */ - dev->tx_queue_len = 100; - - for (i = 0; i < ETH_ALEN; i++) - if (be_mac[i] != 0) - break; - if (i == ETH_ALEN) { - /* - * Initialise a dummy MAC address. We choose the numerically - * largest non-broadcast address to prevent the address getting - * stolen by an Ethernet bridge for STP purposes. - * (FE:FF:FF:FF:FF:FF) - */ - memset(dev->dev_addr, 0xFF, ETH_ALEN); - dev->dev_addr[0] &= ~0x01; - } else - memcpy(dev->dev_addr, be_mac, ETH_ALEN); + * Initialise a dummy MAC address. We choose the numerically + * largest non-broadcast address to prevent the address getting + * stolen by an Ethernet bridge for STP purposes. + * (FE:FF:FF:FF:FF:FF) + */ + memset(dev->dev_addr, 0xFF, ETH_ALEN); + dev->dev_addr[0] &= ~0x01; rtnl_lock(); err = register_netdevice(dev); @@ -306,11 +317,23 @@ err_rx: return err; } -static void netif_free(netif_t *netif) +void netif_disconnect(netif_t *netif) { + if (netif_carrier_ok(netif->dev)) { + rtnl_lock(); + netif_carrier_off(netif->dev); + if (netif_running(netif->dev)) + __netif_down(netif); + rtnl_unlock(); + netif_put(netif); + } + atomic_dec(&netif->refcnt); wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0); + del_timer_sync(&netif->credit_timeout); + del_timer_sync(&netif->tx_queue_timeout); + if (netif->irq) unbind_from_irqhandler(netif->irq, netif); @@ -324,16 +347,3 @@ static void netif_free(netif_t *netif) free_netdev(netif->dev); } - -void netif_disconnect(netif_t *netif) -{ - if (netif_carrier_ok(netif->dev)) { - rtnl_lock(); - netif_carrier_off(netif->dev); - if (netif_running(netif->dev)) - __netif_down(netif); - rtnl_unlock(); - netif_put(netif); - } - netif_free(netif); -} diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c index 391ace8a02..d021c9689a 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c +++ b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c @@ -53,8 +53,10 @@ #include <linux/skbuff.h> #include <linux/ethtool.h> #include <net/dst.h> +#include <net/xfrm.h> /* secpath_reset() */ +#include <asm/hypervisor.h> /* is_initial_xendomain() */ -static int nloopbacks = 8; +static int nloopbacks = -1; module_param(nloopbacks, int, 0); MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create"); @@ -77,10 +79,60 @@ static int loopback_close(struct net_device *dev) return 0; } +#ifdef CONFIG_X86 +static int is_foreign(unsigned long pfn) +{ + /* NB. Play it safe for auto-translation mode. */ + return (xen_feature(XENFEAT_auto_translated_physmap) || + (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT)); +} +#else +/* How to detect a foreign mapping? Play it safe. */ +#define is_foreign(pfn) (1) +#endif + +static int skb_remove_foreign_references(struct sk_buff *skb) +{ + struct page *page; + unsigned long pfn; + int i, off; + char *vaddr; + + BUG_ON(skb_shinfo(skb)->frag_list); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page); + if (!is_foreign(pfn)) + continue; + + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!page)) + return 0; + + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + off = skb_shinfo(skb)->frags[i].page_offset; + memcpy(page_address(page) + off, + vaddr + off, + skb_shinfo(skb)->frags[i].size); + kunmap_skb_frag(vaddr); + + put_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->frags[i].page = page; + } + + return 1; +} + static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_private *np = netdev_priv(dev); + if (!skb_remove_foreign_references(skb)) { + np->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; + } + dst_release(skb->dst); skb->dst = NULL; @@ -110,6 +162,11 @@ static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->protocol = eth_type_trans(skb, dev); skb->dev = dev; dev->last_rx = jiffies; + + /* Flush netfilter context: rx'ed skbuffs not expected to have any. */ + nf_reset(skb); + secpath_reset(skb); + netif_rx(skb); return 0; @@ -239,6 +296,9 @@ static int __init loopback_init(void) { int i, err = 0; + if (nloopbacks == -1) + nloopbacks = is_initial_xendomain() ? 4 : 0; + for (i = 0; i < nloopbacks; i++) if ((err = make_loopback(i)) != 0) break; diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c index ad8236c82f..1d24fc9b88 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c +++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c @@ -70,14 +70,15 @@ static struct timer_list net_timer; static struct sk_buff_head rx_queue; -static unsigned long mmap_vstart; -#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) - -static void *rx_mmap_area; +static struct page **mmap_pages; +static inline unsigned long idx_to_kaddr(unsigned int idx) +{ + return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx])); +} #define PKT_PROT_LEN 64 -static struct { +static struct pending_tx_info { netif_tx_request_t req; netif_t *netif; } pending_tx_info[MAX_PENDING_REQS]; @@ -186,7 +187,7 @@ static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) if (unlikely(!nskb)) goto err; - skb_reserve(nskb, 16); + skb_reserve(nskb, 16 + NET_IP_ALIGN); headlen = nskb->end - nskb->data; if (headlen > skb_headlen(skb)) headlen = skb_headlen(skb); @@ -217,7 +218,7 @@ static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) copy = len >= PAGE_SIZE ? PAGE_SIZE : len; zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO; - page = alloc_page(GFP_ATOMIC | zero); + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero); if (unlikely(!page)) goto err_free; @@ -263,6 +264,13 @@ static inline int netbk_queue_full(netif_t *netif) ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed); } +static void tx_queue_callback(unsigned long data) +{ + netif_t *netif = (netif_t *)data; + if (netif_schedulable(netif->dev)) + netif_wake_queue(netif->dev); +} + int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) { netif_t *netif = netdev_priv(dev); @@ -270,20 +278,13 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) BUG_ON(skb->dev != dev); /* Drop the packet if the target domain has no receive buffers. */ - if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) + if (unlikely(!netif_schedulable(dev) || netbk_queue_full(netif))) goto drop; - if (unlikely(netbk_queue_full(netif))) { - /* Not a BUG_ON() -- misbehaving netfront can trigger this. */ - if (netbk_can_queue(dev)) - DPRINTK("Queue full but not stopped!\n"); - goto drop; - } - - /* Copy the packet here if it's destined for a flipping - interface but isn't flippable (e.g. extra references to - data) - */ + /* + * Copy the packet here if it's destined for a flipping interface + * but isn't flippable (e.g. extra references to data). + */ if (!netif->copying_receiver && !is_flippable_skb(skb)) { struct sk_buff *nskb = netbk_copy_skb(skb); if ( unlikely(nskb == NULL) ) @@ -304,8 +305,19 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) netif->rx.sring->req_event = netif->rx_req_cons_peek + netbk_max_required_rx_slots(netif); mb(); /* request notification /then/ check & stop the queue */ - if (netbk_queue_full(netif)) + if (netbk_queue_full(netif)) { netif_stop_queue(dev); + /* + * Schedule 500ms timeout to restart the queue, thus + * ensuring that an inactive queue will be drained. + * Packets will be immediately be dropped until more + * receive buffers become available (see + * netbk_queue_full() check above). + */ + netif->tx_queue_timeout.data = (unsigned long)netif; + netif->tx_queue_timeout.function = tx_queue_callback; + __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2); + } } skb_queue_tail(&rx_queue, skb); @@ -373,14 +385,22 @@ static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta, flipped. */ meta->copy = 1; copy_gop = npo->copy + npo->copy_prod++; - copy_gop->source.domid = DOMID_SELF; + copy_gop->flags = GNTCOPY_dest_gref; + if (PageForeign(page)) { + struct pending_tx_info *src_pend = + &pending_tx_info[page->index]; + copy_gop->source.domid = src_pend->netif->domid; + copy_gop->source.u.ref = src_pend->req.gref; + copy_gop->flags |= GNTCOPY_source_gref; + } else { + copy_gop->source.domid = DOMID_SELF; + copy_gop->source.u.gmfn = old_mfn; + } copy_gop->source.offset = offset; - copy_gop->source.u.gmfn = old_mfn; copy_gop->dest.domid = netif->domid; copy_gop->dest.offset = 0; copy_gop->dest.u.ref = req->gref; copy_gop->len = size; - copy_gop->flags = GNTCOPY_dest_gref; } else { meta->copy = 0; if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -474,7 +494,7 @@ static int netbk_check_gop(int nr_frags, domid_t domid, copy_op = npo->copy + npo->copy_cons++; if (copy_op->status != GNTST_okay) { DPRINTK("Bad status %d from copy to DOM%d.\n", - gop->status, domid); + copy_op->status, domid); status = NETIF_RSP_ERROR; } } else { @@ -697,6 +717,7 @@ static void net_rx_action(unsigned long unused) } if (netif_queue_stopped(netif->dev) && + netif_schedulable(netif->dev) && !netbk_queue_full(netif)) netif_wake_queue(netif->dev); @@ -754,8 +775,7 @@ static void add_to_net_schedule_list_tail(netif_t *netif) spin_lock_irq(&net_schedule_list_lock); if (!__on_net_schedule_list(netif) && - likely(netif_running(netif->dev) && - netif_carrier_ok(netif->dev))) { + likely(netif_schedulable(netif->dev))) { list_add_tail(&netif->list, &net_schedule_list); netif_get(netif); } @@ -792,10 +812,30 @@ void netif_deschedule_work(netif_t *netif) } +static void tx_add_credit(netif_t *netif) +{ + unsigned long max_burst, max_credit; + + /* + * Allow a burst big enough to transmit a jumbo packet of up to 128kB. + * Otherwise the interface can seize up due to insufficient credit. + */ + max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size; + max_burst = min(max_burst, 131072UL); + max_burst = max(max_burst, netif->credit_bytes); + + /* Take care that adding a new chunk of credit doesn't wrap to zero. */ + max_credit = netif->remaining_credit + netif->credit_bytes; + if (max_credit < netif->remaining_credit) + max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */ + + netif->remaining_credit = min(max_credit, max_burst); +} + static void tx_credit_callback(unsigned long data) { netif_t *netif = (netif_t *)data; - netif->remaining_credit = netif->credit_bytes; + tx_add_credit(netif); netif_schedule_work(netif); } @@ -819,7 +859,7 @@ inline static void net_tx_action_dealloc(void) gop = tx_unmap_ops; while (dc != dp) { pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; - gnttab_set_unmap_op(gop, MMAP_VADDR(pending_idx), + gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx), GNTMAP_host_map, grant_tx_handle[pending_idx]); gop++; @@ -857,20 +897,28 @@ static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end) netif_put(netif); } -static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp, - int work_to_do) +static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first, + netif_tx_request_t *txp, int work_to_do) { - netif_tx_request_t *first = txp; RING_IDX cons = netif->tx.req_cons; int frags = 0; - while (txp->flags & NETTXF_more_data) { + if (!(first->flags & NETTXF_more_data)) + return 0; + + do { if (frags >= work_to_do) { DPRINTK("Need more frags\n"); return -frags; } - txp = RING_GET_REQUEST(&netif->tx, cons + frags); + if (unlikely(frags >= MAX_SKB_FRAGS)) { + DPRINTK("Too many frags\n"); + return -frags; + } + + memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags), + sizeof(*txp)); if (txp->size > first->size) { DPRINTK("Frags galore\n"); return -frags; @@ -884,30 +932,28 @@ static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp, txp->offset, txp->size); return -frags; } - } + } while ((txp++)->flags & NETTXF_more_data); return frags; } static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif, struct sk_buff *skb, + netif_tx_request_t *txp, gnttab_map_grant_ref_t *mop) { struct skb_shared_info *shinfo = skb_shinfo(skb); skb_frag_t *frags = shinfo->frags; - netif_tx_request_t *txp; unsigned long pending_idx = *((u16 *)skb->data); - RING_IDX cons = netif->tx.req_cons; int i, start; /* Skip first skb fragment if it is on same page as header fragment. */ start = ((unsigned long)shinfo->frags[0].page == pending_idx); - for (i = start; i < shinfo->nr_frags; i++) { - txp = RING_GET_REQUEST(&netif->tx, cons++); + for (i = start; i < shinfo->nr_frags; i++, txp++) { pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)]; - gnttab_set_map_op(mop++, MMAP_VADDR(pending_idx), + gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx), GNTMAP_host_map | GNTMAP_readonly, txp->gref, netif->domid); @@ -940,7 +986,7 @@ static int netbk_tx_check_mop(struct sk_buff *skb, netif_put(netif); } else { set_phys_to_machine( - __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT, + __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT, FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT)); grant_tx_handle[pending_idx] = mop->handle; } @@ -957,7 +1003,7 @@ static int netbk_tx_check_mop(struct sk_buff *skb, newerr = (++mop)->status; if (likely(!newerr)) { set_phys_to_machine( - __pa(MMAP_VADDR(pending_idx))>>PAGE_SHIFT, + __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT, FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT)); grant_tx_handle[pending_idx] = mop->handle; /* Had a previous error? Invalidate this fragment. */ @@ -1005,7 +1051,7 @@ static void netbk_fill_frags(struct sk_buff *skb) pending_idx = (unsigned long)frag->page; txp = &pending_tx_info[pending_idx].req; - frag->page = virt_to_page(MMAP_VADDR(pending_idx)); + frag->page = virt_to_page(idx_to_kaddr(pending_idx)); frag->size = txp->size; frag->page_offset = txp->offset; @@ -1018,7 +1064,7 @@ static void netbk_fill_frags(struct sk_buff *skb) int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras, int work_to_do) { - struct netif_extra_info *extra; + struct netif_extra_info extra; RING_IDX cons = netif->tx.req_cons; do { @@ -1027,18 +1073,18 @@ int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras, return -EBADR; } - extra = (struct netif_extra_info *) - RING_GET_REQUEST(&netif->tx, cons); - if (unlikely(!extra->type || - extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { + memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons), + sizeof(extra)); + if (unlikely(!extra.type || + extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) { netif->tx.req_cons = ++cons; - DPRINTK("Invalid extra type: %d\n", extra->type); + DPRINTK("Invalid extra type: %d\n", extra.type); return -EINVAL; } - memcpy(&extras[extra->type - 1], extra, sizeof(*extra)); + memcpy(&extras[extra.type - 1], &extra, sizeof(extra)); netif->tx.req_cons = ++cons; - } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); + } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE); return work_to_do; } @@ -1073,6 +1119,7 @@ static void net_tx_action(unsigned long unused) struct sk_buff *skb; netif_t *netif; netif_tx_request_t txreq; + netif_tx_request_t txfrags[MAX_SKB_FRAGS]; struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; u16 pending_idx; RING_IDX i; @@ -1101,6 +1148,7 @@ static void net_tx_action(unsigned long unused) i = netif->tx.req_cons; rmb(); /* Ensure that we see the request before we copy it. */ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq)); + /* Credit-based scheduling. */ if (txreq.size > netif->remaining_credit) { unsigned long now = jiffies; @@ -1109,25 +1157,27 @@ static void net_tx_action(unsigned long unused) msecs_to_jiffies(netif->credit_usec / 1000); /* Timer could already be pending in rare cases. */ - if (timer_pending(&netif->credit_timeout)) - break; + if (timer_pending(&netif->credit_timeout)) { + netif_put(netif); + continue; + } /* Passed the point where we can replenish credit? */ if (time_after_eq(now, next_credit)) { netif->credit_timeout.expires = now; - netif->remaining_credit = netif->credit_bytes; + tx_add_credit(netif); } /* Still too big to send right now? Set a callback. */ if (txreq.size > netif->remaining_credit) { - netif->remaining_credit = 0; netif->credit_timeout.data = (unsigned long)netif; netif->credit_timeout.function = tx_credit_callback; __mod_timer(&netif->credit_timeout, next_credit); - break; + netif_put(netif); + continue; } } netif->remaining_credit -= txreq.size; @@ -1146,19 +1196,13 @@ static void net_tx_action(unsigned long unused) } } - ret = netbk_count_requests(netif, &txreq, work_to_do); + ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do); if (unlikely(ret < 0)) { netbk_tx_err(netif, &txreq, i - ret); continue; } i += ret; - if (unlikely(ret > MAX_SKB_FRAGS)) { - DPRINTK("Too many frags\n"); - netbk_tx_err(netif, &txreq, i); - continue; - } - if (unlikely(txreq.size < ETH_HLEN)) { DPRINTK("Bad packet size: %d\n", txreq.size); netbk_tx_err(netif, &txreq, i); @@ -1180,7 +1224,7 @@ static void net_tx_action(unsigned long unused) ret < MAX_SKB_FRAGS) ? PKT_PROT_LEN : txreq.size; - skb = alloc_skb(data_len+16, GFP_ATOMIC); + skb = alloc_skb(data_len + 16 + NET_IP_ALIGN, GFP_ATOMIC); if (unlikely(skb == NULL)) { DPRINTK("Can't allocate a skb in start_xmit.\n"); netbk_tx_err(netif, &txreq, i); @@ -1188,7 +1232,7 @@ static void net_tx_action(unsigned long unused) } /* Packets passed to netif_rx() must have some headroom. */ - skb_reserve(skb, 16); + skb_reserve(skb, 16 + NET_IP_ALIGN); if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { struct netif_extra_info *gso; @@ -1201,7 +1245,7 @@ static void net_tx_action(unsigned long unused) } } - gnttab_set_map_op(mop, MMAP_VADDR(pending_idx), + gnttab_set_map_op(mop, idx_to_kaddr(pending_idx), GNTMAP_host_map | GNTMAP_readonly, txreq.gref, netif->domid); mop++; @@ -1227,7 +1271,7 @@ static void net_tx_action(unsigned long unused) pending_cons++; - mop = netbk_get_requests(netif, skb, mop); + mop = netbk_get_requests(netif, skb, txfrags, mop); netif->tx.req_cons = i; netif_schedule_work(netif); @@ -1260,8 +1304,8 @@ static void net_tx_action(unsigned long unused) } data_len = skb->len; - memcpy(skb->data, - (void *)(MMAP_VADDR(pending_idx)|txp->offset), + memcpy(skb->data, + (void *)(idx_to_kaddr(pending_idx)|txp->offset), data_len); if (data_len < txp->size) { /* Append the packet payload as a fragment. */ @@ -1315,18 +1359,10 @@ static void netif_idx_release(u16 pending_idx) static void netif_page_release(struct page *page) { - u16 pending_idx = page - virt_to_page(mmap_vstart); - /* Ready for next use. */ set_page_count(page, 1); - netif_idx_release(pending_idx); -} - -static void netif_rx_page_release(struct page *page) -{ - /* Ready for next use. */ - set_page_count(page, 1); + netif_idx_release(page->index); } irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) @@ -1336,7 +1372,7 @@ irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) add_to_net_schedule_list_tail(netif); maybe_schedule_tx_action(); - if (netif_queue_stopped(netif->dev) && !netbk_queue_full(netif)) + if (netif_schedulable(netif->dev) && !netbk_queue_full(netif)) netif_wake_queue(netif->dev); return IRQ_HANDLED; @@ -1446,27 +1482,17 @@ static int __init netback_init(void) init_timer(&net_timer); net_timer.data = 0; net_timer.function = net_alarm; - - page = balloon_alloc_empty_page_range(MAX_PENDING_REQS); - if (page == NULL) - return -ENOMEM; - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS); + if (mmap_pages == NULL) { + printk("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; + } for (i = 0; i < MAX_PENDING_REQS; i++) { - page = virt_to_page(MMAP_VADDR(i)); - set_page_count(page, 1); + page = mmap_pages[i]; SetPageForeign(page, netif_page_release); - } - - page = balloon_alloc_empty_page_range(NET_RX_RING_SIZE); - BUG_ON(page == NULL); - rx_mmap_area = pfn_to_kaddr(page_to_pfn(page)); - - for (i = 0; i < NET_RX_RING_SIZE; i++) { - page = virt_to_page(rx_mmap_area + (i * PAGE_SIZE)); - set_page_count(page, 1); - SetPageForeign(page, netif_rx_page_release); + page->index = i; } pending_cons = 0; diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c index 6da614fc0c..7d301965f4 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c +++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c @@ -28,29 +28,20 @@ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) #endif -struct backend_info -{ +struct backend_info { struct xenbus_device *dev; netif_t *netif; - struct xenbus_watch backend_watch; enum xenbus_state frontend_state; }; static int connect_rings(struct backend_info *); static void connect(struct backend_info *); -static void maybe_connect(struct backend_info *); -static void backend_changed(struct xenbus_watch *, const char **, - unsigned int); +static void backend_create_netif(struct backend_info *be); static int netback_remove(struct xenbus_device *dev) { struct backend_info *be = dev->dev.driver_data; - if (be->backend_watch.node) { - unregister_xenbus_watch(&be->backend_watch); - kfree(be->backend_watch.node); - be->backend_watch.node = NULL; - } if (be->netif) { netif_disconnect(be->netif); be->netif = NULL; @@ -63,8 +54,7 @@ static int netback_remove(struct xenbus_device *dev) /** * Entry point to this code when a new device is created. Allocate the basic - * structures, and watch the store waiting for the hotplug scripts to tell us - * the device's handle. Switch to InitWait. + * structures and switch to InitWait. */ static int netback_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) @@ -83,11 +73,6 @@ static int netback_probe(struct xenbus_device *dev, be->dev = dev; dev->dev.driver_data = be; - err = xenbus_watch_path2(dev, dev->nodename, "handle", - &be->backend_watch, backend_changed); - if (err) - goto fail; - do { err = xenbus_transaction_start(&xbt); if (err) { @@ -108,9 +93,22 @@ static int netback_probe(struct xenbus_device *dev, goto abort_transaction; } - err = xenbus_printf(xbt, dev->nodename, "feature-rx-copy", "%d", 1); + /* We support rx-copy path. */ + err = xenbus_printf(xbt, dev->nodename, + "feature-rx-copy", "%d", 1); + if (err) { + message = "writing feature-rx-copy"; + goto abort_transaction; + } + + /* + * We don't support rx-flip path (except old guests who don't + * grok this feature flag). + */ + err = xenbus_printf(xbt, dev->nodename, + "feature-rx-flip", "%d", 0); if (err) { - message = "writing feature-copying"; + message = "writing feature-rx-flip"; goto abort_transaction; } @@ -123,9 +121,11 @@ static int netback_probe(struct xenbus_device *dev, } err = xenbus_switch_state(dev, XenbusStateInitWait); - if (err) { + if (err) goto fail; - } + + /* This kicks hotplug scripts, so do it immediately. */ + backend_create_netif(be); return 0; @@ -175,48 +175,30 @@ static int netback_uevent(struct xenbus_device *xdev, char **envp, } -/** - * Callback received when the hotplug scripts have placed the handle node. - * Read it, and create a netif structure. If the frontend is ready, connect. - */ -static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) +static void backend_create_netif(struct backend_info *be) { int err; long handle; - struct backend_info *be - = container_of(watch, struct backend_info, backend_watch); struct xenbus_device *dev = be->dev; - DPRINTK(""); + if (be->netif != NULL) + return; err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle); - if (XENBUS_EXIST_ERR(err)) { - /* Since this watch will fire once immediately after it is - registered, we expect this. Ignore it, and wait for the - hotplug scripts. */ - return; - } if (err != 1) { xenbus_dev_fatal(dev, err, "reading handle"); return; } - if (be->netif == NULL) { - u8 be_mac[ETH_ALEN] = { 0, 0, 0, 0, 0, 0 }; - - be->netif = netif_alloc(dev->otherend_id, handle, be_mac); - if (IS_ERR(be->netif)) { - err = PTR_ERR(be->netif); - be->netif = NULL; - xenbus_dev_fatal(dev, err, "creating interface"); - return; - } - - kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); - - maybe_connect(be); + be->netif = netif_alloc(dev->otherend_id, handle); + if (IS_ERR(be->netif)) { + err = PTR_ERR(be->netif); + be->netif = NULL; + xenbus_dev_fatal(dev, err, "creating interface"); + return; } + + kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); } @@ -249,11 +231,9 @@ static void frontend_changed(struct xenbus_device *dev, break; case XenbusStateConnected: - if (!be->netif) { - /* reconnect: setup be->netif */ - backend_changed(&be->backend_watch, NULL, 0); - } - maybe_connect(be); + backend_create_netif(be); + if (be->netif) + connect(be); break; case XenbusStateClosing: @@ -279,15 +259,6 @@ static void frontend_changed(struct xenbus_device *dev, } -/* ** Connection ** */ - - -static void maybe_connect(struct backend_info *be) -{ - if (be->netif && (be->frontend_state == XenbusStateConnected)) - connect(be); -} - static void xen_net_read_rate(struct xenbus_device *dev, unsigned long *bytes, unsigned long *usec) { @@ -366,6 +337,10 @@ static void connect(struct backend_info *be) be->netif->remaining_credit = be->netif->credit_bytes; xenbus_switch_state(dev, XenbusStateConnected); + + /* May not get a kick from the frontend, so start the tx_queue now. */ + if (!netbk_can_queue(be->netif->dev)) + netif_wake_queue(be->netif->dev); } @@ -403,14 +378,16 @@ static int connect_rings(struct backend_info *be) } be->netif->copying_receiver = !!rx_copy; - if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-rx-notify", "%d", - &val) < 0) - val = 0; - if (val) - be->netif->can_queue = 1; - else - /* Must be non-zero for pfifo_fast to work. */ - be->netif->dev->tx_queue_len = 1; + if (be->netif->dev->tx_queue_len != 0) { + if (xenbus_scanf(XBT_NIL, dev->otherend, + "feature-rx-notify", "%d", &val) < 0) + val = 0; + if (val) + be->netif->can_queue = 1; + else + /* Must be non-zero for pfifo_fast to work. */ + be->netif->dev->tx_queue_len = 1; + } if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0) val = 0; diff --git a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c index a257cb6064..da22d45bf6 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c @@ -47,6 +47,7 @@ #include <linux/in.h> #include <linux/if_ether.h> #include <linux/io.h> +#include <linux/moduleparam.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/arp.h> @@ -63,20 +64,76 @@ #include <xen/interface/grant_table.h> #include <xen/gnttab.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +/* + * Mutually-exclusive module options to select receive data path: + * rx_copy : Packets are copied by network backend into local memory + * rx_flip : Page containing packet data is transferred to our ownership + * For fully-virtualised guests there is no option - copying must be used. + * For paravirtualised guests, flipping is the default. + */ +#ifdef CONFIG_XEN +static int MODPARM_rx_copy = 0; +module_param_named(rx_copy, MODPARM_rx_copy, bool, 0); +MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)"); +static int MODPARM_rx_flip = 0; +module_param_named(rx_flip, MODPARM_rx_flip, bool, 0); +MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)"); +#else +static const int MODPARM_rx_copy = 1; +static const int MODPARM_rx_flip = 0; +#endif + #define RX_COPY_THRESHOLD 256 /* If we don't have GSO, fake things up so that we never try to use it. */ -#ifndef NETIF_F_GSO -#define netif_needs_gso(dev, skb) 0 -#define dev_disable_gso_features(dev) ((void)0) -#else +#if defined(NETIF_F_GSO) #define HAVE_GSO 1 +#define HAVE_TSO 1 /* TSO is a subset of GSO */ static inline void dev_disable_gso_features(struct net_device *dev) { /* Turn off all GSO bits except ROBUST. */ dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1; dev->features |= NETIF_F_GSO_ROBUST; } +#elif defined(NETIF_F_TSO) +#define HAVE_TSO 1 + +/* Some older kernels cannot cope with incorrect checksums, + * particularly in netfilter. I'm not sure there is 100% correlation + * with the presence of NETIF_F_TSO but it appears to be a good first + * approximiation. + */ +#define HAVE_NO_CSUM_OFFLOAD 1 + +#define gso_size tso_size +#define gso_segs tso_segs +static inline void dev_disable_gso_features(struct net_device *dev) +{ + /* Turn off all TSO bits. */ + dev->features &= ~NETIF_F_TSO; +} +static inline int skb_is_gso(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->tso_size; +} +static inline int skb_gso_ok(struct sk_buff *skb, int features) +{ + return (features & NETIF_F_TSO); +} + +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) +{ + return skb_is_gso(skb) && + (!skb_gso_ok(skb, dev->features) || + unlikely(skb->ip_summed != CHECKSUM_HW)); +} +#else +#define netif_needs_gso(dev, skb) 0 +#define dev_disable_gso_features(dev) ((void)0) #endif #define GRANT_INVALID_REF 0 @@ -96,7 +153,6 @@ struct netfront_info { spinlock_t tx_lock; spinlock_t rx_lock; - unsigned int handle; unsigned int evtchn, irq; unsigned int copying_receiver; @@ -120,7 +176,7 @@ struct netfront_info { grant_ref_t gref_tx_head; grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1]; grant_ref_t gref_rx_head; - grant_ref_t grant_rx_ref[NET_TX_RING_SIZE]; + grant_ref_t grant_rx_ref[NET_RX_RING_SIZE]; struct xenbus_device *xbdev; int tx_ring_ref; @@ -185,9 +241,8 @@ static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np, #define WPRINTK(fmt, args...) \ printk(KERN_WARNING "netfront: " fmt, ##args) -static int talk_to_backend(struct xenbus_device *, struct netfront_info *); static int setup_device(struct xenbus_device *, struct netfront_info *); -static struct net_device *create_netdev(int, int, struct xenbus_device *); +static struct net_device *create_netdev(struct xenbus_device *); static void netfront_closing(struct xenbus_device *); @@ -195,9 +250,8 @@ static void end_access(int, void *); static void netif_disconnect_backend(struct netfront_info *); static int open_netdev(struct netfront_info *); static void close_netdev(struct netfront_info *); -static void netif_free(struct netfront_info *); -static void network_connect(struct net_device *); +static int network_connect(struct net_device *); static void network_tx_buf_gc(struct net_device *); static void network_alloc_rx_buffers(struct net_device *); static int send_fake_arp(struct net_device *); @@ -220,8 +274,7 @@ static inline int xennet_can_sg(struct net_device *dev) /** * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffers for communication with the backend, and - * inform the backend of the appropriate details for those. Switch to - * Connected state. + * inform the backend of the appropriate details for those. */ static int __devinit netfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) @@ -229,31 +282,8 @@ static int __devinit netfront_probe(struct xenbus_device *dev, int err; struct net_device *netdev; struct netfront_info *info; - unsigned int handle; - unsigned feature_rx_copy; - - err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%u", &handle); - if (err != 1) { - xenbus_dev_fatal(dev, err, "reading handle"); - return err; - } - -#ifndef CONFIG_XEN - err = xenbus_scanf(XBT_NIL, dev->otherend, "feature-rx-copy", "%u", - &feature_rx_copy); - if (err != 1) { - xenbus_dev_fatal(dev, err, "reading feature-rx-copy"); - return err; - } - if (!feature_rx_copy) { - xenbus_dev_fatal(dev, 0, "need a copy-capable backend"); - return -EINVAL; - } -#else - feature_rx_copy = 0; -#endif - netdev = create_netdev(handle, feature_rx_copy, dev); + netdev = create_netdev(dev); if (IS_ERR(netdev)) { err = PTR_ERR(netdev); xenbus_dev_fatal(dev, err, "creating netdev"); @@ -263,20 +293,13 @@ static int __devinit netfront_probe(struct xenbus_device *dev, info = netdev_priv(netdev); dev->dev.driver_data = info; - err = talk_to_backend(dev, info); - if (err) - goto fail_backend; - err = open_netdev(info); if (err) - goto fail_open; + goto fail; return 0; - fail_open: - xennet_sysfs_delif(info->netdev); - unregister_netdev(netdev); - fail_backend: + fail: free_netdev(netdev); dev->dev.driver_data = NULL; return err; @@ -296,7 +319,7 @@ static int netfront_resume(struct xenbus_device *dev) DPRINTK("%s\n", dev->nodename); netif_disconnect_backend(info); - return talk_to_backend(dev, info); + return 0; } static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) @@ -379,13 +402,21 @@ again: goto abort_transaction; } +#ifdef HAVE_NO_CSUM_OFFLOAD + err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload", "%d", 1); + if (err) { + message = "writing feature-no-csum-offload"; + goto abort_transaction; + } +#endif + err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1); if (err) { message = "writing feature-sg"; goto abort_transaction; } -#ifdef HAVE_GSO +#ifdef HAVE_TSO err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1); if (err) { message = "writing feature-gso-tcpv4"; @@ -407,12 +438,11 @@ again: xenbus_transaction_end(xbt, 1); xenbus_dev_fatal(dev, err, "%s", message); destroy_ring: - netif_free(info); + netif_disconnect_backend(info); out: return err; } - static int setup_device(struct xenbus_device *dev, struct netfront_info *info) { struct netif_tx_sring *txs; @@ -472,11 +502,9 @@ static int setup_device(struct xenbus_device *dev, struct netfront_info *info) return 0; fail: - netif_free(info); return err; } - /** * Callback received when the backend's state changes. */ @@ -497,7 +525,8 @@ static void backend_changed(struct xenbus_device *dev, break; case XenbusStateInitWait: - network_connect(netdev); + if (network_connect(netdev) != 0) + break; xenbus_switch_state(dev, XenbusStateConnected); (void)send_fake_arp(netdev); break; @@ -508,7 +537,6 @@ static void backend_changed(struct xenbus_device *dev, } } - /** Send a packet on a net device to encourage switches to learn the * MAC. We send a fake ARP request. * @@ -537,7 +565,6 @@ static int send_fake_arp(struct net_device *dev) return dev_queue_xmit(skb); } - static int network_open(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); @@ -629,14 +656,12 @@ static void network_tx_buf_gc(struct net_device *dev) network_maybe_wake_tx(dev); } - static void rx_refill_timeout(unsigned long data) { struct net_device *dev = (struct net_device *)data; netif_rx_schedule(dev); } - static void network_alloc_rx_buffers(struct net_device *dev) { unsigned short id; @@ -669,7 +694,7 @@ static void network_alloc_rx_buffers(struct net_device *dev) * necessary here. * 16 bytes added as necessary headroom for netif_receive_skb. */ - skb = alloc_skb(RX_COPY_THRESHOLD + 16, + skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) goto no_skb; @@ -687,7 +712,7 @@ no_skb: break; } - skb_reserve(skb, 16); /* mimic dev_alloc_skb() */ + skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */ skb_shinfo(skb)->frags[0].page = page; skb_shinfo(skb)->nr_frags = 1; __skb_queue_tail(&np->rx_batch, skb); @@ -742,7 +767,7 @@ no_skb: } else { gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, - pfn, + pfn_to_mfn(pfn), 0); } @@ -917,7 +942,7 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) tx->flags |= NETTXF_data_validated; #endif -#ifdef HAVE_GSO +#ifdef HAVE_TSO if (skb_shinfo(skb)->gso_size) { struct netif_extra_info *gso = (struct netif_extra_info *) RING_GET_REQUEST(&np->tx, ++i); @@ -1071,6 +1096,7 @@ static int xennet_get_responses(struct netfront_info *np, if (net_ratelimit()) WPRINTK("rx->offset: %x, size: %u\n", rx->offset, rx->status); + xennet_move_rx_slot(np, skb, ref); err = -EINVAL; goto next; } @@ -1081,7 +1107,8 @@ static int xennet_get_responses(struct netfront_info *np, * situation to the system controller to reboot the backed. */ if (ref == GRANT_INVALID_REF) { - WPRINTK("Bad rx response id %d.\n", rx->id); + if (net_ratelimit()) + WPRINTK("Bad rx response id %d.\n", rx->id); err = -EINVAL; goto next; } @@ -1153,6 +1180,9 @@ next: err = -E2BIG; } + if (unlikely(err)) + np->rx.rsp_cons = cons + frags; + *pages_flipped_p = pages_flipped; return err; @@ -1205,12 +1235,14 @@ static int xennet_set_skb_gso(struct sk_buff *skb, return -EINVAL; } -#ifdef HAVE_GSO +#ifdef HAVE_TSO skb_shinfo(skb)->gso_size = gso->u.gso.size; +#ifdef HAVE_GSO skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; +#endif skb_shinfo(skb)->gso_segs = 0; return 0; @@ -1255,9 +1287,9 @@ static int netif_poll(struct net_device *dev, int *pbudget) rp = np->rx.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for (i = np->rx.rsp_cons, work_done = 0; - (i != rp) && (work_done < budget); - np->rx.rsp_cons = ++i, work_done++) { + i = np->rx.rsp_cons; + work_done = 0; + while ((i != rp) && (work_done < budget)) { memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx)); memset(extras, 0, sizeof(extras)); @@ -1265,12 +1297,11 @@ static int netif_poll(struct net_device *dev, int *pbudget) &pages_flipped); if (unlikely(err)) { -err: - i = np->rx.rsp_cons + skb_queue_len(&tmpq) - 1; - work_done--; +err: while ((skb = __skb_dequeue(&tmpq))) __skb_queue_tail(&errq, skb); np->stats.rx_errors++; + i = np->rx.rsp_cons; continue; } @@ -1282,6 +1313,7 @@ err: if (unlikely(xennet_set_skb_gso(skb, gso))) { __skb_queue_head(&tmpq, skb); + np->rx.rsp_cons += skb_queue_len(&tmpq); goto err; } } @@ -1345,6 +1377,9 @@ err: np->stats.rx_bytes += skb->len; __skb_queue_tail(&rxq, skb); + + np->rx.rsp_cons = ++i; + work_done++; } if (pages_flipped) { @@ -1561,7 +1596,7 @@ static int xennet_set_sg(struct net_device *dev, u32 data) static int xennet_set_tso(struct net_device *dev, u32 data) { -#ifdef HAVE_GSO +#ifdef HAVE_TSO if (data) { struct netfront_info *np = netdev_priv(dev); int val; @@ -1588,20 +1623,53 @@ static void xennet_set_features(struct net_device *dev) if (!(dev->features & NETIF_F_IP_CSUM)) return; - if (!xennet_set_sg(dev, 1)) - xennet_set_tso(dev, 1); + if (xennet_set_sg(dev, 1)) + return; + + /* Before 2.6.9 TSO seems to be unreliable so do not enable it + * on older kernels. + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) + xennet_set_tso(dev, 1); +#endif + } -static void network_connect(struct net_device *dev) +static int network_connect(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); - int i, requeue_idx; + int i, requeue_idx, err; struct sk_buff *skb; grant_ref_t ref; netif_rx_request_t *req; + unsigned int feature_rx_copy, feature_rx_flip; + + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-rx-copy", "%u", &feature_rx_copy); + if (err != 1) + feature_rx_copy = 0; + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-rx-flip", "%u", &feature_rx_flip); + if (err != 1) + feature_rx_flip = 1; + + /* + * Copy packets on receive path if: + * (a) This was requested by user, and the backend supports it; or + * (b) Flipping was requested, but this is unsupported by the backend. + */ + np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) || + (MODPARM_rx_flip && !feature_rx_flip)); + + err = talk_to_backend(np->xbdev, np); + if (err) + return err; xennet_set_features(dev); + IPRINTK("device %s has %sing receive path.\n", + dev->name, np->copying_receiver ? "copy" : "flipp"); + spin_lock_irq(&np->tx_lock); spin_lock(&np->rx_lock); @@ -1632,7 +1700,8 @@ static void network_connect(struct net_device *dev) } else { gnttab_grant_foreign_access_ref( ref, np->xbdev->otherend_id, - page_to_pfn(skb_shinfo(skb)->frags->page), + pfn_to_mfn(page_to_pfn(skb_shinfo(skb)-> + frags->page)), 0); } req->gref = ref; @@ -1656,6 +1725,8 @@ static void network_connect(struct net_device *dev) spin_unlock(&np->rx_lock); spin_unlock_irq(&np->tx_lock); + + return 0; } static void netif_uninit(struct net_device *dev) @@ -1821,8 +1892,7 @@ static void network_set_multicast_list(struct net_device *dev) { } -static struct net_device * __devinit -create_netdev(int handle, int copying_receiver, struct xenbus_device *dev) +static struct net_device * __devinit create_netdev(struct xenbus_device *dev) { int i, err = 0; struct net_device *netdev = NULL; @@ -1836,9 +1906,7 @@ create_netdev(int handle, int copying_receiver, struct xenbus_device *dev) } np = netdev_priv(netdev); - np->handle = handle; np->xbdev = dev; - np->copying_receiver = copying_receiver; netif_carrier_off(netdev); @@ -1969,10 +2037,12 @@ static int open_netdev(struct netfront_info *info) err = xennet_sysfs_addif(info->netdev); if (err) { - /* This can be non-fatal: it only means no tuning parameters */ + unregister_netdev(info->netdev); printk(KERN_WARNING "%s: add sysfs failed err=%d\n", __FUNCTION__, err); + return err; } + return 0; } @@ -2007,14 +2077,6 @@ static void netif_disconnect_backend(struct netfront_info *info) } -static void netif_free(struct netfront_info *info) -{ - close_netdev(info); - netif_disconnect_backend(info); - free_netdev(info->netdev); -} - - static void end_access(int ref, void *page) { if (ref != GRANT_INVALID_REF) @@ -2053,6 +2115,16 @@ static int __init netif_init(void) if (!is_running_on_xen()) return -ENODEV; +#ifdef CONFIG_XEN + if (MODPARM_rx_flip && MODPARM_rx_copy) { + WPRINTK("Cannot specify both rx_copy and rx_flip.\n"); + return -EINVAL; + } + + if (!MODPARM_rx_flip && !MODPARM_rx_copy) + MODPARM_rx_flip = 1; /* Default is to flip. */ +#endif + if (is_initial_xendomain()) return 0; @@ -2067,6 +2139,9 @@ module_init(netif_init); static void __exit netif_exit(void) { + if (is_initial_xendomain()) + return; + unregister_inetaddr_notifier(¬ifier_inetdev); return xenbus_unregister_driver(&netfront); diff --git a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c index a1c4b6f68e..d159e4ac74 100644 --- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c +++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c @@ -35,6 +35,10 @@ static struct proc_dir_entry *privcmd_intf; static struct proc_dir_entry *capabilities_intf; +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); +#endif + static int privcmd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long data) { @@ -49,6 +53,8 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, return -EFAULT; #if defined(__i386__) + if (hypercall.op >= (PAGE_SIZE >> 5)) + break; __asm__ __volatile__ ( "pushl %%ebx; pushl %%ecx; pushl %%edx; " "pushl %%esi; pushl %%edi; " @@ -65,45 +71,36 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, "popl %%ecx; popl %%ebx" : "=a" (ret) : "0" (&hypercall) : "memory" ); #elif defined (__x86_64__) - { + if (hypercall.op < (PAGE_SIZE >> 5)) { long ign1, ign2, ign3; __asm__ __volatile__ ( "movq %8,%%r10; movq %9,%%r8;" - "shlq $5,%%rax ;" + "shll $5,%%eax ;" "addq $hypercall_page,%%rax ;" "call *%%rax" : "=a" (ret), "=D" (ign1), "=S" (ign2), "=d" (ign3) - : "0" ((unsigned long)hypercall.op), - "1" ((unsigned long)hypercall.arg[0]), - "2" ((unsigned long)hypercall.arg[1]), - "3" ((unsigned long)hypercall.arg[2]), - "g" ((unsigned long)hypercall.arg[3]), - "g" ((unsigned long)hypercall.arg[4]) + : "0" ((unsigned int)hypercall.op), + "1" (hypercall.arg[0]), + "2" (hypercall.arg[1]), + "3" (hypercall.arg[2]), + "g" (hypercall.arg[3]), + "g" (hypercall.arg[4]) : "r8", "r10", "memory" ); } #elif defined (__ia64__) - __asm__ __volatile__ ( - ";; mov r14=%2; mov r15=%3; " - "mov r16=%4; mov r17=%5; mov r18=%6;" - "mov r2=%1; break 0x1000;; mov %0=r8 ;;" - : "=r" (ret) - : "r" (hypercall.op), - "r" (hypercall.arg[0]), - "r" (hypercall.arg[1]), - "r" (hypercall.arg[2]), - "r" (hypercall.arg[3]), - "r" (hypercall.arg[4]) - : "r14","r15","r16","r17","r18","r2","r8","memory"); + ret = privcmd_hypercall(&hypercall); #endif } break; case IOCTL_PRIVCMD_MMAP: { -#define PRIVCMD_MMAP_SZ 32 privcmd_mmap_t mmapcmd; - privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ]; + privcmd_mmap_entry_t msg; privcmd_mmap_entry_t __user *p; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long va; int i, rc; if (!is_initial_xendomain()) @@ -113,85 +110,92 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, return -EFAULT; p = mmapcmd.entry; + if (copy_from_user(&msg, p, sizeof(msg))) + return -EFAULT; - for (i = 0; i < mmapcmd.num; - i += PRIVCMD_MMAP_SZ, p += PRIVCMD_MMAP_SZ) { - int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)? - PRIVCMD_MMAP_SZ:(mmapcmd.num-i); - - if (copy_from_user(&msg, p, - n*sizeof(privcmd_mmap_entry_t))) - return -EFAULT; - - for (j = 0; j < n; j++) { - struct vm_area_struct *vma = - find_vma( current->mm, msg[j].va ); - - if (!vma) - return -EINVAL; - - if (msg[j].va > PAGE_OFFSET) - return -EINVAL; - - if ((msg[j].va + (msg[j].npages << PAGE_SHIFT)) - > vma->vm_end ) - return -EINVAL; - - if ((rc = direct_remap_pfn_range( - vma, - msg[j].va&PAGE_MASK, - msg[j].mfn, - msg[j].npages<<PAGE_SHIFT, - vma->vm_page_prot, - mmapcmd.dom)) < 0) - return rc; - } + down_read(&mm->mmap_sem); + + vma = find_vma(mm, msg.va); + rc = -EINVAL; + if (!vma || (msg.va != vma->vm_start) || + !privcmd_enforce_singleshot_mapping(vma)) + goto mmap_out; + + va = vma->vm_start; + + for (i = 0; i < mmapcmd.num; i++) { + rc = -EFAULT; + if (copy_from_user(&msg, p, sizeof(msg))) + goto mmap_out; + + /* Do not allow range to wrap the address space. */ + rc = -EINVAL; + if ((msg.npages > (LONG_MAX >> PAGE_SHIFT)) || + ((unsigned long)(msg.npages << PAGE_SHIFT) >= -va)) + goto mmap_out; + + /* Range chunks must be contiguous in va space. */ + if ((msg.va != va) || + ((msg.va+(msg.npages<<PAGE_SHIFT)) > vma->vm_end)) + goto mmap_out; + + if ((rc = direct_remap_pfn_range( + vma, + msg.va & PAGE_MASK, + msg.mfn, + msg.npages << PAGE_SHIFT, + vma->vm_page_prot, + mmapcmd.dom)) < 0) + goto mmap_out; + + p++; + va += msg.npages << PAGE_SHIFT; } - ret = 0; + + rc = 0; + + mmap_out: + up_read(&mm->mmap_sem); + ret = rc; } break; case IOCTL_PRIVCMD_MMAPBATCH: { privcmd_mmapbatch_t m; - struct vm_area_struct *vma = NULL; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; xen_pfn_t __user *p; - unsigned long addr, mfn; + unsigned long addr, mfn, nr_pages; int i; if (!is_initial_xendomain()) return -EPERM; - if (copy_from_user(&m, udata, sizeof(m))) { - ret = -EFAULT; - goto batch_err; - } - - if (m.dom == DOMID_SELF) { - ret = -EINVAL; - goto batch_err; - } + if (copy_from_user(&m, udata, sizeof(m))) + return -EFAULT; - vma = find_vma(current->mm, m.addr); - if (!vma) { - ret = -EINVAL; - goto batch_err; - } + nr_pages = m.num; + if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) + return -EINVAL; - if (m.addr > PAGE_OFFSET) { - ret = -EFAULT; - goto batch_err; - } + down_read(&mm->mmap_sem); - if ((m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end) { - ret = -EFAULT; - goto batch_err; + vma = find_vma(mm, m.addr); + if (!vma || + (m.addr != vma->vm_start) || + ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || + !privcmd_enforce_singleshot_mapping(vma)) { + up_read(&mm->mmap_sem); + return -EINVAL; } p = m.arr; addr = m.addr; - for (i = 0; i < m.num; i++, addr += PAGE_SIZE, p++) { - if (get_user(mfn, p)) + for (i = 0; i < nr_pages; i++, addr += PAGE_SIZE, p++) { + if (get_user(mfn, p)) { + up_read(&mm->mmap_sem); return -EFAULT; + } ret = direct_remap_pfn_range(vma, addr & PAGE_MASK, mfn, PAGE_SIZE, @@ -200,15 +204,8 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, put_user(0xF0000000 | mfn, p); } + up_read(&mm->mmap_sem); ret = 0; - break; - - batch_err: - printk("batch_err ret=%d vma=%p addr=%lx " - "num=%d arr=%p %lx-%lx\n", - ret, vma, (unsigned long)m.addr, m.num, m.arr, - vma ? vma->vm_start : 0, vma ? vma->vm_end : 0); - break; } break; @@ -221,13 +218,35 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, } #ifndef HAVE_ARCH_PRIVCMD_MMAP +static struct page *privcmd_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + return NOPAGE_SIGBUS; +} + +static struct vm_operations_struct privcmd_vm_ops = { + .nopage = privcmd_nopage +}; + static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) { + /* Unsupported for auto-translate guests. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + vma->vm_ops = &privcmd_vm_ops; + vma->vm_private_data = NULL; return 0; } + +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +{ + return (xchg(&vma->vm_private_data, (void *)1) == NULL); +} #endif static struct file_operations privcmd_file_ops = { diff --git a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h index 27b8fd283a..b209b4f583 100644 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h @@ -46,11 +46,10 @@ typedef struct tpmif_st { atomic_t refcnt; struct backend_info *bi; - unsigned long mmap_vstart; grant_handle_t shmem_handle; grant_ref_t shmem_ref; - struct page *pagerange; + struct page **mmap_pages; char devname[20]; } tpmif_t; @@ -80,6 +79,9 @@ int vtpm_release_packets(tpmif_t * tpmif, int send_msgs); extern int num_frontends; -#define MMAP_VADDR(t,_req) ((t)->mmap_vstart + ((_req) * PAGE_SIZE)) +static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx) +{ + return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx])); +} #endif /* __TPMIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c index 0105bd93bf..2614aa5126 100644 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c @@ -25,8 +25,8 @@ static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi) tpmif_t *tpmif; tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL); - if (!tpmif) - return ERR_PTR(-ENOMEM); + if (tpmif == NULL) + goto out_of_memory; memset(tpmif, 0, sizeof (*tpmif)); tpmif->domid = domid; @@ -35,22 +35,27 @@ static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi) snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid); atomic_set(&tpmif->refcnt, 1); - tpmif->pagerange = balloon_alloc_empty_page_range(TPMIF_TX_RING_SIZE); - BUG_ON(tpmif->pagerange == NULL); - tpmif->mmap_vstart = (unsigned long)pfn_to_kaddr( - page_to_pfn(tpmif->pagerange)); + tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE); + if (tpmif->mmap_pages == NULL) + goto out_of_memory; list_add(&tpmif->tpmif_list, &tpmif_list); num_frontends++; return tpmif; + + out_of_memory: + if (tpmif != NULL) + kmem_cache_free(tpmif_cachep, tpmif); + printk("%s: out of memory\n", __FUNCTION__); + return ERR_PTR(-ENOMEM); } static void free_tpmif(tpmif_t * tpmif) { num_frontends--; list_del(&tpmif->tpmif_list); - balloon_dealloc_empty_page_range(tpmif->pagerange, TPMIF_TX_RING_SIZE); + free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE); kmem_cache_free(tpmif_cachep, tpmif); } diff --git a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c index 466c3ee581..701a5ad03e 100644 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c @@ -253,7 +253,7 @@ int _packet_write(struct packet *pak, return 0; } - gnttab_set_map_op(&map_op, MMAP_VADDR(tpmif, i), + gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i), GNTMAP_host_map, tx->ref, tpmif->domid); if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, @@ -270,7 +270,7 @@ int _packet_write(struct packet *pak, tocopy = min_t(size_t, size - offset, PAGE_SIZE); - if (copy_from_buffer((void *)(MMAP_VADDR(tpmif, i) | + if (copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) | (tx->addr & ~PAGE_MASK)), &data[offset], tocopy, isuserbuffer)) { tpmif_put(tpmif); @@ -278,7 +278,7 @@ int _packet_write(struct packet *pak, } tx->size = tocopy; - gnttab_set_unmap_op(&unmap_op, MMAP_VADDR(tpmif, i), + gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i), GNTMAP_host_map, handle); if (unlikely @@ -391,7 +391,7 @@ static int packet_read_shmem(struct packet *pak, tx = &tpmif->tx->ring[i].req; - gnttab_set_map_op(&map_op, MMAP_VADDR(tpmif, i), + gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i), GNTMAP_host_map, tx->ref, tpmif->domid); if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, @@ -414,10 +414,10 @@ static int packet_read_shmem(struct packet *pak, } DPRINTK("Copying from mapped memory at %08lx\n", - (unsigned long)(MMAP_VADDR(tpmif, i) | + (unsigned long)(idx_to_kaddr(tpmif, i) | (tx->addr & ~PAGE_MASK))); - src = (void *)(MMAP_VADDR(tpmif, i) | + src = (void *)(idx_to_kaddr(tpmif, i) | ((tx->addr & ~PAGE_MASK) + pg_offset)); if (copy_to_buffer(&buffer[offset], src, to_copy, isuserbuffer)) { @@ -428,7 +428,7 @@ static int packet_read_shmem(struct packet *pak, tpmif->domid, buffer[offset], buffer[offset + 1], buffer[offset + 2], buffer[offset + 3]); - gnttab_set_unmap_op(&unmap_op, MMAP_VADDR(tpmif, i), + gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i), GNTMAP_host_map, handle); if (unlikely diff --git a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c index 4ee5c5bbfe..f48b0e3726 100644 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c @@ -157,10 +157,12 @@ static void frontend_changed(struct xenbus_device *dev, case XenbusStateClosing: be->instance = -1; + xenbus_switch_state(dev, XenbusStateClosing); break; - case XenbusStateUnknown: + case XenbusStateUnknown: /* keep it here */ case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); device_unregister(&be->dev->dev); tpmback_remove(dev); break; diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile b/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile index d7c7d05172..ce5acc2457 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile @@ -9,4 +9,5 @@ xenbus-objs += xenbus_client.o xenbus-objs += xenbus_comms.o xenbus-objs += xenbus_xs.o xenbus-objs += xenbus_probe.o +obj-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c index 9b389ec06b..0111e8e3a2 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c @@ -35,8 +35,9 @@ #include <xen/xenbus.h> #include <xen/driver_util.h> -/* xenbus_probe.c */ -extern char *kasprintf(const char *fmt, ...); +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif #define DPRINTK(fmt, args...) \ pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) @@ -84,7 +85,7 @@ int xenbus_watch_path2(struct xenbus_device *dev, const char *path, const char **, unsigned int)) { int err; - char *state = kasprintf("%s/%s", path, path2); + char *state = kasprintf(GFP_KERNEL, "%s/%s", path, path2); if (!state) { xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); return -ENOMEM; @@ -152,7 +153,7 @@ EXPORT_SYMBOL_GPL(xenbus_frontend_closed); */ static char *error_path(struct xenbus_device *dev) { - return kasprintf("error/%s", dev->nodename); + return kasprintf(GFP_KERNEL, "error/%s", dev->nodename); } diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c index 38da320b67..f0e42ba715 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c @@ -30,15 +30,22 @@ * IN THE SOFTWARE. */ -#include <asm/hypervisor.h> -#include <xen/evtchn.h> #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/err.h> +#include <linux/ptrace.h> +#include <xen/evtchn.h> #include <xen/xenbus.h> + +#include <asm/hypervisor.h> + #include "xenbus_comms.h" +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + static int xenbus_irq; extern void xenbus_probe(void *); diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c index bbe4a8c5a8..ba37e61856 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c @@ -40,6 +40,7 @@ #include <linux/wait.h> #include <linux/fs.h> #include <linux/poll.h> +#include <linux/mutex.h> #include "xenbus_comms.h" @@ -49,6 +50,10 @@ #include <xen/xen_proc.h> #include <asm/hypervisor.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + struct xenbus_dev_transaction { struct list_head list; struct xenbus_transaction handle; diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c index bcd1f6df06..5320368443 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c @@ -42,6 +42,7 @@ #include <linux/mm.h> #include <linux/notifier.h> #include <linux/kthread.h> +#include <linux/mutex.h> #include <asm/io.h> #include <asm/page.h> @@ -55,6 +56,11 @@ #include <xen/hvm.h> #include "xenbus_comms.h" +#include "xenbus_probe.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif int xen_store_evtchn; struct xenstore_domain_interface *xen_store_interface; @@ -67,12 +73,7 @@ static struct notifier_block *xenstore_chain; static void wait_for_devices(struct xenbus_driver *xendrv); static int xenbus_probe_frontend(const char *type, const char *name); -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); -static int xenbus_probe_backend(const char *type, const char *domid); -static int xenbus_dev_probe(struct device *_dev); -static int xenbus_dev_remove(struct device *_dev); static void xenbus_dev_shutdown(struct device *_dev); /* If something in array of ids matches this device, return it. */ @@ -86,7 +87,7 @@ match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) return NULL; } -static int xenbus_match(struct device *_dev, struct device_driver *_drv) +int xenbus_match(struct device *_dev, struct device_driver *_drv) { struct xenbus_driver *drv = to_xenbus_driver(_drv); @@ -96,17 +97,6 @@ static int xenbus_match(struct device *_dev, struct device_driver *_drv) return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; } -struct xen_bus_type -{ - char *root; - unsigned int levels; - int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); - int (*probe)(const char *type, const char *dir); - struct bus_type bus; - struct device dev; -}; - - /* device/<type>/<id> => <type>-<id> */ static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) { @@ -143,7 +133,7 @@ static void free_otherend_watch(struct xenbus_device *dev) } -static int read_otherend_details(struct xenbus_device *xendev, +int read_otherend_details(struct xenbus_device *xendev, char *id_node, char *path_node) { int err = xenbus_gather(XBT_NIL, xendev->nodename, @@ -176,12 +166,6 @@ static int read_backend_details(struct xenbus_device *xendev) } -static int read_frontend_details(struct xenbus_device *xendev) -{ - return read_otherend_details(xendev, "frontend-id", "frontend"); -} - - /* Bus type for frontend drivers. */ static struct xen_bus_type xenbus_frontend = { .root = "device", @@ -191,115 +175,17 @@ static struct xen_bus_type xenbus_frontend = { .bus = { .name = "xen", .match = xenbus_match, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, +#endif }, .dev = { .bus_id = "xen", }, }; -/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ -static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) -{ - int domid, err; - const char *devid, *type, *frontend; - unsigned int typelen; - - type = strchr(nodename, '/'); - if (!type) - return -EINVAL; - type++; - typelen = strcspn(type, "/"); - if (!typelen || type[typelen] != '/') - return -EINVAL; - - devid = strrchr(nodename, '/') + 1; - - err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, - "frontend", NULL, &frontend, - NULL); - if (err) - return err; - if (strlen(frontend) == 0) - err = -ERANGE; - if (!err && !xenbus_exists(XBT_NIL, frontend, "")) - err = -ENOENT; - - kfree(frontend); - - if (err) - return err; - - if (snprintf(bus_id, BUS_ID_SIZE, - "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) - return -ENOSPC; - return 0; -} - -static struct xen_bus_type xenbus_backend = { - .root = "backend", - .levels = 3, /* backend/type/<frontend>/<id> */ - .get_bus_id = backend_bus_id, - .probe = xenbus_probe_backend, - .bus = { - .name = "xen-backend", - .match = xenbus_match, - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, -// .shutdown = xenbus_dev_shutdown, - .uevent = xenbus_uevent_backend, - }, - .dev = { - .bus_id = "xen-backend", - }, -}; - -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size) -{ - struct xenbus_device *xdev; - struct xenbus_driver *drv; - int i = 0; - int length = 0; - - DPRINTK(""); - - if (dev == NULL) - return -ENODEV; - - xdev = to_xenbus_device(dev); - if (xdev == NULL) - return -ENODEV; - - /* stuff we want to pass to /sbin/hotplug */ - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_TYPE=%s", xdev->devicetype); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_PATH=%s", xdev->nodename); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_BASE_PATH=%s", xenbus_backend.root); - - /* terminate, set to next free slot, shrink available space */ - envp[i] = NULL; - envp = &envp[i]; - num_envp -= i; - buffer = &buffer[length]; - buffer_size -= length; - - if (dev->driver) { - drv = to_xenbus_driver(dev->driver); - if (drv && drv->uevent) - return drv->uevent(xdev, envp, num_envp, buffer, - buffer_size); - } - - return 0; -} - static void otherend_changed(struct xenbus_watch *watch, const char **vec, unsigned int len) { @@ -322,6 +208,20 @@ static void otherend_changed(struct xenbus_watch *watch, DPRINTK("state is %d (%s), %s, %s", state, xenbus_strstate(state), dev->otherend_watch.node, vec[XS_WATCH_PATH]); + /* + * Ignore xenbus transitions during shutdown. This prevents us doing + * work that can fail e.g., when the rootfs is gone. + */ + if (system_state > SYSTEM_RUNNING) { + struct xen_bus_type *bus = bus; + bus = container_of(dev->dev.bus, struct xen_bus_type, bus); + /* If we're frontend, drive the state machine to Closed. */ + /* This should cause the backend to release our resources. */ + if ((bus == &xenbus_frontend) && (state == XenbusStateClosing)) + xenbus_frontend_closed(dev); + return; + } + if (drv->otherend_changed) drv->otherend_changed(dev, state); } @@ -345,7 +245,7 @@ static int watch_otherend(struct xenbus_device *dev) } -static int xenbus_dev_probe(struct device *_dev) +int xenbus_dev_probe(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); @@ -392,7 +292,7 @@ fail: return -ENODEV; } -static int xenbus_dev_remove(struct device *_dev) +int xenbus_dev_remove(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); @@ -430,14 +330,21 @@ static void xenbus_dev_shutdown(struct device *_dev) put_device(&dev->dev); } -static int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus) +int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus) { int ret; drv->driver.name = drv->name; drv->driver.bus = &bus->bus; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) drv->driver.owner = drv->owner; +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + drv->driver.probe = xenbus_dev_probe; + drv->driver.remove = xenbus_dev_remove; + drv->driver.shutdown = xenbus_dev_shutdown; +#endif mutex_lock(&xenwatch_mutex); ret = driver_register(&drv->driver); @@ -462,14 +369,6 @@ int xenbus_register_frontend(struct xenbus_driver *drv) } EXPORT_SYMBOL_GPL(xenbus_register_frontend); -int xenbus_register_backend(struct xenbus_driver *drv) -{ - drv->read_otherend_details = read_frontend_details; - - return xenbus_register_driver_common(drv, &xenbus_backend); -} -EXPORT_SYMBOL_GPL(xenbus_register_backend); - void xenbus_unregister_driver(struct xenbus_driver *drv) { driver_unregister(&drv->driver); @@ -545,45 +444,30 @@ static void xenbus_dev_release(struct device *dev) kfree(to_xenbus_device(dev)); } -/* Simplified asprintf. */ -char *kasprintf(const char *fmt, ...) -{ - va_list ap; - unsigned int len; - char *p, dummy[1]; - - va_start(ap, fmt); - /* FIXME: vsnprintf has a bug, NULL should work */ - len = vsnprintf(dummy, 0, fmt, ap); - va_end(ap); - - p = kmalloc(len + 1, GFP_KERNEL); - if (!p) - return NULL; - va_start(ap, fmt); - vsprintf(p, fmt, ap); - va_end(ap); - return p; -} - static ssize_t xendev_show_nodename(struct device *dev, - struct device_attribute *attr, char *buf) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) + struct device_attribute *attr, +#endif + char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); static ssize_t xendev_show_devtype(struct device *dev, - struct device_attribute *attr, char *buf) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) + struct device_attribute *attr, +#endif + char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); -static int xenbus_probe_node(struct xen_bus_type *bus, - const char *type, - const char *nodename) +int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename) { int err; struct xenbus_device *xendev; @@ -642,7 +526,7 @@ static int xenbus_probe_frontend(const char *type, const char *name) char *nodename; int err; - nodename = kasprintf("%s/%s/%s", xenbus_frontend.root, type, name); + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_frontend.root, type, name); if (!nodename) return -ENOMEM; @@ -653,55 +537,6 @@ static int xenbus_probe_frontend(const char *type, const char *name) return err; } -/* backend/<typename>/<frontend-uuid>/<name> */ -static int xenbus_probe_backend_unit(const char *dir, - const char *type, - const char *name) -{ - char *nodename; - int err; - - nodename = kasprintf("%s/%s", dir, name); - if (!nodename) - return -ENOMEM; - - DPRINTK("%s\n", nodename); - - err = xenbus_probe_node(&xenbus_backend, type, nodename); - kfree(nodename); - return err; -} - -/* backend/<typename>/<frontend-domid> */ -static int xenbus_probe_backend(const char *type, const char *domid) -{ - char *nodename; - int err = 0; - char **dir; - unsigned int i, dir_n = 0; - - DPRINTK(""); - - nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid); - if (!nodename) - return -ENOMEM; - - dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); - if (IS_ERR(dir)) { - kfree(nodename); - return PTR_ERR(dir); - } - - for (i = 0; i < dir_n; i++) { - err = xenbus_probe_backend_unit(nodename, type, dir[i]); - if (err) - break; - } - kfree(dir); - kfree(nodename); - return err; -} - static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) { int err = 0; @@ -722,7 +557,7 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) return err; } -static int xenbus_probe_devices(struct xen_bus_type *bus) +int xenbus_probe_devices(struct xen_bus_type *bus) { int err = 0; char **dir; @@ -764,7 +599,7 @@ static int strsep_len(const char *str, char c, unsigned int len) return (len == 0) ? i : -ERANGE; } -static void dev_changed(const char *node, struct xen_bus_type *bus) +void dev_changed(const char *node, struct xen_bus_type *bus) { int exists, rootlen; struct xenbus_device *dev; @@ -788,7 +623,7 @@ static void dev_changed(const char *node, struct xen_bus_type *bus) rootlen = strsep_len(node, '/', bus->levels); if (rootlen < 0) return; - root = kasprintf("%.*s", rootlen, node); + root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node); if (!root) return; @@ -809,25 +644,12 @@ static void frontend_changed(struct xenbus_watch *watch, dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); } -static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - DPRINTK(""); - - dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); -} - /* We watch for devices appearing and vanishing. */ static struct xenbus_watch fe_watch = { .node = "device", .callback = frontend_changed, }; -static struct xenbus_watch be_watch = { - .node = "backend", - .callback = backend_changed, -}; - static int suspend_dev(struct device *dev, void *data) { int err = 0; @@ -898,7 +720,7 @@ void xenbus_suspend(void) DPRINTK(""); bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); - bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev); + xenbus_backend_suspend(suspend_dev); xs_suspend(); } EXPORT_SYMBOL_GPL(xenbus_suspend); @@ -908,7 +730,7 @@ void xenbus_resume(void) xb_init_comms(); xs_resume(); bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); - bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev); + xenbus_backend_resume(resume_dev); } EXPORT_SYMBOL_GPL(xenbus_resume); @@ -941,20 +763,17 @@ void xenbus_probe(void *unused) { BUG_ON((xenstored_ready <= 0)); - /* Enumerate devices in xenstore. */ + /* Enumerate devices in xenstore and watch for changes. */ xenbus_probe_devices(&xenbus_frontend); - xenbus_probe_devices(&xenbus_backend); - - /* Watch for changes. */ register_xenbus_watch(&fe_watch); - register_xenbus_watch(&be_watch); + xenbus_backend_probe_and_watch(); /* Notify others that xenstore is up */ notifier_call_chain(&xenstore_chain, 0, NULL); } -#ifdef CONFIG_PROC_FS +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST) static struct file_operations xsd_kva_fops; static struct proc_dir_entry *xsd_kva_intf; static struct proc_dir_entry *xsd_port_intf; @@ -1006,7 +825,7 @@ static int __init xenbus_probe_init(void) /* Register ourselves with the kernel bus subsystem */ bus_register(&xenbus_frontend.bus); - bus_register(&xenbus_backend.bus); + xenbus_backend_bus_register(); /* * Domain0 doesn't have a store_evtchn or store_mfn yet. @@ -1035,7 +854,7 @@ static int __init xenbus_probe_init(void) xen_store_evtchn = xen_start_info->store_evtchn = alloc_unbound.port; -#ifdef CONFIG_PROC_FS +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST) /* And finally publish the above info in /proc/xen */ xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600); if (xsd_kva_intf) { @@ -1077,7 +896,7 @@ static int __init xenbus_probe_init(void) /* Register ourselves with the kernel device subsystem */ device_register(&xenbus_frontend.dev); - device_register(&xenbus_backend.dev); + xenbus_backend_device_register(); if (!is_initial_xendomain()) xenbus_probe(NULL); diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h new file mode 100644 index 0000000000..2d2e567826 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h @@ -0,0 +1,74 @@ +/****************************************************************************** + * xenbus_probe.h + * + * Talks to Xen Store to figure out what devices we have. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 XenSource Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _XENBUS_PROBE_H +#define _XENBUS_PROBE_H + +#ifdef CONFIG_XEN_BACKEND +extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); +extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); +extern void xenbus_backend_probe_and_watch(void); +extern void xenbus_backend_bus_register(void); +extern void xenbus_backend_device_register(void); +#else +static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} +static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} +static inline void xenbus_backend_probe_and_watch(void) {} +static inline void xenbus_backend_bus_register(void) {} +static inline void xenbus_backend_device_register(void) {} +#endif + +struct xen_bus_type +{ + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); + int (*probe)(const char *type, const char *dir); + struct bus_type bus; + struct device dev; +}; + +extern int xenbus_match(struct device *_dev, struct device_driver *_drv); +extern int xenbus_dev_probe(struct device *_dev); +extern int xenbus_dev_remove(struct device *_dev); +extern int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus); +extern int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename); +extern int xenbus_probe_devices(struct xen_bus_type *bus); + +extern void dev_changed(const char *node, struct xen_bus_type *bus); + +#endif + diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c new file mode 100644 index 0000000000..934e79732d --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c @@ -0,0 +1,271 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have (backend half). + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005, 2006 XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define DPRINTK(fmt, args...) \ + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/kthread.h> + +#include <asm/io.h> +#include <asm/page.h> +#include <asm/maddr.h> +#include <asm/pgtable.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/xen_proc.h> +#include <xen/evtchn.h> +#include <xen/features.h> +#include <xen/hvm.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size); +static int xenbus_probe_backend(const char *type, const char *domid); + +extern int read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + +static int read_frontend_details(struct xenbus_device *xendev) +{ + return read_otherend_details(xendev, "frontend-id", "frontend"); +} + +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) +{ + int domid, err; + const char *devid, *type, *frontend; + unsigned int typelen; + + type = strchr(nodename, '/'); + if (!type) + return -EINVAL; + type++; + typelen = strcspn(type, "/"); + if (!typelen || type[typelen] != '/') + return -EINVAL; + + devid = strrchr(nodename, '/') + 1; + + err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, + "frontend", NULL, &frontend, + NULL); + if (err) + return err; + if (strlen(frontend) == 0) + err = -ERANGE; + if (!err && !xenbus_exists(XBT_NIL, frontend, "")) + err = -ENOENT; + kfree(frontend); + + if (err) + return err; + + if (snprintf(bus_id, BUS_ID_SIZE, + "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) + return -ENOSPC; + return 0; +} + +static struct xen_bus_type xenbus_backend = { + .root = "backend", + .levels = 3, /* backend/type/<frontend>/<id> */ + .get_bus_id = backend_bus_id, + .probe = xenbus_probe_backend, + .bus = { + .name = "xen-backend", + .match = xenbus_match, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, +// .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_backend, + }, + .dev = { + .bus_id = "xen-backend", + }, +}; + +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + struct xenbus_device *xdev; + struct xenbus_driver *drv; + int i = 0; + int length = 0; + + DPRINTK(""); + + if (dev == NULL) + return -ENODEV; + + xdev = to_xenbus_device(dev); + if (xdev == NULL) + return -ENODEV; + + /* stuff we want to pass to /sbin/hotplug */ + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_TYPE=%s", xdev->devicetype); + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_PATH=%s", xdev->nodename); + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_BASE_PATH=%s", xenbus_backend.root); + + /* terminate, set to next free slot, shrink available space */ + envp[i] = NULL; + envp = &envp[i]; + num_envp -= i; + buffer = &buffer[length]; + buffer_size -= length; + + if (dev->driver) { + drv = to_xenbus_driver(dev->driver); + if (drv && drv->uevent) + return drv->uevent(xdev, envp, num_envp, buffer, + buffer_size); + } + + return 0; +} + +int xenbus_register_backend(struct xenbus_driver *drv) +{ + drv->read_otherend_details = read_frontend_details; + + return xenbus_register_driver_common(drv, &xenbus_backend); +} +EXPORT_SYMBOL_GPL(xenbus_register_backend); + +/* backend/<typename>/<frontend-uuid>/<name> */ +static int xenbus_probe_backend_unit(const char *dir, + const char *type, + const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); + if (!nodename) + return -ENOMEM; + + DPRINTK("%s\n", nodename); + + err = xenbus_probe_node(&xenbus_backend, type, nodename); + kfree(nodename); + return err; +} + +/* backend/<typename>/<frontend-domid> */ +static int xenbus_probe_backend(const char *type, const char *domid) +{ + char *nodename; + int err = 0; + char **dir; + unsigned int i, dir_n = 0; + + DPRINTK(""); + + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_backend.root, type, domid); + if (!nodename) + return -ENOMEM; + + dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); + if (IS_ERR(dir)) { + kfree(nodename); + return PTR_ERR(dir); + } + + for (i = 0; i < dir_n; i++) { + err = xenbus_probe_backend_unit(nodename, type, dir[i]); + if (err) + break; + } + kfree(dir); + kfree(nodename); + return err; +} + +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + DPRINTK(""); + + dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); +} + +static struct xenbus_watch be_watch = { + .node = "backend", + .callback = backend_changed, +}; + +void xenbus_backend_suspend(int (*fn)(struct device *, void *)) +{ + DPRINTK(""); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); +} + +void xenbus_backend_resume(int (*fn)(struct device *, void *)) +{ + DPRINTK(""); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); +} + +void xenbus_backend_probe_and_watch(void) +{ + xenbus_probe_devices(&xenbus_backend); + register_xenbus_watch(&be_watch); +} + +void xenbus_backend_bus_register(void) +{ + bus_register(&xenbus_backend.bus); +} + +void xenbus_backend_device_register(void) +{ + device_register(&xenbus_backend.dev); +} diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c index 190fa1e794..4c5052d13a 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c @@ -42,11 +42,14 @@ #include <linux/fcntl.h> #include <linux/kthread.h> #include <linux/rwsem.h> +#include <linux/module.h> +#include <linux/mutex.h> #include <xen/xenbus.h> #include "xenbus_comms.h" -/* xenbus_probe.c */ -extern char *kasprintf(const char *fmt, ...); +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif struct xs_stored_msg { struct list_head list; @@ -289,9 +292,9 @@ static char *join(const char *dir, const char *name) char *buffer; if (strlen(name) == 0) - buffer = kasprintf("%s", dir); + buffer = kasprintf(GFP_KERNEL, "%s", dir); else - buffer = kasprintf("%s/%s", dir, name); + buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name); return (!buffer) ? ERR_PTR(-ENOMEM) : buffer; } diff --git a/linux-2.6-xen-sparse/drivers/xen/xenoprof/xenoprofile.c b/linux-2.6-xen-sparse/drivers/xen/xenoprof/xenoprofile.c new file mode 100644 index 0000000000..382a50f647 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/xenoprof/xenoprofile.c @@ -0,0 +1,500 @@ +/** + * @file xenoprofile.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon <levon@movementarian.org> + * + * Modified by Aravind Menon and Jose Renato Santos for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * + * Separated out arch-generic part + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + */ + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/oprofile.h> +#include <linux/sysdev.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/vmalloc.h> +#include <asm/pgtable.h> +#include <xen/evtchn.h> +#include <xen/xenoprof.h> +#include <xen/driver_util.h> +#include <xen/interface/xen.h> +#include <xen/interface/xenoprof.h> +#include "../../../drivers/oprofile/cpu_buffer.h" +#include "../../../drivers/oprofile/event_buffer.h" + +#define MAX_XENOPROF_SAMPLES 16 + +/* sample buffers shared with Xen */ +xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS]; +/* Shared buffer area */ +struct xenoprof_shared_buffer shared_buffer; + +/* Passive sample buffers shared with Xen */ +xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS]; +/* Passive shared buffer area */ +struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS]; + +static int xenoprof_start(void); +static void xenoprof_stop(void); + +static int xenoprof_enabled = 0; +static int xenoprof_is_primary = 0; +static int active_defined; + +/* Number of buffers in shared area (one per VCPU) */ +int nbuf; +/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */ +int ovf_irq[NR_CPUS]; +/* cpu model type string - copied from Xen memory space on XENOPROF_init command */ +char cpu_type[XENOPROF_CPU_TYPE_SIZE]; + +#ifdef CONFIG_PM + +static int xenoprof_suspend(struct sys_device * dev, pm_message_t state) +{ + if (xenoprof_enabled == 1) + xenoprof_stop(); + return 0; +} + + +static int xenoprof_resume(struct sys_device * dev) +{ + if (xenoprof_enabled == 1) + xenoprof_start(); + return 0; +} + + +static struct sysdev_class oprofile_sysclass = { + set_kset_name("oprofile"), + .resume = xenoprof_resume, + .suspend = xenoprof_suspend +}; + + +static struct sys_device device_oprofile = { + .id = 0, + .cls = &oprofile_sysclass, +}; + + +static int __init init_driverfs(void) +{ + int error; + if (!(error = sysdev_class_register(&oprofile_sysclass))) + error = sysdev_register(&device_oprofile); + return error; +} + + +static void exit_driverfs(void) +{ + sysdev_unregister(&device_oprofile); + sysdev_class_unregister(&oprofile_sysclass); +} + +#else +#define init_driverfs() do { } while (0) +#define exit_driverfs() do { } while (0) +#endif /* CONFIG_PM */ + +unsigned long long oprofile_samples = 0; +unsigned long long p_oprofile_samples = 0; + +unsigned int pdomains; +struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS]; + +static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive) +{ + int head, tail, size; + + head = buf->event_head; + tail = buf->event_tail; + size = buf->event_size; + + if (tail > head) { + while (tail < size) { + oprofile_add_pc(buf->event_log[tail].eip, + buf->event_log[tail].mode, + buf->event_log[tail].event); + if (!is_passive) + oprofile_samples++; + else + p_oprofile_samples++; + tail++; + } + tail = 0; + } + while (tail < head) { + oprofile_add_pc(buf->event_log[tail].eip, + buf->event_log[tail].mode, + buf->event_log[tail].event); + if (!is_passive) + oprofile_samples++; + else + p_oprofile_samples++; + tail++; + } + + buf->event_tail = tail; +} + +static void xenoprof_handle_passive(void) +{ + int i, j; + int flag_domain, flag_switch = 0; + + for (i = 0; i < pdomains; i++) { + flag_domain = 0; + for (j = 0; j < passive_domains[i].nbuf; j++) { + xenoprof_buf_t *buf = p_xenoprof_buf[i][j]; + if (buf->event_head == buf->event_tail) + continue; + if (!flag_domain) { + if (!oprofile_add_domain_switch(passive_domains[i]. + domain_id)) + goto done; + flag_domain = 1; + } + xenoprof_add_pc(buf, 1); + flag_switch = 1; + } + } +done: + if (flag_switch) + oprofile_add_domain_switch(COORDINATOR_DOMAIN); +} + +static irqreturn_t +xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs) +{ + struct xenoprof_buf * buf; + int cpu; + static unsigned long flag; + + cpu = smp_processor_id(); + buf = xenoprof_buf[cpu]; + + xenoprof_add_pc(buf, 0); + + if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) { + xenoprof_handle_passive(); + smp_mb__before_clear_bit(); + clear_bit(0, &flag); + } + + return IRQ_HANDLED; +} + + +static void unbind_virq(void) +{ + int i; + + for_each_online_cpu(i) { + if (ovf_irq[i] >= 0) { + unbind_from_irqhandler(ovf_irq[i], NULL); + ovf_irq[i] = -1; + } + } +} + + +static int bind_virq(void) +{ + int i, result; + + for_each_online_cpu(i) { + result = bind_virq_to_irqhandler(VIRQ_XENOPROF, + i, + xenoprof_ovf_interrupt, + SA_INTERRUPT, + "xenoprof", + NULL); + + if (result < 0) { + unbind_virq(); + return result; + } + + ovf_irq[i] = result; + } + + return 0; +} + + +static void unmap_passive_list(void) +{ + int i; + for (i = 0; i < pdomains; i++) + xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]); + pdomains = 0; +} + + +static int map_xenoprof_buffer(int max_samples) +{ + struct xenoprof_get_buffer get_buffer; + struct xenoprof_buf *buf; + int ret, i; + + if ( shared_buffer.buffer ) + return 0; + + get_buffer.max_samples = max_samples; + ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer); + if (ret) + return ret; + nbuf = get_buffer.nbuf; + + for (i=0; i< nbuf; i++) { + buf = (struct xenoprof_buf*) + &shared_buffer.buffer[i * get_buffer.bufsize]; + BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS); + xenoprof_buf[buf->vcpu_id] = buf; + } + + return 0; +} + + +static int xenoprof_setup(void) +{ + int ret; + + if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) ) + return ret; + + if ( (ret = bind_virq()) ) + return ret; + + if (xenoprof_is_primary) { + /* Define dom0 as an active domain if not done yet */ + if (!active_defined) { + domid_t domid; + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); + if (ret) + goto err; + domid = 0; + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); + if (ret) + goto err; + active_defined = 1; + } + + ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL); + if (ret) + goto err; + xenoprof_arch_counter(); + ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL); + + if (ret) + goto err; + } + + ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL); + if (ret) + goto err; + + xenoprof_enabled = 1; + return 0; + err: + unbind_virq(); + return ret; +} + + +static void xenoprof_shutdown(void) +{ + xenoprof_enabled = 0; + + HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL); + + if (xenoprof_is_primary) { + HYPERVISOR_xenoprof_op(XENOPROF_release_counters, NULL); + active_defined = 0; + } + + unbind_virq(); + + xenoprof_arch_unmap_shared_buffer(&shared_buffer); + if (xenoprof_is_primary) + unmap_passive_list(); +} + + +static int xenoprof_start(void) +{ + int ret = 0; + + if (xenoprof_is_primary) + ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL); + if (!ret) + xenoprof_arch_start(); + return ret; +} + + +static void xenoprof_stop(void) +{ + if (xenoprof_is_primary) + HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL); + xenoprof_arch_stop(); +} + + +static int xenoprof_set_active(int * active_domains, + unsigned int adomains) +{ + int ret = 0; + int i; + int set_dom0 = 0; + domid_t domid; + + if (!xenoprof_is_primary) + return 0; + + if (adomains > MAX_OPROF_DOMAINS) + return -E2BIG; + + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); + if (ret) + return ret; + + for (i=0; i<adomains; i++) { + domid = active_domains[i]; + if (domid != active_domains[i]) { + ret = -EINVAL; + goto out; + } + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); + if (ret) + goto out; + if (active_domains[i] == 0) + set_dom0 = 1; + } + /* dom0 must always be active but may not be in the list */ + if (!set_dom0) { + domid = 0; + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); + } + +out: + if (ret) + HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); + active_defined = !ret; + return ret; +} + +static int xenoprof_set_passive(int * p_domains, + unsigned int pdoms) +{ + int ret; + int i, j; + struct xenoprof_buf *buf; + + if (!xenoprof_is_primary) + return 0; + + if (pdoms > MAX_OPROF_DOMAINS) + return -E2BIG; + + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL); + if (ret) + return ret; + unmap_passive_list(); + + for (i = 0; i < pdoms; i++) { + passive_domains[i].domain_id = p_domains[i]; + passive_domains[i].max_samples = 2048; + ret = xenoprof_arch_set_passive(&passive_domains[i], + &p_shared_buffer[i]); + if (ret) + goto out; + for (j = 0; j < passive_domains[i].nbuf; j++) { + buf = (struct xenoprof_buf *) + &p_shared_buffer[i].buffer[j * passive_domains[i].bufsize]; + BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS); + p_xenoprof_buf[i][buf->vcpu_id] = buf; + } + } + + pdomains = pdoms; + return 0; + +out: + for (j = 0; j < i; j++) + xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]); + + return ret; +} + +struct oprofile_operations xenoprof_ops = { +#ifdef HAVE_XENOPROF_CREATE_FILES + .create_files = xenoprof_create_files, +#endif + .set_active = xenoprof_set_active, + .set_passive = xenoprof_set_passive, + .setup = xenoprof_setup, + .shutdown = xenoprof_shutdown, + .start = xenoprof_start, + .stop = xenoprof_stop +}; + + +/* in order to get driverfs right */ +static int using_xenoprof; + +int __init xenoprofile_init(struct oprofile_operations * ops) +{ + struct xenoprof_init init; + int ret, i; + + ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init); + if (!ret) { + xenoprof_arch_init_counter(&init); + xenoprof_is_primary = init.is_primary; + + /* cpu_type is detected by Xen */ + cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0; + strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1); + xenoprof_ops.cpu_type = cpu_type; + + init_driverfs(); + using_xenoprof = 1; + *ops = xenoprof_ops; + + for (i=0; i<NR_CPUS; i++) + ovf_irq[i] = -1; + + active_defined = 0; + } + printk(KERN_INFO "%s: ret %d, events %d, xenoprof_is_primary %d\n", + __func__, ret, init.num_events, xenoprof_is_primary); + return ret; +} + + +void xenoprofile_exit(void) +{ + if (using_xenoprof) + exit_driverfs(); + + xenoprof_arch_unmap_shared_buffer(&shared_buffer); + if (xenoprof_is_primary) { + unmap_passive_list(); + HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL); + } +} diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/fixmap.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/fixmap.h index a6f3e9ea79..a9c3cc28fd 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/fixmap.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/fixmap.h @@ -27,7 +27,6 @@ extern unsigned long __FIXADDR_TOP; #include <asm/acpi.h> #include <asm/apicdef.h> #include <asm/page.h> -#include <xen/gnttab.h> #ifdef CONFIG_HIGHMEM #include <linux/threads.h> #include <asm/kmap_types.h> @@ -99,7 +98,7 @@ enum fixed_addresses { extern void __set_fixmap(enum fixed_addresses idx, maddr_t phys, pgprot_t flags); -extern void set_fixaddr_top(unsigned long top); +extern void set_fixaddr_top(void); #define set_fixmap(idx, phys) \ __set_fixmap(idx, phys, PAGE_KERNEL) diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h index 2e6d1fa596..a12e349016 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h @@ -260,6 +260,8 @@ HYPERVISOR_event_channel_op( int cmd, void *arg) { int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#ifdef CONFIG_XEN_COMPAT_030002 if (unlikely(rc == -ENOSYS)) { struct evtchn_op op; op.cmd = cmd; @@ -267,6 +269,8 @@ HYPERVISOR_event_channel_op( rc = _hypercall1(int, event_channel_op_compat, &op); memcpy(arg, &op.u, sizeof(op.u)); } +#endif + return rc; } @@ -296,6 +300,8 @@ HYPERVISOR_physdev_op( int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); + +#ifdef CONFIG_XEN_COMPAT_030002 if (unlikely(rc == -ENOSYS)) { struct physdev_op op; op.cmd = cmd; @@ -303,6 +309,8 @@ HYPERVISOR_physdev_op( rc = _hypercall1(int, physdev_op_compat, &op); memcpy(arg, &op.u, sizeof(op.u)); } +#endif + return rc; } @@ -350,9 +358,11 @@ HYPERVISOR_suspend( int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, &sched_shutdown, srec); +#ifdef CONFIG_XEN_COMPAT_030002 if (rc == -ENOSYS) rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, SHUTDOWN_suspend, srec); +#endif return rc; } diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypervisor.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypervisor.h index 47586d22f9..f7904ac0b0 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypervisor.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypervisor.h @@ -56,6 +56,10 @@ extern shared_info_t *HYPERVISOR_shared_info; +#ifdef CONFIG_X86_32 +extern unsigned long hypervisor_virt_start; +#endif + /* arch/xen/i386/kernel/setup.c */ extern start_info_t *xen_start_info; #ifdef CONFIG_XEN_PRIVILEGED_GUEST @@ -94,7 +98,6 @@ void xen_pgd_pin(unsigned long ptr); void xen_pgd_unpin(unsigned long ptr); void xen_set_ldt(unsigned long ptr, unsigned long bytes); -void xen_machphys_update(unsigned long mfn, unsigned long pfn); #ifdef CONFIG_SMP #include <linux/cpumask.h> @@ -131,8 +134,10 @@ HYPERVISOR_yield( { int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); +#ifdef CONFIG_XEN_COMPAT_030002 if (rc == -ENOSYS) rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif return rc; } @@ -143,8 +148,10 @@ HYPERVISOR_block( { int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); +#ifdef CONFIG_XEN_COMPAT_030002 if (rc == -ENOSYS) rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); +#endif return rc; } @@ -159,8 +166,10 @@ HYPERVISOR_shutdown( int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown); +#ifdef CONFIG_XEN_COMPAT_030002 if (rc == -ENOSYS) rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason); +#endif return rc; } @@ -177,8 +186,10 @@ HYPERVISOR_poll( set_xen_guest_handle(sched_poll.ports, ports); rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#ifdef CONFIG_XEN_COMPAT_030002 if (rc == -ENOSYS) rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif return rc; } diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/io.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/io.h index 7f9b7cdd36..ed5203a573 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/io.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/io.h @@ -54,7 +54,8 @@ * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access */ -#define xlate_dev_mem_ptr(p) __va(p) +#define xlate_dev_mem_ptr(p, sz) ioremap(p, sz) +#define xlate_dev_mem_ptr_unmap(p) iounmap(p) /* * Convert a virtual cached pointer to an uncached pointer diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/maddr.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/maddr.h index b467320d5c..f805d6ea60 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/maddr.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/maddr.h @@ -9,6 +9,15 @@ #define FOREIGN_FRAME_BIT (1UL<<31) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) +/* Definitions for machine and pseudophysical addresses. */ +#ifdef CONFIG_X86_PAE +typedef unsigned long long paddr_t; +typedef unsigned long long maddr_t; +#else +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; +#endif + #ifdef CONFIG_XEN extern unsigned long *phys_to_machine_mapping; @@ -101,32 +110,13 @@ static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) phys_to_machine_mapping[pfn] = mfn; } - -#else /* !CONFIG_XEN */ - -#define pfn_to_mfn(pfn) (pfn) -#define mfn_to_pfn(mfn) (mfn) -#define mfn_to_local_pfn(mfn) (mfn) -#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn)) -#define phys_to_machine_mapping_valid(pfn) (1) - -#endif /* !CONFIG_XEN */ - -/* Definitions for machine and pseudophysical addresses. */ -#ifdef CONFIG_X86_PAE -typedef unsigned long long paddr_t; -typedef unsigned long long maddr_t; -#else -typedef unsigned long paddr_t; -typedef unsigned long maddr_t; -#endif - static inline maddr_t phys_to_machine(paddr_t phys) { maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); return machine; } + static inline paddr_t machine_to_phys(maddr_t machine) { paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); @@ -134,6 +124,32 @@ static inline paddr_t machine_to_phys(maddr_t machine) return phys; } +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to mfn_to_pfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn)) +#define phys_to_machine_mapping_valid(pfn) (1) +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pte_machine_to_phys(mach) ((paddr_t)(mach)) + +#endif /* !CONFIG_XEN */ + /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(__pa(v))) #define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h index 0f829c8cd3..ff183db39e 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h @@ -6,6 +6,16 @@ #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) +#ifdef CONFIG_X86_PAE +#define __PHYSICAL_MASK_SHIFT 36 +#define __PHYSICAL_MASK ((1ULL << __PHYSICAL_MASK_SHIFT) - 1) +#define PHYSICAL_PAGE_MASK (~((1ULL << PAGE_SHIFT) - 1) & __PHYSICAL_MASK) +#else +#define __PHYSICAL_MASK_SHIFT 32 +#define __PHYSICAL_MASK (~0UL) +#define PHYSICAL_PAGE_MASK (PAGE_MASK & __PHYSICAL_MASK) +#endif + #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) #define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) @@ -85,7 +95,7 @@ static inline unsigned long long pte_val(pte_t x) if (x.pte_low) { ret = x.pte_low | (unsigned long long)x.pte_high << 32; - ret = machine_to_phys(ret) | 1; + ret = pte_machine_to_phys(ret) | 1; } else { ret = 0; } @@ -94,13 +104,13 @@ static inline unsigned long long pte_val(pte_t x) static inline unsigned long long pmd_val(pmd_t x) { unsigned long long ret = x.pmd; - if (ret) ret = machine_to_phys(ret) | 1; + if (ret) ret = pte_machine_to_phys(ret) | 1; return ret; } static inline unsigned long long pgd_val(pgd_t x) { unsigned long long ret = x.pgd; - if (ret) ret = machine_to_phys(ret) | 1; + if (ret) ret = pte_machine_to_phys(ret) | 1; return ret; } static inline unsigned long long pte_val_ma(pte_t x) @@ -115,7 +125,8 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pgprot_val(x) ((x).pgprot) #include <asm/maddr.h> #define boot_pte_t pte_t /* or would you rather have a typedef */ -#define pte_val(x) (((x).pte_low & 1) ? machine_to_phys((x).pte_low) : \ +#define pte_val(x) (((x).pte_low & 1) ? \ + pte_machine_to_phys((x).pte_low) : \ (x).pte_low) #define pte_val_ma(x) ((x).pte_low) #define __pte(x) ({ unsigned long _x = (x); \ @@ -125,7 +136,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; static inline unsigned long pgd_val(pgd_t x) { unsigned long ret = x.pgd; - if (ret) ret = machine_to_phys(ret) | 1; + if (ret) ret = pte_machine_to_phys(ret) | 1; return ret; } #define HPAGE_SHIFT 22 diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h index 3791d2de39..bd6346f410 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h @@ -9,7 +9,6 @@ #define PGDIR_SHIFT 22 #define PTRS_PER_PGD 1024 -#define PTRS_PER_PGD_NO_HV (HYPERVISOR_VIRT_START >> PGDIR_SHIFT) /* * the i386 is two-level, so we don't really have any diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h index 10445c142c..148c8d9e78 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h @@ -8,7 +8,6 @@ */ #define PGDIR_SHIFT 30 #define PTRS_PER_PGD 4 -#define PTRS_PER_PGD_NO_HV 4 /* * PMD_SHIFT determines the size of the area a middle-level diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/synch_bitops.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/synch_bitops.h index 6a4e5e4508..807ca388c5 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/synch_bitops.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/synch_bitops.h @@ -9,6 +9,10 @@ #include <linux/config.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define ADDR (*(volatile long *) addr) static __inline__ void synch_set_bit(int nr, volatile void * addr) diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/xenoprof.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/xenoprof.h new file mode 100644 index 0000000000..2733e00ee4 --- /dev/null +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/xenoprof.h @@ -0,0 +1,48 @@ +/****************************************************************************** + * asm-i386/mach-xen/asm/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef __ASM_XENOPROF_H__ +#define __ASM_XENOPROF_H__ +#ifdef CONFIG_XEN + +struct super_block; +struct dentry; +int xenoprof_create_files(struct super_block * sb, struct dentry * root); +#define HAVE_XENOPROF_CREATE_FILES + +struct xenoprof_init; +void xenoprof_arch_init_counter(struct xenoprof_init *init); +void xenoprof_arch_counter(void); +void xenoprof_arch_start(void); +void xenoprof_arch_stop(void); + +struct xenoprof_arch_shared_buffer { + /* nothing */ +}; +struct xenoprof_shared_buffer; +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf); +struct xenoprof_get_buffer; +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf); +struct xenoprof_passive; +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf); + +#endif /* CONFIG_XEN */ +#endif /* __ASM_XENOPROF_H__ */ diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h index 0f1caa0604..bed1e1d211 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h @@ -56,15 +56,15 @@ static void __init machine_specific_arch_setup(void) struct xen_machphys_mapping mapping; unsigned long machine_to_phys_nr_ents; struct xen_platform_parameters pp; - struct callback_register event = { + static struct callback_register __initdata event = { .type = CALLBACKTYPE_event, .address = { __KERNEL_CS, (unsigned long)hypervisor_callback }, }; - struct callback_register failsafe = { + static struct callback_register __initdata failsafe = { .type = CALLBACKTYPE_failsafe, .address = { __KERNEL_CS, (unsigned long)failsafe_callback }, }; - struct callback_register nmi_cb = { + static struct callback_register __initdata nmi_cb = { .type = CALLBACKTYPE_nmi, .address = { __KERNEL_CS, (unsigned long)nmi }, }; @@ -72,23 +72,30 @@ static void __init machine_specific_arch_setup(void) ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); if (ret == 0) ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); +#ifdef CONFIG_XEN_COMPAT_030002 if (ret == -ENOSYS) ret = HYPERVISOR_set_callbacks( event.address.cs, event.address.eip, failsafe.address.cs, failsafe.address.eip); +#endif BUG_ON(ret); ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); +#ifdef CONFIG_XEN_COMPAT_030002 if (ret == -ENOSYS) { - struct xennmi_callback cb; + static struct xennmi_callback __initdata cb = { + .handler_address = (unsigned long)nmi + }; - cb.handler_address = nmi_cb.address.eip; HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); } +#endif if (HYPERVISOR_xen_version(XENVER_platform_parameters, - &pp) == 0) - set_fixaddr_top(pp.virt_start - PAGE_SIZE); + &pp) == 0) { + hypervisor_virt_start = pp.virt_start; + set_fixaddr_top(); + } machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START; machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; diff --git a/linux-2.6-xen-sparse/include/asm-ia64/hypercall.h b/linux-2.6-xen-sparse/include/asm-ia64/hypercall.h index 8375336941..7a522be483 100644 --- a/linux-2.6-xen-sparse/include/asm-ia64/hypercall.h +++ b/linux-2.6-xen-sparse/include/asm-ia64/hypercall.h @@ -33,12 +33,13 @@ #ifndef __HYPERCALL_H__ #define __HYPERCALL_H__ -#include <linux/string.h> /* memcpy() */ - #ifndef __HYPERVISOR_H__ # error "please don't include this file directly" #endif +#include <asm/xen/xcom_hcall.h> +struct xencomm_handle; + /* * Assembler stubs for hyper-calls. */ @@ -157,157 +158,117 @@ (type)__res; \ }) -static inline int -HYPERVISOR_sched_op_compat( - int cmd, unsigned long arg) -{ - return _hypercall2(int, sched_op_compat, cmd, arg); -} static inline int -HYPERVISOR_sched_op( - int cmd, void *arg) +xencomm_arch_hypercall_sched_op(int cmd, struct xencomm_handle *arg) { return _hypercall2(int, sched_op, cmd, arg); } static inline long -HYPERVISOR_set_timer_op( - u64 timeout) +HYPERVISOR_set_timer_op(u64 timeout) { - unsigned long timeout_hi = (unsigned long)(timeout>>32); - unsigned long timeout_lo = (unsigned long)timeout; - return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); + unsigned long timeout_hi = (unsigned long)(timeout >> 32); + unsigned long timeout_lo = (unsigned long)timeout; + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); } static inline int -HYPERVISOR_dom0_op( - dom0_op_t *dom0_op) +xencomm_arch_hypercall_dom0_op(struct xencomm_handle *op) { - dom0_op->interface_version = DOM0_INTERFACE_VERSION; - return _hypercall1(int, dom0_op, dom0_op); + return _hypercall1(int, dom0_op, op); } static inline int -HYPERVISOR_multicall( - void *call_list, int nr_calls) +xencomm_arch_hypercall_sysctl(struct xencomm_handle *op) { - return _hypercall2(int, multicall, call_list, nr_calls); + return _hypercall1(int, sysctl, op); } -//XXX xen/ia64 copy_from_guest() is broken. -// This is a temporal work around until it is fixed. static inline int -____HYPERVISOR_memory_op( - unsigned int cmd, void *arg) +xencomm_arch_hypercall_domctl(struct xencomm_handle *op) { - return _hypercall2(int, memory_op, cmd, arg); + return _hypercall1(int, domctl, op); } -#include <xen/interface/memory.h> -#ifdef CONFIG_VMX_GUEST -# define ia64_xenmem_reservation_op(op, xmr) (0) -#else -int ia64_xenmem_reservation_op(unsigned long op, - struct xen_memory_reservation* reservation__); -#endif static inline int -HYPERVISOR_memory_op( - unsigned int cmd, void *arg) +xencomm_arch_hypercall_multicall(struct xencomm_handle *call_list, + int nr_calls) { - switch (cmd) { - case XENMEM_increase_reservation: - case XENMEM_decrease_reservation: - case XENMEM_populate_physmap: - return ia64_xenmem_reservation_op(cmd, - (struct xen_memory_reservation*)arg); - default: - return ____HYPERVISOR_memory_op(cmd, arg); - } - /* NOTREACHED */ + return _hypercall2(int, multicall, call_list, nr_calls); } static inline int -HYPERVISOR_event_channel_op( - int cmd, void *arg) +xencomm_arch_hypercall_memory_op(unsigned int cmd, struct xencomm_handle *arg) { - int rc = _hypercall2(int, event_channel_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct evtchn_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, event_channel_op_compat, &op); - } - return rc; + return _hypercall2(int, memory_op, cmd, arg); } static inline int -HYPERVISOR_acm_op( - unsigned int cmd, void *arg) +xencomm_arch_hypercall_event_channel_op(int cmd, struct xencomm_handle *arg) { - return _hypercall2(int, acm_op, cmd, arg); + return _hypercall2(int, event_channel_op, cmd, arg); } static inline int -HYPERVISOR_xen_version( - int cmd, void *arg) +xencomm_arch_hypercall_acm_op(unsigned int cmd, struct xencomm_handle *arg) { - return _hypercall2(int, xen_version, cmd, arg); + return _hypercall2(int, acm_op, cmd, arg); } static inline int -HYPERVISOR_console_io( - int cmd, int count, char *str) +xencomm_arch_hypercall_xen_version(int cmd, struct xencomm_handle *arg) { - return _hypercall3(int, console_io, cmd, count, str); + return _hypercall2(int, xen_version, cmd, arg); } static inline int -HYPERVISOR_physdev_op( - int cmd, void *arg) +xencomm_arch_hypercall_console_io(int cmd, int count, + struct xencomm_handle *str) { - int rc = _hypercall2(int, physdev_op, cmd, arg); - if (unlikely(rc == -ENOSYS)) { - struct physdev_op op; - op.cmd = cmd; - memcpy(&op.u, arg, sizeof(op.u)); - rc = _hypercall1(int, physdev_op_compat, &op); - } - return rc; + return _hypercall3(int, console_io, cmd, count, str); } -//XXX __HYPERVISOR_grant_table_op is used for this hypercall constant. static inline int -____HYPERVISOR_grant_table_op( - unsigned int cmd, void *uop, unsigned int count, - unsigned long pa1, unsigned long pa2) +xencomm_arch_hypercall_physdev_op(int cmd, struct xencomm_handle *arg) { - return _hypercall5(int, grant_table_op, cmd, uop, count, pa1, pa2); + return _hypercall2(int, physdev_op, cmd, arg); +} + +static inline int +xencomm_arch_hypercall_grant_table_op(unsigned int cmd, + struct xencomm_handle *uop, + unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); } int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count); +extern int xencomm_arch_hypercall_suspend(struct xencomm_handle *arg); + static inline int -HYPERVISOR_vcpu_op( - int cmd, int vcpuid, void *extra_args) +xencomm_arch_hypercall_callback_op(int cmd, struct xencomm_handle *arg) { - return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); + return _hypercall2(int, callback_op, cmd, arg); } -extern int HYPERVISOR_suspend(unsigned long srec); - static inline unsigned long -HYPERVISOR_hvm_op( - int cmd, void *arg) +xencomm_arch_hypercall_hvm_op(int cmd, void *arg) { return _hypercall2(unsigned long, hvm_op, cmd, arg); } static inline int -HYPERVISOR_callback_op( - int cmd, void *arg) +HYPERVISOR_physdev_op(int cmd, void *arg) { - return _hypercall2(int, callback_op, cmd, arg); + switch (cmd) { + case PHYSDEVOP_eoi: + return _hypercall1(int, ia64_fast_eoi, + ((struct physdev_eoi *)arg)->irq); + default: + return xencomm_hypercall_physdev_op(cmd, arg); + } } extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs); @@ -322,6 +283,9 @@ static inline void exit_idle(void) {} #ifdef CONFIG_XEN #include <asm/xen/privop.h> #endif /* CONFIG_XEN */ +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif static inline unsigned long __HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size) @@ -417,7 +381,42 @@ HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn, return ret; } +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M +static inline unsigned long +HYPERVISOR_expose_p2m(unsigned long conv_start_gpfn, + unsigned long assign_start_gpfn, + unsigned long expose_size, unsigned long granule_pfn) +{ + return _hypercall5(unsigned long, ia64_dom0vp_op, + IA64_DOM0VP_expose_p2m, conv_start_gpfn, + assign_start_gpfn, expose_size, granule_pfn); +} +#endif + // for balloon driver #define HYPERVISOR_update_va_mapping(va, new_val, flags) (0) +/* Use xencomm to do hypercalls. */ +#ifdef MODULE +#define HYPERVISOR_sched_op xencomm_mini_hypercall_sched_op +#define HYPERVISOR_event_channel_op xencomm_mini_hypercall_event_channel_op +#define HYPERVISOR_callback_op xencomm_mini_hypercall_callback_op +#define HYPERVISOR_multicall xencomm_mini_hypercall_multicall +#define HYPERVISOR_xen_version xencomm_mini_hypercall_xen_version +#define HYPERVISOR_console_io xencomm_mini_hypercall_console_io +#define HYPERVISOR_hvm_op xencomm_mini_hypercall_hvm_op +#define HYPERVISOR_memory_op xencomm_mini_hypercall_memory_op +#else +#define HYPERVISOR_sched_op xencomm_hypercall_sched_op +#define HYPERVISOR_event_channel_op xencomm_hypercall_event_channel_op +#define HYPERVISOR_callback_op xencomm_hypercall_callback_op +#define HYPERVISOR_multicall xencomm_hypercall_multicall +#define HYPERVISOR_xen_version xencomm_hypercall_xen_version +#define HYPERVISOR_console_io xencomm_hypercall_console_io +#define HYPERVISOR_hvm_op xencomm_hypercall_hvm_op +#define HYPERVISOR_memory_op xencomm_hypercall_memory_op +#endif + +#define HYPERVISOR_suspend xencomm_hypercall_suspend + #endif /* __HYPERCALL_H__ */ diff --git a/linux-2.6-xen-sparse/include/asm-ia64/hypervisor.h b/linux-2.6-xen-sparse/include/asm-ia64/hypervisor.h index 7b1a9a7fc9..083884c130 100644 --- a/linux-2.6-xen-sparse/include/asm-ia64/hypervisor.h +++ b/linux-2.6-xen-sparse/include/asm-ia64/hypervisor.h @@ -75,9 +75,6 @@ HYPERVISOR_yield( { int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); - if (rc == -ENOSYS) - rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); - return rc; } @@ -87,9 +84,6 @@ HYPERVISOR_block( { int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); - if (rc == -ENOSYS) - rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); - return rc; } @@ -103,9 +97,6 @@ HYPERVISOR_shutdown( int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown); - if (rc == -ENOSYS) - rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason); - return rc; } @@ -122,8 +113,6 @@ HYPERVISOR_poll( set_xen_guest_handle(sched_poll.ports, ports); rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); - if (rc == -ENOSYS) - rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); return rc; } @@ -138,6 +127,7 @@ int direct_remap_pfn_range(struct vm_area_struct *vma, pgprot_t prot, domid_t domid); struct file; +int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); int privcmd_mmap(struct file * file, struct vm_area_struct * vma); #define HAVE_ARCH_PRIVCMD_MMAP @@ -201,6 +191,22 @@ MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd, mcl->args[2] = count; } +/* + * for blktap.c + * int create_lookup_pte_addr(struct mm_struct *mm, + * unsigned long address, + * uint64_t *ptep); + */ +#define create_lookup_pte_addr(mm, address, ptep) \ + ({ \ + printk(KERN_EMERG \ + "%s:%d " \ + "create_lookup_pte_addr() isn't supported.\n", \ + __func__, __LINE__); \ + BUG(); \ + (-ENOSYS); \ + }) + // for debug asmlinkage int xprintk(const char *fmt, ...); #define xprintd(fmt, ...) xprintk("%s:%d " fmt, __func__, __LINE__, \ diff --git a/linux-2.6-xen-sparse/include/asm-ia64/maddr.h b/linux-2.6-xen-sparse/include/asm-ia64/maddr.h index 55c6f94d10..cbdef5a96e 100644 --- a/linux-2.6-xen-sparse/include/asm-ia64/maddr.h +++ b/linux-2.6-xen-sparse/include/asm-ia64/maddr.h @@ -10,11 +10,26 @@ #define INVALID_P2M_ENTRY (~0UL) +#ifdef CONFIG_XEN_IA64_EXPOSE_P2M +extern int p2m_initialized; +extern unsigned long p2m_min_low_pfn; +extern unsigned long p2m_max_low_pfn; +extern unsigned long p2m_convert_min_pfn; +extern unsigned long p2m_convert_max_pfn; +extern volatile const pte_t* p2m_pte; +unsigned long p2m_phystomach(unsigned long gpfn); +#else +#define p2m_initialized (0) +#define p2m_phystomach(gpfn) INVALID_MFN +#endif + /* XXX xen page size != page size */ static inline unsigned long pfn_to_mfn_for_dma(unsigned long pfn) { unsigned long mfn; + if (p2m_initialized) + return p2m_phystomach(pfn); mfn = HYPERVISOR_phystomach(pfn); BUG_ON(mfn == 0); // XXX BUG_ON(mfn == INVALID_P2M_ENTRY); // XXX @@ -81,11 +96,6 @@ mfn_to_local_pfn(unsigned long mfn) #define virt_to_machine(virt) __pa(virt) // for tpmfront.c #define set_phys_to_machine(pfn, mfn) do { } while (0) -#ifdef CONFIG_VMX_GUEST -extern void xen_machphys_update(unsigned long mfn, unsigned long pfn); -#else /* CONFIG_VMX_GUEST */ -#define xen_machphys_update(mfn, pfn) do { } while (0) -#endif /* CONFIG_VMX_GUEST */ typedef unsigned long maddr_t; // to compile netback, netfront diff --git a/linux-2.6-xen-sparse/include/asm-ia64/xen/privop.h b/linux-2.6-xen-sparse/include/asm-ia64/xen/privop.h index 073b3a2a77..6f3c20a8ed 100644 --- a/linux-2.6-xen-sparse/include/asm-ia64/xen/privop.h +++ b/linux-2.6-xen-sparse/include/asm-ia64/xen/privop.h @@ -14,12 +14,9 @@ #define IA64_PARAVIRTUALIZED -#if 0 -#undef XSI_BASE /* At 1 MB, before per-cpu space but still addressable using addl instead of movl. */ #define XSI_BASE 0xfffffffffff00000 -#endif /* Address of mapped regs. */ #define XMAPPEDREGS_BASE (XSI_BASE + XSI_SIZE) diff --git a/linux-2.6-xen-sparse/include/asm-ia64/xen/xcom_hcall.h b/linux-2.6-xen-sparse/include/asm-ia64/xen/xcom_hcall.h new file mode 100644 index 0000000000..3c073a71cd --- /dev/null +++ b/linux-2.6-xen-sparse/include/asm-ia64/xen/xcom_hcall.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2006 Tristan Gingold <tristan.gingold@bull.net>, Bull SAS + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_XENCOMM_HCALL_H_ +#define _LINUX_XENCOMM_HCALL_H_ + +/* These function creates inline descriptor for the parameters and + calls the corresponding xencomm_arch_hypercall_X. + Architectures should defines HYPERVISOR_xxx as xencomm_hypercall_xxx unless + they want to use their own wrapper. */ +extern int xencomm_hypercall_console_io(int cmd, int count, char *str); + +extern int xencomm_hypercall_event_channel_op(int cmd, void *op); + +extern int xencomm_hypercall_xen_version(int cmd, void *arg); + +extern int xencomm_hypercall_physdev_op(int cmd, void *op); + +extern int xencomm_hypercall_grant_table_op(unsigned int cmd, void *op, + unsigned int count); + +extern int xencomm_hypercall_sched_op(int cmd, void *arg); + +extern int xencomm_hypercall_multicall(void *call_list, int nr_calls); + +extern int xencomm_hypercall_callback_op(int cmd, void *arg); + +extern int xencomm_hypercall_memory_op(unsigned int cmd, void *arg); + +extern unsigned long xencomm_hypercall_hvm_op(int cmd, void *arg); + +extern int xencomm_hypercall_suspend(unsigned long srec); + +/* Using mini xencomm. */ +extern int xencomm_mini_hypercall_console_io(int cmd, int count, char *str); + +extern int xencomm_mini_hypercall_event_channel_op(int cmd, void *op); + +extern int xencomm_mini_hypercall_xen_version(int cmd, void *arg); + +extern int xencomm_mini_hypercall_physdev_op(int cmd, void *op); + +extern int xencomm_mini_hypercall_grant_table_op(unsigned int cmd, void *op, + unsigned int count); + +extern int xencomm_mini_hypercall_sched_op(int cmd, void *arg); + +extern int xencomm_mini_hypercall_multicall(void *call_list, int nr_calls); + +extern int xencomm_mini_hypercall_callback_op(int cmd, void *arg); + +extern int xencomm_mini_hypercall_memory_op(unsigned int cmd, void *arg); + +extern unsigned long xencomm_mini_hypercall_hvm_op(int cmd, void *arg); + +/* For privcmd. Locally declare argument type to avoid include storm. + Type coherency will be checked within privcmd.c */ +struct privcmd_hypercall; +extern int privcmd_hypercall(struct privcmd_hypercall *hypercall); + +#endif /* _LINUX_XENCOMM_HCALL_H_ */ diff --git a/linux-2.6-xen-sparse/include/asm-ia64/xen/xencomm.h b/linux-2.6-xen-sparse/include/asm-ia64/xen/xencomm.h new file mode 100644 index 0000000000..eae11369f1 --- /dev/null +++ b/linux-2.6-xen-sparse/include/asm-ia64/xen/xencomm.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2006 Hollis Blanchard <hollisb@us.ibm.com>, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_XENCOMM_H_ +#define _LINUX_XENCOMM_H_ + +#include <xen/interface/xencomm.h> + +#define XENCOMM_MINI_ADDRS 3 +struct xencomm_mini { + struct xencomm_desc _desc; + uint64_t address[XENCOMM_MINI_ADDRS]; +}; + +/* Must be called before any hypercall. */ +extern void xencomm_init (void); + +/* To avoid additionnal virt to phys conversion, an opaque structure is + presented. */ +struct xencomm_handle; + +extern int xencomm_create(void *buffer, unsigned long bytes, + struct xencomm_handle **desc, gfp_t type); +extern void xencomm_free(struct xencomm_handle *desc); + +extern int xencomm_create_mini(struct xencomm_mini *area, int *nbr_area, + void *buffer, unsigned long bytes, + struct xencomm_handle **ret); + +/* Translate virtual address to physical address. */ +extern unsigned long xencomm_vaddr_to_paddr(unsigned long vaddr); + +/* Inline version. To be used only on linear space (kernel space). */ +static inline struct xencomm_handle * +xencomm_create_inline(void *buffer) +{ + unsigned long paddr; + + paddr = xencomm_vaddr_to_paddr((unsigned long)buffer); + return (struct xencomm_handle *)(paddr | XENCOMM_INLINE_FLAG); +} + +#define xen_guest_handle(hnd) ((hnd).p) + +#endif /* _LINUX_XENCOMM_H_ */ diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/fixmap.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/fixmap.h index 64ae42e1aa..29beed9512 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/fixmap.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/fixmap.h @@ -14,7 +14,6 @@ #include <linux/config.h> #include <linux/kernel.h> #include <asm/apicdef.h> -#include <xen/gnttab.h> #include <asm/page.h> #include <asm/vsyscall.h> #include <asm/vsyscall32.h> diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h index 14fb01d3d2..956a4c4b0d 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hypercall.h @@ -258,6 +258,8 @@ HYPERVISOR_event_channel_op( int cmd, void *arg) { int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#ifdef CONFIG_XEN_COMPAT_030002 if (unlikely(rc == -ENOSYS)) { struct evtchn_op op; op.cmd = cmd; @@ -265,6 +267,8 @@ HYPERVISOR_event_channel_op( rc = _hypercall1(int, event_channel_op_compat, &op); memcpy(arg, &op.u, sizeof(op.u)); } +#endif + return rc; } @@ -294,6 +298,8 @@ HYPERVISOR_physdev_op( int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); + +#ifdef CONFIG_XEN_COMPAT_030002 if (unlikely(rc == -ENOSYS)) { struct physdev_op op; op.cmd = cmd; @@ -301,6 +307,8 @@ HYPERVISOR_physdev_op( rc = _hypercall1(int, physdev_op_compat, &op); memcpy(arg, &op.u, sizeof(op.u)); } +#endif + return rc; } @@ -351,9 +359,11 @@ HYPERVISOR_suspend( int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, &sched_shutdown, srec); +#ifdef CONFIG_XEN_COMPAT_030002 if (rc == -ENOSYS) rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, SHUTDOWN_suspend, srec); +#endif return rc; } diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/io.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/io.h index 1ae9c89ba9..4d13b1b9c7 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/io.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/io.h @@ -346,7 +346,8 @@ extern int iommu_bio_merge; * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access */ -#define xlate_dev_mem_ptr(p) __va(p) +#define xlate_dev_mem_ptr(p, sz) ioremap(p, sz) +#define xlate_dev_mem_ptr_unmap(p) iounmap(p) /* * Convert a virtual cached pointer to an uncached pointer diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/maddr.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/maddr.h index 0104de8082..77544b3ca2 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/maddr.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/maddr.h @@ -9,6 +9,10 @@ #define FOREIGN_FRAME_BIT (1UL<<63) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) +/* Definitions for machine and pseudophysical addresses. */ +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; + #ifdef CONFIG_XEN extern unsigned long *phys_to_machine_mapping; @@ -99,20 +103,6 @@ static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) phys_to_machine_mapping[pfn] = mfn; } -#else /* !CONFIG_XEN */ - -#define pfn_to_mfn(pfn) (pfn) -#define mfn_to_pfn(mfn) (mfn) -#define mfn_to_local_pfn(mfn) (mfn) -#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn)) -#define phys_to_machine_mapping_valid(pfn) (1) - -#endif /* !CONFIG_XEN */ - -/* Definitions for machine and pseudophysical addresses. */ -typedef unsigned long paddr_t; -typedef unsigned long maddr_t; - static inline maddr_t phys_to_machine(paddr_t phys) { maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); @@ -127,6 +117,27 @@ static inline paddr_t machine_to_phys(maddr_t machine) return phys; } +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + paddr_t phys; + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) BUG_ON((pfn) != (mfn)) +#define phys_to_machine_mapping_valid(pfn) (1) +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pte_machine_to_phys(mach) ((paddr_t)(mach)) + +#endif /* !CONFIG_XEN */ + /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(__pa(v))) #define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h index cd23862b05..7573cce405 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h @@ -33,6 +33,13 @@ #define PAGE_SIZE (1UL << PAGE_SHIFT) #endif #define PAGE_MASK (~(PAGE_SIZE-1)) + +/* See Documentation/x86_64/mm.txt for a description of the memory map. */ +#define __PHYSICAL_MASK_SHIFT 46 +#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1) +#define __VIRTUAL_MASK_SHIFT 48 +#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) + #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK) #define THREAD_ORDER 1 @@ -90,28 +97,28 @@ typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; -#define pte_val(x) (((x).pte & 1) ? machine_to_phys((x).pte) : \ +#define pte_val(x) (((x).pte & 1) ? pte_machine_to_phys((x).pte) : \ (x).pte) #define pte_val_ma(x) ((x).pte) static inline unsigned long pmd_val(pmd_t x) { unsigned long ret = x.pmd; - if (ret) ret = machine_to_phys(ret); + if (ret) ret = pte_machine_to_phys(ret); return ret; } static inline unsigned long pud_val(pud_t x) { unsigned long ret = x.pud; - if (ret) ret = machine_to_phys(ret); + if (ret) ret = pte_machine_to_phys(ret); return ret; } static inline unsigned long pgd_val(pgd_t x) { unsigned long ret = x.pgd; - if (ret) ret = machine_to_phys(ret); + if (ret) ret = pte_machine_to_phys(ret); return ret; } @@ -163,12 +170,6 @@ static inline pgd_t __pgd(unsigned long x) /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) -/* See Documentation/x86_64/mm.txt for a description of the memory map. */ -#define __PHYSICAL_MASK_SHIFT 46 -#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1) -#define __VIRTUAL_MASK_SHIFT 48 -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) - #define KERNEL_TEXT_SIZE (40UL*1024*1024) #define KERNEL_TEXT_START 0xffffffff80000000UL diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h index ff6d94f9e0..0c4d0a888e 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h @@ -205,8 +205,14 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define _PAGE_PROTNONE 0x080 /* If not present */ #define _PAGE_NX (1UL<<_PAGE_BIT_NX) +#ifdef CONFIG_XEN_COMPAT_030002 +extern unsigned int __kernel_page_user; +#else +#define __kernel_page_user 0 +#endif + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) #define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -219,13 +225,13 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define __PAGE_KERNEL \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) #define __PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) #define __PAGE_KERNEL_NOCACHE \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) #define __PAGE_KERNEL_RO \ - (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) + (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) #define __PAGE_KERNEL_VSYSCALL \ (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) #define __PAGE_KERNEL_VSYSCALL_NOCACHE \ @@ -422,7 +428,8 @@ static inline pud_t *pud_offset_k(pgd_t *pgd, unsigned long address) can temporarily clear it. */ #define pmd_present(x) (pmd_val(x)) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) +#define pmd_bad(x) ((pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \ + != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT))) #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) diff --git a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/setup_arch_post.h b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/setup_arch_post.h index e21d4ee6f2..5244e2855e 100644 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/setup_arch_post.h +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/setup_arch_post.h @@ -15,20 +15,20 @@ extern void nmi(void); static void __init machine_specific_arch_setup(void) { int ret; - struct callback_register event = { + static struct callback_register __initdata event = { .type = CALLBACKTYPE_event, .address = (unsigned long) hypervisor_callback, }; - struct callback_register failsafe = { + static struct callback_register __initdata failsafe = { .type = CALLBACKTYPE_failsafe, .address = (unsigned long)failsafe_callback, }; - struct callback_register syscall = { + static struct callback_register __initdata syscall = { .type = CALLBACKTYPE_syscall, .address = (unsigned long)system_call, }; #ifdef CONFIG_X86_LOCAL_APIC - struct callback_register nmi_cb = { + static struct callback_register __initdata nmi_cb = { .type = CALLBACKTYPE_nmi, .address = (unsigned long)nmi, }; @@ -39,20 +39,25 @@ static void __init machine_specific_arch_setup(void) ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); if (ret == 0) ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall); +#ifdef CONFIG_XEN_COMPAT_030002 if (ret == -ENOSYS) ret = HYPERVISOR_set_callbacks( event.address, failsafe.address, syscall.address); +#endif BUG_ON(ret); #ifdef CONFIG_X86_LOCAL_APIC ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); +#ifdef CONFIG_XEN_COMPAT_030002 if (ret == -ENOSYS) { - struct xennmi_callback cb; + static struct xennmi_callback __initdata cb = { + .handler_address = (unsigned long)nmi + }; - cb.handler_address = nmi_cb.address; HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); } #endif +#endif } diff --git a/linux-2.6-xen-sparse/include/linux/skbuff.h b/linux-2.6-xen-sparse/include/linux/skbuff.h index 9ea3924ab6..07b8f3036d 100644 --- a/linux-2.6-xen-sparse/include/linux/skbuff.h +++ b/linux-2.6-xen-sparse/include/linux/skbuff.h @@ -974,15 +974,16 @@ static inline void skb_reserve(struct sk_buff *skb, int len) #define NET_IP_ALIGN 2 #endif -extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc); +extern int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_trim(struct sk_buff *skb, unsigned int len) { - if (!skb->data_len) { - skb->len = len; - skb->tail = skb->data + len; - } else - ___pskb_trim(skb, len, 0); + if (unlikely(skb->data_len)) { + WARN_ON(1); + return; + } + skb->len = len; + skb->tail = skb->data + len; } /** @@ -992,6 +993,7 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len) * * Cut the length of a buffer down by removing data from the tail. If * the buffer is already under the length specified it is not modified. + * The skb must be linear. */ static inline void skb_trim(struct sk_buff *skb, unsigned int len) { @@ -1002,12 +1004,10 @@ static inline void skb_trim(struct sk_buff *skb, unsigned int len) static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) { - if (!skb->data_len) { - skb->len = len; - skb->tail = skb->data+len; - return 0; - } - return ___pskb_trim(skb, len, 1); + if (skb->data_len) + return ___pskb_trim(skb, len); + __skb_trim(skb, len); + return 0; } static inline int pskb_trim(struct sk_buff *skb, unsigned int len) diff --git a/linux-2.6-xen-sparse/include/xen/balloon.h b/linux-2.6-xen-sparse/include/xen/balloon.h index 60d7099aa1..d26c62bef4 100644 --- a/linux-2.6-xen-sparse/include/xen/balloon.h +++ b/linux-2.6-xen-sparse/include/xen/balloon.h @@ -38,23 +38,13 @@ * Inform the balloon driver that it should allow some slop for device-driver * memory activities. */ -void -balloon_update_driver_allowance( - long delta); +void balloon_update_driver_allowance(long delta); -/* Allocate an empty low-memory page range. */ -struct page * -balloon_alloc_empty_page_range( - unsigned long nr_pages); +/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */ +struct page **alloc_empty_pages_and_pagevec(int nr_pages); +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages); -/* Deallocate an empty page range, adding to the balloon. */ -void -balloon_dealloc_empty_page_range( - struct page *page, unsigned long nr_pages); - -void -balloon_release_driver_page( - struct page *page); +void balloon_release_driver_page(struct page *page); /* * Prevent the balloon driver from changing the memory reservation during diff --git a/linux-2.6-xen-sparse/include/xen/gnttab.h b/linux-2.6-xen-sparse/include/xen/gnttab.h index 676fca5054..f1543c01d6 100644 --- a/linux-2.6-xen-sparse/include/xen/gnttab.h +++ b/linux-2.6-xen-sparse/include/xen/gnttab.h @@ -39,6 +39,7 @@ #include <linux/config.h> #include <asm/hypervisor.h> +#include <asm/maddr.h> /* maddr_t */ #include <xen/interface/grant_table.h> #include <xen/features.h> @@ -118,7 +119,7 @@ int gnttab_suspend(void); int gnttab_resume(void); static inline void -gnttab_set_map_op(struct gnttab_map_grant_ref *map, unsigned long addr, +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr, uint32_t flags, grant_ref_t ref, domid_t domid) { if (flags & GNTMAP_contains_pte) @@ -134,7 +135,7 @@ gnttab_set_map_op(struct gnttab_map_grant_ref *map, unsigned long addr, } static inline void -gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, unsigned long addr, +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr, uint32_t flags, grant_handle_t handle) { if (flags & GNTMAP_contains_pte) diff --git a/linux-2.6-xen-sparse/include/xen/public/evtchn.h b/linux-2.6-xen-sparse/include/xen/public/evtchn.h index 99e2948240..938d4da2bd 100644 --- a/linux-2.6-xen-sparse/include/xen/public/evtchn.h +++ b/linux-2.6-xen-sparse/include/xen/public/evtchn.h @@ -33,9 +33,6 @@ #ifndef __LINUX_PUBLIC_EVTCHN_H__ #define __LINUX_PUBLIC_EVTCHN_H__ -/* /dev/xen/evtchn resides at device number major=10, minor=201 */ -#define EVTCHN_MINOR 201 - /* * Bind a fresh port to VIRQ @virq. * Return allocated port. diff --git a/linux-2.6-xen-sparse/include/xen/xenbus.h b/linux-2.6-xen-sparse/include/xen/xenbus.h index 8e259ce777..c7cb7eaa3a 100644 --- a/linux-2.6-xen-sparse/include/xen/xenbus.h +++ b/linux-2.6-xen-sparse/include/xen/xenbus.h @@ -38,6 +38,7 @@ #include <linux/notifier.h> #include <linux/mutex.h> #include <linux/completion.h> +#include <linux/init.h> #include <xen/interface/xen.h> #include <xen/interface/grant_table.h> #include <xen/interface/io/xenbus.h> diff --git a/linux-2.6-xen-sparse/include/xen/xencons.h b/linux-2.6-xen-sparse/include/xen/xencons.h index fa2160d89d..ae873746aa 100644 --- a/linux-2.6-xen-sparse/include/xen/xencons.h +++ b/linux-2.6-xen-sparse/include/xen/xencons.h @@ -1,6 +1,9 @@ #ifndef __ASM_XENCONS_H__ #define __ASM_XENCONS_H__ +struct dom0_vga_console_info; +void dom0_init_screen_info(const struct dom0_vga_console_info *info); + void xencons_force_flush(void); void xencons_resume(void); diff --git a/linux-2.6-xen-sparse/include/xen/xenoprof.h b/linux-2.6-xen-sparse/include/xen/xenoprof.h new file mode 100644 index 0000000000..4c3ab0fb21 --- /dev/null +++ b/linux-2.6-xen-sparse/include/xen/xenoprof.h @@ -0,0 +1,42 @@ +/****************************************************************************** + * xen/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef __XEN_XENOPROF_H__ +#define __XEN_XENOPROF_H__ +#ifdef CONFIG_XEN + +#include <asm/xenoprof.h> + +struct oprofile_operations; +int xenoprofile_init(struct oprofile_operations * ops); +void xenoprofile_exit(void); + +struct xenoprof_shared_buffer { + char *buffer; + struct xenoprof_arch_shared_buffer arch; +}; +#else +#define xenoprofile_init(ops) (-ENOSYS) +#define xenoprofile_exit() do { } while (0) + +#endif /* CONFIG_XEN */ +#endif /* __XEN_XENOPROF_H__ */ diff --git a/linux-2.6-xen-sparse/lib/Makefile b/linux-2.6-xen-sparse/lib/Makefile index 2657bb5d10..1f96de0517 100644 --- a/linux-2.6-xen-sparse/lib/Makefile +++ b/linux-2.6-xen-sparse/lib/Makefile @@ -45,9 +45,7 @@ obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -ifneq ($(CONFIG_XEN_IA64_DOM0_NON_VP),y) swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o -endif hostprogs-y := gen_crc32table clean-files := crc32table.h diff --git a/linux-2.6-xen-sparse/mm/Kconfig b/linux-2.6-xen-sparse/mm/Kconfig index c9e8c6ddb8..f54f49fbb6 100644 --- a/linux-2.6-xen-sparse/mm/Kconfig +++ b/linux-2.6-xen-sparse/mm/Kconfig @@ -115,7 +115,7 @@ config SPARSEMEM_EXTREME # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" - depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND diff --git a/linux-2.6-xen-sparse/mm/memory.c b/linux-2.6-xen-sparse/mm/memory.c index 1a63339203..d7319d32f9 100644 --- a/linux-2.6-xen-sparse/mm/memory.c +++ b/linux-2.6-xen-sparse/mm/memory.c @@ -390,7 +390,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ if (vma->vm_flags & VM_PFNMAP) { unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if ((pfn == vma->vm_pgoff + off) || !pfn_valid(pfn)) + if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; @@ -405,7 +405,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ * Remove this test eventually! */ if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); + if (!(vma->vm_flags & VM_RESERVED)) + print_bad_pte(vma, pte, addr); return NULL; } @@ -1534,6 +1535,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) memset(kaddr, 0, PAGE_SIZE); kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(dst); return; } diff --git a/linux-2.6-xen-sparse/mm/mmap.c b/linux-2.6-xen-sparse/mm/mmap.c index a01e3ffb9d..f1b2f0f0ed 100644 --- a/linux-2.6-xen-sparse/mm/mmap.c +++ b/linux-2.6-xen-sparse/mm/mmap.c @@ -30,6 +30,10 @@ #include <asm/cacheflush.h> #include <asm/tlb.h> +#ifndef arch_mmap_check +#define arch_mmap_check(addr, len, flags) (0) +#endif + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -906,6 +910,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, if (!len) return -EINVAL; + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len || len > TASK_SIZE) @@ -1846,6 +1854,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) unsigned long flags; struct rb_node ** rb_link, * rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; + int error; len = PAGE_ALIGN(len); if (!len) @@ -1854,6 +1863,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* * mlock MCL_FUTURE? */ @@ -1894,8 +1909,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (security_vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - /* Can we just expand an old private anonymous mapping? */ if (vma_merge(mm, prev, addr, addr + len, flags, NULL, NULL, pgoff, NULL)) diff --git a/linux-2.6-xen-sparse/mm/page_alloc.c b/linux-2.6-xen-sparse/mm/page_alloc.c index c0f3c60537..f12323955a 100644 --- a/linux-2.6-xen-sparse/mm/page_alloc.c +++ b/linux-2.6-xen-sparse/mm/page_alloc.c @@ -951,7 +951,8 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - alloc_flags |= ALLOC_CPUSET; + if (wait) + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations diff --git a/linux-2.6-xen-sparse/net/core/skbuff.c b/linux-2.6-xen-sparse/net/core/skbuff.c index 236946bd7e..064e6277b1 100644 --- a/linux-2.6-xen-sparse/net/core/skbuff.c +++ b/linux-2.6-xen-sparse/net/core/skbuff.c @@ -261,11 +261,11 @@ nodata: } -static void skb_drop_fraglist(struct sk_buff *skb) +static void skb_drop_list(struct sk_buff **listp) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; + struct sk_buff *list = *listp; - skb_shinfo(skb)->frag_list = NULL; + *listp = NULL; do { struct sk_buff *this = list; @@ -274,6 +274,11 @@ static void skb_drop_fraglist(struct sk_buff *skb) } while (list); } +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(&skb_shinfo(skb)->frag_list); +} + static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; @@ -604,6 +609,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) n->csum = skb->csum; n->ip_summed = skb->ip_summed; + n->truesize += skb->data_len; n->data_len = skb->data_len; n->len = skb->len; @@ -798,49 +804,86 @@ struct sk_buff *skb_pad(struct sk_buff *skb, int pad) return nskb; } -/* Trims skb to length len. It can change skb pointers, if "realloc" is 1. - * If realloc==0 and trimming is impossible without change of data, - * it is BUG(). +/* Trims skb to length len. It can change skb pointers. */ -int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) +int ___pskb_trim(struct sk_buff *skb, unsigned int len) { + struct sk_buff **fragp; + struct sk_buff *frag; int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)->nr_frags; int i; + int err; - for (i = 0; i < nfrags; i++) { + if (skb_cloned(skb) && + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) + return err; + + i = 0; + if (offset >= len) + goto drop_pages; + + for (; i < nfrags; i++) { int end = offset + skb_shinfo(skb)->frags[i].size; - if (end > len) { - if (skb_cloned(skb)) { - BUG_ON(!realloc); - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; - } - if (len <= offset) { - put_page(skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb)->nr_frags--; - } else { - skb_shinfo(skb)->frags[i].size = len - offset; - } + + if (end < len) { + offset = end; + continue; + } + + skb_shinfo(skb)->frags[i++].size = len - offset; + +drop_pages: + skb_shinfo(skb)->nr_frags = i; + + for (; i < nfrags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + + if (skb_shinfo(skb)->frag_list) + skb_drop_fraglist(skb); + goto done; + } + + for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); + fragp = &frag->next) { + int end = offset + frag->len; + + if (skb_shared(frag)) { + struct sk_buff *nfrag; + + nfrag = skb_clone(frag, GFP_ATOMIC); + if (unlikely(!nfrag)) + return -ENOMEM; + + nfrag->next = frag->next; + kfree_skb(frag); + frag = nfrag; + *fragp = frag; } - offset = end; + + if (end < len) { + offset = end; + continue; + } + + if (end > len && + unlikely((err = pskb_trim(frag, len - offset)))) + return err; + + if (frag->next) + skb_drop_list(&frag->next); + break; } - if (offset < len) { +done: + if (len > skb_headlen(skb)) { skb->data_len -= skb->len - len; skb->len = len; } else { - if (len <= skb_headlen(skb)) { - skb->len = len; - skb->data_len = 0; - skb->tail = skb->data + len; - if (skb_shinfo(skb)->frag_list && !skb_cloned(skb)) - skb_drop_fraglist(skb); - } else { - skb->data_len -= skb->len - len; - skb->len = len; - } + skb->len = len; + skb->data_len = 0; + skb->tail = skb->data + len; } return 0; |