diff options
Diffstat (limited to 'linux-2.6-xen-sparse/drivers')
45 files changed, 2816 insertions, 1416 deletions
diff --git a/linux-2.6-xen-sparse/drivers/char/tpm/tpm_xen.c b/linux-2.6-xen-sparse/drivers/char/tpm/tpm_xen.c index 71c7dd3a00..adf016ba90 100644 --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm_xen.c +++ b/linux-2.6-xen-sparse/drivers/char/tpm/tpm_xen.c @@ -41,6 +41,7 @@ #include <xen/evtchn.h> #include <xen/interface/grant_table.h> #include <xen/interface/io/tpmif.h> +#include <xen/gnttab.h> #include <xen/xenbus.h> #include "tpm.h" #include "tpm_vtpm.h" @@ -343,6 +344,7 @@ static void backend_changed(struct xenbus_device *dev, case XenbusStateInitialising: case XenbusStateInitWait: case XenbusStateInitialised: + case XenbusStateUnknown: break; case XenbusStateConnected: @@ -351,13 +353,14 @@ static void backend_changed(struct xenbus_device *dev, case XenbusStateClosing: tpmif_set_connected_state(tp, 0); + xenbus_frontend_closed(dev); break; - case XenbusStateUnknown: case XenbusStateClosed: + tpmif_set_connected_state(tp, 0); if (tp->is_suspended == 0) device_unregister(&dev->dev); - xenbus_switch_state(dev, XenbusStateClosed); + xenbus_frontend_closed(dev); break; } } @@ -419,9 +422,10 @@ static int tpmfront_suspend(struct xenbus_device *dev) mutex_lock(&suspend_lock); tp->is_suspended = 1; - for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 25; ctr++) { + for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 300; ctr++) { if ((ctr % 10) == 0) - printk("TPM-FE [INFO]: Waiting for outstanding request.\n"); + printk("TPM-FE [INFO]: Waiting for outstanding " + "request.\n"); /* * Wait for a request to be responded to. 
*/ diff --git a/linux-2.6-xen-sparse/drivers/char/tty_io.c b/linux-2.6-xen-sparse/drivers/char/tty_io.c index f6f0689771..0372d93bca 100644 --- a/linux-2.6-xen-sparse/drivers/char/tty_io.c +++ b/linux-2.6-xen-sparse/drivers/char/tty_io.c @@ -2761,7 +2761,7 @@ static void flush_to_ldisc(void *private_) struct tty_struct *tty = (struct tty_struct *) private_; unsigned long flags; struct tty_ldisc *disc; - struct tty_buffer *tbuf; + struct tty_buffer *tbuf, *head; int count; char *char_buf; unsigned char *flag_buf; @@ -2778,7 +2778,9 @@ static void flush_to_ldisc(void *private_) goto out; } spin_lock_irqsave(&tty->buf.lock, flags); - while((tbuf = tty->buf.head) != NULL) { + head = tty->buf.head; + tty->buf.head = NULL; + while((tbuf = head) != NULL) { while ((count = tbuf->commit - tbuf->read) != 0) { char_buf = tbuf->char_buf_ptr + tbuf->read; flag_buf = tbuf->flag_buf_ptr + tbuf->read; @@ -2787,10 +2789,12 @@ static void flush_to_ldisc(void *private_) disc->receive_buf(tty, char_buf, flag_buf, count); spin_lock_irqsave(&tty->buf.lock, flags); } - if (tbuf->active) + if (tbuf->active) { + tty->buf.head = head; break; - tty->buf.head = tbuf->next; - if (tty->buf.head == NULL) + } + head = tbuf->next; + if (head == NULL) tty->buf.tail = NULL; tty_buffer_free(tty, tbuf); } diff --git a/linux-2.6-xen-sparse/drivers/serial/Kconfig b/linux-2.6-xen-sparse/drivers/serial/Kconfig index fa1fdb0b37..c6be86d83e 100644 --- a/linux-2.6-xen-sparse/drivers/serial/Kconfig +++ b/linux-2.6-xen-sparse/drivers/serial/Kconfig @@ -821,6 +821,7 @@ config SERIAL_ICOM tristate "IBM Multiport Serial Adapter" depends on PCI && (PPC_ISERIES || PPC_PSERIES) select SERIAL_CORE + select FW_LOADER help This driver is for a family of multiport serial adapters including 2 port RVX, 2 port internal modem, 4 port internal diff --git a/linux-2.6-xen-sparse/drivers/xen/balloon/Makefile b/linux-2.6-xen-sparse/drivers/xen/balloon/Makefile index 0e3a3485c4..3fc3d0bae5 100644 --- 
a/linux-2.6-xen-sparse/drivers/xen/balloon/Makefile +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/Makefile @@ -1,2 +1,2 @@ -obj-y += balloon.o +obj-y := balloon.o sysfs.o diff --git a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c index a6a8396c05..b621d76383 100644 --- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c @@ -53,10 +53,8 @@ #include <asm/uaccess.h> #include <asm/tlb.h> #include <linux/list.h> - #include <xen/xenbus.h> - -#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) +#include "common.h" #ifdef CONFIG_PROC_FS static struct proc_dir_entry *balloon_pde; @@ -71,9 +69,7 @@ static DECLARE_MUTEX(balloon_mutex); */ DEFINE_SPINLOCK(balloon_lock); -/* We aim for 'current allocation' == 'target allocation'. */ -static unsigned long current_pages; -static unsigned long target_pages; +struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; @@ -81,18 +77,8 @@ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; /* VM /proc information for memory */ extern unsigned long totalram_pages; -/* We may hit the hard limit in Xen. If we do then we remember it. */ -static unsigned long hard_limit; - -/* - * Drivers may alter the memory reservation independently, but they must - * inform the balloon driver so that we can avoid hitting the hard limit. - */ -static unsigned long driver_pages; - /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); -static unsigned long balloon_low, balloon_high; /* Main work function, always executed in process context. */ static void balloon_process(void *unused); @@ -124,10 +110,10 @@ static void balloon_append(struct page *page) /* Lowmem is re-populated first, so highmem pages go at list tail. 
*/ if (PageHighMem(page)) { list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); - balloon_high++; + bs.balloon_high++; } else { list_add(PAGE_TO_LIST(page), &ballooned_pages); - balloon_low++; + bs.balloon_low++; } } @@ -143,9 +129,9 @@ static struct page *balloon_retrieve(void) UNLIST_PAGE(page); if (PageHighMem(page)) - balloon_high--; + bs.balloon_high--; else - balloon_low--; + bs.balloon_low--; return page; } @@ -172,9 +158,9 @@ static void balloon_alarm(unsigned long unused) static unsigned long current_target(void) { - unsigned long target = min(target_pages, hard_limit); - if (target > (current_pages + balloon_low + balloon_high)) - target = current_pages + balloon_low + balloon_high; + unsigned long target = min(bs.target_pages, bs.hard_limit); + if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high)) + target = bs.current_pages + bs.balloon_low + bs.balloon_high; return target; } @@ -216,7 +202,8 @@ static int increase_reservation(unsigned long nr_pages) BUG_ON(ret != rc); } if (rc >= 0) - hard_limit = current_pages + rc - driver_pages; + bs.hard_limit = (bs.current_pages + rc - + bs.driver_pages); goto out; } @@ -228,9 +215,7 @@ static int increase_reservation(unsigned long nr_pages) BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && phys_to_machine_mapping_valid(pfn)); - /* Update P->M and M->P tables. */ set_phys_to_machine(pfn, frame_list[i]); - xen_machphys_update(frame_list[i], pfn); /* Link back into the page tables if not highmem. 
*/ if (pfn < max_low_pfn) { @@ -248,8 +233,8 @@ static int increase_reservation(unsigned long nr_pages) __free_page(page); } - current_pages += nr_pages; - totalram_pages = current_pages; + bs.current_pages += nr_pages; + totalram_pages = bs.current_pages; out: balloon_unlock(flags); @@ -317,8 +302,8 @@ static int decrease_reservation(unsigned long nr_pages) ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); BUG_ON(ret != nr_pages); - current_pages -= nr_pages; - totalram_pages = current_pages; + bs.current_pages -= nr_pages; + totalram_pages = bs.current_pages; balloon_unlock(flags); @@ -339,7 +324,7 @@ static void balloon_process(void *unused) down(&balloon_mutex); do { - credit = current_target() - current_pages; + credit = current_target() - bs.current_pages; if (credit > 0) need_sleep = (increase_reservation(credit) != 0); if (credit < 0) @@ -352,18 +337,18 @@ static void balloon_process(void *unused) } while ((credit != 0) && !need_sleep); /* Schedule more work if there is some still to be done. */ - if (current_target() != current_pages) + if (current_target() != bs.current_pages) mod_timer(&balloon_timer, jiffies + HZ); up(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ -static void set_new_target(unsigned long target) +void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ - hard_limit = ~0UL; - target_pages = target; + bs.hard_limit = ~0UL; + bs.target_pages = target; schedule_work(&balloon_worker); } @@ -388,7 +373,7 @@ static void watch_target(struct xenbus_watch *watch, /* The given memory/target value is in KiB, so it needs converting to * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. 
*/ - set_new_target(new_target >> (PAGE_SHIFT - 10)); + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); } static int balloon_init_watcher(struct notifier_block *notifier, @@ -424,7 +409,7 @@ static int balloon_write(struct file *file, const char __user *buffer, memstring[sizeof(memstring)-1] = '\0'; target_bytes = memparse(memstring, &endchar); - set_new_target(target_bytes >> PAGE_SHIFT); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); return count; } @@ -442,12 +427,13 @@ static int balloon_read(char *page, char **start, off_t off, "High-mem balloon: %8lu kB\n" "Driver pages: %8lu kB\n" "Xen hard limit: ", - PAGES2KB(current_pages), PAGES2KB(target_pages), - PAGES2KB(balloon_low), PAGES2KB(balloon_high), - PAGES2KB(driver_pages)); + PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages), + PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high), + PAGES2KB(bs.driver_pages)); - if (hard_limit != ~0UL) - len += sprintf(page + len, "%8lu kB\n", PAGES2KB(hard_limit)); + if (bs.hard_limit != ~0UL) + len += sprintf(page + len, "%8lu kB\n", + PAGES2KB(bs.hard_limit)); else len += sprintf(page + len, " ??? kB\n"); @@ -468,13 +454,13 @@ static int __init balloon_init(void) IPRINTK("Initialising balloon driver.\n"); - current_pages = min(xen_start_info->nr_pages, max_pfn); - totalram_pages = current_pages; - target_pages = current_pages; - balloon_low = 0; - balloon_high = 0; - driver_pages = 0UL; - hard_limit = ~0UL; + bs.current_pages = min(xen_start_info->nr_pages, max_pfn); + totalram_pages = bs.current_pages; + bs.target_pages = bs.current_pages; + bs.balloon_low = 0; + bs.balloon_high = 0; + bs.driver_pages = 0UL; + bs.hard_limit = ~0UL; init_timer(&balloon_timer); balloon_timer.data = 0; @@ -489,6 +475,7 @@ static int __init balloon_init(void) balloon_pde->read_proc = balloon_read; balloon_pde->write_proc = balloon_write; #endif + balloon_sysfs_init(); /* Initialise the balloon with excess memory space. 
*/ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { @@ -512,7 +499,7 @@ void balloon_update_driver_allowance(long delta) unsigned long flags; balloon_lock(flags); - driver_pages += delta; + bs.driver_pages += delta; balloon_unlock(flags); } @@ -534,75 +521,87 @@ static int dealloc_pte_fn( return 0; } -struct page *balloon_alloc_empty_page_range(unsigned long nr_pages) +struct page **alloc_empty_pages_and_pagevec(int nr_pages) { - unsigned long vstart, flags; - unsigned int order = get_order(nr_pages * PAGE_SIZE); - int ret; - unsigned long i; - struct page *page; + unsigned long vaddr, flags; + struct page *page, **pagevec; + int i, ret; - vstart = __get_free_pages(GFP_KERNEL, order); - if (vstart == 0) + pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL); + if (pagevec == NULL) return NULL; - scrub_pages(vstart, 1 << order); - - balloon_lock(flags); - if (xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long gmfn = __pa(vstart) >> PAGE_SHIFT; - struct xen_memory_reservation reservation = { - .nr_extents = 1, - .extent_order = order, - .domid = DOMID_SELF - }; - set_xen_guest_handle(reservation.extent_start, &gmfn); - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - if (ret == -ENOSYS) - goto err; - BUG_ON(ret != 1); - } else { - ret = apply_to_page_range(&init_mm, vstart, PAGE_SIZE << order, - dealloc_pte_fn, NULL); - if (ret == -ENOSYS) + for (i = 0; i < nr_pages; i++) { + page = pagevec[i] = alloc_page(GFP_KERNEL); + if (page == NULL) goto err; - BUG_ON(ret); - } - current_pages -= 1UL << order; - totalram_pages = current_pages; - balloon_unlock(flags); - schedule_work(&balloon_worker); + vaddr = (unsigned long)page_address(page); - flush_tlb_all(); + scrub_pages(vaddr, 1); - page = virt_to_page(vstart); + balloon_lock(flags); - for (i = 0; i < (1UL << order); i++) - set_page_count(page + i, 1); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long gmfn = page_to_pfn(page); + struct 
xen_memory_reservation reservation = { + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_xen_guest_handle(reservation.extent_start, &gmfn); + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + if (ret == 1) + ret = 0; /* success */ + } else { + ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE, + dealloc_pte_fn, NULL); + } - return page; + if (ret != 0) { + balloon_unlock(flags); + __free_page(page); + goto err; + } + + totalram_pages = --bs.current_pages; + + balloon_unlock(flags); + } + + out: + schedule_work(&balloon_worker); + flush_tlb_all(); + return pagevec; err: - free_pages(vstart, order); + balloon_lock(flags); + while (--i >= 0) + balloon_append(pagevec[i]); balloon_unlock(flags); - return NULL; + kfree(pagevec); + pagevec = NULL; + goto out; } -void balloon_dealloc_empty_page_range( - struct page *page, unsigned long nr_pages) +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages) { - unsigned long i, flags; - unsigned int order = get_order(nr_pages * PAGE_SIZE); + unsigned long flags; + int i; + + if (pagevec == NULL) + return; balloon_lock(flags); - for (i = 0; i < (1UL << order); i++) { - BUG_ON(page_count(page + i) != 1); - balloon_append(page + i); + for (i = 0; i < nr_pages; i++) { + BUG_ON(page_count(pagevec[i]) != 1); + balloon_append(pagevec[i]); } balloon_unlock(flags); + kfree(pagevec); + schedule_work(&balloon_worker); } @@ -612,15 +611,15 @@ void balloon_release_driver_page(struct page *page) balloon_lock(flags); balloon_append(page); - driver_pages--; + bs.driver_pages--; balloon_unlock(flags); schedule_work(&balloon_worker); } EXPORT_SYMBOL_GPL(balloon_update_driver_allowance); -EXPORT_SYMBOL_GPL(balloon_alloc_empty_page_range); -EXPORT_SYMBOL_GPL(balloon_dealloc_empty_page_range); +EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec); +EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec); EXPORT_SYMBOL_GPL(balloon_release_driver_page); MODULE_LICENSE("Dual BSD/GPL"); diff --git 
a/linux-2.6-xen-sparse/drivers/xen/balloon/common.h b/linux-2.6-xen-sparse/drivers/xen/balloon/common.h new file mode 100644 index 0000000000..4496d215e2 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/common.h @@ -0,0 +1,58 @@ +/****************************************************************************** + * balloon/common.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_BALLOON_COMMON_H__ +#define __XEN_BALLOON_COMMON_H__ + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +struct balloon_stats { + /* We aim for 'current allocation' == 'target allocation'. 
*/ + unsigned long current_pages; + unsigned long target_pages; + /* We may hit the hard limit in Xen. If we do then we remember it. */ + unsigned long hard_limit; + /* + * Drivers may alter the memory reservation independently, but they + * must inform the balloon driver so we avoid hitting the hard limit. + */ + unsigned long driver_pages; + /* Number of pages in high- and low-memory balloons. */ + unsigned long balloon_low; + unsigned long balloon_high; +}; + +extern struct balloon_stats balloon_stats; +#define bs balloon_stats + +int balloon_sysfs_init(void); +void balloon_sysfs_exit(void); + +void balloon_set_new_target(unsigned long target); + +#endif /* __XEN_BALLOON_COMMON_H__ */ diff --git a/linux-2.6-xen-sparse/drivers/xen/balloon/sysfs.c b/linux-2.6-xen-sparse/drivers/xen/balloon/sysfs.c new file mode 100644 index 0000000000..a4ed8a6f1e --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/sysfs.c @@ -0,0 +1,165 @@ +/****************************************************************************** + * balloon/sysfs.c + * + * Xen balloon driver - sysfs interfaces. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/config.h> +#include <linux/capability.h> +#include <linux/stat.h> +#include <linux/sysdev.h> +#include "common.h" + +#define BALLOON_CLASS_NAME "memory" + +#define BALLOON_SHOW(name, format, args...) \ + static ssize_t show_##name(struct sys_device *dev, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high)); +BALLOON_SHOW(hard_limit_kb, + (bs.hard_limit!=~0UL) ? "%lu\n" : "???\n", + (bs.hard_limit!=~0UL) ? 
PAGES2KB(bs.hard_limit) : 0); +BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages)); + +static ssize_t show_target_kb(struct sys_device *dev, char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages)); +} + +static ssize_t store_target_kb(struct sys_device *dev, + const char *buf, + size_t count) +{ + char memstring[64], *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + if (count > sizeof(memstring)) + return -EFBIG; /* too long */ + strcpy(memstring, buf); + + target_bytes = memparse(memstring, &endchar); + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, + show_target_kb, store_target_kb); + +static struct sysdev_attribute *balloon_attrs[] = { + &attr_target_kb, +}; + +static struct attribute *balloon_info_attrs[] = { + &attr_current_kb.attr, + &attr_low_kb.attr, + &attr_high_kb.attr, + &attr_hard_limit_kb.attr, + &attr_driver_kb.attr, + NULL +}; + +static struct attribute_group balloon_info_group = { + .name = "info", + .attrs = balloon_info_attrs, +}; + +static struct sysdev_class balloon_sysdev_class = { + set_kset_name(BALLOON_CLASS_NAME), +}; + +static struct sys_device balloon_sysdev; + +static int register_balloon(struct sys_device *sysdev) +{ + int i, error; + + error = sysdev_class_register(&balloon_sysdev_class); + if (error) + return error; + + sysdev->id = 0; + sysdev->cls = &balloon_sysdev_class; + + error = sysdev_register(sysdev); + if (error) { + sysdev_class_unregister(&balloon_sysdev_class); + return error; + } + + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { + error = sysdev_create_file(sysdev, balloon_attrs[i]); + if (error) + goto fail; + } + + error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); + if (error) + goto fail; + + return 0; + + fail: + while (--i >= 0) + sysdev_remove_file(sysdev, balloon_attrs[i]); + 
sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); + return error; +} + +static void unregister_balloon(struct sys_device *sysdev) +{ + int i; + + sysfs_remove_group(&sysdev->kobj, &balloon_info_group); + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); +} + +int balloon_sysfs_init(void) +{ + return register_balloon(&balloon_sysdev); +} + +void balloon_sysfs_exit(void) +{ + unregister_balloon(&balloon_sysdev); +} diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c index 416f7bc18c..e8df9e0346 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c @@ -56,8 +56,6 @@ static int blkif_reqs = 64; module_param_named(reqs, blkif_reqs, int, 0); MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); -static int mmap_pages; - /* Run-time switchable: /sys/module/blkback/parameters/ */ static unsigned int log_stats = 0; static unsigned int debug_lvl = 0; @@ -87,8 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); #define BLKBACK_INVALID_HANDLE (~0) -static unsigned long mmap_vstart; -static unsigned long *pending_vaddrs; +static struct page **pending_pages; static grant_handle_t *pending_grant_handles; static inline int vaddr_pagenr(pending_req_t *req, int seg) @@ -98,7 +95,8 @@ static inline int vaddr_pagenr(pending_req_t *req, int seg) static inline unsigned long vaddr(pending_req_t *req, int seg) { - return pending_vaddrs[vaddr_pagenr(req, seg)]; + unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]); + return (unsigned long)pfn_to_kaddr(pfn); } #define pending_handle(_req, _seg) \ @@ -191,9 +189,9 @@ static void fast_flush_area(pending_req_t *req) static void print_stats(blkif_t *blkif) { - printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n", + 
printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req); + blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; blkif->st_wr_req = 0; @@ -243,11 +241,17 @@ int blkif_schedule(void *arg) * COMPLETION CALLBACK -- Called as bh->b_end_io() */ -static void __end_block_io_op(pending_req_t *pending_req, int uptodate) +static void __end_block_io_op(pending_req_t *pending_req, int error) { /* An error fails the entire request. */ - if (!uptodate) { - DPRINTK("Buffer not up-to-date at end of operation\n"); + if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + (error == -EOPNOTSUPP)) { + DPRINTK("blkback: write barrier op failed, not supported\n"); + blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if (error) { + DPRINTK("Buffer not up-to-date at end of operation, " + "error=%d\n", error); pending_req->status = BLKIF_RSP_ERROR; } @@ -264,7 +268,7 @@ static int end_block_io_op(struct bio *bio, unsigned int done, int error) { if (bio->bi_size != 0) return 1; - __end_block_io_op(bio->bi_private, !error); + __end_block_io_op(bio->bi_private, error); bio_put(bio); return error; } @@ -295,7 +299,7 @@ irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) static int do_block_io_op(blkif_t *blkif) { blkif_back_ring_t *blk_ring = &blkif->blk_ring; - blkif_request_t *req; + blkif_request_t req; pending_req_t *pending_req; RING_IDX rc, rp; int more_to_do = 0; @@ -313,22 +317,25 @@ static int do_block_io_op(blkif_t *blkif) break; } - req = RING_GET_REQUEST(blk_ring, rc); + memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req)); blk_ring->req_cons = ++rc; /* before make_response() */ - switch (req->operation) { + switch (req.operation) { case BLKIF_OP_READ: blkif->st_rd_req++; - dispatch_rw_block_io(blkif, req, pending_req); + 
dispatch_rw_block_io(blkif, &req, pending_req); break; + case BLKIF_OP_WRITE_BARRIER: + blkif->st_br_req++; + /* fall through */ case BLKIF_OP_WRITE: blkif->st_wr_req++; - dispatch_rw_block_io(blkif, req, pending_req); + dispatch_rw_block_io(blkif, &req, pending_req); break; default: DPRINTK("error: unknown block io operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, + req.operation); + make_response(blkif, req.id, req.operation, BLKIF_RSP_ERROR); free_req(pending_req); break; @@ -342,7 +349,6 @@ static void dispatch_rw_block_io(blkif_t *blkif, pending_req_t *pending_req) { extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); - int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct phys_req preq; struct { @@ -351,6 +357,22 @@ static void dispatch_rw_block_io(blkif_t *blkif, unsigned int nseg; struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int ret, i, nbio = 0; + int operation; + + switch (req->operation) { + case BLKIF_OP_READ: + operation = READ; + break; + case BLKIF_OP_WRITE: + operation = WRITE; + break; + case BLKIF_OP_WRITE_BARRIER: + operation = WRITE_BARRIER; + break; + default: + operation = 0; /* make gcc happy */ + BUG(); + } /* Check that number of segments is sane. 
*/ nseg = req->nr_segments; @@ -366,7 +388,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, pending_req->blkif = blkif; pending_req->id = req->id; - pending_req->operation = operation; + pending_req->operation = req->operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; @@ -377,12 +399,12 @@ static void dispatch_rw_block_io(blkif_t *blkif, req->seg[i].first_sect + 1; if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || - (seg[i].nsec <= 0)) + (req->seg[i].last_sect < req->seg[i].first_sect)) goto fail_response; preq.nr_sects += seg[i].nsec; flags = GNTMAP_host_map; - if ( operation == WRITE ) + if (operation != READ) flags |= GNTMAP_readonly; gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, req->seg[i].gref, blkif->domid); @@ -394,10 +416,15 @@ static void dispatch_rw_block_io(blkif_t *blkif, for (i = 0; i < nseg; i++) { if (unlikely(map[i].status != 0)) { DPRINTK("invalid buffer -- could not remap it\n"); - goto fail_flush; + map[i].handle = BLKBACK_INVALID_HANDLE; + ret |= 1; } pending_handle(pending_req, i) = map[i].handle; + + if (ret) + continue; + set_phys_to_machine(__pa(vaddr( pending_req, i)) >> PAGE_SHIFT, FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); @@ -405,6 +432,9 @@ static void dispatch_rw_block_io(blkif_t *blkif, (req->seg[i].first_sect << 9); } + if (ret) + goto fail_flush; + if (vbd_translate(&preq, blkif, operation) != 0) { DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? 
"read" : "write", @@ -506,52 +536,43 @@ static void make_response(blkif_t *blkif, unsigned long id, static int __init blkif_init(void) { - struct page *page; - int i; + int i, mmap_pages; if (!is_running_on_xen()) return -ENODEV; - mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; - - page = balloon_alloc_empty_page_range(mmap_pages); - if (page == NULL) - return -ENOMEM; - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; pending_reqs = kmalloc(sizeof(pending_reqs[0]) * blkif_reqs, GFP_KERNEL); pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * mmap_pages, GFP_KERNEL); - pending_vaddrs = kmalloc(sizeof(pending_vaddrs[0]) * - mmap_pages, GFP_KERNEL); - if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) { - kfree(pending_reqs); - kfree(pending_grant_handles); - kfree(pending_vaddrs); - printk("%s: out of memory\n", __FUNCTION__); - return -ENOMEM; - } + pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); - blkif_interface_init(); - - printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n", - __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart); - BUG_ON(mmap_vstart == 0); - for (i = 0; i < mmap_pages; i++) { - pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); + if (!pending_reqs || !pending_grant_handles || !pending_pages) + goto out_of_memory; + + for (i = 0; i < mmap_pages; i++) pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; - } + + blkif_interface_init(); memset(pending_reqs, 0, sizeof(pending_reqs)); INIT_LIST_HEAD(&pending_free); for (i = 0; i < blkif_reqs; i++) list_add_tail(&pending_reqs[i].free_list, &pending_free); - + blkif_xenbus_init(); return 0; + + out_of_memory: + kfree(pending_reqs); + kfree(pending_grant_handles); + free_empty_pages_and_pagevec(pending_pages, mmap_pages); + printk("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; } module_init(blkif_init); diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h 
b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h index 38cb756964..1b5b6a427e 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h @@ -44,6 +44,7 @@ #include <xen/interface/io/ring.h> #include <xen/gnttab.h> #include <xen/driver_util.h> +#include <xen/xenbus.h> #define DPRINTK(_f, _a...) \ pr_debug("(file=%s, line=%d) " _f, \ @@ -87,6 +88,7 @@ typedef struct blkif_st { int st_rd_req; int st_wr_req; int st_oo_req; + int st_br_req; wait_queue_head_t waiting_to_free; @@ -111,7 +113,7 @@ int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, unsigned minor, int readonly); void vbd_free(struct vbd *vbd); -unsigned long vbd_size(struct vbd *vbd); +unsigned long long vbd_size(struct vbd *vbd); unsigned int vbd_info(struct vbd *vbd); unsigned long vbd_secsize(struct vbd *vbd); @@ -131,4 +133,7 @@ void blkif_xenbus_init(void); irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); int blkif_schedule(void *arg); +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); + #endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c b/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c index a809b04cd1..34048b32c4 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c @@ -31,12 +31,11 @@ */ #include "common.h" -#include <xen/xenbus.h> #define vbd_sz(_v) ((_v)->bdev->bd_part ? 
\ (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) -unsigned long vbd_size(struct vbd *vbd) +unsigned long long vbd_size(struct vbd *vbd) { return vbd_sz(vbd); } @@ -104,7 +103,7 @@ int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) struct vbd *vbd = &blkif->vbd; int rc = -EACCES; - if ((operation == WRITE) && vbd->readonly) + if ((operation != READ) && vbd->readonly) goto out; if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c index 02f90a6803..349ae64d0f 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c @@ -20,7 +20,6 @@ #include <stdarg.h> #include <linux/module.h> #include <linux/kthread.h> -#include <xen/xenbus.h> #include "common.h" #undef DPRINTK @@ -91,11 +90,13 @@ static void update_blkif_status(blkif_t *blkif) VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); +VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); static struct attribute *vbdstat_attrs[] = { &dev_attr_oo_req.attr, &dev_attr_rd_req.attr, &dev_attr_wr_req.attr, + &dev_attr_br_req.attr, NULL }; @@ -165,6 +166,19 @@ static int blkback_remove(struct xenbus_device *dev) return 0; } +int blkback_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + "%d", state); + if (err) + xenbus_dev_fatal(dev, err, "writing feature-barrier"); + + return err; +} /** * Entry point to this code when a new device is created. 
Allocate the basic @@ -366,13 +380,16 @@ static void connect(struct backend_info *be) /* Supply the information about the device the frontend needs */ again: err = xenbus_transaction_start(&xbt); - if (err) { xenbus_dev_fatal(dev, err, "starting transaction"); return; } - err = xenbus_printf(xbt, dev->nodename, "sectors", "%lu", + err = blkback_barrier(xbt, be, 1); + if (err) + goto abort; + + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", vbd_size(&be->blkif->vbd)); if (err) { xenbus_dev_fatal(dev, err, "writing %s/sectors", diff --git a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c index 4c44d7608d..95cff46ff9 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c @@ -48,6 +48,10 @@ #include <asm/hypervisor.h> #include <asm/maddr.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define BLKIF_STATE_DISCONNECTED 0 #define BLKIF_STATE_CONNECTED 1 #define BLKIF_STATE_SUSPENDED 2 @@ -134,10 +138,10 @@ static int blkfront_resume(struct xenbus_device *dev) DPRINTK("blkfront_resume: %s\n", dev->nodename); - blkif_free(info, 1); + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); err = talk_to_backend(dev, info); - if (!err) + if (info->connected == BLKIF_STATE_SUSPENDED && !err) blkif_recover(info); return err; @@ -273,7 +277,7 @@ static void backend_changed(struct xenbus_device *dev, xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); down(&bd->bd_sem); - if (info->users > 0 && system_state == SYSTEM_RUNNING) + if (info->users > 0) xenbus_dev_error(dev, -EBUSY, "Device in use; refusing to close"); else @@ -294,7 +298,8 @@ static void backend_changed(struct xenbus_device *dev, */ static void connect(struct blkfront_info *info) { - unsigned long sectors, sector_size; + unsigned long long sectors; + unsigned long sector_size; unsigned int binfo; int err; @@ -305,7 +310,7 @@ static void 
connect(struct blkfront_info *info) DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend); err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "sectors", "%lu", &sectors, + "sectors", "%llu", &sectors, "info", "%u", &binfo, "sector-size", "%lu", &sector_size, NULL); @@ -316,6 +321,12 @@ static void connect(struct blkfront_info *info) return; } + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%lu", &info->feature_barrier, + NULL); + if (err) + info->feature_barrier = 0; + err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", @@ -355,9 +366,11 @@ static void blkfront_closing(struct xenbus_device *dev) blk_stop_queue(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - flush_scheduled_work(); spin_unlock_irqrestore(&blkif_io_lock, flags); + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + xlvbd_del(info); xenbus_frontend_closed(dev); @@ -466,6 +479,27 @@ int blkif_ioctl(struct inode *inode, struct file *filep, command, (long)argument, inode->i_rdev); switch (command) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + case HDIO_GETGEO: { + struct block_device *bd = inode->i_bdev; + struct hd_geometry geo; + int ret; + + if (!argument) + return -EINVAL; + + geo.start = get_start_sect(bd); + ret = blkif_getgeo(bd, &geo); + if (ret) + return ret; + + if (copy_to_user((struct hd_geometry __user *)argument, &geo, + sizeof(geo))) + return -EFAULT; + + return 0; + } +#endif case CDROMMULTISESSION: DPRINTK("FIXME: support multisession CDs later\n"); for (i = 0; i < sizeof(struct cdrom_multisession); i++) @@ -542,11 +576,14 @@ static int blkif_queue_request(struct request *req) info->shadow[id].request = (unsigned long)req; ring_req->id = id; - ring_req->operation = rq_data_dir(req) ? 
- BLKIF_OP_WRITE : BLKIF_OP_READ; ring_req->sector_number = (blkif_sector_t)req->sector; ring_req->handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (blk_barrier_rq(req)) + ring_req->operation = BLKIF_OP_WRITE_BARRIER; + ring_req->nr_segments = 0; rq_for_each_bio (bio, req) { bio_for_each_segment (bvec, bio, idx) { @@ -643,6 +680,7 @@ static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) RING_IDX i, rp; unsigned long flags; struct blkfront_info *info = (struct blkfront_info *)dev_id; + int uptodate; spin_lock_irqsave(&blkif_io_lock, flags); @@ -667,19 +705,27 @@ static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) ADD_ID_TO_FREELIST(info, id); + uptodate = (bret->status == BLKIF_RSP_OKAY); switch (bret->operation) { + case BLKIF_OP_WRITE_BARRIER: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + printk("blkfront: %s: write barrier op failed\n", + info->gd->disk_name); + uptodate = -EOPNOTSUPP; + info->feature_barrier = 0; + xlvbd_barrier(info); + } + /* fall through */ case BLKIF_OP_READ: case BLKIF_OP_WRITE: if (unlikely(bret->status != BLKIF_RSP_OKAY)) DPRINTK("Bad return from blkdev data " "request: %x\n", bret->status); - ret = end_that_request_first( - req, (bret->status == BLKIF_RSP_OKAY), + ret = end_that_request_first(req, uptodate, req->hard_nr_sectors); BUG_ON(ret); - end_that_request_last( - req, (bret->status == BLKIF_RSP_OKAY)); + end_that_request_last(req, uptodate); break; default: BUG(); @@ -714,9 +760,11 @@ static void blkif_free(struct blkfront_info *info, int suspend) blk_stop_queue(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - flush_scheduled_work(); spin_unlock_irq(&blkif_io_lock); + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_scheduled_work(); + /* Free resources associated with old device channel. 
*/ if (info->ring_ref != GRANT_INVALID_REF) { gnttab_end_foreign_access(info->ring_ref, 0, diff --git a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h index 5ba3d1ebc3..b86360f405 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h @@ -126,6 +126,7 @@ struct blkfront_info struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; unsigned long shadow_free; + int feature_barrier; /** * The number of people holding this device open. We won't allow a @@ -152,5 +153,6 @@ extern void do_blkif_request (request_queue_t *rq); int xlvbd_add(blkif_sector_t capacity, int device, u16 vdisk_info, u16 sector_size, struct blkfront_info *info); void xlvbd_del(struct blkfront_info *info); +int xlvbd_barrier(struct blkfront_info *info); #endif /* __XEN_DRIVERS_BLOCK_H__ */ diff --git a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c index 8aa453d3a0..f040a2b7e3 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c @@ -36,6 +36,10 @@ #include <linux/blkdev.h> #include <linux/list.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define BLKIF_MAJOR(dev) ((dev)>>8) #define BLKIF_MINOR(dev) ((dev) & 0xff) @@ -46,7 +50,7 @@ */ #define NUM_IDE_MAJORS 10 -#define NUM_SCSI_MAJORS 9 +#define NUM_SCSI_MAJORS 17 #define NUM_VBD_MAJORS 1 static struct xlbd_type_info xlbd_ide_type = { @@ -91,7 +95,9 @@ static struct block_device_operations xlvbd_block_fops = .open = blkif_open, .release = blkif_release, .ioctl = blkif_ioctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) .getgeo = blkif_getgeo +#endif }; DEFINE_SPINLOCK(blkif_io_lock); @@ -159,8 +165,11 @@ xlbd_get_major_info(int vdevice) case SCSI_DISK1_MAJOR ... 
SCSI_DISK7_MAJOR: index = 11 + major - SCSI_DISK1_MAJOR; break; - case SCSI_CDROM_MAJOR: index = 18; break; - default: index = 19; break; + case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR: + index = 18 + major - SCSI_DISK8_MAJOR; + break; + case SCSI_CDROM_MAJOR: index = 26; break; + default: index = 27; break; } mi = ((major_info[index] != NULL) ? major_info[index] : @@ -186,7 +195,11 @@ xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) if (rq == NULL) return -1; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) elevator_init(rq, "noop"); +#else + elevator_init(rq, &elevator_noop); +#endif /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_hardsect_size(rq, sector_size); @@ -217,6 +230,7 @@ xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, struct xlbd_major_info *mi; int nr_minors = 1; int err = -ENODEV; + unsigned int offset; BUG_ON(info->gd != NULL); BUG_ON(info->mi != NULL); @@ -234,15 +248,33 @@ xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, if (gd == NULL) goto out; - if (nr_minors > 1) - sprintf(gd->disk_name, "%s%c", mi->type->diskname, - 'a' + mi->index * mi->type->disks_per_major + - (minor >> mi->type->partn_shift)); - else - sprintf(gd->disk_name, "%s%c%d", mi->type->diskname, - 'a' + mi->index * mi->type->disks_per_major + - (minor >> mi->type->partn_shift), - minor & ((1 << mi->type->partn_shift) - 1)); + offset = mi->index * mi->type->disks_per_major + + (minor >> mi->type->partn_shift); + if (nr_minors > 1) { + if (offset < 26) { + sprintf(gd->disk_name, "%s%c", + mi->type->diskname, 'a' + offset ); + } + else { + sprintf(gd->disk_name, "%s%c%c", + mi->type->diskname, + 'a' + ((offset/26)-1), 'a' + (offset%26) ); + } + } + else { + if (offset < 26) { + sprintf(gd->disk_name, "%s%c%d", + mi->type->diskname, + 'a' + offset, + minor & ((1 << mi->type->partn_shift) - 1)); + } + else { + sprintf(gd->disk_name, "%s%c%c%d", + mi->type->diskname, + 'a' + ((offset/26)-1), 'a' + 
(offset%26), + minor & ((1 << mi->type->partn_shift) - 1)); + } + } gd->major = mi->major; gd->first_minor = minor; @@ -257,6 +289,10 @@ xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, } info->rq = gd->queue; + info->gd = gd; + + if (info->feature_barrier) + xlvbd_barrier(info); if (vdisk_info & VDISK_READONLY) set_disk_ro(gd, 1); @@ -267,8 +303,6 @@ xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, if (vdisk_info & VDISK_CDROM) gd->flags |= GENHD_FL_CD; - info->gd = gd; - return 0; out: @@ -316,3 +350,26 @@ xlvbd_del(struct blkfront_info *info) blk_cleanup_queue(info->rq); info->rq = NULL; } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +int +xlvbd_barrier(struct blkfront_info *info) +{ + int err; + + err = blk_queue_ordered(info->rq, + info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL); + if (err) + return err; + printk("blkfront: %s: barriers %s\n", + info->gd->disk_name, info->feature_barrier ? "enabled" : "disabled"); + return 0; +} +#else +int +xlvbd_barrier(struct blkfront_info *info) +{ + printk("blkfront: %s: barriers disabled\n", info->gd->disk_name); + return -ENOSYS; +} +#endif diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c index a6f1379c27..e0d898ab98 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c @@ -10,6 +10,9 @@ * * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield * + * Clean ups and fix ups: + * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc. 
+ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation; or, when distributed @@ -44,7 +47,6 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/mm.h> -#include <linux/miscdevice.h> #include <linux/errno.h> #include <linux/major.h> #include <linux/gfp.h> @@ -52,9 +54,33 @@ #include <asm/tlbflush.h> #include <linux/devfs_fs_kernel.h> -#define MAX_TAP_DEV 100 /*the maximum number of tapdisk ring devices */ +#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */ #define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */ + +struct class *xen_class; +EXPORT_SYMBOL_GPL(xen_class); + +/* + * Setup the xen class. This should probably go in another file, but + * since blktap is the only user of it so far, it gets to keep it. + */ +int setup_xen_class(void) +{ + int ret; + + if (xen_class) + return 0; + + xen_class = class_create(THIS_MODULE, "xen"); + if ((ret = IS_ERR(xen_class))) { + xen_class = NULL; + return ret; + } + + return 0; +} + /* * The maximum number of requests that can be outstanding at any time * is determined by @@ -67,8 +93,9 @@ * mmap_alloc is initialised to 2 and should be adjustable on the fly via * sysfs. */ -#define MAX_DYNAMIC_MEM 64 -#define MAX_PENDING_REQS 64 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) +#define MAX_DYNAMIC_MEM BLK_RING_SIZE +#define MAX_PENDING_REQS BLK_RING_SIZE #define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) #define MMAP_VADDR(_start, _req,_seg) \ (_start + \ @@ -82,6 +109,12 @@ static int mmap_pages = MMAP_PAGES; * memory rings. 
*/ +/*Data struct handed back to userspace for tapdisk device to VBD mapping*/ +typedef struct domid_translate { + unsigned short domid; + unsigned short busid; +} domid_translate_t ; + /*Data struct associated with each of the tapdisk devices*/ typedef struct tap_blkif { struct vm_area_struct *vma; /*Shared memory area */ @@ -100,22 +133,11 @@ typedef struct tap_blkif { unsigned long *idx_map; /*Record the user ring id to kern [req id, idx] tuple */ blkif_t *blkif; /*Associate blkif with tapdev */ + struct domid_translate trans; /*Translation from domid to bus. */ } tap_blkif_t; -/*Private data struct associated with the inode*/ -typedef struct private_info { - int idx; -} private_info_t; - -/*Data struct handed back to userspace for tapdisk device to VBD mapping*/ -typedef struct domid_translate { - unsigned short domid; - unsigned short busid; -} domid_translate_t ; - - -static domid_translate_t translate_domid[MAX_TAP_DEV]; -static tap_blkif_t *tapfds[MAX_TAP_DEV]; +static struct tap_blkif *tapfds[MAX_TAP_DEV]; +static int blktap_next_minor; static int __init set_blkif_reqs(char *str) { @@ -168,16 +190,18 @@ static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) { #define BLKBACK_INVALID_HANDLE (~0) -typedef struct mmap_page { - unsigned long start; - struct page *mpage; -} mmap_page_t; +static struct page **foreign_pages[MAX_DYNAMIC_MEM]; +static inline unsigned long idx_to_kaddr( + unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx) +{ + unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx; + unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]); + return (unsigned long)pfn_to_kaddr(pfn); +} -static mmap_page_t mmap_start[MAX_DYNAMIC_MEM]; static unsigned short mmap_alloc = 0; static unsigned short mmap_lock = 0; static unsigned short mmap_inuse = 0; -static unsigned long *pending_addrs[MAX_DYNAMIC_MEM]; /****************************************************************** * GRANT HANDLES @@ -192,6 
+216,7 @@ struct grant_handle_pair grant_handle_t kernel; grant_handle_t user; }; +#define INVALID_GRANT_HANDLE 0xFFFF static struct grant_handle_pair pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES]; @@ -200,15 +225,13 @@ static struct grant_handle_pair + (_i)]) -static int blktap_read_ufe_ring(int idx); /*local prototypes*/ +static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/ -#define BLKTAP_MINOR 0 /*/dev/xen/blktap resides at device number - major=254, minor numbers begin at 0 */ -#define BLKTAP_DEV_MAJOR 254 /* TODO: Make major number dynamic * - * and create devices in the kernel * - */ +#define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */ #define BLKTAP_DEV_DIR "/dev/xen" +static int blktap_major; + /* blktap IOCTLs: */ #define BLKTAP_IOCTL_KICK_FE 1 #define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */ @@ -264,17 +287,19 @@ static inline int GET_NEXT_REQ(unsigned long *idx_map) { int i; for (i = 0; i < MAX_PENDING_REQS; i++) - if (idx_map[i] == INVALID_REQ) return i; + if (idx_map[i] == INVALID_REQ) + return i; return INVALID_REQ; } #define BLKTAP_INVALID_HANDLE(_g) \ - (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF)) + (((_g->kernel) == INVALID_GRANT_HANDLE) && \ + ((_g->user) == INVALID_GRANT_HANDLE)) #define BLKTAP_INVALIDATE_HANDLE(_g) do { \ - (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \ + (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \ } while(0) @@ -303,7 +328,7 @@ struct vm_operations_struct blktap_vm_ops = { */ /*Function Declarations*/ -static int get_next_free_dev(void); +static tap_blkif_t *get_next_free_dev(void); static int blktap_open(struct inode *inode, struct file *filp); static int blktap_release(struct inode *inode, struct file *filp); static int blktap_mmap(struct file *filp, struct vm_area_struct *vma); @@ -311,8 +336,6 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); static unsigned int blktap_poll(struct file 
*file, poll_table *wait); -struct miscdevice *set_misc(int minor, char *name, int dev); - static struct file_operations blktap_fops = { .owner = THIS_MODULE, .poll = blktap_poll, @@ -323,41 +346,96 @@ static struct file_operations blktap_fops = { }; -static int get_next_free_dev(void) +static tap_blkif_t *get_next_free_dev(void) { tap_blkif_t *info; - int i = 0, ret = -1; - unsigned long flags; + int minor; - spin_lock_irqsave(&pending_free_lock, flags); - - while (i < MAX_TAP_DEV) { - info = tapfds[i]; - if ( (tapfds[i] != NULL) && (info->dev_inuse == 0) - && (info->dev_pending == 0) ) { + /* + * This is called only from the ioctl, which + * means we should always have interrupts enabled. + */ + BUG_ON(irqs_disabled()); + + spin_lock_irq(&pending_free_lock); + + /* tapfds[0] is always NULL */ + + for (minor = 1; minor < blktap_next_minor; minor++) { + info = tapfds[minor]; + /* we could have failed a previous attempt. */ + if (!info || + ((info->dev_inuse == 0) && + (info->dev_pending == 0)) ) { info->dev_pending = 1; - ret = i; - goto done; + goto found; } - i++; } - -done: - spin_unlock_irqrestore(&pending_free_lock, flags); - return ret; + info = NULL; + minor = -1; + + /* + * We didn't find free device. If we can still allocate + * more, then we grab the next device minor that is + * available. This is done while we are still under + * the protection of the pending_free_lock. + */ + if (blktap_next_minor < MAX_TAP_DEV) + minor = blktap_next_minor++; +found: + spin_unlock_irq(&pending_free_lock); + + if (!info && minor > 0) { + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (unlikely(!info)) { + /* + * If we failed here, try to put back + * the next minor number. But if one + * was just taken, then we just lose this + * minor. We can try to allocate this + * minor again later. 
+ */ + spin_lock_irq(&pending_free_lock); + if (blktap_next_minor == minor+1) + blktap_next_minor--; + spin_unlock_irq(&pending_free_lock); + goto out; + } + + info->minor = minor; + /* + * Make sure that we have a minor before others can + * see us. + */ + wmb(); + tapfds[minor] = info; + + class_device_create(xen_class, NULL, + MKDEV(blktap_major, minor), NULL, + "blktap%d", minor); + devfs_mk_cdev(MKDEV(blktap_major, minor), + S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", minor); + } + +out: + return info; } int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) { + tap_blkif_t *info; int i; - - for (i = 0; i < MAX_TAP_DEV; i++) - if ( (translate_domid[i].domid == domid) - && (translate_domid[i].busid == xenbus_id) ) { - tapfds[i]->blkif = blkif; - tapfds[i]->status = RUNNING; + + for (i = 1; i < blktap_next_minor; i++) { + info = tapfds[i]; + if ( info && + (info->trans.domid == domid) && + (info->trans.busid == xenbus_id) ) { + info->blkif = blkif; + info->status = RUNNING; return i; } + } return -1; } @@ -367,13 +445,16 @@ void signal_tapdisk(int idx) struct task_struct *ptask; info = tapfds[idx]; - if ( (idx > 0) && (idx < MAX_TAP_DEV) && (info->pid > 0) ) { + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) + return; + + if (info->pid > 0) { ptask = find_task_by_pid(info->pid); - if (ptask) { + if (ptask) info->status = CLEANSHUTDOWN; - } } info->blkif = NULL; + return; } @@ -382,18 +463,22 @@ static int blktap_open(struct inode *inode, struct file *filp) blkif_sring_t *sring; int idx = iminor(inode) - BLKTAP_MINOR; tap_blkif_t *info; - private_info_t *prv; int i; - if (tapfds[idx] == NULL) { + /* ctrl device, treat differently */ + if (!idx) + return 0; + + info = tapfds[idx]; + + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) { WPRINTK("Unable to open device /dev/xen/blktap%d\n", - idx); - return -ENOMEM; + idx); + return -ENODEV; } + DPRINTK("Opening device /dev/xen/blktap%d\n",idx); - info = tapfds[idx]; - /*Only one process can access device at a 
time*/ if (test_and_set_bit(0, &info->dev_inuse)) return -EBUSY; @@ -410,9 +495,7 @@ static int blktap_open(struct inode *inode, struct file *filp) SHARED_RING_INIT(sring); FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE); - prv = kzalloc(sizeof(private_info_t),GFP_KERNEL); - prv->idx = idx; - filp->private_data = prv; + filp->private_data = info; info->vma = NULL; info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, @@ -433,17 +516,14 @@ static int blktap_open(struct inode *inode, struct file *filp) static int blktap_release(struct inode *inode, struct file *filp) { - int idx = iminor(inode) - BLKTAP_MINOR; - tap_blkif_t *info; + tap_blkif_t *info = filp->private_data; - if (tapfds[idx] == NULL) { - WPRINTK("Trying to free device that doesn't exist " - "[/dev/xen/blktap%d]\n",idx); - return -1; - } - info = tapfds[idx]; + /* check for control device */ + if (!info) + return 0; + info->dev_inuse = 0; - DPRINTK("Freeing device [/dev/xen/blktap%d]\n",idx); + DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor); /* Free the ring page. 
*/ ClearPageReserved(virt_to_page(info->ufe_ring.sring)); @@ -457,11 +537,11 @@ static int blktap_release(struct inode *inode, struct file *filp) info->vma = NULL; } - if (filp->private_data) kfree(filp->private_data); - if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) { - kthread_stop(info->blkif->xenblkd); - info->blkif->xenblkd = NULL; + if (info->blkif->xenblkd != NULL) { + kthread_stop(info->blkif->xenblkd); + info->blkif->xenblkd = NULL; + } info->status = CLEANSHUTDOWN; } return 0; @@ -491,16 +571,12 @@ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) int size; struct page **map; int i; - private_info_t *prv; - tap_blkif_t *info; + tap_blkif_t *info = filp->private_data; - /*Retrieve the dev info*/ - prv = (private_info_t *)filp->private_data; - if (prv == NULL) { + if (info == NULL) { WPRINTK("blktap: mmap, retrieving idx failed\n"); return -ENOMEM; } - info = tapfds[prv->idx]; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &blktap_vm_ops; @@ -517,8 +593,6 @@ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); /* Map the ring pages to the start of the region and reserve it. */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (remap_pfn_range(vma, vma->vm_start, __pa(info->ufe_ring.sring) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) { @@ -556,20 +630,17 @@ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) static int blktap_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - int idx = iminor(inode) - BLKTAP_MINOR; + tap_blkif_t *info = filp->private_data; + switch(cmd) { case BLKTAP_IOCTL_KICK_FE: { /* There are fe messages to process. 
*/ - return blktap_read_ufe_ring(idx); + return blktap_read_ufe_ring(info); } case BLKTAP_IOCTL_SETMODE: { - tap_blkif_t *info = tapfds[idx]; - - if ( (idx > 0) && (idx < MAX_TAP_DEV) - && (tapfds[idx] != NULL) ) - { + if (info) { if (BLKTAP_MODE_VALID(arg)) { info->mode = arg; /* XXX: may need to flush rings here. */ @@ -582,11 +653,7 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, } case BLKTAP_IOCTL_PRINT_IDXS: { - tap_blkif_t *info = tapfds[idx]; - - if ( (idx > 0) && (idx < MAX_TAP_DEV) - && (tapfds[idx] != NULL) ) - { + if (info) { printk("User Rings: \n-----------\n"); printk("UF: rsp_cons: %2d, req_prod_prv: %2d " "| req_prod: %2d, rsp_prod: %2d\n", @@ -599,11 +666,7 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, } case BLKTAP_IOCTL_SENDPID: { - tap_blkif_t *info = tapfds[idx]; - - if ( (idx > 0) && (idx < MAX_TAP_DEV) - && (tapfds[idx] != NULL) ) - { + if (info) { info->pid = (pid_t)arg; DPRINTK("blktap: pid received %d\n", info->pid); @@ -614,43 +677,49 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, { uint64_t val = (uint64_t)arg; domid_translate_t *tr = (domid_translate_t *)&val; - int newdev; DPRINTK("NEWINTF Req for domid %d and bus id %d\n", tr->domid, tr->busid); - newdev = get_next_free_dev(); - if (newdev < 1) { + info = get_next_free_dev(); + if (!info) { WPRINTK("Error initialising /dev/xen/blktap - " "No more devices\n"); return -1; } - translate_domid[newdev].domid = tr->domid; - translate_domid[newdev].busid = tr->busid; - return newdev; + info->trans.domid = tr->domid; + info->trans.busid = tr->busid; + return info->minor; } case BLKTAP_IOCTL_FREEINTF: { unsigned long dev = arg; - tap_blkif_t *info = NULL; + unsigned long flags; + + info = tapfds[dev]; - if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev]; + if ((dev > MAX_TAP_DEV) || !info) + return 0; /* should this be an error? 
*/ - if ( (info != NULL) && (info->dev_pending) ) + spin_lock_irqsave(&pending_free_lock, flags); + if (info->dev_pending) info->dev_pending = 0; + spin_unlock_irqrestore(&pending_free_lock, flags); + return 0; } case BLKTAP_IOCTL_MINOR: { unsigned long dev = arg; - tap_blkif_t *info = NULL; - - if ( (dev > 0) && (dev < MAX_TAP_DEV) ) info = tapfds[dev]; - - if (info != NULL) return info->minor; - else return -1; + + info = tapfds[dev]; + + if ((dev > MAX_TAP_DEV) || !info) + return -EINVAL; + + return info->minor; } case BLKTAP_IOCTL_MAJOR: - return BLKTAP_DEV_MAJOR; + return blktap_major; case BLKTAP_QUERY_ALLOC_REQS: { @@ -662,25 +731,16 @@ static int blktap_ioctl(struct inode *inode, struct file *filp, return -ENOIOCTLCMD; } -static unsigned int blktap_poll(struct file *file, poll_table *wait) +static unsigned int blktap_poll(struct file *filp, poll_table *wait) { - private_info_t *prv; - tap_blkif_t *info; + tap_blkif_t *info = filp->private_data; - /*Retrieve the dev info*/ - prv = (private_info_t *)file->private_data; - if (prv == NULL) { - WPRINTK(" poll, retrieving idx failed\n"); + /* do not work on the control device */ + if (!info) return 0; - } - - if (prv->idx == 0) return 0; - - info = tapfds[prv->idx]; - - poll_wait(file, &info->wait, wait); + + poll_wait(filp, &info->wait, wait); if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) { - flush_tlb_all(); RING_PUSH_REQUESTS(&info->ufe_ring); return POLLIN | POLLRDNORM; } @@ -691,11 +751,13 @@ void blktap_kick_user(int idx) { tap_blkif_t *info; - if (idx == 0) return; - info = tapfds[idx]; - - if (info != NULL) wake_up_interruptible(&info->wait); + + if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) + return; + + wake_up_interruptible(&info->wait); + return; } @@ -712,66 +774,21 @@ static void make_response(blkif_t *blkif, unsigned long id, static int req_increase(void) { int i, j; - struct page *page; - unsigned long flags; - int ret; - spin_lock_irqsave(&pending_free_lock, flags); - - ret 
= -EINVAL; if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) - goto done; - -#ifdef __ia64__ - extern unsigned long alloc_empty_foreign_map_page_range( - unsigned long pages); - mmap_start[mmap_alloc].start = (unsigned long) - alloc_empty_foreign_map_page_range(mmap_pages); -#else /* ! ia64 */ - page = balloon_alloc_empty_page_range(mmap_pages); - ret = -ENOMEM; - if (page == NULL) { - printk("%s balloon_alloc_empty_page_range gave NULL\n", __FUNCTION__); - goto done; - } - - /* Pin all of the pages. */ - for (i=0; i<mmap_pages; i++) - get_page(&page[i]); - - mmap_start[mmap_alloc].start = - (unsigned long)pfn_to_kaddr(page_to_pfn(page)); - mmap_start[mmap_alloc].mpage = page; - -#endif - - pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t) * - blkif_reqs, GFP_KERNEL); - pending_addrs[mmap_alloc] = kzalloc(sizeof(unsigned long) * - mmap_pages, GFP_KERNEL); - - ret = -ENOMEM; - if (!pending_reqs[mmap_alloc] || !pending_addrs[mmap_alloc]) { - kfree(pending_reqs[mmap_alloc]); - kfree(pending_addrs[mmap_alloc]); - WPRINTK("%s: out of memory\n", __FUNCTION__); - ret = -ENOMEM; - goto done; - } - - ret = 0; + return -EINVAL; - DPRINTK("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n", - __FUNCTION__, blkif_reqs, mmap_pages, - mmap_start[mmap_alloc].start); + pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t) + * blkif_reqs, GFP_KERNEL); + foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages); - BUG_ON(mmap_start[mmap_alloc].start == 0); + if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc]) + goto out_of_memory; - for (i = 0; i < mmap_pages; i++) - pending_addrs[mmap_alloc][i] = - mmap_start[mmap_alloc].start + (i << PAGE_SHIFT); + DPRINTK("%s: reqs=%d, pages=%d\n", + __FUNCTION__, blkif_reqs, mmap_pages); - for (i = 0; i < MAX_PENDING_REQS ; i++) { + for (i = 0; i < MAX_PENDING_REQS; i++) { list_add_tail(&pending_reqs[mmap_alloc][i].free_list, &pending_free); pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc; @@ -782,67 +799,30 @@ static int 
req_increase(void) mmap_alloc++; DPRINTK("# MMAPs increased to %d\n",mmap_alloc); - done: - spin_unlock_irqrestore(&pending_free_lock, flags); - return ret; + return 0; + + out_of_memory: + free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages); + kfree(pending_reqs[mmap_alloc]); + WPRINTK("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; } static void mmap_req_del(int mmap) { - int i; - struct page *page; + BUG_ON(!spin_is_locked(&pending_free_lock)); - /*Spinlock already acquired*/ kfree(pending_reqs[mmap]); - kfree(pending_addrs[mmap]); - -#ifdef __ia64__ - /*Not sure what goes here yet!*/ -#else - - /* Unpin all of the pages. */ - page = mmap_start[mmap].mpage; - for (i=0; i<mmap_pages; i++) - put_page(&page[i]); + pending_reqs[mmap] = NULL; - balloon_dealloc_empty_page_range(mmap_start[mmap].mpage, mmap_pages); -#endif + free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages); + foreign_pages[mmap] = NULL; mmap_lock = 0; DPRINTK("# MMAPs decreased to %d\n",mmap_alloc); mmap_alloc--; } -/*N.B. 
Currently unused - will be accessed via sysfs*/ -static void req_decrease(void) -{ - pending_req_t *req; - int i; - unsigned long flags; - - spin_lock_irqsave(&pending_free_lock, flags); - - DPRINTK("Req decrease called.\n"); - if (mmap_lock || mmap_alloc == 1) - goto done; - - mmap_lock = 1; - mmap_inuse = MAX_PENDING_REQS; - - /*Go through reqs and remove any that aren't in use*/ - for (i = 0; i < MAX_PENDING_REQS ; i++) { - req = &pending_reqs[mmap_alloc-1][i]; - if (req->inuse == 0) { - list_del(&req->free_list); - mmap_inuse--; - } - } - if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1); - done: - spin_unlock_irqrestore(&pending_free_lock, flags); - return; -} - static pending_req_t* alloc_req(void) { pending_req_t *req = NULL; @@ -888,8 +868,8 @@ static void free_req(pending_req_t *req) wake_up(&pending_free_wq); } -static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int - tapidx) +static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, + int tapidx) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; unsigned int i, invcount = 0; @@ -897,49 +877,65 @@ static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, int uint64_t ptep; int ret, mmap_idx; unsigned long kvaddr, uvaddr; - - tap_blkif_t *info = tapfds[tapidx]; + tap_blkif_t *info; - if (info == NULL) { + + info = tapfds[tapidx]; + + if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) { WPRINTK("fast_flush: Couldn't get info!\n"); return; } + + if (info->vma != NULL && + xen_feature(XENFEAT_auto_translated_physmap)) { + down_write(&info->vma->vm_mm->mmap_sem); + zap_page_range(info->vma, + MMAP_VADDR(info->user_vstart, u_idx, 0), + req->nr_pages << PAGE_SHIFT, NULL); + up_write(&info->vma->vm_mm->mmap_sem); + } + mmap_idx = req->mem_idx; for (i = 0; i < req->nr_pages; i++) { - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i); + kvaddr = idx_to_kaddr(mmap_idx, k_idx, i); uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i); khandle = 
&pending_handle(mmap_idx, k_idx, i); - if (BLKTAP_INVALID_HANDLE(khandle)) { - WPRINTK("BLKTAP_INVALID_HANDLE\n"); - continue; + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[invcount], + idx_to_kaddr(mmap_idx, k_idx, i), + GNTMAP_host_map, khandle->kernel); + invcount++; } - gnttab_set_unmap_op(&unmap[invcount], - MMAP_VADDR(mmap_start[mmap_idx].start, k_idx, i), - GNTMAP_host_map, khandle->kernel); - invcount++; - - if (create_lookup_pte_addr( - info->vma->vm_mm, - MMAP_VADDR(info->user_vstart, u_idx, i), - &ptep) !=0) { - WPRINTK("Couldn't get a pte addr!\n"); - return; + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + if (create_lookup_pte_addr( + info->vma->vm_mm, + MMAP_VADDR(info->user_vstart, u_idx, i), + &ptep) !=0) { + WPRINTK("Couldn't get a pte addr!\n"); + return; + } + + gnttab_set_unmap_op(&unmap[invcount], ptep, + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + invcount++; } - gnttab_set_unmap_op(&unmap[invcount], - ptep, GNTMAP_host_map, - khandle->user); - invcount++; - BLKTAP_INVALIDATE_HANDLE(khandle); } ret = HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, unmap, invcount); BUG_ON(ret); - if (info->vma != NULL) + if (info->vma != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) zap_page_range(info->vma, MMAP_VADDR(info->user_vstart, u_idx, 0), req->nr_pages << PAGE_SHIFT, NULL); @@ -1002,7 +998,7 @@ int tap_blkif_schedule(void *arg) * COMPLETION CALLBACK -- Called by user level ioctl() */ -static int blktap_read_ufe_ring(int idx) +static int blktap_read_ufe_ring(tap_blkif_t *info) { /* This is called to read responses from the UFE ring. 
*/ RING_IDX i, j, rp; @@ -1010,12 +1006,9 @@ static int blktap_read_ufe_ring(int idx) blkif_t *blkif=NULL; int pending_idx, usr_idx, mmap_idx; pending_req_t *pending_req; - tap_blkif_t *info; - info = tapfds[idx]; - if (info == NULL) { + if (!info) return 0; - } /* We currently only forward packets in INTERCEPT_FE mode. */ if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE)) @@ -1026,11 +1019,14 @@ static int blktap_read_ufe_ring(int idx) rmb(); for (i = info->ufe_ring.rsp_cons; i != rp; i++) { + blkif_response_t res; resp = RING_GET_RESPONSE(&info->ufe_ring, i); + memcpy(&res, resp, sizeof(res)); + mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */ ++info->ufe_ring.rsp_cons; /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/ - usr_idx = (int)resp->id; + usr_idx = (int)res.id; pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx])); mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]); @@ -1053,9 +1049,8 @@ static int blktap_read_ufe_ring(int idx) struct page *pg; int offset; - uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, - pending_idx, j); + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j); pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); ClearPageReserved(pg); @@ -1063,10 +1058,10 @@ static int blktap_read_ufe_ring(int idx) >> PAGE_SHIFT; map[offset] = NULL; } - fast_flush_area(pending_req, pending_idx, usr_idx, idx); - make_response(blkif, pending_req->id, resp->operation, - resp->status); + fast_flush_area(pending_req, pending_idx, usr_idx, info->minor); info->idx_map[usr_idx] = INVALID_REQ; + make_response(blkif, pending_req->id, res.operation, + res.status); blkif_put(pending_req->blkif); free_req(pending_req); } @@ -1100,7 +1095,7 @@ static int print_dbug = 1; static int do_block_io_op(blkif_t *blkif) { blkif_back_ring_t *blk_ring = &blkif->blk_ring; - blkif_request_t *req; + blkif_request_t req; pending_req_t *pending_req; RING_IDX rc, rp; 
int more_to_do = 0; @@ -1111,7 +1106,7 @@ static int do_block_io_op(blkif_t *blkif) rmb(); /* Ensure we see queued requests up to 'rp'. */ /*Check blkif has corresponding UE ring*/ - if (blkif->dev_num == -1) { + if (blkif->dev_num < 0) { /*oops*/ if (print_dbug) { WPRINTK("Corresponding UE " @@ -1122,7 +1117,8 @@ static int do_block_io_op(blkif_t *blkif) } info = tapfds[blkif->dev_num]; - if (info == NULL || !info->dev_inuse) { + + if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) { if (print_dbug) { WPRINTK("Can't get UE info!\n"); print_dbug = 0; @@ -1152,24 +1148,24 @@ static int do_block_io_op(blkif_t *blkif) break; } - req = RING_GET_REQUEST(blk_ring, rc); + memcpy(&req, RING_GET_REQUEST(blk_ring, rc), sizeof(req)); blk_ring->req_cons = ++rc; /* before make_response() */ - switch (req->operation) { + switch (req.operation) { case BLKIF_OP_READ: blkif->st_rd_req++; - dispatch_rw_block_io(blkif, req, pending_req); + dispatch_rw_block_io(blkif, &req, pending_req); break; case BLKIF_OP_WRITE: blkif->st_wr_req++; - dispatch_rw_block_io(blkif, req, pending_req); + dispatch_rw_block_io(blkif, &req, pending_req); break; default: WPRINTK("unknown operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, + req.operation); + make_response(blkif, req.id, req.operation, BLKIF_RSP_ERROR); free_req(pending_req); break; @@ -1190,17 +1186,27 @@ static void dispatch_rw_block_io(blkif_t *blkif, struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; unsigned int nseg; int ret, i; - tap_blkif_t *info = tapfds[blkif->dev_num]; + tap_blkif_t *info; uint64_t sector; - blkif_request_t *target; int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx); - int usr_idx = GET_NEXT_REQ(info->idx_map); + int usr_idx; uint16_t mmap_idx = pending_req->mem_idx; - /*Check we have space on user ring - should never fail*/ - if(usr_idx == INVALID_REQ) goto fail_flush; - + if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV) + goto 
fail_response; + + info = tapfds[blkif->dev_num]; + if (info == NULL) + goto fail_response; + + /* Check we have space on user ring - should never fail. */ + usr_idx = GET_NEXT_REQ(info->idx_map); + if (usr_idx == INVALID_REQ) { + BUG(); + goto fail_response; + } + /* Check that number of segments is sane. */ nseg = req->nr_segments; if ( unlikely(nseg == 0) || @@ -1233,15 +1239,12 @@ static void dispatch_rw_block_io(blkif_t *blkif, unsigned long uvaddr; unsigned long kvaddr; uint64_t ptep; - struct page *page; uint32_t flags; uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, - pending_idx, i); - page = virt_to_page(kvaddr); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); - sector = req->sector_number + (8*i); + sector = req->sector_number + ((PAGE_SIZE / 512) * i); if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) { WPRINTK("BLKTAP: Sector request greater" "than size\n"); @@ -1251,7 +1254,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, BLKIF_OP_WRITE ? "WRITE" : "READ"), (long long unsigned) sector, (long long unsigned) sector>>9, - blkif->sectors); + (long long unsigned) blkif->sectors); } flags = GNTMAP_host_map; @@ -1261,71 +1264,123 @@ static void dispatch_rw_block_io(blkif_t *blkif, req->seg[i].gref, blkif->domid); op++; - /* Now map it to user. */ - ret = create_lookup_pte_addr(info->vma->vm_mm, - uvaddr, &ptep); - if (ret) { - WPRINTK("Couldn't get a pte addr!\n"); - fast_flush_area(pending_req, pending_idx, usr_idx, - blkif->dev_num); - goto fail_flush; - } + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Now map it to user. 
*/ + ret = create_lookup_pte_addr(info->vma->vm_mm, + uvaddr, &ptep); + if (ret) { + WPRINTK("Couldn't get a pte addr!\n"); + goto fail_flush; + } - flags = GNTMAP_host_map | GNTMAP_application_map - | GNTMAP_contains_pte; - if (operation == WRITE) - flags |= GNTMAP_readonly; - gnttab_set_map_op(&map[op], ptep, flags, - req->seg[i].gref, blkif->domid); - op++; + flags = GNTMAP_host_map | GNTMAP_application_map + | GNTMAP_contains_pte; + if (operation == WRITE) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[op], ptep, flags, + req->seg[i].gref, blkif->domid); + op++; + } } ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op); BUG_ON(ret); - for (i = 0; i < (nseg*2); i+=2) { - unsigned long uvaddr; - unsigned long kvaddr; - unsigned long offset; - struct page *pg; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + for (i = 0; i < (nseg*2); i+=2) { + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + struct page *pg; - uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, - pending_idx, i/2); + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2); - if (unlikely(map[i].status != 0)) { - WPRINTK("invalid kernel buffer -- " - "could not remap it\n"); - goto fail_flush; - } + if (unlikely(map[i].status != 0)) { + WPRINTK("invalid kernel buffer -- " + "could not remap it\n"); + ret |= 1; + map[i].handle = INVALID_GRANT_HANDLE; + } - if (unlikely(map[i+1].status != 0)) { - WPRINTK("invalid user buffer -- " - "could not remap it\n"); - goto fail_flush; + if (unlikely(map[i+1].status != 0)) { + WPRINTK("invalid user buffer -- " + "could not remap it\n"); + ret |= 1; + map[i+1].handle = INVALID_GRANT_HANDLE; + } + + pending_handle(mmap_idx, pending_idx, i/2).kernel + = map[i].handle; + pending_handle(mmap_idx, pending_idx, i/2).user + = map[i+1].handle; + + if (ret) + continue; + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + 
FOREIGN_FRAME(map[i].dev_bus_addr + >> PAGE_SHIFT)); + offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + ((struct page **)info->vma->vm_private_data)[offset] = + pg; } + } else { + for (i = 0; i < nseg; i++) { + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + struct page *pg; - pending_handle(mmap_idx, pending_idx, i/2).kernel - = map[i].handle; - pending_handle(mmap_idx, pending_idx, i/2).user - = map[i+1].handle; - set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, - FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); - offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; - pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); - ((struct page **)info->vma->vm_private_data)[offset] = - pg; + uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); + + if (unlikely(map[i].status != 0)) { + WPRINTK("invalid kernel buffer -- " + "could not remap it\n"); + ret |= 1; + map[i].handle = INVALID_GRANT_HANDLE; + } + + pending_handle(mmap_idx, pending_idx, i).kernel + = map[i].handle; + + if (ret) + continue; + + offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; + pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + ((struct page **)info->vma->vm_private_data)[offset] = + pg; + } } + + if (ret) + goto fail_flush; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + down_write(&info->vma->vm_mm->mmap_sem); /* Mark mapped pages as reserved: */ for (i = 0; i < req->nr_segments; i++) { unsigned long kvaddr; struct page *pg; - kvaddr = MMAP_VADDR(mmap_start[mmap_idx].start, - pending_idx, i); + kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); SetPageReserved(pg); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + ret = vm_insert_page(info->vma, + MMAP_VADDR(info->user_vstart, + usr_idx, i), pg); + if (ret) { + up_write(&info->vma->vm_mm->mmap_sem); + goto fail_flush; + } + } } + if 
(xen_feature(XENFEAT_auto_translated_physmap)) + up_write(&info->vma->vm_mm->mmap_sem); /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/ info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx); @@ -1336,6 +1391,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, info->ufe_ring.req_prod_pvt); memcpy(target, req, sizeof(*req)); target->id = usr_idx; + wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ info->ufe_ring.req_prod_pvt++; return; @@ -1393,7 +1449,6 @@ static void make_response(blkif_t *blkif, unsigned long id, static int __init blkif_init(void) { int i,ret,blktap_dir; - tap_blkif_t *info; if (!is_running_on_xen()) return -ENODEV; @@ -1413,10 +1468,8 @@ static int __init blkif_init(void) tap_blkif_xenbus_init(); - /*Create the blktap devices, but do not map memory or waitqueue*/ - for(i = 0; i < MAX_TAP_DEV; i++) translate_domid[i].domid = 0xFFFF; - - ret = register_chrdev(BLKTAP_DEV_MAJOR,"blktap",&blktap_fops); + /* Dynamically allocate a major for this device */ + ret = register_chrdev(0, "blktap", &blktap_fops); blktap_dir = devfs_mk_dir(NULL, "xen", 0, NULL); if ( (ret < 0)||(blktap_dir < 0) ) { @@ -1424,22 +1477,36 @@ static int __init blkif_init(void) return -ENOMEM; } - for(i = 0; i < MAX_TAP_DEV; i++ ) { - info = tapfds[i] = kzalloc(sizeof(tap_blkif_t),GFP_KERNEL); - if(tapfds[i] == NULL) return -ENOMEM; - info->minor = i; - info->pid = 0; - info->blkif = NULL; + blktap_major = ret; - ret = devfs_mk_cdev(MKDEV(BLKTAP_DEV_MAJOR, i), - S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i); + /* tapfds[0] is always NULL */ + blktap_next_minor++; - if(ret != 0) return -ENOMEM; - info->dev_pending = info->dev_inuse = 0; + ret = devfs_mk_cdev(MKDEV(blktap_major, i), + S_IFCHR|S_IRUGO|S_IWUSR, "xen/blktap%d", i); - DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); + if(ret != 0) + return -ENOMEM; + + DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); + + /* Make sure the xen class exists */ + if (!setup_xen_class()) { + /* + * This will allow 
udev to create the blktap ctrl device. + * We only want to create blktap0 first. We don't want + * to flood the sysfs system with needless blktap devices. + * We only create the device when a request of a new device is + * made. + */ + class_device_create(xen_class, NULL, + MKDEV(blktap_major, 0), NULL, + "blktap0"); + } else { + /* this is bad, but not fatal */ + WPRINTK("blktap: sysfs xen_class not created\n"); } - + DPRINTK("Blktap device successfully created\n"); return 0; diff --git a/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c index 6c16a2e60b..553ad45c48 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c @@ -189,7 +189,7 @@ static int blktap_probe(struct xenbus_device *dev, return 0; fail: - DPRINTK("blktap probe failed"); + DPRINTK("blktap probe failed\n"); blktap_remove(dev); return err; } @@ -243,7 +243,7 @@ static void tap_frontend_changed(struct xenbus_device *dev, struct backend_info *be = dev->dev.driver_data; int err; - DPRINTK(""); + DPRINTK("\n"); switch (frontend_state) { case XenbusStateInitialising: @@ -273,7 +273,6 @@ static void tap_frontend_changed(struct xenbus_device *dev, kthread_stop(be->blkif->xenblkd); be->blkif->xenblkd = NULL; } - tap_blkif_unmap(be->blkif); xenbus_switch_state(dev, XenbusStateClosing); break; @@ -319,7 +318,7 @@ static int connect_ring(struct backend_info *be) unsigned int evtchn; int err; - DPRINTK("%s", dev->otherend); + DPRINTK("%s\n", dev->otherend); err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref, "event-channel", "%u", &evtchn, NULL); diff --git a/linux-2.6-xen-sparse/drivers/xen/char/mem.c b/linux-2.6-xen-sparse/drivers/xen/char/mem.c index 6576135c99..ac85c8dbb2 100644 --- a/linux-2.6-xen-sparse/drivers/xen/char/mem.c +++ b/linux-2.6-xen-sparse/drivers/xen/char/mem.c @@ -28,13 +28,12 @@ #include <asm/io.h> #include <asm/hypervisor.h> -static inline int 
uncached_access(struct file *file) +#ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE +static inline int valid_phys_addr_range(unsigned long addr, size_t *count) { - if (file->f_flags & O_SYNC) - return 1; - /* Xen sets correct MTRR type on non-RAM for us. */ - return 0; + return 1; } +#endif /* * This funcion reads the *physical* memory. The f_pos points directly to the @@ -47,6 +46,9 @@ static ssize_t read_mem(struct file * file, char __user * buf, ssize_t read = 0, sz; void __iomem *v; + if (!valid_phys_addr_range(p, &count)) + return -EFAULT; + while (count > 0) { /* * Handle first page in case it's not aligned @@ -58,13 +60,15 @@ static ssize_t read_mem(struct file * file, char __user * buf, sz = min_t(unsigned long, sz, count); - if ((v = ioremap(p, sz)) == NULL) { + v = xlate_dev_mem_ptr(p, sz); + if (IS_ERR(v) || v == NULL) { /* - * Some programs (e.g., dmidecode) groove off into weird RAM - * areas where no tables can possibly exist (because Xen will - * have stomped on them!). These programs get rather upset if - * we let them know that Xen failed their access, so we fake - * out a read of all zeroes. :-) + * Some programs (e.g., dmidecode) groove off into + * weird RAM areas where no tables can possibly exist + * (because Xen will have stomped on them!). These + * programs get rather upset if we let them know that + * Xen failed their access, so we fake out a read of + * all zeroes. 
*/ if (clear_user(buf, count)) return -EFAULT; @@ -73,7 +77,7 @@ static ssize_t read_mem(struct file * file, char __user * buf, } ignored = copy_to_user(buf, v, sz); - iounmap(v); + xlate_dev_mem_ptr_unmap(v); if (ignored) return -EFAULT; buf += sz; @@ -93,6 +97,9 @@ static ssize_t write_mem(struct file * file, const char __user * buf, ssize_t written = 0, sz; void __iomem *v; + if (!valid_phys_addr_range(p, &count)) + return -EFAULT; + while (count > 0) { /* * Handle first page in case it's not aligned @@ -104,11 +111,17 @@ static ssize_t write_mem(struct file * file, const char __user * buf, sz = min_t(unsigned long, sz, count); - if ((v = ioremap(p, sz)) == NULL) + v = xlate_dev_mem_ptr(p, sz); + if (v == NULL) break; + if (IS_ERR(v)) { + if (written == 0) + return PTR_ERR(v); + break; + } ignored = copy_from_user(v, buf, sz); - iounmap(v); + xlate_dev_mem_ptr_unmap(v); if (ignored) { written += sz - ignored; if (written) @@ -125,6 +138,15 @@ static ssize_t write_mem(struct file * file, const char __user * buf, return written; } +#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM +static inline int uncached_access(struct file *file) +{ + if (file->f_flags & O_SYNC) + return 1; + /* Xen sets correct MTRR type on non-RAM for us. 
*/ + return 0; +} + static int mmap_mem(struct file * file, struct vm_area_struct * vma) { size_t size = vma->vm_end - vma->vm_start; @@ -136,6 +158,7 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, vma->vm_page_prot, DOMID_IO); } +#endif /* * The memory devices use the full 32/64 bits of the offset, and so we cannot diff --git a/linux-2.6-xen-sparse/drivers/xen/console/console.c b/linux-2.6-xen-sparse/drivers/xen/console/console.c index a45d21a69c..ff3e2a411e 100644 --- a/linux-2.6-xen-sparse/drivers/xen/console/console.c +++ b/linux-2.6-xen-sparse/drivers/xen/console/console.c @@ -49,6 +49,7 @@ #include <linux/console.h> #include <linux/bootmem.h> #include <linux/sysrq.h> +#include <linux/screen_info.h> #include <asm/io.h> #include <asm/irq.h> #include <asm/uaccess.h> @@ -266,6 +267,41 @@ void xencons_force_flush(void) } +void dom0_init_screen_info(const struct dom0_vga_console_info *info) +{ + switch (info->video_type) { + case XEN_VGATYPE_TEXT_MODE_3: + screen_info.orig_video_mode = 3; + screen_info.orig_video_ega_bx = 3; + screen_info.orig_video_isVGA = 1; + screen_info.orig_video_lines = info->u.text_mode_3.rows; + screen_info.orig_video_cols = info->u.text_mode_3.columns; + screen_info.orig_x = info->u.text_mode_3.cursor_x; + screen_info.orig_y = info->u.text_mode_3.cursor_y; + screen_info.orig_video_points = + info->u.text_mode_3.font_height; + break; + case XEN_VGATYPE_VESA_LFB: + screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; + screen_info.lfb_width = info->u.vesa_lfb.width; + screen_info.lfb_height = info->u.vesa_lfb.height; + screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel; + screen_info.lfb_base = info->u.vesa_lfb.lfb_base; + screen_info.lfb_size = info->u.vesa_lfb.lfb_size; + screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line; + screen_info.red_size = info->u.vesa_lfb.red_size; + screen_info.red_pos = info->u.vesa_lfb.red_pos; + 
screen_info.green_size = info->u.vesa_lfb.green_size; + screen_info.green_pos = info->u.vesa_lfb.green_pos; + screen_info.blue_size = info->u.vesa_lfb.blue_size; + screen_info.blue_pos = info->u.vesa_lfb.blue_pos; + screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size; + screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos; + break; + } +} + + /******************** User-space console driver (/dev/console) ************/ #define DRV(_d) (_d) diff --git a/linux-2.6-xen-sparse/drivers/xen/core/Makefile b/linux-2.6-xen-sparse/drivers/xen/core/Makefile index c1b0c1bd51..6154454339 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/Makefile +++ b/linux-2.6-xen-sparse/drivers/xen/core/Makefile @@ -9,5 +9,5 @@ obj-$(CONFIG_SYSFS) += hypervisor_sysfs.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o obj-$(CONFIG_XEN_SKBUFF) += skbuff.o -obj-$(CONFIG_XEN_REBOOT) += reboot.o +obj-$(CONFIG_XEN_REBOOT) += reboot.o machine_reboot.o obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o diff --git a/linux-2.6-xen-sparse/drivers/xen/core/features.c b/linux-2.6-xen-sparse/drivers/xen/core/features.c index 4d50caf50b..a76f58c04d 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/features.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/features.c @@ -11,6 +11,10 @@ #include <asm/hypervisor.h> #include <xen/features.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; /* Not a GPL symbol: used in ubiquitous macros, so too restrictive. 
*/ EXPORT_SYMBOL(xen_features); diff --git a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c index 3195279a87..c5132c13bb 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c @@ -44,6 +44,10 @@ #include <asm/io.h> #include <xen/interface/memory.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 diff --git a/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c b/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c new file mode 100644 index 0000000000..02ee7f4728 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c @@ -0,0 +1,185 @@ +#define __KERNEL_SYSCALLS__ +#include <linux/version.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/unistd.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> +#include <linux/stringify.h> +#include <asm/irq.h> +#include <asm/mmu_context.h> +#include <xen/evtchn.h> +#include <asm/hypervisor.h> +#include <xen/interface/dom0_ops.h> +#include <xen/xenbus.h> +#include <linux/cpu.h> +#include <linux/kthread.h> +#include <xen/gnttab.h> +#include <xen/xencons.h> +#include <xen/cpu_hotplug.h> + +#if defined(__i386__) || defined(__x86_64__) + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +void machine_emergency_restart(void) +{ + /* We really want to get pending console data out before we die. */ + xencons_force_flush(); + HYPERVISOR_shutdown(SHUTDOWN_reboot); +} + +void machine_restart(char * __unused) +{ + machine_emergency_restart(); +} + +void machine_halt(void) +{ + machine_power_off(); +} + +void machine_power_off(void) +{ + /* We really want to get pending console data out before we die. 
*/ + xencons_force_flush(); + if (pm_power_off) + pm_power_off(); + HYPERVISOR_shutdown(SHUTDOWN_poweroff); +} + +int reboot_thru_bios = 0; /* for dmi_scan.c */ +EXPORT_SYMBOL(machine_restart); +EXPORT_SYMBOL(machine_halt); +EXPORT_SYMBOL(machine_power_off); + +/* Ensure we run on the idle task page tables so that we will + switch page tables before running user space. This is needed + on architectures with separate kernel and user page tables + because the user page table pointer is not saved/restored. */ +static void switch_idle_mm(void) +{ + struct mm_struct *mm = current->active_mm; + + if (mm == &init_mm) + return; + + atomic_inc(&init_mm.mm_count); + switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; + mmdrop(mm); +} + +static void pre_suspend(void) +{ + HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; + clear_fixmap(FIX_SHARED_INFO); + + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); +} + +static void post_suspend(void) +{ + int i, j, k, fpp; + extern unsigned long max_pfn; + extern unsigned long *pfn_to_mfn_frame_list_list; + extern unsigned long *pfn_to_mfn_frame_list[]; + + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + + memset(empty_zero_page, 0, PAGE_SIZE); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(pfn_to_mfn_frame_list_list); + + fpp = PAGE_SIZE/sizeof(unsigned long); + for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { + if ((j % fpp) == 0) { + k++; + pfn_to_mfn_frame_list_list[k] = + virt_to_mfn(pfn_to_mfn_frame_list[k]); + j = 0; + } + pfn_to_mfn_frame_list[k][j] = + virt_to_mfn(&phys_to_machine_mapping[i]); + } + HYPERVISOR_shared_info->arch.max_pfn = max_pfn; +} + +#else /* !(defined(__i386__) || defined(__x86_64__)) */ + +#define switch_idle_mm() ((void)0) +#define mm_pin_all() ((void)0) 
+#define pre_suspend() ((void)0) +#define post_suspend() ((void)0) + +#endif + +int __xen_suspend(void) +{ + int err; + + extern void time_resume(void); + + BUG_ON(smp_processor_id() != 0); + BUG_ON(in_interrupt()); + +#if defined(__i386__) || defined(__x86_64__) + if (xen_feature(XENFEAT_auto_translated_physmap)) { + printk(KERN_WARNING "Cannot suspend in " + "auto_translated_physmap mode.\n"); + return -EOPNOTSUPP; + } +#endif + + err = smp_suspend(); + if (err) + return err; + + xenbus_suspend(); + + preempt_disable(); + + mm_pin_all(); + local_irq_disable(); + preempt_enable(); + + gnttab_suspend(); + + pre_suspend(); + + /* + * We'll stop somewhere inside this hypercall. When it returns, + * we'll start resuming after the restore. + */ + HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + + post_suspend(); + + gnttab_resume(); + + irq_resume(); + + time_resume(); + + switch_idle_mm(); + + local_irq_enable(); + + xencons_resume(); + + xenbus_resume(); + + smp_resume(); + + return err; +} diff --git a/linux-2.6-xen-sparse/drivers/xen/core/reboot.c b/linux-2.6-xen-sparse/drivers/xen/core/reboot.c index 34c3930961..af3fe3a15c 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/reboot.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/reboot.c @@ -1,25 +1,15 @@ #define __KERNEL_SYSCALLS__ #include <linux/version.h> #include <linux/kernel.h> -#include <linux/mm.h> #include <linux/unistd.h> #include <linux/module.h> #include <linux/reboot.h> #include <linux/sysrq.h> -#include <linux/stringify.h> -#include <asm/irq.h> -#include <asm/mmu_context.h> -#include <xen/evtchn.h> #include <asm/hypervisor.h> -#include <xen/interface/dom0_ops.h> #include <xen/xenbus.h> -#include <linux/cpu.h> #include <linux/kthread.h> -#include <xen/gnttab.h> -#include <xen/xencons.h> -#include <xen/cpu_hotplug.h> -extern void ctrl_alt_del(void); +MODULE_LICENSE("Dual BSD/GPL"); #define SHUTDOWN_INVALID -1 #define SHUTDOWN_POWEROFF 0 @@ -31,186 +21,18 @@ extern void ctrl_alt_del(void); */ #define 
SHUTDOWN_HALT 4 -#if defined(__i386__) || defined(__x86_64__) - -/* - * Power off function, if any - */ -void (*pm_power_off)(void); -EXPORT_SYMBOL(pm_power_off); - -void machine_emergency_restart(void) -{ - /* We really want to get pending console data out before we die. */ - xencons_force_flush(); - HYPERVISOR_shutdown(SHUTDOWN_reboot); -} - -void machine_restart(char * __unused) -{ - machine_emergency_restart(); -} - -void machine_halt(void) -{ - machine_power_off(); -} - -void machine_power_off(void) -{ - /* We really want to get pending console data out before we die. */ - xencons_force_flush(); - if (pm_power_off) - pm_power_off(); - HYPERVISOR_shutdown(SHUTDOWN_poweroff); -} - -int reboot_thru_bios = 0; /* for dmi_scan.c */ -EXPORT_SYMBOL(machine_restart); -EXPORT_SYMBOL(machine_halt); -EXPORT_SYMBOL(machine_power_off); - -#endif /* defined(__i386__) || defined(__x86_64__) */ - -/****************************************************************************** - * Stop/pickle callback handling. - */ - /* Ignore multiple shutdown requests. */ static int shutting_down = SHUTDOWN_INVALID; + static void __shutdown_handler(void *unused); static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); -#if defined(__i386__) || defined(__x86_64__) - -/* Ensure we run on the idle task page tables so that we will - switch page tables before running user space. This is needed - on architectures with separate kernel and user page tables - because the user page table pointer is not saved/restored. 
*/ -static void switch_idle_mm(void) -{ - struct mm_struct *mm = current->active_mm; - - if (mm == &init_mm) - return; - - atomic_inc(&init_mm.mm_count); - switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; - mmdrop(mm); -} - -static void pre_suspend(void) -{ - HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; - clear_fixmap(FIX_SHARED_INFO); - - xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); - xen_start_info->console.domU.mfn = - mfn_to_pfn(xen_start_info->console.domU.mfn); -} - -static void post_suspend(void) -{ - int i, j, k, fpp; - extern unsigned long max_pfn; - extern unsigned long *pfn_to_mfn_frame_list_list; - extern unsigned long *pfn_to_mfn_frame_list[]; - - set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); - - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - - memset(empty_zero_page, 0, PAGE_SIZE); - - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(pfn_to_mfn_frame_list_list); - - fpp = PAGE_SIZE/sizeof(unsigned long); - for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { - if ((j % fpp) == 0) { - k++; - pfn_to_mfn_frame_list_list[k] = - virt_to_mfn(pfn_to_mfn_frame_list[k]); - j = 0; - } - pfn_to_mfn_frame_list[k][j] = - virt_to_mfn(&phys_to_machine_mapping[i]); - } - HYPERVISOR_shared_info->arch.max_pfn = max_pfn; -} - -#else /* !(defined(__i386__) || defined(__x86_64__)) */ - -#define switch_idle_mm() ((void)0) -#define mm_pin_all() ((void)0) -#define pre_suspend() ((void)0) -#define post_suspend() ((void)0) - -#endif - -static int __do_suspend(void *ignore) -{ - int err; - - extern void time_resume(void); - - BUG_ON(smp_processor_id() != 0); - BUG_ON(in_interrupt()); - -#if defined(__i386__) || defined(__x86_64__) - if (xen_feature(XENFEAT_auto_translated_physmap)) { - printk(KERN_WARNING "Cannot suspend in " - "auto_translated_physmap mode.\n"); - return -EOPNOTSUPP; - } +#ifdef CONFIG_XEN +int __xen_suspend(void); +#else +#define 
__xen_suspend() (void)0 #endif - err = smp_suspend(); - if (err) - return err; - - xenbus_suspend(); - - preempt_disable(); - - mm_pin_all(); - local_irq_disable(); - preempt_enable(); - - gnttab_suspend(); - - pre_suspend(); - - /* - * We'll stop somewhere inside this hypercall. When it returns, - * we'll start resuming after the restore. - */ - HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); - - shutting_down = SHUTDOWN_INVALID; - - post_suspend(); - - gnttab_resume(); - - irq_resume(); - - time_resume(); - - switch_idle_mm(); - - local_irq_enable(); - - xencons_resume(); - - xenbus_resume(); - - smp_resume(); - - return err; -} - static int shutdown_process(void *__unused) { static char *envp[] = { "HOME=/", "TERM=linux", @@ -222,11 +44,13 @@ static int shutdown_process(void *__unused) if ((shutting_down == SHUTDOWN_POWEROFF) || (shutting_down == SHUTDOWN_HALT)) { - if (execve("/sbin/poweroff", poweroff_argv, envp) < 0) { + if (call_usermodehelper("/sbin/poweroff", poweroff_argv, envp, 0) < 0) { +#ifdef CONFIG_XEN sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_POWER_OFF, NULL); +#endif /* CONFIG_XEN */ } } @@ -235,6 +59,13 @@ static int shutdown_process(void *__unused) return 0; } +static int xen_suspend(void *__unused) +{ + __xen_suspend(); + shutting_down = SHUTDOWN_INVALID; + return 0; +} + static int kthread_create_on_cpu(int (*f)(void *arg), void *arg, const char *name, @@ -257,7 +88,7 @@ static void __shutdown_handler(void *unused) err = kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES); else - err = kthread_create_on_cpu(__do_suspend, NULL, "suspend", 0); + err = kthread_create_on_cpu(xen_suspend, NULL, "suspend", 0); if (err < 0) { printk(KERN_WARNING "Error creating shutdown process (%d): " @@ -298,7 +129,7 @@ static void shutdown_handler(struct xenbus_watch *watch, if (strcmp(str, "poweroff") == 0) shutting_down = SHUTDOWN_POWEROFF; else if (strcmp(str, "reboot") == 0) - ctrl_alt_del(); + kill_proc(1, SIGINT, 1); /* 
interrupt init */ else if (strcmp(str, "suspend") == 0) shutting_down = SHUTDOWN_SUSPEND; else if (strcmp(str, "halt") == 0) @@ -364,10 +195,14 @@ static int setup_shutdown_watcher(struct notifier_block *notifier, err = register_xenbus_watch(&shutdown_watch); if (err) printk(KERN_ERR "Failed to set shutdown watcher\n"); + else + xenbus_write(XBT_NIL, "control", "feature-reboot", "1"); err = register_xenbus_watch(&sysrq_watch); if (err) printk(KERN_ERR "Failed to set sysrq watcher\n"); + else + xenbus_write(XBT_NIL, "control", "feature-sysrq", "1"); return NOTIFY_DONE; } @@ -378,6 +213,7 @@ static int __init setup_shutdown_event(void) .notifier_call = setup_shutdown_watcher }; register_xenstore_notifier(&xenstore_notifier); + return 0; } diff --git a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c index a4a2e4edce..2fa88069c4 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c @@ -18,7 +18,12 @@ /*static*/ kmem_cache_t *skbuff_cachep; EXPORT_SYMBOL(skbuff_cachep); -#define MAX_SKBUFF_ORDER 4 +/* Allow up to 64kB or page-sized packets (whichever is greater). 
*/ +#if PAGE_SHIFT < 16 +#define MAX_SKBUFF_ORDER (16 - PAGE_SHIFT) +#else +#define MAX_SKBUFF_ORDER 0 +#endif static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1]; static struct { diff --git a/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c b/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c index 76bfab82e6..32f8de5bff 100644 --- a/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c +++ b/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c @@ -419,10 +419,9 @@ static struct file_operations evtchn_fops = { }; static struct miscdevice evtchn_miscdev = { - .minor = EVTCHN_MINOR, + .minor = MISC_DYNAMIC_MINOR, .name = "evtchn", .fops = &evtchn_fops, - .devfs_name = "misc/evtchn", }; static int __init evtchn_init(void) diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/common.h b/linux-2.6-xen-sparse/drivers/xen/netback/common.h index 434ff6bcf2..367c008d3b 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/common.h +++ b/linux-2.6-xen-sparse/drivers/xen/netback/common.h @@ -92,6 +92,9 @@ typedef struct netif_st { unsigned long remaining_credit; struct timer_list credit_timeout; + /* Enforce draining of the transmit queue. */ + struct timer_list tx_queue_timeout; + /* Miscellaneous private stuff. 
*/ struct list_head list; /* scheduling list */ atomic_t refcnt; @@ -106,7 +109,7 @@ typedef struct netif_st { void netif_disconnect(netif_t *netif); -netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]); +netif_t *netif_alloc(domid_t domid, unsigned int handle); int netif_map(netif_t *netif, unsigned long tx_ring_ref, unsigned long rx_ring_ref, unsigned int evtchn); @@ -119,6 +122,8 @@ int netif_map(netif_t *netif, unsigned long tx_ring_ref, void netif_xenbus_init(void); +#define netif_schedulable(dev) (netif_running(dev) && netif_carrier_ok(dev)) + void netif_schedule_work(netif_t *netif); void netif_deschedule_work(netif_t *netif); diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/interface.c b/linux-2.6-xen-sparse/drivers/xen/netback/interface.c index d60b23b0f2..9fae954bd2 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/interface.c +++ b/linux-2.6-xen-sparse/drivers/xen/netback/interface.c @@ -34,6 +34,23 @@ #include <linux/ethtool.h> #include <linux/rtnetlink.h> +/* + * Module parameter 'queue_length': + * + * Enables queuing in the network stack when a client has run out of receive + * descriptors. Although this feature can improve receive bandwidth by avoiding + * packet loss, it can also result in packets sitting in the 'tx_queue' for + * unbounded time. This is bad if those packets hold onto foreign resources. + * For example, consider a packet that holds onto resources belonging to the + * guest for which it is queued (e.g., packet received on vif1.0, destined for + * vif1.1 which is not activated in the guest): in this situation the guest + * will never be destroyed, unless vif1.1 is taken down. To avoid this, we + * run a timer (tx_queue_timeout) to drain the queue when the interface is + * blocked. 
+ */ +static unsigned long netbk_queue_length = 32; +module_param_named(queue_length, netbk_queue_length, ulong, 0); + static void __netif_up(netif_t *netif) { enable_irq(netif->irq); @@ -107,9 +124,9 @@ static struct ethtool_ops network_ethtool_ops = .get_link = ethtool_op_get_link, }; -netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]) +netif_t *netif_alloc(domid_t domid, unsigned int handle) { - int err = 0, i; + int err = 0; struct net_device *dev; netif_t *netif; char name[IFNAMSIZ] = {}; @@ -134,6 +151,10 @@ netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]) netif->credit_bytes = netif->remaining_credit = ~0UL; netif->credit_usec = 0UL; init_timer(&netif->credit_timeout); + /* Initialize 'expires' now: it's used to track the credit window. */ + netif->credit_timeout.expires = jiffies; + + init_timer(&netif->tx_queue_timeout); dev->hard_start_xmit = netif_be_start_xmit; dev->get_stats = netif_be_get_stats; @@ -144,26 +165,16 @@ netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]) SET_ETHTOOL_OPS(dev, &network_ethtool_ops); + dev->tx_queue_len = netbk_queue_length; + /* - * Reduce default TX queuelen so that each guest interface only - * allows it to eat around 6.4MB of host memory. - */ - dev->tx_queue_len = 100; - - for (i = 0; i < ETH_ALEN; i++) - if (be_mac[i] != 0) - break; - if (i == ETH_ALEN) { - /* - * Initialise a dummy MAC address. We choose the numerically - * largest non-broadcast address to prevent the address getting - * stolen by an Ethernet bridge for STP purposes. - * (FE:FF:FF:FF:FF:FF) - */ - memset(dev->dev_addr, 0xFF, ETH_ALEN); - dev->dev_addr[0] &= ~0x01; - } else - memcpy(dev->dev_addr, be_mac, ETH_ALEN); + * Initialise a dummy MAC address. We choose the numerically + * largest non-broadcast address to prevent the address getting + * stolen by an Ethernet bridge for STP purposes. 
+ * (FE:FF:FF:FF:FF:FF) + */ + memset(dev->dev_addr, 0xFF, ETH_ALEN); + dev->dev_addr[0] &= ~0x01; rtnl_lock(); err = register_netdevice(dev); @@ -306,11 +317,23 @@ err_rx: return err; } -static void netif_free(netif_t *netif) +void netif_disconnect(netif_t *netif) { + if (netif_carrier_ok(netif->dev)) { + rtnl_lock(); + netif_carrier_off(netif->dev); + if (netif_running(netif->dev)) + __netif_down(netif); + rtnl_unlock(); + netif_put(netif); + } + atomic_dec(&netif->refcnt); wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0); + del_timer_sync(&netif->credit_timeout); + del_timer_sync(&netif->tx_queue_timeout); + if (netif->irq) unbind_from_irqhandler(netif->irq, netif); @@ -324,16 +347,3 @@ static void netif_free(netif_t *netif) free_netdev(netif->dev); } - -void netif_disconnect(netif_t *netif) -{ - if (netif_carrier_ok(netif->dev)) { - rtnl_lock(); - netif_carrier_off(netif->dev); - if (netif_running(netif->dev)) - __netif_down(netif); - rtnl_unlock(); - netif_put(netif); - } - netif_free(netif); -} diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c index 391ace8a02..d021c9689a 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c +++ b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c @@ -53,8 +53,10 @@ #include <linux/skbuff.h> #include <linux/ethtool.h> #include <net/dst.h> +#include <net/xfrm.h> /* secpath_reset() */ +#include <asm/hypervisor.h> /* is_initial_xendomain() */ -static int nloopbacks = 8; +static int nloopbacks = -1; module_param(nloopbacks, int, 0); MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create"); @@ -77,10 +79,60 @@ static int loopback_close(struct net_device *dev) return 0; } +#ifdef CONFIG_X86 +static int is_foreign(unsigned long pfn) +{ + /* NB. Play it safe for auto-translation mode. 
*/ + return (xen_feature(XENFEAT_auto_translated_physmap) || + (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT)); +} +#else +/* How to detect a foreign mapping? Play it safe. */ +#define is_foreign(pfn) (1) +#endif + +static int skb_remove_foreign_references(struct sk_buff *skb) +{ + struct page *page; + unsigned long pfn; + int i, off; + char *vaddr; + + BUG_ON(skb_shinfo(skb)->frag_list); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page); + if (!is_foreign(pfn)) + continue; + + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!page)) + return 0; + + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + off = skb_shinfo(skb)->frags[i].page_offset; + memcpy(page_address(page) + off, + vaddr + off, + skb_shinfo(skb)->frags[i].size); + kunmap_skb_frag(vaddr); + + put_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->frags[i].page = page; + } + + return 1; +} + static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_private *np = netdev_priv(dev); + if (!skb_remove_foreign_references(skb)) { + np->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; + } + dst_release(skb->dst); skb->dst = NULL; @@ -110,6 +162,11 @@ static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->protocol = eth_type_trans(skb, dev); skb->dev = dev; dev->last_rx = jiffies; + + /* Flush netfilter context: rx'ed skbuffs not expected to have any. */ + nf_reset(skb); + secpath_reset(skb); + netif_rx(skb); return 0; @@ -239,6 +296,9 @@ static int __init loopback_init(void) { int i, err = 0; + if (nloopbacks == -1) + nloopbacks = is_initial_xendomain() ? 
4 : 0; + for (i = 0; i < nloopbacks; i++) if ((err = make_loopback(i)) != 0) break; diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c index ad8236c82f..1d24fc9b88 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c +++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c @@ -70,14 +70,15 @@ static struct timer_list net_timer; static struct sk_buff_head rx_queue; -static unsigned long mmap_vstart; -#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) - -static void *rx_mmap_area; +static struct page **mmap_pages; +static inline unsigned long idx_to_kaddr(unsigned int idx) +{ + return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx])); +} #define PKT_PROT_LEN 64 -static struct { +static struct pending_tx_info { netif_tx_request_t req; netif_t *netif; } pending_tx_info[MAX_PENDING_REQS]; @@ -186,7 +187,7 @@ static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) if (unlikely(!nskb)) goto err; - skb_reserve(nskb, 16); + skb_reserve(nskb, 16 + NET_IP_ALIGN); headlen = nskb->end - nskb->data; if (headlen > skb_headlen(skb)) headlen = skb_headlen(skb); @@ -217,7 +218,7 @@ static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) copy = len >= PAGE_SIZE ? PAGE_SIZE : len; zero = len >= PAGE_SIZE ? 
0 : __GFP_ZERO; - page = alloc_page(GFP_ATOMIC | zero); + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero); if (unlikely(!page)) goto err_free; @@ -263,6 +264,13 @@ static inline int netbk_queue_full(netif_t *netif) ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed); } +static void tx_queue_callback(unsigned long data) +{ + netif_t *netif = (netif_t *)data; + if (netif_schedulable(netif->dev)) + netif_wake_queue(netif->dev); +} + int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) { netif_t *netif = netdev_priv(dev); @@ -270,20 +278,13 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) BUG_ON(skb->dev != dev); /* Drop the packet if the target domain has no receive buffers. */ - if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) + if (unlikely(!netif_schedulable(dev) || netbk_queue_full(netif))) goto drop; - if (unlikely(netbk_queue_full(netif))) { - /* Not a BUG_ON() -- misbehaving netfront can trigger this. */ - if (netbk_can_queue(dev)) - DPRINTK("Queue full but not stopped!\n"); - goto drop; - } - - /* Copy the packet here if it's destined for a flipping - interface but isn't flippable (e.g. extra references to - data) - */ + /* + * Copy the packet here if it's destined for a flipping interface + * but isn't flippable (e.g. extra references to data). + */ if (!netif->copying_receiver && !is_flippable_skb(skb)) { struct sk_buff *nskb = netbk_copy_skb(skb); if ( unlikely(nskb == NULL) ) @@ -304,8 +305,19 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) netif->rx.sring->req_event = netif->rx_req_cons_peek + netbk_max_required_rx_slots(netif); mb(); /* request notification /then/ check & stop the queue */ - if (netbk_queue_full(netif)) + if (netbk_queue_full(netif)) { netif_stop_queue(dev); + /* + * Schedule 500ms timeout to restart the queue, thus + * ensuring that an inactive queue will be drained. 
+ * Packets will be immediately be dropped until more + * receive buffers become available (see + * netbk_queue_full() check above). + */ + netif->tx_queue_timeout.data = (unsigned long)netif; + netif->tx_queue_timeout.function = tx_queue_callback; + __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2); + } } skb_queue_tail(&rx_queue, skb); @@ -373,14 +385,22 @@ static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta, flipped. */ meta->copy = 1; copy_gop = npo->copy + npo->copy_prod++; - copy_gop->source.domid = DOMID_SELF; + copy_gop->flags = GNTCOPY_dest_gref; + if (PageForeign(page)) { + struct pending_tx_info *src_pend = + &pending_tx_info[page->index]; + copy_gop->source.domid = src_pend->netif->domid; + copy_gop->source.u.ref = src_pend->req.gref; + copy_gop->flags |= GNTCOPY_source_gref; + } else { + copy_gop->source.domid = DOMID_SELF; + copy_gop->source.u.gmfn = old_mfn; + } copy_gop->source.offset = offset; - copy_gop->source.u.gmfn = old_mfn; copy_gop->dest.domid = netif->domid; copy_gop->dest.offset = 0; copy_gop->dest.u.ref = req->gref; copy_gop->len = size; - copy_gop->flags = GNTCOPY_dest_gref; } else { meta->copy = 0; if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -474,7 +494,7 @@ static int netbk_check_gop(int nr_frags, domid_t domid, copy_op = npo->copy + npo->copy_cons++; if (copy_op->status != GNTST_okay) { DPRINTK("Bad status %d from copy to DOM%d.\n", - gop->status, domid); + copy_op->status, domid); status = NETIF_RSP_ERROR; } } else { @@ -697,6 +717,7 @@ static void net_rx_action(unsigned long unused) } if (netif_queue_stopped(netif->dev) && + netif_schedulable(netif->dev) && !netbk_queue_full(netif)) netif_wake_queue(netif->dev); @@ -754,8 +775,7 @@ static void add_to_net_schedule_list_tail(netif_t *netif) spin_lock_irq(&net_schedule_list_lock); if (!__on_net_schedule_list(netif) && - likely(netif_running(netif->dev) && - netif_carrier_ok(netif->dev))) { + likely(netif_schedulable(netif->dev))) { 
list_add_tail(&netif->list, &net_schedule_list); netif_get(netif); } @@ -792,10 +812,30 @@ void netif_deschedule_work(netif_t *netif) } +static void tx_add_credit(netif_t *netif) +{ + unsigned long max_burst, max_credit; + + /* + * Allow a burst big enough to transmit a jumbo packet of up to 128kB. + * Otherwise the interface can seize up due to insufficient credit. + */ + max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size; + max_burst = min(max_burst, 131072UL); + max_burst = max(max_burst, netif->credit_bytes); + + /* Take care that adding a new chunk of credit doesn't wrap to zero. */ + max_credit = netif->remaining_credit + netif->credit_bytes; + if (max_credit < netif->remaining_credit) + max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */ + + netif->remaining_credit = min(max_credit, max_burst); +} + static void tx_credit_callback(unsigned long data) { netif_t *netif = (netif_t *)data; - netif->remaining_credit = netif->credit_bytes; + tx_add_credit(netif); netif_schedule_work(netif); } @@ -819,7 +859,7 @@ inline static void net_tx_action_dealloc(void) gop = tx_unmap_ops; while (dc != dp) { pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; - gnttab_set_unmap_op(gop, MMAP_VADDR(pending_idx), + gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx), GNTMAP_host_map, grant_tx_handle[pending_idx]); gop++; @@ -857,20 +897,28 @@ static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end) netif_put(netif); } -static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp, - int work_to_do) +static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first, + netif_tx_request_t *txp, int work_to_do) { - netif_tx_request_t *first = txp; RING_IDX cons = netif->tx.req_cons; int frags = 0; - while (txp->flags & NETTXF_more_data) { + if (!(first->flags & NETTXF_more_data)) + return 0; + + do { if (frags >= work_to_do) { DPRINTK("Need more frags\n"); return -frags; } - txp = RING_GET_REQUEST(&netif->tx, cons + frags); + 
if (unlikely(frags >= MAX_SKB_FRAGS)) { + DPRINTK("Too many frags\n"); + return -frags; + } + + memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags), + sizeof(*txp)); if (txp->size > first->size) { DPRINTK("Frags galore\n"); return -frags; @@ -884,30 +932,28 @@ static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp, txp->offset, txp->size); return -frags; } - } + } while ((txp++)->flags & NETTXF_more_data); return frags; } static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif, struct sk_buff *skb, + netif_tx_request_t *txp, gnttab_map_grant_ref_t *mop) { struct skb_shared_info *shinfo = skb_shinfo(skb); skb_frag_t *frags = shinfo->frags; - netif_tx_request_t *txp; unsigned long pending_idx = *((u16 *)skb->data); - RING_IDX cons = netif->tx.req_cons; int i, start; /* Skip first skb fragment if it is on same page as header fragment. */ start = ((unsigned long)shinfo->frags[0].page == pending_idx); - for (i = start; i < shinfo->nr_frags; i++) { - txp = RING_GET_REQUEST(&netif->tx, cons++); + for (i = start; i < shinfo->nr_frags; i++, txp++) { pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)]; - gnttab_set_map_op(mop++, MMAP_VADDR(pending_idx), + gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx), GNTMAP_host_map | GNTMAP_readonly, txp->gref, netif->domid); @@ -940,7 +986,7 @@ static int netbk_tx_check_mop(struct sk_buff *skb, netif_put(netif); } else { set_phys_to_machine( - __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT, + __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT, FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT)); grant_tx_handle[pending_idx] = mop->handle; } @@ -957,7 +1003,7 @@ static int netbk_tx_check_mop(struct sk_buff *skb, newerr = (++mop)->status; if (likely(!newerr)) { set_phys_to_machine( - __pa(MMAP_VADDR(pending_idx))>>PAGE_SHIFT, + __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT, FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT)); grant_tx_handle[pending_idx] = mop->handle; /* Had a previous error? Invalidate this fragment. 
*/ @@ -1005,7 +1051,7 @@ static void netbk_fill_frags(struct sk_buff *skb) pending_idx = (unsigned long)frag->page; txp = &pending_tx_info[pending_idx].req; - frag->page = virt_to_page(MMAP_VADDR(pending_idx)); + frag->page = virt_to_page(idx_to_kaddr(pending_idx)); frag->size = txp->size; frag->page_offset = txp->offset; @@ -1018,7 +1064,7 @@ static void netbk_fill_frags(struct sk_buff *skb) int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras, int work_to_do) { - struct netif_extra_info *extra; + struct netif_extra_info extra; RING_IDX cons = netif->tx.req_cons; do { @@ -1027,18 +1073,18 @@ int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras, return -EBADR; } - extra = (struct netif_extra_info *) - RING_GET_REQUEST(&netif->tx, cons); - if (unlikely(!extra->type || - extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { + memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons), + sizeof(extra)); + if (unlikely(!extra.type || + extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) { netif->tx.req_cons = ++cons; - DPRINTK("Invalid extra type: %d\n", extra->type); + DPRINTK("Invalid extra type: %d\n", extra.type); return -EINVAL; } - memcpy(&extras[extra->type - 1], extra, sizeof(*extra)); + memcpy(&extras[extra.type - 1], &extra, sizeof(extra)); netif->tx.req_cons = ++cons; - } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); + } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE); return work_to_do; } @@ -1073,6 +1119,7 @@ static void net_tx_action(unsigned long unused) struct sk_buff *skb; netif_t *netif; netif_tx_request_t txreq; + netif_tx_request_t txfrags[MAX_SKB_FRAGS]; struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; u16 pending_idx; RING_IDX i; @@ -1101,6 +1148,7 @@ static void net_tx_action(unsigned long unused) i = netif->tx.req_cons; rmb(); /* Ensure that we see the request before we copy it. */ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq)); + /* Credit-based scheduling. 
*/ if (txreq.size > netif->remaining_credit) { unsigned long now = jiffies; @@ -1109,25 +1157,27 @@ static void net_tx_action(unsigned long unused) msecs_to_jiffies(netif->credit_usec / 1000); /* Timer could already be pending in rare cases. */ - if (timer_pending(&netif->credit_timeout)) - break; + if (timer_pending(&netif->credit_timeout)) { + netif_put(netif); + continue; + } /* Passed the point where we can replenish credit? */ if (time_after_eq(now, next_credit)) { netif->credit_timeout.expires = now; - netif->remaining_credit = netif->credit_bytes; + tx_add_credit(netif); } /* Still too big to send right now? Set a callback. */ if (txreq.size > netif->remaining_credit) { - netif->remaining_credit = 0; netif->credit_timeout.data = (unsigned long)netif; netif->credit_timeout.function = tx_credit_callback; __mod_timer(&netif->credit_timeout, next_credit); - break; + netif_put(netif); + continue; } } netif->remaining_credit -= txreq.size; @@ -1146,19 +1196,13 @@ static void net_tx_action(unsigned long unused) } } - ret = netbk_count_requests(netif, &txreq, work_to_do); + ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do); if (unlikely(ret < 0)) { netbk_tx_err(netif, &txreq, i - ret); continue; } i += ret; - if (unlikely(ret > MAX_SKB_FRAGS)) { - DPRINTK("Too many frags\n"); - netbk_tx_err(netif, &txreq, i); - continue; - } - if (unlikely(txreq.size < ETH_HLEN)) { DPRINTK("Bad packet size: %d\n", txreq.size); netbk_tx_err(netif, &txreq, i); @@ -1180,7 +1224,7 @@ static void net_tx_action(unsigned long unused) ret < MAX_SKB_FRAGS) ? PKT_PROT_LEN : txreq.size; - skb = alloc_skb(data_len+16, GFP_ATOMIC); + skb = alloc_skb(data_len + 16 + NET_IP_ALIGN, GFP_ATOMIC); if (unlikely(skb == NULL)) { DPRINTK("Can't allocate a skb in start_xmit.\n"); netbk_tx_err(netif, &txreq, i); @@ -1188,7 +1232,7 @@ static void net_tx_action(unsigned long unused) } /* Packets passed to netif_rx() must have some headroom. 
*/ - skb_reserve(skb, 16); + skb_reserve(skb, 16 + NET_IP_ALIGN); if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { struct netif_extra_info *gso; @@ -1201,7 +1245,7 @@ static void net_tx_action(unsigned long unused) } } - gnttab_set_map_op(mop, MMAP_VADDR(pending_idx), + gnttab_set_map_op(mop, idx_to_kaddr(pending_idx), GNTMAP_host_map | GNTMAP_readonly, txreq.gref, netif->domid); mop++; @@ -1227,7 +1271,7 @@ static void net_tx_action(unsigned long unused) pending_cons++; - mop = netbk_get_requests(netif, skb, mop); + mop = netbk_get_requests(netif, skb, txfrags, mop); netif->tx.req_cons = i; netif_schedule_work(netif); @@ -1260,8 +1304,8 @@ static void net_tx_action(unsigned long unused) } data_len = skb->len; - memcpy(skb->data, - (void *)(MMAP_VADDR(pending_idx)|txp->offset), + memcpy(skb->data, + (void *)(idx_to_kaddr(pending_idx)|txp->offset), data_len); if (data_len < txp->size) { /* Append the packet payload as a fragment. */ @@ -1315,18 +1359,10 @@ static void netif_idx_release(u16 pending_idx) static void netif_page_release(struct page *page) { - u16 pending_idx = page - virt_to_page(mmap_vstart); - /* Ready for next use. */ set_page_count(page, 1); - netif_idx_release(pending_idx); -} - -static void netif_rx_page_release(struct page *page) -{ - /* Ready for next use. 
*/ - set_page_count(page, 1); + netif_idx_release(page->index); } irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) @@ -1336,7 +1372,7 @@ irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) add_to_net_schedule_list_tail(netif); maybe_schedule_tx_action(); - if (netif_queue_stopped(netif->dev) && !netbk_queue_full(netif)) + if (netif_schedulable(netif->dev) && !netbk_queue_full(netif)) netif_wake_queue(netif->dev); return IRQ_HANDLED; @@ -1446,27 +1482,17 @@ static int __init netback_init(void) init_timer(&net_timer); net_timer.data = 0; net_timer.function = net_alarm; - - page = balloon_alloc_empty_page_range(MAX_PENDING_REQS); - if (page == NULL) - return -ENOMEM; - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS); + if (mmap_pages == NULL) { + printk("%s: out of memory\n", __FUNCTION__); + return -ENOMEM; + } for (i = 0; i < MAX_PENDING_REQS; i++) { - page = virt_to_page(MMAP_VADDR(i)); - set_page_count(page, 1); + page = mmap_pages[i]; SetPageForeign(page, netif_page_release); - } - - page = balloon_alloc_empty_page_range(NET_RX_RING_SIZE); - BUG_ON(page == NULL); - rx_mmap_area = pfn_to_kaddr(page_to_pfn(page)); - - for (i = 0; i < NET_RX_RING_SIZE; i++) { - page = virt_to_page(rx_mmap_area + (i * PAGE_SIZE)); - set_page_count(page, 1); - SetPageForeign(page, netif_rx_page_release); + page->index = i; } pending_cons = 0; diff --git a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c index 6da614fc0c..7d301965f4 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c +++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c @@ -28,29 +28,20 @@ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) #endif -struct backend_info -{ +struct backend_info { struct xenbus_device *dev; netif_t *netif; - struct xenbus_watch backend_watch; enum xenbus_state frontend_state; }; static 
int connect_rings(struct backend_info *); static void connect(struct backend_info *); -static void maybe_connect(struct backend_info *); -static void backend_changed(struct xenbus_watch *, const char **, - unsigned int); +static void backend_create_netif(struct backend_info *be); static int netback_remove(struct xenbus_device *dev) { struct backend_info *be = dev->dev.driver_data; - if (be->backend_watch.node) { - unregister_xenbus_watch(&be->backend_watch); - kfree(be->backend_watch.node); - be->backend_watch.node = NULL; - } if (be->netif) { netif_disconnect(be->netif); be->netif = NULL; @@ -63,8 +54,7 @@ static int netback_remove(struct xenbus_device *dev) /** * Entry point to this code when a new device is created. Allocate the basic - * structures, and watch the store waiting for the hotplug scripts to tell us - * the device's handle. Switch to InitWait. + * structures and switch to InitWait. */ static int netback_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) @@ -83,11 +73,6 @@ static int netback_probe(struct xenbus_device *dev, be->dev = dev; dev->dev.driver_data = be; - err = xenbus_watch_path2(dev, dev->nodename, "handle", - &be->backend_watch, backend_changed); - if (err) - goto fail; - do { err = xenbus_transaction_start(&xbt); if (err) { @@ -108,9 +93,22 @@ static int netback_probe(struct xenbus_device *dev, goto abort_transaction; } - err = xenbus_printf(xbt, dev->nodename, "feature-rx-copy", "%d", 1); + /* We support rx-copy path. */ + err = xenbus_printf(xbt, dev->nodename, + "feature-rx-copy", "%d", 1); + if (err) { + message = "writing feature-rx-copy"; + goto abort_transaction; + } + + /* + * We don't support rx-flip path (except old guests who don't + * grok this feature flag). 
+ */ + err = xenbus_printf(xbt, dev->nodename, + "feature-rx-flip", "%d", 0); if (err) { - message = "writing feature-copying"; + message = "writing feature-rx-flip"; goto abort_transaction; } @@ -123,9 +121,11 @@ static int netback_probe(struct xenbus_device *dev, } err = xenbus_switch_state(dev, XenbusStateInitWait); - if (err) { + if (err) goto fail; - } + + /* This kicks hotplug scripts, so do it immediately. */ + backend_create_netif(be); return 0; @@ -175,48 +175,30 @@ static int netback_uevent(struct xenbus_device *xdev, char **envp, } -/** - * Callback received when the hotplug scripts have placed the handle node. - * Read it, and create a netif structure. If the frontend is ready, connect. - */ -static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) +static void backend_create_netif(struct backend_info *be) { int err; long handle; - struct backend_info *be - = container_of(watch, struct backend_info, backend_watch); struct xenbus_device *dev = be->dev; - DPRINTK(""); + if (be->netif != NULL) + return; err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle); - if (XENBUS_EXIST_ERR(err)) { - /* Since this watch will fire once immediately after it is - registered, we expect this. Ignore it, and wait for the - hotplug scripts. 
*/ - return; - } if (err != 1) { xenbus_dev_fatal(dev, err, "reading handle"); return; } - if (be->netif == NULL) { - u8 be_mac[ETH_ALEN] = { 0, 0, 0, 0, 0, 0 }; - - be->netif = netif_alloc(dev->otherend_id, handle, be_mac); - if (IS_ERR(be->netif)) { - err = PTR_ERR(be->netif); - be->netif = NULL; - xenbus_dev_fatal(dev, err, "creating interface"); - return; - } - - kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); - - maybe_connect(be); + be->netif = netif_alloc(dev->otherend_id, handle); + if (IS_ERR(be->netif)) { + err = PTR_ERR(be->netif); + be->netif = NULL; + xenbus_dev_fatal(dev, err, "creating interface"); + return; } + + kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); } @@ -249,11 +231,9 @@ static void frontend_changed(struct xenbus_device *dev, break; case XenbusStateConnected: - if (!be->netif) { - /* reconnect: setup be->netif */ - backend_changed(&be->backend_watch, NULL, 0); - } - maybe_connect(be); + backend_create_netif(be); + if (be->netif) + connect(be); break; case XenbusStateClosing: @@ -279,15 +259,6 @@ static void frontend_changed(struct xenbus_device *dev, } -/* ** Connection ** */ - - -static void maybe_connect(struct backend_info *be) -{ - if (be->netif && (be->frontend_state == XenbusStateConnected)) - connect(be); -} - static void xen_net_read_rate(struct xenbus_device *dev, unsigned long *bytes, unsigned long *usec) { @@ -366,6 +337,10 @@ static void connect(struct backend_info *be) be->netif->remaining_credit = be->netif->credit_bytes; xenbus_switch_state(dev, XenbusStateConnected); + + /* May not get a kick from the frontend, so start the tx_queue now. */ + if (!netbk_can_queue(be->netif->dev)) + netif_wake_queue(be->netif->dev); } @@ -403,14 +378,16 @@ static int connect_rings(struct backend_info *be) } be->netif->copying_receiver = !!rx_copy; - if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-rx-notify", "%d", - &val) < 0) - val = 0; - if (val) - be->netif->can_queue = 1; - else - /* Must be non-zero for pfifo_fast to work. 
*/ - be->netif->dev->tx_queue_len = 1; + if (be->netif->dev->tx_queue_len != 0) { + if (xenbus_scanf(XBT_NIL, dev->otherend, + "feature-rx-notify", "%d", &val) < 0) + val = 0; + if (val) + be->netif->can_queue = 1; + else + /* Must be non-zero for pfifo_fast to work. */ + be->netif->dev->tx_queue_len = 1; + } if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0) val = 0; diff --git a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c index a257cb6064..da22d45bf6 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c @@ -47,6 +47,7 @@ #include <linux/in.h> #include <linux/if_ether.h> #include <linux/io.h> +#include <linux/moduleparam.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/arp.h> @@ -63,20 +64,76 @@ #include <xen/interface/grant_table.h> #include <xen/gnttab.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +/* + * Mutually-exclusive module options to select receive data path: + * rx_copy : Packets are copied by network backend into local memory + * rx_flip : Page containing packet data is transferred to our ownership + * For fully-virtualised guests there is no option - copying must be used. + * For paravirtualised guests, flipping is the default. + */ +#ifdef CONFIG_XEN +static int MODPARM_rx_copy = 0; +module_param_named(rx_copy, MODPARM_rx_copy, bool, 0); +MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)"); +static int MODPARM_rx_flip = 0; +module_param_named(rx_flip, MODPARM_rx_flip, bool, 0); +MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)"); +#else +static const int MODPARM_rx_copy = 1; +static const int MODPARM_rx_flip = 0; +#endif + #define RX_COPY_THRESHOLD 256 /* If we don't have GSO, fake things up so that we never try to use it. 
*/ -#ifndef NETIF_F_GSO -#define netif_needs_gso(dev, skb) 0 -#define dev_disable_gso_features(dev) ((void)0) -#else +#if defined(NETIF_F_GSO) #define HAVE_GSO 1 +#define HAVE_TSO 1 /* TSO is a subset of GSO */ static inline void dev_disable_gso_features(struct net_device *dev) { /* Turn off all GSO bits except ROBUST. */ dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1; dev->features |= NETIF_F_GSO_ROBUST; } +#elif defined(NETIF_F_TSO) +#define HAVE_TSO 1 + +/* Some older kernels cannot cope with incorrect checksums, + * particularly in netfilter. I'm not sure there is 100% correlation + * with the presence of NETIF_F_TSO but it appears to be a good first + * approximiation. + */ +#define HAVE_NO_CSUM_OFFLOAD 1 + +#define gso_size tso_size +#define gso_segs tso_segs +static inline void dev_disable_gso_features(struct net_device *dev) +{ + /* Turn off all TSO bits. */ + dev->features &= ~NETIF_F_TSO; +} +static inline int skb_is_gso(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->tso_size; +} +static inline int skb_gso_ok(struct sk_buff *skb, int features) +{ + return (features & NETIF_F_TSO); +} + +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) +{ + return skb_is_gso(skb) && + (!skb_gso_ok(skb, dev->features) || + unlikely(skb->ip_summed != CHECKSUM_HW)); +} +#else +#define netif_needs_gso(dev, skb) 0 +#define dev_disable_gso_features(dev) ((void)0) #endif #define GRANT_INVALID_REF 0 @@ -96,7 +153,6 @@ struct netfront_info { spinlock_t tx_lock; spinlock_t rx_lock; - unsigned int handle; unsigned int evtchn, irq; unsigned int copying_receiver; @@ -120,7 +176,7 @@ struct netfront_info { grant_ref_t gref_tx_head; grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1]; grant_ref_t gref_rx_head; - grant_ref_t grant_rx_ref[NET_TX_RING_SIZE]; + grant_ref_t grant_rx_ref[NET_RX_RING_SIZE]; struct xenbus_device *xbdev; int tx_ring_ref; @@ -185,9 +241,8 @@ static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np, #define WPRINTK(fmt, 
args...) \ printk(KERN_WARNING "netfront: " fmt, ##args) -static int talk_to_backend(struct xenbus_device *, struct netfront_info *); static int setup_device(struct xenbus_device *, struct netfront_info *); -static struct net_device *create_netdev(int, int, struct xenbus_device *); +static struct net_device *create_netdev(struct xenbus_device *); static void netfront_closing(struct xenbus_device *); @@ -195,9 +250,8 @@ static void end_access(int, void *); static void netif_disconnect_backend(struct netfront_info *); static int open_netdev(struct netfront_info *); static void close_netdev(struct netfront_info *); -static void netif_free(struct netfront_info *); -static void network_connect(struct net_device *); +static int network_connect(struct net_device *); static void network_tx_buf_gc(struct net_device *); static void network_alloc_rx_buffers(struct net_device *); static int send_fake_arp(struct net_device *); @@ -220,8 +274,7 @@ static inline int xennet_can_sg(struct net_device *dev) /** * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffers for communication with the backend, and - * inform the backend of the appropriate details for those. Switch to - * Connected state. + * inform the backend of the appropriate details for those. 
*/ static int __devinit netfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) @@ -229,31 +282,8 @@ static int __devinit netfront_probe(struct xenbus_device *dev, int err; struct net_device *netdev; struct netfront_info *info; - unsigned int handle; - unsigned feature_rx_copy; - - err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%u", &handle); - if (err != 1) { - xenbus_dev_fatal(dev, err, "reading handle"); - return err; - } - -#ifndef CONFIG_XEN - err = xenbus_scanf(XBT_NIL, dev->otherend, "feature-rx-copy", "%u", - &feature_rx_copy); - if (err != 1) { - xenbus_dev_fatal(dev, err, "reading feature-rx-copy"); - return err; - } - if (!feature_rx_copy) { - xenbus_dev_fatal(dev, 0, "need a copy-capable backend"); - return -EINVAL; - } -#else - feature_rx_copy = 0; -#endif - netdev = create_netdev(handle, feature_rx_copy, dev); + netdev = create_netdev(dev); if (IS_ERR(netdev)) { err = PTR_ERR(netdev); xenbus_dev_fatal(dev, err, "creating netdev"); @@ -263,20 +293,13 @@ static int __devinit netfront_probe(struct xenbus_device *dev, info = netdev_priv(netdev); dev->dev.driver_data = info; - err = talk_to_backend(dev, info); - if (err) - goto fail_backend; - err = open_netdev(info); if (err) - goto fail_open; + goto fail; return 0; - fail_open: - xennet_sysfs_delif(info->netdev); - unregister_netdev(netdev); - fail_backend: + fail: free_netdev(netdev); dev->dev.driver_data = NULL; return err; @@ -296,7 +319,7 @@ static int netfront_resume(struct xenbus_device *dev) DPRINTK("%s\n", dev->nodename); netif_disconnect_backend(info); - return talk_to_backend(dev, info); + return 0; } static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) @@ -379,13 +402,21 @@ again: goto abort_transaction; } +#ifdef HAVE_NO_CSUM_OFFLOAD + err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload", "%d", 1); + if (err) { + message = "writing feature-no-csum-offload"; + goto abort_transaction; + } +#endif + err = xenbus_printf(xbt, dev->nodename, 
"feature-sg", "%d", 1); if (err) { message = "writing feature-sg"; goto abort_transaction; } -#ifdef HAVE_GSO +#ifdef HAVE_TSO err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1); if (err) { message = "writing feature-gso-tcpv4"; @@ -407,12 +438,11 @@ again: xenbus_transaction_end(xbt, 1); xenbus_dev_fatal(dev, err, "%s", message); destroy_ring: - netif_free(info); + netif_disconnect_backend(info); out: return err; } - static int setup_device(struct xenbus_device *dev, struct netfront_info *info) { struct netif_tx_sring *txs; @@ -472,11 +502,9 @@ static int setup_device(struct xenbus_device *dev, struct netfront_info *info) return 0; fail: - netif_free(info); return err; } - /** * Callback received when the backend's state changes. */ @@ -497,7 +525,8 @@ static void backend_changed(struct xenbus_device *dev, break; case XenbusStateInitWait: - network_connect(netdev); + if (network_connect(netdev) != 0) + break; xenbus_switch_state(dev, XenbusStateConnected); (void)send_fake_arp(netdev); break; @@ -508,7 +537,6 @@ static void backend_changed(struct xenbus_device *dev, } } - /** Send a packet on a net device to encourage switches to learn the * MAC. We send a fake ARP request. * @@ -537,7 +565,6 @@ static int send_fake_arp(struct net_device *dev) return dev_queue_xmit(skb); } - static int network_open(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); @@ -629,14 +656,12 @@ static void network_tx_buf_gc(struct net_device *dev) network_maybe_wake_tx(dev); } - static void rx_refill_timeout(unsigned long data) { struct net_device *dev = (struct net_device *)data; netif_rx_schedule(dev); } - static void network_alloc_rx_buffers(struct net_device *dev) { unsigned short id; @@ -669,7 +694,7 @@ static void network_alloc_rx_buffers(struct net_device *dev) * necessary here. * 16 bytes added as necessary headroom for netif_receive_skb. 
*/ - skb = alloc_skb(RX_COPY_THRESHOLD + 16, + skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) goto no_skb; @@ -687,7 +712,7 @@ no_skb: break; } - skb_reserve(skb, 16); /* mimic dev_alloc_skb() */ + skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */ skb_shinfo(skb)->frags[0].page = page; skb_shinfo(skb)->nr_frags = 1; __skb_queue_tail(&np->rx_batch, skb); @@ -742,7 +767,7 @@ no_skb: } else { gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, - pfn, + pfn_to_mfn(pfn), 0); } @@ -917,7 +942,7 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) tx->flags |= NETTXF_data_validated; #endif -#ifdef HAVE_GSO +#ifdef HAVE_TSO if (skb_shinfo(skb)->gso_size) { struct netif_extra_info *gso = (struct netif_extra_info *) RING_GET_REQUEST(&np->tx, ++i); @@ -1071,6 +1096,7 @@ static int xennet_get_responses(struct netfront_info *np, if (net_ratelimit()) WPRINTK("rx->offset: %x, size: %u\n", rx->offset, rx->status); + xennet_move_rx_slot(np, skb, ref); err = -EINVAL; goto next; } @@ -1081,7 +1107,8 @@ static int xennet_get_responses(struct netfront_info *np, * situation to the system controller to reboot the backed. */ if (ref == GRANT_INVALID_REF) { - WPRINTK("Bad rx response id %d.\n", rx->id); + if (net_ratelimit()) + WPRINTK("Bad rx response id %d.\n", rx->id); err = -EINVAL; goto next; } @@ -1153,6 +1180,9 @@ next: err = -E2BIG; } + if (unlikely(err)) + np->rx.rsp_cons = cons + frags; + *pages_flipped_p = pages_flipped; return err; @@ -1205,12 +1235,14 @@ static int xennet_set_skb_gso(struct sk_buff *skb, return -EINVAL; } -#ifdef HAVE_GSO +#ifdef HAVE_TSO skb_shinfo(skb)->gso_size = gso->u.gso.size; +#ifdef HAVE_GSO skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; /* Header must be checked, and gso_segs computed. 
*/ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; +#endif skb_shinfo(skb)->gso_segs = 0; return 0; @@ -1255,9 +1287,9 @@ static int netif_poll(struct net_device *dev, int *pbudget) rp = np->rx.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for (i = np->rx.rsp_cons, work_done = 0; - (i != rp) && (work_done < budget); - np->rx.rsp_cons = ++i, work_done++) { + i = np->rx.rsp_cons; + work_done = 0; + while ((i != rp) && (work_done < budget)) { memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx)); memset(extras, 0, sizeof(extras)); @@ -1265,12 +1297,11 @@ static int netif_poll(struct net_device *dev, int *pbudget) &pages_flipped); if (unlikely(err)) { -err: - i = np->rx.rsp_cons + skb_queue_len(&tmpq) - 1; - work_done--; +err: while ((skb = __skb_dequeue(&tmpq))) __skb_queue_tail(&errq, skb); np->stats.rx_errors++; + i = np->rx.rsp_cons; continue; } @@ -1282,6 +1313,7 @@ err: if (unlikely(xennet_set_skb_gso(skb, gso))) { __skb_queue_head(&tmpq, skb); + np->rx.rsp_cons += skb_queue_len(&tmpq); goto err; } } @@ -1345,6 +1377,9 @@ err: np->stats.rx_bytes += skb->len; __skb_queue_tail(&rxq, skb); + + np->rx.rsp_cons = ++i; + work_done++; } if (pages_flipped) { @@ -1561,7 +1596,7 @@ static int xennet_set_sg(struct net_device *dev, u32 data) static int xennet_set_tso(struct net_device *dev, u32 data) { -#ifdef HAVE_GSO +#ifdef HAVE_TSO if (data) { struct netfront_info *np = netdev_priv(dev); int val; @@ -1588,20 +1623,53 @@ static void xennet_set_features(struct net_device *dev) if (!(dev->features & NETIF_F_IP_CSUM)) return; - if (!xennet_set_sg(dev, 1)) - xennet_set_tso(dev, 1); + if (xennet_set_sg(dev, 1)) + return; + + /* Before 2.6.9 TSO seems to be unreliable so do not enable it + * on older kernels. 
+ */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) + xennet_set_tso(dev, 1); +#endif + } -static void network_connect(struct net_device *dev) +static int network_connect(struct net_device *dev) { struct netfront_info *np = netdev_priv(dev); - int i, requeue_idx; + int i, requeue_idx, err; struct sk_buff *skb; grant_ref_t ref; netif_rx_request_t *req; + unsigned int feature_rx_copy, feature_rx_flip; + + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-rx-copy", "%u", &feature_rx_copy); + if (err != 1) + feature_rx_copy = 0; + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-rx-flip", "%u", &feature_rx_flip); + if (err != 1) + feature_rx_flip = 1; + + /* + * Copy packets on receive path if: + * (a) This was requested by user, and the backend supports it; or + * (b) Flipping was requested, but this is unsupported by the backend. + */ + np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) || + (MODPARM_rx_flip && !feature_rx_flip)); + + err = talk_to_backend(np->xbdev, np); + if (err) + return err; xennet_set_features(dev); + IPRINTK("device %s has %sing receive path.\n", + dev->name, np->copying_receiver ? 
"copy" : "flipp"); + spin_lock_irq(&np->tx_lock); spin_lock(&np->rx_lock); @@ -1632,7 +1700,8 @@ static void network_connect(struct net_device *dev) } else { gnttab_grant_foreign_access_ref( ref, np->xbdev->otherend_id, - page_to_pfn(skb_shinfo(skb)->frags->page), + pfn_to_mfn(page_to_pfn(skb_shinfo(skb)-> + frags->page)), 0); } req->gref = ref; @@ -1656,6 +1725,8 @@ static void network_connect(struct net_device *dev) spin_unlock(&np->rx_lock); spin_unlock_irq(&np->tx_lock); + + return 0; } static void netif_uninit(struct net_device *dev) @@ -1821,8 +1892,7 @@ static void network_set_multicast_list(struct net_device *dev) { } -static struct net_device * __devinit -create_netdev(int handle, int copying_receiver, struct xenbus_device *dev) +static struct net_device * __devinit create_netdev(struct xenbus_device *dev) { int i, err = 0; struct net_device *netdev = NULL; @@ -1836,9 +1906,7 @@ create_netdev(int handle, int copying_receiver, struct xenbus_device *dev) } np = netdev_priv(netdev); - np->handle = handle; np->xbdev = dev; - np->copying_receiver = copying_receiver; netif_carrier_off(netdev); @@ -1969,10 +2037,12 @@ static int open_netdev(struct netfront_info *info) err = xennet_sysfs_addif(info->netdev); if (err) { - /* This can be non-fatal: it only means no tuning parameters */ + unregister_netdev(info->netdev); printk(KERN_WARNING "%s: add sysfs failed err=%d\n", __FUNCTION__, err); + return err; } + return 0; } @@ -2007,14 +2077,6 @@ static void netif_disconnect_backend(struct netfront_info *info) } -static void netif_free(struct netfront_info *info) -{ - close_netdev(info); - netif_disconnect_backend(info); - free_netdev(info->netdev); -} - - static void end_access(int ref, void *page) { if (ref != GRANT_INVALID_REF) @@ -2053,6 +2115,16 @@ static int __init netif_init(void) if (!is_running_on_xen()) return -ENODEV; +#ifdef CONFIG_XEN + if (MODPARM_rx_flip && MODPARM_rx_copy) { + WPRINTK("Cannot specify both rx_copy and rx_flip.\n"); + return -EINVAL; + } 
+ + if (!MODPARM_rx_flip && !MODPARM_rx_copy) + MODPARM_rx_flip = 1; /* Default is to flip. */ +#endif + if (is_initial_xendomain()) return 0; @@ -2067,6 +2139,9 @@ module_init(netif_init); static void __exit netif_exit(void) { + if (is_initial_xendomain()) + return; + unregister_inetaddr_notifier(&notifier_inetdev); return xenbus_unregister_driver(&netfront); diff --git a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c index a1c4b6f68e..d159e4ac74 100644 --- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c +++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c @@ -35,6 +35,10 @@ static struct proc_dir_entry *privcmd_intf; static struct proc_dir_entry *capabilities_intf; +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); +#endif + static int privcmd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long data) { @@ -49,6 +53,8 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, return -EFAULT; #if defined(__i386__) + if (hypercall.op >= (PAGE_SIZE >> 5)) + break; __asm__ __volatile__ ( "pushl %%ebx; pushl %%ecx; pushl %%edx; " "pushl %%esi; pushl %%edi; " @@ -65,45 +71,36 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, "popl %%ecx; popl %%ebx" : "=a" (ret) : "0" (&hypercall) : "memory" ); #elif defined (__x86_64__) - { + if (hypercall.op < (PAGE_SIZE >> 5)) { long ign1, ign2, ign3; __asm__ __volatile__ ( "movq %8,%%r10; movq %9,%%r8;" - "shlq $5,%%rax ;" + "shll $5,%%eax ;" "addq $hypercall_page,%%rax ;" "call *%%rax" : "=a" (ret), "=D" (ign1), "=S" (ign2), "=d" (ign3) - : "0" ((unsigned long)hypercall.op), - "1" ((unsigned long)hypercall.arg[0]), - "2" ((unsigned long)hypercall.arg[1]), - "3" ((unsigned long)hypercall.arg[2]), - "g" ((unsigned long)hypercall.arg[3]), - "g" ((unsigned long)hypercall.arg[4]) + : "0" ((unsigned int)hypercall.op), + "1" (hypercall.arg[0]), + "2" 
(hypercall.arg[1]), + "3" (hypercall.arg[2]), + "g" (hypercall.arg[3]), + "g" (hypercall.arg[4]) : "r8", "r10", "memory" ); } #elif defined (__ia64__) - __asm__ __volatile__ ( - ";; mov r14=%2; mov r15=%3; " - "mov r16=%4; mov r17=%5; mov r18=%6;" - "mov r2=%1; break 0x1000;; mov %0=r8 ;;" - : "=r" (ret) - : "r" (hypercall.op), - "r" (hypercall.arg[0]), - "r" (hypercall.arg[1]), - "r" (hypercall.arg[2]), - "r" (hypercall.arg[3]), - "r" (hypercall.arg[4]) - : "r14","r15","r16","r17","r18","r2","r8","memory"); + ret = privcmd_hypercall(&hypercall); #endif } break; case IOCTL_PRIVCMD_MMAP: { -#define PRIVCMD_MMAP_SZ 32 privcmd_mmap_t mmapcmd; - privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ]; + privcmd_mmap_entry_t msg; privcmd_mmap_entry_t __user *p; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long va; int i, rc; if (!is_initial_xendomain()) @@ -113,85 +110,92 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, return -EFAULT; p = mmapcmd.entry; + if (copy_from_user(&msg, p, sizeof(msg))) + return -EFAULT; - for (i = 0; i < mmapcmd.num; - i += PRIVCMD_MMAP_SZ, p += PRIVCMD_MMAP_SZ) { - int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)? 
- PRIVCMD_MMAP_SZ:(mmapcmd.num-i); - - if (copy_from_user(&msg, p, - n*sizeof(privcmd_mmap_entry_t))) - return -EFAULT; - - for (j = 0; j < n; j++) { - struct vm_area_struct *vma = - find_vma( current->mm, msg[j].va ); - - if (!vma) - return -EINVAL; - - if (msg[j].va > PAGE_OFFSET) - return -EINVAL; - - if ((msg[j].va + (msg[j].npages << PAGE_SHIFT)) - > vma->vm_end ) - return -EINVAL; - - if ((rc = direct_remap_pfn_range( - vma, - msg[j].va&PAGE_MASK, - msg[j].mfn, - msg[j].npages<<PAGE_SHIFT, - vma->vm_page_prot, - mmapcmd.dom)) < 0) - return rc; - } + down_read(&mm->mmap_sem); + + vma = find_vma(mm, msg.va); + rc = -EINVAL; + if (!vma || (msg.va != vma->vm_start) || + !privcmd_enforce_singleshot_mapping(vma)) + goto mmap_out; + + va = vma->vm_start; + + for (i = 0; i < mmapcmd.num; i++) { + rc = -EFAULT; + if (copy_from_user(&msg, p, sizeof(msg))) + goto mmap_out; + + /* Do not allow range to wrap the address space. */ + rc = -EINVAL; + if ((msg.npages > (LONG_MAX >> PAGE_SHIFT)) || + ((unsigned long)(msg.npages << PAGE_SHIFT) >= -va)) + goto mmap_out; + + /* Range chunks must be contiguous in va space. 
*/ + if ((msg.va != va) || + ((msg.va+(msg.npages<<PAGE_SHIFT)) > vma->vm_end)) + goto mmap_out; + + if ((rc = direct_remap_pfn_range( + vma, + msg.va & PAGE_MASK, + msg.mfn, + msg.npages << PAGE_SHIFT, + vma->vm_page_prot, + mmapcmd.dom)) < 0) + goto mmap_out; + + p++; + va += msg.npages << PAGE_SHIFT; } - ret = 0; + + rc = 0; + + mmap_out: + up_read(&mm->mmap_sem); + ret = rc; } break; case IOCTL_PRIVCMD_MMAPBATCH: { privcmd_mmapbatch_t m; - struct vm_area_struct *vma = NULL; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; xen_pfn_t __user *p; - unsigned long addr, mfn; + unsigned long addr, mfn, nr_pages; int i; if (!is_initial_xendomain()) return -EPERM; - if (copy_from_user(&m, udata, sizeof(m))) { - ret = -EFAULT; - goto batch_err; - } - - if (m.dom == DOMID_SELF) { - ret = -EINVAL; - goto batch_err; - } + if (copy_from_user(&m, udata, sizeof(m))) + return -EFAULT; - vma = find_vma(current->mm, m.addr); - if (!vma) { - ret = -EINVAL; - goto batch_err; - } + nr_pages = m.num; + if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) + return -EINVAL; - if (m.addr > PAGE_OFFSET) { - ret = -EFAULT; - goto batch_err; - } + down_read(&mm->mmap_sem); - if ((m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end) { - ret = -EFAULT; - goto batch_err; + vma = find_vma(mm, m.addr); + if (!vma || + (m.addr != vma->vm_start) || + ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || + !privcmd_enforce_singleshot_mapping(vma)) { + up_read(&mm->mmap_sem); + return -EINVAL; } p = m.arr; addr = m.addr; - for (i = 0; i < m.num; i++, addr += PAGE_SIZE, p++) { - if (get_user(mfn, p)) + for (i = 0; i < nr_pages; i++, addr += PAGE_SIZE, p++) { + if (get_user(mfn, p)) { + up_read(&mm->mmap_sem); return -EFAULT; + } ret = direct_remap_pfn_range(vma, addr & PAGE_MASK, mfn, PAGE_SIZE, @@ -200,15 +204,8 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, put_user(0xF0000000 | mfn, p); } + up_read(&mm->mmap_sem); ret = 0; - break; - - batch_err: - 
printk("batch_err ret=%d vma=%p addr=%lx " - "num=%d arr=%p %lx-%lx\n", - ret, vma, (unsigned long)m.addr, m.num, m.arr, - vma ? vma->vm_start : 0, vma ? vma->vm_end : 0); - break; } break; @@ -221,13 +218,35 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, } #ifndef HAVE_ARCH_PRIVCMD_MMAP +static struct page *privcmd_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + return NOPAGE_SIGBUS; +} + +static struct vm_operations_struct privcmd_vm_ops = { + .nopage = privcmd_nopage +}; + static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) { + /* Unsupported for auto-translate guests. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + vma->vm_ops = &privcmd_vm_ops; + vma->vm_private_data = NULL; return 0; } + +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +{ + return (xchg(&vma->vm_private_data, (void *)1) == NULL); +} #endif static struct file_operations privcmd_file_ops = { diff --git a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h index 27b8fd283a..b209b4f583 100644 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h @@ -46,11 +46,10 @@ typedef struct tpmif_st { atomic_t refcnt; struct backend_info *bi; - unsigned long mmap_vstart; grant_handle_t shmem_handle; grant_ref_t shmem_ref; - struct page *pagerange; + struct page **mmap_pages; char devname[20]; } tpmif_t; @@ -80,6 +79,9 @@ int vtpm_release_packets(tpmif_t * tpmif, int send_msgs); extern int num_frontends; -#define MMAP_VADDR(t,_req) ((t)->mmap_vstart + ((_req) * PAGE_SIZE)) +static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx) +{ + return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx])); +} #endif /* __TPMIF__BACKEND__COMMON_H__ */ 
diff --git a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c index 0105bd93bf..2614aa5126 100644 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c @@ -25,8 +25,8 @@ static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi) tpmif_t *tpmif; tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL); - if (!tpmif) - return ERR_PTR(-ENOMEM); + if (tpmif == NULL) + goto out_of_memory; memset(tpmif, 0, sizeof (*tpmif)); tpmif->domid = domid; @@ -35,22 +35,27 @@ static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi) snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid); atomic_set(&tpmif->refcnt, 1); - tpmif->pagerange = balloon_alloc_empty_page_range(TPMIF_TX_RING_SIZE); - BUG_ON(tpmif->pagerange == NULL); - tpmif->mmap_vstart = (unsigned long)pfn_to_kaddr( - page_to_pfn(tpmif->pagerange)); + tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE); + if (tpmif->mmap_pages == NULL) + goto out_of_memory; list_add(&tpmif->tpmif_list, &tpmif_list); num_frontends++; return tpmif; + + out_of_memory: + if (tpmif != NULL) + kmem_cache_free(tpmif_cachep, tpmif); + printk("%s: out of memory\n", __FUNCTION__); + return ERR_PTR(-ENOMEM); } static void free_tpmif(tpmif_t * tpmif) { num_frontends--; list_del(&tpmif->tpmif_list); - balloon_dealloc_empty_page_range(tpmif->pagerange, TPMIF_TX_RING_SIZE); + free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE); kmem_cache_free(tpmif_cachep, tpmif); } diff --git a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c index 466c3ee581..701a5ad03e 100644 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c @@ -253,7 +253,7 @@ int _packet_write(struct packet *pak, return 0; } - gnttab_set_map_op(&map_op, MMAP_VADDR(tpmif, i), + 
gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i), GNTMAP_host_map, tx->ref, tpmif->domid); if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, @@ -270,7 +270,7 @@ int _packet_write(struct packet *pak, tocopy = min_t(size_t, size - offset, PAGE_SIZE); - if (copy_from_buffer((void *)(MMAP_VADDR(tpmif, i) | + if (copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) | (tx->addr & ~PAGE_MASK)), &data[offset], tocopy, isuserbuffer)) { tpmif_put(tpmif); @@ -278,7 +278,7 @@ int _packet_write(struct packet *pak, } tx->size = tocopy; - gnttab_set_unmap_op(&unmap_op, MMAP_VADDR(tpmif, i), + gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i), GNTMAP_host_map, handle); if (unlikely @@ -391,7 +391,7 @@ static int packet_read_shmem(struct packet *pak, tx = &tpmif->tx->ring[i].req; - gnttab_set_map_op(&map_op, MMAP_VADDR(tpmif, i), + gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i), GNTMAP_host_map, tx->ref, tpmif->domid); if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, @@ -414,10 +414,10 @@ static int packet_read_shmem(struct packet *pak, } DPRINTK("Copying from mapped memory at %08lx\n", - (unsigned long)(MMAP_VADDR(tpmif, i) | + (unsigned long)(idx_to_kaddr(tpmif, i) | (tx->addr & ~PAGE_MASK))); - src = (void *)(MMAP_VADDR(tpmif, i) | + src = (void *)(idx_to_kaddr(tpmif, i) | ((tx->addr & ~PAGE_MASK) + pg_offset)); if (copy_to_buffer(&buffer[offset], src, to_copy, isuserbuffer)) { @@ -428,7 +428,7 @@ static int packet_read_shmem(struct packet *pak, tpmif->domid, buffer[offset], buffer[offset + 1], buffer[offset + 2], buffer[offset + 3]); - gnttab_set_unmap_op(&unmap_op, MMAP_VADDR(tpmif, i), + gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i), GNTMAP_host_map, handle); if (unlikely diff --git a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c index 4ee5c5bbfe..f48b0e3726 100644 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c @@ 
-157,10 +157,12 @@ static void frontend_changed(struct xenbus_device *dev, case XenbusStateClosing: be->instance = -1; + xenbus_switch_state(dev, XenbusStateClosing); break; - case XenbusStateUnknown: + case XenbusStateUnknown: /* keep it here */ case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); device_unregister(&be->dev->dev); tpmback_remove(dev); break; diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile b/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile index d7c7d05172..ce5acc2457 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile @@ -9,4 +9,5 @@ xenbus-objs += xenbus_client.o xenbus-objs += xenbus_comms.o xenbus-objs += xenbus_xs.o xenbus-objs += xenbus_probe.o +obj-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c index 9b389ec06b..0111e8e3a2 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c @@ -35,8 +35,9 @@ #include <xen/xenbus.h> #include <xen/driver_util.h> -/* xenbus_probe.c */ -extern char *kasprintf(const char *fmt, ...); +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif #define DPRINTK(fmt, args...) 
\ pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) @@ -84,7 +85,7 @@ int xenbus_watch_path2(struct xenbus_device *dev, const char *path, const char **, unsigned int)) { int err; - char *state = kasprintf("%s/%s", path, path2); + char *state = kasprintf(GFP_KERNEL, "%s/%s", path, path2); if (!state) { xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); return -ENOMEM; @@ -152,7 +153,7 @@ EXPORT_SYMBOL_GPL(xenbus_frontend_closed); */ static char *error_path(struct xenbus_device *dev) { - return kasprintf("error/%s", dev->nodename); + return kasprintf(GFP_KERNEL, "error/%s", dev->nodename); } diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c index 38da320b67..f0e42ba715 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c @@ -30,15 +30,22 @@ * IN THE SOFTWARE. */ -#include <asm/hypervisor.h> -#include <xen/evtchn.h> #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/err.h> +#include <linux/ptrace.h> +#include <xen/evtchn.h> #include <xen/xenbus.h> + +#include <asm/hypervisor.h> + #include "xenbus_comms.h" +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + static int xenbus_irq; extern void xenbus_probe(void *); diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c index bbe4a8c5a8..ba37e61856 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c @@ -40,6 +40,7 @@ #include <linux/wait.h> #include <linux/fs.h> #include <linux/poll.h> +#include <linux/mutex.h> #include "xenbus_comms.h" @@ -49,6 +50,10 @@ #include <xen/xen_proc.h> #include <asm/hypervisor.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + struct xenbus_dev_transaction { struct 
list_head list; struct xenbus_transaction handle; diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c index bcd1f6df06..5320368443 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c @@ -42,6 +42,7 @@ #include <linux/mm.h> #include <linux/notifier.h> #include <linux/kthread.h> +#include <linux/mutex.h> #include <asm/io.h> #include <asm/page.h> @@ -55,6 +56,11 @@ #include <xen/hvm.h> #include "xenbus_comms.h" +#include "xenbus_probe.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif int xen_store_evtchn; struct xenstore_domain_interface *xen_store_interface; @@ -67,12 +73,7 @@ static struct notifier_block *xenstore_chain; static void wait_for_devices(struct xenbus_driver *xendrv); static int xenbus_probe_frontend(const char *type, const char *name); -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); -static int xenbus_probe_backend(const char *type, const char *domid); -static int xenbus_dev_probe(struct device *_dev); -static int xenbus_dev_remove(struct device *_dev); static void xenbus_dev_shutdown(struct device *_dev); /* If something in array of ids matches this device, return it. 
*/ @@ -86,7 +87,7 @@ match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) return NULL; } -static int xenbus_match(struct device *_dev, struct device_driver *_drv) +int xenbus_match(struct device *_dev, struct device_driver *_drv) { struct xenbus_driver *drv = to_xenbus_driver(_drv); @@ -96,17 +97,6 @@ static int xenbus_match(struct device *_dev, struct device_driver *_drv) return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; } -struct xen_bus_type -{ - char *root; - unsigned int levels; - int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); - int (*probe)(const char *type, const char *dir); - struct bus_type bus; - struct device dev; -}; - - /* device/<type>/<id> => <type>-<id> */ static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) { @@ -143,7 +133,7 @@ static void free_otherend_watch(struct xenbus_device *dev) } -static int read_otherend_details(struct xenbus_device *xendev, +int read_otherend_details(struct xenbus_device *xendev, char *id_node, char *path_node) { int err = xenbus_gather(XBT_NIL, xendev->nodename, @@ -176,12 +166,6 @@ static int read_backend_details(struct xenbus_device *xendev) } -static int read_frontend_details(struct xenbus_device *xendev) -{ - return read_otherend_details(xendev, "frontend-id", "frontend"); -} - - /* Bus type for frontend drivers. 
*/ static struct xen_bus_type xenbus_frontend = { .root = "device", @@ -191,115 +175,17 @@ static struct xen_bus_type xenbus_frontend = { .bus = { .name = "xen", .match = xenbus_match, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, +#endif }, .dev = { .bus_id = "xen", }, }; -/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ -static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) -{ - int domid, err; - const char *devid, *type, *frontend; - unsigned int typelen; - - type = strchr(nodename, '/'); - if (!type) - return -EINVAL; - type++; - typelen = strcspn(type, "/"); - if (!typelen || type[typelen] != '/') - return -EINVAL; - - devid = strrchr(nodename, '/') + 1; - - err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, - "frontend", NULL, &frontend, - NULL); - if (err) - return err; - if (strlen(frontend) == 0) - err = -ERANGE; - if (!err && !xenbus_exists(XBT_NIL, frontend, "")) - err = -ENOENT; - - kfree(frontend); - - if (err) - return err; - - if (snprintf(bus_id, BUS_ID_SIZE, - "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) - return -ENOSPC; - return 0; -} - -static struct xen_bus_type xenbus_backend = { - .root = "backend", - .levels = 3, /* backend/type/<frontend>/<id> */ - .get_bus_id = backend_bus_id, - .probe = xenbus_probe_backend, - .bus = { - .name = "xen-backend", - .match = xenbus_match, - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, -// .shutdown = xenbus_dev_shutdown, - .uevent = xenbus_uevent_backend, - }, - .dev = { - .bus_id = "xen-backend", - }, -}; - -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size) -{ - struct xenbus_device *xdev; - struct xenbus_driver *drv; - int i = 0; - int length = 0; - - DPRINTK(""); - - if (dev == NULL) - return -ENODEV; - - xdev = to_xenbus_device(dev); - if (xdev == NULL) - return -ENODEV; - - 
/* stuff we want to pass to /sbin/hotplug */ - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_TYPE=%s", xdev->devicetype); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_PATH=%s", xdev->nodename); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_BASE_PATH=%s", xenbus_backend.root); - - /* terminate, set to next free slot, shrink available space */ - envp[i] = NULL; - envp = &envp[i]; - num_envp -= i; - buffer = &buffer[length]; - buffer_size -= length; - - if (dev->driver) { - drv = to_xenbus_driver(dev->driver); - if (drv && drv->uevent) - return drv->uevent(xdev, envp, num_envp, buffer, - buffer_size); - } - - return 0; -} - static void otherend_changed(struct xenbus_watch *watch, const char **vec, unsigned int len) { @@ -322,6 +208,20 @@ static void otherend_changed(struct xenbus_watch *watch, DPRINTK("state is %d (%s), %s, %s", state, xenbus_strstate(state), dev->otherend_watch.node, vec[XS_WATCH_PATH]); + /* + * Ignore xenbus transitions during shutdown. This prevents us doing + * work that can fail e.g., when the rootfs is gone. + */ + if (system_state > SYSTEM_RUNNING) { + struct xen_bus_type *bus = bus; + bus = container_of(dev->dev.bus, struct xen_bus_type, bus); + /* If we're frontend, drive the state machine to Closed. */ + /* This should cause the backend to release our resources. 
*/ + if ((bus == &xenbus_frontend) && (state == XenbusStateClosing)) + xenbus_frontend_closed(dev); + return; + } + if (drv->otherend_changed) drv->otherend_changed(dev, state); } @@ -345,7 +245,7 @@ static int watch_otherend(struct xenbus_device *dev) } -static int xenbus_dev_probe(struct device *_dev) +int xenbus_dev_probe(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); @@ -392,7 +292,7 @@ fail: return -ENODEV; } -static int xenbus_dev_remove(struct device *_dev) +int xenbus_dev_remove(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); @@ -430,14 +330,21 @@ static void xenbus_dev_shutdown(struct device *_dev) put_device(&dev->dev); } -static int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus) +int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus) { int ret; drv->driver.name = drv->name; drv->driver.bus = &bus->bus; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) drv->driver.owner = drv->owner; +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + drv->driver.probe = xenbus_dev_probe; + drv->driver.remove = xenbus_dev_remove; + drv->driver.shutdown = xenbus_dev_shutdown; +#endif mutex_lock(&xenwatch_mutex); ret = driver_register(&drv->driver); @@ -462,14 +369,6 @@ int xenbus_register_frontend(struct xenbus_driver *drv) } EXPORT_SYMBOL_GPL(xenbus_register_frontend); -int xenbus_register_backend(struct xenbus_driver *drv) -{ - drv->read_otherend_details = read_frontend_details; - - return xenbus_register_driver_common(drv, &xenbus_backend); -} -EXPORT_SYMBOL_GPL(xenbus_register_backend); - void xenbus_unregister_driver(struct xenbus_driver *drv) { driver_unregister(&drv->driver); @@ -545,45 +444,30 @@ static void xenbus_dev_release(struct device *dev) kfree(to_xenbus_device(dev)); } -/* Simplified asprintf. 
*/ -char *kasprintf(const char *fmt, ...) -{ - va_list ap; - unsigned int len; - char *p, dummy[1]; - - va_start(ap, fmt); - /* FIXME: vsnprintf has a bug, NULL should work */ - len = vsnprintf(dummy, 0, fmt, ap); - va_end(ap); - - p = kmalloc(len + 1, GFP_KERNEL); - if (!p) - return NULL; - va_start(ap, fmt); - vsprintf(p, fmt, ap); - va_end(ap); - return p; -} - static ssize_t xendev_show_nodename(struct device *dev, - struct device_attribute *attr, char *buf) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) + struct device_attribute *attr, +#endif + char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); static ssize_t xendev_show_devtype(struct device *dev, - struct device_attribute *attr, char *buf) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) + struct device_attribute *attr, +#endif + char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); -static int xenbus_probe_node(struct xen_bus_type *bus, - const char *type, - const char *nodename) +int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename) { int err; struct xenbus_device *xendev; @@ -642,7 +526,7 @@ static int xenbus_probe_frontend(const char *type, const char *name) char *nodename; int err; - nodename = kasprintf("%s/%s/%s", xenbus_frontend.root, type, name); + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_frontend.root, type, name); if (!nodename) return -ENOMEM; @@ -653,55 +537,6 @@ static int xenbus_probe_frontend(const char *type, const char *name) return err; } -/* backend/<typename>/<frontend-uuid>/<name> */ -static int xenbus_probe_backend_unit(const char *dir, - const char *type, - const char *name) -{ - char *nodename; - int err; - - nodename = kasprintf("%s/%s", dir, name); - if (!nodename) - return -ENOMEM; - - DPRINTK("%s\n", nodename); - - err = 
xenbus_probe_node(&xenbus_backend, type, nodename); - kfree(nodename); - return err; -} - -/* backend/<typename>/<frontend-domid> */ -static int xenbus_probe_backend(const char *type, const char *domid) -{ - char *nodename; - int err = 0; - char **dir; - unsigned int i, dir_n = 0; - - DPRINTK(""); - - nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid); - if (!nodename) - return -ENOMEM; - - dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); - if (IS_ERR(dir)) { - kfree(nodename); - return PTR_ERR(dir); - } - - for (i = 0; i < dir_n; i++) { - err = xenbus_probe_backend_unit(nodename, type, dir[i]); - if (err) - break; - } - kfree(dir); - kfree(nodename); - return err; -} - static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) { int err = 0; @@ -722,7 +557,7 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) return err; } -static int xenbus_probe_devices(struct xen_bus_type *bus) +int xenbus_probe_devices(struct xen_bus_type *bus) { int err = 0; char **dir; @@ -764,7 +599,7 @@ static int strsep_len(const char *str, char c, unsigned int len) return (len == 0) ? i : -ERANGE; } -static void dev_changed(const char *node, struct xen_bus_type *bus) +void dev_changed(const char *node, struct xen_bus_type *bus) { int exists, rootlen; struct xenbus_device *dev; @@ -788,7 +623,7 @@ static void dev_changed(const char *node, struct xen_bus_type *bus) rootlen = strsep_len(node, '/', bus->levels); if (rootlen < 0) return; - root = kasprintf("%.*s", rootlen, node); + root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node); if (!root) return; @@ -809,25 +644,12 @@ static void frontend_changed(struct xenbus_watch *watch, dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); } -static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - DPRINTK(""); - - dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); -} - /* We watch for devices appearing and vanishing. 
*/ static struct xenbus_watch fe_watch = { .node = "device", .callback = frontend_changed, }; -static struct xenbus_watch be_watch = { - .node = "backend", - .callback = backend_changed, -}; - static int suspend_dev(struct device *dev, void *data) { int err = 0; @@ -898,7 +720,7 @@ void xenbus_suspend(void) DPRINTK(""); bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); - bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev); + xenbus_backend_suspend(suspend_dev); xs_suspend(); } EXPORT_SYMBOL_GPL(xenbus_suspend); @@ -908,7 +730,7 @@ void xenbus_resume(void) xb_init_comms(); xs_resume(); bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); - bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev); + xenbus_backend_resume(resume_dev); } EXPORT_SYMBOL_GPL(xenbus_resume); @@ -941,20 +763,17 @@ void xenbus_probe(void *unused) { BUG_ON((xenstored_ready <= 0)); - /* Enumerate devices in xenstore. */ + /* Enumerate devices in xenstore and watch for changes. */ xenbus_probe_devices(&xenbus_frontend); - xenbus_probe_devices(&xenbus_backend); - - /* Watch for changes. */ register_xenbus_watch(&fe_watch); - register_xenbus_watch(&be_watch); + xenbus_backend_probe_and_watch(); /* Notify others that xenstore is up */ notifier_call_chain(&xenstore_chain, 0, NULL); } -#ifdef CONFIG_PROC_FS +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST) static struct file_operations xsd_kva_fops; static struct proc_dir_entry *xsd_kva_intf; static struct proc_dir_entry *xsd_port_intf; @@ -1006,7 +825,7 @@ static int __init xenbus_probe_init(void) /* Register ourselves with the kernel bus subsystem */ bus_register(&xenbus_frontend.bus); - bus_register(&xenbus_backend.bus); + xenbus_backend_bus_register(); /* * Domain0 doesn't have a store_evtchn or store_mfn yet. 
@@ -1035,7 +854,7 @@ static int __init xenbus_probe_init(void) xen_store_evtchn = xen_start_info->store_evtchn = alloc_unbound.port; -#ifdef CONFIG_PROC_FS +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST) /* And finally publish the above info in /proc/xen */ xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600); if (xsd_kva_intf) { @@ -1077,7 +896,7 @@ static int __init xenbus_probe_init(void) /* Register ourselves with the kernel device subsystem */ device_register(&xenbus_frontend.dev); - device_register(&xenbus_backend.dev); + xenbus_backend_device_register(); if (!is_initial_xendomain()) xenbus_probe(NULL); diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h new file mode 100644 index 0000000000..2d2e567826 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h @@ -0,0 +1,74 @@ +/****************************************************************************** + * xenbus_probe.h + * + * Talks to Xen Store to figure out what devices we have. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 XenSource Ltd. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef _XENBUS_PROBE_H +#define _XENBUS_PROBE_H + +#ifdef CONFIG_XEN_BACKEND +extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); +extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); +extern void xenbus_backend_probe_and_watch(void); +extern void xenbus_backend_bus_register(void); +extern void xenbus_backend_device_register(void); +#else +static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} +static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} +static inline void xenbus_backend_probe_and_watch(void) {} +static inline void xenbus_backend_bus_register(void) {} +static inline void xenbus_backend_device_register(void) {} +#endif + +struct xen_bus_type +{ + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); + int (*probe)(const char *type, const char *dir); + struct bus_type bus; + struct device dev; +}; + +extern int xenbus_match(struct device *_dev, struct device_driver *_drv); +extern int xenbus_dev_probe(struct device *_dev); +extern int xenbus_dev_remove(struct device *_dev); +extern int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus); +extern int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename); +extern int xenbus_probe_devices(struct xen_bus_type *bus); + +extern void dev_changed(const char *node, struct xen_bus_type *bus); + +#endif + diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c new file mode 100644 index 0000000000..934e79732d --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c @@ -0,0 +1,271 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have (backend half). 
+ * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005, 2006 XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define DPRINTK(fmt, args...) 
\ + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/kthread.h> + +#include <asm/io.h> +#include <asm/page.h> +#include <asm/maddr.h> +#include <asm/pgtable.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/xen_proc.h> +#include <xen/evtchn.h> +#include <xen/features.h> +#include <xen/hvm.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size); +static int xenbus_probe_backend(const char *type, const char *domid); + +extern int read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + +static int read_frontend_details(struct xenbus_device *xendev) +{ + return read_otherend_details(xendev, "frontend-id", "frontend"); +} + +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) +{ + int domid, err; + const char *devid, *type, *frontend; + unsigned int typelen; + + type = strchr(nodename, '/'); + if (!type) + return -EINVAL; + type++; + typelen = strcspn(type, "/"); + if (!typelen || type[typelen] != '/') + return -EINVAL; + + devid = strrchr(nodename, '/') + 1; + + err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, + "frontend", NULL, &frontend, + NULL); + if (err) + return err; + if (strlen(frontend) == 0) + err = -ERANGE; + if (!err && !xenbus_exists(XBT_NIL, frontend, "")) + err = -ENOENT; + kfree(frontend); + + if (err) + return err; + + if (snprintf(bus_id, BUS_ID_SIZE, + "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) + return -ENOSPC; + return 0; +} + +static struct xen_bus_type 
xenbus_backend = { + .root = "backend", + .levels = 3, /* backend/type/<frontend>/<id> */ + .get_bus_id = backend_bus_id, + .probe = xenbus_probe_backend, + .bus = { + .name = "xen-backend", + .match = xenbus_match, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, +// .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_backend, + }, + .dev = { + .bus_id = "xen-backend", + }, +}; + +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + struct xenbus_device *xdev; + struct xenbus_driver *drv; + int i = 0; + int length = 0; + + DPRINTK(""); + + if (dev == NULL) + return -ENODEV; + + xdev = to_xenbus_device(dev); + if (xdev == NULL) + return -ENODEV; + + /* stuff we want to pass to /sbin/hotplug */ + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_TYPE=%s", xdev->devicetype); + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_PATH=%s", xdev->nodename); + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_BASE_PATH=%s", xenbus_backend.root); + + /* terminate, set to next free slot, shrink available space */ + envp[i] = NULL; + envp = &envp[i]; + num_envp -= i; + buffer = &buffer[length]; + buffer_size -= length; + + if (dev->driver) { + drv = to_xenbus_driver(dev->driver); + if (drv && drv->uevent) + return drv->uevent(xdev, envp, num_envp, buffer, + buffer_size); + } + + return 0; +} + +int xenbus_register_backend(struct xenbus_driver *drv) +{ + drv->read_otherend_details = read_frontend_details; + + return xenbus_register_driver_common(drv, &xenbus_backend); +} +EXPORT_SYMBOL_GPL(xenbus_register_backend); + +/* backend/<typename>/<frontend-uuid>/<name> */ +static int xenbus_probe_backend_unit(const char *dir, + const char *type, + const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); + if (!nodename) + return -ENOMEM; + + DPRINTK("%s\n", nodename); + + 
err = xenbus_probe_node(&xenbus_backend, type, nodename); + kfree(nodename); + return err; +} + +/* backend/<typename>/<frontend-domid> */ +static int xenbus_probe_backend(const char *type, const char *domid) +{ + char *nodename; + int err = 0; + char **dir; + unsigned int i, dir_n = 0; + + DPRINTK(""); + + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_backend.root, type, domid); + if (!nodename) + return -ENOMEM; + + dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); + if (IS_ERR(dir)) { + kfree(nodename); + return PTR_ERR(dir); + } + + for (i = 0; i < dir_n; i++) { + err = xenbus_probe_backend_unit(nodename, type, dir[i]); + if (err) + break; + } + kfree(dir); + kfree(nodename); + return err; +} + +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + DPRINTK(""); + + dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); +} + +static struct xenbus_watch be_watch = { + .node = "backend", + .callback = backend_changed, +}; + +void xenbus_backend_suspend(int (*fn)(struct device *, void *)) +{ + DPRINTK(""); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); +} + +void xenbus_backend_resume(int (*fn)(struct device *, void *)) +{ + DPRINTK(""); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); +} + +void xenbus_backend_probe_and_watch(void) +{ + xenbus_probe_devices(&xenbus_backend); + register_xenbus_watch(&be_watch); +} + +void xenbus_backend_bus_register(void) +{ + bus_register(&xenbus_backend.bus); +} + +void xenbus_backend_device_register(void) +{ + device_register(&xenbus_backend.dev); +} diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c index 190fa1e794..4c5052d13a 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c @@ -42,11 +42,14 @@ #include <linux/fcntl.h> #include <linux/kthread.h> #include <linux/rwsem.h> +#include <linux/module.h> +#include 
<linux/mutex.h> #include <xen/xenbus.h> #include "xenbus_comms.h" -/* xenbus_probe.c */ -extern char *kasprintf(const char *fmt, ...); +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif struct xs_stored_msg { struct list_head list; @@ -289,9 +292,9 @@ static char *join(const char *dir, const char *name) char *buffer; if (strlen(name) == 0) - buffer = kasprintf("%s", dir); + buffer = kasprintf(GFP_KERNEL, "%s", dir); else - buffer = kasprintf("%s/%s", dir, name); + buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name); return (!buffer) ? ERR_PTR(-ENOMEM) : buffer; } diff --git a/linux-2.6-xen-sparse/drivers/xen/xenoprof/xenoprofile.c b/linux-2.6-xen-sparse/drivers/xen/xenoprof/xenoprofile.c new file mode 100644 index 0000000000..382a50f647 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/xenoprof/xenoprofile.c @@ -0,0 +1,500 @@ +/** + * @file xenoprofile.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon <levon@movementarian.org> + * + * Modified by Aravind Menon and Jose Renato Santos for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. + * + * Separated out arch-generic part + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. 
+ */ + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/oprofile.h> +#include <linux/sysdev.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/vmalloc.h> +#include <asm/pgtable.h> +#include <xen/evtchn.h> +#include <xen/xenoprof.h> +#include <xen/driver_util.h> +#include <xen/interface/xen.h> +#include <xen/interface/xenoprof.h> +#include "../../../drivers/oprofile/cpu_buffer.h" +#include "../../../drivers/oprofile/event_buffer.h" + +#define MAX_XENOPROF_SAMPLES 16 + +/* sample buffers shared with Xen */ +xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS]; +/* Shared buffer area */ +struct xenoprof_shared_buffer shared_buffer; + +/* Passive sample buffers shared with Xen */ +xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS]; +/* Passive shared buffer area */ +struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS]; + +static int xenoprof_start(void); +static void xenoprof_stop(void); + +static int xenoprof_enabled = 0; +static int xenoprof_is_primary = 0; +static int active_defined; + +/* Number of buffers in shared area (one per VCPU) */ +int nbuf; +/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */ +int ovf_irq[NR_CPUS]; +/* cpu model type string - copied from Xen memory space on XENOPROF_init command */ +char cpu_type[XENOPROF_CPU_TYPE_SIZE]; + +#ifdef CONFIG_PM + +static int xenoprof_suspend(struct sys_device * dev, pm_message_t state) +{ + if (xenoprof_enabled == 1) + xenoprof_stop(); + return 0; +} + + +static int xenoprof_resume(struct sys_device * dev) +{ + if (xenoprof_enabled == 1) + xenoprof_start(); + return 0; +} + + +static struct sysdev_class oprofile_sysclass = { + set_kset_name("oprofile"), + .resume = xenoprof_resume, + .suspend = xenoprof_suspend +}; + + +static struct sys_device device_oprofile = { + .id = 0, + .cls = &oprofile_sysclass, +}; + + +static int __init init_driverfs(void) +{ + int error; + if (!(error = 
sysdev_class_register(&oprofile_sysclass))) + error = sysdev_register(&device_oprofile); + return error; +} + + +static void exit_driverfs(void) +{ + sysdev_unregister(&device_oprofile); + sysdev_class_unregister(&oprofile_sysclass); +} + +#else +#define init_driverfs() do { } while (0) +#define exit_driverfs() do { } while (0) +#endif /* CONFIG_PM */ + +unsigned long long oprofile_samples = 0; +unsigned long long p_oprofile_samples = 0; + +unsigned int pdomains; +struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS]; + +static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive) +{ + int head, tail, size; + + head = buf->event_head; + tail = buf->event_tail; + size = buf->event_size; + + if (tail > head) { + while (tail < size) { + oprofile_add_pc(buf->event_log[tail].eip, + buf->event_log[tail].mode, + buf->event_log[tail].event); + if (!is_passive) + oprofile_samples++; + else + p_oprofile_samples++; + tail++; + } + tail = 0; + } + while (tail < head) { + oprofile_add_pc(buf->event_log[tail].eip, + buf->event_log[tail].mode, + buf->event_log[tail].event); + if (!is_passive) + oprofile_samples++; + else + p_oprofile_samples++; + tail++; + } + + buf->event_tail = tail; +} + +static void xenoprof_handle_passive(void) +{ + int i, j; + int flag_domain, flag_switch = 0; + + for (i = 0; i < pdomains; i++) { + flag_domain = 0; + for (j = 0; j < passive_domains[i].nbuf; j++) { + xenoprof_buf_t *buf = p_xenoprof_buf[i][j]; + if (buf->event_head == buf->event_tail) + continue; + if (!flag_domain) { + if (!oprofile_add_domain_switch(passive_domains[i]. 
+ domain_id)) + goto done; + flag_domain = 1; + } + xenoprof_add_pc(buf, 1); + flag_switch = 1; + } + } +done: + if (flag_switch) + oprofile_add_domain_switch(COORDINATOR_DOMAIN); +} + +static irqreturn_t +xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs) +{ + struct xenoprof_buf * buf; + int cpu; + static unsigned long flag; + + cpu = smp_processor_id(); + buf = xenoprof_buf[cpu]; + + xenoprof_add_pc(buf, 0); + + if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) { + xenoprof_handle_passive(); + smp_mb__before_clear_bit(); + clear_bit(0, &flag); + } + + return IRQ_HANDLED; +} + + +static void unbind_virq(void) +{ + int i; + + for_each_online_cpu(i) { + if (ovf_irq[i] >= 0) { + unbind_from_irqhandler(ovf_irq[i], NULL); + ovf_irq[i] = -1; + } + } +} + + +static int bind_virq(void) +{ + int i, result; + + for_each_online_cpu(i) { + result = bind_virq_to_irqhandler(VIRQ_XENOPROF, + i, + xenoprof_ovf_interrupt, + SA_INTERRUPT, + "xenoprof", + NULL); + + if (result < 0) { + unbind_virq(); + return result; + } + + ovf_irq[i] = result; + } + + return 0; +} + + +static void unmap_passive_list(void) +{ + int i; + for (i = 0; i < pdomains; i++) + xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]); + pdomains = 0; +} + + +static int map_xenoprof_buffer(int max_samples) +{ + struct xenoprof_get_buffer get_buffer; + struct xenoprof_buf *buf; + int ret, i; + + if ( shared_buffer.buffer ) + return 0; + + get_buffer.max_samples = max_samples; + ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer); + if (ret) + return ret; + nbuf = get_buffer.nbuf; + + for (i=0; i< nbuf; i++) { + buf = (struct xenoprof_buf*) + &shared_buffer.buffer[i * get_buffer.bufsize]; + BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS); + xenoprof_buf[buf->vcpu_id] = buf; + } + + return 0; +} + + +static int xenoprof_setup(void) +{ + int ret; + + if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) ) + return ret; + + if ( (ret = bind_virq()) ) + return ret; + + if 
(xenoprof_is_primary) { + /* Define dom0 as an active domain if not done yet */ + if (!active_defined) { + domid_t domid; + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); + if (ret) + goto err; + domid = 0; + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); + if (ret) + goto err; + active_defined = 1; + } + + ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL); + if (ret) + goto err; + xenoprof_arch_counter(); + ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL); + + if (ret) + goto err; + } + + ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL); + if (ret) + goto err; + + xenoprof_enabled = 1; + return 0; + err: + unbind_virq(); + return ret; +} + + +static void xenoprof_shutdown(void) +{ + xenoprof_enabled = 0; + + HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL); + + if (xenoprof_is_primary) { + HYPERVISOR_xenoprof_op(XENOPROF_release_counters, NULL); + active_defined = 0; + } + + unbind_virq(); + + xenoprof_arch_unmap_shared_buffer(&shared_buffer); + if (xenoprof_is_primary) + unmap_passive_list(); +} + + +static int xenoprof_start(void) +{ + int ret = 0; + + if (xenoprof_is_primary) + ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL); + if (!ret) + xenoprof_arch_start(); + return ret; +} + + +static void xenoprof_stop(void) +{ + if (xenoprof_is_primary) + HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL); + xenoprof_arch_stop(); +} + + +static int xenoprof_set_active(int * active_domains, + unsigned int adomains) +{ + int ret = 0; + int i; + int set_dom0 = 0; + domid_t domid; + + if (!xenoprof_is_primary) + return 0; + + if (adomains > MAX_OPROF_DOMAINS) + return -E2BIG; + + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); + if (ret) + return ret; + + for (i=0; i<adomains; i++) { + domid = active_domains[i]; + if (domid != active_domains[i]) { + ret = -EINVAL; + goto out; + } + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); + if (ret) + goto out; + if (active_domains[i] == 0) + set_dom0 
= 1; + } + /* dom0 must always be active but may not be in the list */ + if (!set_dom0) { + domid = 0; + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); + } + +out: + if (ret) + HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); + active_defined = !ret; + return ret; +} + +static int xenoprof_set_passive(int * p_domains, + unsigned int pdoms) +{ + int ret; + int i, j; + struct xenoprof_buf *buf; + + if (!xenoprof_is_primary) + return 0; + + if (pdoms > MAX_OPROF_DOMAINS) + return -E2BIG; + + ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL); + if (ret) + return ret; + unmap_passive_list(); + + for (i = 0; i < pdoms; i++) { + passive_domains[i].domain_id = p_domains[i]; + passive_domains[i].max_samples = 2048; + ret = xenoprof_arch_set_passive(&passive_domains[i], + &p_shared_buffer[i]); + if (ret) + goto out; + for (j = 0; j < passive_domains[i].nbuf; j++) { + buf = (struct xenoprof_buf *) + &p_shared_buffer[i].buffer[j * passive_domains[i].bufsize]; + BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS); + p_xenoprof_buf[i][buf->vcpu_id] = buf; + } + } + + pdomains = pdoms; + return 0; + +out: + for (j = 0; j < i; j++) + xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]); + + return ret; +} + +struct oprofile_operations xenoprof_ops = { +#ifdef HAVE_XENOPROF_CREATE_FILES + .create_files = xenoprof_create_files, +#endif + .set_active = xenoprof_set_active, + .set_passive = xenoprof_set_passive, + .setup = xenoprof_setup, + .shutdown = xenoprof_shutdown, + .start = xenoprof_start, + .stop = xenoprof_stop +}; + + +/* in order to get driverfs right */ +static int using_xenoprof; + +int __init xenoprofile_init(struct oprofile_operations * ops) +{ + struct xenoprof_init init; + int ret, i; + + ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init); + if (!ret) { + xenoprof_arch_init_counter(&init); + xenoprof_is_primary = init.is_primary; + + /* cpu_type is detected by Xen */ + cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0; + strncpy(cpu_type, init.cpu_type, 
XENOPROF_CPU_TYPE_SIZE - 1); + xenoprof_ops.cpu_type = cpu_type; + + init_driverfs(); + using_xenoprof = 1; + *ops = xenoprof_ops; + + for (i=0; i<NR_CPUS; i++) + ovf_irq[i] = -1; + + active_defined = 0; + } + printk(KERN_INFO "%s: ret %d, events %d, xenoprof_is_primary %d\n", + __func__, ret, init.num_events, xenoprof_is_primary); + return ret; +} + + +void xenoprofile_exit(void) +{ + if (using_xenoprof) + exit_driverfs(); + + xenoprof_arch_unmap_shared_buffer(&shared_buffer); + if (xenoprof_is_primary) { + unmap_passive_list(); + HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL); + } +} |