diff options
80 files changed, 2505 insertions, 428 deletions
diff --git a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c index 37b33cbe4d..e9a7e7d070 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c @@ -392,10 +392,15 @@ static void dispatch_rw_block_io(blkif_t *blkif, for (i = 0; i < nseg; i++) { if (unlikely(map[i].status != 0)) { DPRINTK("invalid buffer -- could not remap it\n"); - goto fail_flush; + map[i].handle = BLKBACK_INVALID_HANDLE; + ret |= 1; } pending_handle(pending_req, i) = map[i].handle; + + if (ret) + continue; + set_phys_to_machine(__pa(vaddr( pending_req, i)) >> PAGE_SHIFT, FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); @@ -403,6 +408,9 @@ static void dispatch_rw_block_io(blkif_t *blkif, (req->seg[i].first_sect << 9); } + if (ret) + goto fail_flush; + if (vbd_translate(&preq, blkif, operation) != 0) { DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", diff --git a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c index e79b653a97..63ebf8ed93 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c @@ -48,6 +48,10 @@ #include <asm/hypervisor.h> #include <asm/maddr.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define BLKIF_STATE_DISCONNECTED 0 #define BLKIF_STATE_CONNECTED 1 #define BLKIF_STATE_SUSPENDED 2 @@ -468,6 +472,27 @@ int blkif_ioctl(struct inode *inode, struct file *filep, command, (long)argument, inode->i_rdev); switch (command) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + case HDIO_GETGEO: { + struct block_device *bd = inode->i_bdev; + struct hd_geometry geo; + int ret; + + if (!argument) + return -EINVAL; + + geo.start = get_start_sect(bd); + ret = blkif_getgeo(bd, &geo); + if (ret) + return ret; + + if (copy_to_user((struct hd_geometry __user *)argument, &geo, + sizeof(geo))) + return -EFAULT; + + return 0; + } +#endif case CDROMMULTISESSION: DPRINTK("FIXME: support multisession CDs later\n"); for (i = 0; i < sizeof(struct cdrom_multisession); i++) diff --git a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c index 8aa453d3a0..0c8b508c9a 100644 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c @@ -36,6 +36,10 @@ #include <linux/blkdev.h> #include <linux/list.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define BLKIF_MAJOR(dev) ((dev)>>8) #define BLKIF_MINOR(dev) ((dev) & 0xff) @@ -91,7 +95,9 @@ static struct block_device_operations xlvbd_block_fops = .open = blkif_open, .release = blkif_release, .ioctl = blkif_ioctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) .getgeo = blkif_getgeo +#endif }; DEFINE_SPINLOCK(blkif_io_lock); @@ -186,7 +192,11 @@ xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) if (rq == NULL) return -1; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) elevator_init(rq, "noop"); +#else + elevator_init(rq, &elevator_noop); +#endif /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_hardsect_size(rq, sector_size); diff --git a/linux-2.6-xen-sparse/drivers/xen/core/features.c b/linux-2.6-xen-sparse/drivers/xen/core/features.c index 4d50caf50b..a76f58c04d 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/features.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/features.c @@ -11,6 +11,10 @@ #include <asm/hypervisor.h> #include <xen/features.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; /* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */ EXPORT_SYMBOL(xen_features); diff --git a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c index 3195279a87..c5132c13bb 100644 --- a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c +++ b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c @@ -44,6 +44,10 @@ #include <asm/io.h> #include <xen/interface/memory.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + /* External tools reserve first few grant table entries. */ #define NR_RESERVED_ENTRIES 8 diff --git a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c index 26e0610d15..e03e44a05a 100644 --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c @@ -64,6 +64,10 @@ #include <xen/interface/grant_table.h> #include <xen/gnttab.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + /* * Mutually-exclusive module options to select receive data path: * rx_copy : Packets are copied by network backend into local memory diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile b/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile index d7c7d05172..ce5acc2457 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile @@ -9,4 +9,5 @@ xenbus-objs += xenbus_client.o xenbus-objs += xenbus_comms.o xenbus-objs += xenbus_xs.o xenbus-objs += xenbus_probe.o +obj-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c index 9b389ec06b..fd8355f6dd 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_client.c @@ -35,6 +35,10 @@ #include <xen/xenbus.h> #include <xen/driver_util.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + /* xenbus_probe.c */ extern char *kasprintf(const char *fmt, ...); diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c index 38da320b67..ea8f3c283e 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c @@ -39,6 +39,10 @@ #include <xen/xenbus.h> #include "xenbus_comms.h" +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + static int xenbus_irq; extern void xenbus_probe(void *); diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c index bbe4a8c5a8..ba37e61856 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c @@ -40,6 +40,7 @@ #include <linux/wait.h> #include <linux/fs.h> #include <linux/poll.h> +#include <linux/mutex.h> #include "xenbus_comms.h" @@ -49,6 +50,10 @@ #include <xen/xen_proc.h> #include <asm/hypervisor.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + struct xenbus_dev_transaction { struct list_head list; struct xenbus_transaction handle; diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c index 87c9e6ed90..13e9f2105d 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c @@ -42,6 +42,7 @@ #include <linux/mm.h> #include <linux/notifier.h> #include <linux/kthread.h> +#include <linux/mutex.h> #include <asm/io.h> #include <asm/page.h> @@ -55,6 +56,11 @@ #include <xen/hvm.h> #include "xenbus_comms.h" +#include "xenbus_probe.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif int xen_store_evtchn; struct xenstore_domain_interface *xen_store_interface; @@ -67,12 +73,7 @@ static struct notifier_block *xenstore_chain; static void wait_for_devices(struct xenbus_driver *xendrv); static int xenbus_probe_frontend(const char *type, const char *name); -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); -static int xenbus_probe_backend(const char *type, const char *domid); -static int xenbus_dev_probe(struct device *_dev); -static int xenbus_dev_remove(struct device *_dev); static void xenbus_dev_shutdown(struct device *_dev); /* If something in array of ids matches this device, return it. */ @@ -86,7 +87,7 @@ match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) return NULL; } -static int xenbus_match(struct device *_dev, struct device_driver *_drv) +int xenbus_match(struct device *_dev, struct device_driver *_drv) { struct xenbus_driver *drv = to_xenbus_driver(_drv); @@ -96,17 +97,6 @@ static int xenbus_match(struct device *_dev, struct device_driver *_drv) return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; } -struct xen_bus_type -{ - char *root; - unsigned int levels; - int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); - int (*probe)(const char *type, const char *dir); - struct bus_type bus; - struct device dev; -}; - - /* device/<type>/<id> => <type>-<id> */ static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) { @@ -143,7 +133,7 @@ static void free_otherend_watch(struct xenbus_device *dev) } -static int read_otherend_details(struct xenbus_device *xendev, +int read_otherend_details(struct xenbus_device *xendev, char *id_node, char *path_node) { int err = xenbus_gather(XBT_NIL, xendev->nodename, @@ -176,12 +166,6 @@ static int read_backend_details(struct xenbus_device *xendev) } -static int read_frontend_details(struct xenbus_device *xendev) -{ - return read_otherend_details(xendev, "frontend-id", "frontend"); -} - - /* Bus type for frontend drivers. */ static struct xen_bus_type xenbus_frontend = { .root = "device", @@ -191,115 +175,17 @@ static struct xen_bus_type xenbus_frontend = { .bus = { .name = "xen", .match = xenbus_match, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) .probe = xenbus_dev_probe, .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, +#endif }, .dev = { .bus_id = "xen", }, }; -/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ -static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) -{ - int domid, err; - const char *devid, *type, *frontend; - unsigned int typelen; - - type = strchr(nodename, '/'); - if (!type) - return -EINVAL; - type++; - typelen = strcspn(type, "/"); - if (!typelen || type[typelen] != '/') - return -EINVAL; - - devid = strrchr(nodename, '/') + 1; - - err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, - "frontend", NULL, &frontend, - NULL); - if (err) - return err; - if (strlen(frontend) == 0) - err = -ERANGE; - if (!err && !xenbus_exists(XBT_NIL, frontend, "")) - err = -ENOENT; - - kfree(frontend); - - if (err) - return err; - - if (snprintf(bus_id, BUS_ID_SIZE, - "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) - return -ENOSPC; - return 0; -} - -static struct xen_bus_type xenbus_backend = { - .root = "backend", - .levels = 3, /* backend/type/<frontend>/<id> */ - .get_bus_id = backend_bus_id, - .probe = xenbus_probe_backend, - .bus = { - .name = "xen-backend", - .match = xenbus_match, - .probe = xenbus_dev_probe, - .remove = xenbus_dev_remove, -// .shutdown = xenbus_dev_shutdown, - .uevent = xenbus_uevent_backend, - }, - .dev = { - .bus_id = "xen-backend", - }, -}; - -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size) -{ - struct xenbus_device *xdev; - struct xenbus_driver *drv; - int i = 0; - int length = 0; - - DPRINTK(""); - - if (dev == NULL) - return -ENODEV; - - xdev = to_xenbus_device(dev); - if (xdev == NULL) - return -ENODEV; - - /* stuff we want to pass to /sbin/hotplug */ - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_TYPE=%s", xdev->devicetype); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_PATH=%s", xdev->nodename); - - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_BASE_PATH=%s", xenbus_backend.root); - - /* terminate, set to next free slot, shrink available space */ - envp[i] = NULL; - envp = &envp[i]; - num_envp -= i; - buffer = &buffer[length]; - buffer_size -= length; - - if (dev->driver) { - drv = to_xenbus_driver(dev->driver); - if (drv && drv->uevent) - return drv->uevent(xdev, envp, num_envp, buffer, - buffer_size); - } - - return 0; -} - static void otherend_changed(struct xenbus_watch *watch, const char **vec, unsigned int len) { @@ -359,7 +245,7 @@ static int watch_otherend(struct xenbus_device *dev) } -static int xenbus_dev_probe(struct device *_dev) +int xenbus_dev_probe(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); @@ -406,7 +292,7 @@ fail: return -ENODEV; } -static int xenbus_dev_remove(struct device *_dev) +int xenbus_dev_remove(struct device *_dev) { struct xenbus_device *dev = to_xenbus_device(_dev); struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); @@ -444,14 +330,21 @@ static void xenbus_dev_shutdown(struct device *_dev) put_device(&dev->dev); } -static int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus) +int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus) { int ret; drv->driver.name = drv->name; drv->driver.bus = &bus->bus; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) drv->driver.owner = drv->owner; +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + drv->driver.probe = xenbus_dev_probe; + drv->driver.remove = xenbus_dev_remove; + drv->driver.shutdown = xenbus_dev_shutdown; +#endif mutex_lock(&xenwatch_mutex); ret = driver_register(&drv->driver); @@ -476,14 +369,6 @@ int xenbus_register_frontend(struct xenbus_driver *drv) } EXPORT_SYMBOL_GPL(xenbus_register_frontend); -int xenbus_register_backend(struct xenbus_driver *drv) -{ - drv->read_otherend_details = read_frontend_details; - - return xenbus_register_driver_common(drv, &xenbus_backend); -} -EXPORT_SYMBOL_GPL(xenbus_register_backend); - void xenbus_unregister_driver(struct xenbus_driver *drv) { driver_unregister(&drv->driver); @@ -581,23 +466,29 @@ char *kasprintf(const char *fmt, ...) } static ssize_t xendev_show_nodename(struct device *dev, - struct device_attribute *attr, char *buf) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) + struct device_attribute *attr, +#endif + char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); static ssize_t xendev_show_devtype(struct device *dev, - struct device_attribute *attr, char *buf) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) + struct device_attribute *attr, +#endif + char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); -static int xenbus_probe_node(struct xen_bus_type *bus, - const char *type, - const char *nodename) +int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename) { int err; struct xenbus_device *xendev; @@ -667,55 +558,6 @@ static int xenbus_probe_frontend(const char *type, const char *name) return err; } -/* backend/<typename>/<frontend-uuid>/<name> */ -static int xenbus_probe_backend_unit(const char *dir, - const char *type, - const char *name) -{ - char *nodename; - int err; - - nodename = kasprintf("%s/%s", dir, name); - if (!nodename) - return -ENOMEM; - - DPRINTK("%s\n", nodename); - - err = xenbus_probe_node(&xenbus_backend, type, nodename); - kfree(nodename); - return err; -} - -/* backend/<typename>/<frontend-domid> */ -static int xenbus_probe_backend(const char *type, const char *domid) -{ - char *nodename; - int err = 0; - char **dir; - unsigned int i, dir_n = 0; - - DPRINTK(""); - - nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid); - if (!nodename) - return -ENOMEM; - - dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); - if (IS_ERR(dir)) { - kfree(nodename); - return PTR_ERR(dir); - } - - for (i = 0; i < dir_n; i++) { - err = xenbus_probe_backend_unit(nodename, type, dir[i]); - if (err) - break; - } - kfree(dir); - kfree(nodename); - return err; -} - static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) { int err = 0; @@ -736,7 +578,7 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) return err; } -static int xenbus_probe_devices(struct xen_bus_type *bus) +int xenbus_probe_devices(struct xen_bus_type *bus) { int err = 0; char **dir; @@ -778,7 +620,7 @@ static int strsep_len(const char *str, char c, unsigned int len) return (len == 0) ? i : -ERANGE; } -static void dev_changed(const char *node, struct xen_bus_type *bus) +void dev_changed(const char *node, struct xen_bus_type *bus) { int exists, rootlen; struct xenbus_device *dev; @@ -823,25 +665,12 @@ static void frontend_changed(struct xenbus_watch *watch, dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); } -static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - DPRINTK(""); - - dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); -} - /* We watch for devices appearing and vanishing. */ static struct xenbus_watch fe_watch = { .node = "device", .callback = frontend_changed, }; -static struct xenbus_watch be_watch = { - .node = "backend", - .callback = backend_changed, -}; - static int suspend_dev(struct device *dev, void *data) { int err = 0; @@ -912,7 +741,7 @@ void xenbus_suspend(void) DPRINTK(""); bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); - bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev); + xenbus_backend_suspend(suspend_dev); xs_suspend(); } EXPORT_SYMBOL_GPL(xenbus_suspend); @@ -922,7 +751,7 @@ void xenbus_resume(void) xb_init_comms(); xs_resume(); bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); - bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev); + xenbus_backend_resume(resume_dev); } EXPORT_SYMBOL_GPL(xenbus_resume); @@ -955,20 +784,17 @@ void xenbus_probe(void *unused) { BUG_ON((xenstored_ready <= 0)); - /* Enumerate devices in xenstore. */ + /* Enumerate devices in xenstore and watch for changes. */ xenbus_probe_devices(&xenbus_frontend); - xenbus_probe_devices(&xenbus_backend); - - /* Watch for changes. */ register_xenbus_watch(&fe_watch); - register_xenbus_watch(&be_watch); + xenbus_backend_probe_and_watch(); /* Notify others that xenstore is up */ notifier_call_chain(&xenstore_chain, 0, NULL); } -#ifdef CONFIG_PROC_FS +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST) static struct file_operations xsd_kva_fops; static struct proc_dir_entry *xsd_kva_intf; static struct proc_dir_entry *xsd_port_intf; @@ -1020,7 +846,7 @@ static int __init xenbus_probe_init(void) /* Register ourselves with the kernel bus subsystem */ bus_register(&xenbus_frontend.bus); - bus_register(&xenbus_backend.bus); + xenbus_backend_bus_register(); /* * Domain0 doesn't have a store_evtchn or store_mfn yet. @@ -1049,7 +875,7 @@ static int __init xenbus_probe_init(void) xen_store_evtchn = xen_start_info->store_evtchn = alloc_unbound.port; -#ifdef CONFIG_PROC_FS +#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST) /* And finally publish the above info in /proc/xen */ xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600); if (xsd_kva_intf) { @@ -1091,7 +917,7 @@ static int __init xenbus_probe_init(void) /* Register ourselves with the kernel device subsystem */ device_register(&xenbus_frontend.dev); - device_register(&xenbus_backend.dev); + xenbus_backend_device_register(); if (!is_initial_xendomain()) xenbus_probe(NULL); diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h new file mode 100644 index 0000000000..1f61c6cca6 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.h @@ -0,0 +1,77 @@ +/****************************************************************************** + * xenbus_probe.h + * + * Talks to Xen Store to figure out what devices we have. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 XenSource Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _XENBUS_PROBE_H +#define _XENBUS_PROBE_H + +#ifdef CONFIG_XEN_BACKEND +extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); +extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); +extern void xenbus_backend_probe_and_watch(void); +extern void xenbus_backend_bus_register(void); +extern void xenbus_backend_device_register(void); +#else +static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} +static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} +static inline void xenbus_backend_probe_and_watch(void) {} +static inline void xenbus_backend_bus_register(void) {} +static inline void xenbus_backend_device_register(void) {} +#endif + +struct xen_bus_type +{ + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); + int (*probe)(const char *type, const char *dir); + struct bus_type bus; + struct device dev; +}; + +extern int xenbus_match(struct device *_dev, struct device_driver *_drv); +extern int xenbus_dev_probe(struct device *_dev); +extern int xenbus_dev_remove(struct device *_dev); +extern int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus); +extern int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename); +extern int xenbus_probe_devices(struct xen_bus_type *bus); + +extern void dev_changed(const char *node, struct xen_bus_type *bus); + +/* Simplified asprintf. Probably belongs in lib */ +extern char *kasprintf(const char *fmt, ...); + +#endif + diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c new file mode 100644 index 0000000000..7f0dedd577 --- /dev/null +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe_backend.c @@ -0,0 +1,271 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have (backend half). + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005, 2006 XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define DPRINTK(fmt, args...) \ + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ + __FUNCTION__, __LINE__, ##args) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/kthread.h> + +#include <asm/io.h> +#include <asm/page.h> +#include <asm/maddr.h> +#include <asm/pgtable.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/xen_proc.h> +#include <xen/evtchn.h> +#include <xen/features.h> +#include <xen/hvm.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size); +static int xenbus_probe_backend(const char *type, const char *domid); + +extern int read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + +static int read_frontend_details(struct xenbus_device *xendev) +{ + return read_otherend_details(xendev, "frontend-id", "frontend"); +} + +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) +{ + int domid, err; + const char *devid, *type, *frontend; + unsigned int typelen; + + type = strchr(nodename, '/'); + if (!type) + return -EINVAL; + type++; + typelen = strcspn(type, "/"); + if (!typelen || type[typelen] != '/') + return -EINVAL; + + devid = strrchr(nodename, '/') + 1; + + err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, + "frontend", NULL, &frontend, + NULL); + if (err) + return err; + if (strlen(frontend) == 0) + err = -ERANGE; + if (!err && !xenbus_exists(XBT_NIL, frontend, "")) + err = -ENOENT; + kfree(frontend); + + if (err) + return err; + + if (snprintf(bus_id, BUS_ID_SIZE, + "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) + return -ENOSPC; + return 0; +} + +static struct xen_bus_type xenbus_backend = { + .root = "backend", + .levels = 3, /* backend/type/<frontend>/<id> */ + .get_bus_id = backend_bus_id, + .probe = xenbus_probe_backend, + .bus = { + .name = "xen-backend", + .match = xenbus_match, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, +// .shutdown = xenbus_dev_shutdown, + .uevent = xenbus_uevent_backend, + }, + .dev = { + .bus_id = "xen-backend", + }, +}; + +static int xenbus_uevent_backend(struct device *dev, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + struct xenbus_device *xdev; + struct xenbus_driver *drv; + int i = 0; + int length = 0; + + DPRINTK(""); + + if (dev == NULL) + return -ENODEV; + + xdev = to_xenbus_device(dev); + if (xdev == NULL) + return -ENODEV; + + /* stuff we want to pass to /sbin/hotplug */ + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_TYPE=%s", xdev->devicetype); + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_PATH=%s", xdev->nodename); + + add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, + "XENBUS_BASE_PATH=%s", xenbus_backend.root); + + /* terminate, set to next free slot, shrink available space */ + envp[i] = NULL; + envp = &envp[i]; + num_envp -= i; + buffer = &buffer[length]; + buffer_size -= length; + + if (dev->driver) { + drv = to_xenbus_driver(dev->driver); + if (drv && drv->uevent) + return drv->uevent(xdev, envp, num_envp, buffer, + buffer_size); + } + + return 0; +} + +int xenbus_register_backend(struct xenbus_driver *drv) +{ + drv->read_otherend_details = read_frontend_details; + + return xenbus_register_driver_common(drv, &xenbus_backend); +} +EXPORT_SYMBOL_GPL(xenbus_register_backend); + +/* backend/<typename>/<frontend-uuid>/<name> */ +static int xenbus_probe_backend_unit(const char *dir, + const char *type, + const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf("%s/%s", dir, name); + if (!nodename) + return -ENOMEM; + + DPRINTK("%s\n", nodename); + + err = xenbus_probe_node(&xenbus_backend, type, nodename); + kfree(nodename); + return err; +} + +/* backend/<typename>/<frontend-domid> */ +static int xenbus_probe_backend(const char *type, const char *domid) +{ + char *nodename; + int err = 0; + char **dir; + unsigned int i, dir_n = 0; + + DPRINTK(""); + + nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid); + if (!nodename) + return -ENOMEM; + + dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); + if (IS_ERR(dir)) { + kfree(nodename); + return PTR_ERR(dir); + } + + for (i = 0; i < dir_n; i++) { + err = xenbus_probe_backend_unit(nodename, type, dir[i]); + if (err) + break; + } + kfree(dir); + kfree(nodename); + return err; +} + +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + DPRINTK(""); + + dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); +} + +static struct xenbus_watch be_watch = { + .node = "backend", + .callback = backend_changed, +}; + +void xenbus_backend_suspend(int (*fn)(struct device *, void *)) +{ + DPRINTK(""); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); +} + +void xenbus_backend_resume(int (*fn)(struct device *, void *)) +{ + DPRINTK(""); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); +} + +void xenbus_backend_probe_and_watch(void) +{ + xenbus_probe_devices(&xenbus_backend); + register_xenbus_watch(&be_watch); +} + +void xenbus_backend_bus_register(void) +{ + bus_register(&xenbus_backend.bus); +} + +void xenbus_backend_device_register(void) +{ + device_register(&xenbus_backend.dev); +} diff --git a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c index 190fa1e794..1c1fc576c0 100644 --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c @@ -42,9 +42,15 @@ #include <linux/fcntl.h> #include <linux/kthread.h> #include <linux/rwsem.h> +#include <linux/module.h> +#include <linux/mutex.h> #include <xen/xenbus.h> #include "xenbus_comms.h" +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + /* xenbus_probe.c */ extern char *kasprintf(const char *fmt, ...); diff --git a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/synch_bitops.h b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/synch_bitops.h index 6a4e5e4508..807ca388c5 100644 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/synch_bitops.h +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/synch_bitops.h @@ -9,6 +9,10 @@ #include <linux/config.h> +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define ADDR (*(volatile long *) addr) static __inline__ void synch_set_bit(int nr, volatile void * addr) diff --git a/linux-2.6-xen-sparse/include/xen/xenbus.h b/linux-2.6-xen-sparse/include/xen/xenbus.h index 8e259ce777..c7cb7eaa3a 100644 --- a/linux-2.6-xen-sparse/include/xen/xenbus.h +++ b/linux-2.6-xen-sparse/include/xen/xenbus.h @@ -38,6 +38,7 @@ #include <linux/notifier.h> #include <linux/mutex.h> #include <linux/completion.h> +#include <linux/init.h> #include <xen/interface/xen.h> #include <xen/interface/grant_table.h> #include <xen/interface/io/xenbus.h> diff --git a/tools/blktap/drivers/blktapctrl.c b/tools/blktap/drivers/blktapctrl.c index 1c27e9eae7..0b00bc4bfd 100644 --- a/tools/blktap/drivers/blktapctrl.c +++ b/tools/blktap/drivers/blktapctrl.c @@ -204,81 +204,49 @@ static blkif_t *test_path(char *path, char **dev, int *type) static void add_disktype(blkif_t *blkif, int type) { - driver_list_entry_t *entry, *ptr, *last; + driver_list_entry_t *entry, **pprev; - if (type > MAX_DISK_TYPES) return; + if (type > MAX_DISK_TYPES) + return; entry = malloc(sizeof(driver_list_entry_t)); entry->blkif = blkif; - entry->next = NULL; - ptr = active_disks[type]; + entry->next = NULL; - if (ptr == NULL) { - active_disks[type] = entry; - entry->prev = NULL; - return; - } - - while (ptr != NULL) { - last = ptr; - ptr = ptr->next; - } + pprev = &active_disks[type]; + while (*pprev != NULL) + pprev = &(*pprev)->next; - /*We've found the end of the list*/ - last->next = entry; - entry->prev = last; - - return; + *pprev = entry; + entry->pprev = pprev; } static int del_disktype(blkif_t *blkif) { - driver_list_entry_t *ptr, *cur, *last; + driver_list_entry_t *entry, **pprev; int type = blkif->drivertype, count = 0, close = 0; - if (type > MAX_DISK_TYPES) return 1; - - ptr = active_disks[type]; - last = NULL; - while (ptr != NULL) { - count++; - if (blkif == ptr->blkif) { - cur = ptr; - if (ptr->next != NULL) { - /*There's more later in the chain*/ - if (!last) { - /*We're first in the list*/ - active_disks[type] = ptr->next; - ptr = ptr->next; - ptr->prev = NULL; - } - else { - /*We're sandwiched*/ - last->next = ptr->next; - ptr = ptr->next; - ptr->prev = last; - } - - } else if (last) { - /*There's more earlier in the chain*/ - last->next = NULL; - } else { - /*We're the only entry*/ - active_disks[type] = NULL; - if(dtypes[type]->single_handler == 1) - close = 1; - } - DPRINTF("DEL_DISKTYPE: Freeing entry\n"); - free(cur); - if (dtypes[type]->single_handler == 0) close = 1; + if (type > MAX_DISK_TYPES) + return 1; - return close; - } - last = ptr; - ptr = ptr->next; + pprev = &active_disks[type]; + while ((*pprev != NULL) && ((*pprev)->blkif != blkif)) + pprev = &(*pprev)->next; + + if ((entry = *pprev) == NULL) { + DPRINTF("DEL_DISKTYPE: No match\n"); + return 1; } - DPRINTF("DEL_DISKTYPE: No match\n"); - return 1; + + *pprev = entry->next; + if (entry->next) + entry->next->pprev = pprev; + + DPRINTF("DEL_DISKTYPE: Freeing entry\n"); + free(entry); + + /* Caller should close() if no single controller, or list is empty. */ + return (!dtypes[type]->single_handler || (active_disks[type] == NULL)); } static int write_msg(int fd, int msgtype, void *ptr, void *ptr2) @@ -592,8 +560,8 @@ int unmap_blktapctrl(blkif_t *blkif) if (del_disktype(blkif)) { close(blkif->fds[WRITE]); close(blkif->fds[READ]); - } + return 0; } diff --git a/tools/blktap/drivers/tapdisk.c b/tools/blktap/drivers/tapdisk.c index 7c88027eb3..859687d8b3 100644 --- a/tools/blktap/drivers/tapdisk.c +++ b/tools/blktap/drivers/tapdisk.c @@ -79,31 +79,17 @@ static void unmap_disk(struct td_state *s) { tapdev_info_t *info = s->ring_info; struct tap_disk *drv = s->drv; - fd_list_entry_t *ptr, *prev; + fd_list_entry_t *entry; drv->td_close(s); if (info != NULL && info->mem > 0) munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE); - ptr = s->fd_entry; - prev = ptr->prev; - - if (prev) { - /*There are entries earlier in the list*/ - prev->next = ptr->next; - if (ptr->next) { - ptr = ptr->next; - ptr->prev = prev; - } - } else { - /*We are the first entry in list*/ - if (ptr->next) { - ptr = ptr->next; - fd_start = ptr; - ptr->prev = NULL; - } else fd_start = NULL; - } + entry = s->fd_entry; + *entry->pprev = entry->next; + if (entry->next) + entry->next->pprev = entry->pprev; close(info->fd); @@ -144,35 +130,29 @@ static inline int LOCAL_FD_SET(fd_set *readfds) return 0; } -static inline fd_list_entry_t *add_fd_entry(int tap_fd, int io_fd[MAX_IOFD], struct td_state *s) +static inline fd_list_entry_t *add_fd_entry( + int tap_fd, int io_fd[MAX_IOFD], struct td_state *s) { - fd_list_entry_t *ptr, *last, *entry; + fd_list_entry_t **pprev, *entry; int i; + DPRINTF("Adding fd_list_entry\n"); /*Add to linked list*/ s->fd_entry = entry = malloc(sizeof(fd_list_entry_t)); entry->tap_fd = tap_fd; - for (i = 0; i < MAX_IOFD; i++) entry->io_fd[i] = io_fd[i]; + for (i = 0; i < MAX_IOFD; i++) + entry->io_fd[i] = io_fd[i]; entry->s = s; entry->next = NULL; - ptr = fd_start; - if (ptr == NULL) { - /*We are the first entry*/ - fd_start = entry; - entry->prev = NULL; - goto finish; - } + pprev = &fd_start; + while (*pprev != NULL) + pprev = &(*pprev)->next; - while (ptr != NULL) { - last = ptr; - ptr = ptr->next; - } - last->next = entry; - entry->prev = last; + *pprev = entry; + entry->pprev = pprev; - finish: return entry; } diff --git a/tools/blktap/drivers/tapdisk.h b/tools/blktap/drivers/tapdisk.h index 1f03156456..238350016b 100644 --- a/tools/blktap/drivers/tapdisk.h +++ b/tools/blktap/drivers/tapdisk.h @@ -191,9 +191,8 @@ static disk_info_t *dtypes[] = { }; typedef struct driver_list_entry { - void *blkif; - void *prev; - void *next; + struct blkif *blkif; + struct driver_list_entry **pprev, *next; } driver_list_entry_t; typedef struct fd_list_entry { @@ -201,8 +200,7 @@ typedef struct fd_list_entry { int tap_fd; int io_fd[MAX_IOFD]; struct td_state *s; - void *prev; - void *next; + struct fd_list_entry **pprev, *next; } fd_list_entry_t; int qcow_create(const char *filename, uint64_t total_size, diff --git a/tools/firmware/acpi/acpi_fadt.h b/tools/firmware/acpi/acpi_fadt.h index d1ecea5588..f30a1dac98 100644 --- a/tools/firmware/acpi/acpi_fadt.h +++ b/tools/firmware/acpi/acpi_fadt.h @@ -18,6 +18,8 @@ #ifndef _FADT_H_ #define _FADT_H_ +#include <xen/hvm/ioreq.h> + // // FADT Definitions, see ACPI 2.0 specification for details. // @@ -51,7 +53,9 @@ // // Fixed Feature Flags // -#define ACPI_FIXED_FEATURE_FLAGS (ACPI_PROC_C1|ACPI_SLP_BUTTON|ACPI_WBINVD|ACPI_PWR_BUTTON|ACPI_FIX_RTC) +#define ACPI_FIXED_FEATURE_FLAGS (ACPI_PROC_C1 | ACPI_SLP_BUTTON | \ + ACPI_WBINVD | ACPI_PWR_BUTTON | \ + ACPI_FIX_RTC | ACPI_TMR_VAL_EXT) // // PM1A Event Register Block Generic Address Information @@ -59,7 +63,6 @@ #define ACPI_PM1A_EVT_BLK_ADDRESS_SPACE_ID ACPI_SYSTEM_IO #define ACPI_PM1A_EVT_BLK_BIT_WIDTH 0x20 #define ACPI_PM1A_EVT_BLK_BIT_OFFSET 0x00 -#define ACPI_PM1A_EVT_BLK_ADDRESS 0x000000000000c010 // // PM1B Event Register Block Generic Address Information @@ -75,7 +78,6 @@ #define ACPI_PM1A_CNT_BLK_ADDRESS_SPACE_ID ACPI_SYSTEM_IO #define ACPI_PM1A_CNT_BLK_BIT_WIDTH 0x10 #define ACPI_PM1A_CNT_BLK_BIT_OFFSET 0x00 -#define ACPI_PM1A_CNT_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04) // // PM1B Control Register Block Generic Address Information @@ -100,7 +102,6 @@ #define ACPI_PM_TMR_BLK_ADDRESS_SPACE_ID ACPI_SYSTEM_IO #define ACPI_PM_TMR_BLK_BIT_WIDTH 0x20 #define ACPI_PM_TMR_BLK_BIT_OFFSET 0x00 -#define ACPI_PM_TMR_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08) // // General Purpose Event 0 Register Block Generic Address diff --git a/tools/ioemu/vl.c b/tools/ioemu/vl.c index 185547743a..e331abd1ae 100644 --- a/tools/ioemu/vl.c +++ b/tools/ioemu/vl.c @@ -6448,7 +6448,6 @@ int main(int argc, char **argv) fprintf(logfile, "shared page at pfn:%lx, mfn: %"PRIx64"\n", shared_page_nr, (uint64_t)(page_array[shared_page_nr])); - /* not yet add for IA64 */ buffered_io_page = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, PROT_READ|PROT_WRITE, page_array[shared_page_nr - 2]); @@ -6465,7 +6464,7 @@ int main(int argc, char **argv) #elif defined(__ia64__) if (xc_ia64_get_pfn_list(xc_handle, domid, page_array, - IO_PAGE_START >> PAGE_SHIFT, 1) != 1) { + IO_PAGE_START >> PAGE_SHIFT, 3) != 3) { fprintf(logfile, "xc_ia64_get_pfn_list returned error %d\n", errno); exit(-1); } @@ -6477,6 +6476,12 @@ int main(int argc, char **argv) fprintf(logfile, "shared page at pfn:%lx, mfn: %016lx\n", IO_PAGE_START >> PAGE_SHIFT, page_array[0]); + buffered_io_page =xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, + PROT_READ|PROT_WRITE, + page_array[2]); + fprintf(logfile, "Buffered IO page at pfn:%lx, mfn: %016lx\n", + BUFFER_IO_PAGE_START >> PAGE_SHIFT, page_array[2]); + if (xc_ia64_get_pfn_list(xc_handle, domid, page_array, 0, nr_pages) != nr_pages) { fprintf(logfile, "xc_ia64_get_pfn_list returned error %d\n", errno); @@ -6496,6 +6501,7 @@ int main(int argc, char **argv) fprintf(logfile, "xc_map_foreign_batch returned error %d\n", errno); exit(-1); } + free(page_array); #endif #else /* !CONFIG_DM */ diff --git a/tools/ioemu/vnc.c b/tools/ioemu/vnc.c index 9b8bcffa37..631754ca03 100644 --- a/tools/ioemu/vnc.c +++ b/tools/ioemu/vnc.c @@ -203,6 +203,8 @@ static void set_bits_in_row(VncState *vs, uint64_t *row, mask = ~(0ULL); h += y; + if (h > vs->ds->height) + h = vs->ds->height; for (; y < h; y++) row[y] |= mask; } diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile index b5e61af64d..129b867ff6 100644 --- a/tools/libxc/Makefile +++ b/tools/libxc/Makefile @@ -31,7 +31,7 @@ GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c -include $(XEN_TARGET_ARCH)/Makefile -CFLAGS += -Werror +CFLAGS += -Werror -Wmissing-prototypes CFLAGS += -fno-strict-aliasing CFLAGS += $(INCLUDES) -I. diff --git a/tools/libxc/ia64/xc_ia64_hvm_build.c b/tools/libxc/ia64/xc_ia64_hvm_build.c index 2c34b44a1d..0caaf343b3 100644 --- a/tools/libxc/ia64/xc_ia64_hvm_build.c +++ b/tools/libxc/ia64/xc_ia64_hvm_build.c @@ -551,8 +551,9 @@ setup_guest(int xc_handle, uint32_t dom, unsigned long memsize, char *image, unsigned long image_size, uint32_t vcpus, unsigned int store_evtchn, unsigned long *store_mfn) { - unsigned long page_array[2]; + unsigned long page_array[3]; shared_iopage_t *sp; + void *ioreq_buffer_page; unsigned long dom_memsize = (memsize << 20); DECLARE_DOMCTL; @@ -587,7 +588,7 @@ setup_guest(int xc_handle, uint32_t dom, unsigned long memsize, /* Retrieve special pages like io, xenstore, etc. */ if (xc_ia64_get_pfn_list(xc_handle, dom, page_array, - IO_PAGE_START>>PAGE_SHIFT, 2) != 2) { + IO_PAGE_START>>PAGE_SHIFT, 3) != 3) { PERROR("Could not get the page frame list"); goto error_out; } @@ -604,7 +605,10 @@ setup_guest(int xc_handle, uint32_t dom, unsigned long memsize, memset(sp, 0, PAGE_SIZE); munmap(sp, PAGE_SIZE); - + ioreq_buffer_page = xc_map_foreign_range(xc_handle, dom, + PAGE_SIZE, PROT_READ|PROT_WRITE, page_array[2]); + memset(ioreq_buffer_page,0,PAGE_SIZE); + munmap(ioreq_buffer_page, PAGE_SIZE); return 0; error_out: diff --git a/tools/libxc/xc_linux_build.c b/tools/libxc/xc_linux_build.c index 822e55601b..e215d7e198 100644 --- a/tools/libxc/xc_linux_build.c +++ b/tools/libxc/xc_linux_build.c @@ -128,7 +128,7 @@ static int probeimageformat(const char *image, return 0; } -int load_initrd(int xc_handle, domid_t dom, +static int load_initrd(int xc_handle, domid_t dom, struct initrd_info *initrd, unsigned long physbase, xen_pfn_t *phys_to_mach) diff --git a/tools/libxc/xc_linux_restore.c b/tools/libxc/xc_linux_restore.c index 6e323340e3..e4bd09ed19 100644 --- a/tools/libxc/xc_linux_restore.c +++ b/tools/libxc/xc_linux_restore.c @@ -57,7 +57,7 @@ read_exact(int fd, void *buf, size_t count) ** This function inverts that operation, replacing the pfn values with ** the (now known) appropriate mfn values. */ -int uncanonicalize_pagetable(unsigned long type, void *page) +static int uncanonicalize_pagetable(unsigned long type, void *page) { int i, pte_last; unsigned long pfn; diff --git a/tools/libxc/xc_linux_save.c b/tools/libxc/xc_linux_save.c index d955072726..7a5e4eaad6 100644 --- a/tools/libxc/xc_linux_save.c +++ b/tools/libxc/xc_linux_save.c @@ -413,7 +413,7 @@ static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd, ** which entries do not require canonicalization (in particular, those ** entries which map the virtual address reserved for the hypervisor). */ -int canonicalize_pagetable(unsigned long type, unsigned long pfn, +static int canonicalize_pagetable(unsigned long type, unsigned long pfn, const void *spage, void *dpage) { diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c index aea9cd78d8..768cf5c5cf 100644 --- a/tools/libxc/xc_private.c +++ b/tools/libxc/xc_private.c @@ -6,6 +6,7 @@ #include <inttypes.h> #include "xc_private.h" +#include "xg_private.h" int lock_pages(void *addr, size_t len) { @@ -35,23 +36,6 @@ int xc_get_pfn_type_batch(int xc_handle, return do_domctl(xc_handle, &domctl); } -#define GETPFN_ERR (~0U) -unsigned int get_pfn_type(int xc_handle, - unsigned long mfn, - uint32_t dom) -{ - DECLARE_DOMCTL; - domctl.cmd = XEN_DOMCTL_getpageframeinfo; - domctl.u.getpageframeinfo.gmfn = mfn; - domctl.domain = (domid_t)dom; - if ( do_domctl(xc_handle, &domctl) < 0 ) - { - PERROR("Unexpected failure when getting page frame info!"); - return GETPFN_ERR; - } - return domctl.u.getpageframeinfo.type; -} - int xc_mmuext_op( int xc_handle, struct mmuext_op *op, diff --git a/tools/python/xen/xend/image.py b/tools/python/xen/xend/image.py index 88a8a169bc..8a761d89cf 100644 --- a/tools/python/xen/xend/image.py +++ b/tools/python/xen/xend/image.py @@ -471,7 +471,7 @@ class IA64_HVM_ImageHandler(HVMImageHandler): def getRequiredAvailableMemory(self, mem_kb): page_kb = 16 # ROM size for guest firmware, ioreq page and xenstore page - extra_pages = 1024 + 2 + extra_pages = 1024 + 3 return mem_kb + extra_pages * page_kb def getRequiredShadowMemory(self, shadow_mem_kb, maxmem_kb): @@ -500,9 +500,12 @@ class X86_HVM_ImageHandler(HVMImageHandler): # overhead due to getRequiredInitialReservation. maxmem_kb = self.getRequiredInitialReservation(maxmem_kb) - # 1MB per vcpu plus 4Kib/Mib of RAM. This is higher than - # the minimum that Xen would allocate if no value were given. - return max(1024 * self.vm.getVCpuCount() + maxmem_kb / 256, + # 256 pages (1MB) per vcpu, + # plus 1 page per MiB of RAM for the P2M map, + # plus 1 page per MiB of RAM to shadow the resident processes. + # This is higher than the minimum that Xen would allocate if no value + # were given (but the Xen minimum is for safety, not performance). + return max(4 * (256 * self.vm.getVCpuCount() + 2 * (maxmem_kb / 1024)), shadow_mem_kb) diff --git a/tools/python/xen/xend/server/SrvDaemon.py b/tools/python/xen/xend/server/SrvDaemon.py index f883e7da85..baba3c437d 100644 --- a/tools/python/xen/xend/server/SrvDaemon.py +++ b/tools/python/xen/xend/server/SrvDaemon.py @@ -9,6 +9,7 @@ import os import signal import sys import threading +import time import linecache import pwd import re @@ -106,12 +107,14 @@ class Daemon: os.close(2) if XEND_DEBUG: os.open('/dev/null', os.O_RDONLY) - os.open(XEND_DEBUG_LOG, os.O_WRONLY|os.O_CREAT) + os.open(XEND_DEBUG_LOG, os.O_WRONLY|os.O_CREAT|os.O_APPEND) os.dup(1) else: os.open('/dev/null', os.O_RDWR) os.dup(0) - os.open(XEND_DEBUG_LOG, os.O_WRONLY|os.O_CREAT) + os.open(XEND_DEBUG_LOG, os.O_WRONLY|os.O_CREAT|os.O_APPEND) + print >>sys.stderr, ("Xend started at %s." % + time.asctime(time.localtime())) def start(self, trace=0): diff --git a/tools/xenstat/xentop/xentop.1 b/tools/xenstat/xentop/xentop.1 index c7a856bed1..b925a3795f 100644 --- a/tools/xenstat/xentop/xentop.1 +++ b/tools/xenstat/xentop/xentop.1 @@ -47,6 +47,9 @@ seconds between updates (default 3) \fB\-n\fR, \fB\-\-networks\fR output network information .TP +\fB\-x\fR, \fB\-\-vbds\fR +output vbd block device data +.TP \fB\-r\fR, \fB\-\-repeat\-header\fR repeat table header before each domain .TP diff --git a/tools/xenstat/xentop/xentop.c b/tools/xenstat/xentop/xentop.c index 7d3ec59d2e..b772f951fb 100644 --- a/tools/xenstat/xentop/xentop.c +++ b/tools/xenstat/xentop/xentop.c @@ -204,7 +204,7 @@ static void usage(const char *program) "-V, --version output version information and exit\n" "-d, --delay=SECONDS seconds between updates (default 3)\n" "-n, --networks output vif network data\n" - "-b, --vbds output vbd block device data\n" + "-x, --vbds output vbd block device data\n" "-r, --repeat-header repeat table header before each domain\n" "-v, --vcpus output vcpu data\n" "-b, --batch output in batch mode, no user input accepted\n" @@ -976,7 +976,7 @@ int main(int argc, char **argv) { "help", no_argument, NULL, 'h' }, { "version", no_argument, NULL, 'V' }, { "networks", no_argument, NULL, 'n' }, - { "vbds", no_argument, NULL, 'x' }, + { "vbds", no_argument, NULL, 'x' }, { "repeat-header", no_argument, NULL, 'r' }, { "vcpus", no_argument, NULL, 'v' }, { "delay", required_argument, NULL, 'd' }, @@ -1065,7 +1065,7 @@ int main(int argc, char **argv) break; } while (1); } - + /* Cleanup occurs in cleanup(), so no work to do here. */ return 0; diff --git a/tools/xm-test/lib/XmTestLib/arch.py b/tools/xm-test/lib/XmTestLib/arch.py index d5a1aa55cb..5625a53546 100644 --- a/tools/xm-test/lib/XmTestLib/arch.py +++ b/tools/xm-test/lib/XmTestLib/arch.py @@ -124,6 +124,7 @@ _uname_to_arch_map = { "i486" : "x86", "i586" : "x86", "i686" : "x86", + "x86_64": "x86_64", "ia64" : "ia64", "ppc" : "powerpc", "ppc64" : "powerpc", @@ -131,7 +132,7 @@ _uname_to_arch_map = { # Lookup current platform. _arch = _uname_to_arch_map.get(os.uname()[4], "Unknown") -if _arch == "x86" or _arch == "ia64": +if _arch == "x86" or _arch == "x86_64" or _arch == "ia64": minSafeMem = ia_minSafeMem getDefaultKernel = ia_getDefaultKernel checkBuffer = ia_checkBuffer diff --git a/unmodified_drivers/linux-2.6/blkfront/Makefile b/unmodified_drivers/linux-2.6/blkfront/Makefile new file mode 100644 index 0000000000..64e7acd194 --- /dev/null +++ b/unmodified_drivers/linux-2.6/blkfront/Makefile @@ -0,0 +1,3 @@ +ifneq ($(KERNELRELEASE),) +include $(src)/Kbuild +endif diff --git a/unmodified_drivers/linux-2.6/compat-include/asm-generic/pgtable-nopmd.h b/unmodified_drivers/linux-2.6/compat-include/asm-generic/pgtable-nopmd.h new file mode 100644 index 0000000000..ebde567575 --- /dev/null +++ b/unmodified_drivers/linux-2.6/compat-include/asm-generic/pgtable-nopmd.h @@ -0,0 +1,14 @@ +#ifndef _PGTABLE_NOPMD_H +#define _PGTABLE_NOPMD_H + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) +#error "This version of Linux should not need compat pgtable-nopmd.h" +#endif + +#define pud_t pgd_t +#define pud_offset(d, va) d +#define pud_none(pud) 0 +#define pud_present(pud) 1 +#define PTRS_PER_PUD 1 + +#endif /* _PGTABLE_NOPMD_H */ diff --git a/unmodified_drivers/linux-2.6/compat-include/asm-generic/pgtable-nopud.h b/unmodified_drivers/linux-2.6/compat-include/asm-generic/pgtable-nopud.h new file mode 100644 index 0000000000..8b23299dd0 --- /dev/null +++ b/unmodified_drivers/linux-2.6/compat-include/asm-generic/pgtable-nopud.h @@ -0,0 +1,14 @@ +#ifndef _PGTABLE_NOPUD_H +#define _PGTABLE_NOPUD_H + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) +#error "This version of Linux should not need compat pgtable-nopud.h" +#endif + +#define pud_t pgd_t +#define pud_offset(d, va) d +#define pud_none(pud) 0 +#define pud_present(pud) 1 +#define PTRS_PER_PUD 1 + +#endif /* _PGTABLE_NOPUD_H */ diff --git a/unmodified_drivers/linux-2.6/compat-include/linux/io.h b/unmodified_drivers/linux-2.6/compat-include/linux/io.h new file mode 100644 index 0000000000..10499023a5 --- /dev/null +++ b/unmodified_drivers/linux-2.6/compat-include/linux/io.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_IO_H +#define _LINUX_IO_H + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +#error "This version of Linux should not need compat linux/io.h" +#endif + +#include <asm/io.h> + +#endif diff --git a/unmodified_drivers/linux-2.6/compat-include/linux/mutex.h b/unmodified_drivers/linux-2.6/compat-include/linux/mutex.h new file mode 100644 index 0000000000..fcb4a899c7 --- /dev/null +++ b/unmodified_drivers/linux-2.6/compat-include/linux/mutex.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2006 Cisco Systems. All rights reserved. + * + * This file is released under the GPLv2. + */ + +/* mutex compatibility for pre-2.6.16 kernels */ + +#ifndef __LINUX_MUTEX_H +#define __LINUX_MUTEX_H + +#include <linux/version.h> + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +#error "This version of Linux should not need compat mutex.h" +#endif + +#include <linux/version.h> +#include <asm/semaphore.h> + +#define mutex semaphore +#define DEFINE_MUTEX(foo) DECLARE_MUTEX(foo) +#define mutex_init(foo) init_MUTEX(foo) +#define mutex_lock(foo) down(foo) +#define mutex_lock_interruptible(foo) down_interruptible(foo) +/* this function follows the spin_trylock() convention, so * + * it is negated to the down_trylock() return values! Be careful */ +#define mutex_trylock(foo) !down_trylock(foo) +#define mutex_unlock(foo) up(foo) + +#endif /* __LINUX_MUTEX_H */ diff --git a/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h b/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h new file mode 100644 index 0000000000..4978c63610 --- /dev/null +++ b/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h @@ -0,0 +1,52 @@ +#ifndef COMPAT_INCLUDE_XEN_PLATFORM_COMPAT_H +#define COMPAT_INCLUDE_XEN_PLATFORM_COMPAT_H + +#include <linux/version.h> + +#include <linux/spinlock.h> + +#if defined(__LINUX_COMPILER_H) && !defined(__always_inline) +#define __always_inline inline +#endif + +#if defined(__LINUX_SPINLOCK_H) && !defined(DEFINE_SPINLOCK) +#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED +#endif + +#if defined(_LINUX_INIT_H) && !defined(__init) +#define __init +#endif + +#if defined(__LINUX_CACHE_H) && !defined(__read_mostly) +#define __read_mostly +#endif + +#if defined(_LINUX_SKBUFF_H) && !defined(NET_IP_ALIGN) +#define NET_IP_ALIGN 0 +#endif + +#if defined(_LINUX_FS_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) +#define nonseekable_open(inode, filp) /* Nothing to do */ +#endif + +#if defined(_LINUX_MM_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) +unsigned long vmalloc_to_pfn(void *addr); +#endif + +#if defined(__LINUX_COMPLETION_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) +unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); +#endif + +#if defined(_LINUX_SCHED_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) +signed long schedule_timeout_interruptible(signed long timeout); +#endif + +#if defined(_LINUX_SLAB_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) +void *kzalloc(size_t size, int flags); +#endif + +#if defined(_LINUX_BLKDEV_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) +#define end_that_request_last(req, uptodate) end_that_request_last(req) +#endif + +#endif diff --git a/unmodified_drivers/linux-2.6/netfront/Makefile b/unmodified_drivers/linux-2.6/netfront/Makefile new file mode 100644 index 0000000000..64e7acd194 --- /dev/null +++ b/unmodified_drivers/linux-2.6/netfront/Makefile @@ -0,0 +1,3 @@ +ifneq ($(KERNELRELEASE),) +include $(src)/Kbuild +endif diff --git a/unmodified_drivers/linux-2.6/overrides.mk b/unmodified_drivers/linux-2.6/overrides.mk index 74ef12c4c9..53a96d87a4 100644 --- a/unmodified_drivers/linux-2.6/overrides.mk +++ b/unmodified_drivers/linux-2.6/overrides.mk @@ -9,4 +9,4 @@ EXTRA_CFLAGS += -DCONFIG_XEN_SHADOW_MODE -DCONFIG_XEN_SHADOW_TRANSLATE EXTRA_CFLAGS += -DCONFIG_XEN_BLKDEV_GRANT -DXEN_EVTCHN_MASK_OPS EXTRA_CFLAGS += -DCONFIG_XEN_NETDEV_GRANT_RX -DCONFIG_XEN_NETDEV_GRANT_TX EXTRA_CFLAGS += -D__XEN_INTERFACE_VERSION__=0x00030202 -EXTRA_CFLAGS += -I$(M)/include +EXTRA_CFLAGS += -I$(M)/include -I$(M)/compat-include -DHAVE_XEN_PLATFORM_COMPAT_H diff --git a/unmodified_drivers/linux-2.6/platform-pci/Kbuild b/unmodified_drivers/linux-2.6/platform-pci/Kbuild index dda3d0e7cf..a44e50e94c 100644 --- a/unmodified_drivers/linux-2.6/platform-pci/Kbuild +++ b/unmodified_drivers/linux-2.6/platform-pci/Kbuild @@ -4,7 +4,7 @@ obj-m := xen-platform-pci.o EXTRA_CFLAGS += -I$(M)/platform-pci -xen-platform-pci-objs := evtchn.o platform-pci.o gnttab.o xen_support.o features.o +xen-platform-pci-objs := evtchn.o platform-pci.o gnttab.o xen_support.o features.o platform-compat.o # Can we do better ? ifeq ($(ARCH),ia64) diff --git a/unmodified_drivers/linux-2.6/platform-pci/Makefile b/unmodified_drivers/linux-2.6/platform-pci/Makefile new file mode 100644 index 0000000000..64e7acd194 --- /dev/null +++ b/unmodified_drivers/linux-2.6/platform-pci/Makefile @@ -0,0 +1,3 @@ +ifneq ($(KERNELRELEASE),) +include $(src)/Kbuild +endif diff --git a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c index a38c50c1c4..4bd9592754 100644 --- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c +++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c @@ -36,6 +36,10 @@ #include <xen/features.h> #include "platform-pci.h" +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + void *shared_info_area; #define MAX_EVTCHN 256 diff --git a/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c new file mode 100644 index 0000000000..f3cef11620 --- /dev/null +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c @@ -0,0 +1,116 @@ +#include <linux/config.h> +#include <linux/version.h> + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <xen/platform-compat.h> + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) +static int system_state = 1; +EXPORT_SYMBOL(system_state); +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) +size_t strcspn(const char *s, const char *reject) +{ + const char *p; + const char *r; + size_t count = 0; + + for (p = s; *p != '\0'; ++p) { + for (r = reject; *r != '\0'; ++r) { + if (*p == *r) + return count; + } + ++count; + } + + return count; +} +EXPORT_SYMBOL(strcspn); +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) +/* + * Map a vmalloc()-space virtual address to the physical page frame number. + */ +unsigned long vmalloc_to_pfn(void * vmalloc_addr) +{ + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); +} +EXPORT_SYMBOL(vmalloc_to_pfn); +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) +unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&x->wait.lock); + if (!timeout) { + __remove_wait_queue(&x->wait, &wait); + goto out; + } + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_timeout); +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) +/* + fake do_exit using complete_and_exit + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) +asmlinkage NORET_TYPE void do_exit(long code) +#else +fastcall NORET_TYPE void do_exit(long code) +#endif +{ + complete_and_exit(NULL, code); +} +EXPORT_SYMBOL_GPL(do_exit); +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) +signed long schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) +/** + * kzalloc - allocate memory. The memory is set to zero. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + */ +void *kzalloc(size_t size, int flags) +{ + void *ret = kmalloc(size, flags); + if (ret) + memset(ret, 0, size); + return ret; +} +EXPORT_SYMBOL(kzalloc); +#endif diff --git a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c index cb9e8dd7e5..5ff6ba83f7 100644 --- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c @@ -33,6 +33,7 @@ #include <asm/irq.h> #include <asm/uaccess.h> #include <asm/hypervisor.h> +#include <asm/pgtable.h> #include <xen/interface/memory.h> #include <xen/features.h> #ifdef __ia64__ @@ -41,6 +42,10 @@ #include "platform-pci.h" +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + #define DRV_NAME "xen-platform-pci" #define DRV_VERSION "0.10" #define DRV_RELDATE "03/03/2005" diff --git a/unmodified_drivers/linux-2.6/platform-pci/xen_support.c b/unmodified_drivers/linux-2.6/platform-pci/xen_support.c index b1a903b1c7..423d2f2e24 100644 --- a/unmodified_drivers/linux-2.6/platform-pci/xen_support.c +++ b/unmodified_drivers/linux-2.6/platform-pci/xen_support.c @@ -26,6 +26,10 @@ #include <asm/hypervisor.h> #include "platform-pci.h" +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + void xen_machphys_update(unsigned long mfn, unsigned long pfn) { BUG(); diff --git a/unmodified_drivers/linux-2.6/xenbus/Makefile b/unmodified_drivers/linux-2.6/xenbus/Makefile new file mode 100644 index 0000000000..64e7acd194 --- /dev/null +++ b/unmodified_drivers/linux-2.6/xenbus/Makefile @@ -0,0 +1,3 @@ +ifneq ($(KERNELRELEASE),) +include $(src)/Kbuild +endif diff --git a/xen/arch/ia64/vmx/mmio.c b/xen/arch/ia64/vmx/mmio.c index 579785b563..d605e828f0 100644 --- a/xen/arch/ia64/vmx/mmio.c +++ b/xen/arch/ia64/vmx/mmio.c @@ -52,6 +52,70 @@ struct mmio_list *lookup_mmio(u64 gpa, struct mmio_list *mio_base) #define PIB_OFST_INTA 0x1E0000 #define PIB_OFST_XTP 0x1E0008 +#define HVM_BUFFERED_IO_RANGE_NR 1 + +struct hvm_buffered_io_range { + unsigned long start_addr; + unsigned long length; +}; + +static struct hvm_buffered_io_range buffered_stdvga_range = {0xA0000, 0x20000}; +static struct hvm_buffered_io_range +*hvm_buffered_io_ranges[HVM_BUFFERED_IO_RANGE_NR] = +{ + &buffered_stdvga_range +}; + +int hvm_buffered_io_intercept(ioreq_t *p) +{ + struct vcpu *v = current; + spinlock_t *buffered_io_lock; + buffered_iopage_t *buffered_iopage = + (buffered_iopage_t *)(v->domain->arch.hvm_domain.buffered_io_va); + unsigned long tmp_write_pointer = 0; + int i; + + /* ignore READ ioreq_t! */ + if ( p->dir == IOREQ_READ ) + return 0; + + for ( i = 0; i < HVM_BUFFERED_IO_RANGE_NR; i++ ) { + if ( p->addr >= hvm_buffered_io_ranges[i]->start_addr && + p->addr + p->size - 1 < hvm_buffered_io_ranges[i]->start_addr + + hvm_buffered_io_ranges[i]->length ) + break; + } + + if ( i == HVM_BUFFERED_IO_RANGE_NR ) + return 0; + + buffered_io_lock = &v->domain->arch.hvm_domain.buffered_io_lock; + spin_lock(buffered_io_lock); + + if ( buffered_iopage->write_pointer - buffered_iopage->read_pointer == + (unsigned long)IOREQ_BUFFER_SLOT_NUM ) { + /* the queue is full. + * send the iopacket through the normal path. + * NOTE: The arithimetic operation could handle the situation for + * write_pointer overflow. + */ + spin_unlock(buffered_io_lock); + return 0; + } + + tmp_write_pointer = buffered_iopage->write_pointer % IOREQ_BUFFER_SLOT_NUM; + + memcpy(&buffered_iopage->ioreq[tmp_write_pointer], p, sizeof(ioreq_t)); + + /*make the ioreq_t visible before write_pointer*/ + wmb(); + buffered_iopage->write_pointer++; + + spin_unlock(buffered_io_lock); + + return 1; +} + static void write_ipi (VCPU *vcpu, uint64_t addr, uint64_t value); static void pib_write(VCPU *vcpu, void *src, uint64_t pib_off, size_t s, int ma) @@ -156,7 +220,11 @@ static void low_mmio_access(VCPU *vcpu, u64 pa, u64 *val, size_t s, int dir) p->df = 0; p->io_count++; - + if(hvm_buffered_io_intercept(p)){ + p->state = STATE_IORESP_READY; + vmx_io_assist(v); + return ; + }else vmx_send_assist_req(v); if(dir==IOREQ_READ){ //read *val=p->u.data; diff --git a/xen/arch/ia64/vmx/vmx_init.c b/xen/arch/ia64/vmx/vmx_init.c index 2694149d5b..9d8fbe8ec8 100644 --- a/xen/arch/ia64/vmx/vmx_init.c +++ b/xen/arch/ia64/vmx/vmx_init.c @@ -362,8 +362,8 @@ static const io_range_t io_ranges[] = { {PIB_START, PIB_SIZE, GPFN_PIB}, }; -/* Reseve 1 page for shared I/O and 1 page for xenstore. */ -#define VMX_SYS_PAGES (2 + (GFW_SIZE >> PAGE_SHIFT)) +/* Reseve 1 page for shared I/O ,1 page for xenstore and 1 page for buffer I/O. */ +#define VMX_SYS_PAGES (3 + (GFW_SIZE >> PAGE_SHIFT)) #define VMX_CONFIG_PAGES(d) ((d)->max_pages - VMX_SYS_PAGES) static void vmx_build_physmap_table(struct domain *d) @@ -425,8 +425,12 @@ static void vmx_build_physmap_table(struct domain *d) mfn = page_to_mfn(list_entry(list_ent, struct page_info, list)); assign_domain_page(d, STORE_PAGE_START, mfn << PAGE_SHIFT); list_ent = mfn_to_page(mfn)->list.next; + ASSERT(list_ent != &d->page_list); + + mfn = page_to_mfn(list_entry(list_ent, struct page_info, list)); + assign_domain_page(d, BUFFER_IO_PAGE_START, mfn << PAGE_SHIFT); + list_ent = mfn_to_page(mfn)->list.next; ASSERT(list_ent == &d->page_list); - } void vmx_setup_platform(struct domain *d) @@ -437,6 +441,10 @@ void vmx_setup_platform(struct domain *d) d->arch.vmx_platform.shared_page_va = (unsigned long)__va(__gpa_to_mpa(d, IO_PAGE_START)); + //For buffered IO requests. + spin_lock_init(&d->arch.hvm_domain.buffered_io_lock); + d->arch.hvm_domain.buffered_io_va = + (unsigned long)__va(__gpa_to_mpa(d, BUFFER_IO_PAGE_START)); /* TEMP */ d->arch.vmx_platform.pib_base = 0xfee00000UL; diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile index 31f2793fb9..89cc508d02 100644 --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -28,12 +28,14 @@ obj-y += microcode.o obj-y += mm.o obj-y += mpparse.o obj-y += nmi.o +obj-y += numa.o obj-y += physdev.o obj-y += rwlock.o obj-y += setup.o obj-y += shutdown.o obj-y += smp.o obj-y += smpboot.o +obj-y += srat.o obj-y += string.o obj-y += sysctl.o obj-y += time.o diff --git a/xen/arch/x86/hvm/Makefile b/xen/arch/x86/hvm/Makefile index 37623ff5eb..843a9232bf 100644 --- a/xen/arch/x86/hvm/Makefile +++ b/xen/arch/x86/hvm/Makefile @@ -5,6 +5,7 @@ obj-y += hvm.o obj-y += i8254.o obj-y += i8259.o obj-y += rtc.o +obj-y += pmtimer.o obj-y += instrlen.o obj-y += intercept.o obj-y += io.o diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index 47d7ca46c4..f950d05295 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -43,7 +43,7 @@ #include <asm/mc146818rtc.h> #include <asm/spinlock.h> #include <asm/hvm/hvm.h> -#include <asm/hvm/vpit.h> +#include <asm/hvm/vpt.h> #include <asm/hvm/support.h> #include <public/sched.h> #include <public/hvm/ioreq.h> @@ -285,6 +285,7 @@ void hvm_setup_platform(struct domain* d) pt_timer_fn, v, v->processor); pit_init(v, cpu_khz); rtc_init(v, RTC_PORT(0), RTC_IRQ); + pmtimer_init(v, ACPI_PM_TMR_BLK_ADDRESS); } void pic_irq_request(void *data, int level) diff --git a/xen/arch/x86/hvm/i8254.c b/xen/arch/x86/hvm/i8254.c index 5f27ee25b2..464ddee8f9 100644 --- a/xen/arch/x86/hvm/i8254.c +++ b/xen/arch/x86/hvm/i8254.c @@ -38,7 +38,7 @@ #include <asm/hvm/hvm.h> #include <asm/hvm/io.h> #include <asm/hvm/support.h> -#include <asm/hvm/vpit.h> +#include <asm/hvm/vpt.h> #include <asm/current.h> /* Enable DEBUG_PIT may cause guest calibration inaccuracy */ diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c index 8993572e10..a1ce8ddf33 100644 --- a/xen/arch/x86/hvm/io.c +++ b/xen/arch/x86/hvm/io.c @@ -35,7 +35,7 @@ #include <asm/shadow.h> #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> -#include <asm/hvm/vpit.h> +#include <asm/hvm/vpt.h> #include <asm/hvm/vpic.h> #include <asm/hvm/vlapic.h> diff --git a/xen/arch/x86/hvm/pmtimer.c b/xen/arch/x86/hvm/pmtimer.c new file mode 100644 index 0000000000..e0c93536ea --- /dev/null +++ b/xen/arch/x86/hvm/pmtimer.c @@ -0,0 +1,63 @@ +#include <asm/hvm/vpt.h> +#include <asm/hvm/io.h> +#include <asm/hvm/support.h> + +#define TMR_STS (1 << 0) +static void pmt_update_status(void *opaque) +{ + PMTState *s = opaque; + s->pm1_status |= TMR_STS; + + /* TODO: When TMR_EN == 1, generate a SCI event */ + + set_timer(&s->timer, NOW() + (1000000000ULL << 31) / FREQUENCE_PMTIMER); +} + +static int handle_pmt_io(ioreq_t *p) +{ + struct vcpu *v = current; + PMTState *s = &v->domain->arch.hvm_domain.pl_time.vpmt; + uint64_t curr_gtime; + + if (p->size != 4 || + p->pdata_valid || + p->type != IOREQ_TYPE_PIO){ + printk("HVM_PMT: wrong PM timer IO\n"); + return 1; + } + + if (p->dir == 0) { /* write */ + /* PM_TMR_BLK is read-only */ + return 1; + } else if (p->dir == 1) { /* read */ + curr_gtime = hvm_get_guest_time(s->vcpu); + s->pm1_timer += ((curr_gtime - s->last_gtime) * s->scale) >> 32; + p->u.data = s->pm1_timer; + s->last_gtime = curr_gtime; + return 1; + } + return 0; +} + +void pmtimer_init(struct vcpu *v, int base) +{ + PMTState *s = &v->domain->arch.hvm_domain.pl_time.vpmt; + + s->pm1_timer = 0; + s->pm1_status = 0; + s->scale = ((uint64_t)FREQUENCE_PMTIMER << 32) / ticks_per_sec(v); + s->vcpu = v; + + init_timer(&s->timer, pmt_update_status, s, v->processor); + /* ACPI supports a 32-bit power management timer */ + set_timer(&s->timer, NOW() + (1000000000ULL << 31) / FREQUENCE_PMTIMER); + + register_portio_handler(base, 4, handle_pmt_io); +} + +void pmtimer_deinit(struct domain *d) +{ + PMTState *s = &d->arch.hvm_domain.pl_time.vpmt; + + kill_timer(&s->timer); +} diff --git a/xen/arch/x86/hvm/rtc.c b/xen/arch/x86/hvm/rtc.c index 210168c26b..0f5e11986e 100644 --- a/xen/arch/x86/hvm/rtc.c +++ b/xen/arch/x86/hvm/rtc.c @@ -23,7 +23,7 @@ */ #include <asm/mc146818rtc.h> -#include <asm/hvm/vpit.h> +#include <asm/hvm/vpt.h> #include <asm/hvm/io.h> #include <asm/hvm/support.h> #include <asm/current.h> diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c index 92167d74fb..88c0802425 100644 --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -922,6 +922,7 @@ static void svm_relinquish_guest_resources(struct domain *d) kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer); rtc_deinit(d); + pmtimer_deinit(d); if ( d->arch.hvm_domain.shared_page_va ) unmap_domain_page_global( @@ -937,6 +938,7 @@ static void svm_migrate_timers(struct vcpu *v) struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm); struct RTCState *vrtc = &v->domain->arch.hvm_domain.pl_time.vrtc; + struct PMTState *vpmt = &v->domain->arch.hvm_domain.pl_time.vpmt; if ( pt->enabled ) { @@ -947,6 +949,7 @@ static void svm_migrate_timers(struct vcpu *v) migrate_timer(&VLAPIC(v)->vlapic_timer, v->processor); migrate_timer(&vrtc->second_timer, v->processor); migrate_timer(&vrtc->second_timer2, v->processor); + migrate_timer(&vpmt->timer, v->processor); } diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 476f8beae9..ac1be73556 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -147,6 +147,7 @@ static void vmx_relinquish_guest_resources(struct domain *d) kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer); rtc_deinit(d); + pmtimer_deinit(d); if ( d->arch.hvm_domain.shared_page_va ) unmap_domain_page_global( @@ -489,6 +490,7 @@ void vmx_migrate_timers(struct vcpu *v) { struct periodic_time *pt = &(v->domain->arch.hvm_domain.pl_time.periodic_tm); struct RTCState *vrtc = &v->domain->arch.hvm_domain.pl_time.vrtc; + struct PMTState *vpmt = &v->domain->arch.hvm_domain.pl_time.vpmt; if ( pt->enabled ) { @@ -499,6 +501,7 @@ void vmx_migrate_timers(struct vcpu *v) migrate_timer(&VLAPIC(v)->vlapic_timer, v->processor); migrate_timer(&vrtc->second_timer, v->processor); migrate_timer(&vrtc->second_timer2, v->processor); + migrate_timer(&vpmt->timer, v->processor); } static void vmx_store_cpu_guest_regs( diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c new file mode 100644 index 0000000000..d332320af6 --- /dev/null +++ b/xen/arch/x86/numa.c @@ -0,0 +1,308 @@ +/* + * Generic VM initialization for x86-64 NUMA setups. + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com> + */ + +#include <xen/mm.h> +#include <xen/string.h> +#include <xen/init.h> +#include <xen/ctype.h> +#include <xen/nodemask.h> +#include <xen/numa.h> +#include <xen/keyhandler.h> +#include <xen/time.h> +#include <xen/smp.h> +#include <asm/acpi.h> + +static int numa_setup(char *s); +custom_param("numa", numa_setup); + +#ifndef Dprintk +#define Dprintk(x...) +#endif + +/* from proto.h */ +#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1))) + +struct node_data node_data[MAX_NUMNODES]; + +int memnode_shift; +u8 memnodemap[NODEMAPSIZE]; + +unsigned char cpu_to_node[NR_CPUS] __read_mostly = { + [0 ... NR_CPUS-1] = NUMA_NO_NODE +}; +unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; + +nodemask_t node_online_map = { { [0] = 1UL } }; + +/* Default NUMA to off for now. acpi=on required to enable it. */ +int numa_off __initdata = 1; + +int acpi_numa __initdata; + +/* + * Given a shift value, try to populate memnodemap[] + * Returns : + * 1 if OK + * 0 if memnodmap[] too small (of shift too small) + * -1 if node overlap or lost ram (shift too big) + */ +static int __init +populate_memnodemap(const struct node *nodes, int numnodes, int shift) +{ + int i; + int res = -1; + unsigned long addr, end; + + if (shift >= 64) + return -1; + memset(memnodemap, 0xff, sizeof(memnodemap)); + for (i = 0; i < numnodes; i++) { + addr = nodes[i].start; + end = nodes[i].end; + if (addr >= end) + continue; + if ((end >> shift) >= NODEMAPSIZE) + return 0; + do { + if (memnodemap[addr >> shift] != 0xff) + return -1; + memnodemap[addr >> shift] = i; + addr += (1UL << shift); + } while (addr < end); + res = 1; + } + return res; +} + +int __init compute_hash_shift(struct node *nodes, int numnodes) +{ + int shift = 20; + + while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) + shift++; + + printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", + shift); + + if (populate_memnodemap(nodes, numnodes, shift) != 1) { + printk(KERN_INFO + "Your memory is not aligned you need to rebuild your kernel " + "with a bigger NODEMAPSIZE shift=%d\n", + shift); + return -1; + } + return shift; +} + +/* initialize NODE_DATA given nodeid and start/end */ +void __init setup_node_bootmem(int nodeid, u64 start, u64 end) +{ + unsigned long start_pfn, end_pfn; + + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + NODE_DATA(nodeid)->node_id = nodeid; + NODE_DATA(nodeid)->node_start_pfn = start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; + + node_set_online(nodeid); +} + +void __init numa_init_array(void) +{ + int rr, i; + /* There are unfortunately some poorly designed mainboards around + that only connect memory to a single CPU. This breaks the 1:1 cpu->node + mapping. To avoid this fill in the mapping for all possible + CPUs, as the number of CPUs is not known yet. + We round robin the existing nodes. */ + rr = first_node(node_online_map); + for (i = 0; i < NR_CPUS; i++) { + if (cpu_to_node[i] != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node(rr, node_online_map); + if (rr == MAX_NUMNODES) + rr = first_node(node_online_map); + } + +} + +#ifdef CONFIG_NUMA_EMU +static int numa_fake __initdata = 0; + +/* Numa emulation */ +static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + struct node nodes[MAX_NUMNODES]; + unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; + + /* Kludge needed for the hash function */ + if (hweight64(sz) > 1) { + unsigned long x = 1; + while ((x << 1) < sz) + x <<= 1; + if (x < sz/2) + printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); + sz = x; + } + + memset(&nodes,0,sizeof(nodes)); + for (i = 0; i < numa_fake; i++) { + nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; + if (i == numa_fake-1) + sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; + nodes[i].end = nodes[i].start + sz; + printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n", + i, + nodes[i].start, nodes[i].end, + (nodes[i].end - nodes[i].start) >> 20); + node_set_online(i); + } + memnode_shift = compute_hash_shift(nodes, numa_fake); + if (memnode_shift < 0) { + memnode_shift = 0; + printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); + return -1; + } + for_each_online_node(i) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + numa_init_array(); + return 0; +} +#endif + +void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + +#ifdef CONFIG_NUMA_EMU + if (numa_fake && !numa_emulation(start_pfn, end_pfn)) + return; +#endif + +#ifdef CONFIG_ACPI_NUMA + if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, + end_pfn << PAGE_SHIFT)) + return; +#endif + + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + start_pfn << PAGE_SHIFT, + end_pfn << PAGE_SHIFT); + /* setup dummy node covering all memory */ + memnode_shift = 63; + memnodemap[0] = 0; + nodes_clear(node_online_map); + node_set_online(0); + for (i = 0; i < NR_CPUS; i++) + numa_set_node(i, 0); + node_to_cpumask[0] = cpumask_of_cpu(0); + setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); +} + +__cpuinit void numa_add_cpu(int cpu) +{ + set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); +} + +void __cpuinit numa_set_node(int cpu, int node) +{ + cpu_to_node[cpu] = node; +} + +/* [numa=off] */ +static __init int numa_setup(char *opt) +{ + if (!strncmp(opt,"off",3)) + numa_off = 1; + if (!strncmp(opt,"on",2)) + numa_off = 0; +#ifdef CONFIG_NUMA_EMU + if(!strncmp(opt, "fake=", 5)) { + numa_off = 0; + numa_fake = simple_strtoul(opt+5,NULL,0); ; + if (numa_fake >= MAX_NUMNODES) + numa_fake = MAX_NUMNODES; + } +#endif +#ifdef CONFIG_ACPI_NUMA + if (!strncmp(opt,"noacpi",6)) { + numa_off = 0; + acpi_numa = -1; + } +#endif + return 1; +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + */ +void __init init_cpu_to_node(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) { + u8 apicid = x86_cpu_to_apicid[i]; + if (apicid == BAD_APICID) + continue; + if (apicid_to_node[apicid] == NUMA_NO_NODE) + continue; + numa_set_node(i,apicid_to_node[apicid]); + } +} + +EXPORT_SYMBOL(cpu_to_node); +EXPORT_SYMBOL(node_to_cpumask); +EXPORT_SYMBOL(memnode_shift); +EXPORT_SYMBOL(memnodemap); +EXPORT_SYMBOL(node_data); + +static void dump_numa(unsigned char key) +{ + s_time_t now = NOW(); + int i; + + printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key, + (u32)(now>>32), (u32)now); + + for_each_online_node(i) { + unsigned long pa = (NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT; + printk("idx%d -> NODE%d start->%lu size->%lu\n", + i, NODE_DATA(i)->node_id, + NODE_DATA(i)->node_start_pfn, + NODE_DATA(i)->node_spanned_pages); + /* sanity check phys_to_nid() */ + printk("phys_to_nid(%lx) -> %d should be %d\n", pa, phys_to_nid(pa), + NODE_DATA(i)->node_id); + } + for_each_online_cpu(i) + printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]); +} + +static __init int register_numa_trigger(void) +{ + register_keyhandler('u', dump_numa, "dump numa info"); + return 0; +} +__initcall(register_numa_trigger); + diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index 2c8b638944..15c42b133c 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -16,6 +16,7 @@ #include <xen/percpu.h> #include <xen/hypercall.h> #include <xen/keyhandler.h> +#include <xen/numa.h> #include <public/version.h> #include <asm/bitops.h> #include <asm/smp.h> @@ -29,6 +30,7 @@ extern void dmi_scan_machine(void); extern void generic_apic_probe(void); +extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn); /* * opt_xenheap_megabytes: Size of Xen heap in megabytes, excluding the @@ -257,6 +259,20 @@ static void __init init_idle_domain(void) setup_idle_pagetable(); } +static void srat_detect_node(int cpu) +{ + unsigned node; + u8 apicid = x86_cpu_to_apicid[cpu]; + + node = apicid_to_node[apicid]; + if ( node == NUMA_NO_NODE ) + node = 0; + numa_set_node(cpu, node); + + if ( acpi_numa > 0 ) + printk(KERN_INFO "CPU %d APIC %d -> Node %d\n", cpu, apicid, node); +} + void __init __start_xen(multiboot_info_t *mbi) { char __cmdline[] = "", *cmdline = __cmdline; @@ -485,6 +501,12 @@ void __init __start_xen(multiboot_info_t *mbi) init_frametable(); + acpi_boot_table_init(); + + acpi_numa_init(); + + numa_initmem_init(0, max_page); + end_boot_allocator(); /* Initialise the Xen heap, skipping RAM holes. */ @@ -536,9 +558,10 @@ void __init __start_xen(multiboot_info_t *mbi) generic_apic_probe(); - acpi_boot_table_init(); acpi_boot_init(); + init_cpu_to_node(); + if ( smp_found_config ) get_smp_config(); @@ -589,6 +612,11 @@ void __init __start_xen(multiboot_info_t *mbi) break; if ( !cpu_online(i) ) __cpu_up(i); + + /* Set up cpu_to_node[]. */ + srat_detect_node(i); + /* Set up node_to_cpumask based on cpu_to_node[]. */ + numa_add_cpu(i); } printk("Brought up %ld CPUs\n", (long)num_online_cpus()); diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c index eb2d21111c..f7d8712563 100644 --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -43,6 +43,7 @@ #include <xen/delay.h> #include <xen/softirq.h> #include <xen/serial.h> +#include <xen/numa.h> #include <asm/current.h> #include <asm/mc146818rtc.h> #include <asm/desc.h> @@ -628,7 +629,7 @@ u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICI static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); - int apicid = logical_smp_processor_id(); + int apicid = hard_smp_processor_id(); cpu_2_logical_apicid[cpu] = apicid; map_cpu_to_node(cpu, apicid_to_node(apicid)); diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c new file mode 100644 index 0000000000..ea462e222b --- /dev/null +++ b/xen/arch/x86/srat.c @@ -0,0 +1,315 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + * + * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com> + */ + +#include <xen/init.h> +#include <xen/mm.h> +#include <xen/inttypes.h> +#include <xen/nodemask.h> +#include <xen/acpi.h> +#include <xen/numa.h> +#include <asm/page.h> + +static struct acpi_table_slit *acpi_slit; + +static nodemask_t nodes_parsed __initdata; +static nodemask_t nodes_found __initdata; +static struct node nodes[MAX_NUMNODES] __initdata; +static u8 pxm2node[256] = { [0 ... 255] = 0xff }; + +/* Too small nodes confuse the VM badly. Usually they result + from BIOS bugs. */ +#define NODE_MIN_SIZE (4*1024*1024) + +static int node_to_pxm(int n); + +int pxm_to_node(int pxm) +{ + if ((unsigned)pxm >= 256) + return -1; + /* Extend 0xff to (int)-1 */ + return (signed char)pxm2node[pxm]; +} + +static __init int setup_node(int pxm) +{ + unsigned node = pxm2node[pxm]; + if (node == 0xff) { + if (nodes_weight(nodes_found) >= MAX_NUMNODES) + return -1; + node = first_unset_node(nodes_found); + node_set(node, nodes_found); + pxm2node[pxm] = node; + } + return pxm2node[pxm]; +} + +static __init int conflicting_nodes(u64 start, u64 end) +{ + int i; + for_each_node_mask(i, nodes_parsed) { + struct node *nd = &nodes[i]; + if (nd->start == nd->end) + continue; + if (nd->end > start && nd->start < end) + return i; + if (nd->end == end && nd->start == start) + return i; + } + return -1; +} + +static __init void cutoff_node(int i, u64 start, u64 end) +{ + struct node *nd = &nodes[i]; + if (nd->start < start) { + nd->start = start; + if (nd->end < nd->start) + nd->start = nd->end; + } + if (nd->end > end) { + nd->end = end; + if (nd->start > nd->end) + nd->start = nd->end; + } +} + +static __init void bad_srat(void) +{ + int i; + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; + for (i = 0; i < MAX_LOCAL_APIC; i++) + apicid_to_node[i] = NUMA_NO_NODE; +} + +static __init inline int srat_disabled(void) +{ + return numa_off || acpi_numa < 0; +} + +/* + * A lot of BIOS fill in 10 (= no distance) everywhere. This messes + * up the NUMA heuristics which wants the local node to have a smaller + * distance than the others. + * Do some quick checks here and only use the SLIT if it passes. + */ +static __init int slit_valid(struct acpi_table_slit *slit) +{ + int i, j; + int d = slit->localities; + for (i = 0; i < d; i++) { + for (j = 0; j < d; j++) { + u8 val = slit->entry[d*i + j]; + if (i == j) { + if (val != 10) + return 0; + } else if (val <= 10) + return 0; + } + } + return 1; +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + if (!slit_valid(slit)) { + printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); + return; + } + acpi_slit = slit; +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) +{ + int pxm, node; + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) { bad_srat(); + return; + } + if (pa->flags.enabled == 0) + return; + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + apicid_to_node[pa->apic_id] = node; + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", + pxm, pa->apic_id, node); +} + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +void __init +acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) +{ + struct node *nd; + u64 start, end; + int node, pxm; + int i; + + if (srat_disabled()) + return; + if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) { + bad_srat(); + return; + } + if (ma->flags.enabled == 0) + return; + start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); + end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); + pxm = ma->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + bad_srat(); + return; + } + /* It is fine to add this area to the nodes data it will be used later*/ + if (ma->flags.hot_pluggable == 1) + printk(KERN_INFO "SRAT: hot plug zone found %"PRIx64" - %"PRIx64" \n", + start, end); + i = conflicting_nodes(start, end); + if (i == node) { + printk(KERN_WARNING + "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%" + PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end); + } else if (i >= 0) { + printk(KERN_ERR + "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%" + PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i), + nodes[i].start, nodes[i].end); + bad_srat(); + return; + } + nd = &nodes[node]; + if (!node_test_and_set(node, nodes_parsed)) { + nd->start = start; + nd->end = end; + } else { + if (start < nd->start) + nd->start = start; + if (nd->end < end) + nd->end = end; + } + printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"\n", node, pxm, + nd->start, nd->end); +} + +/* Sanity check to catch more bad SRATs (they are amazingly common). + Make sure the PXMs cover all memory. */ +static int nodes_cover_memory(void) +{ + int i; + u64 pxmram, e820ram; + + pxmram = 0; + for_each_node_mask(i, nodes_parsed) { + u64 s = nodes[i].start >> PAGE_SHIFT; + u64 e = nodes[i].end >> PAGE_SHIFT; + pxmram += e - s; + } + + e820ram = max_page; + /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ + if ((long)(e820ram - pxmram) >= 1*1024*1024) { + printk(KERN_ERR "SRAT: PXMs only cover %"PRIu64"MB of your %" + PRIu64"MB e820 RAM. Not used.\n", + (pxmram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return 0; + } + return 1; +} + +static void unparse_node(int node) +{ + int i; + node_clear(node, nodes_parsed); + for (i = 0; i < MAX_LOCAL_APIC; i++) { + if (apicid_to_node[i] == node) + apicid_to_node[i] = NUMA_NO_NODE; + } +} + +void __init acpi_numa_arch_fixup(void) {} + +/* Use the information discovered above to actually set up the nodes. */ +int __init acpi_scan_nodes(u64 start, u64 end) +{ + int i; + + /* First clean up the node list */ + for (i = 0; i < MAX_NUMNODES; i++) { + cutoff_node(i, start, end); + if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) + unparse_node(i); + } + + if (acpi_numa <= 0) + return -1; + + if (!nodes_cover_memory()) { + bad_srat(); + return -1; + } + + memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES); + if (memnode_shift < 0) { + printk(KERN_ERR + "SRAT: No NUMA node hash function found. Contact maintainer\n"); + bad_srat(); + return -1; + } + + /* Finally register nodes */ + for_each_node_mask(i, nodes_parsed) + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + for (i = 0; i < NR_CPUS; i++) { + if (cpu_to_node[i] == NUMA_NO_NODE) + continue; + if (!node_isset(cpu_to_node[i], nodes_parsed)) + numa_set_node(i, NUMA_NO_NODE); + } + numa_init_array(); + return 0; +} + +static int node_to_pxm(int n) +{ + int i; + if (pxm2node[n] == n) + return n; + for (i = 0; i < 256; i++) + if (pxm2node[i] == n) + return i; + return 0; +} + +int __node_distance(int a, int b) +{ + int index; + + if (!acpi_slit) + return a == b ? 10 : 20; + index = acpi_slit->localities * node_to_pxm(a); + return acpi_slit->entry[index + node_to_pxm(b)]; +} + +EXPORT_SYMBOL(__node_distance); diff --git a/xen/common/memory.c b/xen/common/memory.c index c2827fa59f..9ab62a48ec 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -41,6 +41,8 @@ increase_reservation( struct page_info *page; unsigned long i; xen_pfn_t mfn; + /* use domain's first processor for locality parameter */ + unsigned int cpu = d->vcpu[0]->processor; if ( !guest_handle_is_null(extent_list) && !guest_handle_okay(extent_list, nr_extents) ) @@ -58,8 +60,8 @@ increase_reservation( return i; } - if ( unlikely((page = alloc_domheap_pages( - d, extent_order, memflags)) == NULL) ) + if ( unlikely((page = __alloc_domheap_pages( d, cpu, + extent_order, memflags )) == NULL) ) { DPRINTK("Could not allocate order=%d extent: " "id=%d memflags=%x (%ld of %d)\n", @@ -92,6 +94,8 @@ populate_physmap( unsigned long i, j; xen_pfn_t gpfn; xen_pfn_t mfn; + /* use domain's first processor for locality parameter */ + unsigned int cpu = d->vcpu[0]->processor; if ( !guest_handle_okay(extent_list, nr_extents) ) return 0; @@ -111,8 +115,8 @@ populate_physmap( if ( unlikely(__copy_from_guest_offset(&gpfn, extent_list, i, 1)) ) goto out; - if ( unlikely((page = alloc_domheap_pages( - d, extent_order, memflags)) == NULL) ) + if ( unlikely((page = __alloc_domheap_pages( d, cpu, + extent_order, memflags )) == NULL) ) { DPRINTK("Could not allocate order=%d extent: " "id=%d memflags=%x (%ld of %d)\n", @@ -294,7 +298,7 @@ memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg) unsigned long in_chunk_order, out_chunk_order; xen_pfn_t gpfn, gmfn, mfn; unsigned long i, j, k; - unsigned int memflags = 0; + unsigned int memflags = 0, cpu; long rc = 0; struct domain *d; struct page_info *page; @@ -368,6 +372,9 @@ memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg) } d = current->domain; + /* use domain's first processor for locality parameter */ + cpu = d->vcpu[0]->processor; + for ( i = 0; i < (exch.in.nr_extents >> in_chunk_order); i++ ) { if ( hypercall_preempt_check() ) @@ -413,8 +420,8 @@ memory_exchange(XEN_GUEST_HANDLE(xen_memory_exchange_t) arg) /* Allocate a chunk's worth of anonymous output pages. */ for ( j = 0; j < (1UL << out_chunk_order); j++ ) { - page = alloc_domheap_pages( - NULL, exch.out.extent_order, memflags); + page = __alloc_domheap_pages( NULL, cpu, + exch.out.extent_order, memflags); if ( unlikely(page == NULL) ) { rc = -ENOMEM; diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index fbbe837780..f4a1adc274 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -4,6 +4,7 @@ * Simple buddy heap allocator for Xen. * * Copyright (c) 2002-2004 K A Fraser + * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,6 +34,8 @@ #include <xen/domain_page.h> #include <xen/keyhandler.h> #include <xen/perfc.h> +#include <xen/numa.h> +#include <xen/nodemask.h> #include <asm/page.h> /* @@ -247,22 +250,23 @@ unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align) #define pfn_dom_zone_type(_pfn) \ (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM) -static struct list_head heap[NR_ZONES][MAX_ORDER+1]; +static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1]; -static unsigned long avail[NR_ZONES]; +static unsigned long avail[NR_ZONES][MAX_NUMNODES]; static DEFINE_SPINLOCK(heap_lock); void end_boot_allocator(void) { - unsigned long i, j; + unsigned long i, j, k; int curr_free = 0, next_free = 0; memset(avail, 0, sizeof(avail)); for ( i = 0; i < NR_ZONES; i++ ) - for ( j = 0; j <= MAX_ORDER; j++ ) - INIT_LIST_HEAD(&heap[i][j]); + for ( j = 0; j < MAX_NUMNODES; j++ ) + for ( k = 0; k <= MAX_ORDER; k++ ) + INIT_LIST_HEAD(&heap[i][j][k]); /* Pages that are free now go to the domain sub-allocator. */ for ( i = 0; i < max_page; i++ ) @@ -272,29 +276,59 @@ void end_boot_allocator(void) if ( next_free ) map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */ if ( curr_free ) - free_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 0); + init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1); } } -/* Hand the specified arbitrary page range to the specified heap zone. */ +/* + * Hand the specified arbitrary page range to the specified heap zone + * checking the node_id of the previous page. If they differ and the + * latter is not on a MAX_ORDER boundary, then we reserve the page by + * not freeing it to the buddy allocator. + */ +#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER)) void init_heap_pages( unsigned int zone, struct page_info *pg, unsigned long nr_pages) { + unsigned int nid_curr,nid_prev; unsigned long i; ASSERT(zone < NR_ZONES); + if ( likely(page_to_mfn(pg) != 0) ) + nid_prev = phys_to_nid(page_to_maddr(pg-1)); + else + nid_prev = phys_to_nid(page_to_maddr(pg)); + for ( i = 0; i < nr_pages; i++ ) - free_heap_pages(zone, pg+i, 0); + { + nid_curr = phys_to_nid(page_to_maddr(pg+i)); + + /* + * free pages of the same node, or if they differ, but are on a + * MAX_ORDER alignement boundary (which already get reserved) + */ + if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) & + MAX_ORDER_ALIGNED) ) + free_heap_pages(zone, pg+i, 0); + else + printk("Reserving non-aligned node boundary @ mfn %lu\n", + page_to_mfn(pg+i)); + + nid_prev = nid_curr; + } } - /* Allocate 2^@order contiguous pages. */ -struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order) +struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu, + unsigned int order) { - int i; + unsigned int i,j, node = cpu_to_node(cpu), num_nodes = num_online_nodes(); + unsigned int request = (1UL << order); struct page_info *pg; + ASSERT(node >= 0); + ASSERT(node < num_nodes); ASSERT(zone < NR_ZONES); if ( unlikely(order > MAX_ORDER) ) @@ -302,29 +336,46 @@ struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order) spin_lock(&heap_lock); - /* Find smallest order which can satisfy the request. */ - for ( i = order; i <= MAX_ORDER; i++ ) - if ( !list_empty(&heap[zone][i]) ) - goto found; + /* start with requested node, but exhaust all node memory + * in requested zone before failing, only calc new node + * value if we fail to find memory in target node, this avoids + * needless computation on fast-path */ + for ( i = 0; i < num_nodes; i++ ) + { + /* check if target node can support the allocation */ + if ( avail[zone][node] >= request ) + { + /* Find smallest order which can satisfy the request. */ + for ( j = order; j <= MAX_ORDER; j++ ) + { + if ( !list_empty(&heap[zone][node][j]) ) + goto found; + } + } + /* pick next node, wrapping around if needed */ + if ( ++node == num_nodes ) + node = 0; + } /* No suitable memory blocks. Fail the request. */ spin_unlock(&heap_lock); return NULL; found: - pg = list_entry(heap[zone][i].next, struct page_info, list); + pg = list_entry(heap[zone][node][j].next, struct page_info, list); list_del(&pg->list); /* We may have to halve the chunk a number of times. */ - while ( i != order ) + while ( j != order ) { - PFN_ORDER(pg) = --i; - list_add_tail(&pg->list, &heap[zone][i]); - pg += 1 << i; + PFN_ORDER(pg) = --j; + list_add_tail(&pg->list, &heap[zone][node][j]); + pg += 1 << j; } - map_alloc(page_to_mfn(pg), 1 << order); - avail[zone] -= 1 << order; + map_alloc(page_to_mfn(pg), request); + ASSERT(avail[zone][node] >= request); + avail[zone][node] -= request; spin_unlock(&heap_lock); @@ -337,14 +388,17 @@ void free_heap_pages( unsigned int zone, struct page_info *pg, unsigned int order) { unsigned long mask; + int node = phys_to_nid(page_to_maddr(pg)); ASSERT(zone < NR_ZONES); ASSERT(order <= MAX_ORDER); + ASSERT(node >= 0); + ASSERT(node < num_online_nodes()); spin_lock(&heap_lock); map_free(page_to_mfn(pg), 1 << order); - avail[zone] += 1 << order; + avail[zone][node] += 1 << order; /* Merge chunks as far as possible. */ while ( order < MAX_ORDER ) @@ -370,10 +424,13 @@ void free_heap_pages( } order++; + + /* after merging, pg should be in the same node */ + ASSERT(phys_to_nid(page_to_maddr(pg)) == node ); } PFN_ORDER(pg) = order; - list_add_tail(&pg->list, &heap[zone][order]); + list_add_tail(&pg->list, &heap[zone][node][order]); spin_unlock(&heap_lock); } @@ -466,7 +523,7 @@ void *alloc_xenheap_pages(unsigned int order) int i; local_irq_save(flags); - pg = alloc_heap_pages(MEMZONE_XEN, order); + pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order); local_irq_restore(flags); if ( unlikely(pg == NULL) ) @@ -580,8 +637,9 @@ int assign_pages( } -struct page_info *alloc_domheap_pages( - struct domain *d, unsigned int order, unsigned int memflags) +struct page_info *__alloc_domheap_pages( + struct domain *d, unsigned int cpu, unsigned int order, + unsigned int memflags) { struct page_info *pg = NULL; cpumask_t mask; @@ -591,17 +649,17 @@ struct page_info *alloc_domheap_pages( if ( !(memflags & MEMF_dma) ) { - pg = alloc_heap_pages(MEMZONE_DOM, order); + pg = alloc_heap_pages(MEMZONE_DOM, cpu, order); /* Failure? Then check if we can fall back to the DMA pool. */ if ( unlikely(pg == NULL) && ((order > MAX_ORDER) || - (avail[MEMZONE_DMADOM] < + (avail_heap_pages(MEMZONE_DMADOM,-1) < (lowmem_emergency_pool_pages + (1UL << order)))) ) return NULL; } if ( pg == NULL ) - if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL ) + if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL ) return NULL; mask = pg->u.free.cpumask; @@ -640,6 +698,11 @@ struct page_info *alloc_domheap_pages( return pg; } +inline struct page_info *alloc_domheap_pages( + struct domain *d, unsigned int order, unsigned int flags) +{ + return __alloc_domheap_pages(d, smp_processor_id(), order, flags); +} void free_domheap_pages(struct page_info *pg, unsigned int order) { @@ -714,13 +777,27 @@ void free_domheap_pages(struct page_info *pg, unsigned int order) } +unsigned long avail_heap_pages(int zone, int node) +{ + int i,j, num_nodes = num_online_nodes(); + unsigned long free_pages = 0; + + for (i=0; i<NR_ZONES; i++) + if ( (zone == -1) || (zone == i) ) + for (j=0; j < num_nodes; j++) + if ( (node == -1) || (node == j) ) + free_pages += avail[i][j]; + + return free_pages; +} + unsigned long avail_domheap_pages(void) { unsigned long avail_nrm, avail_dma; + + avail_nrm = avail_heap_pages(MEMZONE_DOM,-1); - avail_nrm = avail[MEMZONE_DOM]; - - avail_dma = avail[MEMZONE_DMADOM]; + avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1); if ( avail_dma > lowmem_emergency_pool_pages ) avail_dma -= lowmem_emergency_pool_pages; else @@ -729,6 +806,10 @@ unsigned long avail_domheap_pages(void) return avail_nrm + avail_dma; } +unsigned long avail_nodeheap_pages(int node) +{ + return avail_heap_pages(-1, node); +} static void pagealloc_keyhandler(unsigned char key) { @@ -736,9 +817,9 @@ static void pagealloc_keyhandler(unsigned char key) printk(" Xen heap: %lukB free\n" " DMA heap: %lukB free\n" " Dom heap: %lukB free\n", - avail[MEMZONE_XEN]<<(PAGE_SHIFT-10), - avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10), - avail[MEMZONE_DOM]<<(PAGE_SHIFT-10)); + avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10)); } @@ -806,6 +887,46 @@ unsigned long avail_scrub_pages(void) return scrub_pages; } +static unsigned long count_bucket(struct list_head* l, int order) +{ + unsigned long total_pages = 0; + int pages = 1 << order; + struct page_info *pg; + + list_for_each_entry(pg, l, list) + total_pages += pages; + + return total_pages; +} + +static void dump_heap(unsigned char key) +{ + s_time_t now = NOW(); + int i,j,k; + unsigned long total; + + printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key, + (u32)(now>>32), (u32)now); + + for (i=0; i<NR_ZONES; i++ ) + for (j=0;j<MAX_NUMNODES;j++) + for (k=0;k<=MAX_ORDER;k++) + if ( !list_empty(&heap[i][j][k]) ) + { + total = count_bucket(&heap[i][j][k], k); + printk("heap[%d][%d][%d]-> %lu pages\n", + i, j, k, total); + } +} + +static __init int register_heap_trigger(void) +{ + register_keyhandler('H', dump_heap, "dump heap info"); + return 0; +} +__initcall(register_heap_trigger); + + static __init int page_scrub_init(void) { open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq); diff --git a/xen/drivers/acpi/Makefile b/xen/drivers/acpi/Makefile index 68dafe3a52..08844a529d 100644 --- a/xen/drivers/acpi/Makefile +++ b/xen/drivers/acpi/Makefile @@ -1 +1,2 @@ obj-y += tables.o +obj-y += numa.o diff --git a/xen/drivers/acpi/numa.c b/xen/drivers/acpi/numa.c new file mode 100644 index 0000000000..ecf426ece4 --- /dev/null +++ b/xen/drivers/acpi/numa.c @@ -0,0 +1,216 @@ +/* + * acpi_numa.c - ACPI NUMA support + * + * Copyright (C) 2002 Takayoshi Kochi <t-kochi@bq.jp.nec.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + */ +#if 0 +#include <linux/module.h> +#include <linux/kernel.h> +#endif +#include <xen/config.h> +#include <xen/init.h> +#include <xen/types.h> +#include <xen/errno.h> +#include <xen/acpi.h> +#include <xen/numa.h> +#include <acpi/acpi_bus.h> +#include <acpi/acmacros.h> +#include <asm/page.h> /* __va() */ + +#define ACPI_NUMA 0x80000000 +#define _COMPONENT ACPI_NUMA +ACPI_MODULE_NAME("numa") + +extern int __init acpi_table_parse_madt_family(enum acpi_table_id id, + unsigned long madt_size, + int entry_id, + acpi_madt_entry_handler handler, + unsigned int max_entries); + +void __init acpi_table_print_srat_entry(acpi_table_entry_header * header) +{ + + ACPI_FUNCTION_NAME("acpi_table_print_srat_entry"); + + if (!header) + return; + + switch (header->type) { + + case ACPI_SRAT_PROCESSOR_AFFINITY: +#ifdef ACPI_DEBUG_OUTPUT + { + struct acpi_table_processor_affinity *p = + (struct acpi_table_processor_affinity *)header; + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n", + p->apic_id, p->lsapic_eid, + p->proximity_domain, + p->flags. + enabled ? "enabled" : "disabled")); + } +#endif /* ACPI_DEBUG_OUTPUT */ + break; + + case ACPI_SRAT_MEMORY_AFFINITY: +#ifdef ACPI_DEBUG_OUTPUT + { + struct acpi_table_memory_affinity *p = + (struct acpi_table_memory_affinity *)header; + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n", + p->base_addr_hi, p->base_addr_lo, + p->length_hi, p->length_lo, + p->memory_type, p->proximity_domain, + p->flags. + enabled ? "enabled" : "disabled", + p->flags. + hot_pluggable ? " hot-pluggable" : + "")); + } +#endif /* ACPI_DEBUG_OUTPUT */ + break; + + default: + printk(KERN_WARNING PREFIX + "Found unsupported SRAT entry (type = 0x%x)\n", + header->type); + break; + } +} + +static int __init acpi_parse_slit(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_slit *slit; + u32 localities; + + if (!phys_addr || !size) + return -EINVAL; + + slit = (struct acpi_table_slit *)__va(phys_addr); + + /* downcast just for %llu vs %lu for i386/ia64 */ + localities = (u32) slit->localities; + + acpi_numa_slit_init(slit); + + return 0; +} + +static int __init +acpi_parse_processor_affinity(acpi_table_entry_header * header, + const unsigned long end) +{ + struct acpi_table_processor_affinity *processor_affinity; + + processor_affinity = (struct acpi_table_processor_affinity *)header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_processor_affinity_init(processor_affinity); + + return 0; +} + +static int __init +acpi_parse_memory_affinity(acpi_table_entry_header * header, + const unsigned long end) +{ + struct acpi_table_memory_affinity *memory_affinity; + + memory_affinity = (struct acpi_table_memory_affinity *)header; + if (!memory_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(header); + + /* let architecture-dependent part to do it */ + acpi_numa_memory_affinity_init(memory_affinity); + + return 0; +} + +static int __init acpi_parse_srat(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_srat *srat; + + if (!phys_addr || !size) + return -EINVAL; + + srat = (struct acpi_table_srat *)__va(phys_addr); + + return 0; +} + +int __init +acpi_table_parse_srat(enum acpi_srat_entry_id id, + acpi_madt_entry_handler handler, unsigned int max_entries) +{ + return acpi_table_parse_madt_family(ACPI_SRAT, + sizeof(struct acpi_table_srat), id, + handler, max_entries); +} + +int __init acpi_numa_init(void) +{ + int result; + + /* SRAT: Static Resource Affinity Table */ + result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat); + + if (result > 0) { + result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY, + acpi_parse_processor_affinity, + NR_CPUS); + result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); // IA64 specific + } + + /* SLIT: System Locality Information Table */ + result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit); + + acpi_numa_arch_fixup(); + return 0; +} + +#if 0 +int acpi_get_pxm(acpi_handle h) +{ + unsigned long pxm; + acpi_status status; + acpi_handle handle; + acpi_handle phandle = h; + + do { + handle = phandle; + status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); + if (ACPI_SUCCESS(status)) + return (int)pxm; + status = acpi_get_parent(handle, &phandle); + } while (ACPI_SUCCESS(status)); + return -1; +} + +EXPORT_SYMBOL(acpi_get_pxm); +#endif diff --git a/xen/include/asm-ia64/vmx_platform.h b/xen/include/asm-ia64/vmx_platform.h index 33c4003cf3..07d05a68c6 100644 --- a/xen/include/asm-ia64/vmx_platform.h +++ b/xen/include/asm-ia64/vmx_platform.h @@ -24,6 +24,8 @@ #include <asm/hvm/vioapic.h> struct mmio_list; typedef struct virtual_platform_def { + unsigned long buffered_io_va; + spinlock_t buffered_io_lock; unsigned long shared_page_va; unsigned long pib_base; unsigned char xtp; diff --git a/xen/include/asm-x86/acpi.h b/xen/include/asm-x86/acpi.h index 51c4b8e293..227f76325c 100644 --- a/xen/include/asm-x86/acpi.h +++ b/xen/include/asm-x86/acpi.h @@ -157,6 +157,9 @@ static inline void check_acpi_pci(void) { } static inline void acpi_noirq_set(void) { acpi_noirq = 1; } static inline int acpi_irq_balance_set(char *str) { return 0; } +extern int acpi_scan_nodes(u64 start, u64 end); +extern int acpi_numa; +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) #ifdef CONFIG_ACPI_SLEEP @@ -173,5 +176,6 @@ extern void acpi_reserve_bootmem(void); #endif /*CONFIG_ACPI_SLEEP*/ extern u8 x86_acpiid_to_apicid[]; +#define MAX_LOCAL_APIC 256 #endif /*_ASM_ACPI_H*/ diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index e2ef90700c..879bdbf80b 100644 --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -24,6 +24,11 @@ #define CONFIG_X86_IO_APIC 1 #define CONFIG_HPET_TIMER 1 #define CONFIG_X86_MCE_P4THERMAL 1 +#define CONFIG_ACPI_NUMA 1 +#define CONFIG_NUMA 1 +#define CONFIG_ACPI_SRAT 1 +#define CONFIG_DISCONTIGMEM 1 +#define CONFIG_NUMA_EMU 1 /* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */ #define CONFIG_X86_L1_CACHE_SHIFT 7 diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h index 6561519cb1..0ebec779c1 100644 --- a/xen/include/asm-x86/hvm/domain.h +++ b/xen/include/asm-x86/hvm/domain.h @@ -23,7 +23,7 @@ #define __ASM_X86_HVM_DOMAIN_H__ #include <asm/hvm/vpic.h> -#include <asm/hvm/vpit.h> +#include <asm/hvm/vpt.h> #include <asm/hvm/vlapic.h> #include <asm/hvm/vioapic.h> #include <public/hvm/params.h> diff --git a/xen/include/asm-x86/hvm/vpit.h b/xen/include/asm-x86/hvm/vpt.h index 83b1af2622..ada8936af7 100644 --- a/xen/include/asm-x86/hvm/vpit.h +++ b/xen/include/asm-x86/hvm/vpt.h @@ -1,5 +1,5 @@ /* - * vpit.h: Virtual PIT definitions + * vpt.h: Virtual Platform Timer definitions * * Copyright (c) 2004, Intel Corporation. * @@ -17,8 +17,8 @@ * Place - Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef __ASM_X86_HVM_VPIT_H__ -#define __ASM_X86_HVM_VPIT_H__ +#ifndef __ASM_X86_HVM_VPT_H__ +#define __ASM_X86_HVM_VPT_H__ #include <xen/config.h> #include <xen/init.h> @@ -70,7 +70,17 @@ typedef struct RTCState { struct vcpu *vcpu; struct periodic_time *pt; } RTCState; - + +#define FREQUENCE_PMTIMER 3579545 +typedef struct PMTState { + uint32_t pm1_timer; + uint32_t pm1_status; + uint64_t last_gtime; + struct timer timer; + uint64_t scale; + struct vcpu *vcpu; +} PMTState; + /* * Abstract layer of periodic time, one short time. */ @@ -95,7 +105,7 @@ struct pl_time { /* platform time */ struct periodic_time periodic_tm; struct PITState vpit; struct RTCState vrtc; - /* TODO: ACPI time */ + struct PMTState vpmt; }; static __inline__ s_time_t get_scheduled( @@ -132,8 +142,10 @@ extern void destroy_periodic_time(struct periodic_time *pt); void pit_init(struct vcpu *v, unsigned long cpu_khz); void rtc_init(struct vcpu *v, int base, int irq); void rtc_deinit(struct domain *d); +void pmtimer_init(struct vcpu *v, int base); +void pmtimer_deinit(struct domain *d); int is_rtc_periodic_irq(void *opaque); void pt_timer_fn(void *data); void pit_time_fired(struct vcpu *v, void *priv); -#endif /* __ASM_X86_HVM_VPIT_H__ */ +#endif /* __ASM_X86_HVM_VPT_H__ */ diff --git a/xen/include/asm-x86/mach-generic/mach_apic.h b/xen/include/asm-x86/mach-generic/mach_apic.h index 1d3ed4dc67..1e0a6019d6 100644 --- a/xen/include/asm-x86/mach-generic/mach_apic.h +++ b/xen/include/asm-x86/mach-generic/mach_apic.h @@ -22,11 +22,7 @@ static inline void enable_apic_mode(void) return; } -/* No sane NUMA support right now. We should parse ACPI SRAT. */ -static inline int apicid_to_node(int logical_apicid) -{ - return 0; -} +#define apicid_to_node(apicid) ((int)apicid_to_node[(u8)apicid]) extern u8 bios_cpu_apicid[]; static inline int cpu_present_to_apicid(int mps_cpu) diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h new file mode 100644 index 0000000000..caa6491c96 --- /dev/null +++ b/xen/include/asm-x86/numa.h @@ -0,0 +1,78 @@ +#ifndef _ASM_X8664_NUMA_H +#define _ASM_X8664_NUMA_H 1 + +#include <xen/cpumask.h> + +#define NODES_SHIFT 6 + +extern unsigned char cpu_to_node[]; +extern cpumask_t node_to_cpumask[]; + +#define cpu_to_node(cpu) (cpu_to_node[cpu]) +#define parent_node(node) (node) +#define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) +#define node_to_cpumask(node) (node_to_cpumask[node]) + +struct node { + u64 start,end; +}; + +extern int compute_hash_shift(struct node *nodes, int numnodes); +extern int pxm_to_node(int nid); + +#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) +#define VIRTUAL_BUG_ON(x) +#define NODEMAPSIZE 0xfff + +extern void numa_add_cpu(int cpu); +extern void numa_init_array(void); +extern int numa_off; + +extern void numa_set_node(int cpu, int node); + +extern void setup_node_bootmem(int nodeid, u64 start, u64 end); +extern unsigned char apicid_to_node[256]; +#ifdef CONFIG_NUMA +extern void __init init_cpu_to_node(void); + +static inline void clear_node_cpumask(int cpu) +{ + clear_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); +} + +/* Simple perfect hash to map physical addresses to node numbers */ +extern int memnode_shift; +extern u8 memnodemap[NODEMAPSIZE]; + +struct node_data { + unsigned long node_start_pfn; + unsigned long node_spanned_pages; + unsigned int node_id; +}; + +extern struct node_data node_data[]; + +static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) +{ + unsigned nid; + VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE); + nid = memnodemap[addr >> memnode_shift]; + VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); + return nid; +} + +#define NODE_DATA(nid) (&(node_data[nid])) + +#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) +#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ + NODE_DATA(nid)->node_spanned_pages) + + +#else +#define init_cpu_to_node() do {} while (0) +#define clear_node_cpumask(cpu) do {} while (0) +#endif + +#define NUMA_NO_NODE 0xff + +#endif diff --git a/xen/include/public/arch-ia64.h b/xen/include/public/arch-ia64.h index fd05ff9233..d7b35b4524 100644 --- a/xen/include/public/arch-ia64.h +++ b/xen/include/public/arch-ia64.h @@ -68,6 +68,9 @@ typedef unsigned long xen_ulong_t; #define STORE_PAGE_START (IO_PAGE_START + IO_PAGE_SIZE) #define STORE_PAGE_SIZE PAGE_SIZE +#define BUFFER_IO_PAGE_START (STORE_PAGE_START+PAGE_SIZE) +#define BUFFER_IO_PAGE_SIZE PAGE_SIZE + #define IO_SAPIC_START 0xfec00000UL #define IO_SAPIC_SIZE 0x100000 diff --git a/xen/include/public/hvm/ioreq.h b/xen/include/public/hvm/ioreq.h index 8e92b004b1..992505e1c5 100644 --- a/xen/include/public/hvm/ioreq.h +++ b/xen/include/public/hvm/ioreq.h @@ -86,6 +86,10 @@ struct buffered_iopage { }; /* sizeof this structure must be in one page */ typedef struct buffered_iopage buffered_iopage_t; +#define ACPI_PM1A_EVT_BLK_ADDRESS 0x000000000000c010 +#define ACPI_PM1A_CNT_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x04) +#define ACPI_PM_TMR_BLK_ADDRESS (ACPI_PM1A_EVT_BLK_ADDRESS + 0x08) + #endif /* _IOREQ_H_ */ /* diff --git a/xen/include/xen/config.h b/xen/include/xen/config.h index e3f94d5843..f79472da77 100644 --- a/xen/include/xen/config.h +++ b/xen/include/xen/config.h @@ -50,5 +50,7 @@ #endif /* !__ASSEMBLY__ */ #define fastcall +#define __cpuinitdata +#define __cpuinit #endif /* __XEN_CONFIG_H__ */ diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h index 8c9713971b..4d05f6917f 100644 --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -45,7 +45,8 @@ void end_boot_allocator(void); /* Generic allocator. These functions are *not* interrupt-safe. */ void init_heap_pages( unsigned int zone, struct page_info *pg, unsigned long nr_pages); -struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order); +struct page_info *alloc_heap_pages( + unsigned int zone, unsigned int cpu, unsigned int order); void free_heap_pages( unsigned int zone, struct page_info *pg, unsigned int order); void scrub_heap_pages(void); @@ -61,8 +62,12 @@ void free_xenheap_pages(void *v, unsigned int order); void init_domheap_pages(paddr_t ps, paddr_t pe); struct page_info *alloc_domheap_pages( struct domain *d, unsigned int order, unsigned int memflags); +struct page_info *__alloc_domheap_pages( + struct domain *d, unsigned int cpu, unsigned int order, + unsigned int memflags); void free_domheap_pages(struct page_info *pg, unsigned int order); unsigned long avail_domheap_pages(void); +unsigned long avail_heap_pages(int zone, int node); #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0)) #define free_domheap_page(p) (free_domheap_pages(p,0)) diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h new file mode 100644 index 0000000000..30ed6f4524 --- /dev/null +++ b/xen/include/xen/nodemask.h @@ -0,0 +1,338 @@ +#ifndef __LINUX_NODEMASK_H +#define __LINUX_NODEMASK_H + +/* + * Nodemasks provide a bitmap suitable for representing the + * set of Node's in a system, one bit position per Node number. + * + * See detailed comments in the file linux/bitmap.h describing the + * data type on which these nodemasks are based. + * + * For details of nodemask_scnprintf() and nodemask_parse(), + * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. + * + * The available nodemask operations are: + * + * void node_set(node, mask) turn on bit 'node' in mask + * void node_clear(node, mask) turn off bit 'node' in mask + * void nodes_setall(mask) set all bits + * void nodes_clear(mask) clear all bits + * int node_isset(node, mask) true iff bit 'node' set in mask + * int node_test_and_set(node, mask) test and set bit 'node' in mask + * + * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] + * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] + * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 + * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 + * void nodes_complement(dst, src) dst = ~src + * + * int nodes_equal(mask1, mask2) Does mask1 == mask2? + * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? + * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? + * int nodes_empty(mask) Is mask empty (no bits sets)? + * int nodes_full(mask) Is mask full (all bits sets)? + * int nodes_weight(mask) Hamming weight - number of set bits + * + * void nodes_shift_right(dst, src, n) Shift right + * void nodes_shift_left(dst, src, n) Shift left + * + * int first_node(mask) Number lowest set bit, or MAX_NUMNODES + * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * int first_unset_node(mask) First node not set in mask, or + * MAX_NUMNODES. + * + * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set + * NODE_MASK_ALL Initializer - all bits set + * NODE_MASK_NONE Initializer - no bits set + * unsigned long *nodes_addr(mask) Array of unsigned long's in mask + * + * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing + * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask + * + * for_each_node_mask(node, mask) for-loop node over mask + * + * int num_online_nodes() Number of online Nodes + * int num_possible_nodes() Number of all possible Nodes + * + * int node_online(node) Is some node online? + * int node_possible(node) Is some node possible? + * + * int any_online_node(mask) First online node in mask + * + * node_set_online(node) set bit 'node' in node_online_map + * node_set_offline(node) clear bit 'node' in node_online_map + * + * for_each_node(node) for-loop node over node_possible_map + * for_each_online_node(node) for-loop node over node_online_map + * + * Subtlety: + * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) + * to generate slightly worse code. So use a simple one-line #define + * for node_isset(), instead of wrapping an inline inside a macro, the + * way we do the other calls. + */ + +#include <xen/kernel.h> +#include <xen/bitmap.h> +#include <xen/numa.h> + +typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; +extern nodemask_t _unused_nodemask_arg_; + +#define node_set(node, dst) __node_set((node), &(dst)) +static inline void __node_set(int node, volatile nodemask_t *dstp) +{ + set_bit(node, dstp->bits); +} + +#define node_clear(node, dst) __node_clear((node), &(dst)) +static inline void __node_clear(int node, volatile nodemask_t *dstp) +{ + clear_bit(node, dstp->bits); +} + +#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) +static inline void __nodes_setall(nodemask_t *dstp, int nbits) +{ + bitmap_fill(dstp->bits, nbits); +} + +#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) +static inline void __nodes_clear(nodemask_t *dstp, int nbits) +{ + bitmap_zero(dstp->bits, nbits); +} + +/* No static inline type checking - see Subtlety (1) above. */ +#define node_isset(node, nodemask) test_bit((node), (nodemask).bits) + +#define node_test_and_set(node, nodemask) \ + __node_test_and_set((node), &(nodemask)) +static inline int __node_test_and_set(int node, nodemask_t *addr) +{ + return test_and_set_bit(node, addr->bits); +} + +#define nodes_and(dst, src1, src2) \ + __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) +static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define nodes_or(dst, src1, src2) \ + __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) +static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define nodes_xor(dst, src1, src2) \ + __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) +static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define nodes_andnot(dst, src1, src2) \ + __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) +static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define nodes_complement(dst, src) \ + __nodes_complement(&(dst), &(src), MAX_NUMNODES) +static inline void __nodes_complement(nodemask_t *dstp, + const nodemask_t *srcp, int nbits) +{ + bitmap_complement(dstp->bits, srcp->bits, nbits); +} + +#define nodes_equal(src1, src2) \ + __nodes_equal(&(src1), &(src2), MAX_NUMNODES) +static inline int __nodes_equal(const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + return bitmap_equal(src1p->bits, src2p->bits, nbits); +} + +#define nodes_intersects(src1, src2) \ + __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) +static inline int __nodes_intersects(const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + return bitmap_intersects(src1p->bits, src2p->bits, nbits); +} + +#define nodes_subset(src1, src2) \ + __nodes_subset(&(src1), &(src2), MAX_NUMNODES) +static inline int __nodes_subset(const nodemask_t *src1p, + const nodemask_t *src2p, int nbits) +{ + return bitmap_subset(src1p->bits, src2p->bits, nbits); +} + +#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) +static inline int __nodes_empty(const nodemask_t *srcp, int nbits) +{ + return bitmap_empty(srcp->bits, nbits); +} + +#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) +static inline int __nodes_full(const nodemask_t *srcp, int nbits) +{ + return bitmap_full(srcp->bits, nbits); +} + +#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) +static inline int __nodes_weight(const nodemask_t *srcp, int nbits) +{ + return bitmap_weight(srcp->bits, nbits); +} + +#define nodes_shift_right(dst, src, n) \ + __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) +static inline void __nodes_shift_right(nodemask_t *dstp, + const nodemask_t *srcp, int n, int nbits) +{ + bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); +} + +#define nodes_shift_left(dst, src, n) \ + __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) +static inline void __nodes_shift_left(nodemask_t *dstp, + const nodemask_t *srcp, int n, int nbits) +{ + bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); +} + +/* FIXME: better would be to fix all architectures to never return + > MAX_NUMNODES, then the silly min_ts could be dropped. */ + +#define first_node(src) __first_node(&(src)) +static inline int __first_node(const nodemask_t *srcp) +{ + return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); +} + +#define next_node(n, src) __next_node((n), &(src)) +static inline int __next_node(int n, const nodemask_t *srcp) +{ + return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); +} + +#define nodemask_of_node(node) \ +({ \ + typeof(_unused_nodemask_arg_) m; \ + if (sizeof(m) == sizeof(unsigned long)) { \ + m.bits[0] = 1UL<<(node); \ + } else { \ + nodes_clear(m); \ + node_set((node), m); \ + } \ + m; \ +}) + +#define first_unset_node(mask) __first_unset_node(&(mask)) +static inline int __first_unset_node(const nodemask_t *maskp) +{ + return min_t(int,MAX_NUMNODES, + find_first_zero_bit(maskp->bits, MAX_NUMNODES)); +} + +#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) + +#if MAX_NUMNODES <= BITS_PER_LONG + +#define NODE_MASK_ALL \ +((nodemask_t) { { \ + [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ +} }) + +#else + +#define NODE_MASK_ALL \ +((nodemask_t) { { \ + [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ + [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ +} }) + +#endif + +#define NODE_MASK_NONE \ +((nodemask_t) { { \ + [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ +} }) + +#define nodes_addr(src) ((src).bits) + +#if 0 +#define nodemask_scnprintf(buf, len, src) \ + __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES) +static inline int __nodemask_scnprintf(char *buf, int len, + const nodemask_t *srcp, int nbits) +{ + return bitmap_scnprintf(buf, len, srcp->bits, nbits); +} + +#define nodemask_parse(ubuf, ulen, dst) \ + __nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES) +static inline int __nodemask_parse(const char __user *buf, int len, + nodemask_t *dstp, int nbits) +{ + return bitmap_parse(buf, len, dstp->bits, nbits); +} +#endif + +#if MAX_NUMNODES > 1 +#define for_each_node_mask(node, mask) \ + for ((node) = first_node(mask); \ + (node) < MAX_NUMNODES; \ + (node) = next_node((node), (mask))) +#else /* MAX_NUMNODES == 1 */ +#define for_each_node_mask(node, mask) \ + if (!nodes_empty(mask)) \ + for ((node) = 0; (node) < 1; (node)++) +#endif /* MAX_NUMNODES */ + +/* + * The following particular system nodemasks and operations + * on them manage all possible and online nodes. + */ + +extern nodemask_t node_online_map; +extern nodemask_t node_possible_map; + +#if MAX_NUMNODES > 1 +#define num_online_nodes() nodes_weight(node_online_map) +#define num_possible_nodes() nodes_weight(node_possible_map) +#define node_online(node) node_isset((node), node_online_map) +#define node_possible(node) node_isset((node), node_possible_map) +#else +#define num_online_nodes() 1 +#define num_possible_nodes() 1 +#define node_online(node) ((node) == 0) +#define node_possible(node) ((node) == 0) +#endif + +#define any_online_node(mask) \ +({ \ + int node; \ + for_each_node_mask(node, (mask)) \ + if (node_online(node)) \ + break; \ + node; \ +}) + +#define node_set_online(node) set_bit((node), node_online_map.bits) +#define node_set_offline(node) clear_bit((node), node_online_map.bits) + +#define for_each_node(node) for_each_node_mask((node), node_possible_map) +#define for_each_online_node(node) for_each_node_mask((node), node_online_map) + +#endif /* __LINUX_NODEMASK_H */ diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h new file mode 100644 index 0000000000..9585fc9c48 --- /dev/null +++ b/xen/include/xen/numa.h @@ -0,0 +1,13 @@ +#ifndef _XEN_NUMA_H +#define _XEN_NUMA_H + +#include <xen/config.h> +#include <asm/numa.h> + +#ifndef NODES_SHIFT +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#endif /* _XEN_NUMA_H */ |